yencode 1.1.5 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. package/README.md +130 -189
  2. package/binding.gyp +115 -6
  3. package/index.js +2 -0
  4. package/package.json +1 -1
  5. package/src/common.h +37 -7
  6. package/src/crc.cc +121 -47
  7. package/src/crc.h +74 -10
  8. package/src/crc_arm.cc +51 -34
  9. package/src/crc_arm_pmull.cc +215 -0
  10. package/src/crc_common.h +22 -0
  11. package/src/crc_folding.cc +154 -16
  12. package/src/crc_folding_256.cc +7 -14
  13. package/src/crc_riscv.cc +251 -0
  14. package/src/decoder.cc +373 -13
  15. package/src/decoder.h +10 -14
  16. package/src/decoder_avx.cc +5 -6
  17. package/src/decoder_avx2.cc +8 -9
  18. package/src/decoder_avx2_base.h +7 -11
  19. package/src/decoder_common.h +56 -373
  20. package/src/decoder_neon.cc +13 -19
  21. package/src/decoder_neon64.cc +12 -15
  22. package/src/decoder_rvv.cc +280 -0
  23. package/src/decoder_sse2.cc +26 -5
  24. package/src/decoder_sse_base.h +20 -40
  25. package/src/decoder_ssse3.cc +5 -6
  26. package/src/decoder_vbmi2.cc +6 -13
  27. package/src/encoder.cc +42 -26
  28. package/src/encoder.h +5 -7
  29. package/src/encoder_avx.cc +3 -3
  30. package/src/encoder_avx2.cc +3 -3
  31. package/src/encoder_avx_base.h +3 -0
  32. package/src/encoder_common.h +26 -32
  33. package/src/encoder_neon.cc +6 -3
  34. package/src/encoder_rvv.cc +13 -26
  35. package/src/encoder_sse2.cc +3 -2
  36. package/src/encoder_sse_base.h +2 -0
  37. package/src/encoder_ssse3.cc +3 -3
  38. package/src/encoder_vbmi2.cc +6 -7
  39. package/src/platform.cc +24 -23
  40. package/src/yencode.cc +54 -11
  41. package/test/_speedbase.js +4 -2
  42. package/test/speeddec.js +25 -16
  43. package/test/speedenc.js +21 -17
  44. package/test/testcrc.js +17 -1
  45. package/test/testcrcfuncs.c +53 -0
  46. package/test/testdec.js +1 -0
package/src/crc_arm.cc CHANGED
@@ -59,42 +59,35 @@ HEDLEY_WARNING("CRC32 acceleration has been disabled due to missing arm_acle.h")
  #endif


+
+ #ifdef __aarch64__
+ static uint32_t crc32_multiply_arm(uint32_t a, uint32_t b) {
+     // perform PMULL
+     uint64_t res = 0;
+     uint64_t a64 = (uint64_t)a << 32;
+     int64_t b64 = (int64_t)b << 32;
+     for(int i=0; i<32; i++) {
+         res ^= a64 & (b64 >> 63);
+         b64 += b64;
+         a64 >>= 1;
+     }
+     // reduction via CRC
+     res = __crc32w(0, res) ^ (res >> 32);
+     return res;
+ }
+ #endif
+ // regular multiply is probably better for AArch32
+
+
  // exploit CPU pipelining during CRC computation; unfortunately I haven't been able to measure any benefit
  // - Neoverse N1: no noticeable difference
  // - Cortex A53: actually runs a bit slower
  //#define ENABLE_PIPELINE_OPT 1

  #ifdef ENABLE_PIPELINE_OPT
- // workaround MSVC complaining "unary minus operator applied to unsigned type, result still unsigned"
- #define NEGATE(n) (uint32_t)(-((int32_t)(n)))
-
- static HEDLEY_ALWAYS_INLINE uint32_t crc_multiply(uint32_t a, uint32_t b) {
-     uint32_t res = 0;
-     for(int i=0; i<31; i++) {
-         res ^= NEGATE(b>>31) & a;
-         a = ((a >> 1) ^ (0xEDB88320 & NEGATE(a&1)));
-         b <<= 1;
-     }
-     res ^= NEGATE(b>>31) & a;
-     return res;
- }
-
- static const uint32_t crc_power[] = { // pre-computed 2^(2^n), with first 3 entries removed (saves a shift)
-     0x00800000, 0x00008000, 0xedb88320, 0xb1e6b092, 0xa06a2517, 0xed627dae, 0x88d14467, 0xd7bbfe6a,
-     0xec447f11, 0x8e7ea170, 0x6427800e, 0x4d47bae0, 0x09fe548f, 0x83852d0f, 0x30362f1a, 0x7b5a9cc3,
-     0x31fec169, 0x9fec022a, 0x6c8dedc4, 0x15d6874d, 0x5fde7a4e, 0xbad90e37, 0x2e4e5eef, 0x4eaba214,
-     0xa8a472c0, 0x429a969e, 0x148d302a, 0xc40ba6d0, 0xc4e22c3c, 0x40000000, 0x20000000, 0x08000000
- };
- /* above table can be computed with
-     int main(void) {
-         uint32_t k = 0x80000000 >> 1;
-         for (size_t i = 0; i < 32+3; ++i) {
-             if(i>2) printf("0x%08x, ", k);
-             k = crc_multiply(k, k);
-         }
-         return 0;
-     }
- */
+ #ifndef __aarch64__
+ # define crc32_multiply_arm RapidYenc::crc32_multiply_generic
+ #endif
  #endif


@@ -130,6 +123,7 @@ static uint32_t arm_crc_calc(uint32_t crc, const unsigned char *src, long len) {
      // (this is a slightly less efficient, but much simpler implementation of the idea)
      const unsigned SPLIT_WORDS_LOG = 10; // make sure it's at least 2
      const unsigned SPLIT_WORDS = 1<<SPLIT_WORDS_LOG;
+     const unsigned blockCoeff = RapidYenc::crc_power[SPLIT_WORDS_LOG + WORDSIZE_LOG + 3];
      while(len >= (long)(sizeof(WORD_T)*SPLIT_WORDS*2)) {
          // compute 2x CRCs concurrently to leverage piplining
          uint32_t crc2 = 0;
@@ -148,8 +142,7 @@ static uint32_t arm_crc_calc(uint32_t crc, const unsigned char *src, long len) {
              srcW++;
          }
          // merge the CRCs
-         // since we're multiplying by a fixed number, it could be sped up with some lookup tables
-         crc = crc_multiply(crc, crc_power[SPLIT_WORDS_LOG + WORDSIZE_LOG]) ^ crc2;
+         crc = crc32_multiply_arm(crc, blockCoeff) ^ crc2;
          srcW += SPLIT_WORDS;
          len -= sizeof(WORD_T)*SPLIT_WORDS*2;
      }
@@ -200,10 +193,34 @@ static uint32_t do_crc32_incremental_arm(const void* data, size_t length, uint32
      return ~arm_crc_calc(~init, (const unsigned char*)data, (long)length);
  }

- void crc_arm_set_funcs() {
+
+ #if defined(__aarch64__) && (defined(__GNUC__) || defined(_MSC_VER))
+ static uint32_t crc32_shift_arm(uint32_t crc1, uint32_t n) {
+     uint32_t result = crc1;
+     uint64_t prod = result;
+     prod <<= 32 - (n&31);
+     result = __crc32w(0, prod) ^ (prod >> 32);
+     n &= ~31;
+
+     while(n) {
+         result = crc32_multiply_arm(result, RapidYenc::crc_power[ctz32(n)]);
+         n &= n-1;
+     }
+     return result;
+ }
+ #endif
+
+
+ void RapidYenc::crc_arm_set_funcs() {
      _do_crc32_incremental = &do_crc32_incremental_arm;
+ #ifdef __aarch64__
+     _crc32_multiply = &crc32_multiply_arm;
+ # if defined(__GNUC__) || defined(_MSC_VER)
+     _crc32_shift = &crc32_shift_arm;
+ # endif
+ #endif
      _crc32_isa = ISA_FEATURE_CRC;
  }
  #else
- void crc_arm_set_funcs() {}
+ void RapidYenc::crc_arm_set_funcs() {}
  #endif
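The merge line `crc = crc32_multiply_arm(crc, blockCoeff) ^ crc2` exploits CRC linearity: carrying a CRC state across k zero-fed bytes multiplies it by x^(8k) modulo the polynomial, so two independently computed halves can be merged with one multiply and an XOR. The same algebra gives the familiar combine identity for finished CRC-32 values, sketched below with a portable multiply equivalent to the crc_multiply() this hunk removes. This is a standalone illustration, not package code; the helper names crc32_ref/crc32_mul_ref/crc32_shift_ref are invented for the example.

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* bitwise CRC-32 (reflected, poly 0xEDB88320), zlib-style interface */
static uint32_t crc32_ref(uint32_t crc, const unsigned char* p, size_t n) {
    crc = ~crc;
    while (n--) {
        crc ^= *p++;
        for (int k = 0; k < 8; k++)
            crc = (crc >> 1) ^ ((crc & 1) ? 0xEDB88320u : 0);
    }
    return ~crc;
}

/* GF(2) multiply of two CRC states modulo the polynomial (cf. the removed crc_multiply) */
static uint32_t crc32_mul_ref(uint32_t a, uint32_t b) {
    uint32_t res = 0;
    for (int i = 0; i < 32; i++) {
        if (b & 0x80000000u) res ^= a;
        a = (a >> 1) ^ ((a & 1) ? 0xEDB88320u : 0);
        b <<= 1;
    }
    return res;
}

/* multiply a CRC state by x^n, i.e. advance it over n zero bits (the role of _crc32_shift) */
static uint32_t crc32_shift_ref(uint32_t crc, uint64_t n) {
    uint32_t xp = 0x40000000u; /* representation of x^1 in the reflected field */
    while (n) {
        if (n & 1) crc = crc32_mul_ref(crc, xp);
        xp = crc32_mul_ref(xp, xp); /* repeated squaring: x^2, x^4, x^8, ... */
        n >>= 1;
    }
    return crc;
}

int main(void) {
    const char *a = "hello ", *b = "world";
    uint32_t crcA = crc32_ref(0, (const unsigned char*)a, strlen(a));
    uint32_t crcB = crc32_ref(0, (const unsigned char*)b, strlen(b));
    uint32_t crcAB = crc32_ref(0, (const unsigned char*)"hello world", 11);
    /* combine independently computed CRCs: crc(A||B) = crc(A)*x^(8*len(B)) ^ crc(B) */
    uint32_t combined = crc32_shift_ref(crcA, 8 * strlen(b)) ^ crcB;
    printf("%08x %08x -> %s\n", crcAB, combined, crcAB == combined ? "match" : "MISMATCH");
    return 0;
}
```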
package/src/crc_arm_pmull.cc ADDED
@@ -0,0 +1,215 @@
+ #include "crc_common.h"
+
+ // exclude broken/missing arm_acle.h
+ #if defined(__ARM_FEATURE_CRYPTO) && defined(HEDLEY_GCC_VERSION)
+ # if !defined(__aarch64__) && HEDLEY_GCC_VERSION_CHECK(7,0,0) && !HEDLEY_GCC_VERSION_CHECK(8,1,1)
+ # undef __ARM_FEATURE_CRYPTO
+ # endif
+ # if defined(__aarch64__) && HEDLEY_GCC_VERSION_CHECK(9,4,0) && !HEDLEY_GCC_VERSION_CHECK(9,5,0)
+ # undef __ARM_FEATURE_CRYPTO
+ # endif
+ #endif
+ #if defined(__ARM_FEATURE_CRYPTO) && defined(__has_include)
+ # if !__has_include(<arm_acle.h>)
+ # undef __ARM_FEATURE_CRYPTO
+ # endif
+ #endif
+
+ // ARM's intrinsics guide seems to suggest that vmull_p64 is available on A32, but neither Clang/GCC seem to support it on AArch32
+ #if (defined(__ARM_FEATURE_CRYPTO) && defined(__ARM_FEATURE_CRC32) && defined(__aarch64__)) || (defined(_M_ARM64) && !defined(__clang__))
+
+ #include <arm_neon.h>
+ #if defined(_MSC_VER) && !defined(__clang__)
+ # include <intrin.h>
+
+ # ifdef _M_ARM64
+ // MSVC may detect this pattern: https://devblogs.microsoft.com/cppblog/a-tour-of-4-msvc-backend-improvements/#byteswap-identification
+ static HEDLEY_ALWAYS_INLINE uint64_t rbit64(uint64_t x) {
+     x = _byteswap_uint64(x);
+     x = (x & 0xaaaaaaaaaaaaaaaa) >> 1 | (x & 0x5555555555555555) << 1;
+     x = (x & 0xcccccccccccccccc) >> 2 | (x & 0x3333333333333333) << 2;
+     x = (x & 0xf0f0f0f0f0f0f0f0) >> 4 | (x & 0x0f0f0f0f0f0f0f0f) << 4;
+     return x;
+ }
+ // ...whilst this seems to work best for 32-bit RBIT
+ static HEDLEY_ALWAYS_INLINE uint32_t rbit32(uint32_t x) {
+     uint64_t r = rbit64(x);
+     return r >> 32;
+ }
+ # else
+ # define rbit32 _arm_rbit
+ # endif
+ #else
+ # include <arm_acle.h>
+ // __rbit not present before GCC 11.4.0 or 12.2.0; for ARM32, requires GCC 14
+ # if defined(HEDLEY_GCC_VERSION) && !HEDLEY_GCC_VERSION_CHECK(14,0,0) && (!defined(__aarch64__) || !HEDLEY_GCC_VERSION_CHECK(11,3,0) || (HEDLEY_GCC_VERSION_CHECK(12,0,0) && !HEDLEY_GCC_VERSION_CHECK(12,2,0)))
+ # ifdef __aarch64__
+ static HEDLEY_ALWAYS_INLINE uint64_t rbit64(uint64_t x) {
+     uint64_t r;
+     __asm__ ("rbit %0,%1\n"
+         : "=r"(r) : "r"(x)
+         : /* No clobbers */);
+     return r;
+ }
+ # endif
+ static HEDLEY_ALWAYS_INLINE uint32_t rbit32(uint32_t x) {
+     uint32_t r;
+     __asm__ (
+ # ifdef __aarch64__
+         "rbit %w0,%w1\n"
+ # else
+         "rbit %0,%1\n"
+ # endif
+         : "=r"(r) : "r"(x)
+         : /* No clobbers */);
+     return r;
+ }
+ # else
+ # define rbit32 __rbit
+ # define rbit64 __rbitll
+ # endif
+ #endif
+
+
+ // MSVC doesn't have poly64/poly128 types, so always use uint64 instead
+
+ #ifdef __aarch64__
+ # if defined(__GNUC__) || defined(__clang__)
+ static HEDLEY_ALWAYS_INLINE uint64x2_t pmull_low(uint64x1_t a, uint64x1_t b) {
+     uint64x2_t result;
+     __asm__ ("pmull %0.1q,%1.1d,%2.1d"
+         : "=w"(result)
+         : "w"(a), "w"(b)
+         : /* No clobbers */);
+     return result;
+ }
+ static HEDLEY_ALWAYS_INLINE uint64x2_t pmull_high(uint64x2_t a, uint64x2_t b) {
+     uint64x2_t result;
+     __asm__ ("pmull2 %0.1q,%1.2d,%2.2d"
+         : "=w"(result)
+         : "w"(a), "w"(b)
+         : /* No clobbers */);
+     return result;
+ }
+ # elif defined(_MSC_VER) && !defined(__clang__)
+ # define pmull_low vmull_p64
+ # define pmull_high vmull_high_p64
+ # else
+ # define pmull_low(x, y) vreinterpretq_u64_p128(vmull_p64(vreinterpret_p64_u64(x), vreinterpret_p64_u64(y)))
+ # define pmull_high(x, y) vreinterpretq_u64_p128(vmull_high_p64(vreinterpretq_p64_u64(x), vreinterpretq_p64_u64(y)))
+ # endif
+ #else
+ # if defined(_MSC_VER) && !defined(__clang__)
+ # define pmull_low vmull_p64
+ # define pmull_high(x, y) vmull_p64(vget_high_u64(x), vget_high_u64(y))
+ # else
+ # define pmull_low(x, y) vreinterpretq_u64_p128(vmull_p64(x, y))
+ # define pmull_high(x, y) vreinterpretq_u64_p128(vmull_p64(vget_high_p64(vreinterpretq_p64_u64(x)), vget_high_p64(vreinterpretq_p64_u64(y))))
+ # endif
+ #endif
+
+
+ static uint32_t crc32_multiply_pmull(uint32_t a, uint32_t b) {
+     uint64x1_t prod = vget_low_u64(pmull_low(
+         vreinterpret_u64_u32(vset_lane_u32(a, vdup_n_u32(0), 0)),
+         vreinterpret_u64_u32(vset_lane_u32(b, vdup_n_u32(0), 0))
+     ));
+ #ifdef __aarch64__
+     uint64_t p = vget_lane_u64(prod, 0);
+     return __crc32w(0, p+p) ^ (p >> 31);
+ #else
+     prod = vadd_u64(prod, prod);
+     uint32x2_t prod32 = vreinterpret_u32_u64(prod);
+     return __crc32w(0, vget_lane_u32(prod32, 0)) ^ vget_lane_u32(prod32, 1);
+ #endif
+ }
+
+
+
+ static const uint32_t crc_power_rev[32] = { // bit-reversed crc_power
+     0x00000002, 0x00000004, 0x00000010, 0x00000100, 0x00010000, 0x04c11db7, 0x490d678d, 0xe8a45605,
+     0x75be46b7, 0xe6228b11, 0x567fddeb, 0x88fe2237, 0x0e857e71, 0x7001e426, 0x075de2b2, 0xf12a7f90,
+     0xf0b4a1c1, 0x58f46c0c, 0xc3395ade, 0x96837f8c, 0x544037f9, 0x23b7b136, 0xb2e16ba8, 0x725e7bfa,
+     0xec709b5d, 0xf77a7274, 0x2845d572, 0x034e2515, 0x79695942, 0x540cb128, 0x0b65d023, 0x3c344723
+ };
+
+
+ static HEDLEY_ALWAYS_INLINE uint64x1_t crc32_shift_pmull_mulred(uint64x1_t a, uint64x1_t b) {
+     uint64x2_t r = pmull_low(a, b);
+     uint64x2_t h = pmull_high(r, vdupq_n_u64(0x490d678d));
+     return veor_u64(vget_low_u64(r), vget_low_u64(h));
+ }
+
+
+ static uint32_t crc32_shift_pmull(uint32_t crc1, uint32_t n) {
+     crc1 = rbit32(crc1);
+
+     uint64x1_t res;
+ #ifdef __aarch64__
+     uint64_t crc = (uint64_t)crc1 << (n & 31);
+     res = vset_lane_u64(crc, vdup_n_u64(0), 0);
+ #else
+     res = vreinterpret_u64_u32(vset_lane_u32(crc1, vdup_n_u32(0), 0));
+     res = vshl_u64(res, vdup_n_u64(n&31));
+ #endif
+     n &= ~31;
+
+     if(n) {
+ #define LOAD_NEXT_POWER vreinterpret_u64_u32(vset_lane_u32(crc_power_rev[ctz32(n)], vdup_n_u32(0), 0))
+         uint64x1_t res2 = LOAD_NEXT_POWER;
+         n &= n-1;
+
+         if(n) {
+             // first multiply doesn't need reduction
+             res2 = vget_low_u64(pmull_low(res2, LOAD_NEXT_POWER));
+             n &= n-1;
+
+             while(n) {
+                 res = crc32_shift_pmull_mulred(res, LOAD_NEXT_POWER);
+                 n &= n-1;
+
+                 if(n) {
+                     res2 = crc32_shift_pmull_mulred(res2, LOAD_NEXT_POWER);
+                     n &= n-1;
+                 }
+             }
+         }
+ #undef LOAD_NEXT_POWER
+
+         // merge two results
+         uint64x2_t prod = pmull_low(res, res2);
+         // weirdly, vrbitq_u8 is missing in ARM32 MSVC
+         prod = vreinterpretq_u64_u8(vrev64q_u8(vrbitq_u8(vreinterpretq_u8_u64(prod))));
+ #ifdef __aarch64__
+         crc = __crc32d(0, vgetq_lane_u64(prod, 1));
+         uint64_t rem = vgetq_lane_u64(prod, 0);
+         crc = __crc32w(rem, crc) ^ (rem >> 32);
+ #else
+         uint32x4_t prod32 = vreinterpretq_u32_u64(prod);
+         uint32_t crc = __crc32w(0, vgetq_lane_u32(prod32, 2));
+         crc = __crc32w(vgetq_lane_u32(prod32, 3), crc);
+         crc = __crc32w(vgetq_lane_u32(prod32, 0), crc) ^ vgetq_lane_u32(prod32, 1);
+ #endif
+         return crc;
+     } else {
+ #ifdef __aarch64__
+         crc = rbit64(crc);
+         crc = __crc32w(0, crc) ^ (crc >> 32);
+         return crc;
+ #else
+         uint32x2_t r = vreinterpret_u32_u64(res);
+         return __crc32w(0, rbit32(vget_lane_u32(r, 1))) ^ rbit32(vget_lane_u32(r, 0));
+ #endif
+     }
+ }
+
+
+ void RapidYenc::crc_pmull_set_funcs() {
+     _crc32_multiply = &crc32_multiply_pmull;
+     _crc32_shift = &crc32_shift_pmull;
+     _crc32_isa &= ISA_FEATURE_PMULL;
+ }
+
+ #else
+ void RapidYenc::crc_pmull_set_funcs() {}
+ #endif /* defined(__ARM_FEATURE_CRYPTO) && defined(__ARM_FEATURE_CRC32) */
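In crc32_multiply_pmull above (and in the `_mm_add_epi64(prod, prod)` "bit alignment fix" in crc_folding.cc later in this diff), the carry-less product is doubled before reduction. That compensates for CRC-32's bit-reflected representation: a carry-less multiply of two reflected 32-bit values yields the reflection of the true product, one bit position short. A tiny portable check of that identity, using a plain uint64 emulation of PMULL (a sketch only; the helper names are not package code):

```c
#include <stdint.h>
#include <stdio.h>

static uint64_t clmul32(uint32_t a, uint32_t b) { /* 32x32 -> 63-bit carry-less product */
    uint64_t r = 0;
    for (int i = 0; i < 32; i++)
        if ((b >> i) & 1) r ^= (uint64_t)a << i;
    return r;
}
static uint32_t rev32(uint32_t x) {               /* bit-reverse a 32-bit value */
    uint32_t r = 0;
    for (int i = 0; i < 32; i++) r |= ((x >> i) & 1u) << (31 - i);
    return r;
}
static uint64_t rev64(uint64_t x) {               /* bit-reverse a 64-bit value */
    uint64_t r = 0;
    for (int i = 0; i < 64; i++) r |= ((x >> i) & 1u) << (63 - i);
    return r;
}

int main(void) {
    uint32_t a = 0x04C11DB7u, b = 0xDEADBEEFu;    /* arbitrary test values */
    uint64_t direct = clmul32(a, b);
    uint64_t reflected = clmul32(rev32(a), rev32(b));
    /* reflecting the reflected-domain product leaves it one bit short of the real one,
     * hence the <<1 (p+p) before the CRC-based reduction */
    printf("%s\n", rev64(reflected) == (direct << 1) ? "identity holds" : "MISMATCH");
    return 0;
}
```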
package/src/crc_common.h CHANGED
@@ -2,3 +2,25 @@
  #include <stddef.h> // for size_t
  #include "crc.h"

+ #ifdef __GNUC__
+ # define ctz32 __builtin_ctz
+ #elif defined(_MSC_VER)
+ static HEDLEY_ALWAYS_INLINE unsigned ctz32(uint32_t n) {
+     unsigned long r;
+     _BitScanForward(&r, n);
+     return r;
+ }
+ #endif
+
+ namespace RapidYenc {
+     void crc_clmul_set_funcs();
+     void crc_clmul256_set_funcs();
+     void crc_arm_set_funcs();
+     void crc_pmull_set_funcs();
+     void crc_riscv_set_funcs();
+
+     extern const uint32_t crc_power[32];
+     uint32_t crc32_multiply_generic(uint32_t a, uint32_t b);
+     uint32_t crc32_shift_generic(uint32_t crc1, uint32_t n);
+
+ }
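crc_common.h now exports crc_power[32] alongside the generic multiply/shift helpers. Judging from the generator comment removed from crc_arm.cc and the `ctz32(n)` / `n &= n-1` loops in the new crc32_shift_* implementations, crc_power[i] holds x^(2^i) reduced modulo the CRC-32 polynomial, so a shift by n bits multiplies by one table entry per set bit of n. A small sketch of that layout (an inference, not the package's crc.cc code; names are illustrative):

```c
#include <stdint.h>
#include <stdio.h>

static uint32_t crc32_mul_ref(uint32_t a, uint32_t b) { /* same bitwise multiply as in the earlier sketch */
    uint32_t res = 0;
    for (int i = 0; i < 32; i++) {
        if (b & 0x80000000u) res ^= a;
        a = (a >> 1) ^ ((a & 1) ? 0xEDB88320u : 0);
        b <<= 1;
    }
    return res;
}

static uint32_t power_table[32];

static void build_power_table(void) {
    uint32_t k = 0x40000000u;              /* x^1 in the reflected representation */
    for (int i = 0; i < 32; i++) {
        power_table[i] = k;                /* entry i = x^(2^i) mod P */
        k = crc32_mul_ref(k, k);           /* square to get the next entry */
    }
}

static uint32_t crc32_shift_ref(uint32_t crc, uint32_t n) {
    /* decompose n into powers of two, lowest set bit first - the ctz32 / n &= n-1 pattern */
    while (n) {
        int bit = __builtin_ctz(n);        /* GCC/Clang; MSVC would use _BitScanForward */
        crc = crc32_mul_ref(crc, power_table[bit]);
        n &= n - 1;                        /* clear the lowest set bit */
    }
    return crc;
}

int main(void) {
    build_power_table();
    /* power_table[3..6] reproduce the start of the table removed from crc_arm.cc */
    printf("%08x %08x %08x %08x\n", power_table[3], power_table[4], power_table[5], power_table[6]);
    return 0; /* expected: 00800000 00008000 edb88320 b1e6b092 */
}
```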
package/src/crc_folding.cc CHANGED
@@ -140,20 +140,10 @@ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
      unsigned long algn_diff;
      __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;

-     // TODO: consider calculating this via a LUT instead (probably faster)
-     // info from https://www.reddit.com/r/ReverseEngineering/comments/2zwhl3/mystery_constant_0x9db42487_in_intels_crc32ieee/
-     // firstly, calculate: xmm_crc0 = (intial * 0x487b9c8a) mod 0x104c11db7, where 0x487b9c8a = inverse(1<<512) mod 0x104c11db7
-     xmm_t0 = _mm_cvtsi32_si128(~initial);
-
-     xmm_t0 = _mm_clmulepi64_si128(xmm_t0, _mm_set_epi32(0, 0, 0xa273bc24, 0), 0); // reverse(0x487b9c8a)<<1 == 0xa273bc24
-     xmm_t2 = _mm_set_epi32( // polynomial reduction factors
-         1, 0xdb710640, // G* = 0x04c11db7
-         0, 0xf7011641 // Q+ = 0x04d101df (+1 to save an additional xor operation)
-     );
-     xmm_t1 = _mm_clmulepi64_si128(xmm_t0, xmm_t2, 0);
-     xmm_t1 = _mm_clmulepi64_si128(xmm_t1, xmm_t2, 0x10);
-
-     __m128i xmm_crc0 = _mm_srli_si128(_mm_xor_si128(xmm_t0, xmm_t1), 8);
+     // since the initial value will immediately be multiplied by around 2^512, we need to roll it backwards
+     // this is done by dividing the initial value by 2^480
+     // the constant used here is reverse(2^-480)<<1 == 0xdfded7ec
+     __m128i xmm_crc0 = _mm_clmulepi64_si128(_mm_cvtsi32_si128(~initial), _mm_cvtsi32_si128(0xdfded7ec), 0);

      __m128i xmm_crc1 = _mm_setzero_si128();
      __m128i xmm_crc2 = _mm_setzero_si128();
@@ -365,11 +355,159 @@ static uint32_t do_crc32_incremental_clmul(const void* data, size_t length, uint
      return crc_fold((const unsigned char*)data, (long)length, init);
  }

- void crc_clmul_set_funcs() {
+
+ static HEDLEY_ALWAYS_INLINE __m128i crc32_reduce(__m128i prod) {
+     // do Barrett reduction back into 32-bit field
+     const __m128i reduction_const = _mm_load_si128((__m128i*)crc_k + 2);
+     __m128i t = _mm_clmulepi64_si128(prod, reduction_const, 0);
+     t = _mm_clmulepi64_si128(t, reduction_const, 0x10);
+     t = _mm_xor_si128(t, prod);
+     return t;
+ }
+
+ static uint32_t crc32_multiply_clmul(uint32_t a, uint32_t b) {
+     // do the actual multiply
+     __m128i prod = _mm_clmulepi64_si128(_mm_cvtsi32_si128(a), _mm_cvtsi32_si128(b), 0);
+
+     // prepare product for reduction
+     prod = _mm_add_epi64(prod, prod); // bit alignment fix, due to CRC32 being bit-reversal
+     prod = _mm_slli_si128(prod, 4); // straddle low/high halves across 64-bit boundary - this provides automatic truncation during reduction
+
+     prod = crc32_reduce(prod);
+     return _mm_extract_epi32(prod, 2);
+ }
+
+ #if defined(__GNUC__) || defined(_MSC_VER)
+ static HEDLEY_ALWAYS_INLINE __m128i reverse_bits_epi8(__m128i src) {
+ #if defined(__GFNI__) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
+     return _mm_gf2p8affine_epi64_epi8(src, _mm_set_epi32(
+         0x80402010, 0x08040201,
+         0x80402010, 0x08040201
+     ), 0);
+     /*
+ #elif defined(ENABLE_AVX512)
+     // !! this only processes the bottom 32 bits !!
+     src = _mm_maskz_mov_epi32(1, src);
+     src = _mm_ternarylogic_epi32(src, _mm_slli_epi64(src, 28), _mm_set1_epi8(0xf), 0xa8); // (a|b)&c
+     src = _mm_shuffle_epi8(_mm_set_epi8(
+         -16, 112, -80, 48, -48, 80, -112, 16, -32, 96, -96, 32, -64, 64, -128, 0
+     ), src);
+     return _mm_maskz_or_epi32(1, src, _mm_srli_epi64(src, 36));
+     */
+ #else
+     __m128i xmm_t0 = _mm_and_si128(src, _mm_set1_epi8(0x0f));
+     __m128i xmm_t1 = _mm_and_si128(_mm_srli_epi16(src, 4), _mm_set1_epi8(0x0f));
+     xmm_t0 = _mm_shuffle_epi8(_mm_set_epi8(
+         -16, 112, -80, 48, -48, 80, -112, 16, -32, 96, -96, 32, -64, 64, -128, 0
+         //0xf0, 0x70, 0xb0, 0x30, 0xd0, 0x50, 0x90, 0x10, 0xe0, 0x60, 0xa0, 0x20, 0xc0, 0x40, 0x80, 0
+     ), xmm_t0);
+     xmm_t1 = _mm_shuffle_epi8(_mm_set_epi8(
+         15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0
+     ), xmm_t1);
+     return _mm_or_si128(xmm_t0, xmm_t1);
+ #endif
+ }
+
+ #ifdef _MSC_VER
+ // because MSVC doesn't use BSWAP unless you specifically tell it to...
+ # include <stdlib.h>
+ # define BSWAP32 _byteswap_ulong
+ #else
+ # define BSWAP32(n) ((((n)&0xff)<<24) | (((n)&0xff00)<<8) | (((n)&0xff0000)>>8) | (((n)&0xff000000)>>24))
+ #endif
+
+
+
+ static const uint32_t crc_power_rev[32] = { // bit-reversed crc_power
+     0x00000002, 0x00000004, 0x00000010, 0x00000100, 0x00010000, 0x04c11db7, 0x490d678d, 0xe8a45605,
+     0x75be46b7, 0xe6228b11, 0x567fddeb, 0x88fe2237, 0x0e857e71, 0x7001e426, 0x075de2b2, 0xf12a7f90,
+     0xf0b4a1c1, 0x58f46c0c, 0xc3395ade, 0x96837f8c, 0x544037f9, 0x23b7b136, 0xb2e16ba8, 0x725e7bfa,
+     0xec709b5d, 0xf77a7274, 0x2845d572, 0x034e2515, 0x79695942, 0x540cb128, 0x0b65d023, 0x3c344723
+ };
+
+ static HEDLEY_ALWAYS_INLINE __m128i crc32_shift_clmul_mulred(unsigned pos, __m128i prod) {
+     // this multiplies a 64-bit `prod` with a 32-bit CRC power
+     // compared with crc32_multiply_clmul, this only reduces the result to 64-bit, saving a multiply
+     __m128i coeff = _mm_cvtsi32_si128(crc_power_rev[pos]);
+
+     const __m128i fold_const = _mm_set_epi32(0, 0x490d678d, 0, 0xf200aa66);
+     prod = _mm_clmulepi64_si128(prod, coeff, 0);
+     __m128i hi = _mm_clmulepi64_si128(prod, fold_const, 0x11);
+     return _mm_xor_si128(hi, prod);
+ }
+
+ static uint32_t crc32_shift_clmul(uint32_t crc1, uint32_t n) {
+     if(!n) return crc1;
+
+     __m128i result = _mm_cvtsi32_si128(BSWAP32(crc1));
+     result = reverse_bits_epi8(result);
+
+     // handle n < 32 with a shift
+     result = _mm_sll_epi64(result, _mm_cvtsi32_si128(n & 31));
+     n &= ~31;
+
+     __m128i t;
+     if(n) {
+         // use a second accumulator to leverage some IPC from slow CLMUL
+         __m128i result2 = _mm_cvtsi32_si128(crc_power_rev[ctz32(n)]);
+         n &= n-1;
+
+         if(n) {
+             // first multiply doesn't need reduction
+             result2 = _mm_clmulepi64_si128(result2, _mm_cvtsi32_si128(crc_power_rev[ctz32(n)]), 0);
+             n &= n-1;
+
+             while(n) {
+                 result = crc32_shift_clmul_mulred(ctz32(n), result);
+                 n &= n-1;
+
+                 if(n) {
+                     result2 = crc32_shift_clmul_mulred(ctz32(n), result2);
+                     n &= n-1;
+                 }
+             }
+         }
+
+         const __m128i fold_const = _mm_set_epi32(0, 0x490d678d, 0, 0xf200aa66);
+
+         // merge two results
+         result = _mm_clmulepi64_si128(result, result2, 0);
+
+         // do 128b reduction
+         t = _mm_unpackhi_epi32(result, _mm_setzero_si128());
+         // fold [127:96] -> [63:0]
+         __m128i hi = _mm_clmulepi64_si128(t, fold_const, 1);
+         // fold [95:64] -> [63:0]
+         __m128i lo = _mm_clmulepi64_si128(t, fold_const, 0x10);
+ #ifdef ENABLE_AVX512
+         result = _mm_ternarylogic_epi32(result, hi, lo, 0x96);
+ #else
+         result = _mm_xor_si128(result, hi);
+         result = _mm_xor_si128(result, lo);
+ #endif
+     }
+
+     // do Barrett reduction back into 32-bit field
+     const __m128i reduction_const = _mm_set_epi32(0, 0x04c11db7, 1, 0x04d101df);
+     t = _mm_clmulepi64_si128(_mm_blend_epi16(_mm_setzero_si128(), result, 0x3c), reduction_const, 0);
+     t = _mm_clmulepi64_si128(t, reduction_const, 0x11);
+     result = _mm_xor_si128(t, result);
+
+     result = reverse_bits_epi8(result);
+     return BSWAP32(_mm_cvtsi128_si32(result));
+ }
+ #endif
+
+
+ void RapidYenc::crc_clmul_set_funcs() {
      _do_crc32_incremental = &do_crc32_incremental_clmul;
+     _crc32_multiply = &crc32_multiply_clmul;
+ #if defined(__GNUC__) || defined(_MSC_VER)
+     _crc32_shift = &crc32_shift_clmul;
+ #endif
      _crc32_isa = ISA_LEVEL_PCLMUL;
  }
  #else
- void crc_clmul_set_funcs() {}
+ void RapidYenc::crc_clmul_set_funcs() {}
  #endif

@@ -99,19 +99,12 @@ ALIGN_TO(16, static const unsigned crc_k[]) = {
99
99
 
100
100
 
101
101
  static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
102
- // info from https://www.reddit.com/r/ReverseEngineering/comments/2zwhl3/mystery_constant_0x9db42487_in_intels_crc32ieee/
103
- // firstly, calculate: xmm_crc0 = (intial * 0x487b9c8a) mod 0x104c11db7, where 0x487b9c8a = inverse(1<<512) mod 0x104c11db7
104
- __m128i xmm_t0 = _mm_cvtsi32_si128(~initial);
105
-
106
- xmm_t0 = _mm_clmulepi64_si128(xmm_t0, _mm_set_epi32(0, 0, 0xa273bc24, 0), 0); // reverse(0x487b9c8a)<<1 == 0xa273bc24
107
- __m128i reduction = _mm_set_epi32( // polynomial reduction factors
108
- 1, 0xdb710640, // G* = 0x04c11db7
109
- 0, 0xf7011641 // Q+ = 0x04d101df (+1 to save an additional xor operation)
102
+ __m128i xmm_t0 = _mm_clmulepi64_si128(
103
+ _mm_cvtsi32_si128(~initial),
104
+ _mm_cvtsi32_si128(0xdfded7ec),
105
+ 0
110
106
  );
111
- __m128i xmm_t1 = _mm_clmulepi64_si128(xmm_t0, reduction, 0);
112
- xmm_t1 = _mm_clmulepi64_si128(xmm_t1, reduction, 0x10);
113
107
 
114
- xmm_t0 = _mm_srli_si128(_mm_xor_si128(xmm_t0, xmm_t1), 8);
115
108
  __m256i crc0 = zext128_256(xmm_t0);
116
109
  __m256i crc1 = _mm256_setzero_si256();
117
110
 
@@ -217,13 +210,13 @@ static uint32_t do_crc32_incremental_clmul(const void* data, size_t length, uint
217
210
  return crc_fold((const unsigned char*)data, (long)length, init);
218
211
  }
219
212
 
220
- void crc_clmul256_set_funcs() {
213
+ void RapidYenc::crc_clmul256_set_funcs() {
214
+ crc_clmul_set_funcs(); // set multiply/shift function
221
215
  _do_crc32_incremental = &do_crc32_incremental_clmul;
222
216
  _crc32_isa = ISA_LEVEL_VPCLMUL;
223
217
  }
224
218
  #else
225
- void crc_clmul_set_funcs();
226
- void crc_clmul256_set_funcs() {
219
+ void RapidYenc::crc_clmul256_set_funcs() {
227
220
  crc_clmul_set_funcs();
228
221
  }
229
222
  #endif