yencode 1.1.4 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "yencode",
-  "version": "1.1.4",
+  "version": "1.2.0",
   "description": "SIMD accelerated yEnc encoder/decoder and CRC32 calculator",
   "keywords": [
     "yenc",
package/src/common.h CHANGED
@@ -221,6 +221,7 @@ bool cpu_supports_neon();
 
 #ifdef PLATFORM_X86
 enum YEncDecIsaLevel {
+	ISA_GENERIC = 0,
 	ISA_FEATURE_POPCNT = 0x1,
 	ISA_FEATURE_LZCNT = 0x2,
 	ISA_FEATURE_EVEX512 = 0x4, // AVX512 support
@@ -228,11 +229,31 @@ enum YEncDecIsaLevel {
 	ISA_LEVEL_SSSE3 = 0x200,
 	ISA_LEVEL_SSE41 = 0x300,
 	ISA_LEVEL_SSE4_POPCNT = 0x301,
+	ISA_LEVEL_PCLMUL = 0x340,
 	ISA_LEVEL_AVX = 0x381, // same as above, just used as a differentiator for `cpu_supports_isa`
 	ISA_LEVEL_AVX2 = 0x403, // also includes BMI1/2 and LZCNT
+	ISA_LEVEL_VPCLMUL = 0x440,
 	ISA_LEVEL_AVX3 = 0x507, // SKX variant; AVX512VL + AVX512BW
 	ISA_LEVEL_VBMI2 = 0x603 // ICL, AVX10
 };
+#elif defined(PLATFORM_ARM)
+enum YEncDecIsaLevel {
+	ISA_GENERIC = 0,
+	ISA_FEATURE_CRC = 8,
+	ISA_LEVEL_NEON = 0x1000
+};
+#elif defined(__riscv)
+enum YEncDecIsaLevel {
+	ISA_GENERIC = 0,
+	ISA_FEATURE_ZBC = 16,
+	ISA_LEVEL_RVV = 0x10000
+};
+#else
+enum YEncDecIsaLevel {
+	ISA_GENERIC = 0
+};
+#endif
+#ifdef PLATFORM_X86
 #ifdef _MSC_VER
 // native tuning not supported in MSVC
 # define ISA_NATIVE ISA_LEVEL_SSE2
@@ -271,8 +292,25 @@ bool cpu_supports_rvv();
 #if defined(__riscv_vector) && defined(HEDLEY_GCC_VERSION) && !HEDLEY_GCC_VERSION_CHECK(13,0,0)
 // GCC added RVV intrinsics in GCC13
 # undef __riscv_vector
+#elif defined(__riscv_vector) && defined(HEDLEY_GCC_VERSION) && !HEDLEY_GCC_VERSION_CHECK(14,0,0)
+// ...however, GCC13 lacks necessary mask<>vector vreinterpret casts, and it crashes on type punning, so I can't be bothered trying to make it work
+# undef __riscv_vector
+#endif
+#ifdef __riscv_vector
+# include <riscv_vector.h>
+# ifdef __riscv_v_intrinsic
+#  define RV(f) __riscv_##f
+# else
+#  define RV(f) f
+# endif
+# if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 12000
+#  define RV_MASK_CAST(masksz, vecsz, vec) RV(vreinterpret_v_u##vecsz##m1_b##masksz)(vec)
+#  define RV_VEC_U8MF4_CAST(vec) RV(vlmul_trunc_v_u8m1_u8mf4)(RV(vreinterpret_v_b4_u8m1)(vec))
+# else
+#  define RV_MASK_CAST(masksz, vecsz, vec) *(vbool##masksz##_t*)(&(vec))
+#  define RV_VEC_U8MF4_CAST(vec) *(vuint8mf4_t*)(&(vec))
+# endif
 #endif
-
 
 #include <string.h>
 #if !defined(_MSC_VER) || defined(_STDINT) || _MSC_VER >= 1900
package/src/crc.cc CHANGED
@@ -1,16 +1,23 @@
 #include "crc_common.h"
 
+#if defined(PLATFORM_X86) && !defined(__ILP32__) && !defined(YENC_DISABLE_CRCUTIL)
+// Use crcutil for computing CRC32 (generic implementation)
+
 #include "interface.h"
 crcutil_interface::CRC* crc = NULL;
+#define GENERIC_CRC_INIT crc = crcutil_interface::CRC::Create(0xEDB88320, 0, 32, true, 0, 0, 0, 0, NULL)
+// instance never deleted... oh well...
 
-#if defined(PLATFORM_X86) && !defined(__ILP32__)
 static uint32_t do_crc32_incremental_generic(const void* data, size_t length, uint32_t init) {
 	// use optimised ASM on x86 platforms
 	crcutil_interface::UINT64 tmp = init;
 	crc->Compute(data, length, &tmp);
 	return (uint32_t)tmp;
 }
+
 #else
+// don't use crcutil
+
 static uint32_t* HEDLEY_RESTRICT crc_slice_table;
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
 # if defined(__GNUC__) || defined(__clang__)
@@ -121,28 +128,73 @@ static void generate_crc32_slice_table() {
 	#endif
 	}
 }
+
+#define GENERIC_CRC_INIT generate_crc32_slice_table()
 #endif
 
 
-crc_func _do_crc32_incremental = &do_crc32_incremental_generic;
 
+// workaround MSVC complaining "unary minus operator applied to unsigned type, result still unsigned"
+#define NEGATE(n) (uint32_t)(-((int32_t)(n)))
+uint32_t crc32_multiply_generic(uint32_t a, uint32_t b) {
+	uint32_t res = 0;
+	for(int i=0; i<31; i++) {
+		res ^= NEGATE(b>>31) & a;
+		a = ((a >> 1) ^ (0xEDB88320 & NEGATE(a&1)));
+		b <<= 1;
+	}
+	res ^= NEGATE(b>>31) & a;
+	return res;
+}
+#undef NEGATE
 
+const uint32_t crc_power[32] = { // pre-computed 2^(2^n)
+	0x40000000, 0x20000000, 0x08000000, 0x00800000, 0x00008000, 0xedb88320, 0xb1e6b092, 0xa06a2517,
+	0xed627dae, 0x88d14467, 0xd7bbfe6a, 0xec447f11, 0x8e7ea170, 0x6427800e, 0x4d47bae0, 0x09fe548f,
+	0x83852d0f, 0x30362f1a, 0x7b5a9cc3, 0x31fec169, 0x9fec022a, 0x6c8dedc4, 0x15d6874d, 0x5fde7a4e,
+	0xbad90e37, 0x2e4e5eef, 0x4eaba214, 0xa8a472c0, 0x429a969e, 0x148d302a, 0xc40ba6d0, 0xc4e22c3c
+};
 
-uint32_t do_crc32_combine(uint32_t crc1, uint32_t crc2, size_t len2) {
-	crcutil_interface::UINT64 crc1_ = crc1, crc2_ = crc2;
-	crc->Concatenate(crc2_, 0, len2, &crc1_);
-	return (uint32_t)crc1_;
+uint32_t crc32_shift_generic(uint32_t crc1, uint32_t n) {
+	uint32_t result = crc1;
+	#ifdef __GNUC__
+	while(n) {
+		result = crc32_multiply_generic(result, crc_power[__builtin_ctz(n)]);
+		n &= n-1;
+	}
+	#elif defined(_MSC_VER)
+	unsigned long power;
+	while(_BitScanForward(&power, n)) {
+		result = crc32_multiply_generic(result, crc_power[power]);
+		n &= n-1;
+	}
+	#else
+	unsigned power = 0;
+	while(n) {
+		if(n & 1) {
+			result = crc32_multiply_generic(result, crc_power[power]);
+		}
+		n >>= 1;
+		power++;
+	}
+	#endif
+	return result;
 }
 
-uint32_t do_crc32_zeros(uint32_t crc1, size_t len) {
-	crcutil_interface::UINT64 crc_ = crc1;
-	crc->CrcOfZeroes(len, &crc_);
-	return (uint32_t)crc_;
+
+extern "C" {
+	crc_func _do_crc32_incremental = &do_crc32_incremental_generic;
+	crc_mul_func _crc32_shift = &crc32_shift_generic;
+	crc_mul_func _crc32_multiply = &crc32_multiply_generic;
+	int _crc32_isa = ISA_GENERIC;
 }
 
-void crc_clmul_set_funcs(crc_func*);
-void crc_clmul256_set_funcs(crc_func*);
-void crc_arm_set_funcs(crc_func*);
+
+
+void crc_clmul_set_funcs();
+void crc_clmul256_set_funcs();
+void crc_arm_set_funcs();
+void crc_riscv_set_funcs();
 
 #ifdef PLATFORM_X86
 int cpu_supports_crc_isa();
@@ -174,21 +226,23 @@ static unsigned long getauxval(unsigned long cap) {
 # endif
 # endif
 #endif
-void crc_init() {
-	crc = crcutil_interface::CRC::Create(
-		0xEDB88320, 0, 32, true, 0, 0, 0, 0, NULL);
-	// instance never deleted... oh well...
-
-	#if !defined(PLATFORM_X86) || defined(__ILP32__)
-	generate_crc32_slice_table();
+#if defined(__riscv) && defined(__has_include)
+# if __has_include(<asm/hwprobe.h>)
+#  include <asm/hwprobe.h>
+#  include <asm/unistd.h>
+#  include <unistd.h>
+# endif
 #endif
+
+void crc_init() {
+	GENERIC_CRC_INIT;
 
 	#ifdef PLATFORM_X86
 	int support = cpu_supports_crc_isa();
 	if(support == 2)
-		crc_clmul256_set_funcs(&_do_crc32_incremental);
+		crc_clmul256_set_funcs();
 	else if(support == 1)
-		crc_clmul_set_funcs(&_do_crc32_incremental);
+		crc_clmul_set_funcs();
 	#endif
 	#ifdef PLATFORM_ARM
 	# ifdef __APPLE__
@@ -216,7 +270,19 @@ void crc_init() {
 		false
 	# endif
 	) {
-		crc_arm_set_funcs(&_do_crc32_incremental);
+		crc_arm_set_funcs();
+	}
+	#endif
+	#ifdef __riscv
+	# if defined(RISCV_HWPROBE_KEY_IMA_EXT_0) && defined(__NR_riscv_hwprobe)
+	const int rv_hwprobe_ext_zbc = 1 << 7, rv_hwprobe_ext_zbkc = 1 << 9;
+	struct riscv_hwprobe p;
+	p.key = RISCV_HWPROBE_KEY_IMA_EXT_0;
+	if(!syscall(__NR_riscv_hwprobe, &p, 1, 0, NULL, 0)) {
+		if(p.value & (rv_hwprobe_ext_zbc | rv_hwprobe_ext_zbkc)) {
+			crc_riscv_set_funcs();
+		}
 	}
+	# endif
 	#endif
 }
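
Note on the generic fallback above: the new crc_power table holds x^(2^n) modulo the reflected CRC-32 polynomial, which is what lets crc32_shift_generic advance a CRC by an arbitrary bit count with at most 32 multiplies. The table can be regenerated by repeated squaring, in the same way as the helper program that used to sit in a comment in crc_arm.cc (removed further down). A minimal standalone sketch, using a plain-C variant of crc32_multiply_generic (illustrative only, not part of the package):

#include <stdio.h>
#include <stdint.h>

// carry-less multiply modulo the reflected CRC-32 polynomial,
// equivalent to crc32_multiply_generic in the diff above
static uint32_t crc32_mul(uint32_t a, uint32_t b) {
	uint32_t res = 0;
	for(int i = 0; i < 32; i++) {
		if(b & 0x80000000) res ^= a;                 // add a*x^i when bit i of b is set
		a = (a >> 1) ^ ((a & 1) ? 0xEDB88320 : 0);   // a *= x (mod polynomial)
		b <<= 1;
	}
	return res;
}

int main(void) {
	// regenerate crc_power: entry n is x^(2^n), obtained by repeatedly squaring x^1
	uint32_t k = 0x40000000; // x^1 in the reflected representation
	for(int i = 0; i < 32; i++) {
		printf("0x%08x,%s", k, (i % 8 == 7) ? "\n" : " ");
		k = crc32_mul(k, k);
	}
	return 0;
}

Running this reproduces the 32 constants of crc_power, starting 0x40000000, 0x20000000, 0x08000000, ...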
package/src/crc.h CHANGED
@@ -1,5 +1,6 @@
 #ifndef __YENC_CRC_H
 #define __YENC_CRC_H
+#include <stdlib.h> // for llabs
 
 #ifdef __cplusplus
 extern "C" {
@@ -9,10 +10,75 @@ extern "C" {
 
 typedef uint32_t (*crc_func)(const void*, size_t, uint32_t);
 extern crc_func _do_crc32_incremental;
+
+extern int _crc32_isa;
 #define do_crc32 (*_do_crc32_incremental)
+static inline int crc32_isa_level() {
+	return _crc32_isa;
+}
+
+
+#if !defined(__GNUC__) && defined(_MSC_VER)
+# include <intrin.h>
+#endif
+// computes `n % 0xffffffff` (well, almost), using some bit-hacks
+static inline uint32_t crc32_powmod(uint64_t n) {
+	#ifdef __GNUC__
+	unsigned res;
+	unsigned carry = __builtin_uadd_overflow(n >> 32, n, &res);
+	res += carry;
+	return res;
+	#elif defined(_MSC_VER)
+	unsigned res;
+	unsigned char carry = _addcarry_u32(0, n >> 32, n, &res);
+	_addcarry_u32(carry, res, 0, &res);
+	return res;
+	#else
+	n = (n >> 32) + (n & 0xffffffff);
+	n += n >> 32;
+	return n;
+	#endif
+}
+// computes `crc32_powmod(n*8)` avoiding overflow
+static inline uint32_t crc32_bytepow(uint64_t n) {
+	#if defined(__GNUC__) || defined(_MSC_VER)
+	unsigned res = crc32_powmod(n);
+	# ifdef _MSC_VER
+	return _rotl(res, 3);
+	# else
+	return (res << 3) | (res >> 29);
+	# endif
+	#else
+	n = (n >> 32) + (n & 0xffffffff);
+	n <<= 3;
+	n += n >> 32;
+	return n;
+	#endif
+}
+
+typedef uint32_t (*crc_mul_func)(uint32_t, uint32_t);
+extern crc_mul_func _crc32_shift;
+extern crc_mul_func _crc32_multiply;
+#define crc32_shift (*_crc32_shift)
+#define crc32_multiply (*_crc32_multiply)
+
+static inline uint32_t crc32_combine(uint32_t crc1, uint32_t crc2, uint64_t len2) {
+	return crc32_shift(crc1, crc32_bytepow(len2)) ^ crc2;
+}
+static inline uint32_t crc32_zeros(uint32_t crc1, uint64_t len) {
+	return ~crc32_shift(~crc1, crc32_bytepow(len));
+}
+static inline uint32_t crc32_unzero(uint32_t crc1, uint64_t len) {
+	return ~crc32_shift(~crc1, ~crc32_bytepow(len));
+}
+static inline uint32_t crc32_2pow(int64_t n) {
+	uint32_t sign = (uint32_t)(n >> 63);
+	return crc32_shift(0x80000000, crc32_powmod(llabs(n)) ^ sign);
+}
+static inline uint32_t crc32_256pow(uint64_t n) {
+	return crc32_shift(0x80000000, crc32_bytepow(n));
+}
 
-uint32_t do_crc32_combine(uint32_t crc1, const uint32_t crc2, size_t len2);
-uint32_t do_crc32_zeros(uint32_t crc1, size_t len);
 void crc_init();
 
 
package/src/crc_arm.cc CHANGED
@@ -59,42 +59,36 @@ HEDLEY_WARNING("CRC32 acceleration has been disabled due to missing arm_acle.h")
 #endif
 
 
+
+#ifdef __aarch64__
+uint32_t crc32_multiply_arm(uint32_t a, uint32_t b) {
+	// perform PMULL
+	uint64_t res = 0;
+	uint64_t a64 = (uint64_t)a << 32;
+	int64_t b64 = (int64_t)b << 32;
+	for(int i=0; i<32; i++) {
+		res ^= a64 & (b64 >> 63);
+		b64 += b64;
+		a64 >>= 1;
+	}
+	// reduction via CRC
+	res = __crc32w(0, res) ^ (res >> 32);
+	return res;
+}
+#endif
+// regular multiply is probably better for AArch32
+
+
 // exploit CPU pipelining during CRC computation; unfortunately I haven't been able to measure any benefit
 // - Neoverse N1: no noticeable difference
 // - Cortex A53: actually runs a bit slower
 //#define ENABLE_PIPELINE_OPT 1
 
 #ifdef ENABLE_PIPELINE_OPT
-// workaround MSVC complaining "unary minus operator applied to unsigned type, result still unsigned"
-#define NEGATE(n) (uint32_t)(-((int32_t)(n)))
-
-static HEDLEY_ALWAYS_INLINE uint32_t crc_multiply(uint32_t a, uint32_t b) {
-	uint32_t res = 0;
-	for(int i=0; i<31; i++) {
-		res ^= NEGATE(b>>31) & a;
-		a = ((a >> 1) ^ (0xEDB88320 & NEGATE(a&1)));
-		b <<= 1;
-	}
-	res ^= NEGATE(b>>31) & a;
-	return res;
-}
-
-static const uint32_t crc_power[] = { // pre-computed 2^(2^n), with first 3 entries removed (saves a shift)
-	0x00800000, 0x00008000, 0xedb88320, 0xb1e6b092, 0xa06a2517, 0xed627dae, 0x88d14467, 0xd7bbfe6a,
-	0xec447f11, 0x8e7ea170, 0x6427800e, 0x4d47bae0, 0x09fe548f, 0x83852d0f, 0x30362f1a, 0x7b5a9cc3,
-	0x31fec169, 0x9fec022a, 0x6c8dedc4, 0x15d6874d, 0x5fde7a4e, 0xbad90e37, 0x2e4e5eef, 0x4eaba214,
-	0xa8a472c0, 0x429a969e, 0x148d302a, 0xc40ba6d0, 0xc4e22c3c, 0x40000000, 0x20000000, 0x08000000
-};
-/* above table can be computed with
-	int main(void) {
-		uint32_t k = 0x80000000 >> 1;
-		for (size_t i = 0; i < 32+3; ++i) {
-			if(i>2) printf("0x%08x, ", k);
-			k = crc_multiply(k, k);
-		}
-		return 0;
-	}
-*/
+#ifndef __aarch64__
+uint32_t crc32_multiply_generic(uint32_t a, uint32_t b);
+# define crc32_multiply_arm crc32_multiply_generic
+#endif
 #endif
 
 
@@ -130,6 +124,7 @@ static uint32_t arm_crc_calc(uint32_t crc, const unsigned char *src, long len) {
 	// (this is a slightly less efficient, but much simpler implementation of the idea)
 	const unsigned SPLIT_WORDS_LOG = 10; // make sure it's at least 2
 	const unsigned SPLIT_WORDS = 1<<SPLIT_WORDS_LOG;
+	const unsigned blockCoeff = crc_power[SPLIT_WORDS_LOG + WORDSIZE_LOG + 3];
 	while(len >= (long)(sizeof(WORD_T)*SPLIT_WORDS*2)) {
 		// compute 2x CRCs concurrently to leverage piplining
 		uint32_t crc2 = 0;
@@ -148,8 +143,7 @@ static uint32_t arm_crc_calc(uint32_t crc, const unsigned char *src, long len) {
 			srcW++;
 		}
 		// merge the CRCs
-		// since we're multiplying by a fixed number, it could be sped up with some lookup tables
-		crc = crc_multiply(crc, crc_power[SPLIT_WORDS_LOG + WORDSIZE_LOG]) ^ crc2;
+		crc = crc32_multiply_arm(crc, blockCoeff) ^ crc2;
 		srcW += SPLIT_WORDS;
 		len -= sizeof(WORD_T)*SPLIT_WORDS*2;
 	}
@@ -200,11 +194,34 @@ static uint32_t do_crc32_incremental_arm(const void* data, size_t length, uint32
 	return ~arm_crc_calc(~init, (const unsigned char*)data, (long)length);
 }
 
-void crc_arm_set_funcs(crc_func* _do_crc32_incremental) {
-	*_do_crc32_incremental = &do_crc32_incremental_arm;
+
+#if defined(__aarch64__) && (defined(__GNUC__) || defined(_MSC_VER))
+uint32_t crc32_shift_arm(uint32_t crc1, uint32_t n) {
+	uint32_t result = crc1;
+	uint64_t prod = result;
+	prod <<= 32 - (n&31);
+	result = __crc32w(0, prod) ^ (prod >> 32);
+	n &= ~31;
+
+	while(n) {
+		result = crc32_multiply_arm(result, crc_power[ctz32(n)]);
+		n &= n-1;
+	}
+	return result;
 }
-#else
-void crc_arm_set_funcs(crc_func* _do_crc32_incremental) {
-	(void)_do_crc32_incremental;
+#endif
+
+
+void crc_arm_set_funcs() {
+	_do_crc32_incremental = &do_crc32_incremental_arm;
+	#ifdef __aarch64__
+	_crc32_multiply = &crc32_multiply_arm;
+	# if defined(__GNUC__) || defined(_MSC_VER)
+	_crc32_shift = &crc32_shift_arm;
+	# endif
+	#endif
+	_crc32_isa = ISA_FEATURE_CRC;
 }
+#else
+void crc_arm_set_funcs() {}
 #endif
package/src/crc_common.h CHANGED
@@ -2,3 +2,14 @@
 #include <stddef.h> // for size_t
 #include "crc.h"
 
+extern const uint32_t crc_power[32];
+
+#ifdef __GNUC__
+# define ctz32 __builtin_ctz
+#elif defined(_MSC_VER)
+static HEDLEY_ALWAYS_INLINE unsigned ctz32(uint32_t n) {
+	unsigned long r;
+	_BitScanForward(&r, n);
+	return r;
+}
+#endif
@@ -140,20 +140,10 @@ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
 	unsigned long algn_diff;
 	__m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
 
-	// TODO: consider calculating this via a LUT instead (probably faster)
-	// info from https://www.reddit.com/r/ReverseEngineering/comments/2zwhl3/mystery_constant_0x9db42487_in_intels_crc32ieee/
-	// firstly, calculate: xmm_crc0 = (intial * 0x487b9c8a) mod 0x104c11db7, where 0x487b9c8a = inverse(1<<512) mod 0x104c11db7
-	xmm_t0 = _mm_cvtsi32_si128(~initial);
-
-	xmm_t0 = _mm_clmulepi64_si128(xmm_t0, _mm_set_epi32(0, 0, 0xa273bc24, 0), 0); // reverse(0x487b9c8a)<<1 == 0xa273bc24
-	xmm_t2 = _mm_set_epi32( // polynomial reduction factors
-		1, 0xdb710640, // G* = 0x04c11db7
-		0, 0xf7011641 // Q+ = 0x04d101df (+1 to save an additional xor operation)
-	);
-	xmm_t1 = _mm_clmulepi64_si128(xmm_t0, xmm_t2, 0);
-	xmm_t1 = _mm_clmulepi64_si128(xmm_t1, xmm_t2, 0x10);
-
-	__m128i xmm_crc0 = _mm_srli_si128(_mm_xor_si128(xmm_t0, xmm_t1), 8);
+	// since the initial value will immediately be multiplied by around 2^512, we need to roll it backwards
+	// this is done by dividing the initial value by 2^480
+	// the constant used here is reverse(2^-480)<<1 == 0xdfded7ec
+	__m128i xmm_crc0 = _mm_clmulepi64_si128(_mm_cvtsi32_si128(~initial), _mm_cvtsi32_si128(0xdfded7ec), 0);
 
 	__m128i xmm_crc1 = _mm_setzero_si128();
 	__m128i xmm_crc2 = _mm_setzero_si128();
@@ -365,12 +355,159 @@ static uint32_t do_crc32_incremental_clmul(const void* data, size_t length, uint
 	return crc_fold((const unsigned char*)data, (long)length, init);
 }
 
-void crc_clmul_set_funcs(crc_func* _do_crc32_incremental) {
-	*_do_crc32_incremental = &do_crc32_incremental_clmul;
+
+static HEDLEY_ALWAYS_INLINE __m128i crc32_reduce(__m128i prod) {
+	// do Barrett reduction back into 32-bit field
+	const __m128i reduction_const = _mm_load_si128((__m128i*)crc_k + 2);
+	__m128i t = _mm_clmulepi64_si128(prod, reduction_const, 0);
+	t = _mm_clmulepi64_si128(t, reduction_const, 0x10);
+	t = _mm_xor_si128(t, prod);
+	return t;
+}
+
+uint32_t crc32_multiply_clmul(uint32_t a, uint32_t b) {
+	// do the actual multiply
+	__m128i prod = _mm_clmulepi64_si128(_mm_cvtsi32_si128(a), _mm_cvtsi32_si128(b), 0);
+
+	// prepare product for reduction
+	prod = _mm_add_epi64(prod, prod); // bit alignment fix, due to CRC32 being bit-reversal
+	prod = _mm_slli_si128(prod, 4); // straddle low/high halves across 64-bit boundary - this provides automatic truncation during reduction
+
+	prod = crc32_reduce(prod);
+	return _mm_extract_epi32(prod, 2);
 }
+
+#if defined(__GNUC__) || defined(_MSC_VER)
+static HEDLEY_ALWAYS_INLINE __m128i reverse_bits_epi8(__m128i src) {
+#if defined(__GFNI__) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
+	return _mm_gf2p8affine_epi64_epi8(src, _mm_set_epi32(
+		0x80402010, 0x08040201,
+		0x80402010, 0x08040201
+	), 0);
+/*
+#elif defined(ENABLE_AVX512)
+	// !! this only processes the bottom 32 bits !!
+	src = _mm_maskz_mov_epi32(1, src);
+	src = _mm_ternarylogic_epi32(src, _mm_slli_epi64(src, 28), _mm_set1_epi8(0xf), 0xa8); // (a|b)&c
+	src = _mm_shuffle_epi8(_mm_set_epi8(
+		-16, 112, -80, 48, -48, 80, -112, 16, -32, 96, -96, 32, -64, 64, -128, 0
+	), src);
+	return _mm_maskz_or_epi32(1, src, _mm_srli_epi64(src, 36));
+*/
 #else
-void crc_clmul_set_funcs(crc_func* _do_crc32_incremental) {
-	(void)_do_crc32_incremental;
+	__m128i xmm_t0 = _mm_and_si128(src, _mm_set1_epi8(0x0f));
+	__m128i xmm_t1 = _mm_and_si128(_mm_srli_epi16(src, 4), _mm_set1_epi8(0x0f));
+	xmm_t0 = _mm_shuffle_epi8(_mm_set_epi8(
+		-16, 112, -80, 48, -48, 80, -112, 16, -32, 96, -96, 32, -64, 64, -128, 0
+		//0xf0, 0x70, 0xb0, 0x30, 0xd0, 0x50, 0x90, 0x10, 0xe0, 0x60, 0xa0, 0x20, 0xc0, 0x40, 0x80, 0
+	), xmm_t0);
+	xmm_t1 = _mm_shuffle_epi8(_mm_set_epi8(
+		15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0
+	), xmm_t1);
+	return _mm_or_si128(xmm_t0, xmm_t1);
+#endif
 }
+
+#ifdef _MSC_VER
+// because MSVC doesn't use BSWAP unless you specifically tell it to...
+# include <stdlib.h>
+# define BSWAP32 _byteswap_ulong
+#else
+# define BSWAP32(n) ((((n)&0xff)<<24) | (((n)&0xff00)<<8) | (((n)&0xff0000)>>8) | (((n)&0xff000000)>>24))
+#endif
+
+
+
+const uint32_t crc_power_rev[32] = { // bit-reversed crc_power
+	0x00000002, 0x00000004, 0x00000010, 0x00000100, 0x00010000, 0x04c11db7, 0x490d678d, 0xe8a45605,
+	0x75be46b7, 0xe6228b11, 0x567fddeb, 0x88fe2237, 0x0e857e71, 0x7001e426, 0x075de2b2, 0xf12a7f90,
+	0xf0b4a1c1, 0x58f46c0c, 0xc3395ade, 0x96837f8c, 0x544037f9, 0x23b7b136, 0xb2e16ba8, 0x725e7bfa,
+	0xec709b5d, 0xf77a7274, 0x2845d572, 0x034e2515, 0x79695942, 0x540cb128, 0x0b65d023, 0x3c344723
+};
+
+static HEDLEY_ALWAYS_INLINE __m128i crc32_shift_clmul_mulred(unsigned pos, __m128i prod) {
+	// this multiplies a 64-bit `prod` with a 32-bit CRC power
+	// compared with crc32_multiply_clmul, this only reduces the result to 64-bit, saving a multiply
+	__m128i coeff = _mm_cvtsi32_si128(crc_power_rev[pos]);
+
+	const __m128i fold_const = _mm_set_epi32(0, 0x490d678d, 0, 0xf200aa66);
+	prod = _mm_clmulepi64_si128(prod, coeff, 0);
+	__m128i hi = _mm_clmulepi64_si128(prod, fold_const, 0x11);
+	return _mm_xor_si128(hi, prod);
+}
+
+uint32_t crc32_shift_clmul(uint32_t crc1, uint32_t n) {
+	if(!n) return crc1;
+
+	__m128i result = _mm_cvtsi32_si128(BSWAP32(crc1));
+	result = reverse_bits_epi8(result);
+
+	// handle n < 32 with a shift
+	result = _mm_sll_epi64(result, _mm_cvtsi32_si128(n & 31));
+	n &= ~31;
+
+	__m128i t;
+	if(n) {
+		// use a second accumulator to leverage some IPC from slow CLMUL
+		__m128i result2 = _mm_cvtsi32_si128(crc_power_rev[ctz32(n)]);
+		n &= n-1;
+
+		if(n) {
+			// first multiply doesn't need reduction
+			result2 = _mm_clmulepi64_si128(result2, _mm_cvtsi32_si128(crc_power_rev[ctz32(n)]), 0);
+			n &= n-1;
+
+			while(n) {
+				result = crc32_shift_clmul_mulred(ctz32(n), result);
+				n &= n-1;
+
+				if(n) {
+					result2 = crc32_shift_clmul_mulred(ctz32(n), result2);
+					n &= n-1;
+				}
+			}
+		}
+
+		const __m128i fold_const = _mm_set_epi32(0, 0x490d678d, 0, 0xf200aa66);
+
+		// merge two results
+		result = _mm_clmulepi64_si128(result, result2, 0);
+
+		// do 128b reduction
+		t = _mm_unpackhi_epi32(result, _mm_setzero_si128());
+		// fold [127:96] -> [63:0]
+		__m128i hi = _mm_clmulepi64_si128(t, fold_const, 1);
+		// fold [95:64] -> [63:0]
+		__m128i lo = _mm_clmulepi64_si128(t, fold_const, 0x10);
+		#ifdef ENABLE_AVX512
+		result = _mm_ternarylogic_epi32(result, hi, lo, 0x96);
+		#else
+		result = _mm_xor_si128(result, hi);
+		result = _mm_xor_si128(result, lo);
+		#endif
+	}
+
+	// do Barrett reduction back into 32-bit field
+	const __m128i reduction_const = _mm_set_epi32(0, 0x04c11db7, 1, 0x04d101df);
+	t = _mm_clmulepi64_si128(_mm_blend_epi16(_mm_setzero_si128(), result, 0x3c), reduction_const, 0);
+	t = _mm_clmulepi64_si128(t, reduction_const, 0x11);
+	result = _mm_xor_si128(t, result);
+
+	result = reverse_bits_epi8(result);
+	return BSWAP32(_mm_cvtsi128_si32(result));
+}
+#endif
+
+
+void crc_clmul_set_funcs() {
+	_do_crc32_incremental = &do_crc32_incremental_clmul;
+	_crc32_multiply = &crc32_multiply_clmul;
+	#if defined(__GNUC__) || defined(_MSC_VER)
+	_crc32_shift = &crc32_shift_clmul;
+	#endif
+	_crc32_isa = ISA_LEVEL_PCLMUL;
+}
+#else
+void crc_clmul_set_funcs() {}
 #endif
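
reverse_bits_epi8 in the hunk above reverses the bits of every byte using two 16-entry nibble lookups (one PSHUFB per nibble), or a single GF2P8AFFINEQB when GFNI is available; combined with BSWAP32 this gives the full 32-bit bit reversal that crc32_shift_clmul needs on entry and exit. For illustration only (not part of the package), a scalar sketch of the same nibble-table trick:

#include <stdint.h>
#include <stdio.h>

// bit-reversed low nibble, placed in the high nibble (same values as the first PSHUFB table above)
static const uint8_t rev_lo[16] = {
	0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0
};
// bit-reversed high nibble, placed in the low nibble (same values as the second PSHUFB table above)
static const uint8_t rev_hi[16] = {
	0x00, 0x08, 0x04, 0x0c, 0x02, 0x0a, 0x06, 0x0e, 0x01, 0x09, 0x05, 0x0d, 0x03, 0x0b, 0x07, 0x0f
};

static uint8_t reverse_bits_u8(uint8_t b) {
	return rev_lo[b & 0x0f] | rev_hi[b >> 4];
}

int main(void) {
	printf("%02x %02x\n", reverse_bits_u8(0x01), reverse_bits_u8(0x12)); // prints "80 48"
	return 0;
}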