yencode 1.1.5 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "yencode",
-  "version": "1.1.5",
+  "version": "1.2.0",
   "description": "SIMD accelerated yEnc encoder/decoder and CRC32 calculator",
   "keywords": [
     "yenc",
package/src/common.h CHANGED
@@ -245,6 +245,7 @@ enum YEncDecIsaLevel {
 #elif defined(__riscv)
 enum YEncDecIsaLevel {
 	ISA_GENERIC = 0,
+	ISA_FEATURE_ZBC = 16,
 	ISA_LEVEL_RVV = 0x10000
 };
 #else
@@ -291,8 +292,25 @@ bool cpu_supports_rvv();
 #if defined(__riscv_vector) && defined(HEDLEY_GCC_VERSION) && !HEDLEY_GCC_VERSION_CHECK(13,0,0)
 // GCC added RVV intrinsics in GCC13
 # undef __riscv_vector
+#elif defined(__riscv_vector) && defined(HEDLEY_GCC_VERSION) && !HEDLEY_GCC_VERSION_CHECK(14,0,0)
+// ...however, GCC13 lacks necessary mask<>vector vreinterpret casts, and it crashes on type punning, so I can't be bothered trying to make it work
+# undef __riscv_vector
+#endif
+#ifdef __riscv_vector
+# include <riscv_vector.h>
+# ifdef __riscv_v_intrinsic
+#  define RV(f) __riscv_##f
+# else
+#  define RV(f) f
+# endif
+# if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 12000
+#  define RV_MASK_CAST(masksz, vecsz, vec) RV(vreinterpret_v_u##vecsz##m1_b##masksz)(vec)
+#  define RV_VEC_U8MF4_CAST(vec) RV(vlmul_trunc_v_u8m1_u8mf4)(RV(vreinterpret_v_b4_u8m1)(vec))
+# else
+#  define RV_MASK_CAST(masksz, vecsz, vec) *(vbool##masksz##_t*)(&(vec))
+#  define RV_VEC_U8MF4_CAST(vec) *(vuint8mf4_t*)(&(vec))
+# endif
 #endif
-
 
 #include <string.h>
 #if !defined(_MSC_VER) || defined(_STDINT) || _MSC_VER >= 1900
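For reference, the RV() wrapper selects between the versioned (__riscv_-prefixed) and legacy RVV intrinsic names, and RV_MASK_CAST/RV_VEC_U8MF4_CAST paper over the mask<>vector reinterpret casts that only exist from intrinsics v0.12 onward. A minimal usage sketch, assuming a toolchain with the v0.12 RVV intrinsics and a hypothetical caller holding byte-wise comparison results in a vuint8m1_t:

	#ifdef __riscv_vector
	static inline vbool4_t example_mask_cast(vuint8m1_t vec) {
		// expands to __riscv_vreinterpret_v_u8m1_b4(vec) on v0.12+ intrinsics,
		// or to pointer-based type punning (*(vbool4_t*)&vec) on older compilers
		return RV_MASK_CAST(4, 8, vec);
	}
	#endif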
package/src/crc.cc CHANGED
@@ -1,16 +1,23 @@
 #include "crc_common.h"
 
+#if defined(PLATFORM_X86) && !defined(__ILP32__) && !defined(YENC_DISABLE_CRCUTIL)
+// Use crcutil for computing CRC32 (generic implementation)
+
 #include "interface.h"
 crcutil_interface::CRC* crc = NULL;
+#define GENERIC_CRC_INIT crc = crcutil_interface::CRC::Create(0xEDB88320, 0, 32, true, 0, 0, 0, 0, NULL)
+// instance never deleted... oh well...
 
-#if defined(PLATFORM_X86) && !defined(__ILP32__)
 static uint32_t do_crc32_incremental_generic(const void* data, size_t length, uint32_t init) {
 	// use optimised ASM on x86 platforms
 	crcutil_interface::UINT64 tmp = init;
 	crc->Compute(data, length, &tmp);
 	return (uint32_t)tmp;
 }
+
 #else
+// don't use crcutil
+
 static uint32_t* HEDLEY_RESTRICT crc_slice_table;
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
 # if defined(__GNUC__) || defined(__clang__)
@@ -121,29 +128,73 @@ static void generate_crc32_slice_table() {
 #endif
 	}
 }
+
+#define GENERIC_CRC_INIT generate_crc32_slice_table()
 #endif
 
-extern "C" {
-	crc_func _do_crc32_incremental = &do_crc32_incremental_generic;
-	int _crc32_isa = ISA_GENERIC;
+
+
+// workaround MSVC complaining "unary minus operator applied to unsigned type, result still unsigned"
+#define NEGATE(n) (uint32_t)(-((int32_t)(n)))
+uint32_t crc32_multiply_generic(uint32_t a, uint32_t b) {
+	uint32_t res = 0;
+	for(int i=0; i<31; i++) {
+		res ^= NEGATE(b>>31) & a;
+		a = ((a >> 1) ^ (0xEDB88320 & NEGATE(a&1)));
+		b <<= 1;
+	}
+	res ^= NEGATE(b>>31) & a;
+	return res;
 }
+#undef NEGATE
 
+const uint32_t crc_power[32] = { // pre-computed 2^(2^n)
+	0x40000000, 0x20000000, 0x08000000, 0x00800000, 0x00008000, 0xedb88320, 0xb1e6b092, 0xa06a2517,
+	0xed627dae, 0x88d14467, 0xd7bbfe6a, 0xec447f11, 0x8e7ea170, 0x6427800e, 0x4d47bae0, 0x09fe548f,
+	0x83852d0f, 0x30362f1a, 0x7b5a9cc3, 0x31fec169, 0x9fec022a, 0x6c8dedc4, 0x15d6874d, 0x5fde7a4e,
+	0xbad90e37, 0x2e4e5eef, 0x4eaba214, 0xa8a472c0, 0x429a969e, 0x148d302a, 0xc40ba6d0, 0xc4e22c3c
+};
 
-uint32_t do_crc32_combine(uint32_t crc1, uint32_t crc2, size_t len2) {
-	crcutil_interface::UINT64 crc1_ = crc1, crc2_ = crc2;
-	crc->Concatenate(crc2_, 0, len2, &crc1_);
-	return (uint32_t)crc1_;
+uint32_t crc32_shift_generic(uint32_t crc1, uint32_t n) {
+	uint32_t result = crc1;
+#ifdef __GNUC__
+	while(n) {
+		result = crc32_multiply_generic(result, crc_power[__builtin_ctz(n)]);
+		n &= n-1;
+	}
+#elif defined(_MSC_VER)
+	unsigned long power;
+	while(_BitScanForward(&power, n)) {
+		result = crc32_multiply_generic(result, crc_power[power]);
+		n &= n-1;
+	}
+#else
+	unsigned power = 0;
+	while(n) {
+		if(n & 1) {
+			result = crc32_multiply_generic(result, crc_power[power]);
+		}
+		n >>= 1;
+		power++;
+	}
+#endif
+	return result;
 }
 
-uint32_t do_crc32_zeros(uint32_t crc1, size_t len) {
-	crcutil_interface::UINT64 crc_ = crc1;
-	crc->CrcOfZeroes(len, &crc_);
-	return (uint32_t)crc_;
+
+extern "C" {
+	crc_func _do_crc32_incremental = &do_crc32_incremental_generic;
+	crc_mul_func _crc32_shift = &crc32_shift_generic;
+	crc_mul_func _crc32_multiply = &crc32_multiply_generic;
+	int _crc32_isa = ISA_GENERIC;
 }
 
+
+
 void crc_clmul_set_funcs();
 void crc_clmul256_set_funcs();
 void crc_arm_set_funcs();
+void crc_riscv_set_funcs();
 
 #ifdef PLATFORM_X86
 int cpu_supports_crc_isa();
@@ -175,14 +226,16 @@ static unsigned long getauxval(unsigned long cap) {
 # endif
 # endif
 #endif
-void crc_init() {
-	crc = crcutil_interface::CRC::Create(
-		0xEDB88320, 0, 32, true, 0, 0, 0, 0, NULL);
-	// instance never deleted... oh well...
-	
-	#if !defined(PLATFORM_X86) || defined(__ILP32__)
-	generate_crc32_slice_table();
+#if defined(__riscv) && defined(__has_include)
+# if __has_include(<asm/hwprobe.h>)
+#  include <asm/hwprobe.h>
+#  include <asm/unistd.h>
+#  include <unistd.h>
+# endif
 #endif
+
+void crc_init() {
+	GENERIC_CRC_INIT;
 	
 	#ifdef PLATFORM_X86
 	int support = cpu_supports_crc_isa();
@@ -220,4 +273,16 @@ void crc_init() {
 		crc_arm_set_funcs();
 	}
 	#endif
+	#ifdef __riscv
+	# if defined(RISCV_HWPROBE_KEY_IMA_EXT_0) && defined(__NR_riscv_hwprobe)
+	const int rv_hwprobe_ext_zbc = 1 << 7, rv_hwprobe_ext_zbkc = 1 << 9;
+	struct riscv_hwprobe p;
+	p.key = RISCV_HWPROBE_KEY_IMA_EXT_0;
+	if(!syscall(__NR_riscv_hwprobe, &p, 1, 0, NULL, 0)) {
+		if(p.value & (rv_hwprobe_ext_zbc | rv_hwprobe_ext_zbkc)) {
+			crc_riscv_set_funcs();
+		}
+	}
+	# endif
+	#endif
 }
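The new crc_power[] table holds x^(2^n) modulo the reflected CRC-32 polynomial. A minimal sketch of how the table can be regenerated, adapted from the generator program this release removes (as a comment) from crc_arm.cc, and assuming it is linked against crc.cc so crc32_multiply_generic is available:

	#include <stdio.h>
	#include <stdint.h>
	
	uint32_t crc32_multiply_generic(uint32_t a, uint32_t b); // defined in crc.cc
	
	int main(void) {
		uint32_t k = 0x80000000 >> 1; // x^1 in the reflected bit order
		for (int i = 0; i < 32; i++) {
			printf("0x%08x, ", k);            // crc_power[i] == x^(2^i) mod P
			k = crc32_multiply_generic(k, k); // squaring doubles the exponent
		}
		return 0;
	}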
package/src/crc.h CHANGED
@@ -1,5 +1,6 @@
 #ifndef __YENC_CRC_H
 #define __YENC_CRC_H
+#include <stdlib.h> // for llabs
 
 #ifdef __cplusplus
 extern "C" {
@@ -9,17 +10,78 @@ extern "C" {
 
 typedef uint32_t (*crc_func)(const void*, size_t, uint32_t);
 extern crc_func _do_crc32_incremental;
+
 extern int _crc32_isa;
 #define do_crc32 (*_do_crc32_incremental)
-
-uint32_t do_crc32_combine(uint32_t crc1, const uint32_t crc2, size_t len2);
-uint32_t do_crc32_zeros(uint32_t crc1, size_t len);
-void crc_init();
 static inline int crc32_isa_level() {
 	return _crc32_isa;
 }
 
 
+#if !defined(__GNUC__) && defined(_MSC_VER)
+# include <intrin.h>
+#endif
+// computes `n % 0xffffffff` (well, almost), using some bit-hacks
+static inline uint32_t crc32_powmod(uint64_t n) {
+#ifdef __GNUC__
+	unsigned res;
+	unsigned carry = __builtin_uadd_overflow(n >> 32, n, &res);
+	res += carry;
+	return res;
+#elif defined(_MSC_VER)
+	unsigned res;
+	unsigned char carry = _addcarry_u32(0, n >> 32, n, &res);
+	_addcarry_u32(carry, res, 0, &res);
+	return res;
+#else
+	n = (n >> 32) + (n & 0xffffffff);
+	n += n >> 32;
+	return n;
+#endif
+}
+// computes `crc32_powmod(n*8)` avoiding overflow
+static inline uint32_t crc32_bytepow(uint64_t n) {
+#if defined(__GNUC__) || defined(_MSC_VER)
+	unsigned res = crc32_powmod(n);
+# ifdef _MSC_VER
+	return _rotl(res, 3);
+# else
+	return (res << 3) | (res >> 29);
+# endif
+#else
+	n = (n >> 32) + (n & 0xffffffff);
+	n <<= 3;
+	n += n >> 32;
+	return n;
+#endif
+}
+
+typedef uint32_t (*crc_mul_func)(uint32_t, uint32_t);
+extern crc_mul_func _crc32_shift;
+extern crc_mul_func _crc32_multiply;
+#define crc32_shift (*_crc32_shift)
+#define crc32_multiply (*_crc32_multiply)
+
+static inline uint32_t crc32_combine(uint32_t crc1, uint32_t crc2, uint64_t len2) {
+	return crc32_shift(crc1, crc32_bytepow(len2)) ^ crc2;
+}
+static inline uint32_t crc32_zeros(uint32_t crc1, uint64_t len) {
+	return ~crc32_shift(~crc1, crc32_bytepow(len));
+}
+static inline uint32_t crc32_unzero(uint32_t crc1, uint64_t len) {
+	return ~crc32_shift(~crc1, ~crc32_bytepow(len));
+}
+static inline uint32_t crc32_2pow(int64_t n) {
+	uint32_t sign = (uint32_t)(n >> 63);
+	return crc32_shift(0x80000000, crc32_powmod(llabs(n)) ^ sign);
+}
+static inline uint32_t crc32_256pow(uint64_t n) {
+	return crc32_shift(0x80000000, crc32_bytepow(n));
+}
+
+void crc_init();
+
+
 
 #ifdef __cplusplus
 }
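These inline helpers replace the removed do_crc32_combine()/do_crc32_zeros() exports, routing everything through the runtime-dispatched crc32_shift/crc32_multiply pointers. A minimal usage sketch with a hypothetical caller, assuming both per-block CRCs are computed with an initial value of 0:

	#include "crc.h"
	
	// CRC of the concatenation a||b, without touching the bytes of `a` again
	static uint32_t crc_of_concat(const void* a, size_t alen, const void* b, size_t blen) {
		uint32_t crcA = do_crc32(a, alen, 0);
		uint32_t crcB = do_crc32(b, blen, 0);
		return crc32_combine(crcA, crcB, blen); // expected to equal do_crc32(b, blen, crcA)
	}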
package/src/crc_arm.cc CHANGED
@@ -59,42 +59,36 @@ HEDLEY_WARNING("CRC32 acceleration has been disabled due to missing arm_acle.h")
 #endif
 
 
+
+#ifdef __aarch64__
+uint32_t crc32_multiply_arm(uint32_t a, uint32_t b) {
+	// perform PMULL
+	uint64_t res = 0;
+	uint64_t a64 = (uint64_t)a << 32;
+	int64_t b64 = (int64_t)b << 32;
+	for(int i=0; i<32; i++) {
+		res ^= a64 & (b64 >> 63);
+		b64 += b64;
+		a64 >>= 1;
+	}
+	// reduction via CRC
+	res = __crc32w(0, res) ^ (res >> 32);
+	return res;
+}
+#endif
+// regular multiply is probably better for AArch32
+
+
 // exploit CPU pipelining during CRC computation; unfortunately I haven't been able to measure any benefit
 // - Neoverse N1: no noticeable difference
 // - Cortex A53: actually runs a bit slower
 //#define ENABLE_PIPELINE_OPT 1
 
 #ifdef ENABLE_PIPELINE_OPT
-// workaround MSVC complaining "unary minus operator applied to unsigned type, result still unsigned"
-#define NEGATE(n) (uint32_t)(-((int32_t)(n)))
-
-static HEDLEY_ALWAYS_INLINE uint32_t crc_multiply(uint32_t a, uint32_t b) {
-	uint32_t res = 0;
-	for(int i=0; i<31; i++) {
-		res ^= NEGATE(b>>31) & a;
-		a = ((a >> 1) ^ (0xEDB88320 & NEGATE(a&1)));
-		b <<= 1;
-	}
-	res ^= NEGATE(b>>31) & a;
-	return res;
-}
-
-static const uint32_t crc_power[] = { // pre-computed 2^(2^n), with first 3 entries removed (saves a shift)
-	0x00800000, 0x00008000, 0xedb88320, 0xb1e6b092, 0xa06a2517, 0xed627dae, 0x88d14467, 0xd7bbfe6a,
-	0xec447f11, 0x8e7ea170, 0x6427800e, 0x4d47bae0, 0x09fe548f, 0x83852d0f, 0x30362f1a, 0x7b5a9cc3,
-	0x31fec169, 0x9fec022a, 0x6c8dedc4, 0x15d6874d, 0x5fde7a4e, 0xbad90e37, 0x2e4e5eef, 0x4eaba214,
-	0xa8a472c0, 0x429a969e, 0x148d302a, 0xc40ba6d0, 0xc4e22c3c, 0x40000000, 0x20000000, 0x08000000
-};
-/* above table can be computed with
-	int main(void) {
-		uint32_t k = 0x80000000 >> 1;
-		for (size_t i = 0; i < 32+3; ++i) {
-			if(i>2) printf("0x%08x, ", k);
-			k = crc_multiply(k, k);
-		}
-		return 0;
-	}
-*/
+#ifndef __aarch64__
+uint32_t crc32_multiply_generic(uint32_t a, uint32_t b);
+# define crc32_multiply_arm crc32_multiply_generic
+#endif
 #endif
 
 
@@ -130,6 +124,7 @@ static uint32_t arm_crc_calc(uint32_t crc, const unsigned char *src, long len) {
 	// (this is a slightly less efficient, but much simpler implementation of the idea)
 	const unsigned SPLIT_WORDS_LOG = 10; // make sure it's at least 2
 	const unsigned SPLIT_WORDS = 1<<SPLIT_WORDS_LOG;
+	const unsigned blockCoeff = crc_power[SPLIT_WORDS_LOG + WORDSIZE_LOG + 3];
 	while(len >= (long)(sizeof(WORD_T)*SPLIT_WORDS*2)) {
 		// compute 2x CRCs concurrently to leverage piplining
 		uint32_t crc2 = 0;
@@ -148,8 +143,7 @@ static uint32_t arm_crc_calc(uint32_t crc, const unsigned char *src, long len) {
 			srcW++;
 		}
 		// merge the CRCs
-		// since we're multiplying by a fixed number, it could be sped up with some lookup tables
-		crc = crc_multiply(crc, crc_power[SPLIT_WORDS_LOG + WORDSIZE_LOG]) ^ crc2;
+		crc = crc32_multiply_arm(crc, blockCoeff) ^ crc2;
 		srcW += SPLIT_WORDS;
 		len -= sizeof(WORD_T)*SPLIT_WORDS*2;
 	}
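A note on the index arithmetic behind blockCoeff: crc_power[n] holds x^(2^n) modulo the CRC-32 polynomial, so

	crc_power[SPLIT_WORDS_LOG + WORDSIZE_LOG + 3] == x^(2^SPLIT_WORDS_LOG * 2^WORDSIZE_LOG * 8)
	                                              == x^(8 * sizeof(WORD_T) * SPLIT_WORDS)

which is exactly the shift needed to advance `crc` past the SPLIT_WORDS-word block covered by `crc2`. The `+ 3` compensates for the old per-file table having dropped its first three entries; the shared crc_power table in crc.cc starts at x^(2^0).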
@@ -200,8 +194,32 @@ static uint32_t do_crc32_incremental_arm(const void* data, size_t length, uint32
 	return ~arm_crc_calc(~init, (const unsigned char*)data, (long)length);
 }
 
+
+#if defined(__aarch64__) && (defined(__GNUC__) || defined(_MSC_VER))
+uint32_t crc32_shift_arm(uint32_t crc1, uint32_t n) {
+	uint32_t result = crc1;
+	uint64_t prod = result;
+	prod <<= 32 - (n&31);
+	result = __crc32w(0, prod) ^ (prod >> 32);
+	n &= ~31;
+	
+	while(n) {
+		result = crc32_multiply_arm(result, crc_power[ctz32(n)]);
+		n &= n-1;
+	}
+	return result;
+}
+#endif
+
+
 void crc_arm_set_funcs() {
 	_do_crc32_incremental = &do_crc32_incremental_arm;
+	#ifdef __aarch64__
+	_crc32_multiply = &crc32_multiply_arm;
+	# if defined(__GNUC__) || defined(_MSC_VER)
+	_crc32_shift = &crc32_shift_arm;
+	# endif
+	#endif
 	_crc32_isa = ISA_FEATURE_CRC;
 }
 #else
package/src/crc_common.h CHANGED
@@ -2,3 +2,14 @@
2
2
  #include <stddef.h> // for size_t
3
3
  #include "crc.h"
4
4
 
5
+ extern const uint32_t crc_power[32];
6
+
7
+ #ifdef __GNUC__
8
+ # define ctz32 __builtin_ctz
9
+ #elif defined(_MSC_VER)
10
+ static HEDLEY_ALWAYS_INLINE unsigned ctz32(uint32_t n) {
11
+ unsigned long r;
12
+ _BitScanForward(&r, n);
13
+ return r;
14
+ }
15
+ #endif
@@ -140,20 +140,10 @@ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
 	unsigned long algn_diff;
 	__m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
 
-	// TODO: consider calculating this via a LUT instead (probably faster)
-	// info from https://www.reddit.com/r/ReverseEngineering/comments/2zwhl3/mystery_constant_0x9db42487_in_intels_crc32ieee/
-	// firstly, calculate: xmm_crc0 = (intial * 0x487b9c8a) mod 0x104c11db7, where 0x487b9c8a = inverse(1<<512) mod 0x104c11db7
-	xmm_t0 = _mm_cvtsi32_si128(~initial);
-	
-	xmm_t0 = _mm_clmulepi64_si128(xmm_t0, _mm_set_epi32(0, 0, 0xa273bc24, 0), 0); // reverse(0x487b9c8a)<<1 == 0xa273bc24
-	xmm_t2 = _mm_set_epi32( // polynomial reduction factors
-		1, 0xdb710640, // G* = 0x04c11db7
-		0, 0xf7011641 // Q+ = 0x04d101df (+1 to save an additional xor operation)
-	);
-	xmm_t1 = _mm_clmulepi64_si128(xmm_t0, xmm_t2, 0);
-	xmm_t1 = _mm_clmulepi64_si128(xmm_t1, xmm_t2, 0x10);
-	
-	__m128i xmm_crc0 = _mm_srli_si128(_mm_xor_si128(xmm_t0, xmm_t1), 8);
+	// since the initial value will immediately be multiplied by around 2^512, we need to roll it backwards
+	// this is done by dividing the initial value by 2^480
+	// the constant used here is reverse(2^-480)<<1 == 0xdfded7ec
+	__m128i xmm_crc0 = _mm_clmulepi64_si128(_mm_cvtsi32_si128(~initial), _mm_cvtsi32_si128(0xdfded7ec), 0);
 
 	__m128i xmm_crc1 = _mm_setzero_si128();
 	__m128i xmm_crc2 = _mm_setzero_si128();
@@ -365,8 +355,156 @@ static uint32_t do_crc32_incremental_clmul(const void* data, size_t length, uint
 	return crc_fold((const unsigned char*)data, (long)length, init);
 }
 
+
+static HEDLEY_ALWAYS_INLINE __m128i crc32_reduce(__m128i prod) {
+	// do Barrett reduction back into 32-bit field
+	const __m128i reduction_const = _mm_load_si128((__m128i*)crc_k + 2);
+	__m128i t = _mm_clmulepi64_si128(prod, reduction_const, 0);
+	t = _mm_clmulepi64_si128(t, reduction_const, 0x10);
+	t = _mm_xor_si128(t, prod);
+	return t;
+}
+
+uint32_t crc32_multiply_clmul(uint32_t a, uint32_t b) {
+	// do the actual multiply
+	__m128i prod = _mm_clmulepi64_si128(_mm_cvtsi32_si128(a), _mm_cvtsi32_si128(b), 0);
+	
+	// prepare product for reduction
+	prod = _mm_add_epi64(prod, prod); // bit alignment fix, due to CRC32 being bit-reversal
+	prod = _mm_slli_si128(prod, 4); // straddle low/high halves across 64-bit boundary - this provides automatic truncation during reduction
+	
+	prod = crc32_reduce(prod);
+	return _mm_extract_epi32(prod, 2);
+}
+
+#if defined(__GNUC__) || defined(_MSC_VER)
+static HEDLEY_ALWAYS_INLINE __m128i reverse_bits_epi8(__m128i src) {
+#if defined(__GFNI__) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
+	return _mm_gf2p8affine_epi64_epi8(src, _mm_set_epi32(
+		0x80402010, 0x08040201,
+		0x80402010, 0x08040201
+	), 0);
+	/*
+#elif defined(ENABLE_AVX512)
+	// !! this only processes the bottom 32 bits !!
+	src = _mm_maskz_mov_epi32(1, src);
+	src = _mm_ternarylogic_epi32(src, _mm_slli_epi64(src, 28), _mm_set1_epi8(0xf), 0xa8); // (a|b)&c
+	src = _mm_shuffle_epi8(_mm_set_epi8(
+		-16, 112, -80, 48, -48, 80, -112, 16, -32, 96, -96, 32, -64, 64, -128, 0
+	), src);
+	return _mm_maskz_or_epi32(1, src, _mm_srli_epi64(src, 36));
+	*/
+#else
+	__m128i xmm_t0 = _mm_and_si128(src, _mm_set1_epi8(0x0f));
+	__m128i xmm_t1 = _mm_and_si128(_mm_srli_epi16(src, 4), _mm_set1_epi8(0x0f));
+	xmm_t0 = _mm_shuffle_epi8(_mm_set_epi8(
+		-16, 112, -80, 48, -48, 80, -112, 16, -32, 96, -96, 32, -64, 64, -128, 0
+		//0xf0, 0x70, 0xb0, 0x30, 0xd0, 0x50, 0x90, 0x10, 0xe0, 0x60, 0xa0, 0x20, 0xc0, 0x40, 0x80, 0
+	), xmm_t0);
+	xmm_t1 = _mm_shuffle_epi8(_mm_set_epi8(
+		15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0
+	), xmm_t1);
+	return _mm_or_si128(xmm_t0, xmm_t1);
+#endif
+}
+
+#ifdef _MSC_VER
+// because MSVC doesn't use BSWAP unless you specifically tell it to...
+# include <stdlib.h>
+# define BSWAP32 _byteswap_ulong
+#else
+# define BSWAP32(n) ((((n)&0xff)<<24) | (((n)&0xff00)<<8) | (((n)&0xff0000)>>8) | (((n)&0xff000000)>>24))
+#endif
+
+
+
+const uint32_t crc_power_rev[32] = { // bit-reversed crc_power
+	0x00000002, 0x00000004, 0x00000010, 0x00000100, 0x00010000, 0x04c11db7, 0x490d678d, 0xe8a45605,
+	0x75be46b7, 0xe6228b11, 0x567fddeb, 0x88fe2237, 0x0e857e71, 0x7001e426, 0x075de2b2, 0xf12a7f90,
+	0xf0b4a1c1, 0x58f46c0c, 0xc3395ade, 0x96837f8c, 0x544037f9, 0x23b7b136, 0xb2e16ba8, 0x725e7bfa,
+	0xec709b5d, 0xf77a7274, 0x2845d572, 0x034e2515, 0x79695942, 0x540cb128, 0x0b65d023, 0x3c344723
+};
+
+static HEDLEY_ALWAYS_INLINE __m128i crc32_shift_clmul_mulred(unsigned pos, __m128i prod) {
+	// this multiplies a 64-bit `prod` with a 32-bit CRC power
+	// compared with crc32_multiply_clmul, this only reduces the result to 64-bit, saving a multiply
+	__m128i coeff = _mm_cvtsi32_si128(crc_power_rev[pos]);
+	
+	const __m128i fold_const = _mm_set_epi32(0, 0x490d678d, 0, 0xf200aa66);
+	prod = _mm_clmulepi64_si128(prod, coeff, 0);
+	__m128i hi = _mm_clmulepi64_si128(prod, fold_const, 0x11);
+	return _mm_xor_si128(hi, prod);
+}
+
+uint32_t crc32_shift_clmul(uint32_t crc1, uint32_t n) {
+	if(!n) return crc1;
+	
+	__m128i result = _mm_cvtsi32_si128(BSWAP32(crc1));
+	result = reverse_bits_epi8(result);
+	
+	// handle n < 32 with a shift
+	result = _mm_sll_epi64(result, _mm_cvtsi32_si128(n & 31));
+	n &= ~31;
+	
+	__m128i t;
+	if(n) {
+		// use a second accumulator to leverage some IPC from slow CLMUL
+		__m128i result2 = _mm_cvtsi32_si128(crc_power_rev[ctz32(n)]);
+		n &= n-1;
+		
+		if(n) {
+			// first multiply doesn't need reduction
+			result2 = _mm_clmulepi64_si128(result2, _mm_cvtsi32_si128(crc_power_rev[ctz32(n)]), 0);
+			n &= n-1;
+			
+			while(n) {
+				result = crc32_shift_clmul_mulred(ctz32(n), result);
+				n &= n-1;
+				
+				if(n) {
+					result2 = crc32_shift_clmul_mulred(ctz32(n), result2);
+					n &= n-1;
+				}
+			}
+		}
+		
+		const __m128i fold_const = _mm_set_epi32(0, 0x490d678d, 0, 0xf200aa66);
+		
+		// merge two results
+		result = _mm_clmulepi64_si128(result, result2, 0);
+		
+		// do 128b reduction
+		t = _mm_unpackhi_epi32(result, _mm_setzero_si128());
+		// fold [127:96] -> [63:0]
+		__m128i hi = _mm_clmulepi64_si128(t, fold_const, 1);
+		// fold [95:64] -> [63:0]
+		__m128i lo = _mm_clmulepi64_si128(t, fold_const, 0x10);
+		#ifdef ENABLE_AVX512
+		result = _mm_ternarylogic_epi32(result, hi, lo, 0x96);
+		#else
+		result = _mm_xor_si128(result, hi);
+		result = _mm_xor_si128(result, lo);
+		#endif
+	}
+	
+	// do Barrett reduction back into 32-bit field
+	const __m128i reduction_const = _mm_set_epi32(0, 0x04c11db7, 1, 0x04d101df);
+	t = _mm_clmulepi64_si128(_mm_blend_epi16(_mm_setzero_si128(), result, 0x3c), reduction_const, 0);
+	t = _mm_clmulepi64_si128(t, reduction_const, 0x11);
+	result = _mm_xor_si128(t, result);
+	
+	result = reverse_bits_epi8(result);
+	return BSWAP32(_mm_cvtsi128_si32(result));
+}
+#endif
+
+
 void crc_clmul_set_funcs() {
 	_do_crc32_incremental = &do_crc32_incremental_clmul;
+	_crc32_multiply = &crc32_multiply_clmul;
+	#if defined(__GNUC__) || defined(_MSC_VER)
+	_crc32_shift = &crc32_shift_clmul;
+	#endif
 	_crc32_isa = ISA_LEVEL_PCLMUL;
 }
 #else
@@ -1,6 +1,8 @@
 // 256-bit version of crc_folding
 
 #include "crc_common.h"
+
+void crc_clmul_set_funcs();
 
 #if !defined(YENC_DISABLE_AVX256) && ((defined(__VPCLMULQDQ__) && defined(__AVX2__) && defined(__PCLMUL__)) || (defined(_MSC_VER) && _MSC_VER >= 1920 && defined(PLATFORM_X86) && !defined(__clang__)))
 #include <inttypes.h>
@@ -99,19 +101,12 @@ ALIGN_TO(16, static const unsigned crc_k[]) = {
 
 
 static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
-	// info from https://www.reddit.com/r/ReverseEngineering/comments/2zwhl3/mystery_constant_0x9db42487_in_intels_crc32ieee/
-	// firstly, calculate: xmm_crc0 = (intial * 0x487b9c8a) mod 0x104c11db7, where 0x487b9c8a = inverse(1<<512) mod 0x104c11db7
-	__m128i xmm_t0 = _mm_cvtsi32_si128(~initial);
-	
-	xmm_t0 = _mm_clmulepi64_si128(xmm_t0, _mm_set_epi32(0, 0, 0xa273bc24, 0), 0); // reverse(0x487b9c8a)<<1 == 0xa273bc24
-	__m128i reduction = _mm_set_epi32( // polynomial reduction factors
-		1, 0xdb710640, // G* = 0x04c11db7
-		0, 0xf7011641 // Q+ = 0x04d101df (+1 to save an additional xor operation)
+	__m128i xmm_t0 = _mm_clmulepi64_si128(
+		_mm_cvtsi32_si128(~initial),
+		_mm_cvtsi32_si128(0xdfded7ec),
+		0
 	);
-	__m128i xmm_t1 = _mm_clmulepi64_si128(xmm_t0, reduction, 0);
-	xmm_t1 = _mm_clmulepi64_si128(xmm_t1, reduction, 0x10);
 	
-	xmm_t0 = _mm_srli_si128(_mm_xor_si128(xmm_t0, xmm_t1), 8);
 	__m256i crc0 = zext128_256(xmm_t0);
 	__m256i crc1 = _mm256_setzero_si256();
 
@@ -218,11 +213,11 @@ static uint32_t do_crc32_incremental_clmul(const void* data, size_t length, uint
 }
 
 void crc_clmul256_set_funcs() {
+	crc_clmul_set_funcs(); // set multiply/shift function
 	_do_crc32_incremental = &do_crc32_incremental_clmul;
 	_crc32_isa = ISA_LEVEL_VPCLMUL;
 }
 #else
-void crc_clmul_set_funcs();
 void crc_clmul256_set_funcs() {
 	crc_clmul_set_funcs();
 }