yencode 1.1.0 → 1.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/crc_arm.cc CHANGED
@@ -1,15 +1,98 @@
- #include "common.h"
  #include "crc_common.h"

- #if defined(__ARM_FEATURE_CRC32) || defined(_M_ARM64) /* TODO: AArch32 for MSVC? */
+ #if defined(PLATFORM_ARM) && defined(_MSC_VER) && defined(__clang__) && !defined(__ARM_FEATURE_CRC32)
+ // I don't think GYP provides a nice way to detect whether MSVC or clang-cl is being used, but it doesn't use clang-cl by default, so a warning here is probably sufficient
+ HEDLEY_WARNING("CRC32 acceleration has not been enabled under ARM clang-cl by default; add `-march=armv8-a+crc` to additional compiler arguments to enable");
+ #endif
+
+ // disable CRC on GCC versions with broken arm_acle.h
+ #if defined(__ARM_FEATURE_CRC32) && defined(HEDLEY_GCC_VERSION)
+ # if !defined(__aarch64__) && HEDLEY_GCC_VERSION_CHECK(7,0,0) && !HEDLEY_GCC_VERSION_CHECK(8,1,1)
+ # undef __ARM_FEATURE_CRC32
+ HEDLEY_WARNING("CRC32 acceleration has been disabled due to broken arm_acle.h shipped in GCC 7.0 - 8.1 [https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81497]. If you need this feature, please use a different compiler or version of GCC");
+ # endif
+ # if defined(__aarch64__) && HEDLEY_GCC_VERSION_CHECK(9,4,0) && !HEDLEY_GCC_VERSION_CHECK(9,5,0)
+ # undef __ARM_FEATURE_CRC32
+ HEDLEY_WARNING("CRC32 acceleration has been disabled due to broken arm_acle.h shipped in GCC 9.4 [https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100985]. If you need this feature, please use a different compiler or version of GCC");
+ # endif
+ #endif
+
+ #if defined(__ARM_FEATURE_CRC32) || (defined(_M_ARM64) && !defined(__clang__)) // MSVC doesn't support CRC for ARM32

  /* ARMv8 accelerated CRC */
- #ifdef _MSC_VER
+ #if defined(_MSC_VER) && !defined(__clang__)
  #include <intrin.h>
  #else
  #include <arm_acle.h>
  #endif

+
+ #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+ # ifdef __GNUC__
+ # define _LE16 __builtin_bswap16
+ # define _LE32 __builtin_bswap32
+ # define _LE64 __builtin_bswap64
+ # else
+ // currently not supported
+ # error No endian swap intrinsic defined
+ # endif
+ #else
+ # define _LE16(x) (x)
+ # define _LE32(x) (x)
+ # define _LE64(x) (x)
+ #endif
+
+ #ifdef __aarch64__
+ # define WORD_T uint64_t
+ # define WORDSIZE_LOG 3 // sizeof(WORD_T) == 1<<WORDSIZE_LOG
+ # define CRC_WORD(crc, data) __crc32d(crc, _LE64(data))
+ #else
+ # define WORD_T uint32_t
+ # define WORDSIZE_LOG 2 // sizeof(WORD_T) == 1<<WORDSIZE_LOG
+ # define CRC_WORD(crc, data) __crc32w(crc, _LE32(data))
+ #endif
+
+
+ // exploit CPU pipelining during CRC computation; unfortunately I haven't been able to measure any benefit
+ // - Neoverse N1: no noticeable difference
+ // - Cortex A53: actually runs a bit slower
+ //#define ENABLE_PIPELINE_OPT 1
+
+ #ifdef ENABLE_PIPELINE_OPT
+ // workaround MSVC complaining "unary minus operator applied to unsigned type, result still unsigned"
+ #define NEGATE(n) (uint32_t)(-((int32_t)(n)))
+
+ static HEDLEY_ALWAYS_INLINE uint32_t crc_multiply(uint32_t a, uint32_t b) {
+ uint32_t res = 0;
+ for(int i=0; i<31; i++) {
+ res ^= NEGATE(b>>31) & a;
+ a = ((a >> 1) ^ (0xEDB88320 & NEGATE(a&1)));
+ b <<= 1;
+ }
+ res ^= NEGATE(b>>31) & a;
+ return res;
+ }
+
+ static const uint32_t crc_power[] = { // pre-computed 2^n, with first 3 entries removed (saves a shift)
+ 0x00800000, 0x00008000, 0xedb88320, 0xb1e6b092, 0xa06a2517, 0xed627dae, 0x88d14467, 0xd7bbfe6a,
+ 0xec447f11, 0x8e7ea170, 0x6427800e, 0x4d47bae0, 0x09fe548f, 0x83852d0f, 0x30362f1a, 0x7b5a9cc3,
+ 0x31fec169, 0x9fec022a, 0x6c8dedc4, 0x15d6874d, 0x5fde7a4e, 0xbad90e37, 0x2e4e5eef, 0x4eaba214,
+ 0xa8a472c0, 0x429a969e, 0x148d302a, 0xc40ba6d0, 0xc4e22c3c, 0x40000000, 0x20000000, 0x08000000
+ };
+ /* above table can be computed with
+ int main(void) {
+ uint32_t k = 0x80000000 >> 1;
+ for (size_t i = 0; i < 32+3; ++i) {
+ if(i>2) printf("0x%08x, ", k);
+ k = crc_multiply(k, k);
+ }
+ return 0;
+ }
+ */
+ #endif
+
+
+
  // inspired/stolen off https://github.com/jocover/crc32_armv8/blob/master/crc32_armv8.c
  static uint32_t arm_crc_calc(uint32_t crc, const unsigned char *src, long len) {

@@ -21,35 +104,84 @@ static uint32_t arm_crc_calc(uint32_t crc, const unsigned char *src, long len) {
  len--;
  }
  if ((uintptr_t)src & sizeof(uint16_t)) {
- crc = __crc32h(crc, *((uint16_t *)src));
+ crc = __crc32h(crc, _LE16(*((uint16_t *)src)));
  src += sizeof(uint16_t);
  len -= sizeof(uint16_t);
  }
-
  #ifdef __aarch64__
  if ((uintptr_t)src & sizeof(uint32_t)) {
- crc = __crc32w(crc, *((uint32_t *)src));
+ crc = __crc32w(crc, _LE32(*((uint32_t *)src)));
  src += sizeof(uint32_t);
  len -= sizeof(uint32_t);
  }
+ #endif
  }
- while ((len -= sizeof(uint64_t)) >= 0) {
- crc = __crc32d(crc, *((uint64_t *)src));
- src += sizeof(uint64_t);
+
+ const WORD_T* srcW = (const WORD_T*)src;
+
+ #ifdef ENABLE_PIPELINE_OPT
+ // uses ideas from https://github.com/komrad36/crc#option-13-golden
+ // (this is a slightly less efficient, but much simpler implementation of the idea)
+ const unsigned SPLIT_WORDS_LOG = 10; // make sure it's at least 2
+ const unsigned SPLIT_WORDS = 1<<SPLIT_WORDS_LOG;
+ while(len >= (long)(sizeof(WORD_T)*SPLIT_WORDS*2)) {
+ // compute 2x CRCs concurrently to leverage pipelining
+ uint32_t crc2 = 0;
+ for(unsigned i=0; i<SPLIT_WORDS; i+=4) {
+ crc = CRC_WORD(crc, *srcW);
+ crc2 = CRC_WORD(crc2, *(srcW + SPLIT_WORDS));
+ srcW++;
+ crc = CRC_WORD(crc, *srcW);
+ crc2 = CRC_WORD(crc2, *(srcW + SPLIT_WORDS));
+ srcW++;
+ crc = CRC_WORD(crc, *srcW);
+ crc2 = CRC_WORD(crc2, *(srcW + SPLIT_WORDS));
+ srcW++;
+ crc = CRC_WORD(crc, *srcW);
+ crc2 = CRC_WORD(crc2, *(srcW + SPLIT_WORDS));
+ srcW++;
+ }
+ // merge the CRCs
+ // since we're multiplying by a fixed number, it could be sped up with some lookup tables
+ crc = crc_multiply(crc, crc_power[SPLIT_WORDS_LOG + WORDSIZE_LOG]) ^ crc2;
+ srcW += SPLIT_WORDS;
+ len -= sizeof(WORD_T)*SPLIT_WORDS*2;
  }
- if (len & sizeof(uint32_t)) {
- crc = __crc32w(crc, *((uint32_t *)src));
- src += sizeof(uint32_t);
+ #endif
+
+ while ((len -= sizeof(WORD_T)*8) >= 0) {
+ crc = CRC_WORD(crc, *(srcW++));
+ crc = CRC_WORD(crc, *(srcW++));
+ crc = CRC_WORD(crc, *(srcW++));
+ crc = CRC_WORD(crc, *(srcW++));
+ crc = CRC_WORD(crc, *(srcW++));
+ crc = CRC_WORD(crc, *(srcW++));
+ crc = CRC_WORD(crc, *(srcW++));
+ crc = CRC_WORD(crc, *(srcW++));
  }
- #else
+ if (len & sizeof(WORD_T)*4) {
+ crc = CRC_WORD(crc, *(srcW++));
+ crc = CRC_WORD(crc, *(srcW++));
+ crc = CRC_WORD(crc, *(srcW++));
+ crc = CRC_WORD(crc, *(srcW++));
+ }
+ if (len & sizeof(WORD_T)*2) {
+ crc = CRC_WORD(crc, *(srcW++));
+ crc = CRC_WORD(crc, *(srcW++));
+ }
+ if (len & sizeof(WORD_T)) {
+ crc = CRC_WORD(crc, *(srcW++));
  }
- while ((len -= sizeof(uint32_t)) >= 0) {
- crc = __crc32w(crc, *((uint32_t *)src));
+ src = (const unsigned char*)srcW;
+
+ #ifdef __aarch64__
+ if (len & sizeof(uint32_t)) {
+ crc = __crc32w(crc, _LE32(*((uint32_t *)src)));
  src += sizeof(uint32_t);
  }
  #endif
  if (len & sizeof(uint16_t)) {
- crc = __crc32h(crc, *((uint16_t *)src));
+ crc = __crc32h(crc, _LE16(*((uint16_t *)src)));
  src += sizeof(uint16_t);
  }
  if (len & sizeof(uint8_t))
@@ -58,20 +190,15 @@ static uint32_t arm_crc_calc(uint32_t crc, const unsigned char *src, long len) {
  return crc;
  }

- static void do_crc32_arm(const void* data, size_t length, unsigned char out[4]) {
- uint32_t crc = arm_crc_calc(~0, (const unsigned char*)data, (long)length);
- UNPACK_4(out, ~crc);
- }
- static void do_crc32_incremental_arm(const void* data, size_t length, unsigned char init[4]) {
- uint32_t crc = PACK_4(init);
- crc = arm_crc_calc(~crc, (const unsigned char*)data, (long)length);
- UNPACK_4(init, ~crc);
+ static uint32_t do_crc32_incremental_arm(const void* data, size_t length, uint32_t init) {
+ return ~arm_crc_calc(~init, (const unsigned char*)data, (long)length);
  }

- void crc_arm_set_funcs(crc_func* _do_crc32, crc_func* _do_crc32_incremental) {
- *_do_crc32 = &do_crc32_arm;
+ void crc_arm_set_funcs(crc_func* _do_crc32_incremental) {
  *_do_crc32_incremental = &do_crc32_incremental_arm;
  }
  #else
- void crc_arm_set_funcs(crc_func* _do_crc32, crc_func* _do_crc32_incremental) {}
+ void crc_arm_set_funcs(crc_func* _do_crc32_incremental) {
+ (void)_do_crc32_incremental;
+ }
  #endif
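
The merge step in the (currently disabled) ENABLE_PIPELINE_OPT path above relies on the identity that processing a second block from a zero CRC register, then adding the first block's register multiplied by x^(8·len) in the CRC ring, gives the same register as processing both blocks in sequence. The standalone sketch below is not part of the package: crc32_update and x_pow_8n are helper names invented for the demonstration, and crc_multiply mirrors the bit-serial routine added above. It simply checks that identity numerically.

// Standalone sanity check (not part of the package): for M = A||B,
//   reg(M, init) == crc_multiply(reg(A, init), x^(8*len(B))) ^ reg(B, 0)
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t crc32_update(uint32_t crc, const unsigned char* p, size_t n) {
    // bitwise CRC-32 register update (reflected polynomial 0xEDB88320), no init/final inversion
    for (size_t i = 0; i < n; i++) {
        crc ^= p[i];
        for (int b = 0; b < 8; b++)
            crc = (crc >> 1) ^ (0xEDB88320 & (uint32_t)-(int32_t)(crc & 1));
    }
    return crc;
}
static uint32_t crc_multiply(uint32_t a, uint32_t b) {
    // GF(2) polynomial multiplication modulo the CRC-32 polynomial (0x80000000 represents 1)
    uint32_t res = 0;
    for (int i = 0; i < 32; i++) {
        res ^= (uint32_t)-(int32_t)(b >> 31) & a;
        a = (a >> 1) ^ (0xEDB88320 & (uint32_t)-(int32_t)(a & 1));
        b <<= 1;
    }
    return res;
}
static uint32_t x_pow_8n(size_t n) {
    // x^(8n) by square-and-multiply; 0x00800000 is x^8, the first crc_power[] entry above
    uint32_t res = 0x80000000, p = 0x00800000;
    while (n) {
        if (n & 1) res = crc_multiply(res, p);
        p = crc_multiply(p, p);
        n >>= 1;
    }
    return res;
}

int main(void) {
    unsigned char data[1024];
    for (size_t i = 0; i < sizeof(data); i++) data[i] = (unsigned char)(i * 31 + 7);

    const size_t split = 300;
    uint32_t whole = crc32_update(0xFFFFFFFF, data, sizeof(data));
    uint32_t reg_a = crc32_update(0xFFFFFFFF, data, split);               // first block, normal init
    uint32_t reg_b = crc32_update(0, data + split, sizeof(data) - split); // second block, from a zero register
    uint32_t merged = crc_multiply(reg_a, x_pow_8n(sizeof(data) - split)) ^ reg_b;

    assert(merged == whole);
    printf("merge OK, crc32 = %08x\n", (unsigned)~whole); // ~register is the finalised CRC value
    return 0;
}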
package/src/crc_common.h CHANGED
@@ -1,11 +1,4 @@
-
- #define PACK_4(arr) (((uint_fast32_t)arr[0] << 24) | ((uint_fast32_t)arr[1] << 16) | ((uint_fast32_t)arr[2] << 8) | (uint_fast32_t)arr[3])
- #define UNPACK_4(arr, val) { \
- arr[0] = (unsigned char)(val >> 24) & 0xFF; \
- arr[1] = (unsigned char)(val >> 16) & 0xFF; \
- arr[2] = (unsigned char)(val >> 8) & 0xFF; \
- arr[3] = (unsigned char)val & 0xFF; \
- }
-
+ #include "common.h"
  #include <stddef.h> // for size_t
- typedef void (*crc_func)(const void*, size_t, unsigned char[4]);
+ #include "crc.h"
+
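
The removed PACK_4/UNPACK_4 macros reflect the interface change visible throughout this diff: CRC state is now passed around as a plain uint32_t rather than a 4-byte big-endian array, and the crc_func typedef has moved into crc.h (not shown here). A caller that still wants the old byte layout could convert with helpers along the lines below; crc_pack4/crc_unpack4 are illustrative names, not functions provided by the package.

#include <stdint.h>

static inline uint32_t crc_pack4(const unsigned char in[4]) {
    // same semantics as the removed PACK_4 macro (big-endian byte order)
    return ((uint32_t)in[0] << 24) | ((uint32_t)in[1] << 16) | ((uint32_t)in[2] << 8) | (uint32_t)in[3];
}
static inline void crc_unpack4(unsigned char out[4], uint32_t val) {
    // same semantics as the removed UNPACK_4 macro
    out[0] = (unsigned char)(val >> 24);
    out[1] = (unsigned char)(val >> 16);
    out[2] = (unsigned char)(val >> 8);
    out[3] = (unsigned char)val;
}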
@@ -19,44 +19,29 @@

  #include "crc_common.h"

- #if !defined(_MSC_VER) || defined(_STDINT) || _MSC_VER >= 1900
- # include <stdint.h>
- #else
- /* Workaround for older MSVC not supporting stdint.h - just pull it from V8 */
- # include <v8.h>
- #endif
-
- #if (defined(__PCLMUL__) && defined(__SSSE3__) && defined(__SSE4_1__)) || (defined(_MSC_VER) && _MSC_VER >= 1600)
+ #if (defined(__PCLMUL__) && defined(__SSSE3__) && defined(__SSE4_1__)) || (defined(_MSC_VER) && _MSC_VER >= 1600 && defined(PLATFORM_X86) && !defined(__clang__))
  #include <inttypes.h>
  #include <immintrin.h>
  #include <wmmintrin.h>

- #define local static

- #ifdef _MSC_VER
- # define ALIGN(_a, v) __declspec(align(_a)) v
- /* Because we don't have dynamic dispatch for AVX, disable it for MSVC builds (only use AVX for -march=native style builds) */
- # undef __AVX__
- # undef __AVX512F__
- # undef __AVX512VL__
- # undef __GFNI__
- #else
- # define ALIGN(_a, v) v __attribute__((aligned(_a)))
+ #if defined(__AVX512VL__) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
+ # define ENABLE_AVX512 1
  #endif


  // interestingly, MSVC seems to generate better code if using VXORPS over VPXOR
  // original Intel code uses XORPS for many XOR operations, but PXOR is pretty much always better (more port freedom on Intel CPUs). The only advantage of XORPS is that it's 1 byte shorter, an advantage which disappears under AVX as both instructions have the same length
- #ifdef __AVX__
+ #if defined(__AVX__) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
  # define fold_xor _mm_xor_si128
  #else
- local __m128i fold_xor(__m128i a, __m128i b) {
+ static __m128i fold_xor(__m128i a, __m128i b) {
  return _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b)));
  }
  #endif

- #ifdef __AVX512VL__
- local __m128i do_one_fold_merge(__m128i src, __m128i data) {
+ #ifdef ENABLE_AVX512
+ static __m128i do_one_fold_merge(__m128i src, __m128i data) {
  const __m128i xmm_fold4 = _mm_set_epi32(
  0x00000001, 0x54442bd4,
  0x00000001, 0xc6e41596);
@@ -68,7 +53,7 @@ local __m128i do_one_fold_merge(__m128i src, __m128i data) {
  );
  }
  #else
- local __m128i do_one_fold(__m128i src) {
+ static __m128i do_one_fold(__m128i src) {
  const __m128i xmm_fold4 = _mm_set_epi32(
  0x00000001, 0x54442bd4,
  0x00000001, 0xc6e41596);
@@ -79,7 +64,7 @@ local __m128i do_one_fold(__m128i src) {
  }
  #endif

- ALIGN(32, local const unsigned pshufb_shf_table[60]) = {
+ ALIGN_TO(32, static const unsigned pshufb_shf_table[60]) = {
  0x84838281, 0x88878685, 0x8c8b8a89, 0x008f8e8d, /* shl 15 (16 - 1)/shr1 */
  0x85848382, 0x89888786, 0x8d8c8b8a, 0x01008f8e, /* shl 14 (16 - 3)/shr2 */
  0x86858483, 0x8a898887, 0x8e8d8c8b, 0x0201008f, /* shl 13 (16 - 4)/shr3 */
@@ -97,7 +82,7 @@ ALIGN(32, local const unsigned pshufb_shf_table[60]) = {
  0x0201008f, 0x06050403, 0x0a090807, 0x0e0d0c0b /* shl 1 (16 -15)/shr15*/
  };

- local void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1,
+ static void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1,
  __m128i *xmm_crc2, __m128i *xmm_crc3, __m128i *xmm_crc_part) {

  const __m128i xmm_mask3 = _mm_set1_epi32(0x80808080);
@@ -127,7 +112,7 @@ local void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1,
  *xmm_crc_part = _mm_shuffle_epi8(*xmm_crc_part, xmm_shl);
  *xmm_crc3 = _mm_or_si128(*xmm_crc3, *xmm_crc_part);

- #ifdef __AVX512VL__
+ #ifdef ENABLE_AVX512
  *xmm_crc3 = do_one_fold_merge(xmm_a0_0, *xmm_crc3);
  #else
  *xmm_crc3 = fold_xor(
@@ -137,74 +122,38 @@ local void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1,
  #endif
  }

- ALIGN(16, local const unsigned crc_k[]) = {
+ ALIGN_TO(16, static const unsigned crc_k[]) = {
  0xccaa009e, 0x00000000, /* rk1 */
  0x751997d0, 0x00000001, /* rk2 */
  0xccaa009e, 0x00000000, /* rk5 */
  0x63cd6124, 0x00000001, /* rk6 */
- 0xf7011640, 0x00000001, /* rk7 */
+ 0xf7011641, 0x00000000, /* rk7 */
  0xdb710640, 0x00000001 /* rk8 */
  };

- ALIGN(16, local const unsigned crc_mask[4]) = {
- 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000
- };
-
- ALIGN(16, local const unsigned crc_mask2[4]) = {
+ ALIGN_TO(16, static const unsigned crc_mask[4]) = {
  0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
  };

- local __m128i reverse_bits_epi8(__m128i src) {
- #ifdef __GFNI__
- return _mm_gf2p8affine_epi64_epi8(src, _mm_set_epi32(
- 0x80402010, 0x08040201,
- 0x80402010, 0x08040201
- ), 0);
- #else
- __m128i xmm_t0 = _mm_and_si128(src, _mm_set1_epi8(0x0f));
- __m128i xmm_t1 = _mm_and_si128(_mm_srli_epi16(src, 4), _mm_set1_epi8(0x0f));
- xmm_t0 = _mm_shuffle_epi8(_mm_set_epi8(
- 0xf0, 0x70, 0xb0, 0x30, 0xd0, 0x50, 0x90, 0x10, 0xe0, 0x60, 0xa0, 0x20, 0xc0, 0x40, 0x80, 0
- ), xmm_t0);
- xmm_t1 = _mm_shuffle_epi8(_mm_set_epi8(
- 15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0
- ), xmm_t1);
- return _mm_or_si128(xmm_t0, xmm_t1);
- #endif
- }
-
- #ifdef _MSC_VER
- // because MSVC doesn't use BSWAP unless you specifically tell it to...
- # include <stdlib.h>
- # define BSWAP32 _byteswap_ulong
- #else
- # define BSWAP32(n) ((((n)&0xff)<<24) | (((n)&0xff00)<<8) | (((n)&0xff0000)>>8) | (((n)&0xff000000)>>24))
- #endif

- local uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
+ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
  unsigned long algn_diff;
  __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;

  // TODO: consider calculating this via a LUT instead (probably faster)
  // info from https://www.reddit.com/r/ReverseEngineering/comments/2zwhl3/mystery_constant_0x9db42487_in_intels_crc32ieee/
  // firstly, calculate: xmm_crc0 = (initial * 0x487b9c8a) mod 0x104c11db7, where 0x487b9c8a = inverse(1<<512) mod 0x104c11db7
+ xmm_t0 = _mm_cvtsi32_si128(~initial);

- // reverse input bits + load into XMM register
- uint32_t init_t = BSWAP32(initial);
- xmm_t0 = reverse_bits_epi8(_mm_cvtsi32_si128(~init_t));
-
- xmm_t0 = _mm_clmulepi64_si128(xmm_t0, _mm_cvtsi32_si128(0x487b9c8a), 0);
- xmm_t1 = _mm_and_si128(xmm_t0, _mm_set_epi32(-1,-1,-1,0)); // shifted up by 32bits to avoid shifts by using clmul's capability to select top 64bits instead
+ xmm_t0 = _mm_clmulepi64_si128(xmm_t0, _mm_set_epi32(0, 0, 0xa273bc24, 0), 0); // reverse(0x487b9c8a)<<1 == 0xa273bc24
  xmm_t2 = _mm_set_epi32( // polynomial reduction factors
- 0, 0x04c11db7, // G*
- 1, 0x04d101df // Q+
+ 1, 0xdb710640, // G* = 0x04c11db7
+ 0, 0xf7011641 // Q+ = 0x04d101df (+1 to save an additional xor operation)
  );
- xmm_t1 = _mm_clmulepi64_si128(xmm_t1, xmm_t2, 0);
- xmm_t1 = _mm_clmulepi64_si128(xmm_t1, xmm_t2, 0x11);
+ xmm_t1 = _mm_clmulepi64_si128(xmm_t0, xmm_t2, 0);
+ xmm_t1 = _mm_clmulepi64_si128(xmm_t1, xmm_t2, 0x10);

- __m128i xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_t1);
- // reverse bits
- xmm_crc0 = _mm_shuffle_epi8(reverse_bits_epi8(xmm_crc0), _mm_set_epi32(-1,-1,-1,0x00010203));
+ __m128i xmm_crc0 = _mm_srli_si128(_mm_xor_si128(xmm_t0, xmm_t1), 8);

  __m128i xmm_crc1 = _mm_setzero_si128();
  __m128i xmm_crc2 = _mm_setzero_si128();
@@ -214,7 +163,8 @@ local uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
  if (len < 16) {
  if (len == 0)
  return initial;
- xmm_crc_part = _mm_loadu_si128((__m128i *)src);
+ xmm_crc_part = _mm_setzero_si128();
+ memcpy(&xmm_crc_part, src, len);
  goto partial;
  }

@@ -229,13 +179,13 @@ local uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
  &xmm_crc_part);
  }

- while ((len -= 64) >= 0) {
+ while (len >= 64) {
  xmm_t0 = _mm_load_si128((__m128i *)src);
  xmm_t1 = _mm_load_si128((__m128i *)src + 1);
  xmm_t2 = _mm_load_si128((__m128i *)src + 2);
  xmm_t3 = _mm_load_si128((__m128i *)src + 3);

- #ifdef __AVX512VL__
+ #ifdef ENABLE_AVX512
  xmm_crc0 = do_one_fold_merge(xmm_crc0, xmm_t0);
  xmm_crc1 = do_one_fold_merge(xmm_crc1, xmm_t1);
  xmm_crc2 = do_one_fold_merge(xmm_crc2, xmm_t2);
@@ -253,20 +203,18 @@ local uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
  #endif

  src += 64;
+ len -= 64;
  }

- /*
- * len = num bytes left - 64
- */
- if (len + 16 >= 0) {
- len += 16;
+ if (len >= 48) {
+ len -= 48;

  xmm_t0 = _mm_load_si128((__m128i *)src);
  xmm_t1 = _mm_load_si128((__m128i *)src + 1);
  xmm_t2 = _mm_load_si128((__m128i *)src + 2);

  xmm_t3 = xmm_crc3;
- #ifdef __AVX512VL__
+ #ifdef ENABLE_AVX512
  xmm_crc3 = do_one_fold_merge(xmm_crc2, xmm_t2);
  xmm_crc2 = do_one_fold_merge(xmm_crc1, xmm_t1);
  xmm_crc1 = do_one_fold_merge(xmm_crc0, xmm_t0);
@@ -284,15 +232,15 @@ local uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
  goto done;

  xmm_crc_part = _mm_load_si128((__m128i *)src + 3);
- } else if (len + 32 >= 0) {
- len += 32;
+ } else if (len >= 32) {
+ len -= 32;

  xmm_t0 = _mm_load_si128((__m128i *)src);
  xmm_t1 = _mm_load_si128((__m128i *)src + 1);

  xmm_t2 = xmm_crc2;
  xmm_t3 = xmm_crc3;
- #ifdef __AVX512VL__
+ #ifdef ENABLE_AVX512
  xmm_crc3 = do_one_fold_merge(xmm_crc1, xmm_t1);
  xmm_crc2 = do_one_fold_merge(xmm_crc0, xmm_t0);
  #else
@@ -308,13 +256,13 @@ local uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
  goto done;

  xmm_crc_part = _mm_load_si128((__m128i *)src + 2);
- } else if (len + 48 >= 0) {
- len += 48;
+ } else if (len >= 16) {
+ len -= 16;

  xmm_t0 = _mm_load_si128((__m128i *)src);

  xmm_t3 = xmm_crc3;
- #ifdef __AVX512VL__
+ #ifdef ENABLE_AVX512
  xmm_crc3 = do_one_fold_merge(xmm_crc0, xmm_t0);
  #else
  xmm_crc3 = _mm_xor_si128(do_one_fold(xmm_crc0), xmm_t0);
@@ -328,7 +276,6 @@ local uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {

  xmm_crc_part = _mm_load_si128((__m128i *)src + 1);
  } else {
- len += 64;
  if (len == 0)
  goto done;
  xmm_crc_part = _mm_load_si128((__m128i *)src);
@@ -339,8 +286,7 @@ partial:
  &xmm_crc_part);
  done:
  {
- const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask);
- const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
+ const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask);
  __m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;

  /*
@@ -350,7 +296,7 @@ done:

  x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
  xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
- #ifdef __AVX512VL__
+ #ifdef ENABLE_AVX512
  xmm_crc1 = _mm_ternarylogic_epi32(xmm_crc1, x_tmp0, xmm_crc0, 0x96);
  #else
  xmm_crc1 = _mm_xor_si128(xmm_crc1, x_tmp0);
@@ -359,7 +305,7 @@ done:

  x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10);
  xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01);
- #ifdef __AVX512VL__
+ #ifdef ENABLE_AVX512
  xmm_crc2 = _mm_ternarylogic_epi32(xmm_crc2, x_tmp1, xmm_crc1, 0x96);
  #else
  xmm_crc2 = _mm_xor_si128(xmm_crc2, x_tmp1);
@@ -368,7 +314,7 @@ done:

  x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10);
  xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01);
- #ifdef __AVX512VL__
+ #ifdef ENABLE_AVX512
  xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, x_tmp2, xmm_crc2, 0x96);
  #else
  xmm_crc3 = _mm_xor_si128(xmm_crc3, x_tmp2);
@@ -388,58 +334,43 @@ done:
  xmm_crc0 = xmm_crc3;
  xmm_crc3 = _mm_slli_si128(xmm_crc3, 4);
  xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
- #ifdef __AVX512VL__
+ #ifdef ENABLE_AVX512
  //xmm_crc3 = _mm_maskz_xor_epi32(14, xmm_crc3, xmm_crc0);
- xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, xmm_crc0, xmm_mask2, 0x28);
+ xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, xmm_crc0, xmm_mask, 0x28);
  #else
+ xmm_crc0 = _mm_and_si128(xmm_crc0, xmm_mask);
  xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
- xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask2);
  #endif

  /*
  * k7
  */
  xmm_crc1 = xmm_crc3;
- xmm_crc2 = xmm_crc3;
  crc_fold = _mm_load_si128((__m128i *)crc_k + 2);

  xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
- #ifdef __AVX512VL__
- //xmm_crc3 = _mm_maskz_xor_epi32(3, xmm_crc3, xmm_crc2);
- xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, xmm_crc2, xmm_mask, 0x28);
- #else
- xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
- xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask);
- #endif
-
- xmm_crc2 = xmm_crc3;
  xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
- #ifdef __AVX512VL__
- xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, xmm_crc2, xmm_crc1, 0x69); // NOT(double-XOR)
- return _mm_extract_epi32(xmm_crc3, 2);
+ #ifdef ENABLE_AVX512
+ xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, xmm_crc1, xmm_crc1, 0xC3); // NOT(xmm_crc3 ^ xmm_crc1)
  #else
- xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
+ xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_mask);
  xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);
- return ~_mm_extract_epi32(xmm_crc3, 2);
  #endif
+ return _mm_extract_epi32(xmm_crc3, 2);
  }

  }

- static void do_crc32_clmul(const void* data, size_t length, unsigned char out[4]) {
- uint32_t tmp = crc_fold((const unsigned char*)data, (long)length, 0);
- UNPACK_4(out, tmp);
- }
- static void do_crc32_incremental_clmul(const void* data, size_t length, unsigned char init[4]) {
- uint32_t tmp = crc_fold((const unsigned char*)data, (long)length, PACK_4(init));
- UNPACK_4(init, tmp);
+ static uint32_t do_crc32_incremental_clmul(const void* data, size_t length, uint32_t init) {
+ return crc_fold((const unsigned char*)data, (long)length, init);
  }

- void crc_clmul_set_funcs(crc_func* _do_crc32, crc_func* _do_crc32_incremental) {
- *_do_crc32 = &do_crc32_clmul;
+ void crc_clmul_set_funcs(crc_func* _do_crc32_incremental) {
  *_do_crc32_incremental = &do_crc32_incremental_clmul;
  }
  #else
- void crc_clmul_set_funcs(crc_func* _do_crc32, crc_func* _do_crc32_incremental) {}
+ void crc_clmul_set_funcs(crc_func* _do_crc32_incremental) {
+ (void)_do_crc32_incremental;
+ }
  #endif
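
For context, here is a hedged sketch of how the reworked single-pointer setup functions are presumably consumed. The real dispatcher lives in crc.cc/crc.h, which this diff does not include, so the crc_func typedef, crc_init() and crc32_fallback() below are assumptions inferred from the handler signatures above, not the package's actual code. It also illustrates why the separate do_crc32 one-shot handlers could be dropped: starting the incremental function from init = 0 yields the same value.

#include <stddef.h>
#include <stdint.h>

typedef uint32_t (*crc_func)(const void*, size_t, uint32_t); // assumed to match crc.h

// setup entry points shown in this diff; each one overwrites the pointer only when
// its ISA-specific implementation was compiled in
void crc_arm_set_funcs(crc_func* _do_crc32_incremental);
void crc_clmul_set_funcs(crc_func* _do_crc32_incremental);

// portable bitwise fallback so the sketch stands on its own
static uint32_t crc32_fallback(const void* data, size_t length, uint32_t init) {
    const unsigned char* p = (const unsigned char*)data;
    uint32_t crc = ~init;
    while (length--) {
        crc ^= *p++;
        for (int i = 0; i < 8; i++)
            crc = (crc >> 1) ^ (0xEDB88320 & (uint32_t)-(int32_t)(crc & 1));
    }
    return ~crc;
}

static crc_func do_crc32_incremental = &crc32_fallback;

void crc_init(void) {
    crc_clmul_set_funcs(&do_crc32_incremental); // no-op unless built with PCLMUL/SSE4.1 support
    crc_arm_set_funcs(&do_crc32_incremental);   // no-op unless built with ARMv8 CRC support
}

// usage: a one-shot CRC32 starts from 0, and chunks are chained by feeding the
// previous result back in as init:
//   uint32_t crc = do_crc32_incremental(buf1, len1, 0);
//   crc = do_crc32_incremental(buf2, len2, crc);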