yencode 1.1.2 → 1.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/crc_arm.cc CHANGED
@@ -5,6 +5,24 @@
5
5
  HEDLEY_WARNING("CRC32 acceleration is not been enabled under ARM clang-cl by default; add `-march=armv8-a+crc` to additional compiler arguments to enable");
6
6
  #endif
7
7
 
8
+ // disable CRC on GCC versions with broken arm_acle.h
9
+ #if defined(__ARM_FEATURE_CRC32) && defined(HEDLEY_GCC_VERSION)
10
+ # if !defined(__aarch64__) && HEDLEY_GCC_VERSION_CHECK(7,0,0) && !HEDLEY_GCC_VERSION_CHECK(8,1,1)
11
+ # undef __ARM_FEATURE_CRC32
12
+ HEDLEY_WARNING("CRC32 acceleration has been disabled due to broken arm_acle.h shipped in GCC 7.0 - 8.1 [https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81497]. If you need this feature, please use a different compiler or version of GCC");
13
+ # endif
14
+ # if defined(__aarch64__) && HEDLEY_GCC_VERSION_CHECK(9,4,0) && !HEDLEY_GCC_VERSION_CHECK(9,5,0)
15
+ # undef __ARM_FEATURE_CRC32
16
+ HEDLEY_WARNING("CRC32 acceleration has been disabled due to broken arm_acle.h shipped in GCC 9.4 [https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100985]. If you need this feature, please use a different compiler or version of GCC");
17
+ # endif
18
+ #endif
19
+ #if defined(__ARM_FEATURE_CRC32) && defined(__has_include)
20
+ # if !__has_include(<arm_acle.h>)
21
+ # undef __ARM_FEATURE_CRC32
22
+ HEDLEY_WARNING("CRC32 acceleration has been disabled due to missing arm_acle.h");
23
+ # endif
24
+ #endif
25
+
8
26
  #if defined(__ARM_FEATURE_CRC32) || (defined(_M_ARM64) && !defined(__clang__)) // MSVC doesn't support CRC for ARM32
9
27
 
10
28
  /* ARMv8 accelerated CRC */
@@ -14,14 +32,30 @@ HEDLEY_WARNING("CRC32 acceleration has not been enabled under ARM clang-cl by def
14
32
  #include <arm_acle.h>
15
33
  #endif
16
34
 
35
+
36
+ #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
37
+ # ifdef __GNUC__
38
+ # define _LE16 __builtin_bswap16
39
+ # define _LE32 __builtin_bswap32
40
+ # define _LE64 __builtin_bswap64
41
+ # else
42
+ // currently not supported
43
+ # error No endian swap intrinsic defined
44
+ # endif
45
+ #else
46
+ # define _LE16(x) (x)
47
+ # define _LE32(x) (x)
48
+ # define _LE64(x) (x)
49
+ #endif
50
+
17
51
  #ifdef __aarch64__
18
52
  # define WORD_T uint64_t
19
53
  # define WORDSIZE_LOG 3 // sizeof(WORD_T) == 1<<WORDSIZE_LOG
20
- # define CRC_WORD __crc32d
54
+ # define CRC_WORD(crc, data) __crc32d(crc, _LE64(data))
21
55
  #else
22
56
  # define WORD_T uint32_t
23
57
  # define WORDSIZE_LOG 2 // sizeof(WORD_T) == 1<<WORDSIZE_LOG
24
- # define CRC_WORD __crc32w
58
+ # define CRC_WORD(crc, data) __crc32w(crc, _LE32(data))
25
59
  #endif
26
60
 
27
61
 
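
Note on the new _LE* wrappers: the ARM CRC32 intrinsics consume their operand as a little-endian byte sequence, so a big-endian build has to byte-swap each loaded word first. A minimal standalone sketch of the word-at-a-time step on AArch64 (illustrative only -- the patch itself dereferences a cast pointer and uses its own _LE64 macro; the memcpy load here is an assumption made for alignment/aliasing safety):

    #include <arm_acle.h>   // __crc32d; requires -march=armv8-a+crc
    #include <stdint.h>
    #include <string.h>

    #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
    # define LE64(x) __builtin_bswap64(x)   // same idea as the patch's _LE64
    #else
    # define LE64(x) (x)
    #endif

    // __crc32d treats its 64-bit operand as eight little-endian bytes, so the
    // swap keeps byte order consistent regardless of the host's endianness.
    static inline uint32_t crc_word64(uint32_t crc, const unsigned char* p) {
        uint64_t w;
        memcpy(&w, p, sizeof(w));
        return __crc32d(crc, LE64(w));
    }
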
@@ -45,7 +79,7 @@ static HEDLEY_ALWAYS_INLINE uint32_t crc_multiply(uint32_t a, uint32_t b) {
45
79
  return res;
46
80
  }
47
81
 
48
- static const uint32_t crc_power[] = { // pre-computed 2^n, with first 3 entries removed (saves a shift)
82
+ static const uint32_t crc_power[] = { // pre-computed 2^(2^n), with first 3 entries removed (saves a shift)
49
83
  0x00800000, 0x00008000, 0xedb88320, 0xb1e6b092, 0xa06a2517, 0xed627dae, 0x88d14467, 0xd7bbfe6a,
50
84
  0xec447f11, 0x8e7ea170, 0x6427800e, 0x4d47bae0, 0x09fe548f, 0x83852d0f, 0x30362f1a, 0x7b5a9cc3,
51
85
  0x31fec169, 0x9fec022a, 0x6c8dedc4, 0x15d6874d, 0x5fde7a4e, 0xbad90e37, 0x2e4e5eef, 0x4eaba214,
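
The corrected comment says the table holds 2^(2^n) -- that is, x^(2^n) mod the CRC polynomial, in reflected form -- rather than 2^n, with the first three entries dropped so the table can be indexed by a byte count without an extra shift. One plausible way such a table is used together with the crc_multiply above -- a hypothetical helper, not code from this package -- is to advance a CRC over n zero bytes via the binary expansion of n:

    // Hypothetical sketch: extend a CRC over n zero bytes by multiplying its state
    // in GF(2)[x] (mod the CRC polynomial) by x^(8*n). With the first three powers
    // removed, crc_power[k] corresponds to x^(8 * 2^k), so bit k of the byte count
    // selects entry k directly -- no "n << 3" shift, which is what the comment means.
    static uint32_t crc_zero_pad(uint32_t crc, uint64_t n) {
        for (int k = 0; n != 0; n >>= 1, k++)
            if (n & 1)
                crc = crc_multiply(crc, crc_power[k]);
        return crc;
    }
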
@@ -64,6 +98,7 @@ static const uint32_t crc_power[] = { // pre-computed 2^n, with first 3 entries
64
98
  #endif
65
99
 
66
100
 
101
+
67
102
  // inspired/stolen off https://github.com/jocover/crc32_armv8/blob/master/crc32_armv8.c
68
103
  static uint32_t arm_crc_calc(uint32_t crc, const unsigned char *src, long len) {
69
104
 
@@ -75,13 +110,13 @@ static uint32_t arm_crc_calc(uint32_t crc, const unsigned char *src, long len) {
75
110
  len--;
76
111
  }
77
112
  if ((uintptr_t)src & sizeof(uint16_t)) {
78
- crc = __crc32h(crc, *((uint16_t *)src));
113
+ crc = __crc32h(crc, _LE16(*((uint16_t *)src)));
79
114
  src += sizeof(uint16_t);
80
115
  len -= sizeof(uint16_t);
81
116
  }
82
117
  #ifdef __aarch64__
83
118
  if ((uintptr_t)src & sizeof(uint32_t)) {
84
- crc = __crc32w(crc, *((uint32_t *)src));
119
+ crc = __crc32w(crc, _LE32(*((uint32_t *)src)));
85
120
  src += sizeof(uint32_t);
86
121
  len -= sizeof(uint32_t);
87
122
  }
@@ -147,12 +182,12 @@ static uint32_t arm_crc_calc(uint32_t crc, const unsigned char *src, long len) {
147
182
 
148
183
  #ifdef __aarch64__
149
184
  if (len & sizeof(uint32_t)) {
150
- crc = __crc32w(crc, *((uint32_t *)src));
185
+ crc = __crc32w(crc, _LE32(*((uint32_t *)src)));
151
186
  src += sizeof(uint32_t);
152
187
  }
153
188
  #endif
154
189
  if (len & sizeof(uint16_t)) {
155
- crc = __crc32h(crc, *((uint16_t *)src));
190
+ crc = __crc32h(crc, _LE16(*((uint16_t *)src)));
156
191
  src += sizeof(uint16_t);
157
192
  }
158
193
  if (len & sizeof(uint8_t))
@@ -19,7 +19,7 @@
19
19
 
20
20
  #include "crc_common.h"
21
21
 
22
- #if (defined(__PCLMUL__) && defined(__SSSE3__) && defined(__SSE4_1__)) || (defined(_MSC_VER) && _MSC_VER >= 1600 && defined(PLATFORM_X86))
22
+ #if (defined(__PCLMUL__) && defined(__SSSE3__) && defined(__SSE4_1__)) || (defined(_MSC_VER) && _MSC_VER >= 1600 && defined(PLATFORM_X86) && !defined(__clang__))
23
23
  #include <inttypes.h>
24
24
  #include <immintrin.h>
25
25
  #include <wmmintrin.h>
@@ -135,33 +135,6 @@ ALIGN_TO(16, static const unsigned crc_mask[4]) = {
135
135
  0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
136
136
  };
137
137
 
138
- static __m128i reverse_bits_epi8(__m128i src) {
139
- #if defined(__GFNI__) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
140
- return _mm_gf2p8affine_epi64_epi8(src, _mm_set_epi32(
141
- 0x80402010, 0x08040201,
142
- 0x80402010, 0x08040201
143
- ), 0);
144
- #else
145
- __m128i xmm_t0 = _mm_and_si128(src, _mm_set1_epi8(0x0f));
146
- __m128i xmm_t1 = _mm_and_si128(_mm_srli_epi16(src, 4), _mm_set1_epi8(0x0f));
147
- xmm_t0 = _mm_shuffle_epi8(_mm_set_epi8(
148
- -16, 112, -80, 48, -48, 80, -112, 16, -32, 96, -96, 32, -64, 64, -128, 0
149
- //0xf0, 0x70, 0xb0, 0x30, 0xd0, 0x50, 0x90, 0x10, 0xe0, 0x60, 0xa0, 0x20, 0xc0, 0x40, 0x80, 0
150
- ), xmm_t0);
151
- xmm_t1 = _mm_shuffle_epi8(_mm_set_epi8(
152
- 15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0
153
- ), xmm_t1);
154
- return _mm_or_si128(xmm_t0, xmm_t1);
155
- #endif
156
- }
157
-
158
- #ifdef _MSC_VER
159
- // because MSVC doesn't use BSWAP unless you specifically tell it to...
160
- # include <stdlib.h>
161
- # define BSWAP32 _byteswap_ulong
162
- #else
163
- # define BSWAP32(n) ((((n)&0xff)<<24) | (((n)&0xff00)<<8) | (((n)&0xff0000)>>8) | (((n)&0xff000000)>>24))
164
- #endif
165
138
 
166
139
  static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
167
140
  unsigned long algn_diff;
@@ -170,23 +143,17 @@ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
170
143
  // TODO: consider calculating this via a LUT instead (probably faster)
171
144
  // info from https://www.reddit.com/r/ReverseEngineering/comments/2zwhl3/mystery_constant_0x9db42487_in_intels_crc32ieee/
172
145
  // firstly, calculate: xmm_crc0 = (initial * 0x487b9c8a) mod 0x104c11db7, where 0x487b9c8a = inverse(1<<512) mod 0x104c11db7
146
+ xmm_t0 = _mm_cvtsi32_si128(~initial);
173
147
 
174
- // reverse input bits + load into XMM register
175
- uint32_t init_t = BSWAP32(initial);
176
- xmm_t0 = reverse_bits_epi8(_mm_cvtsi32_si128(~init_t));
177
-
178
- xmm_t0 = _mm_clmulepi64_si128(xmm_t0, _mm_cvtsi32_si128(0x487b9c8a), 0);
179
- xmm_t1 = _mm_and_si128(xmm_t0, _mm_set_epi32(-1,-1,-1,0)); // shifted up by 32bits to avoid shifts by using clmul's capability to select top 64bits instead
148
+ xmm_t0 = _mm_clmulepi64_si128(xmm_t0, _mm_set_epi32(0, 0, 0xa273bc24, 0), 0); // reverse(0x487b9c8a)<<1 == 0xa273bc24
180
149
  xmm_t2 = _mm_set_epi32( // polynomial reduction factors
181
- 0, 0x04c11db7, // G*
182
- 1, 0x04d101df // Q+
150
+ 1, 0xdb710640, // G* = 0x04c11db7
151
+ 0, 0xf7011641 // Q+ = 0x04d101df (+1 to save an additional xor operation)
183
152
  );
184
- xmm_t1 = _mm_clmulepi64_si128(xmm_t1, xmm_t2, 0);
185
- xmm_t1 = _mm_clmulepi64_si128(xmm_t1, xmm_t2, 0x11);
153
+ xmm_t1 = _mm_clmulepi64_si128(xmm_t0, xmm_t2, 0);
154
+ xmm_t1 = _mm_clmulepi64_si128(xmm_t1, xmm_t2, 0x10);
186
155
 
187
- __m128i xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_t1);
188
- // reverse bits
189
- xmm_crc0 = _mm_shuffle_epi8(reverse_bits_epi8(xmm_crc0), _mm_set_epi32(-1,-1,-1,0x00010203));
156
+ __m128i xmm_crc0 = _mm_srli_si128(_mm_xor_si128(xmm_t0, xmm_t1), 8);
190
157
 
191
158
  __m128i xmm_crc1 = _mm_setzero_si128();
192
159
  __m128i xmm_crc2 = _mm_setzero_si128();
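
The rewritten seeding step above drops the bit-reversal dance: instead of reflecting the input, multiplying, and reflecting back, the multiplier is stored pre-reflected and shifted left by one (the usual adjustment when doing reflected CRC folding with CLMUL). The comment's identity reverse(0x487b9c8a)<<1 == 0xa273bc24 is easy to confirm with scalar code (an illustrative check, not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    // Bit-reflect a 32-bit value (bit 0 swaps with bit 31, bit 1 with bit 30, ...).
    static uint32_t reflect32(uint32_t v) {
        uint32_t r = 0;
        for (int i = 0; i < 32; i++) { r = (r << 1) | (v & 1); v >>= 1; }
        return r;
    }

    int main(void) {
        // reflect32(0x487b9c8a) == 0x5139de12; shifted left once it becomes the
        // 0xa273bc24 constant passed to _mm_clmulepi64_si128 above.
        assert(reflect32(0x487b9c8aU) == 0x5139de12U);
        assert((uint32_t)(reflect32(0x487b9c8aU) << 1) == 0xa273bc24U);
        return 0;
    }
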
@@ -196,7 +163,8 @@ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
196
163
  if (len < 16) {
197
164
  if (len == 0)
198
165
  return initial;
199
- xmm_crc_part = _mm_loadu_si128((__m128i *)src);
166
+ xmm_crc_part = _mm_setzero_si128();
167
+ memcpy(&xmm_crc_part, src, len);
200
168
  goto partial;
201
169
  }
202
170
 
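
The len < 16 path now zero-fills an XMM-sized temporary and memcpys only the available bytes, rather than issuing a full _mm_loadu_si128 that could read past the end of the input buffer. The same pattern in isolation (a sketch; assumes len < 16):

    #include <emmintrin.h>
    #include <stddef.h>
    #include <string.h>

    // Load the final len (< 16) bytes into an XMM register without reading memory
    // beyond the end of the input; lanes that were not copied remain zero.
    static __m128i load_tail_128(const unsigned char* src, size_t len) {
        __m128i v = _mm_setzero_si128();
        memcpy(&v, src, len);
        return v;
    }
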
@@ -211,7 +179,7 @@ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
211
179
  &xmm_crc_part);
212
180
  }
213
181
 
214
- while ((len -= 64) >= 0) {
182
+ while (len >= 64) {
215
183
  xmm_t0 = _mm_load_si128((__m128i *)src);
216
184
  xmm_t1 = _mm_load_si128((__m128i *)src + 1);
217
185
  xmm_t2 = _mm_load_si128((__m128i *)src + 2);
@@ -235,13 +203,11 @@ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
235
203
  #endif
236
204
 
237
205
  src += 64;
206
+ len -= 64;
238
207
  }
239
208
 
240
- /*
241
- * len = num bytes left - 64
242
- */
243
- if (len + 16 >= 0) {
244
- len += 16;
209
+ if (len >= 48) {
210
+ len -= 48;
245
211
 
246
212
  xmm_t0 = _mm_load_si128((__m128i *)src);
247
213
  xmm_t1 = _mm_load_si128((__m128i *)src + 1);
@@ -266,8 +232,8 @@ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
266
232
  goto done;
267
233
 
268
234
  xmm_crc_part = _mm_load_si128((__m128i *)src + 3);
269
- } else if (len + 32 >= 0) {
270
- len += 32;
235
+ } else if (len >= 32) {
236
+ len -= 32;
271
237
 
272
238
  xmm_t0 = _mm_load_si128((__m128i *)src);
273
239
  xmm_t1 = _mm_load_si128((__m128i *)src + 1);
@@ -290,8 +256,8 @@ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
290
256
  goto done;
291
257
 
292
258
  xmm_crc_part = _mm_load_si128((__m128i *)src + 2);
293
- } else if (len + 48 >= 0) {
294
- len += 48;
259
+ } else if (len >= 16) {
260
+ len -= 16;
295
261
 
296
262
  xmm_t0 = _mm_load_si128((__m128i *)src);
297
263
 
@@ -310,7 +276,6 @@ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
310
276
 
311
277
  xmm_crc_part = _mm_load_si128((__m128i *)src + 1);
312
278
  } else {
313
- len += 64;
314
279
  if (len == 0)
315
280
  goto done;
316
281
  xmm_crc_part = _mm_load_si128((__m128i *)src);
@@ -0,0 +1,229 @@
1
+ // 256-bit version of crc_folding
2
+
3
+ #include "crc_common.h"
4
+
5
+ #if !defined(YENC_DISABLE_AVX256) && ((defined(__VPCLMULQDQ__) && defined(__AVX2__) && defined(__PCLMUL__)) || (defined(_MSC_VER) && _MSC_VER >= 1920 && defined(PLATFORM_X86) && !defined(__clang__)))
6
+ #include <inttypes.h>
7
+ #include <immintrin.h>
8
+
9
+
10
+ #if defined(__AVX512VL__) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
11
+ # define ENABLE_AVX512 1
12
+ #endif
13
+
14
+ static __m256i do_one_fold(__m256i src, __m256i data) {
15
+ const __m256i fold4 = _mm256_set_epi32(
16
+ 0x00000001, 0x54442bd4,
17
+ 0x00000001, 0xc6e41596,
18
+ 0x00000001, 0x54442bd4,
19
+ 0x00000001, 0xc6e41596
20
+ );
21
+ #ifdef ENABLE_AVX512
22
+ return _mm256_ternarylogic_epi32(
23
+ _mm256_clmulepi64_epi128(src, fold4, 0x01),
24
+ _mm256_clmulepi64_epi128(src, fold4, 0x10),
25
+ data,
26
+ 0x96
27
+ );
28
+ #else
29
+ return _mm256_xor_si256(_mm256_xor_si256(
30
+ data, _mm256_clmulepi64_epi128(src, fold4, 0x01)
31
+ ), _mm256_clmulepi64_epi128(src, fold4, 0x10));
32
+ #endif
33
+ }
34
+
35
+ ALIGN_TO(32, static const uint8_t pshufb_rot_table[]) = {
36
+ 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
37
+ 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
38
+ };
39
+ // _mm256_castsi128_si256, but upper is defined to be 0
40
+ #if (defined(__clang__) && __clang_major__ >= 5 && (!defined(__APPLE__) || __clang_major__ >= 7)) || (defined(__GNUC__) && __GNUC__ >= 10) || (defined(_MSC_VER) && _MSC_VER >= 1910)
41
+ // intrinsic unsupported in GCC 9 and MSVC < 2017
42
+ # define zext128_256 _mm256_zextsi128_si256
43
+ #else
44
+ // technically a cast is incorrect, due to upper 128 bits being undefined, but should usually work fine
45
+ // alternative may be `_mm256_set_m128i(_mm_setzero_si128(), v)` but unsupported on GCC < 7, and most compilers generate a VINSERTF128 instruction for it
46
+ # ifdef __OPTIMIZE__
47
+ # define zext128_256 _mm256_castsi128_si256
48
+ # else
49
+ # define zext128_256(x) _mm256_inserti128_si256(_mm256_setzero_si256(), x, 0)
50
+ # endif
51
+ #endif
52
+
53
+ #ifdef ENABLE_AVX512
54
+ # define MM256_BLENDV(a, b, m) _mm256_ternarylogic_epi32(a, b, m, 0xd8)
55
+ # define MM_2XOR(a, b, c) _mm_ternarylogic_epi32(a, b, c, 0x96)
56
+ #else
57
+ # define MM256_BLENDV _mm256_blendv_epi8
58
+ # define MM_2XOR(a, b, c) _mm_xor_si128(_mm_xor_si128(a, b), c)
59
+ #endif
60
+
61
+ static void partial_fold(const size_t len, __m256i *crc0, __m256i *crc1, __m256i crc_part) {
62
+ __m256i shuf = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(pshufb_rot_table + (len&15))));
63
+ __m256i mask = _mm256_cmpgt_epi8(shuf, _mm256_set1_epi8(15));
64
+
65
+ *crc0 = _mm256_shuffle_epi8(*crc0, shuf);
66
+ *crc1 = _mm256_shuffle_epi8(*crc1, shuf);
67
+ crc_part = _mm256_shuffle_epi8(crc_part, shuf);
68
+
69
+ __m256i crc_out = _mm256_permute2x128_si256(*crc0, *crc0, 0x08); // move bottom->top
70
+ __m256i crc01, crc1p;
71
+ if(len >= 16) {
72
+ crc_out = MM256_BLENDV(crc_out, *crc0, mask);
73
+ crc01 = *crc1;
74
+ crc1p = crc_part;
75
+ *crc0 = _mm256_permute2x128_si256(*crc0, *crc1, 0x21);
76
+ *crc1 = _mm256_permute2x128_si256(*crc1, crc_part, 0x21);
77
+ crc_part = zext128_256(_mm256_extracti128_si256(crc_part, 1));
78
+ } else {
79
+ crc_out = _mm256_and_si256(crc_out, mask);
80
+ crc01 = _mm256_permute2x128_si256(*crc0, *crc1, 0x21);
81
+ crc1p = _mm256_permute2x128_si256(*crc1, crc_part, 0x21);
82
+ }
83
+
84
+ *crc0 = MM256_BLENDV(*crc0, crc01, mask);
85
+ *crc1 = MM256_BLENDV(*crc1, crc1p, mask);
86
+
87
+ *crc1 = do_one_fold(crc_out, *crc1);
88
+ }
89
+
90
+
91
+ ALIGN_TO(16, static const unsigned crc_k[]) = {
92
+ 0xccaa009e, 0x00000000, /* rk1 */
93
+ 0x751997d0, 0x00000001, /* rk2 */
94
+ 0xccaa009e, 0x00000000, /* rk5 */
95
+ 0x63cd6124, 0x00000001, /* rk6 */
96
+ 0xf7011641, 0x00000000, /* rk7 */
97
+ 0xdb710640, 0x00000001 /* rk8 */
98
+ };
99
+
100
+
101
+ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
102
+ // info from https://www.reddit.com/r/ReverseEngineering/comments/2zwhl3/mystery_constant_0x9db42487_in_intels_crc32ieee/
103
+ // firstly, calculate: xmm_crc0 = (initial * 0x487b9c8a) mod 0x104c11db7, where 0x487b9c8a = inverse(1<<512) mod 0x104c11db7
104
+ __m128i xmm_t0 = _mm_cvtsi32_si128(~initial);
105
+
106
+ xmm_t0 = _mm_clmulepi64_si128(xmm_t0, _mm_set_epi32(0, 0, 0xa273bc24, 0), 0); // reverse(0x487b9c8a)<<1 == 0xa273bc24
107
+ __m128i reduction = _mm_set_epi32( // polynomial reduction factors
108
+ 1, 0xdb710640, // G* = 0x04c11db7
109
+ 0, 0xf7011641 // Q+ = 0x04d101df (+1 to save an additional xor operation)
110
+ );
111
+ __m128i xmm_t1 = _mm_clmulepi64_si128(xmm_t0, reduction, 0);
112
+ xmm_t1 = _mm_clmulepi64_si128(xmm_t1, reduction, 0x10);
113
+
114
+ xmm_t0 = _mm_srli_si128(_mm_xor_si128(xmm_t0, xmm_t1), 8);
115
+ __m256i crc0 = zext128_256(xmm_t0);
116
+ __m256i crc1 = _mm256_setzero_si256();
117
+
118
+ if (len < 32) {
119
+ if (len == 0)
120
+ return initial;
121
+ __m256i crc_part = _mm256_setzero_si256();
122
+ memcpy(&crc_part, src, len);
123
+ partial_fold(len, &crc0, &crc1, crc_part);
124
+ } else {
125
+ uintptr_t algn_diff = (0 - (uintptr_t)src) & 0x1F;
126
+ if (algn_diff) {
127
+ partial_fold(algn_diff, &crc0, &crc1, _mm256_loadu_si256((__m256i *)src));
128
+ src += algn_diff;
129
+ len -= algn_diff;
130
+ }
131
+
132
+ while (len >= 64) {
133
+ crc0 = do_one_fold(crc0, _mm256_load_si256((__m256i*)src));
134
+ crc1 = do_one_fold(crc1, _mm256_load_si256((__m256i*)src + 1));
135
+ src += 64;
136
+ len -= 64;
137
+ }
138
+
139
+ if (len >= 32) {
140
+ __m256i old = crc1;
141
+ crc1 = do_one_fold(crc0, _mm256_load_si256((__m256i*)src));
142
+ crc0 = old;
143
+
144
+ len -= 32;
145
+ src += 32;
146
+ }
147
+
148
+ if(len != 0) {
149
+ partial_fold(len, &crc0, &crc1, _mm256_load_si256((__m256i *)src));
150
+ }
151
+ }
152
+
153
+ const __m128i xmm_mask = _mm_set_epi32(-1,-1,-1,0);
154
+ __m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;
155
+
156
+ __m128i xmm_crc0 = _mm256_castsi256_si128(crc0);
157
+ __m128i xmm_crc1 = _mm256_extracti128_si256(crc0, 1);
158
+ __m128i xmm_crc2 = _mm256_castsi256_si128(crc1);
159
+ __m128i xmm_crc3 = _mm256_extracti128_si256(crc1, 1);
160
+
161
+ /*
162
+ * k1
163
+ */
164
+ crc_fold = _mm_load_si128((__m128i *)crc_k);
165
+
166
+ x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
167
+ xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
168
+ xmm_crc1 = MM_2XOR(xmm_crc1, x_tmp0, xmm_crc0);
169
+
170
+ x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10);
171
+ xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01);
172
+ xmm_crc2 = MM_2XOR(xmm_crc2, x_tmp1, xmm_crc1);
173
+
174
+ x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10);
175
+ xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01);
176
+ xmm_crc3 = MM_2XOR(xmm_crc3, x_tmp2, xmm_crc2);
177
+
178
+ /*
179
+ * k5
180
+ */
181
+ crc_fold = _mm_load_si128((__m128i *)crc_k + 1);
182
+
183
+ xmm_crc0 = xmm_crc3;
184
+ xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
185
+ xmm_crc0 = _mm_srli_si128(xmm_crc0, 8);
186
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
187
+
188
+ xmm_crc0 = xmm_crc3;
189
+ xmm_crc3 = _mm_slli_si128(xmm_crc3, 4);
190
+ xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
191
+ #ifdef ENABLE_AVX512
192
+ //xmm_crc3 = _mm_maskz_xor_epi32(14, xmm_crc3, xmm_crc0);
193
+ xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, xmm_crc0, xmm_mask, 0x28);
194
+ #else
195
+ xmm_crc0 = _mm_and_si128(xmm_crc0, xmm_mask);
196
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
197
+ #endif
198
+
199
+ /*
200
+ * k7
201
+ */
202
+ xmm_crc1 = xmm_crc3;
203
+ crc_fold = _mm_load_si128((__m128i *)crc_k + 2);
204
+
205
+ xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
206
+ xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
207
+ #ifdef ENABLE_AVX512
208
+ xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, xmm_crc1, xmm_crc1, 0xC3); // NOT(xmm_crc3 ^ xmm_crc1)
209
+ #else
210
+ xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_mask);
211
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);
212
+ #endif
213
+ return _mm_extract_epi32(xmm_crc3, 2);
214
+ }
215
+
216
+ static uint32_t do_crc32_incremental_clmul(const void* data, size_t length, uint32_t init) {
217
+ return crc_fold((const unsigned char*)data, (long)length, init);
218
+ }
219
+
220
+ void crc_clmul256_set_funcs(crc_func* _do_crc32_incremental) {
221
+ *_do_crc32_incremental = &do_crc32_incremental_clmul;
222
+ }
223
+ #else
224
+ void crc_clmul_set_funcs(crc_func* _do_crc32_incremental);
225
+ void crc_clmul256_set_funcs(crc_func* _do_crc32_incremental) {
226
+ crc_clmul_set_funcs(_do_crc32_incremental);
227
+ }
228
+ #endif
229
+
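
A note on the _mm256_ternarylogic_epi32 immediates used throughout the new file: the 8-bit constant is simply a truth table over the three inputs, so 0x96 gives the three-way XOR behind do_one_fold/MM_2XOR and 0xd8 reproduces the byte blend in MM256_BLENDV. A scalar illustration of how the immediate is consulted (not code from the package):

    // One bit of a ternary-logic result: the immediate is indexed by the three
    // input bits packed as (a<<2)|(b<<1)|c, matching how the EVEX instruction
    // forms its lookup. 0x96 -> a ^ b ^ c; 0xd8 -> c ? b : a.
    static unsigned ternlog_bit(unsigned a, unsigned b, unsigned c, unsigned imm8) {
        return (imm8 >> ((a << 2) | (b << 1) | c)) & 1u;
    }
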
package/src/decoder.cc CHANGED
@@ -4,15 +4,17 @@
4
4
  #include "decoder.h"
5
5
 
6
6
  extern "C" {
7
- YencDecoderEnd (*_do_decode)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*) = &do_decode_scalar<false, false>;
8
- YencDecoderEnd (*_do_decode_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*) = &do_decode_scalar<true, false>;
9
- YencDecoderEnd (*_do_decode_end_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*) = &do_decode_end_scalar<true>;
7
+ YencDecoderEnd (*_do_decode)(const unsigned char**, unsigned char**, size_t, YencDecoderState*) = &do_decode_scalar<false, false>;
8
+ YencDecoderEnd (*_do_decode_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*) = &do_decode_scalar<true, false>;
9
+ YencDecoderEnd (*_do_decode_end_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*) = &do_decode_end_scalar<true>;
10
10
  }
11
11
 
12
12
  void decoder_set_sse2_funcs();
13
13
  void decoder_set_ssse3_funcs();
14
14
  void decoder_set_avx_funcs();
15
15
  void decoder_set_avx2_funcs();
16
+ void decoder_set_vbmi2_funcs();
17
+ extern const bool decoder_has_avx10;
16
18
  void decoder_set_neon_funcs();
17
19
 
18
20
 
@@ -44,7 +46,9 @@ void decoder_init() {
44
46
  decoder_set_native_funcs();
45
47
  # else
46
48
  int use_isa = cpu_supports_isa();
47
- if(use_isa >= ISA_LEVEL_AVX2)
49
+ if(use_isa >= ISA_LEVEL_VBMI2 && (decoder_has_avx10 || (use_isa & ISA_FEATURE_EVEX512)))
50
+ decoder_set_vbmi2_funcs();
51
+ else if(use_isa >= ISA_LEVEL_AVX2)
48
52
  decoder_set_avx2_funcs();
49
53
  else if(use_isa >= ISA_LEVEL_AVX)
50
54
  decoder_set_avx_funcs();
package/src/decoder.h CHANGED
@@ -29,17 +29,17 @@ typedef enum {
29
29
 
30
30
  #include "hedley.h"
31
31
 
32
- extern YencDecoderEnd (*_do_decode)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*);
33
- extern YencDecoderEnd (*_do_decode_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*);
34
- extern YencDecoderEnd (*_do_decode_end_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*);
32
+ extern YencDecoderEnd (*_do_decode)(const unsigned char**, unsigned char**, size_t, YencDecoderState*);
33
+ extern YencDecoderEnd (*_do_decode_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*);
34
+ extern YencDecoderEnd (*_do_decode_end_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*);
35
35
 
36
- static inline size_t do_decode(int isRaw, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, YencDecoderState* state) {
36
+ static inline size_t do_decode(int isRaw, const unsigned char* src, unsigned char* dest, size_t len, YencDecoderState* state) {
37
37
  unsigned char* ds = dest;
38
38
  (*(isRaw ? _do_decode_raw : _do_decode))(&src, &ds, len, state);
39
39
  return ds - dest;
40
40
  }
41
41
 
42
- static inline YencDecoderEnd do_decode_end(const unsigned char*HEDLEY_RESTRICT* src, unsigned char*HEDLEY_RESTRICT* dest, size_t len, YencDecoderState* state) {
42
+ static inline YencDecoderEnd do_decode_end(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
43
43
  return _do_decode_end_raw(src, dest, len, state);
44
44
  }
45
45
 
@@ -1,8 +1,8 @@
1
1
 
2
2
  #ifdef __AVX2__
3
3
 
4
- // GCC (ver 6-10(dev)) fails to optimize pure C version of mask testing, but has this intrinsic; Clang >= 7 optimizes C version fine
5
- #if defined(__GNUC__) && __GNUC__ >= 7
4
+ // GCC (ver 6-10(dev)) fails to optimize pure C version of mask testing, but has this intrinsic; Clang >= 7 optimizes C version fine; functions added in Clang 8
5
+ #if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
6
6
  # define KORTEST32(a, b) !_kortestz_mask32_u8((a), (b))
7
7
  # define KAND32(a, b) _kand_mask32((a), (b))
8
8
  # define KOR32(a, b) _kor_mask32((a), (b))
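
For context on the comment above: AVX-512BW mask values are plain integer types, so the test can be written either with the _kortestz/_kand/_kor intrinsics or as ordinary integer arithmetic; the two mean the same thing, but the comment notes that older GCC only produces the single-instruction form for the intrinsics. An illustrative comparison (assumes AVX-512BW; not code from the package):

    #include <immintrin.h>

    // Intrinsic form: intended to compile down to a single KORTEST plus a flag test.
    static int any_set_intrin(__mmask32 a, __mmask32 b) {
        return !_kortestz_mask32_u8(a, b);
    }

    // Plain-integer form: same meaning, but GCC 6-10(dev) reportedly fails to
    // optimise this pattern into a single KORTEST.
    static int any_set_plain(__mmask32 a, __mmask32 b) {
        return (a | b) != 0;
    }
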
@@ -30,7 +30,7 @@ static HEDLEY_ALWAYS_INLINE __m256i force_align_read_256(const void* p) {
30
30
  }
31
31
 
32
32
  // _mm256_castsi128_si256, but upper is defined to be 0
33
- #if (defined(__clang__) && __clang_major__ >= 5 && (!defined(__APPLE__) || __clang_major__ >= 7)) || (defined(__GNUC__) && __GNUC__ >= 10)
33
+ #if (defined(__clang__) && __clang_major__ >= 5 && (!defined(__APPLE__) || __clang_major__ >= 7)) || (defined(__GNUC__) && __GNUC__ >= 10) || (defined(_MSC_VER) && _MSC_VER >= 1910)
34
34
  // intrinsic unsupported in GCC 9 and MSVC < 2017
35
35
  # define zext128_256 _mm256_zextsi128_si256
36
36
  #else
@@ -43,9 +43,15 @@ static HEDLEY_ALWAYS_INLINE __m256i force_align_read_256(const void* p) {
43
43
  # endif
44
44
  #endif
45
45
 
46
+ #if defined(__tune_icelake_client__) || defined(__tune_icelake_server__) || defined(__tune_tigerlake__) || defined(__tune_rocketlake__) || defined(__tune_alderlake__) || defined(__tune_sapphirerapids__)
47
+ # define COMPRESS_STORE _mm256_mask_compressstoreu_epi8
48
+ #else
49
+ // avoid uCode on Zen4
50
+ # define COMPRESS_STORE(dst, mask, vec) _mm256_storeu_si256((__m256i*)(dst), _mm256_maskz_compress_epi8(mask, vec))
51
+ #endif
46
52
 
47
53
  template<bool isRaw, bool searchEnd, enum YEncDecIsaLevel use_isa>
48
- HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, long& len, unsigned char* HEDLEY_RESTRICT & p, unsigned char& _escFirst, uint16_t& _nextMask) {
54
+ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* src, long& len, unsigned char*& p, unsigned char& _escFirst, uint16_t& _nextMask) {
49
55
  HEDLEY_ASSUME(_escFirst == 0 || _escFirst == 1);
50
56
  HEDLEY_ASSUME(_nextMask == 0 || _nextMask == 1 || _nextMask == 2);
51
57
  uintptr_t escFirst = _escFirst;
@@ -60,6 +66,17 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
60
66
  '.','.','.','.','.','.','.','.','.','.','.','.','.','.',_nextMask==2?0:'.',_nextMask==1?0:'.'
61
67
  );
62
68
  }
69
+
70
+ // for some reason, MSVC Win32 seems to crash when trying to compile _mm256_mask_cmpeq_epi8_mask
71
+ // the crash can be fixed by switching the order of the last two arguments, but it seems to generate wrong code
72
+ // so just disable the optimisation as it seems to be problematic there
73
+ #if defined(__AVX512VL__) && defined(__AVX512BW__)
74
+ # if defined(_MSC_VER) && !defined(PLATFORM_AMD64) && !defined(__clang__)
75
+ const bool useAVX3MaskCmp = false;
76
+ # else
77
+ const bool useAVX3MaskCmp = (use_isa >= ISA_LEVEL_AVX3);
78
+ # endif
79
+ #endif
63
80
  intptr_t i;
64
81
  for(i = -len; i; i += sizeof(__m256i)*2) {
65
82
  __m256i oDataA = _mm256_load_si256((__m256i *)(src+i));
@@ -126,7 +143,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
126
143
  __mmask32 match2EqMaskA, match2EqMaskB;
127
144
  __mmask32 match0CrMaskA, match0CrMaskB;
128
145
  __mmask32 match2CrXDtMaskA, match2CrXDtMaskB;
129
- if(use_isa >= ISA_LEVEL_AVX3 && searchEnd) {
146
+ if(useAVX3MaskCmp && searchEnd) {
130
147
  match2EqMaskA = _mm256_cmpeq_epi8_mask(_mm256_set1_epi8('='), tmpData2A);
131
148
  match2EqMaskB = _mm256_cmpeq_epi8_mask(_mm256_set1_epi8('='), tmpData2B);
132
149
  } else
@@ -142,7 +159,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
142
159
  // find patterns of \r_.
143
160
 
144
161
  #if defined(__AVX512VL__) && defined(__AVX512BW__)
145
- if(use_isa >= ISA_LEVEL_AVX3) {
162
+ if(useAVX3MaskCmp) {
146
163
  match0CrMaskA = _mm256_cmpeq_epi8_mask(oDataA, _mm256_set1_epi8('\r'));
147
164
  match0CrMaskB = _mm256_cmpeq_epi8_mask(oDataB, _mm256_set1_epi8('\r'));
148
165
  match2CrXDtMaskA = _mm256_mask_cmpeq_epi8_mask(match0CrMaskA, tmpData2A, _mm256_set1_epi8('.'));
@@ -172,7 +189,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
172
189
  #if defined(__AVX512VL__) && defined(__AVX512BW__)
173
190
  __mmask32 match1NlMaskA, match1NlMaskB;
174
191
  __mmask32 match2NlDotMaskA, match2NlDotMaskB;
175
- if(use_isa >= ISA_LEVEL_AVX3) {
192
+ if(useAVX3MaskCmp) {
176
193
  match1NlMaskA = _mm256_mask_cmpeq_epi8_mask(
177
194
  match0CrMaskA,
178
195
  _mm256_set1_epi8('\n'),
@@ -228,7 +245,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
228
245
 
229
246
  int matchEnd;
230
247
  #if defined(__AVX512VL__) && defined(__AVX512BW__)
231
- if(use_isa >= ISA_LEVEL_AVX3) {
248
+ if(useAVX3MaskCmp) {
232
249
  __mmask32 match3EqYMaskA = _mm256_mask_cmpeq_epi8_mask(
233
250
  match2EqMaskA,
234
251
  _mm256_set1_epi8('y'),
@@ -307,7 +324,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
307
324
  }
308
325
  }
309
326
  #if defined(__AVX512VL__) && defined(__AVX512BW__)
310
- if(use_isa >= ISA_LEVEL_AVX3) {
327
+ if(useAVX3MaskCmp) {
311
328
  mask |= (uint64_t)match2NlDotMaskA << 2;
312
329
  mask |= (uint64_t)match2NlDotMaskB << 34;
313
330
  minMask = _mm256_maskz_mov_epi8(~(match2NlDotMaskB>>30), _mm256_set1_epi8('.'));
@@ -325,7 +342,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
325
342
  __m256i match3EqYA, match3EqYB;
326
343
  #if defined(__AVX512VL__) && defined(__AVX512BW__)
327
344
  __mmask32 match3EqYMaskA, match3EqYMaskB;
328
- if(use_isa >= ISA_LEVEL_AVX3) {
345
+ if(useAVX3MaskCmp) {
329
346
  match3EqYMaskA = _mm256_mask_cmpeq_epi8_mask(
330
347
  match2EqMaskA,
331
348
  _mm256_set1_epi8('y'),
@@ -355,7 +372,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
355
372
  if(LIKELIHOOD(0.002, partialEndFound)) {
356
373
  bool endFound;
357
374
  #if defined(__AVX512VL__) && defined(__AVX512BW__)
358
- if(use_isa >= ISA_LEVEL_AVX3) {
375
+ if(useAVX3MaskCmp) {
359
376
  __mmask32 match3LfEqYMaskA = _mm256_mask_cmpeq_epi8_mask(
360
377
  match3EqYMaskA,
361
378
  _mm256_set1_epi8('\n'),
@@ -530,9 +547,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
530
547
  // all that's left is to 'compress' the data (skip over masked chars)
531
548
  #if defined(__AVX512VBMI2__) && defined(__AVX512VL__)
532
549
  if(use_isa >= ISA_LEVEL_VBMI2) {
533
- _mm256_mask_compressstoreu_epi8(p, KNOT32(mask), dataA);
550
+ COMPRESS_STORE(p, KNOT32(mask), dataA);
534
551
  p -= popcnt32(mask & 0xffffffff);
535
- _mm256_mask_compressstoreu_epi8((p + XMM_SIZE*2), KNOT32(mask>>32), dataB);
552
+ COMPRESS_STORE((p + XMM_SIZE*2), KNOT32(mask>>32), dataB);
536
553
  p += XMM_SIZE*4 - popcnt32(mask >> 32);
537
554
  } else
538
555
  #endif
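
A closing note on COMPRESS_STORE as used in this last hunk: the wrapper only issues a true _mm256_mask_compressstoreu_epi8 when tuning for the Ice Lake-class Intel cores listed earlier; everywhere else it compresses into a register with _mm256_maskz_compress_epi8 and performs a plain 32-byte store, since the masked store form is microcoded (slow) on Zen 4. The substitution is presumably safe here because p only advances by the popcount of the kept bytes, so any zero padding written past them is overwritten by the following store or falls in the output buffer's slack. A standalone sketch of the two forms (illustrative only; assumes AVX-512VBMI2 + VL):

    #include <immintrin.h>

    // Fallback form: compress kept bytes to the front, zero the rest, store all
    // 32 bytes. Only the first popcount(keep) bytes are meaningful; the caller
    // advances its pointer by that amount.
    static void compress_store_full(unsigned char* p, __mmask32 keep, __m256i v) {
        _mm256_storeu_si256((__m256i*)p, _mm256_maskz_compress_epi8(keep, v));
    }

    // Tuned form: a genuine masked compress-store that writes exactly
    // popcount(keep) bytes and nothing beyond them.
    static void compress_store_masked(unsigned char* p, __mmask32 keep, __m256i v) {
        _mm256_mask_compressstoreu_epi8(p, keep, v);
    }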