yencode 1.1.2 → 1.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/binding.gyp CHANGED
@@ -64,7 +64,7 @@
  "targets": [
  {
  "target_name": "yencode",
- "dependencies": ["crcutil", "yencode_sse2", "yencode_ssse3", "yencode_clmul", "yencode_avx", "yencode_avx2", "yencode_neon", "yencode_armcrc"],
+ "dependencies": ["crcutil", "yencode_sse2", "yencode_ssse3", "yencode_clmul", "yencode_clmul256", "yencode_avx", "yencode_avx2", "yencode_vbmi2", "yencode_neon", "yencode_armcrc"],
  "sources": [
  "src/yencode.cc",
  "src/platform.cc",
@@ -206,6 +206,70 @@
  }]
  ]
  },
+ {
+ "target_name": "yencode_clmul256",
+ "type": "static_library",
+ "sources": [
+ "src/crc_folding_256.cc"
+ ],
+ "cflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
+ "cxxflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
+ "xcode_settings": {
+ "OTHER_CFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
+ "OTHER_CXXFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"]
+ },
+ "msvs_settings": {"VCCLCompilerTool": {"BufferSecurityCheck": "false"}},
+ "conditions": [
+ ['target_arch in "ia32 x64" and OS!="win"', {
+ "variables": {"supports_vpclmul%": "<!(<!(echo ${CC_target:-${CC:-cc}}) -MM -E src/crc_folding_256.cc -mavx2 -mvpclmulqdq 2>/dev/null || true)"},
+ "conditions": [
+ ['supports_vpclmul!=""', {
+ "cflags": ["-mavx2", "-mvpclmulqdq", "-mpclmul"],
+ "cxxflags": ["-mavx2", "-mvpclmulqdq", "-mpclmul"],
+ "xcode_settings": {
+ "OTHER_CFLAGS": ["-mavx2", "-mvpclmulqdq", "-mpclmul"],
+ "OTHER_CXXFLAGS": ["-mavx2", "-mvpclmulqdq", "-mpclmul"],
+ }
+ }]
+ ]
+ }],
+ ['target_arch in "ia32 x64" and OS=="win"', {
+ "msvs_settings": {"VCCLCompilerTool": {"EnableEnhancedInstructionSet": "3"}}
+ }]
+ ]
+ },
+ {
+ "target_name": "yencode_vbmi2",
+ "type": "static_library",
+ "sources": [
+ "src/decoder_vbmi2.cc", "src/encoder_vbmi2.cc"
+ ],
+ "cflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
+ "cxxflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
+ "xcode_settings": {
+ "OTHER_CFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
+ "OTHER_CXXFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"]
+ },
+ "msvs_settings": {"VCCLCompilerTool": {"BufferSecurityCheck": "false"}},
+ "conditions": [
+ ['target_arch in "ia32 x64" and OS!="win"', {
+ "variables": {"supports_vbmi2%": "<!(<!(echo ${CC_target:-${CC:-cc}}) -MM -E src/encoder_vbmi2.cc -mavx512vl -mavx512vbmi2 2>/dev/null || true)"},
+ "conditions": [
+ ['supports_vbmi2!=""', {
+ "cflags": ["-mavx512vbmi2", "-mavx512vl", "-mavx512bw", "-mpopcnt", "-mbmi", "-mbmi2", "-mlzcnt"],
+ "cxxflags": ["-mavx512vbmi2", "-mavx512vl", "-mavx512bw", "-mpopcnt", "-mbmi", "-mbmi2", "-mlzcnt"],
+ "xcode_settings": {
+ "OTHER_CFLAGS": ["-mavx512vbmi2", "-mavx512vl", "-mavx512bw", "-mpopcnt", "-mbmi", "-mbmi2", "-mlzcnt"],
+ "OTHER_CXXFLAGS": ["-mavx512vbmi2", "-mavx512vl", "-mavx512bw", "-mpopcnt", "-mbmi", "-mbmi2", "-mlzcnt"],
+ }
+ }]
+ ]
+ }],
+ ['target_arch in "ia32 x64" and OS=="win"', {
+ "msvs_settings": {"VCCLCompilerTool": {"AdditionalOptions": ["/arch:AVX512"], "EnableEnhancedInstructionSet": "0"}}
+ }]
+ ]
+ },
  {
  "target_name": "yencode_neon",
  "type": "static_library",
@@ -260,6 +324,14 @@
  "OTHER_CFLAGS": ["-march=armv8-a+crc"],
  "OTHER_CXXFLAGS": ["-march=armv8-a+crc"],
  }
+ }],
+ ['OS!="win" and target_arch=="arm"', {
+ "cflags": ["-mfpu=fp-armv8"],
+ "cxxflags": ["-mfpu=fp-armv8"],
+ "xcode_settings": {
+ "OTHER_CFLAGS": ["-mfpu=fp-armv8"],
+ "OTHER_CXXFLAGS": ["-mfpu=fp-armv8"]
+ }
  }]
  ]
  },
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "yencode",
- "version": "1.1.2",
+ "version": "1.1.3",
  "description": "SIMD accelerated yEnc encoder/decoder and CRC32 calculator",
  "keywords": [
  "yenc",
package/src/common.h CHANGED
@@ -35,18 +35,22 @@
  #endif


+ #include <stdlib.h>
  #if defined(_MSC_VER) || defined(__MINGW32__) || defined(__MINGW64__)
- #include <stdlib.h> // MSVC ARM64 seems to need this
+ // MSVC doesn't support C11 aligned_alloc: https://stackoverflow.com/a/62963007
  #define ALIGN_ALLOC(buf, len, align) *(void**)&(buf) = _aligned_malloc((len), align)
  #define ALIGN_FREE _aligned_free
- #elif defined(__cplusplus) && __cplusplus >= 201100 && !(defined(_MSC_VER) && (defined(__clang__) || defined(_M_ARM64) || defined(_M_ARM))) && !defined(__APPLE__)
- // C++11 method
+ #elif defined(_ISOC11_SOURCE)
+ // C11 method
  // len needs to be a multiple of alignment, although it sometimes works if it isn't...
- #include <cstdlib>
  #define ALIGN_ALLOC(buf, len, align) *(void**)&(buf) = aligned_alloc(align, ((len) + (align)-1) & ~((align)-1))
  #define ALIGN_FREE free
+ #elif defined(__cplusplus) && __cplusplus >= 201700
+ // C++17 method
+ #include <cstdlib>
+ #define ALIGN_ALLOC(buf, len, align) *(void**)&(buf) = std::aligned_alloc(align, ((len) + (align)-1) & ~((align)-1))
+ #define ALIGN_FREE free
  #else
- #include <stdlib.h>
  #define ALIGN_ALLOC(buf, len, align) if(posix_memalign((void**)&(buf), align, (len))) (buf) = NULL
  #define ALIGN_FREE free
  #endif
@@ -217,9 +221,9 @@ enum YEncDecIsaLevel {
  ISA_LEVEL_SSE41 = 0x300,
  ISA_LEVEL_SSE4_POPCNT = 0x301,
  ISA_LEVEL_AVX = 0x381, // same as above, just used as a differentiator for `cpu_supports_isa`
- ISA_LEVEL_AVX2 = 0x383, // also includes BMI1/2 and LZCNT
- ISA_LEVEL_AVX3 = 0x403, // SKX variant; AVX512VL + AVX512BW
- ISA_LEVEL_VBMI2 = 0x503 // ICL
+ ISA_LEVEL_AVX2 = 0x403, // also includes BMI1/2 and LZCNT
+ ISA_LEVEL_AVX3 = 0x503, // SKX variant; AVX512VL + AVX512BW
+ ISA_LEVEL_VBMI2 = 0x603 // ICL
  };
  #ifdef _MSC_VER
  // native tuning not supported in MSVC
@@ -249,13 +253,6 @@ enum YEncDecIsaLevel {
  # endif
  #endif

- #ifdef _MSC_VER
- # define _cpuid1(ar) __cpuid(ar, 1)
- #else
- # include <cpuid.h>
- # define _cpuid1(ar) __cpuid(1, ar[0], ar[1], ar[2], ar[3])
- #endif
-
  int cpu_supports_isa();
  #endif // PLATFORM_X86

@@ -270,7 +267,7 @@ int cpu_supports_isa();


  // GCC 8/9/10(dev) fails to optimize cases where KNOT should be used, so use intrinsic explicitly; Clang 6+ has no issue, but Clang 6/7 doesn't have the intrinsic; MSVC 2019 also fails and lacks the intrinsic
- #if defined(__GNUC__) && __GNUC__ >= 7
+ #if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
  # define KNOT16 _knot_mask16
  # define KNOT32 _knot_mask32
  #else
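Note: the block above selects an aligned-allocation strategy per platform (_aligned_malloc on MSVC/MinGW, C11 aligned_alloc where _ISOC11_SOURCE is defined, std::aligned_alloc under C++17, posix_memalign otherwise). A minimal usage sketch of the resulting macros follows; it is illustrative only — the 4096/64 size and alignment are arbitrary example values and the function name is hypothetical, not part of the package.

    #include "common.h"   // provides ALIGN_ALLOC / ALIGN_FREE as selected above

    static void aligned_buffer_example() {
        unsigned char* buf;
        ALIGN_ALLOC(buf, 4096, 64);   // 4096 bytes, 64-byte aligned; len kept a multiple of align
        if(buf) {                     // all branches leave buf NULL on failure
            // ... hand buf to SIMD routines that expect aligned storage ...
            ALIGN_FREE(buf);
        }
    }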
package/src/crc.cc CHANGED
@@ -25,8 +25,13 @@ uint32_t do_crc32_zeros(uint32_t crc1, size_t len) {
  }

  void crc_clmul_set_funcs(crc_func*);
+ void crc_clmul256_set_funcs(crc_func*);
  void crc_arm_set_funcs(crc_func*);

+ #ifdef PLATFORM_X86
+ int cpu_supports_crc_isa();
+ #endif
+
  #if defined(PLATFORM_ARM) && defined(_WIN32)
  # define WIN32_LEAN_AND_MEAN
  # include <Windows.h>
@@ -58,9 +63,10 @@ void crc_init() {
  // instance never deleted... oh well...

  #ifdef PLATFORM_X86
- int flags[4];
- _cpuid1(flags);
- if((flags[2] & 0x80202) == 0x80202) // SSE4.1 + SSSE3 + CLMUL
+ int support = cpu_supports_crc_isa();
+ if(support == 2)
+ crc_clmul256_set_funcs(&_do_crc32_incremental);
+ else if(support == 1)
  crc_clmul_set_funcs(&_do_crc32_incremental);
  #endif
  #ifdef PLATFORM_ARM
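Note: crc_init() now keys off the three-way result of the new cpu_supports_crc_isa() helper (added to platform.cc later in this diff). A minimal sketch of that mapping, assuming crc_func has the signature used by do_crc32_incremental_clmul in this diff; select_crc_kernel is an illustrative name, not part of the package.

    #include <stddef.h>
    #include <stdint.h>

    typedef uint32_t (*crc_func)(const void*, size_t, uint32_t); // assumed; defined in crc_common.h
    void crc_clmul_set_funcs(crc_func*);     // 128-bit PCLMULQDQ folding (crc_folding.cc)
    void crc_clmul256_set_funcs(crc_func*);  // 256-bit VPCLMULQDQ folding (crc_folding_256.cc)
    int cpu_supports_crc_isa();              // 0 = no CLMUL, 1 = PCLMUL, 2 = AVX2 + VPCLMULQDQ

    static void select_crc_kernel(crc_func* fn) {
        switch(cpu_supports_crc_isa()) {
            case 2: crc_clmul256_set_funcs(fn); break; // wide folding path
            case 1: crc_clmul_set_funcs(fn);    break; // classic 128-bit path
            default: break;                            // keep the generic crcutil implementation
        }
    }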
package/src/crc_arm.cc CHANGED
@@ -5,6 +5,18 @@
5
5
  HEDLEY_WARNING("CRC32 acceleration is not been enabled under ARM clang-cl by default; add `-march=armv8-a+crc` to additional compiler arguments to enable");
6
6
  #endif
7
7
 
8
+ // disable CRC on GCC versions with broken arm_acle.h
9
+ #if defined(__ARM_FEATURE_CRC32) && defined(HEDLEY_GCC_VERSION)
10
+ # if !defined(__aarch64__) && HEDLEY_GCC_VERSION_CHECK(7,0,0) && !HEDLEY_GCC_VERSION_CHECK(8,1,1)
11
+ # undef __ARM_FEATURE_CRC32
12
+ HEDLEY_WARNING("CRC32 acceleration has been disabled due to broken arm_acle.h shipped in GCC 7.0 - 8.1 [https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81497]. If you need this feature, please use a different compiler or version of GCC");
13
+ # endif
14
+ # if defined(__aarch64__) && HEDLEY_GCC_VERSION_CHECK(9,4,0) && !HEDLEY_GCC_VERSION_CHECK(9,5,0)
15
+ # undef __ARM_FEATURE_CRC32
16
+ HEDLEY_WARNING("CRC32 acceleration has been disabled due to broken arm_acle.h shipped in GCC 9.4 [https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100985]. If you need this feature, please use a different compiler or version of GCC");
17
+ # endif
18
+ #endif
19
+
8
20
  #if defined(__ARM_FEATURE_CRC32) || (defined(_M_ARM64) && !defined(__clang__)) // MSVC doesn't support CRC for ARM32
9
21
 
10
22
  /* ARMv8 accelerated CRC */
@@ -14,14 +26,30 @@ HEDLEY_WARNING("CRC32 acceleration is not been enabled under ARM clang-cl by def
14
26
  #include <arm_acle.h>
15
27
  #endif
16
28
 
29
+
30
+ #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
31
+ # ifdef __GNUC__
32
+ # define _LE16 __builtin_bswap16
33
+ # define _LE32 __builtin_bswap32
34
+ # define _LE64 __builtin_bswap64
35
+ # else
36
+ // currently not supported
37
+ # error No endian swap intrinsic defined
38
+ # endif
39
+ #else
40
+ # define _LE16(x) (x)
41
+ # define _LE32(x) (x)
42
+ # define _LE64(x) (x)
43
+ #endif
44
+
17
45
  #ifdef __aarch64__
18
46
  # define WORD_T uint64_t
19
47
  # define WORDSIZE_LOG 3 // sizeof(WORD_T) == 1<<WORDSIZE_LOG
20
- # define CRC_WORD __crc32d
48
+ # define CRC_WORD(crc, data) __crc32d(crc, _LE64(data))
21
49
  #else
22
50
  # define WORD_T uint32_t
23
51
  # define WORDSIZE_LOG 2 // sizeof(WORD_T) == 1<<WORDSIZE_LOG
24
- # define CRC_WORD __crc32w
52
+ # define CRC_WORD(crc, data) __crc32w(crc, _LE32(data))
25
53
  #endif
26
54
 
27
55
 
@@ -64,6 +92,7 @@ static const uint32_t crc_power[] = { // pre-computed 2^n, with first 3 entries
64
92
  #endif
65
93
 
66
94
 
95
+
67
96
  // inspired/stolen off https://github.com/jocover/crc32_armv8/blob/master/crc32_armv8.c
68
97
  static uint32_t arm_crc_calc(uint32_t crc, const unsigned char *src, long len) {
69
98
 
@@ -75,13 +104,13 @@ static uint32_t arm_crc_calc(uint32_t crc, const unsigned char *src, long len) {
75
104
  len--;
76
105
  }
77
106
  if ((uintptr_t)src & sizeof(uint16_t)) {
78
- crc = __crc32h(crc, *((uint16_t *)src));
107
+ crc = __crc32h(crc, _LE16(*((uint16_t *)src)));
79
108
  src += sizeof(uint16_t);
80
109
  len -= sizeof(uint16_t);
81
110
  }
82
111
  #ifdef __aarch64__
83
112
  if ((uintptr_t)src & sizeof(uint32_t)) {
84
- crc = __crc32w(crc, *((uint32_t *)src));
113
+ crc = __crc32w(crc, _LE32(*((uint32_t *)src)));
85
114
  src += sizeof(uint32_t);
86
115
  len -= sizeof(uint32_t);
87
116
  }
@@ -147,12 +176,12 @@ static uint32_t arm_crc_calc(uint32_t crc, const unsigned char *src, long len) {
147
176
 
148
177
  #ifdef __aarch64__
149
178
  if (len & sizeof(uint32_t)) {
150
- crc = __crc32w(crc, *((uint32_t *)src));
179
+ crc = __crc32w(crc, _LE32(*((uint32_t *)src)));
151
180
  src += sizeof(uint32_t);
152
181
  }
153
182
  #endif
154
183
  if (len & sizeof(uint16_t)) {
155
- crc = __crc32h(crc, *((uint16_t *)src));
184
+ crc = __crc32h(crc, _LE16(*((uint16_t *)src)));
156
185
  src += sizeof(uint16_t);
157
186
  }
158
187
  if (len & sizeof(uint8_t))
@@ -19,7 +19,7 @@
19
19
 
20
20
  #include "crc_common.h"
21
21
 
22
- #if (defined(__PCLMUL__) && defined(__SSSE3__) && defined(__SSE4_1__)) || (defined(_MSC_VER) && _MSC_VER >= 1600 && defined(PLATFORM_X86))
22
+ #if (defined(__PCLMUL__) && defined(__SSSE3__) && defined(__SSE4_1__)) || (defined(_MSC_VER) && _MSC_VER >= 1600 && defined(PLATFORM_X86) && !defined(__clang__))
23
23
  #include <inttypes.h>
24
24
  #include <immintrin.h>
25
25
  #include <wmmintrin.h>
@@ -135,33 +135,6 @@ ALIGN_TO(16, static const unsigned crc_mask[4]) = {
135
135
  0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
136
136
  };
137
137
 
138
- static __m128i reverse_bits_epi8(__m128i src) {
139
- #if defined(__GFNI__) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
140
- return _mm_gf2p8affine_epi64_epi8(src, _mm_set_epi32(
141
- 0x80402010, 0x08040201,
142
- 0x80402010, 0x08040201
143
- ), 0);
144
- #else
145
- __m128i xmm_t0 = _mm_and_si128(src, _mm_set1_epi8(0x0f));
146
- __m128i xmm_t1 = _mm_and_si128(_mm_srli_epi16(src, 4), _mm_set1_epi8(0x0f));
147
- xmm_t0 = _mm_shuffle_epi8(_mm_set_epi8(
148
- -16, 112, -80, 48, -48, 80, -112, 16, -32, 96, -96, 32, -64, 64, -128, 0
149
- //0xf0, 0x70, 0xb0, 0x30, 0xd0, 0x50, 0x90, 0x10, 0xe0, 0x60, 0xa0, 0x20, 0xc0, 0x40, 0x80, 0
150
- ), xmm_t0);
151
- xmm_t1 = _mm_shuffle_epi8(_mm_set_epi8(
152
- 15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0
153
- ), xmm_t1);
154
- return _mm_or_si128(xmm_t0, xmm_t1);
155
- #endif
156
- }
157
-
158
- #ifdef _MSC_VER
159
- // because MSVC doesn't use BSWAP unless you specifically tell it to...
160
- # include <stdlib.h>
161
- # define BSWAP32 _byteswap_ulong
162
- #else
163
- # define BSWAP32(n) ((((n)&0xff)<<24) | (((n)&0xff00)<<8) | (((n)&0xff0000)>>8) | (((n)&0xff000000)>>24))
164
- #endif
165
138
 
166
139
  static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
167
140
  unsigned long algn_diff;
@@ -170,23 +143,17 @@ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
170
143
  // TODO: consider calculating this via a LUT instead (probably faster)
171
144
  // info from https://www.reddit.com/r/ReverseEngineering/comments/2zwhl3/mystery_constant_0x9db42487_in_intels_crc32ieee/
172
145
  // firstly, calculate: xmm_crc0 = (intial * 0x487b9c8a) mod 0x104c11db7, where 0x487b9c8a = inverse(1<<512) mod 0x104c11db7
146
+ xmm_t0 = _mm_cvtsi32_si128(~initial);
173
147
 
174
- // reverse input bits + load into XMM register
175
- uint32_t init_t = BSWAP32(initial);
176
- xmm_t0 = reverse_bits_epi8(_mm_cvtsi32_si128(~init_t));
177
-
178
- xmm_t0 = _mm_clmulepi64_si128(xmm_t0, _mm_cvtsi32_si128(0x487b9c8a), 0);
179
- xmm_t1 = _mm_and_si128(xmm_t0, _mm_set_epi32(-1,-1,-1,0)); // shifted up by 32bits to avoid shifts by using clmul's capability to select top 64bits instead
148
+ xmm_t0 = _mm_clmulepi64_si128(xmm_t0, _mm_set_epi32(0, 0, 0xa273bc24, 0), 0); // reverse(0x487b9c8a)<<1 == 0xa273bc24
180
149
  xmm_t2 = _mm_set_epi32( // polynomial reduction factors
181
- 0, 0x04c11db7, // G*
182
- 1, 0x04d101df // Q+
150
+ 1, 0xdb710640, // G* = 0x04c11db7
151
+ 0, 0xf7011641 // Q+ = 0x04d101df (+1 to save an additional xor operation)
183
152
  );
184
- xmm_t1 = _mm_clmulepi64_si128(xmm_t1, xmm_t2, 0);
185
- xmm_t1 = _mm_clmulepi64_si128(xmm_t1, xmm_t2, 0x11);
153
+ xmm_t1 = _mm_clmulepi64_si128(xmm_t0, xmm_t2, 0);
154
+ xmm_t1 = _mm_clmulepi64_si128(xmm_t1, xmm_t2, 0x10);
186
155
 
187
- __m128i xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_t1);
188
- // reverse bits
189
- xmm_crc0 = _mm_shuffle_epi8(reverse_bits_epi8(xmm_crc0), _mm_set_epi32(-1,-1,-1,0x00010203));
156
+ __m128i xmm_crc0 = _mm_srli_si128(_mm_xor_si128(xmm_t0, xmm_t1), 8);
190
157
 
191
158
  __m128i xmm_crc1 = _mm_setzero_si128();
192
159
  __m128i xmm_crc2 = _mm_setzero_si128();
@@ -196,7 +163,8 @@ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
196
163
  if (len < 16) {
197
164
  if (len == 0)
198
165
  return initial;
199
- xmm_crc_part = _mm_loadu_si128((__m128i *)src);
166
+ xmm_crc_part = _mm_setzero_si128();
167
+ memcpy(&xmm_crc_part, src, len);
200
168
  goto partial;
201
169
  }
202
170
 
@@ -211,7 +179,7 @@ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
211
179
  &xmm_crc_part);
212
180
  }
213
181
 
214
- while ((len -= 64) >= 0) {
182
+ while (len >= 64) {
215
183
  xmm_t0 = _mm_load_si128((__m128i *)src);
216
184
  xmm_t1 = _mm_load_si128((__m128i *)src + 1);
217
185
  xmm_t2 = _mm_load_si128((__m128i *)src + 2);
@@ -235,13 +203,11 @@ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
235
203
  #endif
236
204
 
237
205
  src += 64;
206
+ len -= 64;
238
207
  }
239
208
 
240
- /*
241
- * len = num bytes left - 64
242
- */
243
- if (len + 16 >= 0) {
244
- len += 16;
209
+ if (len >= 48) {
210
+ len -= 48;
245
211
 
246
212
  xmm_t0 = _mm_load_si128((__m128i *)src);
247
213
  xmm_t1 = _mm_load_si128((__m128i *)src + 1);
@@ -266,8 +232,8 @@ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
266
232
  goto done;
267
233
 
268
234
  xmm_crc_part = _mm_load_si128((__m128i *)src + 3);
269
- } else if (len + 32 >= 0) {
270
- len += 32;
235
+ } else if (len >= 32) {
236
+ len -= 32;
271
237
 
272
238
  xmm_t0 = _mm_load_si128((__m128i *)src);
273
239
  xmm_t1 = _mm_load_si128((__m128i *)src + 1);
@@ -290,8 +256,8 @@ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
290
256
  goto done;
291
257
 
292
258
  xmm_crc_part = _mm_load_si128((__m128i *)src + 2);
293
- } else if (len + 48 >= 0) {
294
- len += 48;
259
+ } else if (len >= 16) {
260
+ len -= 16;
295
261
 
296
262
  xmm_t0 = _mm_load_si128((__m128i *)src);
297
263
 
@@ -310,7 +276,6 @@ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
310
276
 
311
277
  xmm_crc_part = _mm_load_si128((__m128i *)src + 1);
312
278
  } else {
313
- len += 64;
314
279
  if (len == 0)
315
280
  goto done;
316
281
  xmm_crc_part = _mm_load_si128((__m128i *)src);
@@ -0,0 +1,230 @@
1
+ // 256-bit version of crc_folding
2
+
3
+ #include "crc_common.h"
4
+
5
+ #if !defined(YENC_DISABLE_AVX256) && ((defined(__VPCLMULQDQ__) && defined(__AVX2__) && defined(__PCLMUL__)) || (defined(_MSC_VER) && _MSC_VER >= 1920 && defined(PLATFORM_X86) && !defined(__clang__)))
6
+ #include <inttypes.h>
7
+ #include <immintrin.h>
8
+
9
+
10
+ #if defined(__AVX512VL__) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
11
+ # define ENABLE_AVX512 1
12
+ #endif
13
+
14
+ static __m256i do_one_fold(__m256i src, __m256i data) {
15
+ const __m256i fold4 = _mm256_set_epi32(
16
+ 0x00000001, 0x54442bd4,
17
+ 0x00000001, 0xc6e41596,
18
+ 0x00000001, 0x54442bd4,
19
+ 0x00000001, 0xc6e41596
20
+ );
21
+ #ifdef ENABLE_AVX512
22
+ return _mm256_ternarylogic_epi32(
23
+ _mm256_clmulepi64_epi128(src, fold4, 0x01),
24
+ _mm256_clmulepi64_epi128(src, fold4, 0x10),
25
+ data,
26
+ 0x96
27
+ );
28
+ #else
29
+ return _mm256_xor_si256(data, _mm256_xor_si256(
30
+ _mm256_clmulepi64_epi128(src, fold4, 0x01),
31
+ _mm256_clmulepi64_epi128(src, fold4, 0x10)
32
+ ));
33
+ #endif
34
+ }
35
+
36
+ ALIGN_TO(32, static const uint8_t pshufb_rot_table[]) = {
37
+ 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
38
+ 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
39
+ };
40
+ // _mm256_castsi128_si256, but upper is defined to be 0
41
+ #if (defined(__clang__) && __clang_major__ >= 5 && (!defined(__APPLE__) || __clang_major__ >= 7)) || (defined(__GNUC__) && __GNUC__ >= 10)
42
+ // intrinsic unsupported in GCC 9 and MSVC < 2017
43
+ # define zext128_256 _mm256_zextsi128_si256
44
+ #else
45
+ // technically a cast is incorrect, due to upper 128 bits being undefined, but should usually work fine
46
+ // alternative may be `_mm256_set_m128i(_mm_setzero_si128(), v)` but unsupported on GCC < 7, and most compilers generate a VINSERTF128 instruction for it
47
+ # ifdef __OPTIMIZE__
48
+ # define zext128_256 _mm256_castsi128_si256
49
+ # else
50
+ # define zext128_256(x) _mm256_inserti128_si256(_mm256_setzero_si256(), x, 0)
51
+ # endif
52
+ #endif
53
+
54
+ #ifdef ENABLE_AVX512
55
+ # define MM256_BLENDV(a, b, m) _mm256_ternarylogic_epi32(a, b, m, 0xd8)
56
+ # define MM_2XOR(a, b, c) _mm_ternarylogic_epi32(a, b, c, 0x96)
57
+ #else
58
+ # define MM256_BLENDV _mm256_blendv_epi8
59
+ # define MM_2XOR(a, b, c) _mm_xor_si128(_mm_xor_si128(a, b), c)
60
+ #endif
61
+
62
+ static void partial_fold(const size_t len, __m256i *crc0, __m256i *crc1, __m256i crc_part) {
63
+ __m256i shuf = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(pshufb_rot_table + (len&15))));
64
+ __m256i mask = _mm256_cmpgt_epi8(shuf, _mm256_set1_epi8(15));
65
+
66
+ *crc0 = _mm256_shuffle_epi8(*crc0, shuf);
67
+ *crc1 = _mm256_shuffle_epi8(*crc1, shuf);
68
+ crc_part = _mm256_shuffle_epi8(crc_part, shuf);
69
+
70
+ __m256i crc_out = _mm256_permute2x128_si256(*crc0, *crc0, 0x08); // move bottom->top
71
+ __m256i crc01, crc1p;
72
+ if(len >= 16) {
73
+ crc_out = MM256_BLENDV(crc_out, *crc0, mask);
74
+ crc01 = *crc1;
75
+ crc1p = crc_part;
76
+ *crc0 = _mm256_permute2x128_si256(*crc0, *crc1, 0x21);
77
+ *crc1 = _mm256_permute2x128_si256(*crc1, crc_part, 0x21);
78
+ crc_part = zext128_256(_mm256_extracti128_si256(crc_part, 1));
79
+ } else {
80
+ crc_out = _mm256_and_si256(crc_out, mask);
81
+ crc01 = _mm256_permute2x128_si256(*crc0, *crc1, 0x21);
82
+ crc1p = _mm256_permute2x128_si256(*crc1, crc_part, 0x21);
83
+ }
84
+
85
+ *crc0 = MM256_BLENDV(*crc0, crc01, mask);
86
+ *crc1 = MM256_BLENDV(*crc1, crc1p, mask);
87
+
88
+ *crc1 = do_one_fold(crc_out, *crc1);
89
+ }
90
+
91
+
92
+ ALIGN_TO(16, static const unsigned crc_k[]) = {
93
+ 0xccaa009e, 0x00000000, /* rk1 */
94
+ 0x751997d0, 0x00000001, /* rk2 */
95
+ 0xccaa009e, 0x00000000, /* rk5 */
96
+ 0x63cd6124, 0x00000001, /* rk6 */
97
+ 0xf7011641, 0x00000000, /* rk7 */
98
+ 0xdb710640, 0x00000001 /* rk8 */
99
+ };
100
+
101
+
102
+ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
103
+ // info from https://www.reddit.com/r/ReverseEngineering/comments/2zwhl3/mystery_constant_0x9db42487_in_intels_crc32ieee/
104
+ // firstly, calculate: xmm_crc0 = (intial * 0x487b9c8a) mod 0x104c11db7, where 0x487b9c8a = inverse(1<<512) mod 0x104c11db7
105
+ __m128i xmm_t0 = _mm_cvtsi32_si128(~initial);
106
+
107
+ xmm_t0 = _mm_clmulepi64_si128(xmm_t0, _mm_set_epi32(0, 0, 0xa273bc24, 0), 0); // reverse(0x487b9c8a)<<1 == 0xa273bc24
108
+ __m128i reduction = _mm_set_epi32( // polynomial reduction factors
109
+ 1, 0xdb710640, // G* = 0x04c11db7
110
+ 0, 0xf7011641 // Q+ = 0x04d101df (+1 to save an additional xor operation)
111
+ );
112
+ __m128i xmm_t1 = _mm_clmulepi64_si128(xmm_t0, reduction, 0);
113
+ xmm_t1 = _mm_clmulepi64_si128(xmm_t1, reduction, 0x10);
114
+
115
+ xmm_t0 = _mm_srli_si128(_mm_xor_si128(xmm_t0, xmm_t1), 8);
116
+ __m256i crc0 = zext128_256(xmm_t0);
117
+ __m256i crc1 = _mm256_setzero_si256();
118
+
119
+ if (len < 32) {
120
+ if (len == 0)
121
+ return initial;
122
+ __m256i crc_part = _mm256_setzero_si256();
123
+ memcpy(&crc_part, src, len);
124
+ partial_fold(len, &crc0, &crc1, crc_part);
125
+ } else {
126
+ uintptr_t algn_diff = (0 - (uintptr_t)src) & 0x1F;
127
+ if (algn_diff) {
128
+ partial_fold(algn_diff, &crc0, &crc1, _mm256_loadu_si256((__m256i *)src));
129
+ src += algn_diff;
130
+ len -= algn_diff;
131
+ }
132
+
133
+ while (len >= 64) {
134
+ crc0 = do_one_fold(crc0, _mm256_load_si256((__m256i*)src));
135
+ crc1 = do_one_fold(crc1, _mm256_load_si256((__m256i*)src + 1));
136
+ src += 64;
137
+ len -= 64;
138
+ }
139
+
140
+ if (len >= 32) {
141
+ __m256i old = crc1;
142
+ crc1 = do_one_fold(crc0, _mm256_load_si256((__m256i*)src));
143
+ crc0 = old;
144
+
145
+ len -= 32;
146
+ src += 32;
147
+ }
148
+
149
+ if(len != 0) {
150
+ partial_fold(len, &crc0, &crc1, _mm256_load_si256((__m256i *)src));
151
+ }
152
+ }
153
+
154
+ const __m128i xmm_mask = _mm_set_epi32(-1,-1,-1,0);
155
+ __m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;
156
+
157
+ __m128i xmm_crc0 = _mm256_castsi256_si128(crc0);
158
+ __m128i xmm_crc1 = _mm256_extracti128_si256(crc0, 1);
159
+ __m128i xmm_crc2 = _mm256_castsi256_si128(crc1);
160
+ __m128i xmm_crc3 = _mm256_extracti128_si256(crc1, 1);
161
+
162
+ /*
163
+ * k1
164
+ */
165
+ crc_fold = _mm_load_si128((__m128i *)crc_k);
166
+
167
+ x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
168
+ xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
169
+ xmm_crc1 = MM_2XOR(xmm_crc1, x_tmp0, xmm_crc0);
170
+
171
+ x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10);
172
+ xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01);
173
+ xmm_crc2 = MM_2XOR(xmm_crc2, x_tmp1, xmm_crc1);
174
+
175
+ x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10);
176
+ xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01);
177
+ xmm_crc3 = MM_2XOR(xmm_crc3, x_tmp2, xmm_crc2);
178
+
179
+ /*
180
+ * k5
181
+ */
182
+ crc_fold = _mm_load_si128((__m128i *)crc_k + 1);
183
+
184
+ xmm_crc0 = xmm_crc3;
185
+ xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
186
+ xmm_crc0 = _mm_srli_si128(xmm_crc0, 8);
187
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
188
+
189
+ xmm_crc0 = xmm_crc3;
190
+ xmm_crc3 = _mm_slli_si128(xmm_crc3, 4);
191
+ xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
192
+ #ifdef ENABLE_AVX512
193
+ //xmm_crc3 = _mm_maskz_xor_epi32(14, xmm_crc3, xmm_crc0);
194
+ xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, xmm_crc0, xmm_mask, 0x28);
195
+ #else
196
+ xmm_crc0 = _mm_and_si128(xmm_crc0, xmm_mask);
197
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
198
+ #endif
199
+
200
+ /*
201
+ * k7
202
+ */
203
+ xmm_crc1 = xmm_crc3;
204
+ crc_fold = _mm_load_si128((__m128i *)crc_k + 2);
205
+
206
+ xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
207
+ xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
208
+ #ifdef ENABLE_AVX512
209
+ xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, xmm_crc1, xmm_crc1, 0xC3); // NOT(xmm_crc3 ^ xmm_crc1)
210
+ #else
211
+ xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_mask);
212
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);
213
+ #endif
214
+ return _mm_extract_epi32(xmm_crc3, 2);
215
+ }
216
+
217
+ static uint32_t do_crc32_incremental_clmul(const void* data, size_t length, uint32_t init) {
218
+ return crc_fold((const unsigned char*)data, (long)length, init);
219
+ }
220
+
221
+ void crc_clmul256_set_funcs(crc_func* _do_crc32_incremental) {
222
+ *_do_crc32_incremental = &do_crc32_incremental_clmul;
223
+ }
224
+ #else
225
+ void crc_clmul_set_funcs(crc_func* _do_crc32_incremental);
226
+ void crc_clmul256_set_funcs(crc_func* _do_crc32_incremental) {
227
+ crc_clmul_set_funcs(_do_crc32_incremental);
228
+ }
229
+ #endif
230
+
package/src/decoder.cc CHANGED
@@ -13,6 +13,7 @@ void decoder_set_sse2_funcs();
  void decoder_set_ssse3_funcs();
  void decoder_set_avx_funcs();
  void decoder_set_avx2_funcs();
+ void decoder_set_vbmi2_funcs();
  void decoder_set_neon_funcs();


@@ -44,7 +45,9 @@ void decoder_init() {
  decoder_set_native_funcs();
  # else
  int use_isa = cpu_supports_isa();
- if(use_isa >= ISA_LEVEL_AVX2)
+ if(use_isa >= ISA_LEVEL_VBMI2)
+ decoder_set_vbmi2_funcs();
+ else if(use_isa >= ISA_LEVEL_AVX2)
  decoder_set_avx2_funcs();
  else if(use_isa >= ISA_LEVEL_AVX)
  decoder_set_avx_funcs();
@@ -1,8 +1,8 @@
1
1
 
2
2
  #ifdef __AVX2__
3
3
 
4
- // GCC (ver 6-10(dev)) fails to optimize pure C version of mask testing, but has this intrinsic; Clang >= 7 optimizes C version fine
5
- #if defined(__GNUC__) && __GNUC__ >= 7
4
+ // GCC (ver 6-10(dev)) fails to optimize pure C version of mask testing, but has this intrinsic; Clang >= 7 optimizes C version fine; functions added in Clang 8
5
+ #if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
6
6
  # define KORTEST32(a, b) !_kortestz_mask32_u8((a), (b))
7
7
  # define KAND32(a, b) _kand_mask32((a), (b))
8
8
  # define KOR32(a, b) _kor_mask32((a), (b))
@@ -60,6 +60,17 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
60
60
  '.','.','.','.','.','.','.','.','.','.','.','.','.','.',_nextMask==2?0:'.',_nextMask==1?0:'.'
61
61
  );
62
62
  }
63
+
64
+ // for some reason, MSVC Win32 seems to crash when trying to compile _mm256_mask_cmpeq_epi8_mask
65
+ // the crash can be fixed by switching the order of the last two arguments, but it seems to generate wrong code
66
+ // so just disable the optimisation as it seems to be problematic there
67
+ #if defined(__AVX512VL__) && defined(__AVX512BW__)
68
+ # if defined(_MSC_VER) && !defined(PLATFORM_AMD64) && !defined(__clang__)
69
+ const bool useAVX3MaskCmp = false;
70
+ # else
71
+ const bool useAVX3MaskCmp = (use_isa >= ISA_LEVEL_AVX3);
72
+ # endif
73
+ #endif
63
74
  intptr_t i;
64
75
  for(i = -len; i; i += sizeof(__m256i)*2) {
65
76
  __m256i oDataA = _mm256_load_si256((__m256i *)(src+i));
@@ -126,7 +137,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
126
137
  __mmask32 match2EqMaskA, match2EqMaskB;
127
138
  __mmask32 match0CrMaskA, match0CrMaskB;
128
139
  __mmask32 match2CrXDtMaskA, match2CrXDtMaskB;
129
- if(use_isa >= ISA_LEVEL_AVX3 && searchEnd) {
140
+ if(useAVX3MaskCmp && searchEnd) {
130
141
  match2EqMaskA = _mm256_cmpeq_epi8_mask(_mm256_set1_epi8('='), tmpData2A);
131
142
  match2EqMaskB = _mm256_cmpeq_epi8_mask(_mm256_set1_epi8('='), tmpData2B);
132
143
  } else
@@ -142,7 +153,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
142
153
  // find patterns of \r_.
143
154
 
144
155
  #if defined(__AVX512VL__) && defined(__AVX512BW__)
145
- if(use_isa >= ISA_LEVEL_AVX3) {
156
+ if(useAVX3MaskCmp) {
146
157
  match0CrMaskA = _mm256_cmpeq_epi8_mask(oDataA, _mm256_set1_epi8('\r'));
147
158
  match0CrMaskB = _mm256_cmpeq_epi8_mask(oDataB, _mm256_set1_epi8('\r'));
148
159
  match2CrXDtMaskA = _mm256_mask_cmpeq_epi8_mask(match0CrMaskA, tmpData2A, _mm256_set1_epi8('.'));
@@ -172,7 +183,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
172
183
  #if defined(__AVX512VL__) && defined(__AVX512BW__)
173
184
  __mmask32 match1NlMaskA, match1NlMaskB;
174
185
  __mmask32 match2NlDotMaskA, match2NlDotMaskB;
175
- if(use_isa >= ISA_LEVEL_AVX3) {
186
+ if(useAVX3MaskCmp) {
176
187
  match1NlMaskA = _mm256_mask_cmpeq_epi8_mask(
177
188
  match0CrMaskA,
178
189
  _mm256_set1_epi8('\n'),
@@ -228,7 +239,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
228
239
 
229
240
  int matchEnd;
230
241
  #if defined(__AVX512VL__) && defined(__AVX512BW__)
231
- if(use_isa >= ISA_LEVEL_AVX3) {
242
+ if(useAVX3MaskCmp) {
232
243
  __mmask32 match3EqYMaskA = _mm256_mask_cmpeq_epi8_mask(
233
244
  match2EqMaskA,
234
245
  _mm256_set1_epi8('y'),
@@ -307,7 +318,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
307
318
  }
308
319
  }
309
320
  #if defined(__AVX512VL__) && defined(__AVX512BW__)
310
- if(use_isa >= ISA_LEVEL_AVX3) {
321
+ if(useAVX3MaskCmp) {
311
322
  mask |= (uint64_t)match2NlDotMaskA << 2;
312
323
  mask |= (uint64_t)match2NlDotMaskB << 34;
313
324
  minMask = _mm256_maskz_mov_epi8(~(match2NlDotMaskB>>30), _mm256_set1_epi8('.'));
@@ -325,7 +336,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
325
336
  __m256i match3EqYA, match3EqYB;
326
337
  #if defined(__AVX512VL__) && defined(__AVX512BW__)
327
338
  __mmask32 match3EqYMaskA, match3EqYMaskB;
328
- if(use_isa >= ISA_LEVEL_AVX3) {
339
+ if(useAVX3MaskCmp) {
329
340
  match3EqYMaskA = _mm256_mask_cmpeq_epi8_mask(
330
341
  match2EqMaskA,
331
342
  _mm256_set1_epi8('y'),
@@ -355,7 +366,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
355
366
  if(LIKELIHOOD(0.002, partialEndFound)) {
356
367
  bool endFound;
357
368
  #if defined(__AVX512VL__) && defined(__AVX512BW__)
358
- if(use_isa >= ISA_LEVEL_AVX3) {
369
+ if(useAVX3MaskCmp) {
359
370
  __mmask32 match3LfEqYMaskA = _mm256_mask_cmpeq_epi8_mask(
360
371
  match3EqYMaskA,
361
372
  _mm256_set1_epi8('\n'),
@@ -19,14 +19,14 @@
19
19
  #endif
20
20
 
21
21
 
22
- // for compilers that lack these functions
23
- #if defined(__clang__) || (defined(__GNUC__) && (defined(__aarch64__) && __GNUC__ >= 8))
22
+ // for compilers that lack these functions (Clang armv7 9-12 seems to have issues with multi-vector loads)
23
+ #if (defined(__clang__) && (defined(__aarch64__) || __clang_major__<9 || __clang_major__>12)) || (defined(__GNUC__) && (defined(__aarch64__) && __GNUC__ >= 8))
24
24
  # define vld1q_u8_x2_align(p, n) vld1q_u8_x2((uint8_t*)__builtin_assume_aligned(p, n))
25
25
  #else
26
26
  # define vld1q_u8_x2_align(p, n) vcreate2_u8(vld1q_u8_align(p, (n)/2), vld1q_u8_align((p)+16, (n)/2))
27
27
  #endif
28
28
  // Clang wrongly assumes alignment on vld1q_u8_x2, and ARMv7 GCC doesn't support the function, so effectively, it can only be used in ARMv8 compilers
29
- #if defined(__aarch64__) && (defined(__clang__) || (defined(__GNUC__) && __GNUC__ >= 9))
29
+ #if defined(__aarch64__) && (defined(__clang__) || HEDLEY_GCC_VERSION_CHECK(8,5,0))
30
30
  # define vst1q_u8_x2_unaligned vst1q_u8_x2
31
31
  #else
32
32
  static HEDLEY_ALWAYS_INLINE void vst1q_u8_x2_unaligned(uint8_t* p, uint8x16x2_t data) {
@@ -10,9 +10,9 @@ static struct { char bytes[16]; } ALIGN_TO(16, compactLUT[32768]);
10
10
  static uint8_t eqFixLUT[256];
11
11
 
12
12
 
13
-
14
- #if !defined(__clang__) && !defined(_MSC_VER) && (!defined(__aarch64__) || !HEDLEY_GCC_VERSION_CHECK(10,0,0))
15
- static HEDLEY_ALWAYS_INLINE uint8x16x4_t vld1q_u8_x4(const uint8_t* p) {
13
+ // AArch64 GCC lacks these functions until 8.5, 9.4 and 10.1 (10.0 unknown)
14
+ #if !defined(__clang__) && !defined(_MSC_VER) && (!defined(__aarch64__) || !(HEDLEY_GCC_VERSION_CHECK(9,4,0) || (!HEDLEY_GCC_VERSION_CHECK(9,0,0) && HEDLEY_GCC_VERSION_CHECK(8,5,0))))
15
+ static HEDLEY_ALWAYS_INLINE uint8x16x4_t _vld1q_u8_x4(const uint8_t* p) {
16
16
  uint8x16x4_t ret;
17
17
  ret.val[0] = vld1q_u8(p);
18
18
  ret.val[1] = vld1q_u8(p+16);
@@ -20,12 +20,15 @@ static HEDLEY_ALWAYS_INLINE uint8x16x4_t vld1q_u8_x4(const uint8_t* p) {
20
20
  ret.val[3] = vld1q_u8(p+48);
21
21
  return ret;
22
22
  }
23
- static HEDLEY_ALWAYS_INLINE void vst1q_u8_x4(uint8_t* p, uint8x16x4_t data) {
23
+ static HEDLEY_ALWAYS_INLINE void _vst1q_u8_x4(uint8_t* p, uint8x16x4_t data) {
24
24
  vst1q_u8(p, data.val[0]);
25
25
  vst1q_u8(p+16, data.val[1]);
26
26
  vst1q_u8(p+32, data.val[2]);
27
27
  vst1q_u8(p+48, data.val[3]);
28
28
  }
29
+ #else
30
+ # define _vld1q_u8_x4 vld1q_u8_x4
31
+ # define _vst1q_u8_x4 vst1q_u8_x4
29
32
  #endif
30
33
 
31
34
 
@@ -55,7 +58,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
55
58
  uint8x16_t yencOffset = escFirst ? vmakeq_u8(42+64,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42) : vdupq_n_u8(42);
56
59
  long i;
57
60
  for(i = -len; i; i += sizeof(uint8x16_t)*4) {
58
- uint8x16x4_t data = vld1q_u8_x4(src+i);
61
+ uint8x16x4_t data = _vld1q_u8_x4(src+i);
59
62
  uint8x16_t dataA = data.val[0];
60
63
  uint8x16_t dataB = data.val[1];
61
64
  uint8x16_t dataC = data.val[2];
@@ -421,7 +424,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
421
424
  dataB = vsubq_u8(dataB, vdupq_n_u8(42));
422
425
  dataC = vsubq_u8(dataC, vdupq_n_u8(42));
423
426
  dataD = vsubq_u8(dataD, vdupq_n_u8(42));
424
- vst1q_u8_x4(p, vcreate4_u8(dataA, dataB, dataC, dataD));
427
+ _vst1q_u8_x4(p, vcreate4_u8(dataA, dataB, dataC, dataD));
425
428
  p += sizeof(uint8x16_t)*4;
426
429
  escFirst = 0;
427
430
  yencOffset = vdupq_n_u8(42);
@@ -8,7 +8,7 @@
8
8
  #endif
9
9
 
10
10
  // GCC (ver 6-10(dev)) fails to optimize pure C version of mask testing, but has this intrinsic; Clang >= 7 optimizes C version fine
11
- #if defined(__GNUC__) && __GNUC__ >= 7
11
+ #if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
12
12
  # define KORTEST16(a, b) !_kortestz_mask16_u8((a), (b))
13
13
  # define KAND16(a, b) _kand_mask16((a), (b))
14
14
  # define KOR16(a, b) _kor_mask16((a), (b))
@@ -112,7 +112,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
112
112
  -42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42-64
113
113
  ) : _mm_set1_epi8(-42);
114
114
 
115
- #if defined(__SSSE3__) && !defined(__tune_atom__) && !defined(__tune_slm__) && !defined(__tune_btver1__)
115
+ #if defined(__SSSE3__) && !defined(__tune_atom__) && !defined(__tune_slm__) && !defined(__tune_btver1__) && !defined(__tune_btver2__)
116
116
  const bool _USING_FAST_MATCH = (use_isa >= ISA_LEVEL_SSSE3);
117
117
  #else
118
118
  const bool _USING_FAST_MATCH = false;
@@ -121,6 +121,13 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
121
121
  const bool _USING_BLEND_ADD = (use_isa >= ISA_LEVEL_SSE41);
122
122
  #else
123
123
  const bool _USING_BLEND_ADD = false;
124
+ #endif
125
+ #if defined(__AVX512VL__) && defined(__AVX512BW__)
126
+ # if defined(_MSC_VER) && !defined(PLATFORM_AMD64) && !defined(__clang__)
127
+ const bool useAVX3MaskCmp = false;
128
+ # else
129
+ const bool useAVX3MaskCmp = (use_isa >= ISA_LEVEL_AVX3);
130
+ # endif
124
131
  #endif
125
132
 
126
133
  __m128i lfCompare = _mm_set1_epi8('\n');
@@ -214,7 +221,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
214
221
  __mmask16 match2EqMaskA, match2EqMaskB;
215
222
  __mmask16 match0CrMaskA, match0CrMaskB;
216
223
  __mmask16 match2CrXDtMaskA, match2CrXDtMaskB;
217
- if(use_isa >= ISA_LEVEL_AVX3 && searchEnd) {
224
+ if(useAVX3MaskCmp && searchEnd) {
218
225
  match2EqMaskA = _mm_cmpeq_epi8_mask(_mm_set1_epi8('='), tmpData2A);
219
226
  match2EqMaskB = _mm_cmpeq_epi8_mask(_mm_set1_epi8('='), tmpData2B);
220
227
  } else
@@ -230,7 +237,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
230
237
  __m128i match2CrXDtA, match2CrXDtB;
231
238
  if(isRaw) {
232
239
  #if defined(__AVX512VL__) && defined(__AVX512BW__)
233
- if(use_isa >= ISA_LEVEL_AVX3) {
240
+ if(useAVX3MaskCmp) {
234
241
  match0CrMaskA = _mm_cmpeq_epi8_mask(oDataA, _mm_set1_epi8('\r'));
235
242
  match0CrMaskB = _mm_cmpeq_epi8_mask(oDataB, _mm_set1_epi8('\r'));
236
243
  match2CrXDtMaskA = _mm_mask_cmpeq_epi8_mask(match0CrMaskA, tmpData2A, _mm_set1_epi8('.'));
@@ -256,7 +263,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
256
263
  #if defined(__AVX512VL__) && defined(__AVX512BW__)
257
264
  __mmask16 match1NlMaskA, match1NlMaskB;
258
265
  __mmask16 match2NlDotMaskA, match2NlDotMaskB;
259
- if(use_isa >= ISA_LEVEL_AVX3) {
266
+ if(useAVX3MaskCmp) {
260
267
  match1NlMaskA = _mm_mask_cmpeq_epi8_mask(
261
268
  match0CrMaskA,
262
269
  _mm_set1_epi8('\n'),
@@ -299,7 +306,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
299
306
 
300
307
  int matchEnd;
301
308
  #if defined(__AVX512VL__) && defined(__AVX512BW__)
302
- if(use_isa >= ISA_LEVEL_AVX3) {
309
+ if(useAVX3MaskCmp) {
303
310
  __mmask16 match3EqYMaskA = _mm_mask_cmpeq_epi8_mask(
304
311
  match2EqMaskA, _mm_set1_epi8('y'), tmpData3A
305
312
  );
@@ -373,7 +380,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
373
380
  }
374
381
  }
375
382
  #if defined(__AVX512VL__) && defined(__AVX512BW__)
376
- if(use_isa >= ISA_LEVEL_AVX3) {
383
+ if(useAVX3MaskCmp) {
377
384
  mask |= match2NlDotMaskA << 2;
378
385
  mask |= (match2NlDotMaskB << 18) & 0xffffffff;
379
386
  minMask = _mm_maskz_mov_epi8(~(match2NlDotMaskB>>14), _mm_set1_epi8('.'));
@@ -398,7 +405,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
398
405
  __m128i match3EqYA, match3EqYB;
399
406
  #if defined(__AVX512VL__) && defined(__AVX512BW__)
400
407
  __mmask16 match3EqYMaskA, match3EqYMaskB;
401
- if(use_isa >= ISA_LEVEL_AVX3) {
408
+ if(useAVX3MaskCmp) {
402
409
  match3EqYMaskA = _mm_mask_cmpeq_epi8_mask(
403
410
  match2EqMaskA,
404
411
  _mm_set1_epi8('y'),
@@ -434,7 +441,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
434
441
  bool endFound;
435
442
 
436
443
  #if defined(__AVX512VL__) && defined(__AVX512BW__)
437
- if(use_isa >= ISA_LEVEL_AVX3) {
444
+ if(useAVX3MaskCmp) {
438
445
  __mmask16 match3LfEqYMaskA = _mm_mask_cmpeq_epi8_mask(
439
446
  match3EqYMaskA,
440
447
  _mm_set1_epi8('\n'),
@@ -0,0 +1,30 @@
1
+ #include "common.h"
2
+
3
+ #if defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
4
+ # include "decoder_common.h"
5
+ # ifndef YENC_DISABLE_AVX256
6
+ # include "decoder_avx2_base.h"
7
+ void decoder_set_vbmi2_funcs() {
8
+ ALIGN_ALLOC(lookups, sizeof(*lookups), 16);
9
+ // TODO: consider removing compact LUT
10
+ decoder_init_lut(lookups->eqFix, lookups->compact);
11
+ _do_decode = &do_decode_simd<false, false, sizeof(__m256i)*2, do_decode_avx2<false, false, ISA_LEVEL_VBMI2> >;
12
+ _do_decode_raw = &do_decode_simd<true, false, sizeof(__m256i)*2, do_decode_avx2<true, false, ISA_LEVEL_VBMI2> >;
13
+ _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m256i)*2, do_decode_avx2<true, true, ISA_LEVEL_VBMI2> >;
14
+ }
15
+ # else
16
+ # include "decoder_sse_base.h"
17
+ void decoder_set_vbmi2_funcs() {
18
+ decoder_sse_init();
19
+ decoder_init_lut(lookups->eqFix, lookups->compact);
20
+ _do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_VBMI2> >;
21
+ _do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_VBMI2> >;
22
+ _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_VBMI2> >;
23
+ }
24
+ # endif
25
+ #else
26
+ void decoder_set_avx2_funcs();
27
+ void decoder_set_vbmi2_funcs() {
28
+ decoder_set_avx2_funcs();
29
+ }
30
+ #endif
package/src/encoder.cc CHANGED
@@ -128,6 +128,7 @@ void encoder_sse2_init();
  void encoder_ssse3_init();
  void encoder_avx_init();
  void encoder_avx2_init();
+ void encoder_vbmi2_init();
  void encoder_neon_init();

  #if defined(PLATFORM_X86) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
@@ -153,7 +154,9 @@ void encoder_init() {
  encoder_native_init();
  # else
  int use_isa = cpu_supports_isa();
- if(use_isa >= ISA_LEVEL_AVX2)
+ if(use_isa >= ISA_LEVEL_VBMI2)
+ encoder_vbmi2_init();
+ else if(use_isa >= ISA_LEVEL_AVX2)
  encoder_avx2_init();
  else if(use_isa >= ISA_LEVEL_AVX)
  encoder_avx_init();
@@ -6,7 +6,7 @@
6
6
  #include "encoder_common.h"
7
7
  #define YMM_SIZE 32
8
8
 
9
- #if defined(__GNUC__) && __GNUC__ >= 7
9
+ #if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
10
10
  # define KLOAD32(a, offs) _load_mask32((__mmask32*)(a) + (offs))
11
11
  #else
12
12
  # define KLOAD32(a, offs) (((uint32_t*)(a))[(offs)])
@@ -293,7 +293,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
293
293
  asm(
294
294
  "shrq $1, %[eqMask] \n"
295
295
  "shrq %%cl, %[eqMask] \n"
296
- "adcq %[col], %[p] \n"
296
+ "adcq %q[col], %q[p] \n"
297
297
  : [eqMask]"+r"(eqMask), [p]"+r"(p)
298
298
  : "c"(shiftAmt), [col]"r"(~col)
299
299
  );
@@ -5,7 +5,7 @@
5
5
  #include "encoder_common.h"
6
6
 
7
7
  // Clang wrongly assumes alignment on vst1q_u8_x2, and ARMv7 GCC doesn't support the function, so effectively, it can only be used in ARMv8 compilers
8
- #if defined(__aarch64__) && (defined(__clang__) || (defined(__GNUC__) && __GNUC__ >= 9))
8
+ #if defined(__aarch64__) && (defined(__clang__) || HEDLEY_GCC_VERSION_CHECK(8,5,0))
9
9
  # define vst1q_u8_x2_unaligned vst1q_u8_x2
10
10
  #else
11
11
  static HEDLEY_ALWAYS_INLINE void vst1q_u8_x2_unaligned(uint8_t* p, uint8x16x2_t data) {
@@ -8,7 +8,7 @@
8
8
  # define _mm_mask_expand_epi8 _mm128_mask_expand_epi8
9
9
  #endif
10
10
 
11
- #if defined(__GNUC__) && __GNUC__ >= 7
11
+ #if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
12
12
  # define KLOAD16(a, offs) _load_mask16((__mmask16*)(a) + (offs))
13
13
  #else
14
14
  # define KLOAD16(a, offs) (((uint16_t*)(a))[(offs)])
@@ -155,7 +155,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
155
155
  if(len <= INPUT_OFFSET || line_size < XMM_SIZE) return;
156
156
 
157
157
  // slower CPUs prefer to branch as mispredict penalty is probably small relative to general execution
158
- #if defined(__tune_atom__) || defined(__tune_slm__) || defined(__tune_btver1__)
158
+ #if defined(__tune_atom__) || defined(__tune_slm__) || defined(__tune_btver1__) || defined(__tune_btver2__)
159
159
  const bool _PREFER_BRANCHING = true;
160
160
  #else
161
161
  const bool _PREFER_BRANCHING = (use_isa < ISA_LEVEL_SSSE3);
@@ -412,8 +412,8 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
412
412
  asm(
413
413
  "shrl $1, %[eqMask] \n"
414
414
  "shrl %%cl, %[eqMask] \n" // TODO: can use shrq to avoid above shift?
415
- # if defined(PLATFORM_AMD64)
416
- "adcq %[col], %[p] \n"
415
+ # if defined(PLATFORM_AMD64) && !defined(__ILP32__)
416
+ "adcq %q[col], %q[p] \n"
417
417
  # else
418
418
  "adcl %[col], %[p] \n"
419
419
  # endif
@@ -539,7 +539,6 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
539
539
  dataA = _mm_shuffle_epi8(dataA, shufMaskA);
540
540
 
541
541
  # if defined(__SSE4_1__) && !defined(__tune_slm__) && !defined(__tune_goldmont__) && !defined(__tune_goldmont_plus__) && !defined(__tune_tremont__)
542
- // unsure if worth on: Jaguar/Puma (3|2), Core2 (2|2)
543
542
  if(use_isa >= ISA_LEVEL_SSE41) {
544
543
  dataB = _mm_blendv_epi8(dataBShifted, dataB, mergeMaskB);
545
544
  } else
@@ -0,0 +1,23 @@
1
+ #include "common.h"
2
+
3
+ #if defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
4
+ # ifndef YENC_DISABLE_AVX256
5
+ # include "encoder_avx_base.h"
6
+
7
+ void encoder_vbmi2_init() {
8
+ _do_encode = &do_encode_simd< do_encode_avx2<ISA_LEVEL_VBMI2> >;
9
+ encoder_avx2_lut<ISA_LEVEL_VBMI2>();
10
+ }
11
+ # else
12
+ # include "encoder_sse_base.h"
13
+ void encoder_vbmi2_init() {
14
+ _do_encode = &do_encode_simd< do_encode_sse<ISA_LEVEL_VBMI2> >;
15
+ encoder_sse_lut<ISA_LEVEL_VBMI2>();
16
+ }
17
+ # endif
18
+ #else
19
+ void encoder_avx2_init();
20
+ void encoder_vbmi2_init() {
21
+ encoder_avx2_init();
22
+ }
23
+ #endif
package/src/platform.cc CHANGED
@@ -55,6 +55,7 @@ bool cpu_supports_neon() {

  #ifdef PLATFORM_X86
  #ifdef _MSC_VER
+ # define _cpuid1(ar) __cpuid(ar, 1)
  # define _cpuid1x(ar) __cpuid(ar, 0x80000001)
  # if _MSC_VER >= 1600
  # define _cpuidX __cpuidex
@@ -66,6 +67,8 @@ bool cpu_supports_neon() {
  # define _GET_XCR() 0
  # endif
  #else
+ # include <cpuid.h>
+ # define _cpuid1(ar) __cpuid(1, ar[0], ar[1], ar[2], ar[3])
  # define _cpuid1x(ar) __cpuid(0x80000001, ar[0], ar[1], ar[2], ar[3])
  # define _cpuidX(ar, eax, ecx) __cpuid_count(eax, ecx, ar[0], ar[1], ar[2], ar[3])
  static inline int _GET_XCR() {
@@ -112,8 +115,6 @@ int cpu_supports_isa() {
  // AMD Bobcat with slow SSSE3 instructions - pretend it doesn't exist
  return ret | ISA_LEVEL_SSE2;

- // Jaguar/Puma performance unkown (slowish PSHUFB/PBLENDVB)
-
  if((flags[2] & 0x200) == 0x200) { // SSSE3
  if(family == 6 && (model == 0x5c || model == 0x5f || model == 0x7a || model == 0x9c))
  // Intel Goldmont/plus / Tremont with slow PBLENDVB
@@ -144,4 +145,24 @@ int cpu_supports_isa() {
  return ret | ISA_LEVEL_SSE2;
  }

+ int cpu_supports_crc_isa() {
+ int flags[4];
+ _cpuid1(flags);
+
+ if((flags[2] & 0x80202) == 0x80202) { // SSE4.1 + SSSE3 + CLMUL
+ if((flags[2] & 0x18000000) == 0x18000000) { // OSXSAVE + AVX
+ int xcr = _GET_XCR() & 0xff; // ignore unused bits
+ if((xcr & 6) == 6) { // AVX enabled
+ int cpuInfo[4];
+ _cpuidX(cpuInfo, 7, 0);
+ if((cpuInfo[1] & 0x20) == 0x20 && (cpuInfo[2] & 0x400) == 0x400) { // AVX2 + VPCLMULQDQ
+ return 2;
+ }
+ }
+ }
+ return 1;
+ }
+ return 0;
+ }
+
  #endif // PLATFORM_X86
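Note: the hex masks tested in cpu_supports_crc_isa() above are standard CPUID/XCR0 feature bits. An annotated breakdown for reference; the enum names are illustrative and not part of the source.

    enum {
        CPUID1_ECX_PCLMULQDQ  = 1u << 1,   // 0x00000002
        CPUID1_ECX_SSSE3      = 1u << 9,   // 0x00000200
        CPUID1_ECX_SSE41      = 1u << 19,  // 0x00080000  -- these three OR to 0x80202
        CPUID1_ECX_OSXSAVE    = 1u << 27,  // 0x08000000
        CPUID1_ECX_AVX        = 1u << 28,  // 0x10000000  -- these two OR to 0x18000000
        CPUID7_EBX_AVX2       = 1u << 5,   // 0x00000020  (leaf 7, subleaf 0, EBX)
        CPUID7_ECX_VPCLMULQDQ = 1u << 10,  // 0x00000400  (leaf 7, subleaf 0, ECX)
        XCR0_XMM_YMM_STATE    = 0x6        // XCR0 bits 1 and 2: OS preserves SSE and AVX state
    };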
package/test/testcrc.js CHANGED
@@ -50,4 +50,18 @@ doTest('Random', 'crc32', 'fj[-oqijnw34-59n26 4345j8yn89032q78t9ab9gabh023quhoiB
  doTest('Random Continue', 'crc32', ['KZSHZ5EDOVAmDdakZZOrGSUGGKSpCJoWH7M0MHy6ohnSzvHY4DjpxXmyfWYJQoJ7tKdNhGcuRVUzrgXM', ycrc32('BdenbmoBgiB10ZkeUBjrsZV3dg2Da2fhHqU9TMdi69AHhLRck3Nk60YuFBXh6lvtefBpjdTxbeEmsaEm')], crc32('BdenbmoBgiB10ZkeUBjrsZV3dg2Da2fhHqU9TMdi69AHhLRck3Nk60YuFBXh6lvtefBpjdTxbeEmsaEmKZSHZ5EDOVAmDdakZZOrGSUGGKSpCJoWH7M0MHy6ohnSzvHY4DjpxXmyfWYJQoJ7tKdNhGcuRVUzrgXM'));


+ // random tests
+ for(var i=1; i<128; i++) {
+ var rand = require('crypto').pseudoRandomBytes(i);
+ doTest('Random Short Buffer', 'crc32', rand);
+ }
+ for(var i=0; i<32; i++) {
+ var rand = require('crypto').pseudoRandomBytes(100000);
+ doTest('Random Buffer', 'crc32', rand);
+
+ var split = Math.random()*rand.length;
+ doTest('Random Continue Buffer', 'crc32', [rand.slice(split), ycrc32(rand.slice(0, split))], crc32(rand));
+ }
+
+
  console.log('All tests passed');