yencode 1.1.2 → 1.1.4
- package/README.md +3 -2
- package/binding.gyp +141 -6
- package/index.js +21 -19
- package/package.json +2 -1
- package/src/common.h +34 -19
- package/src/crc.cc +138 -11
- package/src/crc_arm.cc +42 -7
- package/src/crc_folding.cc +18 -53
- package/src/crc_folding_256.cc +229 -0
- package/src/decoder.cc +8 -4
- package/src/decoder.h +5 -5
- package/src/decoder_avx2_base.h +30 -13
- package/src/decoder_common.h +5 -5
- package/src/decoder_neon.cc +4 -4
- package/src/decoder_neon64.cc +10 -7
- package/src/decoder_sse_base.h +26 -12
- package/src/decoder_vbmi2.cc +37 -0
- package/src/encoder.cc +10 -1
- package/src/encoder_avx_base.h +24 -16
- package/src/encoder_neon.cc +40 -41
- package/src/encoder_rvv.cc +219 -0
- package/src/encoder_sse_base.h +7 -8
- package/src/encoder_vbmi2.cc +30 -0
- package/src/hedley.h +278 -135
- package/src/platform.cc +79 -10
- package/src/test_alignalloc.c +6 -0
- package/test/_speedbase.js +12 -11
- package/test/speeddec.js +6 -5
- package/test/testcrc.js +14 -0
- package/test/testdec.js +30 -14
- package/test/testenc.js +10 -7
- package/test/testpostdec.js +6 -5
package/src/crc_arm.cc
CHANGED
@@ -5,6 +5,24 @@
 HEDLEY_WARNING("CRC32 acceleration is not been enabled under ARM clang-cl by default; add `-march=armv8-a+crc` to additional compiler arguments to enable");
 #endif
 
+// disable CRC on GCC versions with broken arm_acle.h
+#if defined(__ARM_FEATURE_CRC32) && defined(HEDLEY_GCC_VERSION)
+# if !defined(__aarch64__) && HEDLEY_GCC_VERSION_CHECK(7,0,0) && !HEDLEY_GCC_VERSION_CHECK(8,1,1)
+# undef __ARM_FEATURE_CRC32
+HEDLEY_WARNING("CRC32 acceleration has been disabled due to broken arm_acle.h shipped in GCC 7.0 - 8.1 [https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81497]. If you need this feature, please use a different compiler or version of GCC");
+# endif
+# if defined(__aarch64__) && HEDLEY_GCC_VERSION_CHECK(9,4,0) && !HEDLEY_GCC_VERSION_CHECK(9,5,0)
+# undef __ARM_FEATURE_CRC32
+HEDLEY_WARNING("CRC32 acceleration has been disabled due to broken arm_acle.h shipped in GCC 9.4 [https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100985]. If you need this feature, please use a different compiler or version of GCC");
+# endif
+#endif
+#if defined(__ARM_FEATURE_CRC32) && defined(__has_include)
+# if !__has_include(<arm_acle.h>)
+# undef __ARM_FEATURE_CRC32
+HEDLEY_WARNING("CRC32 acceleration has been disabled due to missing arm_acle.h");
+# endif
+#endif
+
 #if defined(__ARM_FEATURE_CRC32) || (defined(_M_ARM64) && !defined(__clang__)) // MSVC doesn't support CRC for ARM32
 
 /* ARMv8 accelerated CRC */
@@ -14,14 +32,30 @@ HEDLEY_WARNING("CRC32 acceleration is not been enabled under ARM clang-cl by def
 #include <arm_acle.h>
 #endif
 
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+# ifdef __GNUC__
+# define _LE16 __builtin_bswap16
+# define _LE32 __builtin_bswap32
+# define _LE64 __builtin_bswap64
+# else
+// currently not supported
+# error No endian swap intrinsic defined
+# endif
+#else
+# define _LE16(x) (x)
+# define _LE32(x) (x)
+# define _LE64(x) (x)
+#endif
+
 #ifdef __aarch64__
 # define WORD_T uint64_t
 # define WORDSIZE_LOG 3 // sizeof(WORD_T) == 1<<WORDSIZE_LOG
-# define CRC_WORD __crc32d
+# define CRC_WORD(crc, data) __crc32d(crc, _LE64(data))
 #else
 # define WORD_T uint32_t
 # define WORDSIZE_LOG 2 // sizeof(WORD_T) == 1<<WORDSIZE_LOG
-# define CRC_WORD __crc32w
+# define CRC_WORD(crc, data) __crc32w(crc, _LE32(data))
 #endif
 
 
@@ -45,7 +79,7 @@ static HEDLEY_ALWAYS_INLINE uint32_t crc_multiply(uint32_t a, uint32_t b) {
 return res;
 }
 
-static const uint32_t crc_power[] = { // pre-computed 2^n, with first 3 entries removed (saves a shift)
+static const uint32_t crc_power[] = { // pre-computed 2^(2^n), with first 3 entries removed (saves a shift)
 0x00800000, 0x00008000, 0xedb88320, 0xb1e6b092, 0xa06a2517, 0xed627dae, 0x88d14467, 0xd7bbfe6a,
 0xec447f11, 0x8e7ea170, 0x6427800e, 0x4d47bae0, 0x09fe548f, 0x83852d0f, 0x30362f1a, 0x7b5a9cc3,
 0x31fec169, 0x9fec022a, 0x6c8dedc4, 0x15d6874d, 0x5fde7a4e, 0xbad90e37, 0x2e4e5eef, 0x4eaba214,
@@ -64,6 +98,7 @@ static const uint32_t crc_power[] = { // pre-computed 2^n, with first 3 entries
 #endif
 
 
+
 // inspired/stolen off https://github.com/jocover/crc32_armv8/blob/master/crc32_armv8.c
 static uint32_t arm_crc_calc(uint32_t crc, const unsigned char *src, long len) {
 
@@ -75,13 +110,13 @@ static uint32_t arm_crc_calc(uint32_t crc, const unsigned char *src, long len) {
 len--;
 }
 if ((uintptr_t)src & sizeof(uint16_t)) {
-crc = __crc32h(crc, *((uint16_t *)src));
+crc = __crc32h(crc, _LE16(*((uint16_t *)src)));
 src += sizeof(uint16_t);
 len -= sizeof(uint16_t);
 }
 #ifdef __aarch64__
 if ((uintptr_t)src & sizeof(uint32_t)) {
-crc = __crc32w(crc, *((uint32_t *)src));
+crc = __crc32w(crc, _LE32(*((uint32_t *)src)));
 src += sizeof(uint32_t);
 len -= sizeof(uint32_t);
 }
@@ -147,12 +182,12 @@ static uint32_t arm_crc_calc(uint32_t crc, const unsigned char *src, long len) {
 
 #ifdef __aarch64__
 if (len & sizeof(uint32_t)) {
-crc = __crc32w(crc, *((uint32_t *)src));
+crc = __crc32w(crc, _LE32(*((uint32_t *)src)));
 src += sizeof(uint32_t);
 }
 #endif
 if (len & sizeof(uint16_t)) {
-crc = __crc32h(crc, *((uint16_t *)src));
+crc = __crc32h(crc, _LE16(*((uint16_t *)src)));
 src += sizeof(uint16_t);
 }
 if (len & sizeof(uint8_t))
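The point of the crc_arm.cc change is that every value fed to a CRC32 intrinsic now goes through a `_LE16`/`_LE32`/`_LE64` byte-swap macro, so the same code also produces correct results on big-endian ARM. As a rough standalone sketch (not code from the package; assumes a compiler targeting ARMv8 with the CRC extension, e.g. `-march=armv8-a+crc`), this is roughly what the new `CRC_WORD(crc, data)` macro boils down to on AArch64:

```c
#include <stdint.h>
#include <arm_acle.h>

// Hypothetical helper for illustration: feed 64-bit words to the CRC32
// instruction, byte-swapping first on big-endian targets (the _LE64 path).
static uint32_t crc32_words(uint32_t crc, const uint64_t* p, long nwords) {
    while (nwords--) {
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
        crc = __crc32d(crc, __builtin_bswap64(*p++)); // CRC_WORD with _LE64 = bswap
#else
        crc = __crc32d(crc, *p++);                    // CRC_WORD with _LE64 = identity
#endif
    }
    return crc;
}
```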
package/src/crc_folding.cc
CHANGED
@@ -19,7 +19,7 @@
 
 #include "crc_common.h"
 
-#if (defined(__PCLMUL__) && defined(__SSSE3__) && defined(__SSE4_1__)) || (defined(_MSC_VER) && _MSC_VER >= 1600 && defined(PLATFORM_X86))
+#if (defined(__PCLMUL__) && defined(__SSSE3__) && defined(__SSE4_1__)) || (defined(_MSC_VER) && _MSC_VER >= 1600 && defined(PLATFORM_X86) && !defined(__clang__))
 #include <inttypes.h>
 #include <immintrin.h>
 #include <wmmintrin.h>
@@ -135,33 +135,6 @@ ALIGN_TO(16, static const unsigned crc_mask[4]) = {
 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
 };
 
-static __m128i reverse_bits_epi8(__m128i src) {
-#if defined(__GFNI__) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
-return _mm_gf2p8affine_epi64_epi8(src, _mm_set_epi32(
-0x80402010, 0x08040201,
-0x80402010, 0x08040201
-), 0);
-#else
-__m128i xmm_t0 = _mm_and_si128(src, _mm_set1_epi8(0x0f));
-__m128i xmm_t1 = _mm_and_si128(_mm_srli_epi16(src, 4), _mm_set1_epi8(0x0f));
-xmm_t0 = _mm_shuffle_epi8(_mm_set_epi8(
--16, 112, -80, 48, -48, 80, -112, 16, -32, 96, -96, 32, -64, 64, -128, 0
-//0xf0, 0x70, 0xb0, 0x30, 0xd0, 0x50, 0x90, 0x10, 0xe0, 0x60, 0xa0, 0x20, 0xc0, 0x40, 0x80, 0
-), xmm_t0);
-xmm_t1 = _mm_shuffle_epi8(_mm_set_epi8(
-15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0
-), xmm_t1);
-return _mm_or_si128(xmm_t0, xmm_t1);
-#endif
-}
-
-#ifdef _MSC_VER
-// because MSVC doesn't use BSWAP unless you specifically tell it to...
-# include <stdlib.h>
-# define BSWAP32 _byteswap_ulong
-#else
-# define BSWAP32(n) ((((n)&0xff)<<24) | (((n)&0xff00)<<8) | (((n)&0xff0000)>>8) | (((n)&0xff000000)>>24))
-#endif
 
 static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
 unsigned long algn_diff;
@@ -170,23 +143,17 @@ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
 // TODO: consider calculating this via a LUT instead (probably faster)
 // info from https://www.reddit.com/r/ReverseEngineering/comments/2zwhl3/mystery_constant_0x9db42487_in_intels_crc32ieee/
 // firstly, calculate: xmm_crc0 = (intial * 0x487b9c8a) mod 0x104c11db7, where 0x487b9c8a = inverse(1<<512) mod 0x104c11db7
+xmm_t0 = _mm_cvtsi32_si128(~initial);
 
-
-uint32_t init_t = BSWAP32(initial);
-xmm_t0 = reverse_bits_epi8(_mm_cvtsi32_si128(~init_t));
-
-xmm_t0 = _mm_clmulepi64_si128(xmm_t0, _mm_cvtsi32_si128(0x487b9c8a), 0);
-xmm_t1 = _mm_and_si128(xmm_t0, _mm_set_epi32(-1,-1,-1,0)); // shifted up by 32bits to avoid shifts by using clmul's capability to select top 64bits instead
+xmm_t0 = _mm_clmulepi64_si128(xmm_t0, _mm_set_epi32(0, 0, 0xa273bc24, 0), 0); // reverse(0x487b9c8a)<<1 == 0xa273bc24
 xmm_t2 = _mm_set_epi32( // polynomial reduction factors
-
-
+1, 0xdb710640, // G* = 0x04c11db7
+0, 0xf7011641 // Q+ = 0x04d101df (+1 to save an additional xor operation)
 );
-xmm_t1 = _mm_clmulepi64_si128(
-xmm_t1 = _mm_clmulepi64_si128(xmm_t1, xmm_t2,
+xmm_t1 = _mm_clmulepi64_si128(xmm_t0, xmm_t2, 0);
+xmm_t1 = _mm_clmulepi64_si128(xmm_t1, xmm_t2, 0x10);
 
-__m128i xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_t1);
-// reverse bits
-xmm_crc0 = _mm_shuffle_epi8(reverse_bits_epi8(xmm_crc0), _mm_set_epi32(-1,-1,-1,0x00010203));
+__m128i xmm_crc0 = _mm_srli_si128(_mm_xor_si128(xmm_t0, xmm_t1), 8);
 
 __m128i xmm_crc1 = _mm_setzero_si128();
 __m128i xmm_crc2 = _mm_setzero_si128();
@@ -196,7 +163,8 @@ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
 if (len < 16) {
 if (len == 0)
 return initial;
-xmm_crc_part =
+xmm_crc_part = _mm_setzero_si128();
+memcpy(&xmm_crc_part, src, len);
 goto partial;
 }
 
@@ -211,7 +179,7 @@ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
 &xmm_crc_part);
 }
 
-while (
+while (len >= 64) {
 xmm_t0 = _mm_load_si128((__m128i *)src);
 xmm_t1 = _mm_load_si128((__m128i *)src + 1);
 xmm_t2 = _mm_load_si128((__m128i *)src + 2);
@@ -235,13 +203,11 @@ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
 #endif
 
 src += 64;
+len -= 64;
 }
 
-
-
-*/
-if (len + 16 >= 0) {
-len += 16;
+if (len >= 48) {
+len -= 48;
 
 xmm_t0 = _mm_load_si128((__m128i *)src);
 xmm_t1 = _mm_load_si128((__m128i *)src + 1);
@@ -266,8 +232,8 @@ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
 goto done;
 
 xmm_crc_part = _mm_load_si128((__m128i *)src + 3);
-} else if (len
-len
+} else if (len >= 32) {
+len -= 32;
 
 xmm_t0 = _mm_load_si128((__m128i *)src);
 xmm_t1 = _mm_load_si128((__m128i *)src + 1);
@@ -290,8 +256,8 @@ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
 goto done;
 
 xmm_crc_part = _mm_load_si128((__m128i *)src + 2);
-} else if (len
-len
+} else if (len >= 16) {
+len -= 16;
 
 xmm_t0 = _mm_load_si128((__m128i *)src);
 
@@ -310,7 +276,6 @@ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
 
 xmm_crc_part = _mm_load_si128((__m128i *)src + 1);
 } else {
-len += 64;
 if (len == 0)
 goto done;
 xmm_crc_part = _mm_load_si128((__m128i *)src);
package/src/crc_folding_256.cc
ADDED
@@ -0,0 +1,229 @@
+// 256-bit version of crc_folding
+
+#include "crc_common.h"
+
+#if !defined(YENC_DISABLE_AVX256) && ((defined(__VPCLMULQDQ__) && defined(__AVX2__) && defined(__PCLMUL__)) || (defined(_MSC_VER) && _MSC_VER >= 1920 && defined(PLATFORM_X86) && !defined(__clang__)))
+#include <inttypes.h>
+#include <immintrin.h>
+
+
+#if defined(__AVX512VL__) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
+# define ENABLE_AVX512 1
+#endif
+
+static __m256i do_one_fold(__m256i src, __m256i data) {
+const __m256i fold4 = _mm256_set_epi32(
+0x00000001, 0x54442bd4,
+0x00000001, 0xc6e41596,
+0x00000001, 0x54442bd4,
+0x00000001, 0xc6e41596
+);
+#ifdef ENABLE_AVX512
+return _mm256_ternarylogic_epi32(
+_mm256_clmulepi64_epi128(src, fold4, 0x01),
+_mm256_clmulepi64_epi128(src, fold4, 0x10),
+data,
+0x96
+);
+#else
+return _mm256_xor_si256(_mm256_xor_si256(
+data, _mm256_clmulepi64_epi128(src, fold4, 0x01)
+), _mm256_clmulepi64_epi128(src, fold4, 0x10));
+#endif
+}
+
+ALIGN_TO(32, static const uint8_t pshufb_rot_table[]) = {
+0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
+16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
+};
+// _mm256_castsi128_si256, but upper is defined to be 0
+#if (defined(__clang__) && __clang_major__ >= 5 && (!defined(__APPLE__) || __clang_major__ >= 7)) || (defined(__GNUC__) && __GNUC__ >= 10) || (defined(_MSC_VER) && _MSC_VER >= 1910)
+// intrinsic unsupported in GCC 9 and MSVC < 2017
+# define zext128_256 _mm256_zextsi128_si256
+#else
+// technically a cast is incorrect, due to upper 128 bits being undefined, but should usually work fine
+// alternative may be `_mm256_set_m128i(_mm_setzero_si128(), v)` but unsupported on GCC < 7, and most compilers generate a VINSERTF128 instruction for it
+# ifdef __OPTIMIZE__
+# define zext128_256 _mm256_castsi128_si256
+# else
+# define zext128_256(x) _mm256_inserti128_si256(_mm256_setzero_si256(), x, 0)
+# endif
+#endif
+
+#ifdef ENABLE_AVX512
+# define MM256_BLENDV(a, b, m) _mm256_ternarylogic_epi32(a, b, m, 0xd8)
+# define MM_2XOR(a, b, c) _mm_ternarylogic_epi32(a, b, c, 0x96)
+#else
+# define MM256_BLENDV _mm256_blendv_epi8
+# define MM_2XOR(a, b, c) _mm_xor_si128(_mm_xor_si128(a, b), c)
+#endif
+
+static void partial_fold(const size_t len, __m256i *crc0, __m256i *crc1, __m256i crc_part) {
+__m256i shuf = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(pshufb_rot_table + (len&15))));
+__m256i mask = _mm256_cmpgt_epi8(shuf, _mm256_set1_epi8(15));
+
+*crc0 = _mm256_shuffle_epi8(*crc0, shuf);
+*crc1 = _mm256_shuffle_epi8(*crc1, shuf);
+crc_part = _mm256_shuffle_epi8(crc_part, shuf);
+
+__m256i crc_out = _mm256_permute2x128_si256(*crc0, *crc0, 0x08); // move bottom->top
+__m256i crc01, crc1p;
+if(len >= 16) {
+crc_out = MM256_BLENDV(crc_out, *crc0, mask);
+crc01 = *crc1;
+crc1p = crc_part;
+*crc0 = _mm256_permute2x128_si256(*crc0, *crc1, 0x21);
+*crc1 = _mm256_permute2x128_si256(*crc1, crc_part, 0x21);
+crc_part = zext128_256(_mm256_extracti128_si256(crc_part, 1));
+} else {
+crc_out = _mm256_and_si256(crc_out, mask);
+crc01 = _mm256_permute2x128_si256(*crc0, *crc1, 0x21);
+crc1p = _mm256_permute2x128_si256(*crc1, crc_part, 0x21);
+}
+
+*crc0 = MM256_BLENDV(*crc0, crc01, mask);
+*crc1 = MM256_BLENDV(*crc1, crc1p, mask);
+
+*crc1 = do_one_fold(crc_out, *crc1);
+}
+
+
+ALIGN_TO(16, static const unsigned crc_k[]) = {
+0xccaa009e, 0x00000000, /* rk1 */
+0x751997d0, 0x00000001, /* rk2 */
+0xccaa009e, 0x00000000, /* rk5 */
+0x63cd6124, 0x00000001, /* rk6 */
+0xf7011641, 0x00000000, /* rk7 */
+0xdb710640, 0x00000001 /* rk8 */
+};
+
+
+static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
+// info from https://www.reddit.com/r/ReverseEngineering/comments/2zwhl3/mystery_constant_0x9db42487_in_intels_crc32ieee/
+// firstly, calculate: xmm_crc0 = (intial * 0x487b9c8a) mod 0x104c11db7, where 0x487b9c8a = inverse(1<<512) mod 0x104c11db7
+__m128i xmm_t0 = _mm_cvtsi32_si128(~initial);
+
+xmm_t0 = _mm_clmulepi64_si128(xmm_t0, _mm_set_epi32(0, 0, 0xa273bc24, 0), 0); // reverse(0x487b9c8a)<<1 == 0xa273bc24
+__m128i reduction = _mm_set_epi32( // polynomial reduction factors
+1, 0xdb710640, // G* = 0x04c11db7
+0, 0xf7011641 // Q+ = 0x04d101df (+1 to save an additional xor operation)
+);
+__m128i xmm_t1 = _mm_clmulepi64_si128(xmm_t0, reduction, 0);
+xmm_t1 = _mm_clmulepi64_si128(xmm_t1, reduction, 0x10);
+
+xmm_t0 = _mm_srli_si128(_mm_xor_si128(xmm_t0, xmm_t1), 8);
+__m256i crc0 = zext128_256(xmm_t0);
+__m256i crc1 = _mm256_setzero_si256();
+
+if (len < 32) {
+if (len == 0)
+return initial;
+__m256i crc_part = _mm256_setzero_si256();
+memcpy(&crc_part, src, len);
+partial_fold(len, &crc0, &crc1, crc_part);
+} else {
+uintptr_t algn_diff = (0 - (uintptr_t)src) & 0x1F;
+if (algn_diff) {
+partial_fold(algn_diff, &crc0, &crc1, _mm256_loadu_si256((__m256i *)src));
+src += algn_diff;
+len -= algn_diff;
+}
+
+while (len >= 64) {
+crc0 = do_one_fold(crc0, _mm256_load_si256((__m256i*)src));
+crc1 = do_one_fold(crc1, _mm256_load_si256((__m256i*)src + 1));
+src += 64;
+len -= 64;
+}
+
+if (len >= 32) {
+__m256i old = crc1;
+crc1 = do_one_fold(crc0, _mm256_load_si256((__m256i*)src));
+crc0 = old;
+
+len -= 32;
+src += 32;
+}
+
+if(len != 0) {
+partial_fold(len, &crc0, &crc1, _mm256_load_si256((__m256i *)src));
+}
+}
+
+const __m128i xmm_mask = _mm_set_epi32(-1,-1,-1,0);
+__m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;
+
+__m128i xmm_crc0 = _mm256_castsi256_si128(crc0);
+__m128i xmm_crc1 = _mm256_extracti128_si256(crc0, 1);
+__m128i xmm_crc2 = _mm256_castsi256_si128(crc1);
+__m128i xmm_crc3 = _mm256_extracti128_si256(crc1, 1);
+
+/*
+* k1
+*/
+crc_fold = _mm_load_si128((__m128i *)crc_k);
+
+x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
+xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
+xmm_crc1 = MM_2XOR(xmm_crc1, x_tmp0, xmm_crc0);
+
+x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10);
+xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01);
+xmm_crc2 = MM_2XOR(xmm_crc2, x_tmp1, xmm_crc1);
+
+x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10);
+xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01);
+xmm_crc3 = MM_2XOR(xmm_crc3, x_tmp2, xmm_crc2);
+
+/*
+* k5
+*/
+crc_fold = _mm_load_si128((__m128i *)crc_k + 1);
+
+xmm_crc0 = xmm_crc3;
+xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
+xmm_crc0 = _mm_srli_si128(xmm_crc0, 8);
+xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
+
+xmm_crc0 = xmm_crc3;
+xmm_crc3 = _mm_slli_si128(xmm_crc3, 4);
+xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
+#ifdef ENABLE_AVX512
+//xmm_crc3 = _mm_maskz_xor_epi32(14, xmm_crc3, xmm_crc0);
+xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, xmm_crc0, xmm_mask, 0x28);
+#else
+xmm_crc0 = _mm_and_si128(xmm_crc0, xmm_mask);
+xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
+#endif
+
+/*
+* k7
+*/
+xmm_crc1 = xmm_crc3;
+crc_fold = _mm_load_si128((__m128i *)crc_k + 2);
+
+xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
+xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
+#ifdef ENABLE_AVX512
+xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, xmm_crc1, xmm_crc1, 0xC3); // NOT(xmm_crc3 ^ xmm_crc1)
+#else
+xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_mask);
+xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);
+#endif
+return _mm_extract_epi32(xmm_crc3, 2);
+}
+
+static uint32_t do_crc32_incremental_clmul(const void* data, size_t length, uint32_t init) {
+return crc_fold((const unsigned char*)data, (long)length, init);
+}
+
+void crc_clmul256_set_funcs(crc_func* _do_crc32_incremental) {
+*_do_crc32_incremental = &do_crc32_incremental_clmul;
+}
+#else
+void crc_clmul_set_funcs(crc_func* _do_crc32_incremental);
+void crc_clmul256_set_funcs(crc_func* _do_crc32_incremental) {
+crc_clmul_set_funcs(_do_crc32_incremental);
+}
+#endif
+
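Two details of the reworked folding code are worth calling out. First, short tails are now handled with a "zero the register, then memcpy only `len` bytes" pattern (both the 128-bit crc_folding.cc and the new 256-bit variant do this), which avoids reading past the end of the input buffer. A standalone sketch of that pattern, not code from the package:

```c
#include <string.h>
#include <emmintrin.h>

// Assumes len < 16; the upper bytes stay zero, which the fold step relies on.
static __m128i load_tail_128(const unsigned char* src, size_t len) {
    __m128i v = _mm_setzero_si128();
    memcpy(&v, src, len); // lowered to a short copy; no out-of-bounds read
    return v;
}
```

Second, the AVX-512 path of `do_one_fold`/`MM_2XOR` collapses the two XORs of a fold step into a single `vpternlog` with immediate 0x96. A tiny scalar check (again purely illustrative) of why 0x96 is exactly the truth table of a three-way XOR:

```c
#include <stdio.h>
#include <stdint.h>

// vpternlog indexes its 8-bit immediate with (a<<2)|(b<<1)|c per bit position.
static int ternlog_bit(uint8_t imm, int a, int b, int c) {
    return (imm >> ((a << 2) | (b << 1) | c)) & 1;
}

int main(void) {
    for (int a = 0; a < 2; a++)
        for (int b = 0; b < 2; b++)
            for (int c = 0; c < 2; c++)
                printf("%d%d%d: ternlog(0x96)=%d  a^b^c=%d\n",
                       a, b, c, ternlog_bit(0x96, a, b, c), a ^ b ^ c);
    return 0; // the two columns always agree
}
```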
package/src/decoder.cc
CHANGED
@@ -4,15 +4,17 @@
 #include "decoder.h"
 
 extern "C" {
-YencDecoderEnd (*_do_decode)(const unsigned char
-YencDecoderEnd (*_do_decode_raw)(const unsigned char
-YencDecoderEnd (*_do_decode_end_raw)(const unsigned char
+YencDecoderEnd (*_do_decode)(const unsigned char**, unsigned char**, size_t, YencDecoderState*) = &do_decode_scalar<false, false>;
+YencDecoderEnd (*_do_decode_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*) = &do_decode_scalar<true, false>;
+YencDecoderEnd (*_do_decode_end_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*) = &do_decode_end_scalar<true>;
 }
 
 void decoder_set_sse2_funcs();
 void decoder_set_ssse3_funcs();
 void decoder_set_avx_funcs();
 void decoder_set_avx2_funcs();
+void decoder_set_vbmi2_funcs();
+extern const bool decoder_has_avx10;
 void decoder_set_neon_funcs();
 
 
@@ -44,7 +46,9 @@ void decoder_init() {
 decoder_set_native_funcs();
 # else
 int use_isa = cpu_supports_isa();
-if(use_isa >=
+if(use_isa >= ISA_LEVEL_VBMI2 && (decoder_has_avx10 || (use_isa & ISA_FEATURE_EVEX512)))
+decoder_set_vbmi2_funcs();
+else if(use_isa >= ISA_LEVEL_AVX2)
 decoder_set_avx2_funcs();
 else if(use_isa >= ISA_LEVEL_AVX)
 decoder_set_avx_funcs();
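decoder_init() gains one more tier in its runtime dispatch: the VBMI2 kernels are only selected when the CPU either exposes AVX10 (per `decoder_has_avx10`) or permits 512-bit EVEX encodings. The sketch below mirrors only the shape of that selection; the enum values, flag handling and function names are illustrative stand-ins, not yencode's actual definitions:

```c
#include <stdbool.h>

enum isa_level { ISA_AVX = 1, ISA_AVX2 = 2, ISA_VBMI2 = 3 }; // stand-in ordering only

typedef void (*decoder_setup)(void);
static void set_avx_kernels(void)   {}
static void set_avx2_kernels(void)  {}
static void set_vbmi2_kernels(void) {}

// Pick the widest usable decoder, preferring VBMI2 only when wide EVEX/AVX10 is usable.
static decoder_setup pick_decoder(enum isa_level level, bool has_avx10, bool has_evex512) {
    if (level >= ISA_VBMI2 && (has_avx10 || has_evex512))
        return set_vbmi2_kernels;
    if (level >= ISA_AVX2)
        return set_avx2_kernels;
    return set_avx_kernels;
}
```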
package/src/decoder.h
CHANGED
@@ -29,17 +29,17 @@ typedef enum {
 
 #include "hedley.h"
 
-extern YencDecoderEnd (*_do_decode)(const unsigned char
-extern YencDecoderEnd (*_do_decode_raw)(const unsigned char
-extern YencDecoderEnd (*_do_decode_end_raw)(const unsigned char
+extern YencDecoderEnd (*_do_decode)(const unsigned char**, unsigned char**, size_t, YencDecoderState*);
+extern YencDecoderEnd (*_do_decode_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*);
+extern YencDecoderEnd (*_do_decode_end_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*);
 
-static inline size_t do_decode(int isRaw, const unsigned char*
+static inline size_t do_decode(int isRaw, const unsigned char* src, unsigned char* dest, size_t len, YencDecoderState* state) {
 unsigned char* ds = dest;
 (*(isRaw ? _do_decode_raw : _do_decode))(&src, &ds, len, state);
 return ds - dest;
 }
 
-static inline YencDecoderEnd do_decode_end(const unsigned char
+static inline YencDecoderEnd do_decode_end(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
 return _do_decode_end_raw(src, dest, len, state);
 }
 
package/src/decoder_avx2_base.h
CHANGED
@@ -1,8 +1,8 @@
 
 #ifdef __AVX2__
 
-// GCC (ver 6-10(dev)) fails to optimize pure C version of mask testing, but has this intrinsic; Clang >= 7 optimizes C version fine
-#if defined(__GNUC__) && __GNUC__ >= 7
+// GCC (ver 6-10(dev)) fails to optimize pure C version of mask testing, but has this intrinsic; Clang >= 7 optimizes C version fine; functions added in Clang 8
+#if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
 # define KORTEST32(a, b) !_kortestz_mask32_u8((a), (b))
 # define KAND32(a, b) _kand_mask32((a), (b))
 # define KOR32(a, b) _kor_mask32((a), (b))
@@ -30,7 +30,7 @@ static HEDLEY_ALWAYS_INLINE __m256i force_align_read_256(const void* p) {
 }
 
 // _mm256_castsi128_si256, but upper is defined to be 0
-#if (defined(__clang__) && __clang_major__ >= 5 && (!defined(__APPLE__) || __clang_major__ >= 7)) || (defined(__GNUC__) && __GNUC__ >= 10)
+#if (defined(__clang__) && __clang_major__ >= 5 && (!defined(__APPLE__) || __clang_major__ >= 7)) || (defined(__GNUC__) && __GNUC__ >= 10) || (defined(_MSC_VER) && _MSC_VER >= 1910)
 // intrinsic unsupported in GCC 9 and MSVC < 2017
 # define zext128_256 _mm256_zextsi128_si256
 #else
@@ -43,9 +43,15 @@ static HEDLEY_ALWAYS_INLINE __m256i force_align_read_256(const void* p) {
 # endif
 #endif
 
+#if defined(__tune_icelake_client__) || defined(__tune_icelake_server__) || defined(__tune_tigerlake__) || defined(__tune_rocketlake__) || defined(__tune_alderlake__) || defined(__tune_sapphirerapids__)
+# define COMPRESS_STORE _mm256_mask_compressstoreu_epi8
+#else
+// avoid uCode on Zen4
+# define COMPRESS_STORE(dst, mask, vec) _mm256_storeu_si256((__m256i*)(dst), _mm256_maskz_compress_epi8(mask, vec))
+#endif
 
 template<bool isRaw, bool searchEnd, enum YEncDecIsaLevel use_isa>
-HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t*
+HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* src, long& len, unsigned char*& p, unsigned char& _escFirst, uint16_t& _nextMask) {
 HEDLEY_ASSUME(_escFirst == 0 || _escFirst == 1);
 HEDLEY_ASSUME(_nextMask == 0 || _nextMask == 1 || _nextMask == 2);
 uintptr_t escFirst = _escFirst;
@@ -60,6 +66,17 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 '.','.','.','.','.','.','.','.','.','.','.','.','.','.',_nextMask==2?0:'.',_nextMask==1?0:'.'
 );
 }
+
+// for some reason, MSVC Win32 seems to crash when trying to compile _mm256_mask_cmpeq_epi8_mask
+// the crash can be fixed by switching the order of the last two arguments, but it seems to generate wrong code
+// so just disable the optimisation as it seems to be problematic there
+#if defined(__AVX512VL__) && defined(__AVX512BW__)
+# if defined(_MSC_VER) && !defined(PLATFORM_AMD64) && !defined(__clang__)
+const bool useAVX3MaskCmp = false;
+# else
+const bool useAVX3MaskCmp = (use_isa >= ISA_LEVEL_AVX3);
+# endif
+#endif
 intptr_t i;
 for(i = -len; i; i += sizeof(__m256i)*2) {
 __m256i oDataA = _mm256_load_si256((__m256i *)(src+i));
@@ -126,7 +143,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 __mmask32 match2EqMaskA, match2EqMaskB;
 __mmask32 match0CrMaskA, match0CrMaskB;
 __mmask32 match2CrXDtMaskA, match2CrXDtMaskB;
-if(
+if(useAVX3MaskCmp && searchEnd) {
 match2EqMaskA = _mm256_cmpeq_epi8_mask(_mm256_set1_epi8('='), tmpData2A);
 match2EqMaskB = _mm256_cmpeq_epi8_mask(_mm256_set1_epi8('='), tmpData2B);
 } else
@@ -142,7 +159,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 // find patterns of \r_.
 
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-if(
+if(useAVX3MaskCmp) {
 match0CrMaskA = _mm256_cmpeq_epi8_mask(oDataA, _mm256_set1_epi8('\r'));
 match0CrMaskB = _mm256_cmpeq_epi8_mask(oDataB, _mm256_set1_epi8('\r'));
 match2CrXDtMaskA = _mm256_mask_cmpeq_epi8_mask(match0CrMaskA, tmpData2A, _mm256_set1_epi8('.'));
@@ -172,7 +189,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
 __mmask32 match1NlMaskA, match1NlMaskB;
 __mmask32 match2NlDotMaskA, match2NlDotMaskB;
-if(
+if(useAVX3MaskCmp) {
 match1NlMaskA = _mm256_mask_cmpeq_epi8_mask(
 match0CrMaskA,
 _mm256_set1_epi8('\n'),
@@ -228,7 +245,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 
 int matchEnd;
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-if(
+if(useAVX3MaskCmp) {
 __mmask32 match3EqYMaskA = _mm256_mask_cmpeq_epi8_mask(
 match2EqMaskA,
 _mm256_set1_epi8('y'),
@@ -307,7 +324,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 }
 }
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-if(
+if(useAVX3MaskCmp) {
 mask |= (uint64_t)match2NlDotMaskA << 2;
 mask |= (uint64_t)match2NlDotMaskB << 34;
 minMask = _mm256_maskz_mov_epi8(~(match2NlDotMaskB>>30), _mm256_set1_epi8('.'));
@@ -325,7 +342,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 __m256i match3EqYA, match3EqYB;
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
 __mmask32 match3EqYMaskA, match3EqYMaskB;
-if(
+if(useAVX3MaskCmp) {
 match3EqYMaskA = _mm256_mask_cmpeq_epi8_mask(
 match2EqMaskA,
 _mm256_set1_epi8('y'),
@@ -355,7 +372,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 if(LIKELIHOOD(0.002, partialEndFound)) {
 bool endFound;
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-if(
+if(useAVX3MaskCmp) {
 __mmask32 match3LfEqYMaskA = _mm256_mask_cmpeq_epi8_mask(
 match3EqYMaskA,
 _mm256_set1_epi8('\n'),
@@ -530,9 +547,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 // all that's left is to 'compress' the data (skip over masked chars)
 #if defined(__AVX512VBMI2__) && defined(__AVX512VL__)
 if(use_isa >= ISA_LEVEL_VBMI2) {
-
+COMPRESS_STORE(p, KNOT32(mask), dataA);
 p -= popcnt32(mask & 0xffffffff);
-
+COMPRESS_STORE((p + XMM_SIZE*2), KNOT32(mask>>32), dataB);
 p += XMM_SIZE*4 - popcnt32(mask >> 32);
 } else
 #endif
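The new `COMPRESS_STORE` macro picks between two equivalent ways of writing out the compacted bytes: a direct masked compress-store on CPUs tuned for it, or compress-into-register followed by a plain 32-byte store, which sidesteps the microcoded compressed store on Zen 4. A standalone sketch of the two forms (assumes AVX-512 VBMI2+VL, e.g. built with `-mavx512vbmi2 -mavx512vl`; note the second form always writes a full 32 bytes, so the destination buffer needs that much slack):

```c
#include <immintrin.h>

// Direct form: hardware masked compress-store.
static void compress_store_direct(void* dst, __mmask32 keep, __m256i v) {
    _mm256_mask_compressstoreu_epi8(dst, keep, v);
}

// Workaround form: compress in a register, then an ordinary unmasked store.
static void compress_store_via_reg(void* dst, __mmask32 keep, __m256i v) {
    _mm256_storeu_si256((__m256i*)dst, _mm256_maskz_compress_epi8(keep, v));
}
```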