yencode 1.1.2 → 1.1.4
This diff compares the contents of publicly released package versions as they appear in their public registries. It is provided for informational purposes only.
- package/README.md +3 -2
- package/binding.gyp +141 -6
- package/index.js +21 -19
- package/package.json +2 -1
- package/src/common.h +34 -19
- package/src/crc.cc +138 -11
- package/src/crc_arm.cc +42 -7
- package/src/crc_folding.cc +18 -53
- package/src/crc_folding_256.cc +229 -0
- package/src/decoder.cc +8 -4
- package/src/decoder.h +5 -5
- package/src/decoder_avx2_base.h +30 -13
- package/src/decoder_common.h +5 -5
- package/src/decoder_neon.cc +4 -4
- package/src/decoder_neon64.cc +10 -7
- package/src/decoder_sse_base.h +26 -12
- package/src/decoder_vbmi2.cc +37 -0
- package/src/encoder.cc +10 -1
- package/src/encoder_avx_base.h +24 -16
- package/src/encoder_neon.cc +40 -41
- package/src/encoder_rvv.cc +219 -0
- package/src/encoder_sse_base.h +7 -8
- package/src/encoder_vbmi2.cc +30 -0
- package/src/hedley.h +278 -135
- package/src/platform.cc +79 -10
- package/src/test_alignalloc.c +6 -0
- package/test/_speedbase.js +12 -11
- package/test/speeddec.js +6 -5
- package/test/testcrc.js +14 -0
- package/test/testdec.js +30 -14
- package/test/testenc.js +10 -7
- package/test/testpostdec.js +6 -5
package/src/decoder_common.h
CHANGED
@@ -6,7 +6,7 @@
 
 // state var: refers to the previous state - only used for incremental processing
 template<bool isRaw>
-size_t do_decode_noend_scalar(const unsigned char*
+size_t do_decode_noend_scalar(const unsigned char* src, unsigned char* dest, size_t len, YencDecoderState* state) {
	const unsigned char *es = src + len; // end source pointer
	unsigned char *p = dest; // destination pointer
	long i = -(long)len; // input position
@@ -140,7 +140,7 @@ size_t do_decode_noend_scalar(const unsigned char* HEDLEY_RESTRICT src, unsigned
 }
 
 template<bool isRaw>
-YencDecoderEnd do_decode_end_scalar(const unsigned char
+YencDecoderEnd do_decode_end_scalar(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
	const unsigned char *es = (*src) + len; // end source pointer
	unsigned char *p = *dest; // destination pointer
	long i = -(long)len; // input position
@@ -321,7 +321,7 @@ YencDecoderEnd do_decode_end_scalar(const unsigned char* HEDLEY_RESTRICT* src, u
 }
 
 template<bool isRaw, bool searchEnd>
-YencDecoderEnd do_decode_scalar(const unsigned char
+YencDecoderEnd do_decode_scalar(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
	if(searchEnd)
		return do_decode_end_scalar<isRaw>(src, dest, len, state);
	*dest += do_decode_noend_scalar<isRaw>(*src, *dest, len, state);
@@ -331,8 +331,8 @@ YencDecoderEnd do_decode_scalar(const unsigned char* HEDLEY_RESTRICT* src, unsig
 
 
 
-template<bool isRaw, bool searchEnd, int width, void(&kernel)(const uint8_t
-YencDecoderEnd do_decode_simd(const unsigned char
+template<bool isRaw, bool searchEnd, int width, void(&kernel)(const uint8_t*, long&, unsigned char*&, unsigned char&, uint16_t&)>
+YencDecoderEnd do_decode_simd(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
	if(len <= width*2) return do_decode_scalar<isRaw, searchEnd>(src, dest, len, state);
 
	YencDecoderState tState = YDEC_STATE_CRLF;
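
Note on the signatures above: do_decode_simd binds the SIMD kernel as a function-reference template parameter and falls back to the scalar decoder for short inputs. A minimal, self-contained sketch of that technique; the simplified kernel signature and all names here are illustrative, not yencode's actual API:

    #include <cstdint>
    #include <cstddef>

    // Illustrative kernel signature; yencode's real kernels also take decoder state by reference
    using Kernel = void(const uint8_t* src, size_t len, uint8_t* dest);

    void kernel_scalar(const uint8_t* src, size_t len, uint8_t* dest) {
        for(size_t i = 0; i < len; i++) dest[i] = src[i] - 42; // stand-in for the real yEnc transform
    }
    void kernel_wide(const uint8_t* src, size_t len, uint8_t* dest) {
        kernel_scalar(src, len, dest); // imagine a vectorised, width-bytes-per-iteration body here
    }

    // Binding the kernel as a reference template parameter (as do_decode_simd does) means each
    // instantiation inlines its own SIMD body; no function-pointer call inside the hot loop.
    template<int width, Kernel& kernel>
    void decode_dispatch(const uint8_t* src, size_t len, uint8_t* dest) {
        if(len <= width*2) { kernel_scalar(src, len, dest); return; } // short input: scalar fallback
        kernel(src, len, dest);
    }

    // Mirrors how the _do_decode function pointers are assigned to template instantiations:
    auto decode16 = &decode_dispatch<16, kernel_wide>;
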
package/src/decoder_neon.cc
CHANGED
@@ -19,14 +19,14 @@
 #endif
 
 
-// for compilers that lack these functions
-#if defined(__clang__) || (defined(__GNUC__) && (defined(__aarch64__) && __GNUC__ >= 8))
+// for compilers that lack these functions (Clang armv7 9-12 seems to have issues with multi-vector loads)
+#if (defined(__clang__) && (defined(__aarch64__) || __clang_major__<9 || __clang_major__>12)) || (defined(__GNUC__) && (defined(__aarch64__) && __GNUC__ >= 8))
 # define vld1q_u8_x2_align(p, n) vld1q_u8_x2((uint8_t*)__builtin_assume_aligned(p, n))
 #else
 # define vld1q_u8_x2_align(p, n) vcreate2_u8(vld1q_u8_align(p, (n)/2), vld1q_u8_align((p)+16, (n)/2))
 #endif
 // Clang wrongly assumes alignment on vld1q_u8_x2, and ARMv7 GCC doesn't support the function, so effectively, it can only be used in ARMv8 compilers
-#if defined(__aarch64__) && (defined(__clang__) || (
+#if defined(__aarch64__) && (defined(__clang__) || HEDLEY_GCC_VERSION_CHECK(8,5,0))
 # define vst1q_u8_x2_unaligned vst1q_u8_x2
 #else
 static HEDLEY_ALWAYS_INLINE void vst1q_u8_x2_unaligned(uint8_t* p, uint8x16x2_t data) {
@@ -59,7 +59,7 @@ static bool neon_vect_is_nonzero(uint8x16_t v) {
 
 
 template<bool isRaw, bool searchEnd>
-HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t*
+HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned char*& p, unsigned char& escFirst, uint16_t& nextMask) {
	HEDLEY_ASSUME(escFirst == 0 || escFirst == 1);
	HEDLEY_ASSUME(nextMask == 0 || nextMask == 1 || nextMask == 2);
	uint8x16_t yencOffset = escFirst ? vmakeq_u8(42+64,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42) : vdupq_n_u8(42);
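
On the vst1q_u8_x2 comment above: where the intrinsic can't be used, the usual fallback is two single-vector stores, which carry no alignment assumption. The actual function body is outside the hunk shown, so this is only a sketch of what such a fallback typically looks like:

    #include <arm_neon.h>

    // Store two 16-byte vectors without any alignment assumption (sketch, not yencode's exact body)
    static inline void vst1q_u8_x2_unaligned_sketch(uint8_t* p, uint8x16x2_t data) {
        vst1q_u8(p,      data.val[0]);
        vst1q_u8(p + 16, data.val[1]);
    }
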
package/src/decoder_neon64.cc
CHANGED
@@ -10,9 +10,9 @@ static struct { char bytes[16]; } ALIGN_TO(16, compactLUT[32768]);
 static uint8_t eqFixLUT[256];
 
 
-
-#if !defined(__clang__) && !defined(_MSC_VER) && (!defined(__aarch64__) || !HEDLEY_GCC_VERSION_CHECK(
-static HEDLEY_ALWAYS_INLINE uint8x16x4_t
+// AArch64 GCC lacks these functions until 8.5, 9.4 and 10.1 (10.0 unknown)
+#if !defined(__clang__) && !defined(_MSC_VER) && (!defined(__aarch64__) || !(HEDLEY_GCC_VERSION_CHECK(9,4,0) || (!HEDLEY_GCC_VERSION_CHECK(9,0,0) && HEDLEY_GCC_VERSION_CHECK(8,5,0))))
+static HEDLEY_ALWAYS_INLINE uint8x16x4_t _vld1q_u8_x4(const uint8_t* p) {
	uint8x16x4_t ret;
	ret.val[0] = vld1q_u8(p);
	ret.val[1] = vld1q_u8(p+16);
@@ -20,12 +20,15 @@ static HEDLEY_ALWAYS_INLINE uint8x16x4_t vld1q_u8_x4(const uint8_t* p) {
	ret.val[3] = vld1q_u8(p+48);
	return ret;
 }
-static HEDLEY_ALWAYS_INLINE void
+static HEDLEY_ALWAYS_INLINE void _vst1q_u8_x4(uint8_t* p, uint8x16x4_t data) {
	vst1q_u8(p, data.val[0]);
	vst1q_u8(p+16, data.val[1]);
	vst1q_u8(p+32, data.val[2]);
	vst1q_u8(p+48, data.val[3]);
 }
+#else
+# define _vld1q_u8_x4 vld1q_u8_x4
+# define _vst1q_u8_x4 vst1q_u8_x4
 #endif
 
 
@@ -44,7 +47,7 @@ static HEDLEY_ALWAYS_INLINE uint8x16_t mergeCompares(uint8x16_t a, uint8x16_t b,
 
 
 template<bool isRaw, bool searchEnd>
-HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t*
+HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned char*& p, unsigned char& escFirst, uint16_t& nextMask) {
	HEDLEY_ASSUME(escFirst == 0 || escFirst == 1);
	HEDLEY_ASSUME(nextMask == 0 || nextMask == 1 || nextMask == 2);
	uint8x16_t nextMaskMix = vdupq_n_u8(0);
@@ -55,7 +58,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
	uint8x16_t yencOffset = escFirst ? vmakeq_u8(42+64,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42) : vdupq_n_u8(42);
	long i;
	for(i = -len; i; i += sizeof(uint8x16_t)*4) {
-		uint8x16x4_t data =
+		uint8x16x4_t data = _vld1q_u8_x4(src+i);
		uint8x16_t dataA = data.val[0];
		uint8x16_t dataB = data.val[1];
		uint8x16_t dataC = data.val[2];
@@ -421,7 +424,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
		dataB = vsubq_u8(dataB, vdupq_n_u8(42));
		dataC = vsubq_u8(dataC, vdupq_n_u8(42));
		dataD = vsubq_u8(dataD, vdupq_n_u8(42));
-
+		_vst1q_u8_x4(p, vcreate4_u8(dataA, dataB, dataC, dataD));
		p += sizeof(uint8x16_t)*4;
		escFirst = 0;
		yencOffset = vdupq_n_u8(42);
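
The vcreate2_u8/vcreate4_u8 helpers used above (and in decoder_neon.cc) are yencode's own, not standard NEON intrinsics; judging by how they are used, they simply bundle individual vectors into the multi-vector structs. A plausible sketch, for illustration only:

    #include <arm_neon.h>

    // Assumed shape of the project helpers: pack separate vectors into uint8x16xN_t structs
    static inline uint8x16x2_t vcreate2_u8_sketch(uint8x16_t a, uint8x16_t b) {
        uint8x16x2_t r; r.val[0] = a; r.val[1] = b; return r;
    }
    static inline uint8x16x4_t vcreate4_u8_sketch(uint8x16_t a, uint8x16_t b, uint8x16_t c, uint8x16_t d) {
        uint8x16x4_t r; r.val[0] = a; r.val[1] = b; r.val[2] = c; r.val[3] = d; return r;
    }
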
package/src/decoder_sse_base.h
CHANGED
@@ -7,8 +7,15 @@
 # define _mm_shrdi_epi16 _mm128_shrdi_epi16
 #endif
 
+#if defined(__tune_icelake_client__) || defined(__tune_icelake_server__) || defined(__tune_tigerlake__) || defined(__tune_rocketlake__) || defined(__tune_alderlake__) || defined(__tune_sapphirerapids__)
+# define COMPRESS_STORE _mm_mask_compressstoreu_epi8
+#else
+// avoid uCode on Zen4
+# define COMPRESS_STORE(dst, mask, vec) _mm_storeu_si128((__m128i*)(dst), _mm_maskz_compress_epi8(mask, vec))
+#endif
+
 // GCC (ver 6-10(dev)) fails to optimize pure C version of mask testing, but has this intrinsic; Clang >= 7 optimizes C version fine
-#if defined(__GNUC__) && __GNUC__ >= 7
+#if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
 # define KORTEST16(a, b) !_kortestz_mask16_u8((a), (b))
 # define KAND16(a, b) _kand_mask16((a), (b))
 # define KOR16(a, b) _kor_mask16((a), (b))
@@ -104,7 +111,7 @@ static HEDLEY_ALWAYS_INLINE __m128i sse2_compact_vect(uint32_t mask, __m128i dat
 }
 
 template<bool isRaw, bool searchEnd, enum YEncDecIsaLevel use_isa>
-HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t*
+HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* src, long& len, unsigned char*& p, unsigned char& _escFirst, uint16_t& _nextMask) {
	HEDLEY_ASSUME(_escFirst == 0 || _escFirst == 1);
	HEDLEY_ASSUME(_nextMask == 0 || _nextMask == 1 || _nextMask == 2);
	uintptr_t escFirst = _escFirst;
@@ -112,7 +119,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
		-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42-64
	) : _mm_set1_epi8(-42);
 
-#if defined(__SSSE3__) && !defined(__tune_atom__) && !defined(__tune_slm__) && !defined(__tune_btver1__)
+#if defined(__SSSE3__) && !defined(__tune_atom__) && !defined(__tune_slm__) && !defined(__tune_btver1__) && !defined(__tune_btver2__)
	const bool _USING_FAST_MATCH = (use_isa >= ISA_LEVEL_SSSE3);
 #else
	const bool _USING_FAST_MATCH = false;
@@ -121,6 +128,13 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
	const bool _USING_BLEND_ADD = (use_isa >= ISA_LEVEL_SSE41);
 #else
	const bool _USING_BLEND_ADD = false;
+#endif
+#if defined(__AVX512VL__) && defined(__AVX512BW__)
+# if defined(_MSC_VER) && !defined(PLATFORM_AMD64) && !defined(__clang__)
+	const bool useAVX3MaskCmp = false;
+# else
+	const bool useAVX3MaskCmp = (use_isa >= ISA_LEVEL_AVX3);
+# endif
 #endif
 
	__m128i lfCompare = _mm_set1_epi8('\n');
@@ -214,7 +228,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
		__mmask16 match2EqMaskA, match2EqMaskB;
		__mmask16 match0CrMaskA, match0CrMaskB;
		__mmask16 match2CrXDtMaskA, match2CrXDtMaskB;
-		if(
+		if(useAVX3MaskCmp && searchEnd) {
			match2EqMaskA = _mm_cmpeq_epi8_mask(_mm_set1_epi8('='), tmpData2A);
			match2EqMaskB = _mm_cmpeq_epi8_mask(_mm_set1_epi8('='), tmpData2B);
		} else
@@ -230,7 +244,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
		__m128i match2CrXDtA, match2CrXDtB;
		if(isRaw) {
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-			if(
+			if(useAVX3MaskCmp) {
				match0CrMaskA = _mm_cmpeq_epi8_mask(oDataA, _mm_set1_epi8('\r'));
				match0CrMaskB = _mm_cmpeq_epi8_mask(oDataB, _mm_set1_epi8('\r'));
				match2CrXDtMaskA = _mm_mask_cmpeq_epi8_mask(match0CrMaskA, tmpData2A, _mm_set1_epi8('.'));
@@ -256,7 +270,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
			__mmask16 match1NlMaskA, match1NlMaskB;
			__mmask16 match2NlDotMaskA, match2NlDotMaskB;
-			if(
+			if(useAVX3MaskCmp) {
				match1NlMaskA = _mm_mask_cmpeq_epi8_mask(
					match0CrMaskA,
					_mm_set1_epi8('\n'),
@@ -299,7 +313,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 
		int matchEnd;
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-		if(
+		if(useAVX3MaskCmp) {
			__mmask16 match3EqYMaskA = _mm_mask_cmpeq_epi8_mask(
				match2EqMaskA, _mm_set1_epi8('y'), tmpData3A
			);
@@ -373,7 +387,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
			}
		}
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-		if(
+		if(useAVX3MaskCmp) {
			mask |= match2NlDotMaskA << 2;
			mask |= (match2NlDotMaskB << 18) & 0xffffffff;
			minMask = _mm_maskz_mov_epi8(~(match2NlDotMaskB>>14), _mm_set1_epi8('.'));
@@ -398,7 +412,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
		__m128i match3EqYA, match3EqYB;
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
		__mmask16 match3EqYMaskA, match3EqYMaskB;
-		if(
+		if(useAVX3MaskCmp) {
			match3EqYMaskA = _mm_mask_cmpeq_epi8_mask(
				match2EqMaskA,
				_mm_set1_epi8('y'),
@@ -434,7 +448,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
		bool endFound;
 
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-		if(
+		if(useAVX3MaskCmp) {
			__mmask16 match3LfEqYMaskA = _mm_mask_cmpeq_epi8_mask(
				match3EqYMaskA,
				_mm_set1_epi8('\n'),
@@ -642,9 +656,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
	if(use_isa >= ISA_LEVEL_SSSE3) {
 # if defined(__AVX512VBMI2__) && defined(__AVX512VL__) && defined(__POPCNT__)
		if(use_isa >= ISA_LEVEL_VBMI2) {
-
+			COMPRESS_STORE(p, KNOT16(mask), dataA);
			p -= popcnt32(mask & 0xffff);
-
+			COMPRESS_STORE(p+XMM_SIZE, KNOT16(mask>>16), dataB);
			p -= popcnt32(mask>>16);
			p += XMM_SIZE*2;
		} else
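
The new COMPRESS_STORE macro is how the decoder strips flagged bytes in the last hunk: the inverted mask selects the bytes to keep, compress packs them to the front of the vector, and the pointer is then adjusted by the popcount of dropped bytes. The two macro variants are equivalent; the compress-to-register plus plain store form avoids the microcoded masked store on Zen 4, while Intel's Ice Lake-and-later tunings use the masked compress-store intrinsic directly. A minimal sketch of the idea, assuming AVX-512VL and VBMI2 are enabled at compile time (illustrative names, not yencode's API):

    #include <immintrin.h>
    #include <cstdint>

    // Remove the bytes flagged in 'drop' from a 16-byte block; returns the number of bytes written.
    // Mirrors the diff's Zen 4-friendly pattern: compress into a register, then do a full 16-byte store.
    static int drop_bytes_sse(uint8_t* dst, __m128i data, __mmask16 drop) {
        __m128i packed = _mm_maskz_compress_epi8((__mmask16)~drop, data); // kept bytes packed to the front
        _mm_storeu_si128((__m128i*)dst, packed); // always store 16 bytes; only the kept prefix is meaningful
        return 16 - _mm_popcnt_u32(drop);        // bytes actually produced
    }
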
package/src/decoder_vbmi2.cc
ADDED
@@ -0,0 +1,37 @@
+#include "common.h"
+
+extern const bool decoder_has_avx10;
+#if !defined(__EVEX512__) && (defined(__AVX10_1__) || defined(__EVEX256__)) && defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
+const bool decoder_has_avx10 = true;
+#else
+const bool decoder_has_avx10 = false;
+#endif
+
+#if defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
+# include "decoder_common.h"
+# ifndef YENC_DISABLE_AVX256
+# include "decoder_avx2_base.h"
+void decoder_set_vbmi2_funcs() {
+	ALIGN_ALLOC(lookups, sizeof(*lookups), 16);
+	// TODO: consider removing compact LUT
+	decoder_init_lut(lookups->eqFix, lookups->compact);
+	_do_decode = &do_decode_simd<false, false, sizeof(__m256i)*2, do_decode_avx2<false, false, ISA_LEVEL_VBMI2> >;
+	_do_decode_raw = &do_decode_simd<true, false, sizeof(__m256i)*2, do_decode_avx2<true, false, ISA_LEVEL_VBMI2> >;
+	_do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m256i)*2, do_decode_avx2<true, true, ISA_LEVEL_VBMI2> >;
+}
+# else
+# include "decoder_sse_base.h"
+void decoder_set_vbmi2_funcs() {
+	decoder_sse_init();
+	decoder_init_lut(lookups->eqFix, lookups->compact);
+	_do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_VBMI2> >;
+	_do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_VBMI2> >;
+	_do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_VBMI2> >;
+}
+# endif
+#else
+void decoder_set_avx2_funcs();
+void decoder_set_vbmi2_funcs() {
+	decoder_set_avx2_funcs();
+}
+#endif
package/src/encoder.cc
CHANGED
@@ -128,7 +128,10 @@ void encoder_sse2_init();
 void encoder_ssse3_init();
 void encoder_avx_init();
 void encoder_avx2_init();
+void encoder_vbmi2_init();
+extern const bool encoder_has_avx10;
 void encoder_neon_init();
+void encoder_rvv_init();
 
 #if defined(PLATFORM_X86) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
 # if defined(__AVX2__) && !defined(YENC_DISABLE_AVX256)
@@ -153,7 +156,9 @@ void encoder_init() {
	encoder_native_init();
 # else
	int use_isa = cpu_supports_isa();
-	if(use_isa >=
+	if(use_isa >= ISA_LEVEL_VBMI2 && (encoder_has_avx10 || (use_isa & ISA_FEATURE_EVEX512)))
+		encoder_vbmi2_init();
+	else if(use_isa >= ISA_LEVEL_AVX2)
		encoder_avx2_init();
	else if(use_isa >= ISA_LEVEL_AVX)
		encoder_avx_init();
@@ -167,4 +172,8 @@ void encoder_init() {
	if(cpu_supports_neon())
		encoder_neon_init();
 #endif
+#ifdef __riscv
+	if(cpu_supports_rvv())
+		encoder_rvv_init();
+#endif
 }
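
The new first branch in encoder_init() selects the VBMI2 kernels only when the CPU reports VBMI2-level support and the compiled code can actually run on it: either the binary targets 256-bit EVEX (encoder_has_avx10, presumably defined analogously to decoder_has_avx10 shown earlier) or the CPU also exposes 512-bit EVEX. A self-contained sketch of the same selection rule; the flag values are placeholders purely for illustration, yencode defines its own elsewhere in the source:

    #include <cstdio>

    // Placeholder ISA levels/feature bits for illustration only
    enum { ISA_LEVEL_AVX2 = 0x500, ISA_LEVEL_VBMI2 = 0x601 };
    enum { ISA_FEATURE_EVEX512 = 0x10 };

    // has_avx10 models the compile-time "256-bit EVEX target" flag (like encoder_has_avx10)
    static const char* pick_encoder(int use_isa, bool has_avx10) {
        if(use_isa >= ISA_LEVEL_VBMI2 && (has_avx10 || (use_isa & ISA_FEATURE_EVEX512)))
            return "vbmi2";   // AVX-512 VBMI2 / AVX10 kernels
        if(use_isa >= ISA_LEVEL_AVX2)
            return "avx2";
        return "older paths";
    }

    int main() {
        printf("%s\n", pick_encoder(ISA_LEVEL_VBMI2 | ISA_FEATURE_EVEX512, false)); // CPU with 512-bit EVEX -> vbmi2
        printf("%s\n", pick_encoder(ISA_LEVEL_VBMI2, true));                        // 256-bit EVEX build   -> vbmi2
        printf("%s\n", pick_encoder(ISA_LEVEL_VBMI2, false));                       // neither              -> avx2
    }
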
package/src/encoder_avx_base.h
CHANGED
@@ -6,7 +6,7 @@
 #include "encoder_common.h"
 #define YMM_SIZE 32
 
-#if defined(__GNUC__) && __GNUC__ >= 7
+#if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
 # define KLOAD32(a, offs) _load_mask32((__mmask32*)(a) + (offs))
 #else
 # define KLOAD32(a, offs) (((uint32_t*)(a))[(offs)])
@@ -215,7 +215,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
	// duplicate halves
	data1A = _mm256_inserti128_si256(dataA, _mm256_castsi256_si128(dataA), 1);
	data1B = _mm256_inserti128_si256(dataB, _mm256_castsi256_si128(dataB), 1);
-#if defined(__tune_znver2__) || defined(__tune_znver3__)
+#if defined(__tune_znver2__) || defined(__tune_znver3__) || defined(__tune_znver4__)
	data2A = _mm256_permute2x128_si256(dataA, dataA, 0x11);
	data2B = _mm256_permute2x128_si256(dataB, dataB, 0x11);
 #else
@@ -290,10 +290,10 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
 
 #if defined(__GNUC__) && defined(PLATFORM_AMD64)
	if(use_isa >= ISA_LEVEL_VBMI2) {
-
+		__asm__(
			"shrq $1, %[eqMask] \n"
			"shrq %%cl, %[eqMask] \n"
-			"adcq %[col], %[p] \n"
+			"adcq %q[col], %q[p] \n"
			: [eqMask]"+r"(eqMask), [p]"+r"(p)
			: "c"(shiftAmt), [col]"r"(~col)
		);
@@ -334,28 +334,32 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
	if(use_isa >= ISA_LEVEL_AVX3) {
 # if defined(__AVX512VBMI2__)
		if(use_isa >= ISA_LEVEL_VBMI2) {
-
+			__m128i dataTop = _mm256_extracti128_si256(dataA, 1);
			dataA = _mm256_mask_expand_epi8(_mm256_set1_epi8('='), KNOT32(maskA), dataA);
			_mm256_storeu_si256((__m256i*)p, dataA);
+			p[32] = _mm_extract_epi8(dataTop, 15);
			p += outputBytesA;
 
-
+			dataTop = _mm256_extracti128_si256(dataB, 1);
			dataB = _mm256_mask_expand_epi8(_mm256_set1_epi8('='), KNOT32(maskB), dataB);
			_mm256_storeu_si256((__m256i*)p, dataB);
+			p[32] = _mm_extract_epi8(dataTop, 15);
			p += maskBitsB;
		} else
 # endif
		{
-
-			dataA = _mm256_mask_alignr_epi8(dataA, (uint32_t)(-(int32_t)maskA), dataA,
+			__m256i dataSwapped = _mm256_permute4x64_epi64(dataA, _MM_SHUFFLE(1,0,3,2));
+			dataA = _mm256_mask_alignr_epi8(dataA, (uint32_t)(-(int32_t)maskA), dataA, dataSwapped, 15);
			dataA = _mm256_ternarylogic_epi32(dataA, cmpA, _mm256_set1_epi8('='), 0xb8); // (data & ~cmp) | (cmp & '=')
			_mm256_storeu_si256((__m256i*)p, dataA);
+			p[32] = _mm_extract_epi8(_mm256_castsi256_si128(dataSwapped), 15);
			p += outputBytesA;
 
-
-			dataB = _mm256_mask_alignr_epi8(dataB, (uint32_t)(-(int32_t)maskB), dataB,
+			dataSwapped = _mm256_permute4x64_epi64(dataB, _MM_SHUFFLE(1,0,3,2));
+			dataB = _mm256_mask_alignr_epi8(dataB, (uint32_t)(-(int32_t)maskB), dataB, dataSwapped, 15);
			dataB = _mm256_ternarylogic_epi32(dataB, cmpB, _mm256_set1_epi8('='), 0xb8);
			_mm256_storeu_si256((__m256i*)p, dataB);
+			p[32] = _mm_extract_epi8(_mm256_castsi256_si128(dataSwapped), 15);
			p += maskBitsB;
		}
	} else
@@ -484,28 +488,32 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
	if(use_isa >= ISA_LEVEL_AVX3) {
 # if defined(__AVX512VBMI2__)
		if(use_isa >= ISA_LEVEL_VBMI2) {
-
+			__m128i dataTop = _mm256_extracti128_si256(dataA, 1);
			dataA = _mm256_mask_expand_epi8(_mm256_set1_epi8('='), KNOT32(maskA), dataA);
			_mm256_storeu_si256((__m256i*)p, dataA);
+			p[32] = _mm_extract_epi8(dataTop, 15);
			p += outputBytesA;
 
-
+			dataTop = _mm256_extracti128_si256(dataB, 1);
			dataB = _mm256_mask_expand_epi8(_mm256_set1_epi8('='), KNOT32(maskB), dataB);
			_mm256_storeu_si256((__m256i*)p, dataB);
+			p[32] = _mm_extract_epi8(dataTop, 15);
			p += maskBitsB;
		} else
 # endif
		{
-
-			dataA = _mm256_mask_alignr_epi8(dataA, (uint32_t)(-(int32_t)maskA), dataA,
+			__m256i dataSwapped = _mm256_permute4x64_epi64(dataA, _MM_SHUFFLE(1,0,3,2));
+			dataA = _mm256_mask_alignr_epi8(dataA, (uint32_t)(-(int32_t)maskA), dataA, dataSwapped, 15);
			dataA = _mm256_ternarylogic_epi32(dataA, cmpA, _mm256_set1_epi8('='), 0xb8); // (data & ~cmp) | (cmp & '=')
			_mm256_storeu_si256((__m256i*)p, dataA);
+			p[32] = _mm_extract_epi8(_mm256_castsi256_si128(dataSwapped), 15);
			p += outputBytesA;
 
-
-			dataB = _mm256_mask_alignr_epi8(dataB, (uint32_t)(-(int32_t)maskB), dataB,
+			dataSwapped = _mm256_permute4x64_epi64(dataB, _MM_SHUFFLE(1,0,3,2));
+			dataB = _mm256_mask_alignr_epi8(dataB, (uint32_t)(-(int32_t)maskB), dataB, dataSwapped, 15);
			dataB = _mm256_ternarylogic_epi32(dataB, cmpB, _mm256_set1_epi8('='), 0xb8);
			_mm256_storeu_si256((__m256i*)p, dataB);
+			p[32] = _mm_extract_epi8(_mm256_castsi256_si128(dataSwapped), 15);
			p += maskBitsB;
		}
	} else
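
The added p[32] stores in both paths above appear to handle output spill: inserting an '=' escape into a 32-byte block pushes the trailing input byte (lane 15 of the upper 128-bit half) past the 32-byte vector store, so that byte is written separately and the pointer then advances by the true output length. A simplified scalar model of why a 32-byte block can produce 33 output bytes; this is a hypothetical helper for illustration, not yencode code, and it omits critical-character detection and line handling:

    #include <cstdint>
    #include <cstddef>

    // Expand one 32-byte block, inserting '=' before each byte flagged in escMask.
    // With a single flagged byte the output is 33 bytes, i.e. one byte past a 32-byte vector store.
    static size_t encode_block_model(const uint8_t* in, uint32_t escMask, uint8_t* out) {
        size_t o = 0;
        for(int i = 0; i < 32; i++) {
            uint8_t b = in[i] + 42;              // yEnc base offset (wraps mod 256)
            if(escMask & (1u << i)) {
                out[o++] = '=';                  // escape marker
                b += 64;                         // escaped characters get an extra +64
            }
            out[o++] = b;
        }
        return o; // = 32 + popcount(escMask)
    }
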
package/src/encoder_neon.cc
CHANGED
@@ -5,7 +5,7 @@
 #include "encoder_common.h"
 
 // Clang wrongly assumes alignment on vst1q_u8_x2, and ARMv7 GCC doesn't support the function, so effectively, it can only be used in ARMv8 compilers
-#if defined(__aarch64__) && (defined(__clang__) || (
+#if defined(__aarch64__) && (defined(__clang__) || HEDLEY_GCC_VERSION_CHECK(8,5,0))
 # define vst1q_u8_x2_unaligned vst1q_u8_x2
 #else
 static HEDLEY_ALWAYS_INLINE void vst1q_u8_x2_unaligned(uint8_t* p, uint8x16x2_t data) {
@@ -15,6 +15,43 @@ static HEDLEY_ALWAYS_INLINE void vst1q_u8_x2_unaligned(uint8_t* p, uint8x16x2_t
 #endif
 
 
+// ARM's CLZ instruction at native bit-width
+#ifdef __aarch64__
+static HEDLEY_ALWAYS_INLINE int clz_n(uint64_t v) {
+# ifdef _MSC_VER
+	long r;
+	// does this work?
+	if(_BitScanReverse64((unsigned long*)&r, v))
+		r ^= 63;
+	else
+		r = 64;
+	return r;
+# else
+# if defined(__clang__) || HEDLEY_GCC_VERSION_CHECK(11,0,0)
+	// this pattern is only detected on GCC >= 11 (Clang 9 seems to as well, unsure about earlier versions)
+	// - note: return type must be 'int'; GCC fails to optimise this if type is 'long'
+	// GCC <= 10 doesn't optimize around the '0 = undefined behaviour', so not needed there
+	if(v == 0) return 64;
+# endif
+	return __builtin_clzll(v);
+# endif
+}
+#else
+static HEDLEY_ALWAYS_INLINE int clz_n(uint32_t v) {
+# ifdef __GNUC__
+# if defined(__clang__) || HEDLEY_GCC_VERSION_CHECK(7,0,0)
+	// as with AArch64 version above, only insert this check if compiler can optimise it away
+	if(v == 0) return 32;
+# endif
+	return __builtin_clz(v);
+# elif defined(_MSC_VER)
+	return _arm_clz(v);
+# else
+	return __clz(v); // ARM compiler?
+# endif
+}
+#endif
+
 static uint8x16_t ALIGN_TO(16, shufLUT[256]);
 static uint16_t expandLUT[256];
 
@@ -195,26 +232,7 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
		col = shufTotalLen+1 + lineSizeOffset-32;
	} else {
		// shuffle stuff up
-
-# ifdef _MSC_VER
-		long bitIndex;
-		if(_BitScanReverse64((unsigned long*)&bitIndex, mask))
-			bitIndex ^= 63;
-		else
-			bitIndex = 64;
-# else
-		long bitIndex = __builtin_clzll(mask);
-# endif
-#else
-# ifdef __GNUC__
-		long bitIndex = __builtin_clz(mask); // TODO: is the 'undefined if 0' case problematic here?
-# elif defined(_MSC_VER)
-		long bitIndex = _arm_clz(mask);
-# else
-		long bitIndex = __clz(mask); // ARM compiler?
-# endif
-#endif
-
+		long bitIndex = clz_n(mask);
		uint8x16_t vClz = vdupq_n_u8(bitIndex & ~(sizeof(mask)*8));
 #ifdef __aarch64__
		uint8x16_t blendA = vcgtq_u8(vmakeq_u8(63,62,61,60,51,50,49,48,47,46,45,44,35,34,33,32), vClz);
@@ -450,26 +468,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const ui
			}
		} else {
			{
-
-# ifdef _MSC_VER
-				// does this work?
-				if(_BitScanReverse64((unsigned long*)&bitIndex, mask))
-					bitIndex ^= 63;
-				else
-					bitIndex = 64;
-# else
-				bitIndex = __builtin_clzll(mask); // TODO: is the 'undefined if 0' case problematic here?
-# endif
-#else
-# ifdef __GNUC__
-				bitIndex = __builtin_clz(mask);
-# elif defined(_MSC_VER)
-				bitIndex = _arm_clz(mask);
-# else
-				bitIndex = __clz(mask); // ARM compiler?
-# endif
-#endif
-
+				bitIndex = clz_n(mask);
				uint8x16_t vClz = vdupq_n_u8(bitIndex & ~(sizeof(mask)*8));
 #ifdef __aarch64__
				uint8x16_t blendA = vcgeq_u8(vmakeq_u8(63,62,61,60,51,50,49,48,47,46,45,44,35,34,33,32), vClz);