yencode 1.1.3 → 1.1.5
This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/README.md +3 -2
- package/binding.gyp +75 -12
- package/index.js +21 -19
- package/package.json +2 -1
- package/src/common.h +43 -5
- package/src/crc.cc +137 -15
- package/src/crc.h +4 -0
- package/src/crc_arm.cc +11 -6
- package/src/crc_folding.cc +4 -5
- package/src/crc_folding_256.cc +10 -10
- package/src/decoder.cc +9 -4
- package/src/decoder.h +9 -5
- package/src/decoder_avx.cc +1 -0
- package/src/decoder_avx2.cc +1 -0
- package/src/decoder_avx2_base.h +14 -18
- package/src/decoder_common.h +30 -5
- package/src/decoder_neon.cc +7 -13
- package/src/decoder_neon64.cc +7 -12
- package/src/decoder_sse2.cc +1 -0
- package/src/decoder_sse_base.h +15 -14
- package/src/decoder_ssse3.cc +1 -0
- package/src/decoder_vbmi2.cc +9 -0
- package/src/encoder.cc +10 -1
- package/src/encoder.h +4 -0
- package/src/encoder_avx.cc +1 -0
- package/src/encoder_avx2.cc +1 -0
- package/src/encoder_avx_base.h +22 -14
- package/src/encoder_neon.cc +40 -40
- package/src/encoder_rvv.cc +220 -0
- package/src/encoder_sse2.cc +1 -0
- package/src/encoder_sse_base.h +3 -3
- package/src/encoder_ssse3.cc +1 -0
- package/src/encoder_vbmi2.cc +9 -0
- package/src/hedley.h +278 -135
- package/src/platform.cc +57 -9
- package/src/test_alignalloc.c +6 -0
- package/test/_speedbase.js +12 -11
- package/test/speeddec.js +6 -5
- package/test/testcrc.js +2 -2
- package/test/testdec.js +31 -15
- package/test/testenc.js +11 -8
- package/test/testpostdec.js +6 -5
package/src/crc_folding.cc
CHANGED
|
@@ -365,12 +365,11 @@ static uint32_t do_crc32_incremental_clmul(const void* data, size_t length, uint
|
|
|
365
365
|
return crc_fold((const unsigned char*)data, (long)length, init);
|
|
366
366
|
}
|
|
367
367
|
|
|
368
|
-
void crc_clmul_set_funcs(
|
|
369
|
-
|
|
368
|
+
void crc_clmul_set_funcs() {
|
|
369
|
+
_do_crc32_incremental = &do_crc32_incremental_clmul;
|
|
370
|
+
_crc32_isa = ISA_LEVEL_PCLMUL;
|
|
370
371
|
}
|
|
371
372
|
#else
|
|
372
|
-
void crc_clmul_set_funcs(
|
|
373
|
-
(void)_do_crc32_incremental;
|
|
374
|
-
}
|
|
373
|
+
void crc_clmul_set_funcs() {}
|
|
375
374
|
#endif
|
|
376
375
|
|
package/src/crc_folding_256.cc
CHANGED
|
@@ -26,10 +26,9 @@ static __m256i do_one_fold(__m256i src, __m256i data) {
|
|
|
26
26
|
0x96
|
|
27
27
|
);
|
|
28
28
|
#else
|
|
29
|
-
return _mm256_xor_si256(
|
|
30
|
-
_mm256_clmulepi64_epi128(src, fold4, 0x01)
|
|
31
|
-
|
|
32
|
-
));
|
|
29
|
+
return _mm256_xor_si256(_mm256_xor_si256(
|
|
30
|
+
data, _mm256_clmulepi64_epi128(src, fold4, 0x01)
|
|
31
|
+
), _mm256_clmulepi64_epi128(src, fold4, 0x10));
|
|
33
32
|
#endif
|
|
34
33
|
}
|
|
35
34
|
|
|
@@ -38,7 +37,7 @@ ALIGN_TO(32, static const uint8_t pshufb_rot_table[]) = {
|
|
|
38
37
|
16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
|
|
39
38
|
};
|
|
40
39
|
// _mm256_castsi128_si256, but upper is defined to be 0
|
|
41
|
-
#if (defined(__clang__) && __clang_major__ >= 5 && (!defined(__APPLE__) || __clang_major__ >= 7)) || (defined(__GNUC__) && __GNUC__ >= 10)
|
|
40
|
+
#if (defined(__clang__) && __clang_major__ >= 5 && (!defined(__APPLE__) || __clang_major__ >= 7)) || (defined(__GNUC__) && __GNUC__ >= 10) || (defined(_MSC_VER) && _MSC_VER >= 1910)
|
|
42
41
|
// intrinsic unsupported in GCC 9 and MSVC < 2017
|
|
43
42
|
# define zext128_256 _mm256_zextsi128_si256
|
|
44
43
|
#else
|
|
@@ -218,13 +217,14 @@ static uint32_t do_crc32_incremental_clmul(const void* data, size_t length, uint
|
|
|
218
217
|
return crc_fold((const unsigned char*)data, (long)length, init);
|
|
219
218
|
}
|
|
220
219
|
|
|
221
|
-
void crc_clmul256_set_funcs(
|
|
222
|
-
|
|
220
|
+
void crc_clmul256_set_funcs() {
|
|
221
|
+
_do_crc32_incremental = &do_crc32_incremental_clmul;
|
|
222
|
+
_crc32_isa = ISA_LEVEL_VPCLMUL;
|
|
223
223
|
}
|
|
224
224
|
#else
|
|
225
|
-
void crc_clmul_set_funcs(
|
|
226
|
-
void crc_clmul256_set_funcs(
|
|
227
|
-
crc_clmul_set_funcs(
|
|
225
|
+
void crc_clmul_set_funcs();
|
|
226
|
+
void crc_clmul256_set_funcs() {
|
|
227
|
+
crc_clmul_set_funcs();
|
|
228
228
|
}
|
|
229
229
|
#endif
|
|
230
230
|
|
package/src/decoder.cc
CHANGED
|
@@ -4,9 +4,11 @@
|
|
|
4
4
|
#include "decoder.h"
|
|
5
5
|
|
|
6
6
|
extern "C" {
|
|
7
|
-
YencDecoderEnd (*_do_decode)(const unsigned char
|
|
8
|
-
YencDecoderEnd (*_do_decode_raw)(const unsigned char
|
|
9
|
-
YencDecoderEnd (*_do_decode_end_raw)(const unsigned char
|
|
7
|
+
YencDecoderEnd (*_do_decode)(const unsigned char**, unsigned char**, size_t, YencDecoderState*) = &do_decode_scalar<false, false>;
|
|
8
|
+
YencDecoderEnd (*_do_decode_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*) = &do_decode_scalar<true, false>;
|
|
9
|
+
YencDecoderEnd (*_do_decode_end_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*) = &do_decode_end_scalar<true>;
|
|
10
|
+
|
|
11
|
+
int _decode_isa = ISA_GENERIC;
|
|
10
12
|
}
|
|
11
13
|
|
|
12
14
|
void decoder_set_sse2_funcs();
|
|
@@ -14,6 +16,7 @@ void decoder_set_ssse3_funcs();
|
|
|
14
16
|
void decoder_set_avx_funcs();
|
|
15
17
|
void decoder_set_avx2_funcs();
|
|
16
18
|
void decoder_set_vbmi2_funcs();
|
|
19
|
+
extern const bool decoder_has_avx10;
|
|
17
20
|
void decoder_set_neon_funcs();
|
|
18
21
|
|
|
19
22
|
|
|
@@ -26,6 +29,7 @@ static inline void decoder_set_native_funcs() {
|
|
|
26
29
|
_do_decode = &do_decode_simd<false, false, sizeof(__m256i)*2, do_decode_avx2<false, false, ISA_NATIVE> >;
|
|
27
30
|
_do_decode_raw = &do_decode_simd<true, false, sizeof(__m256i)*2, do_decode_avx2<true, false, ISA_NATIVE> >;
|
|
28
31
|
_do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m256i)*2, do_decode_avx2<true, true, ISA_NATIVE> >;
|
|
32
|
+
_decode_isa = ISA_NATIVE;
|
|
29
33
|
}
|
|
30
34
|
# else
|
|
31
35
|
# include "decoder_sse_base.h"
|
|
@@ -35,6 +39,7 @@ static inline void decoder_set_native_funcs() {
|
|
|
35
39
|
_do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_NATIVE> >;
|
|
36
40
|
_do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_NATIVE> >;
|
|
37
41
|
_do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_NATIVE> >;
|
|
42
|
+
_decode_isa = ISA_NATIVE;
|
|
38
43
|
}
|
|
39
44
|
# endif
|
|
40
45
|
#endif
|
|
@@ -45,7 +50,7 @@ void decoder_init() {
|
|
|
45
50
|
decoder_set_native_funcs();
|
|
46
51
|
# else
|
|
47
52
|
int use_isa = cpu_supports_isa();
|
|
48
|
-
if(use_isa >= ISA_LEVEL_VBMI2)
|
|
53
|
+
if(use_isa >= ISA_LEVEL_VBMI2 && (decoder_has_avx10 || (use_isa & ISA_FEATURE_EVEX512)))
|
|
49
54
|
decoder_set_vbmi2_funcs();
|
|
50
55
|
else if(use_isa >= ISA_LEVEL_AVX2)
|
|
51
56
|
decoder_set_avx2_funcs();
|
package/src/decoder.h
CHANGED
|
@@ -29,22 +29,26 @@ typedef enum {
|
|
|
29
29
|
|
|
30
30
|
#include "hedley.h"
|
|
31
31
|
|
|
32
|
-
extern YencDecoderEnd (*_do_decode)(const unsigned char
|
|
33
|
-
extern YencDecoderEnd (*_do_decode_raw)(const unsigned char
|
|
34
|
-
extern YencDecoderEnd (*_do_decode_end_raw)(const unsigned char
|
|
32
|
+
extern YencDecoderEnd (*_do_decode)(const unsigned char**, unsigned char**, size_t, YencDecoderState*);
|
|
33
|
+
extern YencDecoderEnd (*_do_decode_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*);
|
|
34
|
+
extern YencDecoderEnd (*_do_decode_end_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*);
|
|
35
|
+
extern int _decode_isa;
|
|
35
36
|
|
|
36
|
-
static inline size_t do_decode(int isRaw, const unsigned char*
|
|
37
|
+
static inline size_t do_decode(int isRaw, const unsigned char* src, unsigned char* dest, size_t len, YencDecoderState* state) {
|
|
37
38
|
unsigned char* ds = dest;
|
|
38
39
|
(*(isRaw ? _do_decode_raw : _do_decode))(&src, &ds, len, state);
|
|
39
40
|
return ds - dest;
|
|
40
41
|
}
|
|
41
42
|
|
|
42
|
-
static inline YencDecoderEnd do_decode_end(const unsigned char
|
|
43
|
+
static inline YencDecoderEnd do_decode_end(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
|
|
43
44
|
return _do_decode_end_raw(src, dest, len, state);
|
|
44
45
|
}
|
|
45
46
|
|
|
46
47
|
void decoder_init();
|
|
47
48
|
|
|
49
|
+
static inline int decode_isa_level() {
|
|
50
|
+
return _decode_isa;
|
|
51
|
+
}
|
|
48
52
|
|
|
49
53
|
|
|
50
54
|
#ifdef __cplusplus
|
package/src/decoder_avx.cc
CHANGED
|
@@ -9,6 +9,7 @@ void decoder_set_avx_funcs() {
|
|
|
9
9
|
_do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_SSE4_POPCNT> >;
|
|
10
10
|
_do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_SSE4_POPCNT> >;
|
|
11
11
|
_do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_SSE4_POPCNT> >;
|
|
12
|
+
_decode_isa = ISA_LEVEL_AVX;
|
|
12
13
|
}
|
|
13
14
|
#else
|
|
14
15
|
void decoder_set_ssse3_funcs();
|
package/src/decoder_avx2.cc
CHANGED
|
@@ -9,6 +9,7 @@ void decoder_set_avx2_funcs() {
|
|
|
9
9
|
_do_decode = &do_decode_simd<false, false, sizeof(__m256i)*2, do_decode_avx2<false, false, ISA_LEVEL_AVX2> >;
|
|
10
10
|
_do_decode_raw = &do_decode_simd<true, false, sizeof(__m256i)*2, do_decode_avx2<true, false, ISA_LEVEL_AVX2> >;
|
|
11
11
|
_do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m256i)*2, do_decode_avx2<true, true, ISA_LEVEL_AVX2> >;
|
|
12
|
+
_decode_isa = ISA_LEVEL_AVX2;
|
|
12
13
|
}
|
|
13
14
|
#else
|
|
14
15
|
void decoder_set_avx_funcs();
|
package/src/decoder_avx2_base.h
CHANGED
|
@@ -30,7 +30,7 @@ static HEDLEY_ALWAYS_INLINE __m256i force_align_read_256(const void* p) {
|
|
|
30
30
|
}
|
|
31
31
|
|
|
32
32
|
// _mm256_castsi128_si256, but upper is defined to be 0
|
|
33
|
-
#if (defined(__clang__) && __clang_major__ >= 5 && (!defined(__APPLE__) || __clang_major__ >= 7)) || (defined(__GNUC__) && __GNUC__ >= 10)
|
|
33
|
+
#if (defined(__clang__) && __clang_major__ >= 5 && (!defined(__APPLE__) || __clang_major__ >= 7)) || (defined(__GNUC__) && __GNUC__ >= 10) || (defined(_MSC_VER) && _MSC_VER >= 1910)
|
|
34
34
|
// intrinsic unsupported in GCC 9 and MSVC < 2017
|
|
35
35
|
# define zext128_256 _mm256_zextsi128_si256
|
|
36
36
|
#else
|
|
@@ -43,9 +43,15 @@ static HEDLEY_ALWAYS_INLINE __m256i force_align_read_256(const void* p) {
|
|
|
43
43
|
# endif
|
|
44
44
|
#endif
|
|
45
45
|
|
|
46
|
+
#if defined(__tune_icelake_client__) || defined(__tune_icelake_server__) || defined(__tune_tigerlake__) || defined(__tune_rocketlake__) || defined(__tune_alderlake__) || defined(__tune_sapphirerapids__)
|
|
47
|
+
# define COMPRESS_STORE _mm256_mask_compressstoreu_epi8
|
|
48
|
+
#else
|
|
49
|
+
// avoid uCode on Zen4
|
|
50
|
+
# define COMPRESS_STORE(dst, mask, vec) _mm256_storeu_si256((__m256i*)(dst), _mm256_maskz_compress_epi8(mask, vec))
|
|
51
|
+
#endif
|
|
46
52
|
|
|
47
53
|
template<bool isRaw, bool searchEnd, enum YEncDecIsaLevel use_isa>
|
|
48
|
-
HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t*
|
|
54
|
+
HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* src, long& len, unsigned char*& p, unsigned char& _escFirst, uint16_t& _nextMask) {
|
|
49
55
|
HEDLEY_ASSUME(_escFirst == 0 || _escFirst == 1);
|
|
50
56
|
HEDLEY_ASSUME(_nextMask == 0 || _nextMask == 1 || _nextMask == 2);
|
|
51
57
|
uintptr_t escFirst = _escFirst;
|
|
@@ -61,6 +67,8 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
|
|
|
61
67
|
);
|
|
62
68
|
}
|
|
63
69
|
|
|
70
|
+
decoder_set_nextMask<isRaw>(src, len, _nextMask); // set this before the loop because we can't check src after it's been overwritten
|
|
71
|
+
|
|
64
72
|
// for some reason, MSVC Win32 seems to crash when trying to compile _mm256_mask_cmpeq_epi8_mask
|
|
65
73
|
// the crash can be fixed by switching the order of the last two arguments, but it seems to generate wrong code
|
|
66
74
|
// so just disable the optimisation as it seems to be problematic there
|
|
@@ -314,6 +322,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
|
|
|
314
322
|
// terminator found
|
|
315
323
|
// there's probably faster ways to do this, but reverting to scalar code should be good enough
|
|
316
324
|
len += (long)i;
|
|
325
|
+
_nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
|
|
317
326
|
break;
|
|
318
327
|
}
|
|
319
328
|
}
|
|
@@ -406,6 +415,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
|
|
|
406
415
|
}
|
|
407
416
|
if(endFound) {
|
|
408
417
|
len += (long)i;
|
|
418
|
+
_nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
|
|
409
419
|
break;
|
|
410
420
|
}
|
|
411
421
|
}
|
|
@@ -541,9 +551,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
|
|
|
541
551
|
// all that's left is to 'compress' the data (skip over masked chars)
|
|
542
552
|
#if defined(__AVX512VBMI2__) && defined(__AVX512VL__)
|
|
543
553
|
if(use_isa >= ISA_LEVEL_VBMI2) {
|
|
544
|
-
|
|
554
|
+
COMPRESS_STORE(p, KNOT32(mask), dataA);
|
|
545
555
|
p -= popcnt32(mask & 0xffffffff);
|
|
546
|
-
|
|
556
|
+
COMPRESS_STORE((p + XMM_SIZE*2), KNOT32(mask>>32), dataB);
|
|
547
557
|
p += XMM_SIZE*4 - popcnt32(mask >> 32);
|
|
548
558
|
} else
|
|
549
559
|
#endif
|
|
@@ -607,20 +617,6 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
|
|
|
607
617
|
}
|
|
608
618
|
}
|
|
609
619
|
_escFirst = (unsigned char)escFirst;
|
|
610
|
-
if(isRaw) {
|
|
611
|
-
// this would be the trivial solution, but requires the compiler holding onto minMask throughout the loop:
|
|
612
|
-
//_nextMask = ~(uint16_t)_mm256_movemask_epi8(_mm256_cmpeq_epi8(minMask, _mm256_set1_epi8('.')));
|
|
613
|
-
// instead, just scan the memory to determine what to set nextMask to
|
|
614
|
-
if(len != 0) { // have to gone through at least one loop cycle
|
|
615
|
-
if(src[i-2] == '\r' && src[i-1] == '\n' && src[i] == '.')
|
|
616
|
-
_nextMask = 1;
|
|
617
|
-
else if(src[i-1] == '\r' && src[i] == '\n' && src[i+1] == '.')
|
|
618
|
-
_nextMask = 2;
|
|
619
|
-
else
|
|
620
|
-
_nextMask = 0;
|
|
621
|
-
}
|
|
622
|
-
} else
|
|
623
|
-
_nextMask = 0;
|
|
624
620
|
_mm256_zeroupper();
|
|
625
621
|
}
|
|
626
622
|
#endif
|
package/src/decoder_common.h
CHANGED
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
|
|
7
7
|
// state var: refers to the previous state - only used for incremental processing
|
|
8
8
|
template<bool isRaw>
|
|
9
|
-
size_t do_decode_noend_scalar(const unsigned char*
|
|
9
|
+
size_t do_decode_noend_scalar(const unsigned char* src, unsigned char* dest, size_t len, YencDecoderState* state) {
|
|
10
10
|
const unsigned char *es = src + len; // end source pointer
|
|
11
11
|
unsigned char *p = dest; // destination pointer
|
|
12
12
|
long i = -(long)len; // input position
|
|
@@ -140,7 +140,7 @@ size_t do_decode_noend_scalar(const unsigned char* HEDLEY_RESTRICT src, unsigned
|
|
|
140
140
|
}
|
|
141
141
|
|
|
142
142
|
template<bool isRaw>
|
|
143
|
-
YencDecoderEnd do_decode_end_scalar(const unsigned char
|
|
143
|
+
YencDecoderEnd do_decode_end_scalar(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
|
|
144
144
|
const unsigned char *es = (*src) + len; // end source pointer
|
|
145
145
|
unsigned char *p = *dest; // destination pointer
|
|
146
146
|
long i = -(long)len; // input position
|
|
@@ -321,7 +321,7 @@ YencDecoderEnd do_decode_end_scalar(const unsigned char* HEDLEY_RESTRICT* src, u
|
|
|
321
321
|
}
|
|
322
322
|
|
|
323
323
|
template<bool isRaw, bool searchEnd>
|
|
324
|
-
YencDecoderEnd do_decode_scalar(const unsigned char
|
|
324
|
+
YencDecoderEnd do_decode_scalar(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
|
|
325
325
|
if(searchEnd)
|
|
326
326
|
return do_decode_end_scalar<isRaw>(src, dest, len, state);
|
|
327
327
|
*dest += do_decode_noend_scalar<isRaw>(*src, *dest, len, state);
|
|
@@ -331,8 +331,8 @@ YencDecoderEnd do_decode_scalar(const unsigned char* HEDLEY_RESTRICT* src, unsig
|
|
|
331
331
|
|
|
332
332
|
|
|
333
333
|
|
|
334
|
-
template<bool isRaw, bool searchEnd, int width, void(&kernel)(const uint8_t
|
|
335
|
-
YencDecoderEnd do_decode_simd(const unsigned char
|
|
334
|
+
template<bool isRaw, bool searchEnd, int width, void(&kernel)(const uint8_t*, long&, unsigned char*&, unsigned char&, uint16_t&)>
|
|
335
|
+
YencDecoderEnd do_decode_simd(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
|
|
336
336
|
if(len <= width*2) return do_decode_scalar<isRaw, searchEnd>(src, dest, len, state);
|
|
337
337
|
|
|
338
338
|
YencDecoderState tState = YDEC_STATE_CRLF;
|
|
@@ -509,4 +509,29 @@ static inline void decoder_init_lut(uint8_t* eqFixLUT, void* compactLUT) {
|
|
|
509
509
|
}
|
|
510
510
|
#endif
|
|
511
511
|
}
|
|
512
|
+
template<bool isRaw>
|
|
513
|
+
static inline void decoder_set_nextMask(const uint8_t* src, size_t len, uint16_t& nextMask) {
|
|
514
|
+
if(isRaw) {
|
|
515
|
+
if(len != 0) { // have to gone through at least one loop cycle
|
|
516
|
+
if(src[-2] == '\r' && src[-1] == '\n' && src[0] == '.')
|
|
517
|
+
nextMask = 1;
|
|
518
|
+
else if(src[-1] == '\r' && src[0] == '\n' && src[1] == '.')
|
|
519
|
+
nextMask = 2;
|
|
520
|
+
else
|
|
521
|
+
nextMask = 0;
|
|
522
|
+
}
|
|
523
|
+
} else
|
|
524
|
+
nextMask = 0;
|
|
525
|
+
}
|
|
512
526
|
|
|
527
|
+
// without backtracking
|
|
528
|
+
template<bool isRaw>
|
|
529
|
+
static inline uint16_t decoder_set_nextMask(const uint8_t* src, unsigned mask) {
|
|
530
|
+
if(isRaw) {
|
|
531
|
+
if(src[0] == '.')
|
|
532
|
+
return mask & 1;
|
|
533
|
+
if(src[1] == '.')
|
|
534
|
+
return mask & 2;
|
|
535
|
+
}
|
|
536
|
+
return 0;
|
|
537
|
+
}
|
package/src/decoder_neon.cc
CHANGED
|
@@ -59,7 +59,7 @@ static bool neon_vect_is_nonzero(uint8x16_t v) {
|
|
|
59
59
|
|
|
60
60
|
|
|
61
61
|
template<bool isRaw, bool searchEnd>
|
|
62
|
-
HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t*
|
|
62
|
+
HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned char*& p, unsigned char& escFirst, uint16_t& nextMask) {
|
|
63
63
|
HEDLEY_ASSUME(escFirst == 0 || escFirst == 1);
|
|
64
64
|
HEDLEY_ASSUME(nextMask == 0 || nextMask == 1 || nextMask == 2);
|
|
65
65
|
uint8x16_t yencOffset = escFirst ? vmakeq_u8(42+64,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42) : vdupq_n_u8(42);
|
|
@@ -78,6 +78,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
|
|
|
78
78
|
lfCompare = vsetq_lane_u8('.', lfCompare, 1);
|
|
79
79
|
}
|
|
80
80
|
#endif
|
|
81
|
+
|
|
82
|
+
decoder_set_nextMask<isRaw>(src, len, nextMask);
|
|
83
|
+
|
|
81
84
|
long i;
|
|
82
85
|
for(i = -len; i; i += sizeof(uint8x16_t)*2) {
|
|
83
86
|
uint8x16x2_t data = vld1q_u8_x2_align(src+i, 32);
|
|
@@ -251,6 +254,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
|
|
|
251
254
|
// terminator found
|
|
252
255
|
// there's probably faster ways to do this, but reverting to scalar code should be good enough
|
|
253
256
|
len += i;
|
|
257
|
+
nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
|
|
254
258
|
break;
|
|
255
259
|
}
|
|
256
260
|
}
|
|
@@ -301,6 +305,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
|
|
|
301
305
|
);
|
|
302
306
|
if(LIKELIHOOD(0.001, neon_vect_is_nonzero(matchEnd))) {
|
|
303
307
|
len += i;
|
|
308
|
+
nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
|
|
304
309
|
break;
|
|
305
310
|
}
|
|
306
311
|
}
|
|
@@ -449,18 +454,6 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
|
|
|
449
454
|
#endif
|
|
450
455
|
}
|
|
451
456
|
}
|
|
452
|
-
|
|
453
|
-
if(isRaw) {
|
|
454
|
-
if(len != 0) { // have to gone through at least one loop cycle
|
|
455
|
-
if(src[i-2] == '\r' && src[i-1] == '\n' && src[i] == '.')
|
|
456
|
-
nextMask = 1;
|
|
457
|
-
else if(src[i-1] == '\r' && src[i] == '\n' && src[i+1] == '.')
|
|
458
|
-
nextMask = 2;
|
|
459
|
-
else
|
|
460
|
-
nextMask = 0;
|
|
461
|
-
}
|
|
462
|
-
} else
|
|
463
|
-
nextMask = 0;
|
|
464
457
|
}
|
|
465
458
|
|
|
466
459
|
void decoder_set_neon_funcs() {
|
|
@@ -468,6 +461,7 @@ void decoder_set_neon_funcs() {
|
|
|
468
461
|
_do_decode = &do_decode_simd<false, false, sizeof(uint8x16_t)*2, do_decode_neon<false, false> >;
|
|
469
462
|
_do_decode_raw = &do_decode_simd<true, false, sizeof(uint8x16_t)*2, do_decode_neon<true, false> >;
|
|
470
463
|
_do_decode_end_raw = &do_decode_simd<true, true, sizeof(uint8x16_t)*2, do_decode_neon<true, true> >;
|
|
464
|
+
_decode_isa = ISA_LEVEL_NEON;
|
|
471
465
|
}
|
|
472
466
|
#else
|
|
473
467
|
void decoder_set_neon_funcs() {}
|
package/src/decoder_neon64.cc
CHANGED
|
@@ -47,7 +47,7 @@ static HEDLEY_ALWAYS_INLINE uint8x16_t mergeCompares(uint8x16_t a, uint8x16_t b,
|
|
|
47
47
|
|
|
48
48
|
|
|
49
49
|
template<bool isRaw, bool searchEnd>
|
|
50
|
-
HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t*
|
|
50
|
+
HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned char*& p, unsigned char& escFirst, uint16_t& nextMask) {
|
|
51
51
|
HEDLEY_ASSUME(escFirst == 0 || escFirst == 1);
|
|
52
52
|
HEDLEY_ASSUME(nextMask == 0 || nextMask == 1 || nextMask == 2);
|
|
53
53
|
uint8x16_t nextMaskMix = vdupq_n_u8(0);
|
|
@@ -56,6 +56,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
|
|
|
56
56
|
if(nextMask == 2)
|
|
57
57
|
nextMaskMix = vsetq_lane_u8(2, nextMaskMix, 1);
|
|
58
58
|
uint8x16_t yencOffset = escFirst ? vmakeq_u8(42+64,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42) : vdupq_n_u8(42);
|
|
59
|
+
|
|
60
|
+
decoder_set_nextMask<isRaw>(src, len, nextMask);
|
|
61
|
+
|
|
59
62
|
long i;
|
|
60
63
|
for(i = -len; i; i += sizeof(uint8x16_t)*4) {
|
|
61
64
|
uint8x16x4_t data = _vld1q_u8_x4(src+i);
|
|
@@ -227,6 +230,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
|
|
|
227
230
|
// terminator found
|
|
228
231
|
// there's probably faster ways to do this, but reverting to scalar code should be good enough
|
|
229
232
|
len += i;
|
|
233
|
+
nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
|
|
230
234
|
break;
|
|
231
235
|
}
|
|
232
236
|
}
|
|
@@ -275,6 +279,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
|
|
|
275
279
|
);
|
|
276
280
|
if(LIKELIHOOD(0.001, neon_vect_is_nonzero(matchEnd))) {
|
|
277
281
|
len += i;
|
|
282
|
+
nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
|
|
278
283
|
break;
|
|
279
284
|
}
|
|
280
285
|
}
|
|
@@ -430,17 +435,6 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
|
|
|
430
435
|
yencOffset = vdupq_n_u8(42);
|
|
431
436
|
}
|
|
432
437
|
}
|
|
433
|
-
if(isRaw) {
|
|
434
|
-
if(len != 0) { // have to gone through at least one loop cycle
|
|
435
|
-
if(src[i-2] == '\r' && src[i-1] == '\n' && src[i] == '.')
|
|
436
|
-
nextMask = 1;
|
|
437
|
-
else if(src[i-1] == '\r' && src[i] == '\n' && src[i+1] == '.')
|
|
438
|
-
nextMask = 2;
|
|
439
|
-
else
|
|
440
|
-
nextMask = 0;
|
|
441
|
-
}
|
|
442
|
-
} else
|
|
443
|
-
nextMask = 0;
|
|
444
438
|
}
|
|
445
439
|
|
|
446
440
|
void decoder_set_neon_funcs() {
|
|
@@ -448,6 +442,7 @@ void decoder_set_neon_funcs() {
|
|
|
448
442
|
_do_decode = &do_decode_simd<false, false, sizeof(uint8x16_t)*4, do_decode_neon<false, false> >;
|
|
449
443
|
_do_decode_raw = &do_decode_simd<true, false, sizeof(uint8x16_t)*4, do_decode_neon<true, false> >;
|
|
450
444
|
_do_decode_end_raw = &do_decode_simd<true, true, sizeof(uint8x16_t)*4, do_decode_neon<true, true> >;
|
|
445
|
+
_decode_isa = ISA_LEVEL_NEON;
|
|
451
446
|
}
|
|
452
447
|
#else
|
|
453
448
|
void decoder_set_neon_funcs() {}
|
package/src/decoder_sse2.cc
CHANGED
|
@@ -10,6 +10,7 @@ void decoder_set_sse2_funcs() {
|
|
|
10
10
|
_do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_SSE2> >;
|
|
11
11
|
_do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_SSE2> >;
|
|
12
12
|
_do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_SSE2> >;
|
|
13
|
+
_decode_isa = ISA_LEVEL_SSE2;
|
|
13
14
|
}
|
|
14
15
|
#else
|
|
15
16
|
void decoder_set_sse2_funcs() {}
|
package/src/decoder_sse_base.h
CHANGED
|
@@ -7,6 +7,13 @@
|
|
|
7
7
|
# define _mm_shrdi_epi16 _mm128_shrdi_epi16
|
|
8
8
|
#endif
|
|
9
9
|
|
|
10
|
+
#if defined(__tune_icelake_client__) || defined(__tune_icelake_server__) || defined(__tune_tigerlake__) || defined(__tune_rocketlake__) || defined(__tune_alderlake__) || defined(__tune_sapphirerapids__)
|
|
11
|
+
# define COMPRESS_STORE _mm_mask_compressstoreu_epi8
|
|
12
|
+
#else
|
|
13
|
+
// avoid uCode on Zen4
|
|
14
|
+
# define COMPRESS_STORE(dst, mask, vec) _mm_storeu_si128((__m128i*)(dst), _mm_maskz_compress_epi8(mask, vec))
|
|
15
|
+
#endif
|
|
16
|
+
|
|
10
17
|
// GCC (ver 6-10(dev)) fails to optimize pure C version of mask testing, but has this intrinsic; Clang >= 7 optimizes C version fine
|
|
11
18
|
#if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
|
|
12
19
|
# define KORTEST16(a, b) !_kortestz_mask16_u8((a), (b))
|
|
@@ -104,7 +111,7 @@ static HEDLEY_ALWAYS_INLINE __m128i sse2_compact_vect(uint32_t mask, __m128i dat
|
|
|
104
111
|
}
|
|
105
112
|
|
|
106
113
|
template<bool isRaw, bool searchEnd, enum YEncDecIsaLevel use_isa>
|
|
107
|
-
HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t*
|
|
114
|
+
HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* src, long& len, unsigned char*& p, unsigned char& _escFirst, uint16_t& _nextMask) {
|
|
108
115
|
HEDLEY_ASSUME(_escFirst == 0 || _escFirst == 1);
|
|
109
116
|
HEDLEY_ASSUME(_nextMask == 0 || _nextMask == 1 || _nextMask == 2);
|
|
110
117
|
uintptr_t escFirst = _escFirst;
|
|
@@ -138,6 +145,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
|
|
|
138
145
|
else
|
|
139
146
|
lfCompare = _mm_insert_epi16(lfCompare, _nextMask == 1 ? 0x0a2e /*".\n"*/ : 0x2e0a /*"\n."*/, 0);
|
|
140
147
|
}
|
|
148
|
+
|
|
149
|
+
decoder_set_nextMask<isRaw>(src, len, _nextMask); // set this before the loop because we can't check src after it's been overwritten
|
|
150
|
+
|
|
141
151
|
intptr_t i;
|
|
142
152
|
for(i = -len; i; i += sizeof(__m128i)*2) {
|
|
143
153
|
__m128i oDataA = _mm_load_si128((__m128i *)(src+i));
|
|
@@ -376,6 +386,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
|
|
|
376
386
|
// terminator found
|
|
377
387
|
// there's probably faster ways to do this, but reverting to scalar code should be good enough
|
|
378
388
|
len += (long)i;
|
|
389
|
+
_nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
|
|
379
390
|
break;
|
|
380
391
|
}
|
|
381
392
|
}
|
|
@@ -485,6 +496,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
|
|
|
485
496
|
|
|
486
497
|
if(endFound) {
|
|
487
498
|
len += (long)i;
|
|
499
|
+
_nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
|
|
488
500
|
break;
|
|
489
501
|
}
|
|
490
502
|
}
|
|
@@ -649,9 +661,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
|
|
|
649
661
|
if(use_isa >= ISA_LEVEL_SSSE3) {
|
|
650
662
|
# if defined(__AVX512VBMI2__) && defined(__AVX512VL__) && defined(__POPCNT__)
|
|
651
663
|
if(use_isa >= ISA_LEVEL_VBMI2) {
|
|
652
|
-
|
|
664
|
+
COMPRESS_STORE(p, KNOT16(mask), dataA);
|
|
653
665
|
p -= popcnt32(mask & 0xffff);
|
|
654
|
-
|
|
666
|
+
COMPRESS_STORE(p+XMM_SIZE, KNOT16(mask>>16), dataB);
|
|
655
667
|
p -= popcnt32(mask>>16);
|
|
656
668
|
p += XMM_SIZE*2;
|
|
657
669
|
} else
|
|
@@ -703,16 +715,5 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
|
|
|
703
715
|
}
|
|
704
716
|
}
|
|
705
717
|
_escFirst = (unsigned char)escFirst;
|
|
706
|
-
if(isRaw) {
|
|
707
|
-
if(len != 0) { // have to gone through at least one loop cycle
|
|
708
|
-
if(src[i-2] == '\r' && src[i-1] == '\n' && src[i] == '.')
|
|
709
|
-
_nextMask = 1;
|
|
710
|
-
else if(src[i-1] == '\r' && src[i] == '\n' && src[i+1] == '.')
|
|
711
|
-
_nextMask = 2;
|
|
712
|
-
else
|
|
713
|
-
_nextMask = 0;
|
|
714
|
-
}
|
|
715
|
-
} else
|
|
716
|
-
_nextMask = 0;
|
|
717
718
|
}
|
|
718
719
|
#endif
|
package/src/decoder_ssse3.cc
CHANGED
|
@@ -9,6 +9,7 @@ void decoder_set_ssse3_funcs() {
|
|
|
9
9
|
_do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_SSSE3> >;
|
|
10
10
|
_do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_SSSE3> >;
|
|
11
11
|
_do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_SSSE3> >;
|
|
12
|
+
_decode_isa = ISA_LEVEL_SSSE3;
|
|
12
13
|
}
|
|
13
14
|
#else
|
|
14
15
|
void decoder_set_sse2_funcs();
|
package/src/decoder_vbmi2.cc
CHANGED
|
@@ -1,5 +1,12 @@
|
|
|
1
1
|
#include "common.h"
|
|
2
2
|
|
|
3
|
+
extern const bool decoder_has_avx10;
|
|
4
|
+
#if !defined(__EVEX512__) && (defined(__AVX10_1__) || defined(__EVEX256__)) && defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
|
|
5
|
+
const bool decoder_has_avx10 = true;
|
|
6
|
+
#else
|
|
7
|
+
const bool decoder_has_avx10 = false;
|
|
8
|
+
#endif
|
|
9
|
+
|
|
3
10
|
#if defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
|
|
4
11
|
# include "decoder_common.h"
|
|
5
12
|
# ifndef YENC_DISABLE_AVX256
|
|
@@ -11,6 +18,7 @@ void decoder_set_vbmi2_funcs() {
|
|
|
11
18
|
_do_decode = &do_decode_simd<false, false, sizeof(__m256i)*2, do_decode_avx2<false, false, ISA_LEVEL_VBMI2> >;
|
|
12
19
|
_do_decode_raw = &do_decode_simd<true, false, sizeof(__m256i)*2, do_decode_avx2<true, false, ISA_LEVEL_VBMI2> >;
|
|
13
20
|
_do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m256i)*2, do_decode_avx2<true, true, ISA_LEVEL_VBMI2> >;
|
|
21
|
+
_decode_isa = ISA_LEVEL_VBMI2;
|
|
14
22
|
}
|
|
15
23
|
# else
|
|
16
24
|
# include "decoder_sse_base.h"
|
|
@@ -20,6 +28,7 @@ void decoder_set_vbmi2_funcs() {
|
|
|
20
28
|
_do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_VBMI2> >;
|
|
21
29
|
_do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_VBMI2> >;
|
|
22
30
|
_do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_VBMI2> >;
|
|
31
|
+
_decode_isa = ISA_LEVEL_VBMI2;
|
|
23
32
|
}
|
|
24
33
|
# endif
|
|
25
34
|
#else
|
package/src/encoder.cc
CHANGED
|
@@ -122,6 +122,7 @@ size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HED
|
|
|
122
122
|
|
|
123
123
|
extern "C" {
|
|
124
124
|
size_t (*_do_encode)(int, int*, const unsigned char* HEDLEY_RESTRICT, unsigned char* HEDLEY_RESTRICT, size_t, int) = &do_encode_generic;
|
|
125
|
+
int _encode_isa = ISA_GENERIC;
|
|
125
126
|
}
|
|
126
127
|
|
|
127
128
|
void encoder_sse2_init();
|
|
@@ -129,7 +130,9 @@ void encoder_ssse3_init();
|
|
|
129
130
|
void encoder_avx_init();
|
|
130
131
|
void encoder_avx2_init();
|
|
131
132
|
void encoder_vbmi2_init();
|
|
133
|
+
extern const bool encoder_has_avx10;
|
|
132
134
|
void encoder_neon_init();
|
|
135
|
+
void encoder_rvv_init();
|
|
133
136
|
|
|
134
137
|
#if defined(PLATFORM_X86) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
|
|
135
138
|
# if defined(__AVX2__) && !defined(YENC_DISABLE_AVX256)
|
|
@@ -137,12 +140,14 @@ void encoder_neon_init();
|
|
|
137
140
|
static inline void encoder_native_init() {
|
|
138
141
|
_do_encode = &do_encode_simd< do_encode_avx2<ISA_NATIVE> >;
|
|
139
142
|
encoder_avx2_lut<ISA_NATIVE>();
|
|
143
|
+
_encode_isa = ISA_NATIVE;
|
|
140
144
|
}
|
|
141
145
|
# else
|
|
142
146
|
# include "encoder_sse_base.h"
|
|
143
147
|
static inline void encoder_native_init() {
|
|
144
148
|
_do_encode = &do_encode_simd< do_encode_sse<ISA_NATIVE> >;
|
|
145
149
|
encoder_sse_lut<ISA_NATIVE>();
|
|
150
|
+
_encode_isa = ISA_NATIVE;
|
|
146
151
|
}
|
|
147
152
|
# endif
|
|
148
153
|
#endif
|
|
@@ -154,7 +159,7 @@ void encoder_init() {
|
|
|
154
159
|
encoder_native_init();
|
|
155
160
|
# else
|
|
156
161
|
int use_isa = cpu_supports_isa();
|
|
157
|
-
if(use_isa >= ISA_LEVEL_VBMI2)
|
|
162
|
+
if(use_isa >= ISA_LEVEL_VBMI2 && (encoder_has_avx10 || (use_isa & ISA_FEATURE_EVEX512)))
|
|
158
163
|
encoder_vbmi2_init();
|
|
159
164
|
else if(use_isa >= ISA_LEVEL_AVX2)
|
|
160
165
|
encoder_avx2_init();
|
|
@@ -170,4 +175,8 @@ void encoder_init() {
|
|
|
170
175
|
if(cpu_supports_neon())
|
|
171
176
|
encoder_neon_init();
|
|
172
177
|
#endif
|
|
178
|
+
#ifdef __riscv
|
|
179
|
+
if(cpu_supports_rvv())
|
|
180
|
+
encoder_rvv_init();
|
|
181
|
+
#endif
|
|
173
182
|
}
|
package/src/encoder.h
CHANGED
|
@@ -10,8 +10,12 @@ extern "C" {
|
|
|
10
10
|
#include "hedley.h"
|
|
11
11
|
|
|
12
12
|
extern size_t (*_do_encode)(int, int*, const unsigned char* HEDLEY_RESTRICT, unsigned char* HEDLEY_RESTRICT, size_t, int);
|
|
13
|
+
extern int _encode_isa;
|
|
13
14
|
#define do_encode (*_do_encode)
|
|
14
15
|
void encoder_init();
|
|
16
|
+
static inline int encode_isa_level() {
|
|
17
|
+
return _encode_isa;
|
|
18
|
+
}
|
|
15
19
|
|
|
16
20
|
|
|
17
21
|
|
package/src/encoder_avx.cc
CHANGED