yencode 1.1.3 → 1.1.4
- package/README.md +3 -2
- package/binding.gyp +75 -12
- package/index.js +21 -19
- package/package.json +2 -1
- package/src/common.h +23 -5
- package/src/crc.cc +129 -8
- package/src/crc_arm.cc +7 -1
- package/src/crc_folding_256.cc +4 -5
- package/src/decoder.cc +5 -4
- package/src/decoder.h +5 -5
- package/src/decoder_avx2_base.h +10 -4
- package/src/decoder_common.h +5 -5
- package/src/decoder_neon.cc +1 -1
- package/src/decoder_neon64.cc +1 -1
- package/src/decoder_sse_base.h +10 -3
- package/src/decoder_vbmi2.cc +7 -0
- package/src/encoder.cc +7 -1
- package/src/encoder_avx_base.h +22 -14
- package/src/encoder_neon.cc +39 -40
- package/src/encoder_rvv.cc +219 -0
- package/src/encoder_sse_base.h +3 -3
- package/src/encoder_vbmi2.cc +7 -0
- package/src/hedley.h +278 -135
- package/src/platform.cc +57 -9
- package/src/test_alignalloc.c +6 -0
- package/test/_speedbase.js +12 -11
- package/test/speeddec.js +6 -5
- package/test/testdec.js +30 -14
- package/test/testenc.js +10 -7
- package/test/testpostdec.js +6 -5
package/src/decoder_avx2_base.h
CHANGED
@@ -30,7 +30,7 @@ static HEDLEY_ALWAYS_INLINE __m256i force_align_read_256(const void* p) {
 }
 
 // _mm256_castsi128_si256, but upper is defined to be 0
-#if (defined(__clang__) && __clang_major__ >= 5 && (!defined(__APPLE__) || __clang_major__ >= 7)) || (defined(__GNUC__) && __GNUC__ >= 10)
+#if (defined(__clang__) && __clang_major__ >= 5 && (!defined(__APPLE__) || __clang_major__ >= 7)) || (defined(__GNUC__) && __GNUC__ >= 10) || (defined(_MSC_VER) && _MSC_VER >= 1910)
 // intrinsic unsupported in GCC 9 and MSVC < 2017
 # define zext128_256 _mm256_zextsi128_si256
 #else
@@ -43,9 +43,15 @@ static HEDLEY_ALWAYS_INLINE __m256i force_align_read_256(const void* p) {
 # endif
 #endif
 
+#if defined(__tune_icelake_client__) || defined(__tune_icelake_server__) || defined(__tune_tigerlake__) || defined(__tune_rocketlake__) || defined(__tune_alderlake__) || defined(__tune_sapphirerapids__)
+# define COMPRESS_STORE _mm256_mask_compressstoreu_epi8
+#else
+// avoid uCode on Zen4
+# define COMPRESS_STORE(dst, mask, vec) _mm256_storeu_si256((__m256i*)(dst), _mm256_maskz_compress_epi8(mask, vec))
+#endif
 
 template<bool isRaw, bool searchEnd, enum YEncDecIsaLevel use_isa>
-HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t*…
+HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* src, long& len, unsigned char*& p, unsigned char& _escFirst, uint16_t& _nextMask) {
 HEDLEY_ASSUME(_escFirst == 0 || _escFirst == 1);
 HEDLEY_ASSUME(_nextMask == 0 || _nextMask == 1 || _nextMask == 2);
 uintptr_t escFirst = _escFirst;
@@ -541,9 +547,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 // all that's left is to 'compress' the data (skip over masked chars)
 #if defined(__AVX512VBMI2__) && defined(__AVX512VL__)
 if(use_isa >= ISA_LEVEL_VBMI2) {
-
+COMPRESS_STORE(p, KNOT32(mask), dataA);
 p -= popcnt32(mask & 0xffffffff);
-
+COMPRESS_STORE((p + XMM_SIZE*2), KNOT32(mask>>32), dataB);
 p += XMM_SIZE*4 - popcnt32(mask >> 32);
 } else
 #endif
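The new COMPRESS_STORE macro (mirrored for 128-bit vectors in decoder_sse_base.h below) chooses between a true masked compress-store and a compress-into-register followed by a plain store, since the compress-store instruction is microcoded on Zen 4. A rough standalone sketch of the two forms, assuming AVX-512VL and AVX-512VBMI2 are enabled at compile time (illustration only, not package code):

```c
#include <immintrin.h>
/* Build with something like: -mavx512vl -mavx512vbmi2 */

/* Form 1: masked compress-store. Writes exactly popcnt(mask) bytes to dst,
   but the instruction is microcoded (slow) on Zen 4. */
static inline void compress_store_exact(void* dst, __mmask32 mask, __m256i vec) {
    _mm256_mask_compressstoreu_epi8(dst, mask, vec);
}

/* Form 2: compress within a register, then do an ordinary 32-byte store.
   The tail beyond popcnt(mask) bytes is written as zeros, which is harmless
   here because the decoder advances its output pointer by popcnt(mask) and
   overwrites that tail on the next iteration. */
static inline void compress_store_full(void* dst, __mmask32 mask, __m256i vec) {
    _mm256_storeu_si256((__m256i*)dst, _mm256_maskz_compress_epi8(mask, vec));
}
```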
package/src/decoder_common.h
CHANGED
@@ -6,7 +6,7 @@
 
 // state var: refers to the previous state - only used for incremental processing
 template<bool isRaw>
-size_t do_decode_noend_scalar(const unsigned char*…
+size_t do_decode_noend_scalar(const unsigned char* src, unsigned char* dest, size_t len, YencDecoderState* state) {
 const unsigned char *es = src + len; // end source pointer
 unsigned char *p = dest; // destination pointer
 long i = -(long)len; // input position
@@ -140,7 +140,7 @@ size_t do_decode_noend_scalar(const unsigned char* HEDLEY_RESTRICT src, unsigned
 }
 
 template<bool isRaw>
-YencDecoderEnd do_decode_end_scalar(const unsigned char…
+YencDecoderEnd do_decode_end_scalar(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
 const unsigned char *es = (*src) + len; // end source pointer
 unsigned char *p = *dest; // destination pointer
 long i = -(long)len; // input position
@@ -321,7 +321,7 @@ YencDecoderEnd do_decode_end_scalar(const unsigned char* HEDLEY_RESTRICT* src, u
 }
 
 template<bool isRaw, bool searchEnd>
-YencDecoderEnd do_decode_scalar(const unsigned char…
+YencDecoderEnd do_decode_scalar(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
 if(searchEnd)
 return do_decode_end_scalar<isRaw>(src, dest, len, state);
 *dest += do_decode_noend_scalar<isRaw>(*src, *dest, len, state);
@@ -331,8 +331,8 @@ YencDecoderEnd do_decode_scalar(const unsigned char* HEDLEY_RESTRICT* src, unsig
 
 
 
-template<bool isRaw, bool searchEnd, int width, void(&kernel)(const uint8_t…
-YencDecoderEnd do_decode_simd(const unsigned char…
+template<bool isRaw, bool searchEnd, int width, void(&kernel)(const uint8_t*, long&, unsigned char*&, unsigned char&, uint16_t&)>
+YencDecoderEnd do_decode_simd(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
 if(len <= width*2) return do_decode_scalar<isRaw, searchEnd>(src, dest, len, state);
 
 YencDecoderState tState = YDEC_STATE_CRLF;
package/src/decoder_neon.cc
CHANGED
@@ -59,7 +59,7 @@ static bool neon_vect_is_nonzero(uint8x16_t v) {
 
 
 template<bool isRaw, bool searchEnd>
-HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t*…
+HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned char*& p, unsigned char& escFirst, uint16_t& nextMask) {
 HEDLEY_ASSUME(escFirst == 0 || escFirst == 1);
 HEDLEY_ASSUME(nextMask == 0 || nextMask == 1 || nextMask == 2);
 uint8x16_t yencOffset = escFirst ? vmakeq_u8(42+64,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42) : vdupq_n_u8(42);
package/src/decoder_neon64.cc
CHANGED
@@ -47,7 +47,7 @@ static HEDLEY_ALWAYS_INLINE uint8x16_t mergeCompares(uint8x16_t a, uint8x16_t b,
 
 
 template<bool isRaw, bool searchEnd>
-HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t*…
+HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned char*& p, unsigned char& escFirst, uint16_t& nextMask) {
 HEDLEY_ASSUME(escFirst == 0 || escFirst == 1);
 HEDLEY_ASSUME(nextMask == 0 || nextMask == 1 || nextMask == 2);
 uint8x16_t nextMaskMix = vdupq_n_u8(0);
package/src/decoder_sse_base.h
CHANGED
@@ -7,6 +7,13 @@
 # define _mm_shrdi_epi16 _mm128_shrdi_epi16
 #endif
 
+#if defined(__tune_icelake_client__) || defined(__tune_icelake_server__) || defined(__tune_tigerlake__) || defined(__tune_rocketlake__) || defined(__tune_alderlake__) || defined(__tune_sapphirerapids__)
+# define COMPRESS_STORE _mm_mask_compressstoreu_epi8
+#else
+// avoid uCode on Zen4
+# define COMPRESS_STORE(dst, mask, vec) _mm_storeu_si128((__m128i*)(dst), _mm_maskz_compress_epi8(mask, vec))
+#endif
+
 // GCC (ver 6-10(dev)) fails to optimize pure C version of mask testing, but has this intrinsic; Clang >= 7 optimizes C version fine
 #if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
 # define KORTEST16(a, b) !_kortestz_mask16_u8((a), (b))
@@ -104,7 +111,7 @@ static HEDLEY_ALWAYS_INLINE __m128i sse2_compact_vect(uint32_t mask, __m128i dat
 }
 
 template<bool isRaw, bool searchEnd, enum YEncDecIsaLevel use_isa>
-HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t*…
+HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* src, long& len, unsigned char*& p, unsigned char& _escFirst, uint16_t& _nextMask) {
 HEDLEY_ASSUME(_escFirst == 0 || _escFirst == 1);
 HEDLEY_ASSUME(_nextMask == 0 || _nextMask == 1 || _nextMask == 2);
 uintptr_t escFirst = _escFirst;
@@ -649,9 +656,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 if(use_isa >= ISA_LEVEL_SSSE3) {
 # if defined(__AVX512VBMI2__) && defined(__AVX512VL__) && defined(__POPCNT__)
 if(use_isa >= ISA_LEVEL_VBMI2) {
-
+COMPRESS_STORE(p, KNOT16(mask), dataA);
 p -= popcnt32(mask & 0xffff);
-
+COMPRESS_STORE(p+XMM_SIZE, KNOT16(mask>>16), dataB);
 p -= popcnt32(mask>>16);
 p += XMM_SIZE*2;
 } else
package/src/decoder_vbmi2.cc
CHANGED
@@ -1,5 +1,12 @@
 #include "common.h"
 
+extern const bool decoder_has_avx10;
+#if !defined(__EVEX512__) && (defined(__AVX10_1__) || defined(__EVEX256__)) && defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
+const bool decoder_has_avx10 = true;
+#else
+const bool decoder_has_avx10 = false;
+#endif
+
 #if defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
 # include "decoder_common.h"
 # ifndef YENC_DISABLE_AVX256
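The decoder_has_avx10 constant added here records, at compile time, whether this translation unit was built with the AVX-512 subset limited to 256-bit EVEX (AVX10.1/256-style builds), i.e. whether the VBMI2 kernels contain no 512-bit instructions. A minimal probe using the same preprocessor test (illustration only; assumes the __AVX10_1__/__EVEX256__/__EVEX512__ macros of recent GCC/Clang, e.g. when building with -mavx10.1-256):

```c
#include <stdio.h>

#if !defined(__EVEX512__) && (defined(__AVX10_1__) || defined(__EVEX256__)) && \
    defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
static const int built_256bit_only = 1;  /* kernels use only 256-bit EVEX encodings */
#else
static const int built_256bit_only = 0;
#endif

int main(void) {
    printf("VBMI2 kernels built 256-bit only: %d\n", built_256bit_only);
    return 0;
}
```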
package/src/encoder.cc
CHANGED
@@ -129,7 +129,9 @@ void encoder_ssse3_init();
 void encoder_avx_init();
 void encoder_avx2_init();
 void encoder_vbmi2_init();
+extern const bool encoder_has_avx10;
 void encoder_neon_init();
+void encoder_rvv_init();
 
 #if defined(PLATFORM_X86) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
 # if defined(__AVX2__) && !defined(YENC_DISABLE_AVX256)
@@ -154,7 +156,7 @@ void encoder_init() {
 encoder_native_init();
 # else
 int use_isa = cpu_supports_isa();
-if(use_isa >= ISA_LEVEL_VBMI2)
+if(use_isa >= ISA_LEVEL_VBMI2 && (encoder_has_avx10 || (use_isa & ISA_FEATURE_EVEX512)))
 encoder_vbmi2_init();
 else if(use_isa >= ISA_LEVEL_AVX2)
 encoder_avx2_init();
@@ -170,4 +172,8 @@ void encoder_init() {
 if(cpu_supports_neon())
 encoder_neon_init();
 #endif
+#ifdef __riscv
+if(cpu_supports_rvv())
+encoder_rvv_init();
+#endif
 }
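The reworked gate in encoder_init() only selects the VBMI2 encoder when the compiled kernel can actually run on the detected CPU: either the kernel was built 256-bit only (encoder_has_avx10) or the CPU reports 512-bit EVEX support. A self-contained sketch of that decision, using stand-in constant values rather than the package's real ISA levels:

```c
#include <stdio.h>

/* Stand-in values for illustration only; the real ISA level/feature constants
   live in the package's common headers and platform code. */
enum { ISA_FEATURE_EVEX512 = 0x100, ISA_LEVEL_AVX2 = 0x1000, ISA_LEVEL_VBMI2 = 0x2000 };
static const int encoder_has_avx10 = 0; /* 1 if the VBMI2 unit was built 256-bit only (AVX10/EVEX256) */

static const char* pick_encoder(int use_isa) {
    /* The VBMI2 kernel is usable if it either contains no 512-bit EVEX
       instructions (256-bit AVX10 build) or the CPU reports EVEX512 support. */
    if(use_isa >= ISA_LEVEL_VBMI2 && (encoder_has_avx10 || (use_isa & ISA_FEATURE_EVEX512)))
        return "vbmi2";
    if(use_isa >= ISA_LEVEL_AVX2)
        return "avx2";
    return "generic";
}

int main(void) {
    printf("%s\n", pick_encoder(ISA_LEVEL_VBMI2));                       /* no EVEX512 flag -> avx2 */
    printf("%s\n", pick_encoder(ISA_LEVEL_VBMI2 | ISA_FEATURE_EVEX512)); /* -> vbmi2 */
    return 0;
}
```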
package/src/encoder_avx_base.h
CHANGED
@@ -215,7 +215,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
 // duplicate halves
 data1A = _mm256_inserti128_si256(dataA, _mm256_castsi256_si128(dataA), 1);
 data1B = _mm256_inserti128_si256(dataB, _mm256_castsi256_si128(dataB), 1);
-#if defined(__tune_znver2__) || defined(__tune_znver3__)
+#if defined(__tune_znver2__) || defined(__tune_znver3__) || defined(__tune_znver4__)
 data2A = _mm256_permute2x128_si256(dataA, dataA, 0x11);
 data2B = _mm256_permute2x128_si256(dataB, dataB, 0x11);
 #else
@@ -290,7 +290,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
 
 #if defined(__GNUC__) && defined(PLATFORM_AMD64)
 if(use_isa >= ISA_LEVEL_VBMI2) {
-
+__asm__(
 "shrq $1, %[eqMask] \n"
 "shrq %%cl, %[eqMask] \n"
 "adcq %q[col], %q[p] \n"
@@ -334,28 +334,32 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
 if(use_isa >= ISA_LEVEL_AVX3) {
 # if defined(__AVX512VBMI2__)
 if(use_isa >= ISA_LEVEL_VBMI2) {
-
+__m128i dataTop = _mm256_extracti128_si256(dataA, 1);
 dataA = _mm256_mask_expand_epi8(_mm256_set1_epi8('='), KNOT32(maskA), dataA);
 _mm256_storeu_si256((__m256i*)p, dataA);
+p[32] = _mm_extract_epi8(dataTop, 15);
 p += outputBytesA;
 
-
+dataTop = _mm256_extracti128_si256(dataB, 1);
 dataB = _mm256_mask_expand_epi8(_mm256_set1_epi8('='), KNOT32(maskB), dataB);
 _mm256_storeu_si256((__m256i*)p, dataB);
+p[32] = _mm_extract_epi8(dataTop, 15);
 p += maskBitsB;
 } else
 # endif
 {
-
-dataA = _mm256_mask_alignr_epi8(dataA, (uint32_t)(-(int32_t)maskA), dataA,…
+__m256i dataSwapped = _mm256_permute4x64_epi64(dataA, _MM_SHUFFLE(1,0,3,2));
+dataA = _mm256_mask_alignr_epi8(dataA, (uint32_t)(-(int32_t)maskA), dataA, dataSwapped, 15);
 dataA = _mm256_ternarylogic_epi32(dataA, cmpA, _mm256_set1_epi8('='), 0xb8); // (data & ~cmp) | (cmp & '=')
 _mm256_storeu_si256((__m256i*)p, dataA);
+p[32] = _mm_extract_epi8(_mm256_castsi256_si128(dataSwapped), 15);
 p += outputBytesA;
 
-
-dataB = _mm256_mask_alignr_epi8(dataB, (uint32_t)(-(int32_t)maskB), dataB,…
+dataSwapped = _mm256_permute4x64_epi64(dataB, _MM_SHUFFLE(1,0,3,2));
+dataB = _mm256_mask_alignr_epi8(dataB, (uint32_t)(-(int32_t)maskB), dataB, dataSwapped, 15);
 dataB = _mm256_ternarylogic_epi32(dataB, cmpB, _mm256_set1_epi8('='), 0xb8);
 _mm256_storeu_si256((__m256i*)p, dataB);
+p[32] = _mm_extract_epi8(_mm256_castsi256_si128(dataSwapped), 15);
 p += maskBitsB;
 }
 } else
@@ -484,28 +488,32 @@
 if(use_isa >= ISA_LEVEL_AVX3) {
 # if defined(__AVX512VBMI2__)
 if(use_isa >= ISA_LEVEL_VBMI2) {
-
+__m128i dataTop = _mm256_extracti128_si256(dataA, 1);
 dataA = _mm256_mask_expand_epi8(_mm256_set1_epi8('='), KNOT32(maskA), dataA);
 _mm256_storeu_si256((__m256i*)p, dataA);
+p[32] = _mm_extract_epi8(dataTop, 15);
 p += outputBytesA;
 
-
+dataTop = _mm256_extracti128_si256(dataB, 1);
 dataB = _mm256_mask_expand_epi8(_mm256_set1_epi8('='), KNOT32(maskB), dataB);
 _mm256_storeu_si256((__m256i*)p, dataB);
+p[32] = _mm_extract_epi8(dataTop, 15);
 p += maskBitsB;
 } else
 # endif
 {
-
-dataA = _mm256_mask_alignr_epi8(dataA, (uint32_t)(-(int32_t)maskA), dataA,…
+__m256i dataSwapped = _mm256_permute4x64_epi64(dataA, _MM_SHUFFLE(1,0,3,2));
+dataA = _mm256_mask_alignr_epi8(dataA, (uint32_t)(-(int32_t)maskA), dataA, dataSwapped, 15);
 dataA = _mm256_ternarylogic_epi32(dataA, cmpA, _mm256_set1_epi8('='), 0xb8); // (data & ~cmp) | (cmp & '=')
 _mm256_storeu_si256((__m256i*)p, dataA);
+p[32] = _mm_extract_epi8(_mm256_castsi256_si128(dataSwapped), 15);
 p += outputBytesA;
 
-
-dataB = _mm256_mask_alignr_epi8(dataB, (uint32_t)(-(int32_t)maskB), dataB,…
+dataSwapped = _mm256_permute4x64_epi64(dataB, _MM_SHUFFLE(1,0,3,2));
+dataB = _mm256_mask_alignr_epi8(dataB, (uint32_t)(-(int32_t)maskB), dataB, dataSwapped, 15);
 dataB = _mm256_ternarylogic_epi32(dataB, cmpB, _mm256_set1_epi8('='), 0xb8);
 _mm256_storeu_si256((__m256i*)p, dataB);
+p[32] = _mm_extract_epi8(_mm256_castsi256_si128(dataSwapped), 15);
 p += maskBitsB;
 }
 } else
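In the non-VBMI2 path the change introduces dataSwapped (via _mm256_permute4x64_epi64) as the extra source for _mm256_mask_alignr_epi8 and stores the byte shifted out of the top into p[32]. The underlying trick is that _mm256_alignr_epi8 only shifts within each 128-bit lane, so swapping the halves first yields a byte rotate across the whole 256-bit register. A standalone illustration of that unmasked building block (AVX2 only; not package code):

```c
#include <immintrin.h>
#include <stdio.h>
/* Build with: -mavx2 */

/* Rotate a 256-bit vector by one byte towards higher indices:
   result byte i == input byte (i - 1) mod 32. */
static __m256i rotate_bytes_by_one(__m256i v) {
    __m256i swapped = _mm256_permute4x64_epi64(v, _MM_SHUFFLE(1, 0, 3, 2)); /* swap the 128-bit halves */
    /* per lane: take 16 bytes starting at offset 15 of the (v_lane : swapped_lane) pair */
    return _mm256_alignr_epi8(v, swapped, 15);
}

int main(void) {
    unsigned char in[32], out[32];
    for (int i = 0; i < 32; i++) in[i] = (unsigned char)i;
    _mm256_storeu_si256((__m256i*)out, rotate_bytes_by_one(_mm256_loadu_si256((const __m256i*)in)));
    for (int i = 0; i < 32; i++) printf("%d ", out[i]); /* prints 31 0 1 2 ... 30 */
    printf("\n");
    return 0;
}
```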
package/src/encoder_neon.cc
CHANGED
@@ -15,6 +15,43 @@ static HEDLEY_ALWAYS_INLINE void vst1q_u8_x2_unaligned(uint8_t* p, uint8x16x2_t
 #endif
 
 
+// ARM's CLZ instruction at native bit-width
+#ifdef __aarch64__
+static HEDLEY_ALWAYS_INLINE int clz_n(uint64_t v) {
+# ifdef _MSC_VER
+long r;
+// does this work?
+if(_BitScanReverse64((unsigned long*)&r, v))
+r ^= 63;
+else
+r = 64;
+return r;
+# else
+# if defined(__clang__) || HEDLEY_GCC_VERSION_CHECK(11,0,0)
+// this pattern is only detected on GCC >= 11 (Clang 9 seems to as well, unsure about earlier versions)
+// - note: return type must be 'int'; GCC fails to optimise this if type is 'long'
+// GCC <= 10 doesn't optimize around the '0 = undefined behaviour', so not needed there
+if(v == 0) return 64;
+# endif
+return __builtin_clzll(v);
+# endif
+}
+#else
+static HEDLEY_ALWAYS_INLINE int clz_n(uint32_t v) {
+# ifdef __GNUC__
+# if defined(__clang__) || HEDLEY_GCC_VERSION_CHECK(7,0,0)
+// as with AArch64 version above, only insert this check if compiler can optimise it away
+if(v == 0) return 32;
+# endif
+return __builtin_clz(v);
+# elif defined(_MSC_VER)
+return _arm_clz(v);
+# else
+return __clz(v); // ARM compiler?
+# endif
+}
+#endif
+
 static uint8x16_t ALIGN_TO(16, shufLUT[256]);
 static uint16_t expandLUT[256];
 
@@ -195,26 +232,7 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
 col = shufTotalLen+1 + lineSizeOffset-32;
 } else {
 // shuffle stuff up
-
-# ifdef _MSC_VER
-long bitIndex;
-if(_BitScanReverse64((unsigned long*)&bitIndex, mask))
-bitIndex ^= 63;
-else
-bitIndex = 64;
-# else
-long bitIndex = __builtin_clzll(mask);
-# endif
-#else
-# ifdef __GNUC__
-long bitIndex = __builtin_clz(mask); // TODO: is the 'undefined if 0' case problematic here?
-# elif defined(_MSC_VER)
-long bitIndex = _arm_clz(mask);
-# else
-long bitIndex = __clz(mask); // ARM compiler?
-# endif
-#endif
-
+long bitIndex = clz_n(mask);
 uint8x16_t vClz = vdupq_n_u8(bitIndex & ~(sizeof(mask)*8));
 #ifdef __aarch64__
 uint8x16_t blendA = vcgtq_u8(vmakeq_u8(63,62,61,60,51,50,49,48,47,46,45,44,35,34,33,32), vClz);
@@ -450,26 +468,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const ui
 }
 } else {
 {
-
-# ifdef _MSC_VER
-// does this work?
-if(_BitScanReverse64((unsigned long*)&bitIndex, mask))
-bitIndex ^= 63;
-else
-bitIndex = 64;
-# else
-bitIndex = __builtin_clzll(mask); // TODO: is the 'undefined if 0' case problematic here?
-# endif
-#else
-# ifdef __GNUC__
-bitIndex = __builtin_clz(mask);
-# elif defined(_MSC_VER)
-bitIndex = _arm_clz(mask);
-# else
-bitIndex = __clz(mask); // ARM compiler?
-# endif
-#endif
-
+bitIndex = clz_n(mask);
 uint8x16_t vClz = vdupq_n_u8(bitIndex & ~(sizeof(mask)*8));
 #ifdef __aarch64__
 uint8x16_t blendA = vcgeq_u8(vmakeq_u8(63,62,61,60,51,50,49,48,47,46,45,44,35,34,33,32), vClz);
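The per-compiler CLZ blocks that previously appeared in two places are now centralised in the clz_n() helper added above; unlike the raw __builtin_clz/__builtin_clzll, it defines a zero input to return the full bit width. A portable reference for the intended AArch64 (64-bit) behaviour, illustration only:

```c
#include <assert.h>
#include <stdint.h>

/* Reference semantics for clz_n() on a 64-bit mask: count of leading zero
   bits, with clz_n(0) defined as 64 instead of being undefined behaviour. */
static int clz64_ref(uint64_t v) {
    int n = 64;
    while (v) { n--; v >>= 1; }
    return n;
}

int main(void) {
    assert(clz64_ref(0) == 64);                 /* the case the raw builtin leaves undefined */
    assert(clz64_ref(1) == 63);
    assert(clz64_ref(UINT64_C(1) << 63) == 0);
    return 0;
}
```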
package/src/encoder_rvv.cc
ADDED
@@ -0,0 +1,219 @@
+#include "common.h"
+
+#ifdef __riscv_vector
+#include "encoder.h"
+#include "encoder_common.h"
+
+# include <riscv_vector.h>
+# if defined(__clang__) && __clang_major__ < 16
+# define RV(f) f
+# else
+# define RV(f) __riscv_##f
+# endif
+
+
+static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RESTRICT _src, long& inpos, uint8_t*& outp, long& col, long lineSizeOffset) {
+// TODO: vectorize
+uint8_t c = _src[inpos++];
+if(HEDLEY_UNLIKELY(escapedLUT[c] && c != '.'-42)) {
+memcpy(outp, &escapedLUT[c], sizeof(uint16_t));
+outp += 2;
+} else {
+*(outp++) = c + 42;
+}
+
+c = _src[inpos++];
+if(LIKELIHOOD(0.0273, escapedLUT[c]!=0)) {
+uint32_t w = UINT32_16_PACK(UINT16_PACK('\r', '\n'), (uint32_t)escapedLUT[c]);
+memcpy(outp, &w, sizeof(w));
+outp += 4;
+col = lineSizeOffset + 2;
+} else {
+uint32_t w = UINT32_PACK('\r', '\n', (uint32_t)(c+42), 0);
+memcpy(outp, &w, sizeof(w));
+outp += 3;
+col = lineSizeOffset + 1;
+}
+}
+
+
+HEDLEY_ALWAYS_INLINE void do_encode_rvv(int line_size, int* colOffset, const uint8_t* HEDLEY_RESTRICT srcEnd, uint8_t* HEDLEY_RESTRICT& dest, size_t& len) {
+size_t vl2 = RV(vsetvlmax_e8m2)(); // TODO: limit to line length
+// TODO: have a LMUL=1 variant if line_size < vl
+
+// offset position to enable simpler loop condition checking
+const int INPUT_OFFSET = vl2*2 -1; // extra chars for EOL handling, -1 to change <= to <
+if((intptr_t)len <= INPUT_OFFSET || line_size < (int)vl2*2) return;
+
+uint8_t *outp = dest;
+long inpos = -(long)len;
+long lineSizeOffset = -line_size +1;
+long col = *colOffset - line_size +1;
+
+inpos += INPUT_OFFSET;
+const uint8_t* _src = srcEnd - INPUT_OFFSET;
+
+if (HEDLEY_LIKELY(col == -line_size+1)) {
+uint8_t c = _src[inpos++];
+if (LIKELIHOOD(0.0273, escapedLUT[c] != 0)) {
+memcpy(outp, escapedLUT + c, 2);
+outp += 2;
+col += 2;
+} else {
+*(outp++) = c + 42;
+col += 1;
+}
+}
+if(HEDLEY_UNLIKELY(col >= 0)) {
+if(col == 0)
+encode_eol_handle_pre(_src, inpos, outp, col, lineSizeOffset);
+else {
+uint8_t c = _src[inpos++];
+if(LIKELIHOOD(0.0273, escapedLUT[c]!=0)) {
+uint32_t v = UINT32_16_PACK(UINT16_PACK('\r', '\n'), (uint32_t)escapedLUT[c]);
+memcpy(outp, &v, sizeof(v));
+outp += 4;
+col = 2-line_size + 1;
+} else {
+uint32_t v = UINT32_PACK('\r', '\n', (uint32_t)(c+42), 0);
+memcpy(outp, &v, sizeof(v));
+outp += 3;
+col = 2-line_size;
+}
+}
+}
+
+// vector constants
+const vuint8mf2_t ALT_SHIFT = RV(vreinterpret_v_u16mf2_u8mf2)(RV(vmv_v_x_u16mf2)(4, vl2));
+const uint8_t _MASK_EXPAND[] = {0xAA, 0xAB, 0xAE, 0xAF, 0xBA, 0xBB, 0xBE, 0xBF, 0xEA, 0xEB, 0xEE, 0xEF, 0xFA, 0xFB, 0xFE, 0xFF};
+const vuint8m1_t MASK_EXPAND = RV(vle8_v_u8m1)(_MASK_EXPAND, 16);
+
+
+// TODO: consider exploiting partial vector capability
+while(inpos < 0) {
+vuint8m2_t data = RV(vle8_v_u8m2)(_src + inpos, vl2);
+inpos += vl2;
+
+// search for special chars
+// TODO: vrgather strat
+
+vuint8m2_t tmpData = RV(vsub_vx_u8m2)(data, -42, vl2);
+vbool4_t cmp = RV(vmor_mm_b4)(
+RV(vmor_mm_b4)(
+RV(vmseq_vx_u8m2_b4)(data, -42, vl2),
+RV(vmseq_vx_u8m2_b4)(tmpData, '=', vl2),
+vl2
+),
+RV(vmor_mm_b4)(
+RV(vmseq_vx_u8m2_b4)(data, '\r'-42, vl2),
+RV(vmseq_vx_u8m2_b4)(data, '\n'-42, vl2),
+vl2
+),
+vl2
+);
+
+#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 13000
+data = RV(vor_vx_u8m2_mu)(cmp, tmpData, tmpData, 64, vl2);
+#else
+data = RV(vor_vx_u8m2_m)(cmp, tmpData, tmpData, 64, vl2);
+#endif
+
+int idx;
+size_t count = RV(vcpop_m_b4)(cmp, vl2);
+if(count > 1) {
+// widen mask: 4b->8b
+#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 13000
+vuint8mf4_t vcmp = RV(vlmul_trunc_v_u8m1_u8mf4)(RV(vreinterpret_v_b4_u8m1)(cmp));
+#else
+vuint8mf4_t vcmp = *(vuint8mf4_t*)(&cmp);
+#endif
+// TODO: use vwsll instead if available
+// - is clmul useful here?
+vuint8mf2_t xcmp = RV(vreinterpret_v_u16mf2_u8mf2)(RV(vwmulu_vx_u16mf2)(vcmp, 16, vl2));
+xcmp = RV(vsrl_vv_u8mf2)(xcmp, ALT_SHIFT, vl2);
+
+// expand mask by inserting '1' between each bit (0000abcd -> 1a1b1c1d)
+vuint8m1_t xcmpTmp = RV(vrgather_vv_u8m1)(MASK_EXPAND, RV(vlmul_ext_v_u8mf2_u8m1)(xcmp), vl2);
+#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 13000
+vbool2_t cmpmask = RV(vreinterpret_b2)(xcmpTmp);
+#else
+vbool2_t cmpmask = *(vbool2_t*)(&xcmpTmp);
+#endif
+
+// expand data and insert =
+// TODO: use vwsll instead if available
+vuint16m4_t data2 = RV(vzext_vf2_u16m4)(data, vl2);
+data2 = RV(vsll_vx_u16m4)(data2, 8, vl2);
+data2 = RV(vor_vx_u16m4)(data2, '=', vl2);
+
+// prune unneeded =
+vuint8m4_t dataTmp = RV(vreinterpret_v_u16m4_u8m4)(data2);
+vuint8m4_t final_data = RV(vcompress_vm_u8m4)(
+#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 13000
+dataTmp, cmpmask, vl2*2
+#else
+cmpmask, dataTmp, dataTmp, vl2*2
+#endif
+);
+
+RV(vse8_v_u8m4)(outp, final_data, vl2*2);
+outp += vl2 + count;
+col += vl2 + count;
+
+if(col >= 0) {
+// we overflowed - find correct position to revert back to
+// TODO: stick with u8 type for vlmax <= 2048 (need to check if ok if vlmax == 2048)
+// - considering that it's rare for colWidth > 128, maybe just don't support vectors that long
+vuint16m8_t xidx = RV(viota_m_u16m8)(cmpmask, vl2*2);
+vbool2_t discardmask = RV(vmsgeu_vx_u16m8_b2)(xidx, vl2 + count - col, vl2*2);
+long idx_revert = RV(vcpop_m_b2)(discardmask, vl2*2);
+
+outp -= col + (idx_revert & 1);
+inpos -= ((idx_revert+1) >> 1);
+
+goto _encode_eol_handle_pre;
+}
+} else {
+// 0 or 1 special characters
+{
+vbool4_t mask = RV(vmsbf_m_b4)(cmp, vl2);
+// TODO: is it better to shuffle this into two stores, instead of three?
+RV(vse8_v_u8m2_m)(mask, outp, data, vl2);
+idx = RV(vcpop_m_b4)(mask, vl2);
+outp[idx] = '=';
+RV(vse8_v_u8m2_m)(RV(vmnot_m_b4)(mask, vl2), outp+1, data, vl2);
+
+outp += vl2 + count;
+col += vl2 + count;
+}
+
+if(col >= 0) {
+if(count > 0) {
+idx = vl2 - idx;
+if(HEDLEY_UNLIKELY(col == idx)) {
+// this is an escape character, so line will need to overflow
+outp--;
+} else {
+inpos += (col > idx);
+}
+}
+outp -= col;
+inpos -= col;
+
+_encode_eol_handle_pre:
+encode_eol_handle_pre(_src, inpos, outp, col, lineSizeOffset);
+}
+}
+}
+
+*colOffset = col + line_size -1;
+dest = outp;
+len = -(inpos - INPUT_OFFSET);
+}
+
+void encoder_rvv_init() {
+_do_encode = &do_encode_simd<do_encode_rvv>;
+}
+#else
+void encoder_rvv_init() {}
+#endif /* defined(__riscv_vector) */
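The MASK_EXPAND table in the new RVV encoder drives its mask-widening step: each 4-bit chunk abcd of the escape mask becomes the byte 1a1b1c1d, so after vcompress every data byte survives and an '=' byte survives only in front of characters that need escaping. A standalone check of that table (plain C, illustration only, not package code):

```c
#include <assert.h>
#include <stdint.h>

static const uint8_t MASK_EXPAND[16] = {
    0xAA, 0xAB, 0xAE, 0xAF, 0xBA, 0xBB, 0xBE, 0xBF,
    0xEA, 0xEB, 0xEE, 0xEF, 0xFA, 0xFB, 0xFE, 0xFF
};

/* Reference: interleave a constant 1 above each bit of the nibble, so bit i
   of n lands at bit 2*i and bit 2*i+1 is always set (0000abcd -> 1a1b1c1d). */
static uint8_t expand_nibble(uint8_t n) {
    uint8_t out = 0;
    for (int i = 0; i < 4; i++)
        out |= (uint8_t)(((n >> i) & 1) << (2*i)) | (uint8_t)(1u << (2*i + 1));
    return out;
}

int main(void) {
    for (uint8_t n = 0; n < 16; n++)
        assert(MASK_EXPAND[n] == expand_nibble(n));
    return 0;
}
```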
package/src/encoder_sse_base.h
CHANGED
@@ -350,7 +350,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
 #if defined(__POPCNT__) && !defined(__tune_btver1__)
 if(use_isa & ISA_FEATURE_POPCNT) {
 shuf2Len = popcnt32(maskA) + 16;
-# if defined(__tune_znver3__) || defined(__tune_znver2__) || defined(__tune_znver1__) || defined(__tune_btver2__)
+# if defined(__tune_znver4__) || defined(__tune_znver3__) || defined(__tune_znver2__) || defined(__tune_znver1__) || defined(__tune_btver2__)
 shuf1Len = popcnt32(m1) + 8;
 shuf3Len = popcnt32(m3) + shuf2Len + 8;
 # else
@@ -409,7 +409,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
 if(use_isa >= ISA_LEVEL_VBMI2)
 # endif
 {
-
+__asm__(
 "shrl $1, %[eqMask] \n"
 "shrl %%cl, %[eqMask] \n" // TODO: can use shrq to avoid above shift?
 # if defined(PLATFORM_AMD64) && !defined(__ILP32__)
@@ -484,7 +484,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
 dataB = _mm_ternarylogic_epi32(dataB, cmpB, _mm_set1_epi8(64), 0xf8);
 
 // store last char
-
+p[XMM_SIZE*2] = _mm_extract_epi8(dataB, 15);
 
 uint32_t blendMask = (uint32_t)(-(int32_t)mask);
 dataB = _mm_mask_alignr_epi8(dataB, blendMask>>16, dataB, dataA, 15);
package/src/encoder_vbmi2.cc
CHANGED
@@ -1,5 +1,12 @@
 #include "common.h"
 
+extern const bool encoder_has_avx10;
+#if !defined(__EVEX512__) && (defined(__AVX10_1__) || defined(__EVEX256__)) && defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
+const bool encoder_has_avx10 = true;
+#else
+const bool encoder_has_avx10 = false;
+#endif
+
 #if defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
 # ifndef YENC_DISABLE_AVX256
 # include "encoder_avx_base.h"