yencode 1.1.3 → 1.1.5
This diff reflects the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
- package/README.md +3 -2
- package/binding.gyp +75 -12
- package/index.js +21 -19
- package/package.json +2 -1
- package/src/common.h +43 -5
- package/src/crc.cc +137 -15
- package/src/crc.h +4 -0
- package/src/crc_arm.cc +11 -6
- package/src/crc_folding.cc +4 -5
- package/src/crc_folding_256.cc +10 -10
- package/src/decoder.cc +9 -4
- package/src/decoder.h +9 -5
- package/src/decoder_avx.cc +1 -0
- package/src/decoder_avx2.cc +1 -0
- package/src/decoder_avx2_base.h +14 -18
- package/src/decoder_common.h +30 -5
- package/src/decoder_neon.cc +7 -13
- package/src/decoder_neon64.cc +7 -12
- package/src/decoder_sse2.cc +1 -0
- package/src/decoder_sse_base.h +15 -14
- package/src/decoder_ssse3.cc +1 -0
- package/src/decoder_vbmi2.cc +9 -0
- package/src/encoder.cc +10 -1
- package/src/encoder.h +4 -0
- package/src/encoder_avx.cc +1 -0
- package/src/encoder_avx2.cc +1 -0
- package/src/encoder_avx_base.h +22 -14
- package/src/encoder_neon.cc +40 -40
- package/src/encoder_rvv.cc +220 -0
- package/src/encoder_sse2.cc +1 -0
- package/src/encoder_sse_base.h +3 -3
- package/src/encoder_ssse3.cc +1 -0
- package/src/encoder_vbmi2.cc +9 -0
- package/src/hedley.h +278 -135
- package/src/platform.cc +57 -9
- package/src/test_alignalloc.c +6 -0
- package/test/_speedbase.js +12 -11
- package/test/speeddec.js +6 -5
- package/test/testcrc.js +2 -2
- package/test/testdec.js +31 -15
- package/test/testenc.js +11 -8
- package/test/testpostdec.js +6 -5
package/src/encoder_avx_base.h
CHANGED
@@ -215,7 +215,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
 // duplicate halves
 data1A = _mm256_inserti128_si256(dataA, _mm256_castsi256_si128(dataA), 1);
 data1B = _mm256_inserti128_si256(dataB, _mm256_castsi256_si128(dataB), 1);
-#if defined(__tune_znver2__) || defined(__tune_znver3__)
+#if defined(__tune_znver2__) || defined(__tune_znver3__) || defined(__tune_znver4__)
 data2A = _mm256_permute2x128_si256(dataA, dataA, 0x11);
 data2B = _mm256_permute2x128_si256(dataB, dataB, 0x11);
 #else
@@ -290,7 +290,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui

 #if defined(__GNUC__) && defined(PLATFORM_AMD64)
 if(use_isa >= ISA_LEVEL_VBMI2) {
-
+__asm__(
 "shrq $1, %[eqMask] \n"
 "shrq %%cl, %[eqMask] \n"
 "adcq %q[col], %q[p] \n"
@@ -334,28 +334,32 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
 if(use_isa >= ISA_LEVEL_AVX3) {
 # if defined(__AVX512VBMI2__)
 if(use_isa >= ISA_LEVEL_VBMI2) {
-
+__m128i dataTop = _mm256_extracti128_si256(dataA, 1);
 dataA = _mm256_mask_expand_epi8(_mm256_set1_epi8('='), KNOT32(maskA), dataA);
 _mm256_storeu_si256((__m256i*)p, dataA);
+p[32] = _mm_extract_epi8(dataTop, 15);
 p += outputBytesA;

-
+dataTop = _mm256_extracti128_si256(dataB, 1);
 dataB = _mm256_mask_expand_epi8(_mm256_set1_epi8('='), KNOT32(maskB), dataB);
 _mm256_storeu_si256((__m256i*)p, dataB);
+p[32] = _mm_extract_epi8(dataTop, 15);
 p += maskBitsB;
 } else
 # endif
 {
-
-dataA = _mm256_mask_alignr_epi8(dataA, (uint32_t)(-(int32_t)maskA), dataA,
+__m256i dataSwapped = _mm256_permute4x64_epi64(dataA, _MM_SHUFFLE(1,0,3,2));
+dataA = _mm256_mask_alignr_epi8(dataA, (uint32_t)(-(int32_t)maskA), dataA, dataSwapped, 15);
 dataA = _mm256_ternarylogic_epi32(dataA, cmpA, _mm256_set1_epi8('='), 0xb8); // (data & ~cmp) | (cmp & '=')
 _mm256_storeu_si256((__m256i*)p, dataA);
+p[32] = _mm_extract_epi8(_mm256_castsi256_si128(dataSwapped), 15);
 p += outputBytesA;

-
-dataB = _mm256_mask_alignr_epi8(dataB, (uint32_t)(-(int32_t)maskB), dataB,
+dataSwapped = _mm256_permute4x64_epi64(dataB, _MM_SHUFFLE(1,0,3,2));
+dataB = _mm256_mask_alignr_epi8(dataB, (uint32_t)(-(int32_t)maskB), dataB, dataSwapped, 15);
 dataB = _mm256_ternarylogic_epi32(dataB, cmpB, _mm256_set1_epi8('='), 0xb8);
 _mm256_storeu_si256((__m256i*)p, dataB);
+p[32] = _mm_extract_epi8(_mm256_castsi256_si128(dataSwapped), 15);
 p += maskBitsB;
 }
 } else
@@ -484,28 +488,32 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
 if(use_isa >= ISA_LEVEL_AVX3) {
 # if defined(__AVX512VBMI2__)
 if(use_isa >= ISA_LEVEL_VBMI2) {
-
+__m128i dataTop = _mm256_extracti128_si256(dataA, 1);
 dataA = _mm256_mask_expand_epi8(_mm256_set1_epi8('='), KNOT32(maskA), dataA);
 _mm256_storeu_si256((__m256i*)p, dataA);
+p[32] = _mm_extract_epi8(dataTop, 15);
 p += outputBytesA;

-
+dataTop = _mm256_extracti128_si256(dataB, 1);
 dataB = _mm256_mask_expand_epi8(_mm256_set1_epi8('='), KNOT32(maskB), dataB);
 _mm256_storeu_si256((__m256i*)p, dataB);
+p[32] = _mm_extract_epi8(dataTop, 15);
 p += maskBitsB;
 } else
 # endif
 {
-
-dataA = _mm256_mask_alignr_epi8(dataA, (uint32_t)(-(int32_t)maskA), dataA,
+__m256i dataSwapped = _mm256_permute4x64_epi64(dataA, _MM_SHUFFLE(1,0,3,2));
+dataA = _mm256_mask_alignr_epi8(dataA, (uint32_t)(-(int32_t)maskA), dataA, dataSwapped, 15);
 dataA = _mm256_ternarylogic_epi32(dataA, cmpA, _mm256_set1_epi8('='), 0xb8); // (data & ~cmp) | (cmp & '=')
 _mm256_storeu_si256((__m256i*)p, dataA);
+p[32] = _mm_extract_epi8(_mm256_castsi256_si128(dataSwapped), 15);
 p += outputBytesA;

-
-dataB = _mm256_mask_alignr_epi8(dataB, (uint32_t)(-(int32_t)maskB), dataB,
+dataSwapped = _mm256_permute4x64_epi64(dataB, _MM_SHUFFLE(1,0,3,2));
+dataB = _mm256_mask_alignr_epi8(dataB, (uint32_t)(-(int32_t)maskB), dataB, dataSwapped, 15);
 dataB = _mm256_ternarylogic_epi32(dataB, cmpB, _mm256_set1_epi8('='), 0xb8);
 _mm256_storeu_si256((__m256i*)p, dataB);
+p[32] = _mm_extract_epi8(_mm256_castsi256_si128(dataSwapped), 15);
 p += maskBitsB;
 }
 } else
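The 0xb8 immediate used with _mm256_ternarylogic_epi32 above encodes the ternary function named in the source comment, (data & ~cmp) | (cmp & '=') — blend an '=' into every lane flagged by cmp. A minimal standalone check of that encoding, assuming the standard vpternlog convention where the truth-table index is first-operand bit * 4 + second * 2 + third (illustration only, not part of the package):

    #include <assert.h>
    #include <stdint.h>

    /* Build a vpternlog immediate from a 1-bit function f(a,b,c), where a/b/c
     * correspond to the first/second/third vector operands and the truth-table
     * index is a*4 + b*2 + c. */
    static uint8_t ternlog_imm(int (*f)(int, int, int)) {
        uint8_t imm = 0;
        for (int i = 0; i < 8; i++)
            imm |= (uint8_t)(f((i >> 2) & 1, (i >> 1) & 1, i & 1) << i);
        return imm;
    }

    /* (data & ~cmp) | (cmp & '=') -- keep data, but force '=' where cmp is set */
    static int blend_escape(int data, int cmp, int eq) { return (data & ~cmp) | (cmp & eq); }

    int main(void) {
        assert(ternlog_imm(blend_escape) == 0xb8);
        return 0;
    }

The same derivation gives the 0xf8 immediate used in encoder_sse_base.h further down, which computes data | (cmp & 64).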
package/src/encoder_neon.cc
CHANGED
@@ -15,6 +15,43 @@ static HEDLEY_ALWAYS_INLINE void vst1q_u8_x2_unaligned(uint8_t* p, uint8x16x2_t
 #endif


+// ARM's CLZ instruction at native bit-width
+#ifdef __aarch64__
+static HEDLEY_ALWAYS_INLINE int clz_n(uint64_t v) {
+# ifdef _MSC_VER
+long r;
+// does this work?
+if(_BitScanReverse64((unsigned long*)&r, v))
+r ^= 63;
+else
+r = 64;
+return r;
+# else
+# if defined(__clang__) || HEDLEY_GCC_VERSION_CHECK(11,0,0)
+// this pattern is only detected on GCC >= 11 (Clang 9 seems to as well, unsure about earlier versions)
+// - note: return type must be 'int'; GCC fails to optimise this if type is 'long'
+// GCC <= 10 doesn't optimize around the '0 = undefined behaviour', so not needed there
+if(v == 0) return 64;
+# endif
+return __builtin_clzll(v);
+# endif
+}
+#else
+static HEDLEY_ALWAYS_INLINE int clz_n(uint32_t v) {
+# ifdef __GNUC__
+# if defined(__clang__) || HEDLEY_GCC_VERSION_CHECK(7,0,0)
+// as with AArch64 version above, only insert this check if compiler can optimise it away
+if(v == 0) return 32;
+# endif
+return __builtin_clz(v);
+# elif defined(_MSC_VER)
+return _arm_clz(v);
+# else
+return __clz(v); // ARM compiler?
+# endif
+}
+#endif
+
 static uint8x16_t ALIGN_TO(16, shufLUT[256]);
 static uint16_t expandLUT[256];

@@ -195,26 +232,7 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
 col = shufTotalLen+1 + lineSizeOffset-32;
 } else {
 // shuffle stuff up
-
-# ifdef _MSC_VER
-long bitIndex;
-if(_BitScanReverse64((unsigned long*)&bitIndex, mask))
-bitIndex ^= 63;
-else
-bitIndex = 64;
-# else
-long bitIndex = __builtin_clzll(mask);
-# endif
-#else
-# ifdef __GNUC__
-long bitIndex = __builtin_clz(mask); // TODO: is the 'undefined if 0' case problematic here?
-# elif defined(_MSC_VER)
-long bitIndex = _arm_clz(mask);
-# else
-long bitIndex = __clz(mask); // ARM compiler?
-# endif
-#endif
-
+long bitIndex = clz_n(mask);
 uint8x16_t vClz = vdupq_n_u8(bitIndex & ~(sizeof(mask)*8));
 #ifdef __aarch64__
 uint8x16_t blendA = vcgtq_u8(vmakeq_u8(63,62,61,60,51,50,49,48,47,46,45,44,35,34,33,32), vClz);
@@ -450,26 +468,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const ui
 }
 } else {
 {
-
-# ifdef _MSC_VER
-// does this work?
-if(_BitScanReverse64((unsigned long*)&bitIndex, mask))
-bitIndex ^= 63;
-else
-bitIndex = 64;
-# else
-bitIndex = __builtin_clzll(mask); // TODO: is the 'undefined if 0' case problematic here?
-# endif
-#else
-# ifdef __GNUC__
-bitIndex = __builtin_clz(mask);
-# elif defined(_MSC_VER)
-bitIndex = _arm_clz(mask);
-# else
-bitIndex = __clz(mask); // ARM compiler?
-# endif
-#endif
-
+bitIndex = clz_n(mask);
 uint8x16_t vClz = vdupq_n_u8(bitIndex & ~(sizeof(mask)*8));
 #ifdef __aarch64__
 uint8x16_t blendA = vcgeq_u8(vmakeq_u8(63,62,61,60,51,50,49,48,47,46,45,44,35,34,33,32), vClz);
@@ -521,6 +520,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const ui

 void encoder_neon_init() {
 _do_encode = &do_encode_simd<do_encode_neon>;
+_encode_isa = ISA_LEVEL_NEON;
 // generate shuf LUT
 for(int i=0; i<256; i++) {
 int k = i;
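The clz_n helper added in the first hunk replaces the per-compiler CLZ blocks removed in the two hunks that follow it. Its contract, as the comments indicate, is a leading-zero count at the native register width, with an input of zero mapping to the full bit width (64 on AArch64, 32 elsewhere) instead of being left undefined. A small portable reference of that contract (illustration only, independent of the package's compiler intrinsics):

    #include <assert.h>
    #include <stdint.h>

    /* Portable reference for the clz_n contract assumed above: number of leading
     * zero bits in a 64-bit value, with clz of 0 defined as the full width. */
    static int clz64_ref(uint64_t v) {
        int n = 64;
        while (v) {
            n--;
            v >>= 1; /* drop one bit; loop ends once the highest set bit is gone */
        }
        return n;
    }

    int main(void) {
        assert(clz64_ref(0) == 64);                /* zero maps to the bit width */
        assert(clz64_ref(1) == 63);
        assert(clz64_ref(UINT64_C(1) << 63) == 0); /* MSB set -> no leading zeros */
        return 0;
    }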
package/src/encoder_rvv.cc
ADDED
@@ -0,0 +1,220 @@
+#include "common.h"
+
+#ifdef __riscv_vector
+#include "encoder.h"
+#include "encoder_common.h"
+
+# include <riscv_vector.h>
+# if defined(__clang__) && __clang_major__ < 16
+# define RV(f) f
+# else
+# define RV(f) __riscv_##f
+# endif
+
+
+static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RESTRICT _src, long& inpos, uint8_t*& outp, long& col, long lineSizeOffset) {
+// TODO: vectorize
+uint8_t c = _src[inpos++];
+if(HEDLEY_UNLIKELY(escapedLUT[c] && c != '.'-42)) {
+memcpy(outp, &escapedLUT[c], sizeof(uint16_t));
+outp += 2;
+} else {
+*(outp++) = c + 42;
+}
+
+c = _src[inpos++];
+if(LIKELIHOOD(0.0273, escapedLUT[c]!=0)) {
+uint32_t w = UINT32_16_PACK(UINT16_PACK('\r', '\n'), (uint32_t)escapedLUT[c]);
+memcpy(outp, &w, sizeof(w));
+outp += 4;
+col = lineSizeOffset + 2;
+} else {
+uint32_t w = UINT32_PACK('\r', '\n', (uint32_t)(c+42), 0);
+memcpy(outp, &w, sizeof(w));
+outp += 3;
+col = lineSizeOffset + 1;
+}
+}
+
+
+HEDLEY_ALWAYS_INLINE void do_encode_rvv(int line_size, int* colOffset, const uint8_t* HEDLEY_RESTRICT srcEnd, uint8_t* HEDLEY_RESTRICT& dest, size_t& len) {
+size_t vl2 = RV(vsetvlmax_e8m2)(); // TODO: limit to line length
+// TODO: have a LMUL=1 variant if line_size < vl
+
+// offset position to enable simpler loop condition checking
+const int INPUT_OFFSET = vl2*2 -1; // extra chars for EOL handling, -1 to change <= to <
+if((intptr_t)len <= INPUT_OFFSET || line_size < (int)vl2*2) return;
+
+uint8_t *outp = dest;
+long inpos = -(long)len;
+long lineSizeOffset = -line_size +1;
+long col = *colOffset - line_size +1;
+
+inpos += INPUT_OFFSET;
+const uint8_t* _src = srcEnd - INPUT_OFFSET;
+
+if (HEDLEY_LIKELY(col == -line_size+1)) {
+uint8_t c = _src[inpos++];
+if (LIKELIHOOD(0.0273, escapedLUT[c] != 0)) {
+memcpy(outp, escapedLUT + c, 2);
+outp += 2;
+col += 2;
+} else {
+*(outp++) = c + 42;
+col += 1;
+}
+}
+if(HEDLEY_UNLIKELY(col >= 0)) {
+if(col == 0)
+encode_eol_handle_pre(_src, inpos, outp, col, lineSizeOffset);
+else {
+uint8_t c = _src[inpos++];
+if(LIKELIHOOD(0.0273, escapedLUT[c]!=0)) {
+uint32_t v = UINT32_16_PACK(UINT16_PACK('\r', '\n'), (uint32_t)escapedLUT[c]);
+memcpy(outp, &v, sizeof(v));
+outp += 4;
+col = 2-line_size + 1;
+} else {
+uint32_t v = UINT32_PACK('\r', '\n', (uint32_t)(c+42), 0);
+memcpy(outp, &v, sizeof(v));
+outp += 3;
+col = 2-line_size;
+}
+}
+}
+
+// vector constants
+const vuint8mf2_t ALT_SHIFT = RV(vreinterpret_v_u16mf2_u8mf2)(RV(vmv_v_x_u16mf2)(4, vl2));
+const uint8_t _MASK_EXPAND[] = {0xAA, 0xAB, 0xAE, 0xAF, 0xBA, 0xBB, 0xBE, 0xBF, 0xEA, 0xEB, 0xEE, 0xEF, 0xFA, 0xFB, 0xFE, 0xFF};
+const vuint8m1_t MASK_EXPAND = RV(vle8_v_u8m1)(_MASK_EXPAND, 16);
+
+
+// TODO: consider exploiting partial vector capability
+while(inpos < 0) {
+vuint8m2_t data = RV(vle8_v_u8m2)(_src + inpos, vl2);
+inpos += vl2;
+
+// search for special chars
+// TODO: vrgather strat
+
+vuint8m2_t tmpData = RV(vsub_vx_u8m2)(data, -42, vl2);
+vbool4_t cmp = RV(vmor_mm_b4)(
+RV(vmor_mm_b4)(
+RV(vmseq_vx_u8m2_b4)(data, -42, vl2),
+RV(vmseq_vx_u8m2_b4)(tmpData, '=', vl2),
+vl2
+),
+RV(vmor_mm_b4)(
+RV(vmseq_vx_u8m2_b4)(data, '\r'-42, vl2),
+RV(vmseq_vx_u8m2_b4)(data, '\n'-42, vl2),
+vl2
+),
+vl2
+);
+
+#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 13000
+data = RV(vor_vx_u8m2_mu)(cmp, tmpData, tmpData, 64, vl2);
+#else
+data = RV(vor_vx_u8m2_m)(cmp, tmpData, tmpData, 64, vl2);
+#endif
+
+int idx;
+size_t count = RV(vcpop_m_b4)(cmp, vl2);
+if(count > 1) {
+// widen mask: 4b->8b
+#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 13000
+vuint8mf4_t vcmp = RV(vlmul_trunc_v_u8m1_u8mf4)(RV(vreinterpret_v_b4_u8m1)(cmp));
+#else
+vuint8mf4_t vcmp = *(vuint8mf4_t*)(&cmp);
+#endif
+// TODO: use vwsll instead if available
+// - is clmul useful here?
+vuint8mf2_t xcmp = RV(vreinterpret_v_u16mf2_u8mf2)(RV(vwmulu_vx_u16mf2)(vcmp, 16, vl2));
+xcmp = RV(vsrl_vv_u8mf2)(xcmp, ALT_SHIFT, vl2);
+
+// expand mask by inserting '1' between each bit (0000abcd -> 1a1b1c1d)
+vuint8m1_t xcmpTmp = RV(vrgather_vv_u8m1)(MASK_EXPAND, RV(vlmul_ext_v_u8mf2_u8m1)(xcmp), vl2);
+#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 13000
+vbool2_t cmpmask = RV(vreinterpret_b2)(xcmpTmp);
+#else
+vbool2_t cmpmask = *(vbool2_t*)(&xcmpTmp);
+#endif
+
+// expand data and insert =
+// TODO: use vwsll instead if available
+vuint16m4_t data2 = RV(vzext_vf2_u16m4)(data, vl2);
+data2 = RV(vsll_vx_u16m4)(data2, 8, vl2);
+data2 = RV(vor_vx_u16m4)(data2, '=', vl2);
+
+// prune unneeded =
+vuint8m4_t dataTmp = RV(vreinterpret_v_u16m4_u8m4)(data2);
+vuint8m4_t final_data = RV(vcompress_vm_u8m4)(
+#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 13000
+dataTmp, cmpmask, vl2*2
+#else
+cmpmask, dataTmp, dataTmp, vl2*2
+#endif
+);
+
+RV(vse8_v_u8m4)(outp, final_data, vl2*2);
+outp += vl2 + count;
+col += vl2 + count;
+
+if(col >= 0) {
+// we overflowed - find correct position to revert back to
+// TODO: stick with u8 type for vlmax <= 2048 (need to check if ok if vlmax == 2048)
+// - considering that it's rare for colWidth > 128, maybe just don't support vectors that long
+vuint16m8_t xidx = RV(viota_m_u16m8)(cmpmask, vl2*2);
+vbool2_t discardmask = RV(vmsgeu_vx_u16m8_b2)(xidx, vl2 + count - col, vl2*2);
+long idx_revert = RV(vcpop_m_b2)(discardmask, vl2*2);
+
+outp -= col + (idx_revert & 1);
+inpos -= ((idx_revert+1) >> 1);
+
+goto _encode_eol_handle_pre;
+}
+} else {
+// 0 or 1 special characters
+{
+vbool4_t mask = RV(vmsbf_m_b4)(cmp, vl2);
+// TODO: is it better to shuffle this into two stores, instead of three?
+RV(vse8_v_u8m2_m)(mask, outp, data, vl2);
+idx = RV(vcpop_m_b4)(mask, vl2);
+outp[idx] = '=';
+RV(vse8_v_u8m2_m)(RV(vmnot_m_b4)(mask, vl2), outp+1, data, vl2);
+
+outp += vl2 + count;
+col += vl2 + count;
+}
+
+if(col >= 0) {
+if(count > 0) {
+idx = vl2 - idx;
+if(HEDLEY_UNLIKELY(col == idx)) {
+// this is an escape character, so line will need to overflow
+outp--;
+} else {
+inpos += (col > idx);
+}
+}
+outp -= col;
+inpos -= col;
+
+_encode_eol_handle_pre:
+encode_eol_handle_pre(_src, inpos, outp, col, lineSizeOffset);
+}
+}
+}
+
+*colOffset = col + line_size -1;
+dest = outp;
+len = -(inpos - INPUT_OFFSET);
+}
+
+void encoder_rvv_init() {
+_do_encode = &do_encode_simd<do_encode_rvv>;
+_encode_isa = ISA_LEVEL_RVV;
+}
+#else
+void encoder_rvv_init() {}
+#endif /* defined(__riscv_vector) */
package/src/encoder_sse2.cc
CHANGED
package/src/encoder_sse_base.h
CHANGED
@@ -350,7 +350,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
 #if defined(__POPCNT__) && !defined(__tune_btver1__)
 if(use_isa & ISA_FEATURE_POPCNT) {
 shuf2Len = popcnt32(maskA) + 16;
-# if defined(__tune_znver3__) || defined(__tune_znver2__) || defined(__tune_znver1__) || defined(__tune_btver2__)
+# if defined(__tune_znver4__) || defined(__tune_znver3__) || defined(__tune_znver2__) || defined(__tune_znver1__) || defined(__tune_btver2__)
 shuf1Len = popcnt32(m1) + 8;
 shuf3Len = popcnt32(m3) + shuf2Len + 8;
 # else
@@ -409,7 +409,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
 if(use_isa >= ISA_LEVEL_VBMI2)
 # endif
 {
-
+__asm__(
 "shrl $1, %[eqMask] \n"
 "shrl %%cl, %[eqMask] \n" // TODO: can use shrq to avoid above shift?
 # if defined(PLATFORM_AMD64) && !defined(__ILP32__)
@@ -484,7 +484,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
 dataB = _mm_ternarylogic_epi32(dataB, cmpB, _mm_set1_epi8(64), 0xf8);

 // store last char
-
+p[XMM_SIZE*2] = _mm_extract_epi8(dataB, 15);

 uint32_t blendMask = (uint32_t)(-(int32_t)mask);
 dataB = _mm_mask_alignr_epi8(dataB, blendMask>>16, dataB, dataA, 15);
package/src/encoder_ssse3.cc
CHANGED
package/src/encoder_vbmi2.cc
CHANGED
@@ -1,5 +1,12 @@
 #include "common.h"

+extern const bool encoder_has_avx10;
+#if !defined(__EVEX512__) && (defined(__AVX10_1__) || defined(__EVEX256__)) && defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
+const bool encoder_has_avx10 = true;
+#else
+const bool encoder_has_avx10 = false;
+#endif
+
 #if defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
 # ifndef YENC_DISABLE_AVX256
 # include "encoder_avx_base.h"
@@ -7,12 +14,14 @@
 void encoder_vbmi2_init() {
 _do_encode = &do_encode_simd< do_encode_avx2<ISA_LEVEL_VBMI2> >;
 encoder_avx2_lut<ISA_LEVEL_VBMI2>();
+_encode_isa = ISA_LEVEL_VBMI2;
 }
 # else
 # include "encoder_sse_base.h"
 void encoder_vbmi2_init() {
 _do_encode = &do_encode_simd< do_encode_sse<ISA_LEVEL_VBMI2> >;
 encoder_sse_lut<ISA_LEVEL_VBMI2>();
+_encode_isa = ISA_LEVEL_VBMI2;
 }
 # endif
 #else