yencode 1.1.4 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +130 -189
- package/binding.gyp +79 -6
- package/index.js +2 -0
- package/package.json +1 -1
- package/src/common.h +39 -1
- package/src/crc.cc +89 -23
- package/src/crc.h +68 -2
- package/src/crc_arm.cc +54 -37
- package/src/crc_common.h +11 -0
- package/src/crc_folding.cc +155 -18
- package/src/crc_folding_256.cc +12 -16
- package/src/crc_riscv.cc +251 -0
- package/src/decoder.cc +37 -3
- package/src/decoder.h +4 -0
- package/src/decoder_avx.cc +3 -2
- package/src/decoder_avx2.cc +2 -1
- package/src/decoder_avx2_base.h +6 -24
- package/src/decoder_common.h +61 -49
- package/src/decoder_neon.cc +10 -26
- package/src/decoder_neon64.cc +10 -22
- package/src/decoder_rvv.cc +274 -0
- package/src/decoder_sse2.cc +24 -2
- package/src/decoder_sse_base.h +11 -45
- package/src/decoder_ssse3.cc +3 -2
- package/src/decoder_vbmi2.cc +2 -5
- package/src/encoder.cc +28 -0
- package/src/encoder.h +4 -0
- package/src/encoder_avx.cc +1 -0
- package/src/encoder_avx2.cc +1 -0
- package/src/encoder_common.h +2 -20
- package/src/encoder_neon.cc +1 -0
- package/src/encoder_rvv.cc +5 -19
- package/src/encoder_sse2.cc +1 -0
- package/src/encoder_ssse3.cc +1 -0
- package/src/encoder_vbmi2.cc +2 -0
- package/src/platform.cc +4 -4
- package/src/yencode.cc +45 -3
- package/test/testcrc.js +19 -3
- package/test/testcrcfuncs.c +53 -0
- package/test/testdec.js +2 -1
- package/test/testenc.js +1 -1
package/src/decoder_neon.cc
CHANGED

@@ -1,9 +1,6 @@
 #include "common.h"
 #ifdef __ARM_NEON
 
-#ifndef __aarch64__
-#define YENC_DEC_USE_THINTABLE 1
-#endif
 #include "decoder_common.h"
 
 
@@ -43,8 +40,6 @@ static struct { char bytes[16]; } ALIGN_TO(16, compactLUT[32768]);
 # pragma pack()
 #endif
 
-static uint8_t eqFixLUT[256];
-
 
 
 static bool neon_vect_is_nonzero(uint8x16_t v) {
@@ -78,6 +73,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
         lfCompare = vsetq_lane_u8('.', lfCompare, 1);
     }
 #endif
+
+    decoder_set_nextMask<isRaw>(src, len, nextMask);
+
     long i;
     for(i = -len; i; i += sizeof(uint8x16_t)*2) {
         uint8x16x2_t data = vld1q_u8_x2_align(src+i, 32);
@@ -251,6 +249,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
                 // terminator found
                 // there's probably faster ways to do this, but reverting to scalar code should be good enough
                 len += i;
+                nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
                 break;
             }
         }
@@ -301,6 +300,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
                 );
                 if(LIKELIHOOD(0.001, neon_vect_is_nonzero(matchEnd))) {
                     len += i;
+                    nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
                     break;
                 }
             }
@@ -323,18 +323,13 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
         // the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that
         // firstly, check for invalid sequences of = (we assume that these are rare, as a spec compliant yEnc encoder should not generate these)
         if(LIKELIHOOD(0.0001, (mask & ((maskEq << 1) | escFirst)) != 0)) {
-
-            uint32_t maskEq2 = tmp;
-            for(int j=8; j<32; j+=8) {
-                tmp = eqFixLUT[((maskEq>>j)&0xff) & ~(tmp>>7)];
-                maskEq2 |= tmp<<j;
-            }
-            maskEq = maskEq2;
+            maskEq = fix_eqMask<uint32_t>(maskEq & ~escFirst);
 
+            unsigned char nextEscFirst = maskEq>>31;
             // next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed
             maskEq = (maskEq<<1) | escFirst;
             mask &= ~maskEq;
-            escFirst =
+            escFirst = nextEscFirst;
 
             // unescape chars following `=`
             uint8x8_t maskEqTemp = vreinterpret_u8_u32(vmov_n_u32(maskEq));
@@ -449,25 +444,14 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
 #endif
         }
     }
-
-    if(isRaw) {
-        if(len != 0) { // have to gone through at least one loop cycle
-            if(src[i-2] == '\r' && src[i-1] == '\n' && src[i] == '.')
-                nextMask = 1;
-            else if(src[i-1] == '\r' && src[i] == '\n' && src[i+1] == '.')
-                nextMask = 2;
-            else
-                nextMask = 0;
-        }
-    } else
-        nextMask = 0;
 }
 
 void decoder_set_neon_funcs() {
-    decoder_init_lut(
+    decoder_init_lut(compactLUT);
     _do_decode = &do_decode_simd<false, false, sizeof(uint8x16_t)*2, do_decode_neon<false, false> >;
     _do_decode_raw = &do_decode_simd<true, false, sizeof(uint8x16_t)*2, do_decode_neon<true, false> >;
     _do_decode_end_raw = &do_decode_simd<true, true, sizeof(uint8x16_t)*2, do_decode_neon<true, true> >;
+    _decode_isa = ISA_LEVEL_NEON;
 }
 #else
 void decoder_set_neon_funcs() {}
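Both NEON decoders drop their private `eqFixLUT` table in favour of a shared `fix_eqMask<T>` helper whose definition isn't shown in this diff (`decoder_common.h` also changed in this release). A minimal scalar model of what the removed per-byte LUT loop computed, and what `fix_eqMask` presumably replaces; `fix_eq_mask_ref` is a hypothetical name:

```cpp
#include <cassert>
#include <cstdint>

// Hypothetical scalar model: bit i of maskEq marks an '=' at byte i (LSB =
// first byte). An '=' only starts an escape pair if the previous byte didn't,
// so within a run of consecutive '=' bytes only every second one is real.
template<typename T>
static T fix_eq_mask_ref(T maskEq) {
    T fixed = 0, prevEsc = 0;
    for(unsigned i = 0; i < sizeof(T)*8; i++) {
        T isEsc = ((maskEq >> i) & 1) & ~prevEsc;
        fixed |= isEsc << i;
        prevEsc = isEsc;
    }
    return fixed;
}

int main() {
    // three consecutive '=' bytes: only the 1st and 3rd start escape pairs
    assert(fix_eq_mask_ref<uint32_t>(0x07) == 0x05);
    // a lone '=' in the last byte escapes into the next block; its top bit
    // is what the new code saves as nextEscFirst (maskEq>>31 above)
    assert((fix_eq_mask_ref<uint32_t>(0x80000000u) >> 31) == 1);
}
```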
package/src/decoder_neon64.cc
CHANGED

@@ -7,8 +7,6 @@
 static struct { char bytes[16]; } ALIGN_TO(16, compactLUT[32768]);
 #pragma pack()
 
-static uint8_t eqFixLUT[256];
-
 
 // AArch64 GCC lacks these functions until 8.5, 9.4 and 10.1 (10.0 unknown)
 #if !defined(__clang__) && !defined(_MSC_VER) && (!defined(__aarch64__) || !(HEDLEY_GCC_VERSION_CHECK(9,4,0) || (!HEDLEY_GCC_VERSION_CHECK(9,0,0) && HEDLEY_GCC_VERSION_CHECK(8,5,0))))
@@ -56,6 +54,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
     if(nextMask == 2)
         nextMaskMix = vsetq_lane_u8(2, nextMaskMix, 1);
     uint8x16_t yencOffset = escFirst ? vmakeq_u8(42+64,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42) : vdupq_n_u8(42);
+
+    decoder_set_nextMask<isRaw>(src, len, nextMask);
+
     long i;
     for(i = -len; i; i += sizeof(uint8x16_t)*4) {
         uint8x16x4_t data = _vld1q_u8_x4(src+i);
@@ -227,6 +228,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
                 // terminator found
                 // there's probably faster ways to do this, but reverting to scalar code should be good enough
                 len += i;
+                nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
                 break;
             }
         }
@@ -275,6 +277,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
                 );
                 if(LIKELIHOOD(0.001, neon_vect_is_nonzero(matchEnd))) {
                     len += i;
+                    nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
                     break;
                 }
             }
@@ -288,18 +291,13 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
         // the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that
         // firstly, check for invalid sequences of = (we assume that these are rare, as a spec compliant yEnc encoder should not generate these)
         if(LIKELIHOOD(0.0001, (mask & ((maskEq << 1) | escFirst)) != 0)) {
-
-            uint64_t maskEq2 = tmp;
-            for(int j=8; j<64; j+=8) {
-                tmp = eqFixLUT[((maskEq>>j)&0xff) & ~(tmp>>7)];
-                maskEq2 |= ((uint64_t)tmp)<<j;
-            }
-            maskEq = maskEq2;
+            maskEq = fix_eqMask<uint64_t>(maskEq & ~(uint64_t)escFirst);
 
+            unsigned char nextEscFirst = maskEq>>63;
             // next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed
             maskEq = (maskEq<<1) | escFirst;
             mask &= ~maskEq;
-            escFirst =
+            escFirst = nextEscFirst;
 
             // unescape chars following `=`
 #if defined(__GNUC__) && !defined(__clang__)
@@ -430,24 +428,14 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
             yencOffset = vdupq_n_u8(42);
         }
     }
-    if(isRaw) {
-        if(len != 0) { // have to gone through at least one loop cycle
-            if(src[i-2] == '\r' && src[i-1] == '\n' && src[i] == '.')
-                nextMask = 1;
-            else if(src[i-1] == '\r' && src[i] == '\n' && src[i+1] == '.')
-                nextMask = 2;
-            else
-                nextMask = 0;
-        }
-    } else
-        nextMask = 0;
 }
 
 void decoder_set_neon_funcs() {
-    decoder_init_lut(
+    decoder_init_lut(compactLUT);
     _do_decode = &do_decode_simd<false, false, sizeof(uint8x16_t)*4, do_decode_neon<false, false> >;
     _do_decode_raw = &do_decode_simd<true, false, sizeof(uint8x16_t)*4, do_decode_neon<true, false> >;
     _do_decode_end_raw = &do_decode_simd<true, true, sizeof(uint8x16_t)*4, do_decode_neon<true, true> >;
+    _decode_isa = ISA_LEVEL_NEON;
 }
 #else
 void decoder_set_neon_funcs() {}
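The same two files also delete the scalar tail check that used to derive `nextMask` after the loop; that logic now lives in `decoder_set_nextMask`, called once before the loop and again on early exit. A rough model of the deleted branch, with an assumed signature (the real helper also takes the length or match mask):

```cpp
#include <cstdint>

// Sketch of the deleted tail logic; `end` points at the first byte past the
// decoded block, and raw input guarantees readable lookahead there.
// Returned value: 1 = "\r\n" seen, '.' pending; 2 = '\r' seen, "\n." pending.
template<bool isRaw>
static uint16_t next_mask_from_tail(const uint8_t* end) {
    if(!isRaw) return 0;
    if(end[-2] == '\r' && end[-1] == '\n' && end[0] == '.') return 1;
    if(end[-1] == '\r' && end[0] == '\n' && end[1] == '.') return 2;
    return 0;
}
```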
package/src/decoder_rvv.cc
ADDED

@@ -0,0 +1,274 @@
+#include "common.h"
+#ifdef __riscv_vector
+#include "decoder_common.h"
+
+
+#ifdef __riscv_v_intrinsic
+# define RV_vmerge_vxm_u8m2 RV(vmerge_vxm_u8m2)
+# define RV_vmerge_vxm_u16m2 RV(vmerge_vxm_u16m2)
+#else
+# define RV_vmerge_vxm_u8m2(v, x, m, vl) RV(vmerge_vxm_u8m2)(m, v, x, vl)
+# define RV_vmerge_vxm_u16m2(v, x, m, vl) RV(vmerge_vxm_u16m2)(m, v, x, vl)
+#endif
+
+#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 12000
+# define RV_VEC_CAST(masksz, vecsz, vec) RV(vreinterpret_v_b##masksz##_u##vecsz##m1)(vec)
+#else
+# define RV_VEC_CAST(masksz, vecsz, vec) *(vuint##vecsz##m1_t*)(&(vec))
+#endif
+
+
+template<int shift>
+static inline vbool4_t mask_lshift(vbool4_t m, unsigned shiftIn, size_t vl) {
+    vuint8m1_t mv = RV_VEC_CAST(4, 8, m);
+    vuint8m1_t mvl = RV(vsll_vx_u8m1)(mv, shift, vl/8);
+    vuint8m1_t mvr = RV(vsrl_vx_u8m1)(mv, 8-shift, vl/8);
+    mvr = RV(vslide1up_vx_u8m1)(mvr, shiftIn, vl/8);
+
+    return RV(vmor_mm_b4)(
+        RV_MASK_CAST(4, 8, mvl), RV_MASK_CAST(4, 8, mvr), vl
+    );
+}
+
+static inline vuint8m2_t set_first_vu8(vuint8m2_t src, uint8_t item, size_t vl) {
+#ifdef __riscv_v_intrinsic
+    return RV(vmv_s_x_u8m2_tu)(src, item, vl);
+#else
+    vuint8m1_t m = RV(vslide1up_vx_u8m1)(RV(vmv_v_x_u8m1)(0, ~0), 1, ~0);
+    return RV_vmerge_vxm_u8m2(src, item, RV_MASK_CAST(4, 8, m), vl);
+#endif
+}
+static inline vuint16m2_t set_first_vu16(vuint16m2_t src, uint16_t item, size_t vl) {
+#ifdef __riscv_v_intrinsic
+    return RV(vmv_s_x_u16m2_tu)(src, item, vl);
+#else
+    vuint16m1_t m = RV(vslide1up_vx_u16m1)(RV(vmv_v_x_u16m1)(0, ~0), 1, ~0);
+    return RV_vmerge_vxm_u16m2(src, item, RV_MASK_CAST(8, 16, m), vl);
+#endif
+}
+
+
+
+template<bool isRaw, bool searchEnd>
+HEDLEY_ALWAYS_INLINE void do_decode_rvv(const uint8_t* src, long& len, unsigned char*& outp, unsigned char& escFirst, uint16_t& nextMask) {
+    HEDLEY_ASSUME(escFirst == 0 || escFirst == 1);
+    HEDLEY_ASSUME(nextMask == 0 || nextMask == 1 || nextMask == 2);
+
+    size_t vl2 = RV(vsetvlmax_e8m2)();
+
+    vuint8m2_t yencOffset = RV(vmv_v_x_u8m2)(42, vl2);
+    if(escFirst) yencOffset = set_first_vu8(yencOffset, 42+64, vl2);
+    vuint8m2_t lfCompare = RV(vmv_v_x_u8m2)('\n', vl2);
+    if(nextMask && isRaw) {
+        lfCompare = RV(vreinterpret_v_u16m2_u8m2)(
+            set_first_vu16(RV(vreinterpret_v_u8m2_u16m2)(lfCompare), nextMask == 1 ? 0x0a2e /*".\n"*/ : 0x2e0a /*"\n."*/, vl2/2)
+        );
+    }
+
+    // mask where only the highest bit is set
+    vbool4_t lastBit = RV(vmseq_vx_u8m2_b4)(
+        RV(vslide1down_vx_u8m2)(RV(vmv_v_x_u8m2)(0, vl2), 1, vl2),
+        1, vl2
+    );
+
+    decoder_set_nextMask<isRaw>(src, len, nextMask);
+
+    // TODO: consider exploiting partial vector capability
+    long inpos;
+    for(inpos = -len; inpos; inpos += vl2) {
+        vuint8m2_t data = RV(vle8_v_u8m2)(src + inpos, vl2);
+
+        // search for special chars
+        vbool4_t cmpEq = RV(vmseq_vx_u8m2_b4)(data, '=', vl2);
+        vbool4_t cmpCr = RV(vmseq_vx_u8m2_b4)(data, '\r', vl2);
+        // note: cmp is always negated (unlike cmpEq/Cr)
+        vbool4_t cmp = RV(vmnor_mm_b4)(
+            RV(vmor_mm_b4)(cmpEq, cmpCr, vl2),
+            isRaw ? RV(vmseq_vv_u8m2_b4)(data, lfCompare, vl2) : RV(vmseq_vx_u8m2_b4)(data, '\n', vl2),
+            vl2
+        );
+
+        size_t numOutputChars = RV(vcpop_m_b4)(cmp, vl2);
+
+        if(numOutputChars != vl2) {
+            // dot-unstuffing + end detection
+            if((isRaw || searchEnd) && RV(vcpop_m_b4)(RV(vmxnor_mm_b4)(cmp, cmpEq, vl2), vl2)) {
+                uint32_t nextWord;
+                if(!searchEnd) {
+                    memcpy(&nextWord, src + inpos + vl2, 2);
+                } else {
+                    memcpy(&nextWord, src + inpos + vl2, 4);
+                }
+                vuint8m2_t nextData2 = RV(vreinterpret_v_u16m2_u8m2)(RV(vslide1down_vx_u16m2)(RV(vreinterpret_v_u8m2_u16m2)(data), nextWord, vl2/2));
+
+                vbool4_t match2Cr_Dot, match3EqY;
+                vuint8m2_t nextData3;
+                if(isRaw) {
+                    match2Cr_Dot = RV(vmand_mm_b4)(cmpCr, RV(vmseq_vx_u8m2_b4)(nextData2, '.', vl2), vl2);
+                }
+
+                if(searchEnd) {
+                    nextData3 = RV(vslide1down_vx_u8m2)(nextData2, nextWord>>16, vl2);
+                    match3EqY = RV(vmand_mm_b4)(
+                        RV(vmseq_vx_u8m2_b4)(nextData2, '=', vl2),
+                        RV(vmseq_vx_u8m2_b4)(nextData3, 'y', vl2),
+                        vl2
+                    );
+                }
+
+                // find patterns of \r_.
+                if(isRaw && LIKELIHOOD(0.001, RV(vcpop_m_b4)(match2Cr_Dot, vl2) > 0)) {
+                    // find \r\n.
+                    vuint8m2_t nextData1 = RV(vslide1down_vx_u8m2)(data, nextWord, vl2);
+                    vbool4_t match1Lf = RV(vmseq_vx_u8m2_b4)(nextData1, '\n', vl2);
+                    vbool4_t match2NlDot = RV(vmand_mm_b4)(match2Cr_Dot, match1Lf, vl2);
+
+                    if(searchEnd) {
+                        vbool4_t match1Nl = RV(vmand_mm_b4)(cmpCr, match1Lf, vl2);
+
+                        vuint8m2_t nextData4 = RV(vreinterpret_v_u32m2_u8m2)(RV(vslide1down_vx_u32m2)(RV(vreinterpret_v_u8m2_u32m2)(data), nextWord, vl2/4));
+
+                        // match instances of \r\n.\r\n and \r\n.=y
+                        vbool4_t match4Nl = RV(vmand_mm_b4)(
+                            RV(vmseq_vx_u8m2_b4)(nextData3, '\r', vl2),
+                            RV(vmseq_vx_u8m2_b4)(nextData4, '\n', vl2),
+                            vl2
+                        );
+                        vbool4_t match4EqY = RV(vmand_mm_b4)(
+                            RV(vmseq_vx_u8m2_b4)(nextData3, '=', vl2),
+                            RV(vmseq_vx_u8m2_b4)(nextData4, 'y', vl2),
+                            vl2
+                        );
+
+                        // merge \r\n and =y matches
+                        vbool4_t match4End = RV(vmor_mm_b4)(match4Nl, match4EqY, vl2);
+                        // merge with \r\n.
+                        match4End = RV(vmand_mm_b4)(match4End, match2NlDot, vl2);
+                        // merge \r\n=y
+                        vbool4_t match3End = RV(vmand_mm_b4)(match1Nl, match3EqY, vl2);
+
+                        vbool4_t matchEnd = RV(vmor_mm_b4)(match4End, match3End, vl2);
+
+                        // combine match sequences
+                        if(LIKELIHOOD(0.001, RV(vcpop_m_b4)(matchEnd, vl2) > 0)) {
+                            // terminator found
+                            len += inpos;
+                            nextMask = decoder_set_nextMask<isRaw>(src+inpos, ~RV(vmv_x_s_u8m1_u8)(RV_VEC_CAST(4, 8, cmp)));
+                            break;
+                        }
+                    }
+
+                    // shift match2NlDot by 2
+                    cmp = RV(vmandn_mm_b4)(cmp, mask_lshift<2>(match2NlDot, 0, vl2), vl2);
+                    numOutputChars = RV(vcpop_m_b4)(cmp, vl2);
+
+                    vuint8mf4_t nextNlDot = RV(vslidedown_vx_u8mf4)(
+#ifndef __riscv_v_intrinsic
+                        RV(vmv_v_x_u8mf4)(0, vl2/8),
+#endif
+                        RV_VEC_U8MF4_CAST(match2NlDot), vl2/8-1, vl2/8
+                    );
+                    nextNlDot = RV(vsrl_vx_u8mf4)(nextNlDot, 6, vl2/8);
+                    vuint8m1_t nextNlDotVec = RV(vlmul_ext_v_u8mf4_u8m1)(nextNlDot);
+                    lfCompare = RV_vmerge_vxm_u8m2(RV(vmv_v_x_u8m2)('\n', vl2), '.', RV_MASK_CAST(4, 8, nextNlDotVec), vl2);
+                } else if(searchEnd) {
+                    if(LIKELIHOOD(0.001, RV(vcpop_m_b4)(match3EqY, vl2) != 0)) {
+                        vuint8m2_t nextData1 = RV(vslide1down_vx_u8m2)(data, nextWord, vl2);
+                        vbool4_t match1Lf = RV(vmseq_vx_u8m2_b4)(nextData1, '\n', vl2);
+                        vbool4_t matchEnd = RV(vmand_mm_b4)(RV(vmand_mm_b4)(match3EqY, cmpCr, vl2), match1Lf, vl2);
+                        if(LIKELIHOOD(0.001, RV(vcpop_m_b4)(matchEnd, vl2) > 0)) {
+                            len += inpos;
+                            nextMask = decoder_set_nextMask<isRaw>(src+inpos, ~RV(vmv_x_s_u8m1_u8)(RV_VEC_CAST(4, 8, cmp)));
+                            break;
+                        }
+                    }
+                    if(isRaw)
+                        lfCompare = RV(vmv_v_x_u8m2)('\n', vl2);
+                } else if(isRaw) // no \r_. found
+                    lfCompare = RV(vmv_v_x_u8m2)('\n', vl2);
+            }
+
+            // the second character in an escape sequence
+            vbool4_t cmpEqShift1 = mask_lshift<1>(cmpEq, escFirst, vl2);
+
+            // a spec compliant encoder should never generate sequences: ==, =\n and =\r, but we'll handle them to be spec compliant
+            // the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that
+            // firstly, check for invalid sequences of = (we assume that these are rare, as a spec compliant yEnc encoder should not generate these)
+            if(LIKELIHOOD(0.0001, RV(vcpop_m_b4)(RV(vmandn_mm_b4)(cmpEqShift1, cmp, vl2), vl2) != 0)) {
+                // note: we assume that uintptr_t corresponds with __riscv_xlen
+#if __riscv_xlen == 64
+                vuint64m1_t cmpEqW = RV_VEC_CAST(4, 64, cmpEq);
+#else
+                vuint32m1_t cmpEqW = RV_VEC_CAST(4, 32, cmpEq);
+#endif
+                size_t nextShiftDown = (vl2 > sizeof(uintptr_t)*8 ? sizeof(uintptr_t)*8 : vl2) - 1;
+                size_t wvl = (vl2 + sizeof(uintptr_t)*8 -1) / (sizeof(uintptr_t)*8);
+                for(size_t w=0; w<vl2; w+=sizeof(uintptr_t)*8) {
+                    // extract bottom word
+#if __riscv_xlen == 64
+                    uintptr_t maskW = RV(vmv_x_s_u64m1_u64)(cmpEqW);
+#else
+                    uintptr_t maskW = RV(vmv_x_s_u32m1_u32)(cmpEqW);
+#endif
+
+                    // fix it
+                    maskW = fix_eqMask<uintptr_t>(maskW & ~(uintptr_t)escFirst);
+                    uint8_t nextEscFirst = (maskW >> nextShiftDown) & 1;
+
+                    // shift it up (will be used for cmpEqShift1)
+                    maskW = (maskW<<1) | escFirst; // TODO: should this be done using mask_lshift<1> instead?
+                    escFirst = nextEscFirst;
+
+                    // slide the new value in from the top
+#if __riscv_xlen == 64
+                    cmpEqW = RV(vslide1down_vx_u64m1)(cmpEqW, maskW, wvl);
+#else
+                    cmpEqW = RV(vslide1down_vx_u32m1)(cmpEqW, maskW, wvl);
+#endif
+                }
+#if __riscv_xlen == 64
+                cmpEqShift1 = RV_MASK_CAST(4, 64, cmpEqW);
+#else
+                cmpEqShift1 = RV_MASK_CAST(4, 32, cmpEqW);
+#endif
+                cmp = RV(vmor_mm_b4)(cmpEqShift1, cmp, vl2); // ~(~cmp & ~cmpEqShift1)
+                numOutputChars = RV(vcpop_m_b4)(cmp, vl2);
+            } else {
+                // no invalid = sequences found - don't need to fix up cmpEq
+                escFirst = RV(vcpop_m_b4)(RV(vmand_mm_b4)(cmpEq, lastBit, vl2), vl2);
+            }
+            data = RV(vsub_vv_u8m2)(data, RV_vmerge_vxm_u8m2(yencOffset, 64+42, cmpEqShift1, vl2), vl2);
+            yencOffset = set_first_vu8(yencOffset, 42 | (escFirst<<6), vl2);
+
+            // all that's left is to remove unwanted chars
+#ifdef __riscv_v_intrinsic
+            data = RV(vcompress_vm_u8m2)(data, cmp, vl2);
+#else
+            data = RV(vcompress_vm_u8m2)(cmp, data, data, vl2);
+#endif
+            RV(vse8_v_u8m2)(outp, data, vl2);
+        } else {
+            data = RV(vsub_vv_u8m2)(data, yencOffset, vl2);
+            RV(vse8_v_u8m2)(outp, data, vl2);
+            // TODO: should these be done at LMUL=1? or, it might not be worth this strategy (e.g. do an additional OR instead), considering the cost of LMUL=2
+            yencOffset = RV(vmv_v_x_u8m2)(42, vl2);
+            if(isRaw) lfCompare = RV(vmv_v_x_u8m2)('\n', vl2);
+            escFirst = 0;
+        }
+        outp += numOutputChars;
+    }
+}
+
+size_t decoder_rvv_width() {
+    return RV(vsetvlmax_e8m2)();
+}
+
+void decoder_set_rvv_funcs() {
+    _do_decode = &do_decode_simd<false, false, decoder_rvv_width, do_decode_rvv<false, false> >;
+    _do_decode_raw = &do_decode_simd<true, false, decoder_rvv_width, do_decode_rvv<true, false> >;
+    _do_decode_end_raw = &do_decode_simd<true, true, decoder_rvv_width, do_decode_rvv<true, true> >;
+    _decode_isa = ISA_LEVEL_RVV;
+}
+#else
+void decoder_set_rvv_funcs() {}
+#endif
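RVV mask registers have no whole-register shift, so `mask_lshift` above synthesizes one at LMUL=1: shift every byte left, recover the spilled high bits with a right shift plus `vslide1up`, then OR the halves. The same operation on a plain byte array, as a sketch of the semantics rather than the library's code:

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>

// Byte-array model of mask_lshift<shift>: mask bits are packed LSB-first,
// 8 per byte, matching RVV mask register layout; shiftIn supplies the
// carry-in bits, already positioned at the low end.
template<int shift>
static void mask_lshift_ref(uint8_t* bytes, size_t n, unsigned shiftIn) {
    unsigned carry = shiftIn;
    for(size_t i = 0; i < n; i++) {
        unsigned spill = bytes[i] >> (8 - shift); // bits moving into the next byte
        bytes[i] = (uint8_t)((bytes[i] << shift) | carry);
        carry = spill;
    }
}

int main() {
    uint8_t m[2] = { 0x80, 0x00 };        // only mask bit 7 set
    mask_lshift_ref<1>(m, 2, 1);          // shift left by one, carry a 1 into bit 0
    assert(m[0] == 0x01 && m[1] == 0x01); // bit 7 moved to bit 8; carry-in landed in bit 0
}
```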
package/src/decoder_sse2.cc
CHANGED

@@ -4,12 +4,34 @@
 #include "decoder_common.h"
 #include "decoder_sse_base.h"
 
+void decoder_sse_init(SSELookups* HEDLEY_RESTRICT& lookups) {
+    ALIGN_ALLOC(lookups, sizeof(SSELookups), 16);
+    for(int i=0; i<256; i++) {
+        lookups->BitsSetTable256inv[i] = 8 - (
+            (i & 1) + ((i>>1) & 1) + ((i>>2) & 1) + ((i>>3) & 1) + ((i>>4) & 1) + ((i>>5) & 1) + ((i>>6) & 1) + ((i>>7) & 1)
+        );
+
+        #define _X(n, k) ((((n) & (1<<k)) ? 192ULL : 0ULL) << (k*8))
+        lookups->eqAdd[i] = _X(i, 0) | _X(i, 1) | _X(i, 2) | _X(i, 3) | _X(i, 4) | _X(i, 5) | _X(i, 6) | _X(i, 7);
+        #undef _X
+    }
+    for(int i=0; i<32; i++) {
+        for(int j=0; j<16; j++) {
+            if(i >= 16) // only used for LZCNT
+                lookups->unshufMask[i*16 + j] = ((31-i)>j ? -1 : 0);
+            else // only used for BSR
+                lookups->unshufMask[i*16 + j] = (i>j ? -1 : 0);
+        }
+    }
+}
+
 void decoder_set_sse2_funcs() {
-    decoder_sse_init();
-    decoder_init_lut(lookups->
+    decoder_sse_init(lookups);
+    decoder_init_lut(lookups->compact);
     _do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_SSE2> >;
     _do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_SSE2> >;
     _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_SSE2> >;
+    _decode_isa = ISA_LEVEL_SSE2;
 }
 #else
 void decoder_set_sse2_funcs() {}
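Two details of the relocated `decoder_sse_init` are worth spelling out: `BitsSetTable256inv[i]` is just `8 - popcount(i)`, and byte k of `eqAdd[i]` holds 192 exactly when bit k of i is set. Adding 192 equals subtracting 64 mod 256, which cancels the +64 that a yEnc escape applies on top of the usual -42 offset. A quick standalone check of the arithmetic (illustrative, not library code):

```cpp
#include <cassert>
#include <cstdint>

int main() {
    // BitsSetTable256inv[i] == 8 - popcount(i); e.g. 0xB1 has 4 bits set
    int i = 0xB1, pop = 0;
    for(int k = 0; k < 8; k++) pop += (i >> k) & 1;
    assert(8 - pop == 4);

    // eqAdd[i]: byte k is 192 iff bit k of i is set; i = 5 sets bytes 0 and 2
    uint64_t eqAdd5 = 0;
    for(int k = 0; k < 8; k++)
        eqAdd5 |= ((5 & (1 << k)) ? 192ULL : 0ULL) << (k * 8);
    assert(eqAdd5 == 0x0000000000C000C0ULL);

    // +192 cancels the escape's +64 modulo 256
    assert((uint8_t)(192 + 64) == 0);
}
```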
package/src/decoder_sse_base.h
CHANGED

@@ -26,13 +26,13 @@
 #endif
 
 #pragma pack(16)
-
+typedef struct {
     unsigned char BitsSetTable256inv[256];
     /*align16*/ struct { char bytes[16]; } compact[32768];
-    uint8_t eqFix[256];
     /*align8*/ uint64_t eqAdd[256];
     /*align16*/ int8_t unshufMask[32*16];
-}
+} SSELookups;
+static SSELookups* HEDLEY_RESTRICT lookups;
 #pragma pack()
 
 
@@ -45,27 +45,7 @@ static HEDLEY_ALWAYS_INLINE __m128i force_align_read_128(const void* p) {
 #endif
 }
 
-
-static void decoder_sse_init() {
-    ALIGN_ALLOC(lookups, sizeof(*lookups), 16);
-    for(int i=0; i<256; i++) {
-        lookups->BitsSetTable256inv[i] = 8 - (
-            (i & 1) + ((i>>1) & 1) + ((i>>2) & 1) + ((i>>3) & 1) + ((i>>4) & 1) + ((i>>5) & 1) + ((i>>6) & 1) + ((i>>7) & 1)
-        );
-
-        #define _X(n, k) ((((n) & (1<<k)) ? 192ULL : 0ULL) << (k*8))
-        lookups->eqAdd[i] = _X(i, 0) | _X(i, 1) | _X(i, 2) | _X(i, 3) | _X(i, 4) | _X(i, 5) | _X(i, 6) | _X(i, 7);
-        #undef _X
-    }
-    for(int i=0; i<32; i++) {
-        for(int j=0; j<16; j++) {
-            if(i >= 16) // only used for LZCNT
-                lookups->unshufMask[i*16 + j] = ((31-i)>j ? -1 : 0);
-            else // only used for BSR
-                lookups->unshufMask[i*16 + j] = (i>j ? -1 : 0);
-        }
-    }
-}
+void decoder_sse_init(SSELookups* HEDLEY_RESTRICT& lookups); // defined in decoder_sse2.cc
 
 
 // for LZCNT/BSR
@@ -145,6 +125,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* src, long& len, unsigned
         else
             lfCompare = _mm_insert_epi16(lfCompare, _nextMask == 1 ? 0x0a2e /*".\n"*/ : 0x2e0a /*"\n."*/, 0);
     }
+
+    decoder_set_nextMask<isRaw>(src, len, _nextMask); // set this before the loop because we can't check src after it's been overwritten
+
     intptr_t i;
     for(i = -len; i; i += sizeof(__m128i)*2) {
         __m128i oDataA = _mm_load_si128((__m128i *)(src+i));
@@ -383,6 +366,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* src, long& len, unsigned
                 // terminator found
                 // there's probably faster ways to do this, but reverting to scalar code should be good enough
                 len += (long)i;
+                _nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
                 break;
             }
         }
@@ -492,6 +476,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* src, long& len, unsigned
 
             if(endFound) {
                 len += (long)i;
+                _nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
                 break;
             }
         }
@@ -516,17 +501,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* src, long& len, unsigned
         dataB = _mm_add_epi8(oDataB, _mm_set1_epi8(-42));
 
         if(LIKELIHOOD(0.0001, (mask & ((maskEq << 1) + escFirst)) != 0)) {
-
-            unsigned tmp = lookups->eqFix[(maskEq&0xff) & ~escFirst];
-            uint32_t maskEq2 = tmp;
-            for(int j=8; j<32; j+=8) {
-                tmp = lookups->eqFix[((maskEq>>j)&0xff) & ~(tmp>>7)];
-                maskEq2 |= tmp<<j;
-            }
-            maskEq = maskEq2;
-
+            maskEq = fix_eqMask<uint32_t>(maskEq & ~escFirst);
             mask &= ~escFirst;
-            escFirst =
+            escFirst = maskEq >> 31;
             // next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed
             maskEq <<= 1;
             mask &= ~maskEq;
@@ -710,16 +687,5 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* src, long& len, unsigned
         }
     }
     _escFirst = (unsigned char)escFirst;
-    if(isRaw) {
-        if(len != 0) { // have to gone through at least one loop cycle
-            if(src[i-2] == '\r' && src[i-1] == '\n' && src[i] == '.')
-                _nextMask = 1;
-            else if(src[i-1] == '\r' && src[i] == '\n' && src[i+1] == '.')
-                _nextMask = 2;
-            else
-                _nextMask = 0;
-        }
-    } else
-        _nextMask = 0;
 }
 #endif
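The `unshufMask` rows consumed by the LZCNT/BSR path in this header follow two index conventions: rows 0-15 keep the first i bytes (BSR yields a bit index), while rows 16-31 keep the first 31-i bytes (LZCNT yields a leading-zero count). A standalone check of the construction (illustrative):

```cpp
#include <cassert>

int main() {
    signed char row[16];

    // BSR convention (i < 16): row i selects the first i bytes
    int i = 3;
    for(int j = 0; j < 16; j++) row[j] = (i > j) ? -1 : 0;
    assert(row[2] == -1 && row[3] == 0);

    // LZCNT convention (i >= 16): row i selects the first 31-i bytes
    i = 28; // 31-28 == 3, so it matches the BSR row above
    for(int j = 0; j < 16; j++) row[j] = ((31 - i) > j) ? -1 : 0;
    assert(row[2] == -1 && row[3] == 0);
}
```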
package/src/decoder_ssse3.cc
CHANGED

@@ -4,11 +4,12 @@
 #include "decoder_common.h"
 #include "decoder_sse_base.h"
 void decoder_set_ssse3_funcs() {
-    decoder_sse_init();
-    decoder_init_lut(lookups->
+    decoder_sse_init(lookups);
+    decoder_init_lut(lookups->compact);
     _do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_SSSE3> >;
     _do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_SSSE3> >;
     _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_SSSE3> >;
+    _decode_isa = ISA_LEVEL_SSSE3;
 }
 #else
 void decoder_set_sse2_funcs();
package/src/decoder_vbmi2.cc
CHANGED

@@ -12,21 +12,18 @@ const bool decoder_has_avx10 = false;
 # ifndef YENC_DISABLE_AVX256
 # include "decoder_avx2_base.h"
 void decoder_set_vbmi2_funcs() {
-    ALIGN_ALLOC(lookups, sizeof(*lookups), 16);
-    // TODO: consider removing compact LUT
-    decoder_init_lut(lookups->eqFix, lookups->compact);
     _do_decode = &do_decode_simd<false, false, sizeof(__m256i)*2, do_decode_avx2<false, false, ISA_LEVEL_VBMI2> >;
     _do_decode_raw = &do_decode_simd<true, false, sizeof(__m256i)*2, do_decode_avx2<true, false, ISA_LEVEL_VBMI2> >;
     _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m256i)*2, do_decode_avx2<true, true, ISA_LEVEL_VBMI2> >;
+    _decode_isa = ISA_LEVEL_VBMI2;
 }
 # else
 # include "decoder_sse_base.h"
 void decoder_set_vbmi2_funcs() {
-    decoder_sse_init();
-    decoder_init_lut(lookups->eqFix, lookups->compact);
     _do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_VBMI2> >;
     _do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_VBMI2> >;
     _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_VBMI2> >;
+    _decode_isa = ISA_LEVEL_VBMI2;
 }
 # endif
 #else
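Across all of these setters, the `_decode_isa = ISA_LEVEL_*` line is the 1.2.0 addition: each one now records which kernel it installed (yencode.cc, which also changed in this release, presumably reports it). Stripped of the templates, the shape is plain function-pointer dispatch; none of the names in this miniature are the library's actual symbols:

```cpp
// Standalone miniature of the dispatch pattern; all names are illustrative.
typedef long (*decode_fn)(const unsigned char*, long);

static long decode_generic(const unsigned char*, long) { return 0; }
static long decode_vector(const unsigned char*, long) { return 0; }

static decode_fn do_decode = &decode_generic; // default kernel
static int decode_isa = 0;                    // e.g. ISA_GENERIC

void set_vector_funcs() {
    do_decode = &decode_vector; // install the specialized kernel...
    decode_isa = 0x200;         // ...and record its level so callers can report it
}
```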
package/src/encoder.cc
CHANGED

@@ -2,6 +2,31 @@
 #include "encoder_common.h"
 #include "encoder.h"
 
+
+// lookup tables for scalar processing
+#define _B1(n) _B(n), _B(n+1), _B(n+2), _B(n+3)
+#define _B2(n) _B1(n), _B1(n+4), _B1(n+8), _B1(n+12)
+#define _B3(n) _B2(n), _B2(n+16), _B2(n+32), _B2(n+48)
+#define _BX _B3(0), _B3(64), _B3(128), _B3(192)
+
+const unsigned char escapeLUT[256] = { // whether or not the character is critical
+#define _B(n) ((n == 214 || n == '\r'+214 || n == '\n'+214 || n == '='-42) ? 0 : (n+42) & 0xff)
+    _BX
+#undef _B
+};
+const uint16_t escapedLUT[256] = { // escaped sequences for characters that need escaping
+#define _B(n) ((n == 214 || n == 214+'\r' || n == 214+'\n' || n == '='-42 || n == 214+'\t' || n == 214+' ' || n == '.'-42) ? UINT16_PACK('=', ((n+42+64)&0xff)) : 0)
+    _BX
+#undef _B
+};
+
+#undef _B1
+#undef _B2
+#undef _B3
+#undef _BX
+
+
+
 size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, int doEnd) {
     unsigned char* es = (unsigned char*)src + len;
     unsigned char *p = dest; // destination pointer
@@ -122,6 +147,7 @@ size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HED
 
 extern "C" {
     size_t (*_do_encode)(int, int*, const unsigned char* HEDLEY_RESTRICT, unsigned char* HEDLEY_RESTRICT, size_t, int) = &do_encode_generic;
+    int _encode_isa = ISA_GENERIC;
 }
 
 void encoder_sse2_init();
@@ -139,12 +165,14 @@ void encoder_rvv_init();
 static inline void encoder_native_init() {
     _do_encode = &do_encode_simd< do_encode_avx2<ISA_NATIVE> >;
     encoder_avx2_lut<ISA_NATIVE>();
+    _encode_isa = ISA_NATIVE;
 }
 # else
 # include "encoder_sse_base.h"
 static inline void encoder_native_init() {
     _do_encode = &do_encode_simd< do_encode_sse<ISA_NATIVE> >;
     encoder_sse_lut<ISA_NATIVE>();
+    _encode_isa = ISA_NATIVE;
 }
 # endif
 #endif
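The new scalar tables in encoder.cc are indexed by the raw input byte n; since yEnc output is `(n+42) & 0xff`, guards like `n == 214` and `n == '\r'+214` select inputs whose encoded byte would be NUL, CR, LF or `=`. A worked check of that arithmetic (standalone; `UINT16_PACK` is assumed to pack two bytes, escape char first):

```cpp
#include <cassert>

int main() {
    // raw 214 encodes to (214+42) & 0xff == 0, a NUL, so escapeLUT[214] == 0 (critical)
    assert(((214 + 42) & 0xff) == 0);
    // raw '\r'+214 encodes back to '\r'; raw '='-42 encodes to '=' itself
    assert((('\r' + 214 + 42) & 0xff) == '\r');
    assert((('=' - 42 + 42) & 0xff) == '=');
    // escapedLUT stores '=' followed by char+64: NUL escapes to "=@"
    // since (214+42+64) & 0xff == 64 == '@'
    assert(((214 + 42 + 64) & 0xff) == '@');
}
```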