yencode 1.1.0 → 1.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.gyp +79 -7
- package/crcutil-1.0/code/multiword_64_64_intrinsic_i386_mmx.cc +1 -1
- package/package.json +1 -1
- package/src/common.h +88 -24
- package/src/crc.cc +59 -27
- package/src/crc.h +20 -6
- package/src/crc_arm.cc +154 -27
- package/src/crc_common.h +3 -10
- package/src/{crc_folding.c → crc_folding.cc} +53 -122
- package/src/crc_folding_256.cc +230 -0
- package/src/decoder.cc +10 -4
- package/src/decoder.h +16 -2
- package/src/decoder_avx2_base.h +32 -21
- package/src/decoder_common.h +2 -2
- package/src/decoder_neon.cc +37 -37
- package/src/decoder_neon64.cc +41 -36
- package/src/decoder_sse_base.h +21 -14
- package/src/decoder_vbmi2.cc +30 -0
- package/src/encoder.cc +9 -3
- package/src/encoder.h +17 -1
- package/src/encoder_avx_base.h +8 -8
- package/src/encoder_common.h +3 -3
- package/src/encoder_neon.cc +31 -31
- package/src/encoder_sse_base.h +7 -8
- package/src/encoder_vbmi2.cc +23 -0
- package/src/platform.cc +57 -8
- package/src/yencode.cc +33 -44
- package/test/testcrc.js +14 -0
package/src/decoder_neon64.cc
CHANGED
@@ -1,5 +1,5 @@
 #include "common.h"
-#
+#if defined(__ARM_NEON) && defined(__aarch64__)
 
 #include "decoder_common.h"
 
@@ -10,9 +10,9 @@ static struct { char bytes[16]; } ALIGN_TO(16, compactLUT[32768]);
 static uint8_t eqFixLUT[256];
 
 
-
-#if !defined(__clang__)
-HEDLEY_ALWAYS_INLINE uint8x16x4_t
+// AArch64 GCC lacks these functions until 8.5, 9.4 and 10.1 (10.0 unknown)
+#if !defined(__clang__) && !defined(_MSC_VER) && (!defined(__aarch64__) || !(HEDLEY_GCC_VERSION_CHECK(9,4,0) || (!HEDLEY_GCC_VERSION_CHECK(9,0,0) && HEDLEY_GCC_VERSION_CHECK(8,5,0))))
+static HEDLEY_ALWAYS_INLINE uint8x16x4_t _vld1q_u8_x4(const uint8_t* p) {
 uint8x16x4_t ret;
 ret.val[0] = vld1q_u8(p);
 ret.val[1] = vld1q_u8(p+16);
@@ -20,12 +20,15 @@ HEDLEY_ALWAYS_INLINE uint8x16x4_t vld1q_u8_x4(const uint8_t* p) {
 ret.val[3] = vld1q_u8(p+48);
 return ret;
 }
-HEDLEY_ALWAYS_INLINE void
+static HEDLEY_ALWAYS_INLINE void _vst1q_u8_x4(uint8_t* p, uint8x16x4_t data) {
 vst1q_u8(p, data.val[0]);
 vst1q_u8(p+16, data.val[1]);
 vst1q_u8(p+32, data.val[2]);
 vst1q_u8(p+48, data.val[3]);
 }
+#else
+# define _vld1q_u8_x4 vld1q_u8_x4
+# define _vst1q_u8_x4 vst1q_u8_x4
 #endif
 
 
@@ -48,12 +51,14 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 HEDLEY_ASSUME(escFirst == 0 || escFirst == 1);
 HEDLEY_ASSUME(nextMask == 0 || nextMask == 1 || nextMask == 2);
 uint8x16_t nextMaskMix = vdupq_n_u8(0);
-if(nextMask)
-nextMaskMix
-
+if(nextMask == 1)
+nextMaskMix = vsetq_lane_u8(1, nextMaskMix, 0);
+if(nextMask == 2)
+nextMaskMix = vsetq_lane_u8(2, nextMaskMix, 1);
+uint8x16_t yencOffset = escFirst ? vmakeq_u8(42+64,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42) : vdupq_n_u8(42);
 long i;
 for(i = -len; i; i += sizeof(uint8x16_t)*4) {
-uint8x16x4_t data =
+uint8x16x4_t data = _vld1q_u8_x4(src+i);
 uint8x16_t dataA = data.val[0];
 uint8x16_t dataB = data.val[1];
 uint8x16_t dataC = data.val[2];
@@ -66,23 +71,23 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 cmpEqD = vceqq_u8(dataD, vdupq_n_u8('=')),
 cmpA = vqtbx1q_u8(
 cmpEqA,
-//
-(
+// \n \r
+vmakeq_u8(0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
 dataA
 ),
 cmpB = vqtbx1q_u8(
 cmpEqB,
-(
+vmakeq_u8(0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
 dataB
 ),
 cmpC = vqtbx1q_u8(
 cmpEqC,
-(
+vmakeq_u8(0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
 dataC
 ),
 cmpD = vqtbx1q_u8(
 cmpEqD,
-(
+vmakeq_u8(0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
 dataD
 );
 if(isRaw) cmpA = vorrq_u8(cmpA, nextMaskMix);
@@ -93,22 +98,22 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 )))) {
 uint8x16_t cmpMerge = vpaddq_u8(
 vpaddq_u8(
-vandq_u8(cmpA, (
-vandq_u8(cmpB, (
+vandq_u8(cmpA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
+vandq_u8(cmpB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128))
 ),
 vpaddq_u8(
-vandq_u8(cmpC, (
-vandq_u8(cmpD, (
+vandq_u8(cmpC, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
+vandq_u8(cmpD, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128))
 )
 );
 uint8x16_t cmpEqMerge = vpaddq_u8(
 vpaddq_u8(
-vandq_u8(cmpEqA, (
-vandq_u8(cmpEqB, (
+vandq_u8(cmpEqA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
+vandq_u8(cmpEqB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128))
 ),
 vpaddq_u8(
-vandq_u8(cmpEqC, (
-vandq_u8(cmpEqD, (
+vandq_u8(cmpEqC, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
+vandq_u8(cmpEqD, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128))
 )
 );
 
@@ -225,14 +230,14 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 break;
 }
 }
-uint8x16_t match2NlDotDMasked = vandq_u8(match2NlDotD, (
+uint8x16_t match2NlDotDMasked = vandq_u8(match2NlDotD, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
 uint8x16_t mergeKillDots = vpaddq_u8(
 vpaddq_u8(
-vandq_u8(match2NlDotA, (
-vandq_u8(match2NlDotB, (
+vandq_u8(match2NlDotA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
+vandq_u8(match2NlDotB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128))
 ),
 vpaddq_u8(
-vandq_u8(match2NlDotC, (
+vandq_u8(match2NlDotC, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
 match2NlDotDMasked
 )
 );
@@ -308,27 +313,27 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 
 uint8x16_t vMaskEqA = vqtbl1q_u8(
 maskEqTemp,
-(
+vmakeq_u8(0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1)
 );
 maskEqTemp = vextq_u8(maskEqTemp, maskEqTemp, 2);
 uint8x16_t vMaskEqB = vqtbl1q_u8(
 maskEqTemp,
-(
+vmakeq_u8(0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1)
 );
 maskEqTemp = vextq_u8(maskEqTemp, maskEqTemp, 2);
 uint8x16_t vMaskEqC = vqtbl1q_u8(
 maskEqTemp,
-(
+vmakeq_u8(0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1)
 );
 maskEqTemp = vextq_u8(maskEqTemp, maskEqTemp, 2);
 uint8x16_t vMaskEqD = vqtbl1q_u8(
 maskEqTemp,
-(
+vmakeq_u8(0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1)
 );
-vMaskEqA = vtstq_u8(vMaskEqA, (
-vMaskEqB = vtstq_u8(vMaskEqB, (
-vMaskEqC = vtstq_u8(vMaskEqC, (
-vMaskEqD = vtstq_u8(vMaskEqD, (
+vMaskEqA = vtstq_u8(vMaskEqA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+vMaskEqB = vtstq_u8(vMaskEqB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+vMaskEqC = vtstq_u8(vMaskEqC, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+vMaskEqD = vtstq_u8(vMaskEqD, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
 
 dataA = vsubq_u8(
 dataA,
@@ -384,7 +389,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 )
 );
 }
-yencOffset
+yencOffset = vsetq_lane_u8((escFirst << 6) | 42, yencOffset, 0);
 
 // all that's left is to 'compress' the data (skip over masked chars)
 uint64_t counts = vget_lane_u64(vreinterpret_u64_u8(vcnt_u8(vget_low_u8(cmpCombined))), 0);
@@ -419,7 +424,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 dataB = vsubq_u8(dataB, vdupq_n_u8(42));
 dataC = vsubq_u8(dataC, vdupq_n_u8(42));
 dataD = vsubq_u8(dataD, vdupq_n_u8(42));
-
+_vst1q_u8_x4(p, vcreate4_u8(dataA, dataB, dataC, dataD));
 p += sizeof(uint8x16_t)*4;
 escFirst = 0;
 yencOffset = vdupq_n_u8(42);

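A note on the recurring NEON idiom in the hunks above: because AArch64 has no movemask instruction, each 0x00/0xFF comparison vector is ANDed with the bit weights vmakeq_u8(1,2,4,8,16,32,64,128, ...) and then folded with vpaddq_u8 to build a 16-bit mask per register. The following scalar model of that computation is an illustration only, not code from the package:

#include <cstdint>

// Model of the vand + vpadd movemask construction used above: cmp[i] is 0x00 or
// 0xFF (a per-byte compare result); the result has bit i set where cmp[i] == 0xFF.
static uint16_t neon_style_movemask(const uint8_t cmp[16]) {
    uint16_t mask = 0;
    for (int i = 0; i < 16; i++)
        mask |= (uint16_t)((cmp[i] & 1) << i); // AND with the byte's bit weight, then accumulate
    return mask;
}
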
package/src/decoder_sse_base.h
CHANGED
@@ -8,7 +8,7 @@
 #endif
 
 // GCC (ver 6-10(dev)) fails to optimize pure C version of mask testing, but has this intrinsic; Clang >= 7 optimizes C version fine
-#if defined(__GNUC__) && __GNUC__ >= 7
+#if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
 # define KORTEST16(a, b) !_kortestz_mask16_u8((a), (b))
 # define KAND16(a, b) _kand_mask16((a), (b))
 # define KOR16(a, b) _kor_mask16((a), (b))
@@ -112,15 +112,22 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 -42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42-64
 ) : _mm_set1_epi8(-42);
 
-#if defined(__SSSE3__) && !defined(__tune_atom__) && !defined(__tune_slm__) && !defined(__tune_btver1__)
+#if defined(__SSSE3__) && !defined(__tune_atom__) && !defined(__tune_slm__) && !defined(__tune_btver1__) && !defined(__tune_btver2__)
 const bool _USING_FAST_MATCH = (use_isa >= ISA_LEVEL_SSSE3);
 #else
 const bool _USING_FAST_MATCH = false;
 #endif
-#if defined(__SSE4_1__) && !defined(__tune_slm__) && !defined(__tune_goldmont__) && !defined(__tune_goldmont_plus__)
+#if defined(__SSE4_1__) && !defined(__tune_slm__) && !defined(__tune_goldmont__) && !defined(__tune_goldmont_plus__) && !defined(__tune_tremont__)
 const bool _USING_BLEND_ADD = (use_isa >= ISA_LEVEL_SSE41);
 #else
 const bool _USING_BLEND_ADD = false;
+#endif
+#if defined(__AVX512VL__) && defined(__AVX512BW__)
+# if defined(_MSC_VER) && !defined(PLATFORM_AMD64) && !defined(__clang__)
+const bool useAVX3MaskCmp = false;
+# else
+const bool useAVX3MaskCmp = (use_isa >= ISA_LEVEL_AVX3);
+# endif
 #endif
 
 __m128i lfCompare = _mm_set1_epi8('\n');
@@ -214,7 +221,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 __mmask16 match2EqMaskA, match2EqMaskB;
 __mmask16 match0CrMaskA, match0CrMaskB;
 __mmask16 match2CrXDtMaskA, match2CrXDtMaskB;
-if(
+if(useAVX3MaskCmp && searchEnd) {
 match2EqMaskA = _mm_cmpeq_epi8_mask(_mm_set1_epi8('='), tmpData2A);
 match2EqMaskB = _mm_cmpeq_epi8_mask(_mm_set1_epi8('='), tmpData2B);
 } else
@@ -230,7 +237,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 __m128i match2CrXDtA, match2CrXDtB;
 if(isRaw) {
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-if(
+if(useAVX3MaskCmp) {
 match0CrMaskA = _mm_cmpeq_epi8_mask(oDataA, _mm_set1_epi8('\r'));
 match0CrMaskB = _mm_cmpeq_epi8_mask(oDataB, _mm_set1_epi8('\r'));
 match2CrXDtMaskA = _mm_mask_cmpeq_epi8_mask(match0CrMaskA, tmpData2A, _mm_set1_epi8('.'));
@@ -256,7 +263,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
 __mmask16 match1NlMaskA, match1NlMaskB;
 __mmask16 match2NlDotMaskA, match2NlDotMaskB;
-if(
+if(useAVX3MaskCmp) {
 match1NlMaskA = _mm_mask_cmpeq_epi8_mask(
 match0CrMaskA,
 _mm_set1_epi8('\n'),
@@ -299,7 +306,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 
 int matchEnd;
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-if(
+if(useAVX3MaskCmp) {
 __mmask16 match3EqYMaskA = _mm_mask_cmpeq_epi8_mask(
 match2EqMaskA, _mm_set1_epi8('y'), tmpData3A
 );
@@ -368,12 +375,12 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 if(LIKELIHOOD(0.001, matchEnd)) {
 // terminator found
 // there's probably faster ways to do this, but reverting to scalar code should be good enough
-len += i;
+len += (long)i;
 break;
 }
 }
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-if(
+if(useAVX3MaskCmp) {
 mask |= match2NlDotMaskA << 2;
 mask |= (match2NlDotMaskB << 18) & 0xffffffff;
 minMask = _mm_maskz_mov_epi8(~(match2NlDotMaskB>>14), _mm_set1_epi8('.'));
@@ -398,7 +405,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 __m128i match3EqYA, match3EqYB;
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
 __mmask16 match3EqYMaskA, match3EqYMaskB;
-if(
+if(useAVX3MaskCmp) {
 match3EqYMaskA = _mm_mask_cmpeq_epi8_mask(
 match2EqMaskA,
 _mm_set1_epi8('y'),
@@ -434,7 +441,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 bool endFound;
 
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-if(
+if(useAVX3MaskCmp) {
 __mmask16 match3LfEqYMaskA = _mm_mask_cmpeq_epi8_mask(
 match3EqYMaskA,
 _mm_set1_epi8('\n'),
@@ -477,7 +484,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 }
 
 if(endFound) {
-len += i;
+len += (long)i;
 break;
 }
 }
@@ -558,7 +565,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 );
 
 yencOffset = _mm_xor_si128(_mm_set1_epi8(-42),
-_mm_slli_epi16(_mm_cvtsi32_si128(escFirst), 6)
+_mm_slli_epi16(_mm_cvtsi32_si128((int)escFirst), 6)
 );
 }
 } else {
@@ -608,7 +615,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 )
 );
 yencOffset = _mm_xor_si128(_mm_set1_epi8(-42),
-_mm_slli_epi16(_mm_cvtsi32_si128(escFirst), 6)
+_mm_slli_epi16(_mm_cvtsi32_si128((int)escFirst), 6)
 );
 } else
 #endif

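The new useAVX3MaskCmp constant gates between AVX-512 mask-register compares and the plain SSE compare-plus-movemask path throughout this file. A minimal sketch of the two equivalent paths it selects between, as an illustration (this helper is not code from the package; useAVX3MaskCmp would come from the block added above):

#include <immintrin.h>
#include <cstdint>

// Returns a 16-bit mask with bit i set where byte i of data equals ch.
static inline uint16_t equal_mask(__m128i data, char ch, bool useAVX3MaskCmp) {
#if defined(__AVX512VL__) && defined(__AVX512BW__)
    if(useAVX3MaskCmp) // compare straight into a mask register
        return _mm_cmpeq_epi8_mask(data, _mm_set1_epi8(ch));
#endif
    (void)useAVX3MaskCmp;
    // fallback: vector compare, then extract the sign bits
    return (uint16_t)_mm_movemask_epi8(_mm_cmpeq_epi8(data, _mm_set1_epi8(ch)));
}
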
package/src/decoder_vbmi2.cc
ADDED
@@ -0,0 +1,30 @@
+#include "common.h"
+
+#if defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
+# include "decoder_common.h"
+# ifndef YENC_DISABLE_AVX256
+# include "decoder_avx2_base.h"
+void decoder_set_vbmi2_funcs() {
+ALIGN_ALLOC(lookups, sizeof(*lookups), 16);
+// TODO: consider removing compact LUT
+decoder_init_lut(lookups->eqFix, lookups->compact);
+_do_decode = &do_decode_simd<false, false, sizeof(__m256i)*2, do_decode_avx2<false, false, ISA_LEVEL_VBMI2> >;
+_do_decode_raw = &do_decode_simd<true, false, sizeof(__m256i)*2, do_decode_avx2<true, false, ISA_LEVEL_VBMI2> >;
+_do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m256i)*2, do_decode_avx2<true, true, ISA_LEVEL_VBMI2> >;
+}
+# else
+# include "decoder_sse_base.h"
+void decoder_set_vbmi2_funcs() {
+decoder_sse_init();
+decoder_init_lut(lookups->eqFix, lookups->compact);
+_do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_VBMI2> >;
+_do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_VBMI2> >;
+_do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_VBMI2> >;
+}
+# endif
+#else
+void decoder_set_avx2_funcs();
+void decoder_set_vbmi2_funcs() {
+decoder_set_avx2_funcs();
+}
+#endif

package/src/encoder.cc
CHANGED
@@ -1,7 +1,8 @@
 #include "common.h"
 #include "encoder_common.h"
+#include "encoder.h"
 
-size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len,
+size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, int doEnd) {
 unsigned char* es = (unsigned char*)src + len;
 unsigned char *p = dest; // destination pointer
 long i = -(long)len; // input position
@@ -119,12 +120,15 @@ size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HED
 }
 
 
-
+extern "C" {
+size_t (*_do_encode)(int, int*, const unsigned char* HEDLEY_RESTRICT, unsigned char* HEDLEY_RESTRICT, size_t, int) = &do_encode_generic;
+}
 
 void encoder_sse2_init();
 void encoder_ssse3_init();
 void encoder_avx_init();
 void encoder_avx2_init();
+void encoder_vbmi2_init();
 void encoder_neon_init();
 
 #if defined(PLATFORM_X86) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
@@ -150,7 +154,9 @@ void encoder_init() {
 encoder_native_init();
 # else
 int use_isa = cpu_supports_isa();
-if(use_isa >=
+if(use_isa >= ISA_LEVEL_VBMI2)
+encoder_vbmi2_init();
+else if(use_isa >= ISA_LEVEL_AVX2)
 encoder_avx2_init();
 else if(use_isa >= ISA_LEVEL_AVX)
 encoder_avx_init();

package/src/encoder.h
CHANGED
@@ -1,5 +1,21 @@
+#ifndef __YENC_ENCODER_H
+#define __YENC_ENCODER_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
 #include "hedley.h"
 
-extern size_t (*_do_encode)(int, int*, const unsigned char* HEDLEY_RESTRICT, unsigned char* HEDLEY_RESTRICT, size_t,
+extern size_t (*_do_encode)(int, int*, const unsigned char* HEDLEY_RESTRICT, unsigned char* HEDLEY_RESTRICT, size_t, int);
 #define do_encode (*_do_encode)
 void encoder_init();
+
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif

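With the include guard and extern "C" wrapper added here, encoder.h can be consumed from both C and C++ translation units while agreeing on the linkage of the _do_encode pointer, which encoder.cc (above) now also defines inside extern "C". A hedged usage sketch; the wrapper function below is hypothetical and not part of the package:

#include "encoder.h" // declares extern "C" size_t (*_do_encode)(...) and #defines do_encode

// Hypothetical caller: encode one buffer through the currently selected kernel.
static size_t encode_buffer(int line_size, int* colOffset,
                            const unsigned char* src, unsigned char* dest,
                            size_t len) {
    encoder_init(); // picks the best kernel and repoints _do_encode (call once at startup in real use)
    return do_encode(line_size, colOffset, src, dest, len, 1 /* doEnd */);
}
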
package/src/encoder_avx_base.h
CHANGED
@@ -6,7 +6,7 @@
 #include "encoder_common.h"
 #define YMM_SIZE 32
 
-#if defined(__GNUC__) && __GNUC__ >= 7
+#if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
 # define KLOAD32(a, offs) _load_mask32((__mmask32*)(a) + (offs))
 #else
 # define KLOAD32(a, offs) (((uint32_t*)(a))[(offs)])
@@ -112,7 +112,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
 // last char
 uint32_t eolChar = (use_isa >= ISA_LEVEL_VBMI2 ? lookupsVBMI2->eolLastChar[c] : lookupsAVX2->eolLastChar[c]);
 *(uint32_t*)p = eolChar;
-p += 3 + (eolChar>>27);
+p += 3 + (uintptr_t)(eolChar>>27);
 col = -line_size+1;
 } else {
 // line overflowed, insert a newline
@@ -215,7 +215,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
 // duplicate halves
 data1A = _mm256_inserti128_si256(dataA, _mm256_castsi256_si128(dataA), 1);
 data1B = _mm256_inserti128_si256(dataB, _mm256_castsi256_si128(dataB), 1);
-#
+#if defined(__tune_znver2__) || defined(__tune_znver3__)
 data2A = _mm256_permute2x128_si256(dataA, dataA, 0x11);
 data2B = _mm256_permute2x128_si256(dataB, dataB, 0x11);
 #else
@@ -254,7 +254,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
 // we overflowed - find correct position to revert back to
 // this is perhaps sub-optimal on 32-bit, but who still uses that with AVX2?
 uint64_t eqMask;
-int shiftAmt = maskBitsB + YMM_SIZE - col
+int shiftAmt = (int)(maskBitsB + YMM_SIZE -1 - col);
 if(HEDLEY_UNLIKELY(shiftAmt < 0)) {
 uint32_t eqMask1, eqMask2;
 #if defined(__AVX512VBMI2__) && defined(__AVX512VL__) && defined(__AVX512BW__)
@@ -293,7 +293,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
 asm(
 "shrq $1, %[eqMask] \n"
 "shrq %%cl, %[eqMask] \n"
-"adcq %[col], %[p] \n"
+"adcq %q[col], %q[p] \n"
 : [eqMask]"+r"(eqMask), [p]"+r"(p)
 : "c"(shiftAmt), [col]"r"(~col)
 );
@@ -320,7 +320,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
 #endif
 {
 i += bitCount;
-unsigned int revert = col + (eqMask & 1);
+unsigned int revert = (unsigned int)(col + (eqMask & 1));
 p -= revert;
 i -= revert;
 }
@@ -429,7 +429,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
 _encode_eol_handle_pre:
 uint32_t eolChar = (use_isa >= ISA_LEVEL_VBMI2 ? lookupsVBMI2->eolLastChar[es[i]] : lookupsAVX2->eolLastChar[es[i]]);
 *(uint32_t*)p = eolChar;
-p += 3 + (eolChar>>27);
+p += 3 + (uintptr_t)(eolChar>>27);
 col = lineSizeOffset;
 
 if(HEDLEY_UNLIKELY(i >= 0)) { // this isn't really a proper check - it's only needed to support short lines; basically, if the line is too short, `i` never gets checked, so we need one somewhere
@@ -556,7 +556,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
 
 _mm256_zeroupper();
 
-*colOffset = col + line_size -1;
+*colOffset = (int)(col + line_size -1);
 dest = p;
 len = -(i - INPUT_OFFSET);
 }

package/src/encoder_common.h
CHANGED
@@ -8,7 +8,7 @@
 #define _BX _B3(0), _B3(64), _B3(128), _B3(192)
 
 static const unsigned char escapeLUT[256] = { // whether or not the character is critical
-#define _B(n) ((n == 214 || n ==
+#define _B(n) ((n == 214 || n == '\r'+214 || n == '\n'+214 || n == '='-42) ? 0 : (n+42) & 0xff)
 _BX
 #undef _B
 };
@@ -24,10 +24,10 @@ static const uint16_t escapedLUT[256] = { // escaped sequences for characters th
 #undef _BX
 
 
-size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len,
+size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, int doEnd);
 
 template<void(&kernel)(int, int*, const uint8_t* HEDLEY_RESTRICT, uint8_t* HEDLEY_RESTRICT&, size_t&)>
-static size_t do_encode_simd(int line_size, int* colOffset, const uint8_t* HEDLEY_RESTRICT src, uint8_t* HEDLEY_RESTRICT dest, size_t len,
+static size_t do_encode_simd(int line_size, int* colOffset, const uint8_t* HEDLEY_RESTRICT src, uint8_t* HEDLEY_RESTRICT dest, size_t len, int doEnd) {
 if(len < 1) return 0;
 if(line_size < 12) { // short lines probably not worth processing in a SIMD way
 // we assume at least the first and last char exist in the line, and since the first char could be escaped, and SIMD encoder assumes at least one non-first/last char, assumption means that line size has to be >= 4