yencode 1.1.4 → 1.1.5
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/common.h +20 -0
- package/src/crc.cc +10 -9
- package/src/crc.h +4 -0
- package/src/crc_arm.cc +4 -5
- package/src/crc_folding.cc +4 -5
- package/src/crc_folding_256.cc +6 -5
- package/src/decoder.cc +4 -0
- package/src/decoder.h +4 -0
- package/src/decoder_avx.cc +1 -0
- package/src/decoder_avx2.cc +1 -0
- package/src/decoder_avx2_base.h +4 -14
- package/src/decoder_common.h +25 -0
- package/src/decoder_neon.cc +6 -12
- package/src/decoder_neon64.cc +6 -11
- package/src/decoder_sse2.cc +1 -0
- package/src/decoder_sse_base.h +5 -11
- package/src/decoder_ssse3.cc +1 -0
- package/src/decoder_vbmi2.cc +2 -0
- package/src/encoder.cc +3 -0
- package/src/encoder.h +4 -0
- package/src/encoder_avx.cc +1 -0
- package/src/encoder_avx2.cc +1 -0
- package/src/encoder_neon.cc +1 -0
- package/src/encoder_rvv.cc +1 -0
- package/src/encoder_sse2.cc +1 -0
- package/src/encoder_ssse3.cc +1 -0
- package/src/encoder_vbmi2.cc +2 -0
- package/test/testcrc.js +2 -2
- package/test/testdec.js +1 -1
- package/test/testenc.js +1 -1
package/package.json
CHANGED
package/src/common.h
CHANGED
|
@@ -221,6 +221,7 @@ bool cpu_supports_neon();
|
|
|
221
221
|
|
|
222
222
|
#ifdef PLATFORM_X86
|
|
223
223
|
enum YEncDecIsaLevel {
|
|
224
|
+
ISA_GENERIC = 0,
|
|
224
225
|
ISA_FEATURE_POPCNT = 0x1,
|
|
225
226
|
ISA_FEATURE_LZCNT = 0x2,
|
|
226
227
|
ISA_FEATURE_EVEX512 = 0x4, // AVX512 support
|
|
@@ -228,11 +229,30 @@ enum YEncDecIsaLevel {
|
|
|
228
229
|
ISA_LEVEL_SSSE3 = 0x200,
|
|
229
230
|
ISA_LEVEL_SSE41 = 0x300,
|
|
230
231
|
ISA_LEVEL_SSE4_POPCNT = 0x301,
|
|
232
|
+
ISA_LEVEL_PCLMUL = 0x340,
|
|
231
233
|
ISA_LEVEL_AVX = 0x381, // same as above, just used as a differentiator for `cpu_supports_isa`
|
|
232
234
|
ISA_LEVEL_AVX2 = 0x403, // also includes BMI1/2 and LZCNT
|
|
235
|
+
ISA_LEVEL_VPCLMUL = 0x440,
|
|
233
236
|
ISA_LEVEL_AVX3 = 0x507, // SKX variant; AVX512VL + AVX512BW
|
|
234
237
|
ISA_LEVEL_VBMI2 = 0x603 // ICL, AVX10
|
|
235
238
|
};
|
|
239
|
+
#elif defined(PLATFORM_ARM)
|
|
240
|
+
enum YEncDecIsaLevel { // ISA identifiers used when PLATFORM_ARM is defined
|
|
241
|
+
ISA_GENERIC = 0, // initial value of the _crc32_isa / _decode_isa / _encode_isa globals
|
|
242
|
+
ISA_FEATURE_CRC = 8, // stored in _crc32_isa by crc_arm_set_funcs()
|
|
243
|
+
ISA_LEVEL_NEON = 0x1000 // stored in _decode_isa / _encode_isa by the NEON decoder and encoder setup
|
|
244
|
+
};
|
|
245
|
+
#elif defined(__riscv)
|
|
246
|
+
enum YEncDecIsaLevel { // ISA identifiers used when __riscv is defined
|
|
247
|
+
ISA_GENERIC = 0, // initial value of the _crc32_isa / _decode_isa / _encode_isa globals
|
|
248
|
+
ISA_LEVEL_RVV = 0x10000 // presumably reported by the RVV encoder setup; the assignment is not visible here, TODO confirm
|
|
249
|
+
};
|
|
250
|
+
#else
|
|
251
|
+
enum YEncDecIsaLevel { // fallback when none of the platform branches above matched
|
|
252
|
+
ISA_GENERIC = 0 // the only level available in this branch
|
|
253
|
+
};
|
|
254
|
+
#endif
|
|
255
|
+
#ifdef PLATFORM_X86
|
|
236
256
|
#ifdef _MSC_VER
|
|
237
257
|
// native tuning not supported in MSVC
|
|
238
258
|
# define ISA_NATIVE ISA_LEVEL_SSE2
|
package/src/crc.cc
CHANGED
|
@@ -123,9 +123,10 @@ static void generate_crc32_slice_table() {
|
|
|
123
123
|
}
|
|
124
124
|
#endif
|
|
125
125
|
|
|
126
|
-
|
|
127
|
-
crc_func _do_crc32_incremental = &do_crc32_incremental_generic;
|
|
128
|
-
|
|
126
|
+
extern "C" { // definitions of the globals that crc.h declares as extern
|
|
127
|
+
crc_func _do_crc32_incremental = &do_crc32_incremental_generic; // active CRC32 routine; starts as the generic implementation
|
|
128
|
+
int _crc32_isa = ISA_GENERIC; // ISA identifier for the active routine; reassigned by the crc_*_set_funcs() functions
|
|
129
|
+
}
|
|
129
130
|
|
|
130
131
|
|
|
131
132
|
uint32_t do_crc32_combine(uint32_t crc1, uint32_t crc2, size_t len2) {
|
|
@@ -140,9 +141,9 @@ uint32_t do_crc32_zeros(uint32_t crc1, size_t len) {
|
|
|
140
141
|
return (uint32_t)crc_;
|
|
141
142
|
}
|
|
142
143
|
|
|
143
|
-
void crc_clmul_set_funcs(
|
|
144
|
-
void crc_clmul256_set_funcs(
|
|
145
|
-
void crc_arm_set_funcs(
|
|
144
|
+
void crc_clmul_set_funcs();
|
|
145
|
+
void crc_clmul256_set_funcs();
|
|
146
|
+
void crc_arm_set_funcs();
|
|
146
147
|
|
|
147
148
|
#ifdef PLATFORM_X86
|
|
148
149
|
int cpu_supports_crc_isa();
|
|
@@ -186,9 +187,9 @@ void crc_init() {
|
|
|
186
187
|
#ifdef PLATFORM_X86
|
|
187
188
|
int support = cpu_supports_crc_isa();
|
|
188
189
|
if(support == 2)
|
|
189
|
-
crc_clmul256_set_funcs(
|
|
190
|
+
crc_clmul256_set_funcs();
|
|
190
191
|
else if(support == 1)
|
|
191
|
-
crc_clmul_set_funcs(
|
|
192
|
+
crc_clmul_set_funcs();
|
|
192
193
|
#endif
|
|
193
194
|
#ifdef PLATFORM_ARM
|
|
194
195
|
# ifdef __APPLE__
|
|
@@ -216,7 +217,7 @@ void crc_init() {
|
|
|
216
217
|
false
|
|
217
218
|
# endif
|
|
218
219
|
) {
|
|
219
|
-
crc_arm_set_funcs(
|
|
220
|
+
crc_arm_set_funcs();
|
|
220
221
|
}
|
|
221
222
|
#endif
|
|
222
223
|
}
|
package/src/crc.h
CHANGED
|
@@ -9,11 +9,15 @@ extern "C" {
|
|
|
9
9
|
|
|
10
10
|
typedef uint32_t (*crc_func)(const void*, size_t, uint32_t);
|
|
11
11
|
extern crc_func _do_crc32_incremental;
|
|
12
|
+
extern int _crc32_isa;
|
|
12
13
|
#define do_crc32 (*_do_crc32_incremental)
|
|
13
14
|
|
|
14
15
|
uint32_t do_crc32_combine(uint32_t crc1, const uint32_t crc2, size_t len2);
|
|
15
16
|
uint32_t do_crc32_zeros(uint32_t crc1, size_t len);
|
|
16
17
|
void crc_init();
|
|
18
|
+
static inline int crc32_isa_level() { // Returns the current value of the global _crc32_isa.
|
|
19
|
+
return _crc32_isa; // ISA_GENERIC unless one of the crc_*_set_funcs() routines has replaced it
|
|
20
|
+
}
|
|
17
21
|
|
|
18
22
|
|
|
19
23
|
|
package/src/crc_arm.cc
CHANGED
|
@@ -200,11 +200,10 @@ static uint32_t do_crc32_incremental_arm(const void* data, size_t length, uint32
|
|
|
200
200
|
return ~arm_crc_calc(~init, (const unsigned char*)data, (long)length);
|
|
201
201
|
}
|
|
202
202
|
|
|
203
|
-
void crc_arm_set_funcs(
|
|
204
|
-
|
|
203
|
+
void crc_arm_set_funcs() { // Switches the global CRC32 dispatch to the ARM implementation.
|
|
204
|
+
_do_crc32_incremental = &do_crc32_incremental_arm; // route do_crc32 calls to the ARM routine defined above
|
|
205
|
+
_crc32_isa = ISA_FEATURE_CRC; // record which ISA the active routine uses
|
|
205
206
|
}
|
|
206
207
|
#else
|
|
207
|
-
void crc_arm_set_funcs(
|
|
208
|
-
(void)_do_crc32_incremental;
|
|
209
|
-
}
|
|
208
|
+
void crc_arm_set_funcs() {}
|
|
210
209
|
#endif
|
package/src/crc_folding.cc
CHANGED
|
@@ -365,12 +365,11 @@ static uint32_t do_crc32_incremental_clmul(const void* data, size_t length, uint
|
|
|
365
365
|
return crc_fold((const unsigned char*)data, (long)length, init);
|
|
366
366
|
}
|
|
367
367
|
|
|
368
|
-
void crc_clmul_set_funcs(
|
|
369
|
-
|
|
368
|
+
void crc_clmul_set_funcs() { // Switches the global CRC32 dispatch to this file's CLMUL folding implementation.
|
|
369
|
+
_do_crc32_incremental = &do_crc32_incremental_clmul; // route do_crc32 calls to the crc_fold-based routine above
|
|
370
|
+
_crc32_isa = ISA_LEVEL_PCLMUL; // record which ISA the active routine uses
|
|
370
371
|
}
|
|
371
372
|
#else
|
|
372
|
-
void crc_clmul_set_funcs(
|
|
373
|
-
(void)_do_crc32_incremental;
|
|
374
|
-
}
|
|
373
|
+
void crc_clmul_set_funcs() {}
|
|
375
374
|
#endif
|
|
376
375
|
|
package/src/crc_folding_256.cc
CHANGED
|
@@ -217,13 +217,14 @@ static uint32_t do_crc32_incremental_clmul(const void* data, size_t length, uint
|
|
|
217
217
|
return crc_fold((const unsigned char*)data, (long)length, init);
|
|
218
218
|
}
|
|
219
219
|
|
|
220
|
-
void crc_clmul256_set_funcs(
|
|
221
|
-
|
|
220
|
+
void crc_clmul256_set_funcs() { // Switches the global CRC32 dispatch to this file's (crc_folding_256.cc) implementation.
|
|
221
|
+
_do_crc32_incremental = &do_crc32_incremental_clmul; // this file's own static do_crc32_incremental_clmul, defined above
|
|
222
|
+
_crc32_isa = ISA_LEVEL_VPCLMUL; // record which ISA the active routine uses
|
|
222
223
|
}
|
|
223
224
|
#else
|
|
224
|
-
void crc_clmul_set_funcs(
|
|
225
|
-
void crc_clmul256_set_funcs(
|
|
226
|
-
crc_clmul_set_funcs(
|
|
225
|
+
void crc_clmul_set_funcs();
|
|
226
|
+
void crc_clmul256_set_funcs() { // Fallback definition for the #else branch.
|
|
227
|
+
crc_clmul_set_funcs(); // simply delegates to the function declared just above
|
|
227
228
|
}
|
|
228
229
|
#endif
|
|
229
230
|
|
package/src/decoder.cc
CHANGED
|
@@ -7,6 +7,8 @@ extern "C" {
|
|
|
7
7
|
YencDecoderEnd (*_do_decode)(const unsigned char**, unsigned char**, size_t, YencDecoderState*) = &do_decode_scalar<false, false>;
|
|
8
8
|
YencDecoderEnd (*_do_decode_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*) = &do_decode_scalar<true, false>;
|
|
9
9
|
YencDecoderEnd (*_do_decode_end_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*) = &do_decode_end_scalar<true>;
|
|
10
|
+
|
|
11
|
+
int _decode_isa = ISA_GENERIC;
|
|
10
12
|
}
|
|
11
13
|
|
|
12
14
|
void decoder_set_sse2_funcs();
|
|
@@ -27,6 +29,7 @@ static inline void decoder_set_native_funcs() {
|
|
|
27
29
|
_do_decode = &do_decode_simd<false, false, sizeof(__m256i)*2, do_decode_avx2<false, false, ISA_NATIVE> >;
|
|
28
30
|
_do_decode_raw = &do_decode_simd<true, false, sizeof(__m256i)*2, do_decode_avx2<true, false, ISA_NATIVE> >;
|
|
29
31
|
_do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m256i)*2, do_decode_avx2<true, true, ISA_NATIVE> >;
|
|
32
|
+
_decode_isa = ISA_NATIVE;
|
|
30
33
|
}
|
|
31
34
|
# else
|
|
32
35
|
# include "decoder_sse_base.h"
|
|
@@ -36,6 +39,7 @@ static inline void decoder_set_native_funcs() {
|
|
|
36
39
|
_do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_NATIVE> >;
|
|
37
40
|
_do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_NATIVE> >;
|
|
38
41
|
_do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_NATIVE> >;
|
|
42
|
+
_decode_isa = ISA_NATIVE;
|
|
39
43
|
}
|
|
40
44
|
# endif
|
|
41
45
|
#endif
|
package/src/decoder.h
CHANGED
|
@@ -32,6 +32,7 @@ typedef enum {
|
|
|
32
32
|
extern YencDecoderEnd (*_do_decode)(const unsigned char**, unsigned char**, size_t, YencDecoderState*);
|
|
33
33
|
extern YencDecoderEnd (*_do_decode_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*);
|
|
34
34
|
extern YencDecoderEnd (*_do_decode_end_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*);
|
|
35
|
+
extern int _decode_isa;
|
|
35
36
|
|
|
36
37
|
static inline size_t do_decode(int isRaw, const unsigned char* src, unsigned char* dest, size_t len, YencDecoderState* state) {
|
|
37
38
|
unsigned char* ds = dest;
|
|
@@ -45,6 +46,9 @@ static inline YencDecoderEnd do_decode_end(const unsigned char** src, unsigned c
|
|
|
45
46
|
|
|
46
47
|
void decoder_init();
|
|
47
48
|
|
|
49
|
+
static inline int decode_isa_level() { // Returns the current value of the global _decode_isa.
|
|
50
|
+
return _decode_isa; // ISA_GENERIC unless one of the decoder_set_*_funcs() routines has replaced it
|
|
51
|
+
}
|
|
48
52
|
|
|
49
53
|
|
|
50
54
|
#ifdef __cplusplus
|
package/src/decoder_avx.cc
CHANGED
|
@@ -9,6 +9,7 @@ void decoder_set_avx_funcs() {
|
|
|
9
9
|
_do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_SSE4_POPCNT> >;
|
|
10
10
|
_do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_SSE4_POPCNT> >;
|
|
11
11
|
_do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_SSE4_POPCNT> >;
|
|
12
|
+
_decode_isa = ISA_LEVEL_AVX;
|
|
12
13
|
}
|
|
13
14
|
#else
|
|
14
15
|
void decoder_set_ssse3_funcs();
|
package/src/decoder_avx2.cc
CHANGED
|
@@ -9,6 +9,7 @@ void decoder_set_avx2_funcs() {
|
|
|
9
9
|
_do_decode = &do_decode_simd<false, false, sizeof(__m256i)*2, do_decode_avx2<false, false, ISA_LEVEL_AVX2> >;
|
|
10
10
|
_do_decode_raw = &do_decode_simd<true, false, sizeof(__m256i)*2, do_decode_avx2<true, false, ISA_LEVEL_AVX2> >;
|
|
11
11
|
_do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m256i)*2, do_decode_avx2<true, true, ISA_LEVEL_AVX2> >;
|
|
12
|
+
_decode_isa = ISA_LEVEL_AVX2;
|
|
12
13
|
}
|
|
13
14
|
#else
|
|
14
15
|
void decoder_set_avx_funcs();
|
package/src/decoder_avx2_base.h
CHANGED
|
@@ -67,6 +67,8 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* src, long& len, unsigned
|
|
|
67
67
|
);
|
|
68
68
|
}
|
|
69
69
|
|
|
70
|
+
decoder_set_nextMask<isRaw>(src, len, _nextMask); // set this before the loop because we can't check src after it's been overwritten
|
|
71
|
+
|
|
70
72
|
// for some reason, MSVC Win32 seems to crash when trying to compile _mm256_mask_cmpeq_epi8_mask
|
|
71
73
|
// the crash can be fixed by switching the order of the last two arguments, but it seems to generate wrong code
|
|
72
74
|
// so just disable the optimisation as it seems to be problematic there
|
|
@@ -320,6 +322,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* src, long& len, unsigned
|
|
|
320
322
|
// terminator found
|
|
321
323
|
// there's probably faster ways to do this, but reverting to scalar code should be good enough
|
|
322
324
|
len += (long)i;
|
|
325
|
+
_nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
|
|
323
326
|
break;
|
|
324
327
|
}
|
|
325
328
|
}
|
|
@@ -412,6 +415,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* src, long& len, unsigned
|
|
|
412
415
|
}
|
|
413
416
|
if(endFound) {
|
|
414
417
|
len += (long)i;
|
|
418
|
+
_nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
|
|
415
419
|
break;
|
|
416
420
|
}
|
|
417
421
|
}
|
|
@@ -613,20 +617,6 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* src, long& len, unsigned
|
|
|
613
617
|
}
|
|
614
618
|
}
|
|
615
619
|
_escFirst = (unsigned char)escFirst;
|
|
616
|
-
if(isRaw) {
|
|
617
|
-
// this would be the trivial solution, but requires the compiler holding onto minMask throughout the loop:
|
|
618
|
-
//_nextMask = ~(uint16_t)_mm256_movemask_epi8(_mm256_cmpeq_epi8(minMask, _mm256_set1_epi8('.')));
|
|
619
|
-
// instead, just scan the memory to determine what to set nextMask to
|
|
620
|
-
if(len != 0) { // have to gone through at least one loop cycle
|
|
621
|
-
if(src[i-2] == '\r' && src[i-1] == '\n' && src[i] == '.')
|
|
622
|
-
_nextMask = 1;
|
|
623
|
-
else if(src[i-1] == '\r' && src[i] == '\n' && src[i+1] == '.')
|
|
624
|
-
_nextMask = 2;
|
|
625
|
-
else
|
|
626
|
-
_nextMask = 0;
|
|
627
|
-
}
|
|
628
|
-
} else
|
|
629
|
-
_nextMask = 0;
|
|
630
620
|
_mm256_zeroupper();
|
|
631
621
|
}
|
|
632
622
|
#endif
|
package/src/decoder_common.h
CHANGED
|
@@ -509,4 +509,29 @@ static inline void decoder_init_lut(uint8_t* eqFixLUT, void* compactLUT) {
|
|
|
509
509
|
}
|
|
510
510
|
#endif
|
|
511
511
|
}
|
|
512
|
+
template<bool isRaw>
|
|
513
|
+
static inline void decoder_set_nextMask(const uint8_t* src, size_t len, uint16_t& nextMask) { // Sets nextMask from the bytes around src: src[-2], src[-1] before it and src[0], src[1] at/after it.
|
|
514
|
+
if(isRaw) {
|
|
515
|
+
if(len != 0) { // only inspect memory when there is data; with len == 0 nextMask is left unchanged
|
|
516
|
+
if(src[-2] == '\r' && src[-1] == '\n' && src[0] == '.') // "\r\n" ends right before src and src[0] is '.'
|
|
517
|
+
nextMask = 1;
|
|
518
|
+
else if(src[-1] == '\r' && src[0] == '\n' && src[1] == '.') // '\r' is right before src and "\n." sits at src[0..1]
|
|
519
|
+
nextMask = 2;
|
|
520
|
+
else
|
|
521
|
+
nextMask = 0;
|
|
522
|
+
}
|
|
523
|
+
} else
|
|
524
|
+
nextMask = 0; // non-raw mode always clears nextMask
|
|
525
|
+
}
|
|
512
526
|
|
|
527
|
+
// Variant without backtracking: reads only src[0] and src[1], never bytes before src, and returns the value instead of writing through a reference.
|
|
528
|
+
template<bool isRaw>
|
|
529
|
+
static inline uint16_t decoder_set_nextMask(const uint8_t* src, unsigned mask) {
|
|
530
|
+
if(isRaw) {
|
|
531
|
+
if(src[0] == '.') // a '.' at src[0] decides the result; src[1] is not examined in this case
|
|
532
|
+
return mask & 1; // keep only bit 0 of mask (result is 0 or 1)
|
|
533
|
+
if(src[1] == '.')
|
|
534
|
+
return mask & 2; // keep only bit 1 of mask (result is 0 or 2)
|
|
535
|
+
}
|
|
536
|
+
return 0; // non-raw mode, or no '.' at either position
|
|
537
|
+
}
|
package/src/decoder_neon.cc
CHANGED
|
@@ -78,6 +78,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
|
|
|
78
78
|
lfCompare = vsetq_lane_u8('.', lfCompare, 1);
|
|
79
79
|
}
|
|
80
80
|
#endif
|
|
81
|
+
|
|
82
|
+
decoder_set_nextMask<isRaw>(src, len, nextMask);
|
|
83
|
+
|
|
81
84
|
long i;
|
|
82
85
|
for(i = -len; i; i += sizeof(uint8x16_t)*2) {
|
|
83
86
|
uint8x16x2_t data = vld1q_u8_x2_align(src+i, 32);
|
|
@@ -251,6 +254,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
|
|
|
251
254
|
// terminator found
|
|
252
255
|
// there's probably faster ways to do this, but reverting to scalar code should be good enough
|
|
253
256
|
len += i;
|
|
257
|
+
nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
|
|
254
258
|
break;
|
|
255
259
|
}
|
|
256
260
|
}
|
|
@@ -301,6 +305,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
|
|
|
301
305
|
);
|
|
302
306
|
if(LIKELIHOOD(0.001, neon_vect_is_nonzero(matchEnd))) {
|
|
303
307
|
len += i;
|
|
308
|
+
nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
|
|
304
309
|
break;
|
|
305
310
|
}
|
|
306
311
|
}
|
|
@@ -449,18 +454,6 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
|
|
|
449
454
|
#endif
|
|
450
455
|
}
|
|
451
456
|
}
|
|
452
|
-
|
|
453
|
-
if(isRaw) {
|
|
454
|
-
if(len != 0) { // have to gone through at least one loop cycle
|
|
455
|
-
if(src[i-2] == '\r' && src[i-1] == '\n' && src[i] == '.')
|
|
456
|
-
nextMask = 1;
|
|
457
|
-
else if(src[i-1] == '\r' && src[i] == '\n' && src[i+1] == '.')
|
|
458
|
-
nextMask = 2;
|
|
459
|
-
else
|
|
460
|
-
nextMask = 0;
|
|
461
|
-
}
|
|
462
|
-
} else
|
|
463
|
-
nextMask = 0;
|
|
464
457
|
}
|
|
465
458
|
|
|
466
459
|
void decoder_set_neon_funcs() {
|
|
@@ -468,6 +461,7 @@ void decoder_set_neon_funcs() {
|
|
|
468
461
|
_do_decode = &do_decode_simd<false, false, sizeof(uint8x16_t)*2, do_decode_neon<false, false> >;
|
|
469
462
|
_do_decode_raw = &do_decode_simd<true, false, sizeof(uint8x16_t)*2, do_decode_neon<true, false> >;
|
|
470
463
|
_do_decode_end_raw = &do_decode_simd<true, true, sizeof(uint8x16_t)*2, do_decode_neon<true, true> >;
|
|
464
|
+
_decode_isa = ISA_LEVEL_NEON;
|
|
471
465
|
}
|
|
472
466
|
#else
|
|
473
467
|
void decoder_set_neon_funcs() {}
|
package/src/decoder_neon64.cc
CHANGED
|
@@ -56,6 +56,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
|
|
|
56
56
|
if(nextMask == 2)
|
|
57
57
|
nextMaskMix = vsetq_lane_u8(2, nextMaskMix, 1);
|
|
58
58
|
uint8x16_t yencOffset = escFirst ? vmakeq_u8(42+64,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42) : vdupq_n_u8(42);
|
|
59
|
+
|
|
60
|
+
decoder_set_nextMask<isRaw>(src, len, nextMask);
|
|
61
|
+
|
|
59
62
|
long i;
|
|
60
63
|
for(i = -len; i; i += sizeof(uint8x16_t)*4) {
|
|
61
64
|
uint8x16x4_t data = _vld1q_u8_x4(src+i);
|
|
@@ -227,6 +230,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
|
|
|
227
230
|
// terminator found
|
|
228
231
|
// there's probably faster ways to do this, but reverting to scalar code should be good enough
|
|
229
232
|
len += i;
|
|
233
|
+
nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
|
|
230
234
|
break;
|
|
231
235
|
}
|
|
232
236
|
}
|
|
@@ -275,6 +279,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
|
|
|
275
279
|
);
|
|
276
280
|
if(LIKELIHOOD(0.001, neon_vect_is_nonzero(matchEnd))) {
|
|
277
281
|
len += i;
|
|
282
|
+
nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
|
|
278
283
|
break;
|
|
279
284
|
}
|
|
280
285
|
}
|
|
@@ -430,17 +435,6 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
|
|
|
430
435
|
yencOffset = vdupq_n_u8(42);
|
|
431
436
|
}
|
|
432
437
|
}
|
|
433
|
-
if(isRaw) {
|
|
434
|
-
if(len != 0) { // have to gone through at least one loop cycle
|
|
435
|
-
if(src[i-2] == '\r' && src[i-1] == '\n' && src[i] == '.')
|
|
436
|
-
nextMask = 1;
|
|
437
|
-
else if(src[i-1] == '\r' && src[i] == '\n' && src[i+1] == '.')
|
|
438
|
-
nextMask = 2;
|
|
439
|
-
else
|
|
440
|
-
nextMask = 0;
|
|
441
|
-
}
|
|
442
|
-
} else
|
|
443
|
-
nextMask = 0;
|
|
444
438
|
}
|
|
445
439
|
|
|
446
440
|
void decoder_set_neon_funcs() {
|
|
@@ -448,6 +442,7 @@ void decoder_set_neon_funcs() {
|
|
|
448
442
|
_do_decode = &do_decode_simd<false, false, sizeof(uint8x16_t)*4, do_decode_neon<false, false> >;
|
|
449
443
|
_do_decode_raw = &do_decode_simd<true, false, sizeof(uint8x16_t)*4, do_decode_neon<true, false> >;
|
|
450
444
|
_do_decode_end_raw = &do_decode_simd<true, true, sizeof(uint8x16_t)*4, do_decode_neon<true, true> >;
|
|
445
|
+
_decode_isa = ISA_LEVEL_NEON;
|
|
451
446
|
}
|
|
452
447
|
#else
|
|
453
448
|
void decoder_set_neon_funcs() {}
|
package/src/decoder_sse2.cc
CHANGED
|
@@ -10,6 +10,7 @@ void decoder_set_sse2_funcs() {
|
|
|
10
10
|
_do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_SSE2> >;
|
|
11
11
|
_do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_SSE2> >;
|
|
12
12
|
_do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_SSE2> >;
|
|
13
|
+
_decode_isa = ISA_LEVEL_SSE2;
|
|
13
14
|
}
|
|
14
15
|
#else
|
|
15
16
|
void decoder_set_sse2_funcs() {}
|
package/src/decoder_sse_base.h
CHANGED
|
@@ -145,6 +145,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* src, long& len, unsigned
|
|
|
145
145
|
else
|
|
146
146
|
lfCompare = _mm_insert_epi16(lfCompare, _nextMask == 1 ? 0x0a2e /*".\n"*/ : 0x2e0a /*"\n."*/, 0);
|
|
147
147
|
}
|
|
148
|
+
|
|
149
|
+
decoder_set_nextMask<isRaw>(src, len, _nextMask); // set this before the loop because we can't check src after it's been overwritten
|
|
150
|
+
|
|
148
151
|
intptr_t i;
|
|
149
152
|
for(i = -len; i; i += sizeof(__m128i)*2) {
|
|
150
153
|
__m128i oDataA = _mm_load_si128((__m128i *)(src+i));
|
|
@@ -383,6 +386,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* src, long& len, unsigned
|
|
|
383
386
|
// terminator found
|
|
384
387
|
// there's probably faster ways to do this, but reverting to scalar code should be good enough
|
|
385
388
|
len += (long)i;
|
|
389
|
+
_nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
|
|
386
390
|
break;
|
|
387
391
|
}
|
|
388
392
|
}
|
|
@@ -492,6 +496,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* src, long& len, unsigned
|
|
|
492
496
|
|
|
493
497
|
if(endFound) {
|
|
494
498
|
len += (long)i;
|
|
499
|
+
_nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
|
|
495
500
|
break;
|
|
496
501
|
}
|
|
497
502
|
}
|
|
@@ -710,16 +715,5 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* src, long& len, unsigned
|
|
|
710
715
|
}
|
|
711
716
|
}
|
|
712
717
|
_escFirst = (unsigned char)escFirst;
|
|
713
|
-
if(isRaw) {
|
|
714
|
-
if(len != 0) { // have to gone through at least one loop cycle
|
|
715
|
-
if(src[i-2] == '\r' && src[i-1] == '\n' && src[i] == '.')
|
|
716
|
-
_nextMask = 1;
|
|
717
|
-
else if(src[i-1] == '\r' && src[i] == '\n' && src[i+1] == '.')
|
|
718
|
-
_nextMask = 2;
|
|
719
|
-
else
|
|
720
|
-
_nextMask = 0;
|
|
721
|
-
}
|
|
722
|
-
} else
|
|
723
|
-
_nextMask = 0;
|
|
724
718
|
}
|
|
725
719
|
#endif
|
package/src/decoder_ssse3.cc
CHANGED
|
@@ -9,6 +9,7 @@ void decoder_set_ssse3_funcs() {
|
|
|
9
9
|
_do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_SSSE3> >;
|
|
10
10
|
_do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_SSSE3> >;
|
|
11
11
|
_do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_SSSE3> >;
|
|
12
|
+
_decode_isa = ISA_LEVEL_SSSE3;
|
|
12
13
|
}
|
|
13
14
|
#else
|
|
14
15
|
void decoder_set_sse2_funcs();
|
package/src/decoder_vbmi2.cc
CHANGED
|
@@ -18,6 +18,7 @@ void decoder_set_vbmi2_funcs() {
|
|
|
18
18
|
_do_decode = &do_decode_simd<false, false, sizeof(__m256i)*2, do_decode_avx2<false, false, ISA_LEVEL_VBMI2> >;
|
|
19
19
|
_do_decode_raw = &do_decode_simd<true, false, sizeof(__m256i)*2, do_decode_avx2<true, false, ISA_LEVEL_VBMI2> >;
|
|
20
20
|
_do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m256i)*2, do_decode_avx2<true, true, ISA_LEVEL_VBMI2> >;
|
|
21
|
+
_decode_isa = ISA_LEVEL_VBMI2;
|
|
21
22
|
}
|
|
22
23
|
# else
|
|
23
24
|
# include "decoder_sse_base.h"
|
|
@@ -27,6 +28,7 @@ void decoder_set_vbmi2_funcs() {
|
|
|
27
28
|
_do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_VBMI2> >;
|
|
28
29
|
_do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_VBMI2> >;
|
|
29
30
|
_do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_VBMI2> >;
|
|
31
|
+
_decode_isa = ISA_LEVEL_VBMI2;
|
|
30
32
|
}
|
|
31
33
|
# endif
|
|
32
34
|
#else
|
package/src/encoder.cc
CHANGED
|
@@ -122,6 +122,7 @@ size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HED
|
|
|
122
122
|
|
|
123
123
|
extern "C" {
|
|
124
124
|
size_t (*_do_encode)(int, int*, const unsigned char* HEDLEY_RESTRICT, unsigned char* HEDLEY_RESTRICT, size_t, int) = &do_encode_generic;
|
|
125
|
+
int _encode_isa = ISA_GENERIC;
|
|
125
126
|
}
|
|
126
127
|
|
|
127
128
|
void encoder_sse2_init();
|
|
@@ -139,12 +140,14 @@ void encoder_rvv_init();
|
|
|
139
140
|
static inline void encoder_native_init() {
|
|
140
141
|
_do_encode = &do_encode_simd< do_encode_avx2<ISA_NATIVE> >;
|
|
141
142
|
encoder_avx2_lut<ISA_NATIVE>();
|
|
143
|
+
_encode_isa = ISA_NATIVE;
|
|
142
144
|
}
|
|
143
145
|
# else
|
|
144
146
|
# include "encoder_sse_base.h"
|
|
145
147
|
static inline void encoder_native_init() {
|
|
146
148
|
_do_encode = &do_encode_simd< do_encode_sse<ISA_NATIVE> >;
|
|
147
149
|
encoder_sse_lut<ISA_NATIVE>();
|
|
150
|
+
_encode_isa = ISA_NATIVE;
|
|
148
151
|
}
|
|
149
152
|
# endif
|
|
150
153
|
#endif
|
package/src/encoder.h
CHANGED
|
@@ -10,8 +10,12 @@ extern "C" {
|
|
|
10
10
|
#include "hedley.h"
|
|
11
11
|
|
|
12
12
|
extern size_t (*_do_encode)(int, int*, const unsigned char* HEDLEY_RESTRICT, unsigned char* HEDLEY_RESTRICT, size_t, int);
|
|
13
|
+
extern int _encode_isa;
|
|
13
14
|
#define do_encode (*_do_encode)
|
|
14
15
|
void encoder_init();
|
|
16
|
+
static inline int encode_isa_level() { // Returns the current value of the global _encode_isa.
|
|
17
|
+
return _encode_isa; // ISA_GENERIC unless one of the encoder init routines has replaced it
|
|
18
|
+
}
|
|
15
19
|
|
|
16
20
|
|
|
17
21
|
|
package/src/encoder_avx.cc
CHANGED
package/src/encoder_avx2.cc
CHANGED
package/src/encoder_neon.cc
CHANGED
|
@@ -520,6 +520,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const ui
|
|
|
520
520
|
|
|
521
521
|
void encoder_neon_init() {
|
|
522
522
|
_do_encode = &do_encode_simd<do_encode_neon>;
|
|
523
|
+
_encode_isa = ISA_LEVEL_NEON;
|
|
523
524
|
// generate shuf LUT
|
|
524
525
|
for(int i=0; i<256; i++) {
|
|
525
526
|
int k = i;
|
package/src/encoder_rvv.cc
CHANGED
package/src/encoder_sse2.cc
CHANGED
package/src/encoder_ssse3.cc
CHANGED
package/src/encoder_vbmi2.cc
CHANGED
|
@@ -14,12 +14,14 @@ const bool encoder_has_avx10 = false;
|
|
|
14
14
|
void encoder_vbmi2_init() {
|
|
15
15
|
_do_encode = &do_encode_simd< do_encode_avx2<ISA_LEVEL_VBMI2> >;
|
|
16
16
|
encoder_avx2_lut<ISA_LEVEL_VBMI2>();
|
|
17
|
+
_encode_isa = ISA_LEVEL_VBMI2;
|
|
17
18
|
}
|
|
18
19
|
# else
|
|
19
20
|
# include "encoder_sse_base.h"
|
|
20
21
|
void encoder_vbmi2_init() {
|
|
21
22
|
_do_encode = &do_encode_simd< do_encode_sse<ISA_LEVEL_VBMI2> >;
|
|
22
23
|
encoder_sse_lut<ISA_LEVEL_VBMI2>();
|
|
24
|
+
_encode_isa = ISA_LEVEL_VBMI2;
|
|
23
25
|
}
|
|
24
26
|
# endif
|
|
25
27
|
#else
|
package/test/testcrc.js
CHANGED
|
@@ -52,11 +52,11 @@ doTest('Random Continue', 'crc32', ['KZSHZ5EDOVAmDdakZZOrGSUGGKSpCJoWH7M0MHy6ohn
|
|
|
52
52
|
|
|
53
53
|
// random tests
|
|
54
54
|
for(var i=1; i<128; i++) { // one random buffer for every length from 1 to 127 bytes
|
|
55
|
-
var rand = require('crypto').
|
|
55
|
+
var rand = Buffer(require('crypto').randomBytes(i)); // Bun needs explicit Buffer for pseudoRandomBytes; NOTE(review): this line calls randomBytes, confirm the wrapper is still required
|
|
56
56
|
doTest('Random Short Buffer', 'crc32', rand);
|
|
57
57
|
}
|
|
58
58
|
for(var i=0; i<32; i++) {
|
|
59
|
-
var rand = require('crypto').
|
|
59
|
+
var rand = Buffer(require('crypto').randomBytes(100000));
|
|
60
60
|
doTest('Random Buffer', 'crc32', rand);
|
|
61
61
|
|
|
62
62
|
var split = Math.random()*rand.length;
|
package/test/testdec.js
CHANGED
|
@@ -177,7 +177,7 @@ doTest('Extra null issue', toBuffer('2e900a4fb6054c9126171cdc196dc41237bb1b76da9
|
|
|
177
177
|
|
|
178
178
|
// random tests
|
|
179
179
|
for(var i=0; i<32; i++) {
|
|
180
|
-
var rand = require('crypto').
|
|
180
|
+
var rand = require('crypto').randomBytes(128*1024);
|
|
181
181
|
doTest('Random', rand);
|
|
182
182
|
}
|
|
183
183
|
|
package/test/testenc.js
CHANGED
|
@@ -141,7 +141,7 @@ padding.fill(97); // 'a'
|
|
|
141
141
|
|
|
142
142
|
// random tests
|
|
143
143
|
for(var i=0; i<32; i++) {
|
|
144
|
-
var rand = require('crypto').
|
|
144
|
+
var rand = require('crypto').randomBytes(4*1024);
|
|
145
145
|
runLineSizes(function(ls, offs) {
|
|
146
146
|
doTest('Random [ls='+ls+', offs='+offs+']', [rand, ls, offs]);
|
|
147
147
|
});
|