yencode 1.1.1 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.gyp +6 -6
- package/crcutil-1.0/code/multiword_64_64_intrinsic_i386_mmx.cc +1 -1
- package/package.json +1 -1
- package/src/common.h +74 -12
- package/src/crc.cc +50 -24
- package/src/crc.h +20 -6
- package/src/crc_arm.cc +121 -23
- package/src/crc_common.h +3 -10
- package/src/{crc_folding.c → crc_folding.cc} +40 -74
- package/src/decoder.cc +6 -3
- package/src/decoder.h +16 -2
- package/src/decoder_avx2_base.h +12 -12
- package/src/decoder_common.h +2 -2
- package/src/decoder_neon.cc +34 -34
- package/src/decoder_neon64.cc +36 -34
- package/src/decoder_sse_base.h +5 -5
- package/src/encoder.cc +5 -2
- package/src/encoder.h +17 -1
- package/src/encoder_avx_base.h +6 -6
- package/src/encoder_common.h +3 -3
- package/src/encoder_neon.cc +30 -30
- package/src/encoder_sse_base.h +3 -3
- package/src/platform.cc +34 -6
- package/src/yencode.cc +33 -44
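
One structural change worth noting before the hunks: the CLMUL CRC path now threads the running CRC around as a plain uint32_t instead of a packed 4-byte buffer (the old PACK_4/UNPACK_4 dance visible in the crc_folding hunks below). A minimal sketch of that dispatch shape, assuming crc_func is a function-pointer typedef matching the new do_crc32_incremental_clmul signature — the real typedef lives in package/src/crc.h, which this diff does not show:

    #include <cstddef>
    #include <cstdint>

    // Assumed shape of crc_func, inferred from do_crc32_incremental_clmul() and
    // crc_clmul_set_funcs() in the hunks below; the actual typedef is in crc.h.
    typedef uint32_t (*crc_func)(const void* data, size_t length, uint32_t init);

    // Illustrative scalar stand-in; a real build registers the table-driven CRC here.
    static uint32_t crc32_scalar_stub(const void* data, size_t length, uint32_t init) {
        (void)data; (void)length;
        return init;
    }

    static crc_func _do_crc32_incremental = &crc32_scalar_stub;

    // Callers now pass the CRC state through as an integer rather than unpacking
    // it into an unsigned char[4] after every chunk.
    uint32_t crc32_incremental(const void* data, size_t length, uint32_t state) {
        return _do_crc32_incremental(data, length, state);
    }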
@@ -19,44 +19,29 @@
 
 #include "crc_common.h"
 
-#if
-# include <stdint.h>
-#else
-/* Workaround for older MSVC not supporting stdint.h - just pull it from V8 */
-# include <v8.h>
-#endif
-
-#if (defined(__PCLMUL__) && defined(__SSSE3__) && defined(__SSE4_1__)) || (defined(_MSC_VER) && _MSC_VER >= 1600)
+#if (defined(__PCLMUL__) && defined(__SSSE3__) && defined(__SSE4_1__)) || (defined(_MSC_VER) && _MSC_VER >= 1600 && defined(PLATFORM_X86))
 #include <inttypes.h>
 #include <immintrin.h>
 #include <wmmintrin.h>
 
-#define local static
 
-#
-# define
-/* Because we don't have dynamic dispatch for AVX, disable it for MSVC builds (only use AVX for -march=native style builds) */
-# undef __AVX__
-# undef __AVX512F__
-# undef __AVX512VL__
-# undef __GFNI__
-#else
-# define ALIGN(_a, v) v __attribute__((aligned(_a)))
+#if defined(__AVX512VL__) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
+# define ENABLE_AVX512 1
 #endif
 
 
 // interestingly, MSVC seems to generate better code if using VXORPS over VPXOR
 // original Intel code uses XORPS for many XOR operations, but PXOR is pretty much always better (more port freedom on Intel CPUs). The only advantage of XORPS is that it's 1 byte shorter, an advantage which disappears under AVX as both instructions have the same length
-#
+#if defined(__AVX__) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
 # define fold_xor _mm_xor_si128
 #else
-
+static __m128i fold_xor(__m128i a, __m128i b) {
 return _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b)));
 }
 #endif
 
-#ifdef
-
+#ifdef ENABLE_AVX512
+static __m128i do_one_fold_merge(__m128i src, __m128i data) {
 const __m128i xmm_fold4 = _mm_set_epi32(
 0x00000001, 0x54442bd4,
 0x00000001, 0xc6e41596);
@@ -68,7 +53,7 @@ local __m128i do_one_fold_merge(__m128i src, __m128i data) {
 );
 }
 #else
-
+static __m128i do_one_fold(__m128i src) {
 const __m128i xmm_fold4 = _mm_set_epi32(
 0x00000001, 0x54442bd4,
 0x00000001, 0xc6e41596);
@@ -79,7 +64,7 @@ local __m128i do_one_fold(__m128i src) {
 }
 #endif
 
-
+ALIGN_TO(32, static const unsigned pshufb_shf_table[60]) = {
 0x84838281, 0x88878685, 0x8c8b8a89, 0x008f8e8d, /* shl 15 (16 - 1)/shr1 */
 0x85848382, 0x89888786, 0x8d8c8b8a, 0x01008f8e, /* shl 14 (16 - 3)/shr2 */
 0x86858483, 0x8a898887, 0x8e8d8c8b, 0x0201008f, /* shl 13 (16 - 4)/shr3 */
@@ -97,7 +82,7 @@ ALIGN(32, local const unsigned pshufb_shf_table[60]) = {
 0x0201008f, 0x06050403, 0x0a090807, 0x0e0d0c0b /* shl 1 (16 -15)/shr15*/
 };
 
-
+static void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1,
 __m128i *xmm_crc2, __m128i *xmm_crc3, __m128i *xmm_crc_part) {
 
 const __m128i xmm_mask3 = _mm_set1_epi32(0x80808080);
@@ -127,7 +112,7 @@ local void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1,
 *xmm_crc_part = _mm_shuffle_epi8(*xmm_crc_part, xmm_shl);
 *xmm_crc3 = _mm_or_si128(*xmm_crc3, *xmm_crc_part);
 
-#ifdef
+#ifdef ENABLE_AVX512
 *xmm_crc3 = do_one_fold_merge(xmm_a0_0, *xmm_crc3);
 #else
 *xmm_crc3 = fold_xor(
@@ -137,25 +122,21 @@ local void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1,
 #endif
 }
 
-
+ALIGN_TO(16, static const unsigned crc_k[]) = {
 0xccaa009e, 0x00000000, /* rk1 */
 0x751997d0, 0x00000001, /* rk2 */
 0xccaa009e, 0x00000000, /* rk5 */
 0x63cd6124, 0x00000001, /* rk6 */
-
+0xf7011641, 0x00000000, /* rk7 */
 0xdb710640, 0x00000001 /* rk8 */
 };
 
-
-0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000
-};
-
-ALIGN(16, local const unsigned crc_mask2[4]) = {
+ALIGN_TO(16, static const unsigned crc_mask[4]) = {
 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
 };
 
-
-#
+static __m128i reverse_bits_epi8(__m128i src) {
+#if defined(__GFNI__) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
 return _mm_gf2p8affine_epi64_epi8(src, _mm_set_epi32(
 0x80402010, 0x08040201,
 0x80402010, 0x08040201
@@ -164,7 +145,8 @@ local __m128i reverse_bits_epi8(__m128i src) {
 __m128i xmm_t0 = _mm_and_si128(src, _mm_set1_epi8(0x0f));
 __m128i xmm_t1 = _mm_and_si128(_mm_srli_epi16(src, 4), _mm_set1_epi8(0x0f));
 xmm_t0 = _mm_shuffle_epi8(_mm_set_epi8(
-
+-16, 112, -80, 48, -48, 80, -112, 16, -32, 96, -96, 32, -64, 64, -128, 0
+//0xf0, 0x70, 0xb0, 0x30, 0xd0, 0x50, 0x90, 0x10, 0xe0, 0x60, 0xa0, 0x20, 0xc0, 0x40, 0x80, 0
 ), xmm_t0);
 xmm_t1 = _mm_shuffle_epi8(_mm_set_epi8(
 15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0
@@ -181,7 +163,7 @@ local __m128i reverse_bits_epi8(__m128i src) {
 # define BSWAP32(n) ((((n)&0xff)<<24) | (((n)&0xff00)<<8) | (((n)&0xff0000)>>8) | (((n)&0xff000000)>>24))
 #endif
 
-
+static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
 unsigned long algn_diff;
 __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
 
@@ -235,7 +217,7 @@ local uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
 xmm_t2 = _mm_load_si128((__m128i *)src + 2);
 xmm_t3 = _mm_load_si128((__m128i *)src + 3);
 
-#ifdef
+#ifdef ENABLE_AVX512
 xmm_crc0 = do_one_fold_merge(xmm_crc0, xmm_t0);
 xmm_crc1 = do_one_fold_merge(xmm_crc1, xmm_t1);
 xmm_crc2 = do_one_fold_merge(xmm_crc2, xmm_t2);
@@ -266,7 +248,7 @@ local uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
 xmm_t2 = _mm_load_si128((__m128i *)src + 2);
 
 xmm_t3 = xmm_crc3;
-#ifdef
+#ifdef ENABLE_AVX512
 xmm_crc3 = do_one_fold_merge(xmm_crc2, xmm_t2);
 xmm_crc2 = do_one_fold_merge(xmm_crc1, xmm_t1);
 xmm_crc1 = do_one_fold_merge(xmm_crc0, xmm_t0);
@@ -292,7 +274,7 @@ local uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
 
 xmm_t2 = xmm_crc2;
 xmm_t3 = xmm_crc3;
-#ifdef
+#ifdef ENABLE_AVX512
 xmm_crc3 = do_one_fold_merge(xmm_crc1, xmm_t1);
 xmm_crc2 = do_one_fold_merge(xmm_crc0, xmm_t0);
 #else
@@ -314,7 +296,7 @@ local uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
 xmm_t0 = _mm_load_si128((__m128i *)src);
 
 xmm_t3 = xmm_crc3;
-#ifdef
+#ifdef ENABLE_AVX512
 xmm_crc3 = do_one_fold_merge(xmm_crc0, xmm_t0);
 #else
 xmm_crc3 = _mm_xor_si128(do_one_fold(xmm_crc0), xmm_t0);
@@ -339,8 +321,7 @@ partial:
 &xmm_crc_part);
 done:
 {
-const __m128i xmm_mask
-const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
+const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask);
 __m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;
 
 /*
@@ -350,7 +331,7 @@ done:
 
 x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
 xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
-#ifdef
+#ifdef ENABLE_AVX512
 xmm_crc1 = _mm_ternarylogic_epi32(xmm_crc1, x_tmp0, xmm_crc0, 0x96);
 #else
 xmm_crc1 = _mm_xor_si128(xmm_crc1, x_tmp0);
@@ -359,7 +340,7 @@ done:
 
 x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10);
 xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01);
-#ifdef
+#ifdef ENABLE_AVX512
 xmm_crc2 = _mm_ternarylogic_epi32(xmm_crc2, x_tmp1, xmm_crc1, 0x96);
 #else
 xmm_crc2 = _mm_xor_si128(xmm_crc2, x_tmp1);
@@ -368,7 +349,7 @@ done:
 
 x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10);
 xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01);
-#ifdef
+#ifdef ENABLE_AVX512
 xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, x_tmp2, xmm_crc2, 0x96);
 #else
 xmm_crc3 = _mm_xor_si128(xmm_crc3, x_tmp2);
@@ -388,58 +369,43 @@ done:
 xmm_crc0 = xmm_crc3;
 xmm_crc3 = _mm_slli_si128(xmm_crc3, 4);
 xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
-#ifdef
+#ifdef ENABLE_AVX512
 //xmm_crc3 = _mm_maskz_xor_epi32(14, xmm_crc3, xmm_crc0);
-xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, xmm_crc0,
+xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, xmm_crc0, xmm_mask, 0x28);
 #else
+xmm_crc0 = _mm_and_si128(xmm_crc0, xmm_mask);
 xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
-xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask2);
 #endif
 
 /*
 * k7
 */
 xmm_crc1 = xmm_crc3;
-xmm_crc2 = xmm_crc3;
 crc_fold = _mm_load_si128((__m128i *)crc_k + 2);
 
 xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
-#ifdef __AVX512VL__
-//xmm_crc3 = _mm_maskz_xor_epi32(3, xmm_crc3, xmm_crc2);
-xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, xmm_crc2, xmm_mask, 0x28);
-#else
-xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
-xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask);
-#endif
-
-xmm_crc2 = xmm_crc3;
 xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
-#ifdef
-xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3,
-return _mm_extract_epi32(xmm_crc3, 2);
+#ifdef ENABLE_AVX512
+xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, xmm_crc1, xmm_crc1, 0xC3); // NOT(xmm_crc3 ^ xmm_crc1)
 #else
-
+xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_mask);
 xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);
-return ~_mm_extract_epi32(xmm_crc3, 2);
 #endif
+return _mm_extract_epi32(xmm_crc3, 2);
 }
 
 }
 
-static
-
-UNPACK_4(out, tmp);
-}
-static void do_crc32_incremental_clmul(const void* data, size_t length, unsigned char init[4]) {
-uint32_t tmp = crc_fold((const unsigned char*)data, (long)length, PACK_4(init));
-UNPACK_4(init, tmp);
+static uint32_t do_crc32_incremental_clmul(const void* data, size_t length, uint32_t init) {
+return crc_fold((const unsigned char*)data, (long)length, init);
 }
 
-void crc_clmul_set_funcs(crc_func*
-*_do_crc32 = &do_crc32_clmul;
+void crc_clmul_set_funcs(crc_func* _do_crc32_incremental) {
 *_do_crc32_incremental = &do_crc32_incremental_clmul;
 }
 #else
-void crc_clmul_set_funcs(crc_func*
+void crc_clmul_set_funcs(crc_func* _do_crc32_incremental) {
+(void)_do_crc32_incremental;
+}
 #endif
 
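
A recurring pattern in the CRC folding hunks above: wherever ENABLE_AVX512 is defined, an XOR pair (or an XOR plus AND) collapses into one _mm_ternarylogic_epi32. The immediate is an 8-bit truth table over the three inputs; 0x96 is the three-way XOR, and with the second and third operands aliased, 0xC3 is NOT(a ^ b), which is how the final CRC inversion gets folded in. A small scalar model of that truth-table evaluation (illustrative only, not the package's code):

    #include <cassert>
    #include <cstdint>

    // Scalar model of VPTERNLOG: for each bit position, the result bit is
    // imm8[(a<<2) | (b<<1) | c], evaluated per bit.
    static uint32_t ternlog32(uint32_t a, uint32_t b, uint32_t c, uint8_t imm) {
        uint32_t r = 0;
        for (int bit = 0; bit < 32; bit++) {
            unsigned idx = (((a >> bit) & 1u) << 2) | (((b >> bit) & 1u) << 1) | ((c >> bit) & 1u);
            r |= (uint32_t)((imm >> idx) & 1u) << bit;
        }
        return r;
    }

    int main() {
        uint32_t a = 0xccaa009e, b = 0x751997d0, c = 0x63cd6124; // arbitrary test values
        // 0x96 is the truth table of a ^ b ^ c: one ternary-logic op replaces two XORs.
        assert(ternlog32(a, b, c, 0x96) == (a ^ b ^ c));
        // 0xC3 with the last two operands aliased computes ~(a ^ b), matching the
        // "NOT(xmm_crc3 ^ xmm_crc1)" comment in the final fold step above.
        assert(ternlog32(a, b, b, 0xC3) == ~(a ^ b));
        return 0;
    }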
package/src/decoder.cc CHANGED
@@ -1,10 +1,13 @@
 #include "common.h"
 
 #include "decoder_common.h"
+#include "decoder.h"
 
-
-YencDecoderEnd (*
-YencDecoderEnd (*
+extern "C" {
+YencDecoderEnd (*_do_decode)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*) = &do_decode_scalar<false, false>;
+YencDecoderEnd (*_do_decode_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*) = &do_decode_scalar<true, false>;
+YencDecoderEnd (*_do_decode_end_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*) = &do_decode_end_scalar<true>;
+}
 
 void decoder_set_sse2_funcs();
 void decoder_set_ssse3_funcs();
package/src/decoder.h CHANGED
@@ -1,3 +1,11 @@
+#ifndef __YENC_DECODER_H
+#define __YENC_DECODER_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
 
 // the last state that the decoder was in (i.e. last few characters processed)
 // the state is needed for incremental decoders as its behavior is affected by what it processed last
@@ -25,8 +33,7 @@ extern YencDecoderEnd (*_do_decode)(const unsigned char*HEDLEY_RESTRICT*, unsign
 extern YencDecoderEnd (*_do_decode_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*);
 extern YencDecoderEnd (*_do_decode_end_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*);
 
-
-static inline size_t do_decode(const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, YencDecoderState* state) {
+static inline size_t do_decode(int isRaw, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, YencDecoderState* state) {
 unsigned char* ds = dest;
 (*(isRaw ? _do_decode_raw : _do_decode))(&src, &ds, len, state);
 return ds - dest;
@@ -37,3 +44,10 @@ static inline YencDecoderEnd do_decode_end(const unsigned char*HEDLEY_RESTRICT*
 }
 
 void decoder_init();
+
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif
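
With decoder.h now guarded for C linkage and do_decode() taking isRaw as an ordinary argument, a caller can be sketched as below. The initial-state enumerator used here (YDEC_STATE_CRLF, i.e. "just saw a line break") is an assumption — the full YencDecoderState enum sits in the part of decoder.h this diff does not show:

    // Hypothetical caller; do_decode(), decoder_init() and YencDecoderState come from
    // decoder.h as shown above. decoder_init() must have been called once beforehand
    // so the fastest SIMD kernel is selected.
    #include <vector>
    #include <cstddef>
    #include "common.h"    // brings in the HEDLEY_* macros, as decoder.cc does
    #include "decoder.h"

    std::vector<unsigned char> decode_chunk(const unsigned char* src, size_t len) {
        std::vector<unsigned char> out(len);        // yEnc decoding never grows the data
        YencDecoderState state = YDEC_STATE_CRLF;   // assumed "start of line" state
        size_t used = do_decode(1 /* isRaw: dot-stuffed NNTP article data */,
                                src, out.data(), len, &state);
        out.resize(used);
        return out;
    }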
package/src/decoder_avx2_base.h CHANGED
@@ -30,13 +30,17 @@ static HEDLEY_ALWAYS_INLINE __m256i force_align_read_256(const void* p) {
 }
 
 // _mm256_castsi128_si256, but upper is defined to be 0
-#if defined(__clang__) && __clang_major__ >= 5
+#if (defined(__clang__) && __clang_major__ >= 5 && (!defined(__APPLE__) || __clang_major__ >= 7)) || (defined(__GNUC__) && __GNUC__ >= 10)
 // intrinsic unsupported in GCC 9 and MSVC < 2017
 # define zext128_256 _mm256_zextsi128_si256
 #else
 // technically a cast is incorrect, due to upper 128 bits being undefined, but should usually work fine
 // alternative may be `_mm256_set_m128i(_mm_setzero_si128(), v)` but unsupported on GCC < 7, and most compilers generate a VINSERTF128 instruction for it
-#
+# ifdef __OPTIMIZE__
+# define zext128_256 _mm256_castsi128_si256
+# else
+# define zext128_256(x) _mm256_inserti128_si256(_mm256_setzero_si256(), x, 0)
+# endif
 #endif
 
 
@@ -298,7 +302,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 if(LIKELIHOOD(0.002, matchEnd)) {
 // terminator found
 // there's probably faster ways to do this, but reverting to scalar code should be good enough
-len += i;
+len += (long)i;
 break;
 }
 }
@@ -390,7 +394,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 ));
 }
 if(endFound) {
-len += i;
+len += (long)i;
 break;
 }
 }
@@ -489,14 +493,10 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 #endif
 {
 // << 1 byte
-cmpEqB = _mm256_cmpeq_epi8(_mm256_set1_epi8('='), _mm256_loadu_si256((__m256i *)(src+i-1) + 1));
-#if defined(__tune_znver1__) || defined(__tune_bdver4__)
 cmpEqA = _mm256_alignr_epi8(cmpEqA, _mm256_inserti128_si256(
-
+_mm256_set1_epi8('='), _mm256_castsi256_si128(cmpEqA), 1
 ), 15);
-
-cmpEqA = _mm256_alignr_epi8(cmpEqA, _mm256_permute2x128_si256(cmpEqA, cmpEqA, 0x08), 15);
-#endif
+cmpEqB = _mm256_cmpeq_epi8(_mm256_set1_epi8('='), _mm256_loadu_si256((__m256i *)(src+i-1) + 1));
 dataA = _mm256_add_epi8(
 oDataA,
 _mm256_blendv_epi8(
@@ -523,7 +523,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 #endif
 {
 yencOffset = _mm256_xor_si256(_mm256_set1_epi8(-42), zext128_256(
-_mm_slli_epi16(_mm_cvtsi32_si128(escFirst), 6)
+_mm_slli_epi16(_mm_cvtsi32_si128((int)escFirst), 6)
 ));
 }
 
@@ -565,7 +565,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 p -= popcnt32(mask & 0xffff0);
 
 _mm_storeu_si128((__m128i*)(p + XMM_SIZE*3), _mm256_extracti128_si256(dataB, 1));
-p -= popcnt32(mask >> 20);
+p -= popcnt32((unsigned int)(mask >> 20));
 #else
 mask >>= 32;
 shuf = _mm256_inserti128_si256(
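
The zext128_256 rework above is all about guaranteeing zeroed upper bits: _mm256_zextsi128_si256 documents that guarantee but is missing on older GCC/MSVC and older Apple clang, a plain cast leaves the upper 128 bits formally undefined (though optimised VEX codegen normally zeroes them), and the insert form makes the zeroing explicit for unoptimised builds. The three spellings side by side, as a stand-alone sketch (function names are local to the sketch, not the package's macro):

    #include <immintrin.h>   // compile with AVX2 enabled, e.g. -mavx2

    // 1. Dedicated intrinsic: upper 128 bits guaranteed zero, limited compiler support.
    static inline __m256i zext_via_intrinsic(__m128i v) {
        return _mm256_zextsi128_si256(v);
    }

    // 2. Cast: upper 128 bits formally undefined, but VEX-encoded 128-bit ops zero
    //    them in practice, which is what the optimised fallback relies on.
    static inline __m256i zext_via_cast(__m128i v) {
        return _mm256_castsi128_si256(v);
    }

    // 3. Explicit insert into a zero vector: always correct, may cost a VINSERTI128.
    static inline __m256i zext_via_insert(__m128i v) {
        return _mm256_inserti128_si256(_mm256_setzero_si256(), v, 0);
    }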
package/src/decoder_common.h CHANGED
@@ -340,7 +340,7 @@ YencDecoderEnd do_decode_simd(const unsigned char* HEDLEY_RESTRICT* src, unsigne
 if((uintptr_t)(*src) & ((width-1))) {
 // find source memory alignment
 unsigned char* aSrc = (unsigned char*)(((uintptr_t)(*src) + (width-1)) & ~(width-1));
-int amount = aSrc - *src;
+int amount = (int)(aSrc - *src);
 len -= amount;
 YencDecoderEnd ended = do_decode_scalar<isRaw, searchEnd>(src, dest, amount, pState);
 if(ended) return ended;
@@ -427,7 +427,7 @@ YencDecoderEnd do_decode_simd(const unsigned char* HEDLEY_RESTRICT* src, unsigne
 escFirst = (*pState == YDEC_STATE_EQ || *pState == YDEC_STATE_CRLFEQ);
 
 // our algorithm may perform an aligned load on the next part, of which we consider 2 bytes (for \r\n. sequence checking)
-long dLen = len - lenBuffer;
+long dLen = (long)(len - lenBuffer);
 dLen = (dLen + (width-1)) & ~(width-1);
 
 kernel((const uint8_t*)(*src) + dLen, dLen, p, escFirst, nextMask);
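
The two casts added above only silence integer-narrowing warnings; the surrounding logic is the standard power-of-two round-up used to find the first SIMD-aligned source byte. A tiny stand-alone check of that expression with concrete numbers:

    #include <cassert>
    #include <cstdint>

    // Round an address up to the next multiple of `width` (width must be a power of
    // two) -- the same expression decoder_common.h uses to locate the aligned start.
    static uintptr_t round_up(uintptr_t addr, uintptr_t width) {
        return (addr + (width - 1)) & ~(width - 1);
    }

    int main() {
        // With a 32-byte SIMD width: 0x1005 is 5 bytes past a boundary, so the next
        // aligned address is 0x1020 and 27 bytes are handled by the scalar prologue.
        assert(round_up(0x1005, 32) == 0x1020);
        assert(round_up(0x1020, 32) == 0x1020);      // already aligned: unchanged
        assert(round_up(0x1005, 32) - 0x1005 == 27); // bytes fed to do_decode_scalar
        return 0;
    }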
package/src/decoder_neon.cc CHANGED
@@ -7,9 +7,9 @@
 #include "decoder_common.h"
 
 
-#
-# define vld1_u8_align vld1_u8_ex
-# define vld1q_u8_align vld1q_u8_ex
+#if defined(_MSC_VER) && !defined(__clang__)
+# define vld1_u8_align(p, a) vld1_u8_ex(p, a*8)
+# define vld1q_u8_align(p, a) vld1q_u8_ex(p, a*8)
 #elif defined(__GNUC__)
 # define vld1_u8_align(p, n) vld1_u8((uint8_t*)__builtin_assume_aligned(p, n))
 # define vld1q_u8_align(p, n) vld1q_u8((uint8_t*)__builtin_assume_aligned(p, n))
@@ -23,15 +23,13 @@
 #if defined(__clang__) || (defined(__GNUC__) && (defined(__aarch64__) && __GNUC__ >= 8))
 # define vld1q_u8_x2_align(p, n) vld1q_u8_x2((uint8_t*)__builtin_assume_aligned(p, n))
 #else
-
-return (uint8x16x2_t){vld1q_u8_align(p, n), vld1q_u8_align(p+16, n)};
-}
+# define vld1q_u8_x2_align(p, n) vcreate2_u8(vld1q_u8_align(p, (n)/2), vld1q_u8_align((p)+16, (n)/2))
 #endif
 // Clang wrongly assumes alignment on vld1q_u8_x2, and ARMv7 GCC doesn't support the function, so effectively, it can only be used in ARMv8 compilers
 #if defined(__aarch64__) && (defined(__clang__) || (defined(__GNUC__) && __GNUC__ >= 9))
 # define vst1q_u8_x2_unaligned vst1q_u8_x2
 #else
-HEDLEY_ALWAYS_INLINE void vst1q_u8_x2_unaligned(uint8_t* p, uint8x16x2_t data) {
+static HEDLEY_ALWAYS_INLINE void vst1q_u8_x2_unaligned(uint8_t* p, uint8x16x2_t data) {
 vst1q_u8(p, data.val[0]);
 vst1q_u8(p+16, data.val[1]);
 }
@@ -64,18 +62,20 @@ template<bool isRaw, bool searchEnd>
 HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, long& len, unsigned char* HEDLEY_RESTRICT & p, unsigned char& escFirst, uint16_t& nextMask) {
 HEDLEY_ASSUME(escFirst == 0 || escFirst == 1);
 HEDLEY_ASSUME(nextMask == 0 || nextMask == 1 || nextMask == 2);
-uint8x16_t yencOffset = escFirst ? (
+uint8x16_t yencOffset = escFirst ? vmakeq_u8(42+64,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42) : vdupq_n_u8(42);
 #ifdef __aarch64__
 uint8x16_t nextMaskMix = vdupq_n_u8(0);
-if(nextMask)
-nextMaskMix
+if(nextMask == 1)
+nextMaskMix = vsetq_lane_u8(1, nextMaskMix, 0);
+if(nextMask == 2)
+nextMaskMix = vsetq_lane_u8(2, nextMaskMix, 1);
 #else
 uint8x16_t lfCompare = vdupq_n_u8('\n');
 if(isRaw) {
 if(nextMask == 1)
-lfCompare
+lfCompare = vsetq_lane_u8('.', lfCompare, 0);
 if(nextMask == 2)
-lfCompare
+lfCompare = vsetq_lane_u8('.', lfCompare, 1);
 }
 #endif
 long i;
@@ -90,13 +90,13 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 #ifdef __aarch64__
 cmpA = vqtbx1q_u8(
 cmpEqA,
-//
-(
+// \n \r
+vmakeq_u8(0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
 dataA
 ),
 cmpB = vqtbx1q_u8(
 cmpEqB,
-(
+vmakeq_u8(0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
 dataB
 );
 if(isRaw) cmpA = vorrq_u8(cmpA, nextMaskMix);
@@ -122,12 +122,12 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 
 #ifdef __aarch64__
 if (LIKELIHOOD(0.42 /*guess*/, neon_vect_is_nonzero(vorrq_u8(cmpA, cmpB)))) {
-cmpA = vandq_u8(cmpA, (
-cmpB = vandq_u8(cmpB, (
+cmpA = vandq_u8(cmpA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+cmpB = vandq_u8(cmpB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
 uint8x16_t cmpMerge = vpaddq_u8(cmpA, cmpB);
 uint8x16_t cmpEqMerge = vpaddq_u8(
-vandq_u8(cmpEqA, (
-vandq_u8(cmpEqB, (
+vandq_u8(cmpEqA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
+vandq_u8(cmpEqB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128))
 );
 
 uint8x16_t cmpCombined = vpaddq_u8(cmpMerge, cmpEqMerge);
@@ -136,8 +136,8 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 uint32_t mask = vgetq_lane_u32(vreinterpretq_u32_u8(cmpCombined), 0);
 uint32_t maskEq = vgetq_lane_u32(vreinterpretq_u32_u8(cmpCombined), 1);
 #else
-cmpA = vandq_u8(cmpA, (
-cmpB = vandq_u8(cmpB, (
+cmpA = vandq_u8(cmpA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+cmpB = vandq_u8(cmpB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
 // no vpaddq_u8 in ARMv7, so need extra 64-bit VPADD
 uint8x8_t cmpPacked = vpadd_u8(
 vpadd_u8(
@@ -150,8 +150,8 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 cmpPacked = vpadd_u8(cmpPacked, cmpPacked);
 uint32_t mask = vget_lane_u32(vreinterpret_u32_u8(cmpPacked), 0);
 if(LIKELIHOOD(0.42, mask != 0)) {
-uint8x16_t cmpEqMaskedA = vandq_u8(cmpEqA, (
-uint8x16_t cmpEqMaskedB = vandq_u8(cmpEqB, (
+uint8x16_t cmpEqMaskedA = vandq_u8(cmpEqA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+uint8x16_t cmpEqMaskedB = vandq_u8(cmpEqB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
 uint8x8_t cmpEqPacked = vpadd_u8(
 vpadd_u8(
 vget_low_u8(cmpEqMaskedA), vget_high_u8(cmpEqMaskedA)
@@ -170,7 +170,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 // vext seems to be a cheap operation on ARM, relative to loads, so only avoid it if there's only one load (isRaw only)
 uint8x16_t tmpData2, nextData;
 if(isRaw && !searchEnd) {
-tmpData2 =
+tmpData2 = vld1q_u8(src+i + 2 + sizeof(uint8x16_t));
 } else {
 nextData = vld1q_u8_align(src+i + sizeof(uint8x16_t)*2, 16); // only 32-bits needed, but there doesn't appear a nice way to do this via intrinsics: https://stackoverflow.com/questions/46910799/arm-neon-intrinsics-convert-d-64-bit-register-to-low-half-of-q-128-bit-regis
 tmpData2 = vextq_u8(dataB, nextData, 2);
@@ -255,15 +255,15 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 }
 }
 #ifdef __aarch64__
-uint8x16_t match2NlDotBMasked = vandq_u8(match2NlDotB, (
+uint8x16_t match2NlDotBMasked = vandq_u8(match2NlDotB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
 uint8x16_t mergeKillDots = vpaddq_u8(
-vandq_u8(match2NlDotA, (
+vandq_u8(match2NlDotA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
 match2NlDotBMasked
 );
 uint8x8_t mergeKillDots2 = vget_low_u8(vpaddq_u8(mergeKillDots, mergeKillDots));
 #else
-uint8x16_t match2NlDotMaskedA = vandq_u8(match2NlDotA, (
-uint8x16_t match2NlDotMaskedB = vandq_u8(match2NlDotB, (
+uint8x16_t match2NlDotMaskedA = vandq_u8(match2NlDotA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+uint8x16_t match2NlDotMaskedB = vandq_u8(match2NlDotB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
 uint8x8_t mergeKillDots2 = vpadd_u8(
 vpadd_u8(
 vget_low_u8(match2NlDotMaskedA), vget_high_u8(match2NlDotMaskedA)
@@ -342,11 +342,11 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 #ifdef __aarch64__
 uint8x16_t vMaskEqA = vqtbl1q_u8(
 vcombine_u8(maskEqTemp, vdup_n_u8(0)),
-(
+vmakeq_u8(0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1)
 );
 uint8x16_t vMaskEqB = vqtbl1q_u8(
 vcombine_u8(maskEqTemp, vdup_n_u8(0)),
-(
+vmakeq_u8(2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3)
 );
 #else
 uint8x16_t vMaskEqA = vcombine_u8(
@@ -358,8 +358,8 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 vdup_lane_u8(maskEqTemp, 3)
 );
 #endif
-vMaskEqA = vtstq_u8(vMaskEqA, (
-vMaskEqB = vtstq_u8(vMaskEqB, (
+vMaskEqA = vtstq_u8(vMaskEqA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+vMaskEqB = vtstq_u8(vMaskEqB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
 
 dataA = vsubq_u8(
 dataA,
@@ -391,7 +391,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 )
 );
 }
-yencOffset
+yencOffset = vsetq_lane_u8((escFirst << 6) | 42, yencOffset, 0);
 
 // all that's left is to 'compress' the data (skip over masked chars)
 uint32_t counts = 0x08080808 - vget_lane_u32(vreinterpret_u32_u8(vcnt_u8(cmpPacked)), 0);
@@ -439,7 +439,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 } else {
 dataA = vsubq_u8(dataA, yencOffset);
 dataB = vsubq_u8(dataB, vdupq_n_u8(42));
-vst1q_u8_x2_unaligned(p, (
+vst1q_u8_x2_unaligned(p, vcreate2_u8(dataA, dataB));
 p += sizeof(uint8x16_t)*2;
 escFirst = 0;
 #ifdef __aarch64__