yencode 1.1.0 → 1.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.gyp +79 -7
- package/crcutil-1.0/code/multiword_64_64_intrinsic_i386_mmx.cc +1 -1
- package/package.json +1 -1
- package/src/common.h +88 -24
- package/src/crc.cc +59 -27
- package/src/crc.h +20 -6
- package/src/crc_arm.cc +154 -27
- package/src/crc_common.h +3 -10
- package/src/{crc_folding.c → crc_folding.cc} +53 -122
- package/src/crc_folding_256.cc +230 -0
- package/src/decoder.cc +10 -4
- package/src/decoder.h +16 -2
- package/src/decoder_avx2_base.h +32 -21
- package/src/decoder_common.h +2 -2
- package/src/decoder_neon.cc +37 -37
- package/src/decoder_neon64.cc +41 -36
- package/src/decoder_sse_base.h +21 -14
- package/src/decoder_vbmi2.cc +30 -0
- package/src/encoder.cc +9 -3
- package/src/encoder.h +17 -1
- package/src/encoder_avx_base.h +8 -8
- package/src/encoder_common.h +3 -3
- package/src/encoder_neon.cc +31 -31
- package/src/encoder_sse_base.h +7 -8
- package/src/encoder_vbmi2.cc +23 -0
- package/src/platform.cc +57 -8
- package/src/yencode.cc +33 -44
- package/test/testcrc.js +14 -0
package/src/encoder_neon.cc
CHANGED
@@ -5,10 +5,10 @@
 #include "encoder_common.h"
 
 // Clang wrongly assumes alignment on vst1q_u8_x2, and ARMv7 GCC doesn't support the function, so effectively, it can only be used in ARMv8 compilers
-#if defined(__aarch64__) && (defined(__clang__) || (
+#if defined(__aarch64__) && (defined(__clang__) || HEDLEY_GCC_VERSION_CHECK(8,5,0))
 # define vst1q_u8_x2_unaligned vst1q_u8_x2
 #else
-HEDLEY_ALWAYS_INLINE void vst1q_u8_x2_unaligned(uint8_t* p, uint8x16x2_t data) {
+static HEDLEY_ALWAYS_INLINE void vst1q_u8_x2_unaligned(uint8_t* p, uint8x16x2_t data) {
 	vst1q_u8(p, data.val[0]);
 	vst1q_u8(p+16, data.val[1]);
 }
@@ -26,16 +26,16 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
 #ifdef __aarch64__
 	uint8x16_t cmpA = vreinterpretq_u8_s8(vqtbx2q_s8(
 		vdupq_n_s8('='-42),
-		(
-		vreinterpretq_u8_s8(vhaddq_s8(vreinterpretq_s8_u8(dataA), (
+		vcreate2_s8(vmakeq_s8('\0'-42,-128,-128,'\0'-42,'\t'-42,'\n'-42,'\r'-42,'\t'-42,'\n'-42,'\r'-42,-128,-128,'\0'-42,-128,-128,-128), vmakeq_s8(' '-42,'\n'-42,'\r'-42,' '-42,-128,-128,-128,-128,-128,-128,'.'-42,-128,-128,-128,'='-42,-128)),
+		vreinterpretq_u8_s8(vhaddq_s8(vreinterpretq_s8_u8(dataA), vmakeq_s8(42,48,66,66, 66,66,66,66, 66,66,66,66, 66,66,66,66)))
 	));
 	cmpA = vceqq_u8(cmpA, dataA);
 	
 	dataB = vaddq_u8(oDataB, vdupq_n_u8(42));
 	uint8x16_t cmpB = vqtbx1q_u8(
 		vceqq_u8(oDataB, vdupq_n_u8('='-42)),
-		//
-		(
+		// \0 \n \r
+		vmakeq_u8(255,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
 		dataB
 	);
 	dataA = vaddq_u8(dataA, vbslq_u8(cmpA, vdupq_n_u8(64+42), vdupq_n_u8(42)));
@@ -64,9 +64,9 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
 	
 	// dup low 2 bytes & compare
 	uint8x8_t firstTwoChars = vreinterpret_u8_u16(vdup_lane_u16(vreinterpret_u16_u8(vget_low_u8(oDataA)), 0));
-	uint8x8_t cmpNl = vceq_u8(firstTwoChars,
-	' '
-
+	uint8x8_t cmpNl = vceq_u8(firstTwoChars, vmake_u8(
+		' '+214,' '+214,'\t'+214,'\t'+214,'\r'+214,'.'-42,'='-42,'='-42
+	));
 	// use padd to merge comparisons
 	uint16x4_t cmpNl2 = vreinterpret_u16_u8(cmpNl);
 	cmpNl2 = vpadd_u16(cmpNl2, vdup_n_u16(0));
@@ -80,8 +80,8 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
 #endif
 	
 	
-	uint8x16_t cmpAMasked = vandq_u8(cmpA, (
-	uint8x16_t cmpBMasked = vandq_u8(cmpB, (
+	uint8x16_t cmpAMasked = vandq_u8(cmpA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+	uint8x16_t cmpBMasked = vandq_u8(cmpB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
 #ifdef __aarch64__
 	uint8x16_t cmpMerge = vpaddq_u8(cmpAMasked, cmpBMasked);
 	cmpMerge = vpaddq_u8(cmpMerge, cmpMerge);
@@ -95,7 +95,7 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
 			memcpy(p, &firstChar, sizeof(firstChar));
 			p += 4;
 			mask ^= 1;
-			cmpMerge = vbicq_u8(cmpMerge, (
+			cmpMerge = vbicq_u8(cmpMerge, vmakeq_u8(1,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0));
 		} else {
 			firstChar |= 0x0a0d00;
 			memcpy(p, &firstChar, sizeof(firstChar));
@@ -130,7 +130,7 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
 			memcpy(p, &firstChar, sizeof(firstChar));
 			p += 4;
 			mask ^= 1;
-			cmpPacked = vbic_u8(cmpPacked, (
+			cmpPacked = vbic_u8(cmpPacked, vmake_u8(1,0,0,0, 0,0,0,0));
 		} else {
 			firstChar |= 0x0a0d00;
 			memcpy(p, &firstChar, sizeof(firstChar));
@@ -198,7 +198,7 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
 #ifdef __aarch64__
 # ifdef _MSC_VER
 		long bitIndex;
-		if(_BitScanReverse64(&bitIndex, mask))
+		if(_BitScanReverse64((unsigned long*)&bitIndex, mask))
 			bitIndex ^= 63;
 		else
 			bitIndex = 64;
@@ -217,11 +217,11 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
 		
 		uint8x16_t vClz = vdupq_n_u8(bitIndex & ~(sizeof(mask)*8));
 #ifdef __aarch64__
-		uint8x16_t blendA = vcgtq_u8((
-		uint8x16_t blendB = vcgtq_u8((
+		uint8x16_t blendA = vcgtq_u8(vmakeq_u8(63,62,61,60,51,50,49,48,47,46,45,44,35,34,33,32), vClz);
+		uint8x16_t blendB = vcgtq_u8(vmakeq_u8(31,30,29,28,19,18,17,16,15,14,13,12, 3, 2, 1, 0), vClz);
 #else
-		uint8x16_t blendA = vcgtq_u8((
-		uint8x16_t blendB = vcgtq_u8((
+		uint8x16_t blendA = vcgtq_u8(vmakeq_u8(31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16), vClz);
+		uint8x16_t blendB = vcgtq_u8(vmakeq_u8(15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), vClz);
 #endif
 		uint8x16_t dataAShifted = vbslq_u8(cmpA, vdupq_n_u8('='), dataA);
 		uint8x16_t dataBShifted = vbslq_u8(cmpB, vdupq_n_u8('='), dataB);
@@ -230,7 +230,7 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
 		dataA = vbslq_u8(blendA, dataAShifted, dataA);
 		dataB = vbslq_u8(blendB, dataBShifted, dataB);
 		
-		vst1q_u8_x2_unaligned(p, (
+		vst1q_u8_x2_unaligned(p, vcreate2_u8(dataA, dataB));
 		p += sizeof(uint8x16_t)*2 - 1;
 		p += (mask != 0);
 		col = lineSizeOffset + (mask != 0);
@@ -296,14 +296,14 @@ HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const ui
 	dataB = vaddq_u8(dataB, vdupq_n_u8(42));
 	uint8x16_t cmpA = vqtbx1q_u8(
 		cmpEqA,
-		//
-		(
+		// \0 \n \r
+		vmakeq_u8(255,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
 		dataA
 	);
 	uint8x16_t cmpB = vqtbx1q_u8(
 		cmpEqB,
-		//
-		(
+		// \0 \n \r
+		vmakeq_u8(255,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
 		dataB
 	);
 	
@@ -338,8 +338,8 @@ HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const ui
 		
 		
 		long bitIndex; // prevent compiler whining
-		uint8x16_t cmpAMasked = vandq_u8(cmpA, (
-		uint8x16_t cmpBMasked = vandq_u8(cmpB, (
+		uint8x16_t cmpAMasked = vandq_u8(cmpA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+		uint8x16_t cmpBMasked = vandq_u8(cmpB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
 #ifdef __aarch64__
 		uint8x16_t cmpMerge = vpaddq_u8(cmpAMasked, cmpBMasked);
 		cmpMerge = vpaddq_u8(cmpMerge, cmpMerge);
@@ -453,7 +453,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const ui
 #ifdef __aarch64__
 # ifdef _MSC_VER
 		// does this work?
-		if(_BitScanReverse64(&bitIndex, mask))
+		if(_BitScanReverse64((unsigned long*)&bitIndex, mask))
 			bitIndex ^= 63;
 		else
 			bitIndex = 64;
@@ -472,11 +472,11 @@ HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const ui
 		
 		uint8x16_t vClz = vdupq_n_u8(bitIndex & ~(sizeof(mask)*8));
 #ifdef __aarch64__
-		uint8x16_t blendA = vcgeq_u8((
-		uint8x16_t blendB = vcgeq_u8((
+		uint8x16_t blendA = vcgeq_u8(vmakeq_u8(63,62,61,60,51,50,49,48,47,46,45,44,35,34,33,32), vClz);
+		uint8x16_t blendB = vcgeq_u8(vmakeq_u8(31,30,29,28,19,18,17,16,15,14,13,12, 3, 2, 1, 0), vClz);
 #else
-		uint8x16_t blendA = vcgeq_u8((
-		uint8x16_t blendB = vcgeq_u8((
+		uint8x16_t blendA = vcgeq_u8(vmakeq_u8(31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16), vClz);
+		uint8x16_t blendB = vcgeq_u8(vmakeq_u8(15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), vClz);
 #endif
 		uint8x16_t dataAShifted = vextq_u8(dataA, dataA, 15);
 		uint8x16_t dataBShifted = vextq_u8(dataA, dataB, 15);
@@ -485,7 +485,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const ui
 		dataA = vbslq_u8(blendA, dataA, dataAShifted);
 		outDataB = vbslq_u8(blendB, outDataB, dataBShifted);
 		
-		vst1q_u8_x2_unaligned(p, (
+		vst1q_u8_x2_unaligned(p, vcreate2_u8(dataA, outDataB));
 		p += sizeof(uint8x16_t)*2;
 		// write last byte
 		*p = vgetq_lane_u8(dataB, 15);
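
Note: the recurring change in this file swaps GNU-style compound-literal vector constants (e.g. (uint8x16_t){1,2,4,...}), which MSVC rejects, for vmakeq_u8/vcreate2_u8 helper calls; the helpers live in the package's common.h, which also changed in this release. A minimal sketch of how such helpers can be written, offered as an assumption rather than the package's actual definitions:

// Hypothetical stand-ins for yencode's vmakeq_u8 / vcreate2_u8 helpers;
// the real definitions in common.h may differ.
#include <arm_neon.h>
#include <cstdint>

static inline uint8x16_t demo_vmakeq_u8(
	uint8_t e0, uint8_t e1, uint8_t e2, uint8_t e3,
	uint8_t e4, uint8_t e5, uint8_t e6, uint8_t e7,
	uint8_t e8, uint8_t e9, uint8_t e10, uint8_t e11,
	uint8_t e12, uint8_t e13, uint8_t e14, uint8_t e15) {
	// building via a local array + vld1q_u8 avoids GNU compound literals,
	// which MSVC does not accept; compilers generally constant-fold this
	const uint8_t v[16] = {e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15};
	return vld1q_u8(v);
}

static inline uint8x16x2_t demo_vcreate2_u8(uint8x16_t a, uint8x16_t b) {
	// pairs two vectors for a single vst1q_u8_x2-style store
	uint8x16x2_t r;
	r.val[0] = a;
	r.val[1] = b;
	return r;
}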
package/src/encoder_sse_base.h
CHANGED
@@ -8,7 +8,7 @@
 # define _mm_mask_expand_epi8 _mm128_mask_expand_epi8
 #endif
 
-#if defined(__GNUC__) && __GNUC__ >= 7
+#if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
 # define KLOAD16(a, offs) _load_mask16((__mmask16*)(a) + (offs))
 #else
 # define KLOAD16(a, offs) (((uint16_t*)(a))[(offs)])
@@ -155,7 +155,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
 	if(len <= INPUT_OFFSET || line_size < XMM_SIZE) return;
 	
 	// slower CPUs prefer to branch as mispredict penalty is probably small relative to general execution
-#if defined(__tune_atom__) || defined(__tune_slm__) || defined(__tune_btver1__)
+#if defined(__tune_atom__) || defined(__tune_slm__) || defined(__tune_btver1__) || defined(__tune_btver2__)
 	const bool _PREFER_BRANCHING = true;
 #else
 	const bool _PREFER_BRANCHING = (use_isa < ISA_LEVEL_SSSE3);
@@ -350,7 +350,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
 #if defined(__POPCNT__) && !defined(__tune_btver1__)
 				if(use_isa & ISA_FEATURE_POPCNT) {
 					shuf2Len = popcnt32(maskA) + 16;
-# if defined(__tune_znver2__) || defined(__tune_znver1__) || defined(__tune_btver2__)
+# if defined(__tune_znver3__) || defined(__tune_znver2__) || defined(__tune_znver1__) || defined(__tune_btver2__)
 					shuf1Len = popcnt32(m1) + 8;
 					shuf3Len = popcnt32(m3) + shuf2Len + 8;
 # else
@@ -412,8 +412,8 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
 				asm(
 					"shrl $1, %[eqMask] \n"
 					"shrl %%cl, %[eqMask] \n" // TODO: can use shrq to avoid above shift?
-# if defined(PLATFORM_AMD64)
-					"adcq %[col], %[p] \n"
+# if defined(PLATFORM_AMD64) && !defined(__ILP32__)
+					"adcq %q[col], %q[p] \n"
 # else
 					"adcl %[col], %[p] \n"
 # endif
@@ -538,8 +538,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
 		
 		dataA = _mm_shuffle_epi8(dataA, shufMaskA);
 		
-# if defined(__SSE4_1__) && !defined(__tune_slm__) && !defined(__tune_goldmont__) && !defined(__tune_goldmont_plus__)
-		// unsure if worth on: Jaguar/Puma (3|2), Core2 (2|2)
+# if defined(__SSE4_1__) && !defined(__tune_slm__) && !defined(__tune_goldmont__) && !defined(__tune_goldmont_plus__) && !defined(__tune_tremont__)
 		if(use_isa >= ISA_LEVEL_SSE41) {
 			dataB = _mm_blendv_epi8(dataBShifted, dataB, mergeMaskB);
 		} else
@@ -717,7 +716,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
 		}
 	} while(i < 0);
 	
-	*colOffset = col + line_size -1;
+	*colOffset = (int)(col + line_size -1);
 	dest = p;
 	len = -(i - INPUT_OFFSET);
 }
package/src/encoder_vbmi2.cc
ADDED
@@ -0,0 +1,23 @@
+#include "common.h"
+
+#if defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
+# ifndef YENC_DISABLE_AVX256
+# include "encoder_avx_base.h"
+
+void encoder_vbmi2_init() {
+	_do_encode = &do_encode_simd< do_encode_avx2<ISA_LEVEL_VBMI2> >;
+	encoder_avx2_lut<ISA_LEVEL_VBMI2>();
+}
+# else
+# include "encoder_sse_base.h"
+void encoder_vbmi2_init() {
+	_do_encode = &do_encode_simd< do_encode_sse<ISA_LEVEL_VBMI2> >;
+	encoder_sse_lut<ISA_LEVEL_VBMI2>();
+}
+# endif
+#else
+void encoder_avx2_init();
+void encoder_vbmi2_init() {
+	encoder_avx2_init();
+}
+#endif
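
Note: the new encoder_vbmi2.cc uses a build-time fallback pattern: if the compiler cannot target AVX-512 VBMI2 at all, encoder_vbmi2_init() still exists but simply defers to encoder_avx2_init(), so the runtime dispatcher never has to know which kernels were actually compiled. A stripped-down sketch of the same pattern, with illustrative names that are not the package's:

// Sketch of the build-time fallback dispatch pattern; names are hypothetical.
#include <cstdio>

static void encode_generic(const char* src) { std::printf("generic: %s\n", src); }

#if defined(__AVX512VBMI2__)
static void encode_vbmi2(const char* src) { std::printf("vbmi2: %s\n", src); }
static void (*do_encode_ptr)(const char*) = &encode_vbmi2;
#else
// compiler lacks VBMI2 support: the symbol still exists, routed to the
// next-best implementation, so callers need no feature checks
static void (*do_encode_ptr)(const char*) = &encode_generic;
#endif

int main() { do_encode_ptr("example"); }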
package/src/platform.cc
CHANGED
@@ -2,16 +2,36 @@
 #ifdef PLATFORM_ARM
 # ifdef __ANDROID__
 #  include <cpu-features.h>
-# elif defined(__linux__)
+# elif defined(__linux__) || (defined(__FreeBSD__) && __FreeBSD__ >= 12)
 #  include <sys/auxv.h>
 #  include <asm/hwcap.h>
+# elif (defined(__FreeBSD__) && __FreeBSD__ < 12)
+#  include <sys/sysctl.h>
+#  include <asm/hwcap.h>
+# elif defined(_WIN32)
+#  define WIN32_LEAN_AND_MEAN
+#  define NOMINMAX
+#  include <Windows.h>
+# elif defined(__APPLE__)
+#  include <sys/types.h>
+#  include <sys/sysctl.h>
 # endif
 bool cpu_supports_neon() {
 # if defined(AT_HWCAP)
-#  ifdef __aarch64__
-	return getauxval(AT_HWCAP) & HWCAP_ASIMD;
+#  ifdef __FreeBSD__
+	unsigned long supported;
+	elf_aux_info(AT_HWCAP, &supported, sizeof(supported));
+#   ifdef __aarch64__
+	return supported & HWCAP_ASIMD;
+#   else
+	return supported & HWCAP_NEON;
+#   endif
 #  else
+#   ifdef __aarch64__
+	return getauxval(AT_HWCAP) & HWCAP_ASIMD;
+#   else
 	return getauxval(AT_HWCAP) & HWCAP_NEON;
+#   endif
 #  endif
 # elif defined(ANDROID_CPU_FAMILY_ARM)
 #  ifdef __aarch64__
@@ -19,14 +39,23 @@ bool cpu_supports_neon() {
 #  else
 	return android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON;
 #  endif
+# elif defined(_WIN32)
+	return IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE);
+# elif defined(__APPLE__)
+	int supported = 0;
+	size_t len = sizeof(supported);
+	if(sysctlbyname("hw.optional.neon", &supported, &len, NULL, 0))
+		return false;
+	return (bool)supported;
 # endif
-	return true; // assume NEON support, if compiled as such, otherwise
+	return true; // assume NEON support, if compiled as such, otherwise (I think Windows and iOS require it)
 }
 #endif
 
 
 #ifdef PLATFORM_X86
 #ifdef _MSC_VER
+# define _cpuid1(ar) __cpuid(ar, 1)
 # define _cpuid1x(ar) __cpuid(ar, 0x80000001)
 # if _MSC_VER >= 1600
 #  define _cpuidX __cpuidex
@@ -38,6 +67,8 @@ bool cpu_supports_neon() {
 #  define _GET_XCR() 0
 # endif
 #else
+# include <cpuid.h>
+# define _cpuid1(ar) __cpuid(1, ar[0], ar[1], ar[2], ar[3])
 # define _cpuid1x(ar) __cpuid(0x80000001, ar[0], ar[1], ar[2], ar[3])
 # define _cpuidX(ar, eax, ecx) __cpuid_count(eax, ecx, ar[0], ar[1], ar[2], ar[3])
 static inline int _GET_XCR() {
@@ -84,11 +115,9 @@ int cpu_supports_isa() {
 		// AMD Bobcat with slow SSSE3 instructions - pretend it doesn't exist
 		return ret | ISA_LEVEL_SSE2;
 	
-	// Jaguar/Puma performance unkown (slowish PSHUFB/PBLENDVB)
-	
 	if((flags[2] & 0x200) == 0x200) { // SSSE3
-		if(family == 6 && (model == 0x5c || model == 0x5f || model == 0x7a))
-		// Intel Goldmont/plus with slow PBLENDVB
+		if(family == 6 && (model == 0x5c || model == 0x5f || model == 0x7a || model == 0x9c))
+		// Intel Goldmont/plus / Tremont with slow PBLENDVB
 			return ret | ISA_LEVEL_SSSE3;
 		
 		if(flags[2] & 0x80000) { // SSE4.1
@@ -116,4 +145,24 @@ int cpu_supports_isa() {
 	return ret | ISA_LEVEL_SSE2;
 }
 
+int cpu_supports_crc_isa() {
+	int flags[4];
+	_cpuid1(flags);
+	
+	if((flags[2] & 0x80202) == 0x80202) { // SSE4.1 + SSSE3 + CLMUL
+		if((flags[2] & 0x18000000) == 0x18000000) { // OSXSAVE + AVX
+			int xcr = _GET_XCR() & 0xff; // ignore unused bits
+			if((xcr & 6) == 6) { // AVX enabled
+				int cpuInfo[4];
+				_cpuidX(cpuInfo, 7, 0);
+				if((cpuInfo[1] & 0x20) == 0x20 && (cpuInfo[2] & 0x400) == 0x400) { // AVX2 + VPCLMULQDQ
+					return 2;
+				}
+			}
+		}
+		return 1;
+	}
+	return 0;
+}
+
 #endif // PLATFORM_X86
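
Note: the new cpu_supports_crc_isa() returns 2 when an AVX2/VPCLMULQDQ CRC kernel can run, 1 when the SSE4.1/PCLMULQDQ path can, else 0. The masks decode as: CPUID.1:ECX bit 1 = PCLMULQDQ, bit 9 = SSSE3, bit 19 = SSE4.1 (together 0x80202); bits 27/28 = OSXSAVE/AVX (0x18000000); XCR0 low bits 1-2 (value 6) confirm the OS saves XMM/YMM state; CPUID.(7,0):EBX bit 5 = AVX2 (0x20) and ECX bit 10 = VPCLMULQDQ (0x400). A GCC/Clang-only sketch of equivalent logic, assuming <cpuid.h> provides __get_cpuid/__get_cpuid_count (not the package's code):

#include <cpuid.h>

// 0 = no CLMUL path, 1 = PCLMULQDQ/SSE4.1 path, 2 = AVX2/VPCLMULQDQ path
int crc_isa_level() {
	unsigned eax, ebx, ecx, edx;
	if(!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) return 0;
	if((ecx & 0x80202) != 0x80202) return 0; // SSE4.1 + SSSE3 + PCLMULQDQ
	if((ecx & 0x18000000) == 0x18000000) {   // OSXSAVE + AVX advertised
		unsigned lo, hi;
		__asm__(".byte 0x0f, 0x01, 0xd0" : "=a"(lo), "=d"(hi) : "c"(0)); // XGETBV(0)
		(void)hi; // upper XCR0 bits not needed here
		if((lo & 6) == 6                     // OS saves XMM + YMM state
				&& __get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)
				&& (ebx & 0x20) && (ecx & 0x400)) // AVX2 + VPCLMULQDQ
			return 2;
	}
	return 1;
}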
package/src/yencode.cc
CHANGED
@@ -12,11 +12,6 @@
 
 using namespace v8;
 
-union crc32 {
-	uint32_t u32;
-	unsigned char u8a[4];
-};
-
 static void free_buffer(char* data, void* _size) {
 #if !NODE_VERSION_AT_LEAST(0, 11, 0)
 	int size = (int)(size_t)_size;
@@ -252,7 +247,7 @@ FUNC(Decode) {
 	isRaw = ARG_TO_BOOL(args[1]);
 	
 	unsigned char *result = (unsigned char*) malloc(arg_len);
-	size_t len = (isRaw
+	size_t len = do_decode(isRaw, (const unsigned char*)node::Buffer::Data(args[0]), result, arg_len, NULL);
 	result = (unsigned char*)realloc(result, len);
 	MARK_EXT_MEM(len);
 	RETURN_VAL( NEW_BUFFER((char*)result, len, free_buffer, (void*)len) );
@@ -276,7 +271,7 @@ FUNC(DecodeTo) {
 	if (args.Length() > 2)
 		isRaw = ARG_TO_BOOL(args[2]);
 	
-	size_t len = (isRaw
+	size_t len = do_decode(isRaw, (const unsigned char*)node::Buffer::Data(args[0]), (unsigned char*)node::Buffer::Data(args[1]), arg_len, NULL);
 	RETURN_VAL( Integer::New(ISOLATE len) );
 }
 
@@ -336,17 +331,23 @@ FUNC(DecodeIncr) {
 }
 
 
-
-
-
-
-
-
-
-#else
-#define RETURN_CRC(x) RETURN_VAL( NEW_BUFFER((char*)x.u8a, 4) )
+static inline uint32_t read_crc32(const Local<Value>& buf) {
+	const uint8_t* arr = (const uint8_t*)node::Buffer::Data(buf);
+	return (((uint_fast32_t)arr[0] << 24) | ((uint_fast32_t)arr[1] << 16) | ((uint_fast32_t)arr[2] << 8) | (uint_fast32_t)arr[3]);
+}
+static inline Local<Object> pack_crc32(
+#if NODE_VERSION_AT_LEAST(0, 11, 0)
+	Isolate* isolate,
 #endif
-
+	uint32_t crc) {
+	Local<Object> buff = NEW_BUFFER(4);
+	unsigned char* d = (unsigned char*)node::Buffer::Data(buff);
+	d[0] = (unsigned char)(crc >> 24) & 0xFF;
+	d[1] = (unsigned char)(crc >> 16) & 0xFF;
+	d[2] = (unsigned char)(crc >> 8) & 0xFF;
+	d[3] = (unsigned char)crc & 0xFF;
+	return buff;
+}
 
 // crc32(str, init)
 FUNC(CRC32) {
@@ -356,25 +357,18 @@ FUNC(CRC32) {
 		RETURN_ERROR("You must supply a Buffer");
 	// TODO: support string args??
 	
-
-	init.u32 = 0;
+	uint32_t crc = 0;
 	if (args.Length() >= 2) {
 		if (!node::Buffer::HasInstance(args[1]) || node::Buffer::Length(args[1]) != 4)
 			RETURN_ERROR("Second argument must be a 4 byte buffer");
-		
-		do_crc32_incremental(
-			(const void*)node::Buffer::Data(args[0]),
-			node::Buffer::Length(args[0]),
-			init.u8a
-		);
-	} else {
-		do_crc32(
-			(const void*)node::Buffer::Data(args[0]),
-			node::Buffer::Length(args[0]),
-			init.u8a
-		);
+		crc = read_crc32(args[1]);
 	}
-
+	crc = do_crc32(
+		(const void*)node::Buffer::Data(args[0]),
+		node::Buffer::Length(args[0]),
+		crc
+	);
+	RETURN_VAL(pack_crc32(ISOLATE crc));
 }
 
 FUNC(CRC32Combine) {
@@ -386,14 +380,11 @@ FUNC(CRC32Combine) {
 	 || !node::Buffer::HasInstance(args[1]) || node::Buffer::Length(args[1]) != 4)
 		RETURN_ERROR("You must supply a 4 byte Buffer for the first two arguments");
 	
-
+	uint32_t crc1 = read_crc32(args[0]), crc2 = read_crc32(args[1]);
 	size_t len = (size_t)ARG_TO_INT(args[2]);
 	
-
-
-
-	do_crc32_combine(crc1.u8a, crc2.u8a, len);
-	RETURN_CRC(crc1);
+	crc1 = do_crc32_combine(crc1, crc2, len);
+	RETURN_VAL(pack_crc32(ISOLATE crc1));
 }
 
 FUNC(CRC32Zeroes) {
@@ -402,17 +393,15 @@ FUNC(CRC32Zeroes) {
 	if (args.Length() < 1)
 		RETURN_ERROR("At least 1 argument required");
 	
-
+	uint32_t crc1 = 0;
 	if (args.Length() >= 2) {
 		if (!node::Buffer::HasInstance(args[1]) || node::Buffer::Length(args[1]) != 4)
 			RETURN_ERROR("Second argument must be a 4 byte buffer");
-		
-	} else {
-		crc1.u32 = 0;
+		crc1 = read_crc32(args[1]);
 	}
 	size_t len = (size_t)ARG_TO_INT(args[0]);
-	do_crc32_zeros(crc1
-
+	crc1 = do_crc32_zeros(crc1, len);
+	RETURN_VAL(pack_crc32(ISOLATE crc1));
 }
 
 static void init_all() {
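
Note: with union crc32 removed, CRC values now travel internally as uint32_t and are packed into the JS-visible 4-byte buffers explicitly big-endian, so the output no longer depends on host byte order. A self-contained round-trip sketch mirroring the read_crc32()/pack_crc32() scheme above (not the package's code verbatim):

#include <cstdint>
#include <cassert>

// unpack 4 bytes, most significant first, into a 32-bit value
static uint32_t unpack_be32(const uint8_t* b) {
	return ((uint32_t)b[0] << 24) | ((uint32_t)b[1] << 16)
	     | ((uint32_t)b[2] << 8)  |  (uint32_t)b[3];
}
// pack a 32-bit value into 4 bytes, most significant first
static void pack_be32(uint8_t* b, uint32_t v) {
	b[0] = (uint8_t)(v >> 24); b[1] = (uint8_t)(v >> 16);
	b[2] = (uint8_t)(v >> 8);  b[3] = (uint8_t)v;
}

int main() {
	uint8_t buf[4];
	pack_be32(buf, 0xDEADBEEFu);
	assert(unpack_be32(buf) == 0xDEADBEEFu); // round-trips on any endianness
}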
package/test/testcrc.js
CHANGED
@@ -50,4 +50,18 @@
 doTest('Random Continue', 'crc32', ['KZSHZ5EDOVAmDdakZZOrGSUGGKSpCJoWH7M0MHy6ohnSzvHY4DjpxXmyfWYJQoJ7tKdNhGcuRVUzrgXM', ycrc32('BdenbmoBgiB10ZkeUBjrsZV3dg2Da2fhHqU9TMdi69AHhLRck3Nk60YuFBXh6lvtefBpjdTxbeEmsaEm')], crc32('BdenbmoBgiB10ZkeUBjrsZV3dg2Da2fhHqU9TMdi69AHhLRck3Nk60YuFBXh6lvtefBpjdTxbeEmsaEmKZSHZ5EDOVAmDdakZZOrGSUGGKSpCJoWH7M0MHy6ohnSzvHY4DjpxXmyfWYJQoJ7tKdNhGcuRVUzrgXM'));
 
 
+// random tests
+for(var i=1; i<128; i++) {
+	var rand = require('crypto').pseudoRandomBytes(i);
+	doTest('Random Short Buffer', 'crc32', rand);
+}
+for(var i=0; i<32; i++) {
+	var rand = require('crypto').pseudoRandomBytes(100000);
+	doTest('Random Buffer', 'crc32', rand);
+	
+	var split = Math.random()*rand.length;
+	doTest('Random Continue Buffer', 'crc32', [rand.slice(split), ycrc32(rand.slice(0, split))], crc32(rand));
+}
+
+
 console.log('All tests passed');