yencode 1.1.0 → 1.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,10 +5,10 @@
  #include "encoder_common.h"
 
  // Clang wrongly assumes alignment on vst1q_u8_x2, and ARMv7 GCC doesn't support the function, so effectively, it can only be used in ARMv8 compilers
- #if defined(__aarch64__) && (defined(__clang__) || (defined(__GNUC__) && __GNUC__ >= 9))
+ #if defined(__aarch64__) && (defined(__clang__) || HEDLEY_GCC_VERSION_CHECK(8,5,0))
  # define vst1q_u8_x2_unaligned vst1q_u8_x2
  #else
- HEDLEY_ALWAYS_INLINE void vst1q_u8_x2_unaligned(uint8_t* p, uint8x16x2_t data) {
+ static HEDLEY_ALWAYS_INLINE void vst1q_u8_x2_unaligned(uint8_t* p, uint8x16x2_t data) {
  vst1q_u8(p, data.val[0]);
  vst1q_u8(p+16, data.val[1]);
  }
@@ -26,16 +26,16 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
  #ifdef __aarch64__
  uint8x16_t cmpA = vreinterpretq_u8_s8(vqtbx2q_s8(
  vdupq_n_s8('='-42),
- (int8x16x2_t){'\0'-42,-128,-128,'\0'-42,'\t'-42,'\n'-42,'\r'-42,'\t'-42,'\n'-42,'\r'-42,-128,-128,'\0'-42,-128,-128,-128, ' '-42,'\n'-42,'\r'-42,' '-42,-128,-128,-128,-128,-128,-128,'.'-42,-128,-128,-128,'='-42,-128},
- vreinterpretq_u8_s8(vhaddq_s8(vreinterpretq_s8_u8(dataA), (int8x16_t){42,48,66,66, 66,66,66,66, 66,66,66,66, 66,66,66,66}))
+ vcreate2_s8(vmakeq_s8('\0'-42,-128,-128,'\0'-42,'\t'-42,'\n'-42,'\r'-42,'\t'-42,'\n'-42,'\r'-42,-128,-128,'\0'-42,-128,-128,-128), vmakeq_s8(' '-42,'\n'-42,'\r'-42,' '-42,-128,-128,-128,-128,-128,-128,'.'-42,-128,-128,-128,'='-42,-128)),
+ vreinterpretq_u8_s8(vhaddq_s8(vreinterpretq_s8_u8(dataA), vmakeq_s8(42,48,66,66, 66,66,66,66, 66,66,66,66, 66,66,66,66)))
  ));
  cmpA = vceqq_u8(cmpA, dataA);
 
  dataB = vaddq_u8(oDataB, vdupq_n_u8(42));
  uint8x16_t cmpB = vqtbx1q_u8(
  vceqq_u8(oDataB, vdupq_n_u8('='-42)),
- // \0 \n \r
- (uint8x16_t){255,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0},
+ // \0 \n \r
+ vmakeq_u8(255,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
  dataB
  );
  dataA = vaddq_u8(dataA, vbslq_u8(cmpA, vdupq_n_u8(64+42), vdupq_n_u8(42)));
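Throughout the NEON encoder, 1.1.3 replaces GCC-style vector compound literals such as (uint8x16_t){...} with vmakeq_*/vmake_*/vcreate2_* helpers, presumably defined in a shared header that is not part of this diff; vector compound literals are a GNU extension that MSVC's ARM headers do not accept. A minimal, hypothetical sketch of what such a helper could look like (the package's actual helper may differ):

    // Hypothetical illustration only; not the package's actual helper.
    // Builds a uint8x16_t from 16 byte constants without relying on
    // (uint8x16_t){...} compound literals.
    #include <arm_neon.h>
    #include <stdint.h>

    static inline uint8x16_t vmakeq_u8_sketch(
            uint8_t e0, uint8_t e1, uint8_t e2,  uint8_t e3,  uint8_t e4,  uint8_t e5,  uint8_t e6,  uint8_t e7,
            uint8_t e8, uint8_t e9, uint8_t e10, uint8_t e11, uint8_t e12, uint8_t e13, uint8_t e14, uint8_t e15) {
        uint8_t tmp[16] = {e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15};
        return vld1q_u8(tmp); // load through a constant array; compilers typically fold this into a vector literal
    }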
@@ -64,9 +64,9 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
 
  // dup low 2 bytes & compare
  uint8x8_t firstTwoChars = vreinterpret_u8_u16(vdup_lane_u16(vreinterpret_u16_u8(vget_low_u8(oDataA)), 0));
- uint8x8_t cmpNl = vceq_u8(firstTwoChars, vreinterpret_u8_s8((int8x8_t){
- ' '-42,' '-42,'\t'-42,'\t'-42,'\r'-42,'.'-42,'='-42,'='-42
- }));
+ uint8x8_t cmpNl = vceq_u8(firstTwoChars, vmake_u8(
+ ' '+214,' '+214,'\t'+214,'\t'+214,'\r'+214,'.'-42,'='-42,'='-42
+ ));
  // use padd to merge comparisons
  uint16x4_t cmpNl2 = vreinterpret_u16_u8(cmpNl);
  cmpNl2 = vpadd_u16(cmpNl2, vdup_n_u16(0));
@@ -80,8 +80,8 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
  #endif
 
 
- uint8x16_t cmpAMasked = vandq_u8(cmpA, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
- uint8x16_t cmpBMasked = vandq_u8(cmpB, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
+ uint8x16_t cmpAMasked = vandq_u8(cmpA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+ uint8x16_t cmpBMasked = vandq_u8(cmpB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
  #ifdef __aarch64__
  uint8x16_t cmpMerge = vpaddq_u8(cmpAMasked, cmpBMasked);
  cmpMerge = vpaddq_u8(cmpMerge, cmpMerge);
@@ -95,7 +95,7 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
  memcpy(p, &firstChar, sizeof(firstChar));
  p += 4;
  mask ^= 1;
- cmpMerge = vbicq_u8(cmpMerge, (uint8x16_t){1,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0});
+ cmpMerge = vbicq_u8(cmpMerge, vmakeq_u8(1,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0));
  } else {
  firstChar |= 0x0a0d00;
  memcpy(p, &firstChar, sizeof(firstChar));
@@ -130,7 +130,7 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
  memcpy(p, &firstChar, sizeof(firstChar));
  p += 4;
  mask ^= 1;
- cmpPacked = vbic_u8(cmpPacked, (uint8x8_t){1,0,0,0, 0,0,0,0});
+ cmpPacked = vbic_u8(cmpPacked, vmake_u8(1,0,0,0, 0,0,0,0));
  } else {
  firstChar |= 0x0a0d00;
  memcpy(p, &firstChar, sizeof(firstChar));
@@ -198,7 +198,7 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
  #ifdef __aarch64__
  # ifdef _MSC_VER
  long bitIndex;
- if(_BitScanReverse64(&bitIndex, mask))
+ if(_BitScanReverse64((unsigned long*)&bitIndex, mask))
  bitIndex ^= 63;
  else
  bitIndex = 64;
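The added cast addresses the argument type on MSVC, whose _BitScanReverse64 intrinsic takes an unsigned long* while bitIndex is declared long. For reference, a hedged sketch of the "leading-zero count, or 64 for an empty mask" value this branch produces (illustrative only, not taken from the package):

    // Sketch only: mirrors what the #ifdef _MSC_VER branch above computes.
    #include <stdint.h>
    #ifdef _MSC_VER
    # include <intrin.h>
    #endif

    static inline unsigned clz64_or_64(uint64_t mask) {
    #ifdef _MSC_VER
        unsigned long idx;
        if(_BitScanReverse64(&idx, mask))
            return (unsigned)idx ^ 63; // position of highest set bit -> leading-zero count
        return 64;                     // mask was 0
    #else
        return mask ? (unsigned)__builtin_clzll(mask) : 64;
    #endif
    }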
@@ -217,11 +217,11 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
 
  uint8x16_t vClz = vdupq_n_u8(bitIndex & ~(sizeof(mask)*8));
  #ifdef __aarch64__
- uint8x16_t blendA = vcgtq_u8((uint8x16_t){63,62,61,60,51,50,49,48,47,46,45,44,35,34,33,32}, vClz);
- uint8x16_t blendB = vcgtq_u8((uint8x16_t){31,30,29,28,19,18,17,16,15,14,13,12, 3, 2, 1, 0}, vClz);
+ uint8x16_t blendA = vcgtq_u8(vmakeq_u8(63,62,61,60,51,50,49,48,47,46,45,44,35,34,33,32), vClz);
+ uint8x16_t blendB = vcgtq_u8(vmakeq_u8(31,30,29,28,19,18,17,16,15,14,13,12, 3, 2, 1, 0), vClz);
  #else
- uint8x16_t blendA = vcgtq_u8((uint8x16_t){31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16}, vClz);
- uint8x16_t blendB = vcgtq_u8((uint8x16_t){15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, vClz);
+ uint8x16_t blendA = vcgtq_u8(vmakeq_u8(31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16), vClz);
+ uint8x16_t blendB = vcgtq_u8(vmakeq_u8(15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), vClz);
  #endif
  uint8x16_t dataAShifted = vbslq_u8(cmpA, vdupq_n_u8('='), dataA);
  uint8x16_t dataBShifted = vbslq_u8(cmpB, vdupq_n_u8('='), dataB);
@@ -230,7 +230,7 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
  dataA = vbslq_u8(blendA, dataAShifted, dataA);
  dataB = vbslq_u8(blendB, dataBShifted, dataB);
 
- vst1q_u8_x2_unaligned(p, ((uint8x16x2_t){dataA, dataB}));
+ vst1q_u8_x2_unaligned(p, vcreate2_u8(dataA, dataB));
  p += sizeof(uint8x16_t)*2 - 1;
  p += (mask != 0);
  col = lineSizeOffset + (mask != 0);
@@ -296,14 +296,14 @@ HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const ui
  dataB = vaddq_u8(dataB, vdupq_n_u8(42));
  uint8x16_t cmpA = vqtbx1q_u8(
  cmpEqA,
- // \0 \n \r
- (uint8x16_t){255,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0},
+ // \0 \n \r
+ vmakeq_u8(255,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
  dataA
  );
  uint8x16_t cmpB = vqtbx1q_u8(
  cmpEqB,
- // \0 \n \r
- (uint8x16_t){255,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0},
+ // \0 \n \r
+ vmakeq_u8(255,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
  dataB
  );
 
@@ -338,8 +338,8 @@ HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const ui
 
 
  long bitIndex; // prevent compiler whining
- uint8x16_t cmpAMasked = vandq_u8(cmpA, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
- uint8x16_t cmpBMasked = vandq_u8(cmpB, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
+ uint8x16_t cmpAMasked = vandq_u8(cmpA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+ uint8x16_t cmpBMasked = vandq_u8(cmpB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
  #ifdef __aarch64__
  uint8x16_t cmpMerge = vpaddq_u8(cmpAMasked, cmpBMasked);
  cmpMerge = vpaddq_u8(cmpMerge, cmpMerge);
@@ -453,7 +453,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const ui
  #ifdef __aarch64__
  # ifdef _MSC_VER
  // does this work?
- if(_BitScanReverse64(&bitIndex, mask))
+ if(_BitScanReverse64((unsigned long*)&bitIndex, mask))
  bitIndex ^= 63;
  else
  bitIndex = 64;
@@ -472,11 +472,11 @@ HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const ui
 
  uint8x16_t vClz = vdupq_n_u8(bitIndex & ~(sizeof(mask)*8));
  #ifdef __aarch64__
- uint8x16_t blendA = vcgeq_u8((uint8x16_t){63,62,61,60,51,50,49,48,47,46,45,44,35,34,33,32}, vClz);
- uint8x16_t blendB = vcgeq_u8((uint8x16_t){31,30,29,28,19,18,17,16,15,14,13,12, 3, 2, 1, 0}, vClz);
+ uint8x16_t blendA = vcgeq_u8(vmakeq_u8(63,62,61,60,51,50,49,48,47,46,45,44,35,34,33,32), vClz);
+ uint8x16_t blendB = vcgeq_u8(vmakeq_u8(31,30,29,28,19,18,17,16,15,14,13,12, 3, 2, 1, 0), vClz);
  #else
- uint8x16_t blendA = vcgeq_u8((uint8x16_t){31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16}, vClz);
- uint8x16_t blendB = vcgeq_u8((uint8x16_t){15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, vClz);
+ uint8x16_t blendA = vcgeq_u8(vmakeq_u8(31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16), vClz);
+ uint8x16_t blendB = vcgeq_u8(vmakeq_u8(15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), vClz);
  #endif
  uint8x16_t dataAShifted = vextq_u8(dataA, dataA, 15);
  uint8x16_t dataBShifted = vextq_u8(dataA, dataB, 15);
@@ -485,7 +485,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const ui
  dataA = vbslq_u8(blendA, dataA, dataAShifted);
  outDataB = vbslq_u8(blendB, outDataB, dataBShifted);
 
- vst1q_u8_x2_unaligned(p, ((uint8x16x2_t){dataA, outDataB}));
+ vst1q_u8_x2_unaligned(p, vcreate2_u8(dataA, outDataB));
  p += sizeof(uint8x16_t)*2;
  // write last byte
  *p = vgetq_lane_u8(dataB, 15);
@@ -8,7 +8,7 @@
  # define _mm_mask_expand_epi8 _mm128_mask_expand_epi8
  #endif
 
- #if defined(__GNUC__) && __GNUC__ >= 7
+ #if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
  # define KLOAD16(a, offs) _load_mask16((__mmask16*)(a) + (offs))
  #else
  # define KLOAD16(a, offs) (((uint16_t*)(a))[(offs)])
@@ -155,7 +155,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
  if(len <= INPUT_OFFSET || line_size < XMM_SIZE) return;
 
  // slower CPUs prefer to branch as mispredict penalty is probably small relative to general execution
- #if defined(__tune_atom__) || defined(__tune_slm__) || defined(__tune_btver1__)
+ #if defined(__tune_atom__) || defined(__tune_slm__) || defined(__tune_btver1__) || defined(__tune_btver2__)
  const bool _PREFER_BRANCHING = true;
  #else
  const bool _PREFER_BRANCHING = (use_isa < ISA_LEVEL_SSSE3);
@@ -350,7 +350,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
  #if defined(__POPCNT__) && !defined(__tune_btver1__)
  if(use_isa & ISA_FEATURE_POPCNT) {
  shuf2Len = popcnt32(maskA) + 16;
- # if defined(__tune_znver2__) || defined(__tune_znver1__) || defined(__tune_btver2__)
+ # if defined(__tune_znver3__) || defined(__tune_znver2__) || defined(__tune_znver1__) || defined(__tune_btver2__)
  shuf1Len = popcnt32(m1) + 8;
  shuf3Len = popcnt32(m3) + shuf2Len + 8;
  # else
@@ -412,8 +412,8 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
  asm(
  "shrl $1, %[eqMask] \n"
  "shrl %%cl, %[eqMask] \n" // TODO: can use shrq to avoid above shift?
- # if defined(PLATFORM_AMD64)
- "adcq %[col], %[p] \n"
+ # if defined(PLATFORM_AMD64) && !defined(__ILP32__)
+ "adcq %q[col], %q[p] \n"
  # else
  "adcl %[col], %[p] \n"
  # endif
@@ -538,8 +538,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
 
  dataA = _mm_shuffle_epi8(dataA, shufMaskA);
 
- # if defined(__SSE4_1__) && !defined(__tune_slm__) && !defined(__tune_goldmont__) && !defined(__tune_goldmont_plus__)
- // unsure if worth on: Jaguar/Puma (3|2), Core2 (2|2)
+ # if defined(__SSE4_1__) && !defined(__tune_slm__) && !defined(__tune_goldmont__) && !defined(__tune_goldmont_plus__) && !defined(__tune_tremont__)
  if(use_isa >= ISA_LEVEL_SSE41) {
  dataB = _mm_blendv_epi8(dataBShifted, dataB, mergeMaskB);
  } else
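This hunk adds Tremont to the CPUs that skip the PBLENDVB path (the platform.cc change below likewise flags Tremont as having slow PBLENDVB). For context, a hedged sketch of the kind of SSE2-level byte select such a guard typically falls back to; the package's actual fallback sits outside this hunk, so this is illustrative only:

    // Illustration only: byte-wise select without PBLENDVB.
    // Picks bytes from b where the mask byte is 0xFF, otherwise from a --
    // the same result _mm_blendv_epi8(a, b, mask) gives for all-ones/all-zeros mask bytes.
    #include <emmintrin.h>

    static inline __m128i blendv_epi8_sse2(__m128i a, __m128i b, __m128i mask) {
        return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a));
    }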
@@ -717,7 +716,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
  }
  } while(i < 0);
 
- *colOffset = col + line_size -1;
+ *colOffset = (int)(col + line_size -1);
  dest = p;
  len = -(i - INPUT_OFFSET);
  }
@@ -0,0 +1,23 @@
+ #include "common.h"
+
+ #if defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
+ # ifndef YENC_DISABLE_AVX256
+ # include "encoder_avx_base.h"
+
+ void encoder_vbmi2_init() {
+ _do_encode = &do_encode_simd< do_encode_avx2<ISA_LEVEL_VBMI2> >;
+ encoder_avx2_lut<ISA_LEVEL_VBMI2>();
+ }
+ # else
+ # include "encoder_sse_base.h"
+ void encoder_vbmi2_init() {
+ _do_encode = &do_encode_simd< do_encode_sse<ISA_LEVEL_VBMI2> >;
+ encoder_sse_lut<ISA_LEVEL_VBMI2>();
+ }
+ # endif
+ #else
+ void encoder_avx2_init();
+ void encoder_vbmi2_init() {
+ encoder_avx2_init();
+ }
+ #endif
package/src/platform.cc CHANGED
@@ -2,16 +2,36 @@
  #ifdef PLATFORM_ARM
  # ifdef __ANDROID__
  # include <cpu-features.h>
- # elif defined(__linux__)
+ # elif defined(__linux__) || (defined(__FreeBSD__) && __FreeBSD__ >= 12)
  # include <sys/auxv.h>
  # include <asm/hwcap.h>
+ # elif (defined(__FreeBSD__) && __FreeBSD__ < 12)
+ # include <sys/sysctl.h>
+ # include <asm/hwcap.h>
+ # elif defined(_WIN32)
+ # define WIN32_LEAN_AND_MEAN
+ # define NOMINMAX
+ # include <Windows.h>
+ # elif defined(__APPLE__)
+ # include <sys/types.h>
+ # include <sys/sysctl.h>
  # endif
  bool cpu_supports_neon() {
  # if defined(AT_HWCAP)
- # ifdef __aarch64__
- return getauxval(AT_HWCAP) & HWCAP_ASIMD;
+ # ifdef __FreeBSD__
+ unsigned long supported;
+ elf_aux_info(AT_HWCAP, &supported, sizeof(supported));
+ # ifdef __aarch64__
+ return supported & HWCAP_ASIMD;
+ # else
+ return supported & HWCAP_NEON;
+ # endif
  # else
+ # ifdef __aarch64__
+ return getauxval(AT_HWCAP) & HWCAP_ASIMD;
+ # else
  return getauxval(AT_HWCAP) & HWCAP_NEON;
+ # endif
  # endif
  # elif defined(ANDROID_CPU_FAMILY_ARM)
  # ifdef __aarch64__
@@ -19,14 +39,23 @@ bool cpu_supports_neon() {
  # else
  return android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON;
  # endif
+ # elif defined(_WIN32)
+ return IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE);
+ # elif defined(__APPLE__)
+ int supported = 0;
+ size_t len = sizeof(supported);
+ if(sysctlbyname("hw.optional.neon", &supported, &len, NULL, 0))
+ return false;
+ return (bool)supported;
  # endif
- return true; // assume NEON support, if compiled as such, otherwise
+ return true; // assume NEON support, if compiled as such, otherwise (I think Windows and iOS require it)
  }
  #endif
 
 
  #ifdef PLATFORM_X86
  #ifdef _MSC_VER
+ # define _cpuid1(ar) __cpuid(ar, 1)
  # define _cpuid1x(ar) __cpuid(ar, 0x80000001)
  # if _MSC_VER >= 1600
  # define _cpuidX __cpuidex
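The runtime NEON check in cpu_supports_neon() above now covers FreeBSD (elf_aux_info), Windows (IsProcessorFeaturePresent) and Apple platforms (sysctlbyname) in addition to Linux and Android. A hypothetical caller-side sketch of how such a check is typically paired with the compile-time guard to pick a code path at startup; the init function names here are invented for illustration and are not part of this diff:

    // Hypothetical dispatch sketch; function names are illustrative only.
    extern bool cpu_supports_neon();
    void encoder_neon_init();    // assumed NEON-accelerated setup
    void encoder_generic_init(); // assumed scalar fallback

    static void pick_encoder() {
    #ifdef PLATFORM_ARM
        if(cpu_supports_neon()) { // built with NEON support and the CPU reports it
            encoder_neon_init();
            return;
        }
    #endif
        encoder_generic_init();
    }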
@@ -38,6 +67,8 @@ bool cpu_supports_neon() {
  # define _GET_XCR() 0
  # endif
  #else
+ # include <cpuid.h>
+ # define _cpuid1(ar) __cpuid(1, ar[0], ar[1], ar[2], ar[3])
  # define _cpuid1x(ar) __cpuid(0x80000001, ar[0], ar[1], ar[2], ar[3])
  # define _cpuidX(ar, eax, ecx) __cpuid_count(eax, ecx, ar[0], ar[1], ar[2], ar[3])
  static inline int _GET_XCR() {
@@ -84,11 +115,9 @@ int cpu_supports_isa() {
  // AMD Bobcat with slow SSSE3 instructions - pretend it doesn't exist
  return ret | ISA_LEVEL_SSE2;
 
- // Jaguar/Puma performance unkown (slowish PSHUFB/PBLENDVB)
-
  if((flags[2] & 0x200) == 0x200) { // SSSE3
- if(family == 6 && (model == 0x5c || model == 0x5f || model == 0x7a))
- // Intel Goldmont/plus with slow PBLENDVB
+ if(family == 6 && (model == 0x5c || model == 0x5f || model == 0x7a || model == 0x9c))
+ // Intel Goldmont/plus / Tremont with slow PBLENDVB
  return ret | ISA_LEVEL_SSSE3;
 
  if(flags[2] & 0x80000) { // SSE4.1
@@ -116,4 +145,24 @@ int cpu_supports_isa() {
  return ret | ISA_LEVEL_SSE2;
  }
 
+ int cpu_supports_crc_isa() {
+ int flags[4];
+ _cpuid1(flags);
+
+ if((flags[2] & 0x80202) == 0x80202) { // SSE4.1 + SSSE3 + CLMUL
+ if((flags[2] & 0x18000000) == 0x18000000) { // OSXSAVE + AVX
+ int xcr = _GET_XCR() & 0xff; // ignore unused bits
+ if((xcr & 6) == 6) { // AVX enabled
+ int cpuInfo[4];
+ _cpuidX(cpuInfo, 7, 0);
+ if((cpuInfo[1] & 0x20) == 0x20 && (cpuInfo[2] & 0x400) == 0x400) { // AVX2 + VPCLMULQDQ
+ return 2;
+ }
+ }
+ }
+ return 1;
+ }
+ return 0;
+ }
+
  #endif // PLATFORM_X86
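The new cpu_supports_crc_isa() grades carry-less-multiply support: 0 means no usable CLMUL path, 1 means SSE4.1 + SSSE3 + PCLMULQDQ, and 2 additionally requires OS-enabled AVX together with AVX2 and VPCLMULQDQ. A hypothetical caller-side sketch of dispatching on that value; the CRC init function names are invented for illustration:

    // Hypothetical sketch; not taken from the package.
    extern int cpu_supports_crc_isa();
    void crc_init_vpclmul(); // assumed 256-bit VPCLMULQDQ path
    void crc_init_pclmul();  // assumed 128-bit PCLMULQDQ path
    void crc_init_slice8();  // assumed table-based fallback

    static void pick_crc_impl() {
        switch(cpu_supports_crc_isa()) {
            case 2:  crc_init_vpclmul(); break;
            case 1:  crc_init_pclmul();  break;
            default: crc_init_slice8();  break;
        }
    }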
package/src/yencode.cc CHANGED
@@ -12,11 +12,6 @@
 
  using namespace v8;
 
- union crc32 {
- uint32_t u32;
- unsigned char u8a[4];
- };
-
  static void free_buffer(char* data, void* _size) {
  #if !NODE_VERSION_AT_LEAST(0, 11, 0)
  int size = (int)(size_t)_size;
@@ -252,7 +247,7 @@ FUNC(Decode) {
  isRaw = ARG_TO_BOOL(args[1]);
 
  unsigned char *result = (unsigned char*) malloc(arg_len);
- size_t len = (isRaw ? do_decode<true> : do_decode<false>)((const unsigned char*)node::Buffer::Data(args[0]), result, arg_len, NULL);
+ size_t len = do_decode(isRaw, (const unsigned char*)node::Buffer::Data(args[0]), result, arg_len, NULL);
  result = (unsigned char*)realloc(result, len);
  MARK_EXT_MEM(len);
  RETURN_VAL( NEW_BUFFER((char*)result, len, free_buffer, (void*)len) );
@@ -276,7 +271,7 @@ FUNC(DecodeTo) {
  if (args.Length() > 2)
  isRaw = ARG_TO_BOOL(args[2]);
 
- size_t len = (isRaw ? do_decode<true> : do_decode<false>)((const unsigned char*)node::Buffer::Data(args[0]), (unsigned char*)node::Buffer::Data(args[1]), arg_len, NULL);
+ size_t len = do_decode(isRaw, (const unsigned char*)node::Buffer::Data(args[0]), (unsigned char*)node::Buffer::Data(args[1]), arg_len, NULL);
  RETURN_VAL( Integer::New(ISOLATE len) );
  }
 
@@ -336,17 +331,23 @@ FUNC(DecodeIncr) {
  }
 
 
- #if NODE_VERSION_AT_LEAST(3, 0, 0)
- // for whatever reason, iojs 3 gives buffer corruption if you pass in a pointer without a free function
- #define RETURN_CRC(x) do { \
- Local<Object> buff = NEW_BUFFER(4); \
- memcpy(node::Buffer::Data(buff), &x.u32, sizeof(uint32_t)); \
- args.GetReturnValue().Set( buff ); \
- } while(0)
- #else
- #define RETURN_CRC(x) RETURN_VAL( NEW_BUFFER((char*)x.u8a, 4) )
+ static inline uint32_t read_crc32(const Local<Value>& buf) {
+ const uint8_t* arr = (const uint8_t*)node::Buffer::Data(buf);
+ return (((uint_fast32_t)arr[0] << 24) | ((uint_fast32_t)arr[1] << 16) | ((uint_fast32_t)arr[2] << 8) | (uint_fast32_t)arr[3]);
+ }
+ static inline Local<Object> pack_crc32(
+ #if NODE_VERSION_AT_LEAST(0, 11, 0)
+ Isolate* isolate,
  #endif
-
+ uint32_t crc) {
+ Local<Object> buff = NEW_BUFFER(4);
+ unsigned char* d = (unsigned char*)node::Buffer::Data(buff);
+ d[0] = (unsigned char)(crc >> 24) & 0xFF;
+ d[1] = (unsigned char)(crc >> 16) & 0xFF;
+ d[2] = (unsigned char)(crc >> 8) & 0xFF;
+ d[3] = (unsigned char)crc & 0xFF;
+ return buff;
+ }
 
  // crc32(str, init)
  FUNC(CRC32) {
@@ -356,25 +357,18 @@ FUNC(CRC32) {
  RETURN_ERROR("You must supply a Buffer");
  // TODO: support string args??
 
- union crc32 init;
- init.u32 = 0;
+ uint32_t crc = 0;
  if (args.Length() >= 2) {
  if (!node::Buffer::HasInstance(args[1]) || node::Buffer::Length(args[1]) != 4)
  RETURN_ERROR("Second argument must be a 4 byte buffer");
- memcpy(&init.u32, node::Buffer::Data(args[1]), sizeof(uint32_t));
- do_crc32_incremental(
- (const void*)node::Buffer::Data(args[0]),
- node::Buffer::Length(args[0]),
- init.u8a
- );
- } else {
- do_crc32(
- (const void*)node::Buffer::Data(args[0]),
- node::Buffer::Length(args[0]),
- init.u8a
- );
+ crc = read_crc32(args[1]);
  }
- RETURN_CRC(init);
+ crc = do_crc32(
+ (const void*)node::Buffer::Data(args[0]),
+ node::Buffer::Length(args[0]),
+ crc
+ );
+ RETURN_VAL(pack_crc32(ISOLATE crc));
  }
 
  FUNC(CRC32Combine) {
@@ -386,14 +380,11 @@ FUNC(CRC32Combine) {
  || !node::Buffer::HasInstance(args[1]) || node::Buffer::Length(args[1]) != 4)
  RETURN_ERROR("You must supply a 4 byte Buffer for the first two arguments");
 
- union crc32 crc1, crc2;
+ uint32_t crc1 = read_crc32(args[0]), crc2 = read_crc32(args[1]);
  size_t len = (size_t)ARG_TO_INT(args[2]);
 
- memcpy(&crc1.u32, node::Buffer::Data(args[0]), sizeof(uint32_t));
- memcpy(&crc2.u32, node::Buffer::Data(args[1]), sizeof(uint32_t));
-
- do_crc32_combine(crc1.u8a, crc2.u8a, len);
- RETURN_CRC(crc1);
+ crc1 = do_crc32_combine(crc1, crc2, len);
+ RETURN_VAL(pack_crc32(ISOLATE crc1));
  }
 
  FUNC(CRC32Zeroes) {
@@ -402,17 +393,15 @@ FUNC(CRC32Zeroes) {
  if (args.Length() < 1)
  RETURN_ERROR("At least 1 argument required");
 
- union crc32 crc1;
+ uint32_t crc1 = 0;
  if (args.Length() >= 2) {
  if (!node::Buffer::HasInstance(args[1]) || node::Buffer::Length(args[1]) != 4)
  RETURN_ERROR("Second argument must be a 4 byte buffer");
- memcpy(&crc1.u32, node::Buffer::Data(args[1]), sizeof(uint32_t));
- } else {
- crc1.u32 = 0;
+ crc1 = read_crc32(args[1]);
  }
  size_t len = (size_t)ARG_TO_INT(args[0]);
- do_crc32_zeros(crc1.u8a, len);
- RETURN_CRC(crc1);
+ crc1 = do_crc32_zeros(crc1, len);
+ RETURN_VAL(pack_crc32(ISOLATE crc1));
  }
 
  static void init_all() {
package/test/testcrc.js CHANGED
@@ -50,4 +50,18 @@ doTest('Random', 'crc32', 'fj[-oqijnw34-59n26 4345j8yn89032q78t9ab9gabh023quhoiB
  doTest('Random Continue', 'crc32', ['KZSHZ5EDOVAmDdakZZOrGSUGGKSpCJoWH7M0MHy6ohnSzvHY4DjpxXmyfWYJQoJ7tKdNhGcuRVUzrgXM', ycrc32('BdenbmoBgiB10ZkeUBjrsZV3dg2Da2fhHqU9TMdi69AHhLRck3Nk60YuFBXh6lvtefBpjdTxbeEmsaEm')], crc32('BdenbmoBgiB10ZkeUBjrsZV3dg2Da2fhHqU9TMdi69AHhLRck3Nk60YuFBXh6lvtefBpjdTxbeEmsaEmKZSHZ5EDOVAmDdakZZOrGSUGGKSpCJoWH7M0MHy6ohnSzvHY4DjpxXmyfWYJQoJ7tKdNhGcuRVUzrgXM'));
 
 
+ // random tests
+ for(var i=1; i<128; i++) {
+ var rand = require('crypto').pseudoRandomBytes(i);
+ doTest('Random Short Buffer', 'crc32', rand);
+ }
+ for(var i=0; i<32; i++) {
+ var rand = require('crypto').pseudoRandomBytes(100000);
+ doTest('Random Buffer', 'crc32', rand);
+
+ var split = Math.random()*rand.length;
+ doTest('Random Continue Buffer', 'crc32', [rand.slice(split), ycrc32(rand.slice(0, split))], crc32(rand));
+ }
+
+
  console.log('All tests passed');