yencode 1.1.0 → 1.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,10 +5,10 @@
  #include "encoder_common.h"
 
  // Clang wrongly assumes alignment on vst1q_u8_x2, and ARMv7 GCC doesn't support the function, so effectively, it can only be used in ARMv8 compilers
- #if defined(__aarch64__) && (defined(__clang__) || (defined(__GNUC__) && __GNUC__ >= 9))
+ #if defined(__aarch64__) && (defined(__clang__) || HEDLEY_GCC_VERSION_CHECK(8,5,0))
  # define vst1q_u8_x2_unaligned vst1q_u8_x2
  #else
- HEDLEY_ALWAYS_INLINE void vst1q_u8_x2_unaligned(uint8_t* p, uint8x16x2_t data) {
+ static HEDLEY_ALWAYS_INLINE void vst1q_u8_x2_unaligned(uint8_t* p, uint8x16x2_t data) {
  vst1q_u8(p, data.val[0]);
  vst1q_u8(p+16, data.val[1]);
  }
@@ -26,16 +26,16 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
  #ifdef __aarch64__
  uint8x16_t cmpA = vreinterpretq_u8_s8(vqtbx2q_s8(
  vdupq_n_s8('='-42),
- (int8x16x2_t){'\0'-42,-128,-128,'\0'-42,'\t'-42,'\n'-42,'\r'-42,'\t'-42,'\n'-42,'\r'-42,-128,-128,'\0'-42,-128,-128,-128, ' '-42,'\n'-42,'\r'-42,' '-42,-128,-128,-128,-128,-128,-128,'.'-42,-128,-128,-128,'='-42,-128},
- vreinterpretq_u8_s8(vhaddq_s8(vreinterpretq_s8_u8(dataA), (int8x16_t){42,48,66,66, 66,66,66,66, 66,66,66,66, 66,66,66,66}))
+ vcreate2_s8(vmakeq_s8('\0'-42,-128,-128,'\0'-42,'\t'-42,'\n'-42,'\r'-42,'\t'-42,'\n'-42,'\r'-42,-128,-128,'\0'-42,-128,-128,-128), vmakeq_s8(' '-42,'\n'-42,'\r'-42,' '-42,-128,-128,-128,-128,-128,-128,'.'-42,-128,-128,-128,'='-42,-128)),
+ vreinterpretq_u8_s8(vhaddq_s8(vreinterpretq_s8_u8(dataA), vmakeq_s8(42,48,66,66, 66,66,66,66, 66,66,66,66, 66,66,66,66)))
  ));
  cmpA = vceqq_u8(cmpA, dataA);
 
  dataB = vaddq_u8(oDataB, vdupq_n_u8(42));
  uint8x16_t cmpB = vqtbx1q_u8(
  vceqq_u8(oDataB, vdupq_n_u8('='-42)),
- // \0 \n \r
- (uint8x16_t){255,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0},
+ // \0 \n \r
+ vmakeq_u8(255,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
  dataB
  );
  dataA = vaddq_u8(dataA, vbslq_u8(cmpA, vdupq_n_u8(64+42), vdupq_n_u8(42)));
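Throughout the NEON encoder, 1.1.3 replaces GCC-style vector compound literals such as (uint8x16_t){...} with vmakeq_*/vmake_*/vcreate2_* helpers, presumably defined in a shared header that is not part of this diff; vector compound literals are a GNU extension that MSVC's ARM headers do not accept. A minimal, hypothetical sketch of what such a helper could look like (the package's actual helper may differ):

    // Hypothetical illustration only; not the package's actual helper.
    // Builds a uint8x16_t from 16 byte constants without relying on
    // (uint8x16_t){...} compound literals.
    #include <arm_neon.h>
    #include <stdint.h>

    static inline uint8x16_t vmakeq_u8_sketch(
            uint8_t e0, uint8_t e1, uint8_t e2,  uint8_t e3,  uint8_t e4,  uint8_t e5,  uint8_t e6,  uint8_t e7,
            uint8_t e8, uint8_t e9, uint8_t e10, uint8_t e11, uint8_t e12, uint8_t e13, uint8_t e14, uint8_t e15) {
        uint8_t tmp[16] = {e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15};
        return vld1q_u8(tmp); // load through a constant array; compilers typically fold this into a vector literal
    }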
@@ -64,9 +64,9 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
 
  // dup low 2 bytes & compare
  uint8x8_t firstTwoChars = vreinterpret_u8_u16(vdup_lane_u16(vreinterpret_u16_u8(vget_low_u8(oDataA)), 0));
- uint8x8_t cmpNl = vceq_u8(firstTwoChars, vreinterpret_u8_s8((int8x8_t){
- ' '-42,' '-42,'\t'-42,'\t'-42,'\r'-42,'.'-42,'='-42,'='-42
- }));
+ uint8x8_t cmpNl = vceq_u8(firstTwoChars, vmake_u8(
+ ' '+214,' '+214,'\t'+214,'\t'+214,'\r'+214,'.'-42,'='-42,'='-42
+ ));
  // use padd to merge comparisons
  uint16x4_t cmpNl2 = vreinterpret_u16_u8(cmpNl);
  cmpNl2 = vpadd_u16(cmpNl2, vdup_n_u16(0));
@@ -80,8 +80,8 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
  #endif
 
 
- uint8x16_t cmpAMasked = vandq_u8(cmpA, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
- uint8x16_t cmpBMasked = vandq_u8(cmpB, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
+ uint8x16_t cmpAMasked = vandq_u8(cmpA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+ uint8x16_t cmpBMasked = vandq_u8(cmpB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
  #ifdef __aarch64__
  uint8x16_t cmpMerge = vpaddq_u8(cmpAMasked, cmpBMasked);
  cmpMerge = vpaddq_u8(cmpMerge, cmpMerge);
@@ -95,7 +95,7 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
  memcpy(p, &firstChar, sizeof(firstChar));
  p += 4;
  mask ^= 1;
- cmpMerge = vbicq_u8(cmpMerge, (uint8x16_t){1,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0});
+ cmpMerge = vbicq_u8(cmpMerge, vmakeq_u8(1,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0));
  } else {
  firstChar |= 0x0a0d00;
  memcpy(p, &firstChar, sizeof(firstChar));
@@ -130,7 +130,7 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
  memcpy(p, &firstChar, sizeof(firstChar));
  p += 4;
  mask ^= 1;
- cmpPacked = vbic_u8(cmpPacked, (uint8x8_t){1,0,0,0, 0,0,0,0});
+ cmpPacked = vbic_u8(cmpPacked, vmake_u8(1,0,0,0, 0,0,0,0));
  } else {
  firstChar |= 0x0a0d00;
  memcpy(p, &firstChar, sizeof(firstChar));
@@ -198,7 +198,7 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
  #ifdef __aarch64__
  # ifdef _MSC_VER
  long bitIndex;
- if(_BitScanReverse64(&bitIndex, mask))
+ if(_BitScanReverse64((unsigned long*)&bitIndex, mask))
  bitIndex ^= 63;
  else
  bitIndex = 64;
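The added cast addresses the argument type on MSVC, whose _BitScanReverse64 intrinsic takes an unsigned long* while bitIndex is declared long. For reference, a hedged sketch of the "leading-zero count, or 64 for an empty mask" value this branch produces (illustrative only, not taken from the package):

    // Sketch only: mirrors what the #ifdef _MSC_VER branch above computes.
    #include <stdint.h>
    #ifdef _MSC_VER
    # include <intrin.h>
    #endif

    static inline unsigned clz64_or_64(uint64_t mask) {
    #ifdef _MSC_VER
        unsigned long idx;
        if(_BitScanReverse64(&idx, mask))
            return (unsigned)idx ^ 63; // position of highest set bit -> leading-zero count
        return 64;                     // mask was 0
    #else
        return mask ? (unsigned)__builtin_clzll(mask) : 64;
    #endif
    }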
@@ -217,11 +217,11 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
 
  uint8x16_t vClz = vdupq_n_u8(bitIndex & ~(sizeof(mask)*8));
  #ifdef __aarch64__
- uint8x16_t blendA = vcgtq_u8((uint8x16_t){63,62,61,60,51,50,49,48,47,46,45,44,35,34,33,32}, vClz);
- uint8x16_t blendB = vcgtq_u8((uint8x16_t){31,30,29,28,19,18,17,16,15,14,13,12, 3, 2, 1, 0}, vClz);
+ uint8x16_t blendA = vcgtq_u8(vmakeq_u8(63,62,61,60,51,50,49,48,47,46,45,44,35,34,33,32), vClz);
+ uint8x16_t blendB = vcgtq_u8(vmakeq_u8(31,30,29,28,19,18,17,16,15,14,13,12, 3, 2, 1, 0), vClz);
  #else
- uint8x16_t blendA = vcgtq_u8((uint8x16_t){31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16}, vClz);
- uint8x16_t blendB = vcgtq_u8((uint8x16_t){15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, vClz);
+ uint8x16_t blendA = vcgtq_u8(vmakeq_u8(31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16), vClz);
+ uint8x16_t blendB = vcgtq_u8(vmakeq_u8(15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), vClz);
  #endif
  uint8x16_t dataAShifted = vbslq_u8(cmpA, vdupq_n_u8('='), dataA);
  uint8x16_t dataBShifted = vbslq_u8(cmpB, vdupq_n_u8('='), dataB);
@@ -230,7 +230,7 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
  dataA = vbslq_u8(blendA, dataAShifted, dataA);
  dataB = vbslq_u8(blendB, dataBShifted, dataB);
 
- vst1q_u8_x2_unaligned(p, ((uint8x16x2_t){dataA, dataB}));
+ vst1q_u8_x2_unaligned(p, vcreate2_u8(dataA, dataB));
  p += sizeof(uint8x16_t)*2 - 1;
  p += (mask != 0);
  col = lineSizeOffset + (mask != 0);
@@ -296,14 +296,14 @@ HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const ui
  dataB = vaddq_u8(dataB, vdupq_n_u8(42));
  uint8x16_t cmpA = vqtbx1q_u8(
  cmpEqA,
- // \0 \n \r
- (uint8x16_t){255,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0},
+ // \0 \n \r
+ vmakeq_u8(255,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
  dataA
  );
  uint8x16_t cmpB = vqtbx1q_u8(
  cmpEqB,
- // \0 \n \r
- (uint8x16_t){255,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0},
+ // \0 \n \r
+ vmakeq_u8(255,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
  dataB
  );
 
@@ -338,8 +338,8 @@ HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const ui
 
 
  long bitIndex; // prevent compiler whining
- uint8x16_t cmpAMasked = vandq_u8(cmpA, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
- uint8x16_t cmpBMasked = vandq_u8(cmpB, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
+ uint8x16_t cmpAMasked = vandq_u8(cmpA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+ uint8x16_t cmpBMasked = vandq_u8(cmpB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
  #ifdef __aarch64__
  uint8x16_t cmpMerge = vpaddq_u8(cmpAMasked, cmpBMasked);
  cmpMerge = vpaddq_u8(cmpMerge, cmpMerge);
@@ -453,7 +453,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const ui
  #ifdef __aarch64__
  # ifdef _MSC_VER
  // does this work?
- if(_BitScanReverse64(&bitIndex, mask))
+ if(_BitScanReverse64((unsigned long*)&bitIndex, mask))
  bitIndex ^= 63;
  else
  bitIndex = 64;
@@ -472,11 +472,11 @@ HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const ui
 
  uint8x16_t vClz = vdupq_n_u8(bitIndex & ~(sizeof(mask)*8));
  #ifdef __aarch64__
- uint8x16_t blendA = vcgeq_u8((uint8x16_t){63,62,61,60,51,50,49,48,47,46,45,44,35,34,33,32}, vClz);
- uint8x16_t blendB = vcgeq_u8((uint8x16_t){31,30,29,28,19,18,17,16,15,14,13,12, 3, 2, 1, 0}, vClz);
+ uint8x16_t blendA = vcgeq_u8(vmakeq_u8(63,62,61,60,51,50,49,48,47,46,45,44,35,34,33,32), vClz);
+ uint8x16_t blendB = vcgeq_u8(vmakeq_u8(31,30,29,28,19,18,17,16,15,14,13,12, 3, 2, 1, 0), vClz);
  #else
- uint8x16_t blendA = vcgeq_u8((uint8x16_t){31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16}, vClz);
- uint8x16_t blendB = vcgeq_u8((uint8x16_t){15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, vClz);
+ uint8x16_t blendA = vcgeq_u8(vmakeq_u8(31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16), vClz);
+ uint8x16_t blendB = vcgeq_u8(vmakeq_u8(15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), vClz);
  #endif
  uint8x16_t dataAShifted = vextq_u8(dataA, dataA, 15);
  uint8x16_t dataBShifted = vextq_u8(dataA, dataB, 15);
@@ -485,7 +485,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const ui
  dataA = vbslq_u8(blendA, dataA, dataAShifted);
  outDataB = vbslq_u8(blendB, outDataB, dataBShifted);
 
- vst1q_u8_x2_unaligned(p, ((uint8x16x2_t){dataA, outDataB}));
+ vst1q_u8_x2_unaligned(p, vcreate2_u8(dataA, outDataB));
  p += sizeof(uint8x16_t)*2;
  // write last byte
  *p = vgetq_lane_u8(dataB, 15);
@@ -8,7 +8,7 @@
  # define _mm_mask_expand_epi8 _mm128_mask_expand_epi8
  #endif
 
- #if defined(__GNUC__) && __GNUC__ >= 7
+ #if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
  # define KLOAD16(a, offs) _load_mask16((__mmask16*)(a) + (offs))
  #else
  # define KLOAD16(a, offs) (((uint16_t*)(a))[(offs)])
@@ -155,7 +155,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
  if(len <= INPUT_OFFSET || line_size < XMM_SIZE) return;
 
  // slower CPUs prefer to branch as mispredict penalty is probably small relative to general execution
- #if defined(__tune_atom__) || defined(__tune_slm__) || defined(__tune_btver1__)
+ #if defined(__tune_atom__) || defined(__tune_slm__) || defined(__tune_btver1__) || defined(__tune_btver2__)
  const bool _PREFER_BRANCHING = true;
  #else
  const bool _PREFER_BRANCHING = (use_isa < ISA_LEVEL_SSSE3);
@@ -350,7 +350,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
  #if defined(__POPCNT__) && !defined(__tune_btver1__)
  if(use_isa & ISA_FEATURE_POPCNT) {
  shuf2Len = popcnt32(maskA) + 16;
- # if defined(__tune_znver2__) || defined(__tune_znver1__) || defined(__tune_btver2__)
+ # if defined(__tune_znver3__) || defined(__tune_znver2__) || defined(__tune_znver1__) || defined(__tune_btver2__)
  shuf1Len = popcnt32(m1) + 8;
  shuf3Len = popcnt32(m3) + shuf2Len + 8;
  # else
@@ -412,8 +412,8 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
  asm(
  "shrl $1, %[eqMask] \n"
  "shrl %%cl, %[eqMask] \n" // TODO: can use shrq to avoid above shift?
- # if defined(PLATFORM_AMD64)
- "adcq %[col], %[p] \n"
+ # if defined(PLATFORM_AMD64) && !defined(__ILP32__)
+ "adcq %q[col], %q[p] \n"
  # else
  "adcl %[col], %[p] \n"
  # endif
@@ -538,8 +538,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
 
  dataA = _mm_shuffle_epi8(dataA, shufMaskA);
 
- # if defined(__SSE4_1__) && !defined(__tune_slm__) && !defined(__tune_goldmont__) && !defined(__tune_goldmont_plus__)
- // unsure if worth on: Jaguar/Puma (3|2), Core2 (2|2)
+ # if defined(__SSE4_1__) && !defined(__tune_slm__) && !defined(__tune_goldmont__) && !defined(__tune_goldmont_plus__) && !defined(__tune_tremont__)
  if(use_isa >= ISA_LEVEL_SSE41) {
  dataB = _mm_blendv_epi8(dataBShifted, dataB, mergeMaskB);
  } else
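This hunk adds Tremont to the CPUs that skip the PBLENDVB path (the platform.cc change below likewise flags Tremont as having slow PBLENDVB). For context, a hedged sketch of the kind of SSE2-level byte select such a guard typically falls back to; the package's actual fallback sits outside this hunk, so this is illustrative only:

    // Illustration only: byte-wise select without PBLENDVB.
    // Picks bytes from b where the mask byte is 0xFF, otherwise from a --
    // the same result _mm_blendv_epi8(a, b, mask) gives for all-ones/all-zeros mask bytes.
    #include <emmintrin.h>

    static inline __m128i blendv_epi8_sse2(__m128i a, __m128i b, __m128i mask) {
        return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a));
    }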
@@ -717,7 +716,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
  }
  } while(i < 0);
 
- *colOffset = col + line_size -1;
+ *colOffset = (int)(col + line_size -1);
  dest = p;
  len = -(i - INPUT_OFFSET);
  }
@@ -0,0 +1,23 @@
+ #include "common.h"
+
+ #if defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
+ # ifndef YENC_DISABLE_AVX256
+ # include "encoder_avx_base.h"
+
+ void encoder_vbmi2_init() {
+ _do_encode = &do_encode_simd< do_encode_avx2<ISA_LEVEL_VBMI2> >;
+ encoder_avx2_lut<ISA_LEVEL_VBMI2>();
+ }
+ # else
+ # include "encoder_sse_base.h"
+ void encoder_vbmi2_init() {
+ _do_encode = &do_encode_simd< do_encode_sse<ISA_LEVEL_VBMI2> >;
+ encoder_sse_lut<ISA_LEVEL_VBMI2>();
+ }
+ # endif
+ #else
+ void encoder_avx2_init();
+ void encoder_vbmi2_init() {
+ encoder_avx2_init();
+ }
+ #endif
package/src/platform.cc CHANGED
@@ -2,16 +2,36 @@
  #ifdef PLATFORM_ARM
  # ifdef __ANDROID__
  # include <cpu-features.h>
- # elif defined(__linux__)
+ # elif defined(__linux__) || (defined(__FreeBSD__) && __FreeBSD__ >= 12)
  # include <sys/auxv.h>
  # include <asm/hwcap.h>
+ # elif (defined(__FreeBSD__) && __FreeBSD__ < 12)
+ # include <sys/sysctl.h>
+ # include <asm/hwcap.h>
+ # elif defined(_WIN32)
+ # define WIN32_LEAN_AND_MEAN
+ # define NOMINMAX
+ # include <Windows.h>
+ # elif defined(__APPLE__)
+ # include <sys/types.h>
+ # include <sys/sysctl.h>
  # endif
  bool cpu_supports_neon() {
  # if defined(AT_HWCAP)
- # ifdef __aarch64__
- return getauxval(AT_HWCAP) & HWCAP_ASIMD;
+ # ifdef __FreeBSD__
+ unsigned long supported;
+ elf_aux_info(AT_HWCAP, &supported, sizeof(supported));
+ # ifdef __aarch64__
+ return supported & HWCAP_ASIMD;
+ # else
+ return supported & HWCAP_NEON;
+ # endif
  # else
+ # ifdef __aarch64__
+ return getauxval(AT_HWCAP) & HWCAP_ASIMD;
+ # else
  return getauxval(AT_HWCAP) & HWCAP_NEON;
+ # endif
  # endif
  # elif defined(ANDROID_CPU_FAMILY_ARM)
  # ifdef __aarch64__
@@ -19,14 +39,23 @@ bool cpu_supports_neon() {
  # else
  return android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON;
  # endif
+ # elif defined(_WIN32)
+ return IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE);
+ # elif defined(__APPLE__)
+ int supported = 0;
+ size_t len = sizeof(supported);
+ if(sysctlbyname("hw.optional.neon", &supported, &len, NULL, 0))
+ return false;
+ return (bool)supported;
  # endif
- return true; // assume NEON support, if compiled as such, otherwise
+ return true; // assume NEON support, if compiled as such, otherwise (I think Windows and iOS require it)
  }
  #endif
 
 
  #ifdef PLATFORM_X86
  #ifdef _MSC_VER
+ # define _cpuid1(ar) __cpuid(ar, 1)
  # define _cpuid1x(ar) __cpuid(ar, 0x80000001)
  # if _MSC_VER >= 1600
  # define _cpuidX __cpuidex
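The runtime NEON check in cpu_supports_neon() above now covers FreeBSD (elf_aux_info), Windows (IsProcessorFeaturePresent) and Apple platforms (sysctlbyname) in addition to Linux and Android. A hypothetical caller-side sketch of how such a check is typically paired with the compile-time guard to pick a code path at startup; the init function names here are invented for illustration and are not part of this diff:

    // Hypothetical dispatch sketch; function names are illustrative only.
    extern bool cpu_supports_neon();
    void encoder_neon_init();    // assumed NEON-accelerated setup
    void encoder_generic_init(); // assumed scalar fallback

    static void pick_encoder() {
    #ifdef PLATFORM_ARM
        if(cpu_supports_neon()) { // built with NEON support and the CPU reports it
            encoder_neon_init();
            return;
        }
    #endif
        encoder_generic_init();
    }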
@@ -38,6 +67,8 @@ bool cpu_supports_neon() {
  # define _GET_XCR() 0
  # endif
  #else
+ # include <cpuid.h>
+ # define _cpuid1(ar) __cpuid(1, ar[0], ar[1], ar[2], ar[3])
  # define _cpuid1x(ar) __cpuid(0x80000001, ar[0], ar[1], ar[2], ar[3])
  # define _cpuidX(ar, eax, ecx) __cpuid_count(eax, ecx, ar[0], ar[1], ar[2], ar[3])
  static inline int _GET_XCR() {
@@ -84,11 +115,9 @@ int cpu_supports_isa() {
  // AMD Bobcat with slow SSSE3 instructions - pretend it doesn't exist
  return ret | ISA_LEVEL_SSE2;
 
- // Jaguar/Puma performance unkown (slowish PSHUFB/PBLENDVB)
-
  if((flags[2] & 0x200) == 0x200) { // SSSE3
- if(family == 6 && (model == 0x5c || model == 0x5f || model == 0x7a))
- // Intel Goldmont/plus with slow PBLENDVB
+ if(family == 6 && (model == 0x5c || model == 0x5f || model == 0x7a || model == 0x9c))
+ // Intel Goldmont/plus / Tremont with slow PBLENDVB
  return ret | ISA_LEVEL_SSSE3;
 
  if(flags[2] & 0x80000) { // SSE4.1
@@ -116,4 +145,24 @@ int cpu_supports_isa() {
  return ret | ISA_LEVEL_SSE2;
  }
 
+ int cpu_supports_crc_isa() {
+ int flags[4];
+ _cpuid1(flags);
+
+ if((flags[2] & 0x80202) == 0x80202) { // SSE4.1 + SSSE3 + CLMUL
+ if((flags[2] & 0x18000000) == 0x18000000) { // OSXSAVE + AVX
+ int xcr = _GET_XCR() & 0xff; // ignore unused bits
+ if((xcr & 6) == 6) { // AVX enabled
+ int cpuInfo[4];
+ _cpuidX(cpuInfo, 7, 0);
+ if((cpuInfo[1] & 0x20) == 0x20 && (cpuInfo[2] & 0x400) == 0x400) { // AVX2 + VPCLMULQDQ
+ return 2;
+ }
+ }
+ }
+ return 1;
+ }
+ return 0;
+ }
+
  #endif // PLATFORM_X86
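The new cpu_supports_crc_isa() grades carry-less-multiply support: 0 means no usable CLMUL path, 1 means SSE4.1 + SSSE3 + PCLMULQDQ, and 2 additionally requires OS-enabled AVX together with AVX2 and VPCLMULQDQ. A hypothetical caller-side sketch of dispatching on that value; the CRC init function names are invented for illustration:

    // Hypothetical sketch; not taken from the package.
    extern int cpu_supports_crc_isa();
    void crc_init_vpclmul(); // assumed 256-bit VPCLMULQDQ path
    void crc_init_pclmul();  // assumed 128-bit PCLMULQDQ path
    void crc_init_slice8();  // assumed table-based fallback

    static void pick_crc_impl() {
        switch(cpu_supports_crc_isa()) {
            case 2:  crc_init_vpclmul(); break;
            case 1:  crc_init_pclmul();  break;
            default: crc_init_slice8();  break;
        }
    }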
package/src/yencode.cc CHANGED
@@ -12,11 +12,6 @@
 
  using namespace v8;
 
- union crc32 {
- uint32_t u32;
- unsigned char u8a[4];
- };
-
  static void free_buffer(char* data, void* _size) {
  #if !NODE_VERSION_AT_LEAST(0, 11, 0)
  int size = (int)(size_t)_size;
@@ -252,7 +247,7 @@ FUNC(Decode) {
  isRaw = ARG_TO_BOOL(args[1]);
 
  unsigned char *result = (unsigned char*) malloc(arg_len);
- size_t len = (isRaw ? do_decode<true> : do_decode<false>)((const unsigned char*)node::Buffer::Data(args[0]), result, arg_len, NULL);
+ size_t len = do_decode(isRaw, (const unsigned char*)node::Buffer::Data(args[0]), result, arg_len, NULL);
  result = (unsigned char*)realloc(result, len);
  MARK_EXT_MEM(len);
  RETURN_VAL( NEW_BUFFER((char*)result, len, free_buffer, (void*)len) );
@@ -276,7 +271,7 @@ FUNC(DecodeTo) {
  if (args.Length() > 2)
  isRaw = ARG_TO_BOOL(args[2]);
 
- size_t len = (isRaw ? do_decode<true> : do_decode<false>)((const unsigned char*)node::Buffer::Data(args[0]), (unsigned char*)node::Buffer::Data(args[1]), arg_len, NULL);
+ size_t len = do_decode(isRaw, (const unsigned char*)node::Buffer::Data(args[0]), (unsigned char*)node::Buffer::Data(args[1]), arg_len, NULL);
  RETURN_VAL( Integer::New(ISOLATE len) );
  }
 
@@ -336,17 +331,23 @@ FUNC(DecodeIncr) {
  }
 
 
- #if NODE_VERSION_AT_LEAST(3, 0, 0)
- // for whatever reason, iojs 3 gives buffer corruption if you pass in a pointer without a free function
- #define RETURN_CRC(x) do { \
- Local<Object> buff = NEW_BUFFER(4); \
- memcpy(node::Buffer::Data(buff), &x.u32, sizeof(uint32_t)); \
- args.GetReturnValue().Set( buff ); \
- } while(0)
- #else
- #define RETURN_CRC(x) RETURN_VAL( NEW_BUFFER((char*)x.u8a, 4) )
+ static inline uint32_t read_crc32(const Local<Value>& buf) {
+ const uint8_t* arr = (const uint8_t*)node::Buffer::Data(buf);
+ return (((uint_fast32_t)arr[0] << 24) | ((uint_fast32_t)arr[1] << 16) | ((uint_fast32_t)arr[2] << 8) | (uint_fast32_t)arr[3]);
+ }
+ static inline Local<Object> pack_crc32(
+ #if NODE_VERSION_AT_LEAST(0, 11, 0)
+ Isolate* isolate,
  #endif
-
+ uint32_t crc) {
+ Local<Object> buff = NEW_BUFFER(4);
+ unsigned char* d = (unsigned char*)node::Buffer::Data(buff);
+ d[0] = (unsigned char)(crc >> 24) & 0xFF;
+ d[1] = (unsigned char)(crc >> 16) & 0xFF;
+ d[2] = (unsigned char)(crc >> 8) & 0xFF;
+ d[3] = (unsigned char)crc & 0xFF;
+ return buff;
+ }
 
  // crc32(str, init)
  FUNC(CRC32) {
@@ -356,25 +357,18 @@ FUNC(CRC32) {
  RETURN_ERROR("You must supply a Buffer");
  // TODO: support string args??
 
- union crc32 init;
- init.u32 = 0;
+ uint32_t crc = 0;
  if (args.Length() >= 2) {
  if (!node::Buffer::HasInstance(args[1]) || node::Buffer::Length(args[1]) != 4)
  RETURN_ERROR("Second argument must be a 4 byte buffer");
- memcpy(&init.u32, node::Buffer::Data(args[1]), sizeof(uint32_t));
- do_crc32_incremental(
- (const void*)node::Buffer::Data(args[0]),
- node::Buffer::Length(args[0]),
- init.u8a
- );
- } else {
- do_crc32(
- (const void*)node::Buffer::Data(args[0]),
- node::Buffer::Length(args[0]),
- init.u8a
- );
+ crc = read_crc32(args[1]);
  }
- RETURN_CRC(init);
+ crc = do_crc32(
+ (const void*)node::Buffer::Data(args[0]),
+ node::Buffer::Length(args[0]),
+ crc
+ );
+ RETURN_VAL(pack_crc32(ISOLATE crc));
  }
 
  FUNC(CRC32Combine) {
@@ -386,14 +380,11 @@ FUNC(CRC32Combine) {
  || !node::Buffer::HasInstance(args[1]) || node::Buffer::Length(args[1]) != 4)
  RETURN_ERROR("You must supply a 4 byte Buffer for the first two arguments");
 
- union crc32 crc1, crc2;
+ uint32_t crc1 = read_crc32(args[0]), crc2 = read_crc32(args[1]);
  size_t len = (size_t)ARG_TO_INT(args[2]);
 
- memcpy(&crc1.u32, node::Buffer::Data(args[0]), sizeof(uint32_t));
- memcpy(&crc2.u32, node::Buffer::Data(args[1]), sizeof(uint32_t));
-
- do_crc32_combine(crc1.u8a, crc2.u8a, len);
- RETURN_CRC(crc1);
+ crc1 = do_crc32_combine(crc1, crc2, len);
+ RETURN_VAL(pack_crc32(ISOLATE crc1));
  }
 
  FUNC(CRC32Zeroes) {
@@ -402,17 +393,15 @@ FUNC(CRC32Zeroes) {
  if (args.Length() < 1)
  RETURN_ERROR("At least 1 argument required");
 
- union crc32 crc1;
+ uint32_t crc1 = 0;
  if (args.Length() >= 2) {
  if (!node::Buffer::HasInstance(args[1]) || node::Buffer::Length(args[1]) != 4)
  RETURN_ERROR("Second argument must be a 4 byte buffer");
- memcpy(&crc1.u32, node::Buffer::Data(args[1]), sizeof(uint32_t));
- } else {
- crc1.u32 = 0;
+ crc1 = read_crc32(args[1]);
  }
  size_t len = (size_t)ARG_TO_INT(args[0]);
- do_crc32_zeros(crc1.u8a, len);
- RETURN_CRC(crc1);
+ crc1 = do_crc32_zeros(crc1, len);
+ RETURN_VAL(pack_crc32(ISOLATE crc1));
  }
 
  static void init_all() {
package/test/testcrc.js CHANGED
@@ -50,4 +50,18 @@ doTest('Random', 'crc32', 'fj[-oqijnw34-59n26 4345j8yn89032q78t9ab9gabh023quhoiB
  doTest('Random Continue', 'crc32', ['KZSHZ5EDOVAmDdakZZOrGSUGGKSpCJoWH7M0MHy6ohnSzvHY4DjpxXmyfWYJQoJ7tKdNhGcuRVUzrgXM', ycrc32('BdenbmoBgiB10ZkeUBjrsZV3dg2Da2fhHqU9TMdi69AHhLRck3Nk60YuFBXh6lvtefBpjdTxbeEmsaEm')], crc32('BdenbmoBgiB10ZkeUBjrsZV3dg2Da2fhHqU9TMdi69AHhLRck3Nk60YuFBXh6lvtefBpjdTxbeEmsaEmKZSHZ5EDOVAmDdakZZOrGSUGGKSpCJoWH7M0MHy6ohnSzvHY4DjpxXmyfWYJQoJ7tKdNhGcuRVUzrgXM'));
 
 
+ // random tests
+ for(var i=1; i<128; i++) {
+ var rand = require('crypto').pseudoRandomBytes(i);
+ doTest('Random Short Buffer', 'crc32', rand);
+ }
+ for(var i=0; i<32; i++) {
+ var rand = require('crypto').pseudoRandomBytes(100000);
+ doTest('Random Buffer', 'crc32', rand);
+
+ var split = Math.random()*rand.length;
+ doTest('Random Continue Buffer', 'crc32', [rand.slice(split), ycrc32(rand.slice(0, split))], crc32(rand));
+ }
+
+
  console.log('All tests passed');