yencode 1.2.0 → 1.2.1
This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between versions as they appear in the public registry.
- package/binding.gyp +37 -1
- package/package.json +1 -1
- package/src/common.h +18 -6
- package/src/crc.cc +42 -33
- package/src/crc.h +16 -14
- package/src/crc_arm.cc +7 -8
- package/src/crc_arm_pmull.cc +215 -0
- package/src/crc_common.h +13 -2
- package/src/crc_folding.cc +5 -5
- package/src/crc_folding_256.cc +2 -4
- package/src/crc_riscv.cc +7 -7
- package/src/decoder.cc +342 -12
- package/src/decoder.h +10 -14
- package/src/decoder_avx.cc +3 -4
- package/src/decoder_avx2.cc +7 -8
- package/src/decoder_avx2_base.h +6 -2
- package/src/decoder_common.h +34 -338
- package/src/decoder_neon.cc +10 -6
- package/src/decoder_neon64.cc +9 -5
- package/src/decoder_rvv.cc +47 -41
- package/src/decoder_sse2.cc +4 -4
- package/src/decoder_sse_base.h +20 -12
- package/src/decoder_ssse3.cc +3 -4
- package/src/decoder_vbmi2.cc +6 -8
- package/src/encoder.cc +19 -28
- package/src/encoder.h +5 -7
- package/src/encoder_avx.cc +3 -3
- package/src/encoder_avx2.cc +3 -3
- package/src/encoder_avx_base.h +3 -0
- package/src/encoder_common.h +26 -14
- package/src/encoder_neon.cc +6 -3
- package/src/encoder_rvv.cc +9 -7
- package/src/encoder_sse2.cc +3 -2
- package/src/encoder_sse_base.h +2 -0
- package/src/encoder_ssse3.cc +3 -3
- package/src/encoder_vbmi2.cc +6 -7
- package/src/platform.cc +24 -23
- package/src/yencode.cc +9 -8
- package/test/_speedbase.js +4 -2
- package/test/speeddec.js +25 -16
- package/test/speedenc.js +21 -17
package/binding.gyp
CHANGED
@@ -78,7 +78,7 @@
   "targets": [
     {
       "target_name": "yencode",
-      "dependencies": ["yencode_sse2", "yencode_ssse3", "yencode_clmul", "yencode_clmul256", "yencode_avx", "yencode_avx2", "yencode_vbmi2", "yencode_neon", "yencode_armcrc", "yencode_rvv", "yencode_zbkc"],
+      "dependencies": ["yencode_sse2", "yencode_ssse3", "yencode_clmul", "yencode_clmul256", "yencode_avx", "yencode_avx2", "yencode_vbmi2", "yencode_neon", "yencode_armcrc", "yencode_pmull", "yencode_rvv", "yencode_zbkc"],
       "sources": [
         "src/yencode.cc",
         "src/platform.cc",
@@ -416,6 +416,42 @@
       }]
     ]
   },
+  {
+    "target_name": "yencode_pmull",
+    "type": "static_library",
+    "sources": [
+      "src/crc_arm_pmull.cc"
+    ],
+    "cflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
+    "cxxflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
+    "xcode_settings": {
+      "OTHER_CFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
+      "OTHER_CXXFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"]
+    },
+    "msvs_settings": {"VCCLCompilerTool": {"BufferSecurityCheck": "false"}},
+    "conditions": [
+      ['target_arch in "arm arm64"', {
+        "cflags!": ["-march=native"],
+        "cxxflags!": ["-march=native"],
+        "cflags": ["-march=armv8-a+crc+crypto"],
+        "cxxflags": ["-march=armv8-a+crc+crypto"],
+        "xcode_settings": {
+          "OTHER_CFLAGS!": ["-march=native"],
+          "OTHER_CXXFLAGS!": ["-march=native"],
+          "OTHER_CFLAGS": ["-march=armv8-a+crc+crypto"],
+          "OTHER_CXXFLAGS": ["-march=armv8-a+crc+crypto"],
+        }
+      }],
+      ['OS!="win" and target_arch=="arm"', {
+        "cflags": ["-mfpu=neon","-fno-lto"],
+        "cxxflags": ["-mfpu=neon","-fno-lto"],
+        "xcode_settings": {
+          "OTHER_CFLAGS": ["-mfpu=neon","-fno-lto"],
+          "OTHER_CXXFLAGS": ["-mfpu=neon","-fno-lto"]
+        }
+      }]
+    ]
+  },
   {
     "target_name": "yencode_zbkc",
     "type": "static_library",
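
The new yencode_pmull target mirrors the other per-ISA static libraries: the PMULL code is compiled in isolation with -march=armv8-a+crc+crypto, so the rest of the addon needs no crypto flags and the accelerated path is only selected after a runtime CPU check. A minimal sketch of this compile-separately/dispatch-at-runtime pattern (all names here are illustrative, not the package's actual symbols):

#include <cstdint>
#include <cstddef>

typedef uint32_t (*crc_func)(const void*, size_t, uint32_t);

uint32_t crc32_generic(const void*, size_t, uint32_t); // plain build, always safe
uint32_t crc32_pmull(const void*, size_t, uint32_t);   // defined in the +crypto library
bool cpu_has_pmull();                                  // runtime probe (hwcaps/sysctl)

static crc_func active_crc32 = &crc32_generic;

void init_crc_dispatch() {
	// Linking the PMULL object unconditionally is fine; it must only be
	// *called* once the CPU is known to support the instructions.
	if(cpu_has_pmull())
		active_crc32 = &crc32_pmull;
}
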
package/package.json
CHANGED
package/src/common.h
CHANGED
@@ -125,7 +125,7 @@
 #ifdef __POPCNT__
 #include <nmmintrin.h>
 // POPCNT can never return a negative result, but GCC doesn't seem to realise this, so typecast it to hint it better
-#define popcnt32 (unsigned int)_mm_popcnt_u32
+#define popcnt32 (unsigned int)_mm_popcnt_u32
 #endif
 
 #if defined(__AVX2__) || defined(__AVX512F__)
@@ -209,7 +209,9 @@ static HEDLEY_ALWAYS_INLINE uint8x16x4_t vcreate4_u8(uint8x16_t a, uint8x16_t b,
 # undef _CREATE_TUPLE
 #endif
 #ifdef PLATFORM_ARM
-bool cpu_supports_neon();
+namespace RapidYenc {
+	bool cpu_supports_neon();
+}
 #endif
 
 #ifdef _MSC_VER
@@ -240,6 +242,7 @@ enum YEncDecIsaLevel {
 enum YEncDecIsaLevel {
 	ISA_GENERIC = 0,
 	ISA_FEATURE_CRC = 8,
+	ISA_FEATURE_PMULL = 0x40,
 	ISA_LEVEL_NEON = 0x1000
 };
 #elif defined(__riscv)
@@ -274,7 +277,7 @@ enum YEncDecIsaLevel {
 # if defined(__POPCNT__)
 #  if defined(__LZCNT__)
 #   define ISA_NATIVE (enum YEncDecIsaLevel)(_ISA_NATIVE | ISA_FEATURE_POPCNT | ISA_FEATURE_LZCNT)
-#  else
+#  else
 #   define ISA_NATIVE (enum YEncDecIsaLevel)(_ISA_NATIVE | ISA_FEATURE_POPCNT)
 #  endif
 # else
@@ -282,12 +285,17 @@ enum YEncDecIsaLevel {
 # endif
 #endif
 
-int cpu_supports_isa();
+namespace RapidYenc {
+	int cpu_supports_isa();
+	int cpu_supports_crc_isa();
+}
 #endif // PLATFORM_X86
 
 
 #ifdef __riscv
-bool cpu_supports_rvv();
+namespace RapidYenc {
+	bool cpu_supports_rvv();
+}
 #endif
 #if defined(__riscv_vector) && defined(HEDLEY_GCC_VERSION) && !HEDLEY_GCC_VERSION_CHECK(13,0,0)
 // GCC added RVV intrinsics in GCC13
@@ -318,7 +326,11 @@ bool cpu_supports_rvv();
 # include <stddef.h>
 #else
 /* Workaround for older MSVC not supporting stdint.h - just pull it from V8 */
-# include "stdint.h"
+# if defined(NODE_GYP_MODULE_NAME) || defined(V8_DEPRECATION_WARNINGS)
+#  include <v8.h>
+# else
+#  include "stdint.h"
+# endif
 #endif
 
 
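
ISA_FEATURE_PMULL = 0x40 gives PMULL its own bit in the ARM variant of YEncDecIsaLevel, alongside ISA_FEATURE_CRC (8) and ISA_LEVEL_NEON (0x1000); distinct bits let one integer describe several independent capabilities. Illustrative use of the mask (the enum comes from the diff above, the variable is hypothetical):

int isa = ISA_FEATURE_CRC | ISA_FEATURE_PMULL; // what runtime detection reported
if(isa & ISA_FEATURE_PMULL) {
	// CRC folding via carryless polynomial multiply is available
}
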
package/src/crc.cc
CHANGED
@@ -133,6 +133,7 @@ static void generate_crc32_slice_table() {
 #endif
 
 
+namespace RapidYenc {
 
 // workaround MSVC complaining "unary minus operator applied to unsigned type, result still unsigned"
 #define NEGATE(n) (uint32_t)(-((int32_t)(n)))
@@ -180,9 +181,10 @@ uint32_t crc32_shift_generic(uint32_t crc1, uint32_t n) {
 #endif
 	return result;
 }
+} // namespace
 
 
-extern "C" {
+namespace RapidYenc {
 crc_func _do_crc32_incremental = &do_crc32_incremental_generic;
 crc_mul_func _crc32_shift = &crc32_shift_generic;
 crc_mul_func _crc32_multiply = &crc32_multiply_generic;
@@ -191,15 +193,6 @@ extern "C" {
 
 
 
-void crc_clmul_set_funcs();
-void crc_clmul256_set_funcs();
-void crc_arm_set_funcs();
-void crc_riscv_set_funcs();
-
-#ifdef PLATFORM_X86
-int cpu_supports_crc_isa();
-#endif
-
 #if defined(PLATFORM_ARM) && defined(_WIN32)
 # define WIN32_LEAN_AND_MEAN
 # include <Windows.h>
@@ -234,7 +227,7 @@ static unsigned long getauxval(unsigned long cap) {
 # endif
 #endif
 
-void crc_init() {
+void RapidYenc::crc32_init() {
 	GENERIC_CRC_INIT;
 
 #ifdef PLATFORM_X86
@@ -246,31 +239,47 @@ void crc_init() {
 #endif
 #ifdef PLATFORM_ARM
 # ifdef __APPLE__
-	int supported = 0;
-	size_t len = sizeof(supported);
-	if(sysctlbyname("hw.optional.armv8_crc32", &supported, &len, NULL, 0))
-		supported = 0;
-# endif
-	if(
-# if defined(AT_HWCAP2) && defined(HWCAP2_CRC32)
-		getauxval(AT_HWCAP2) & HWCAP2_CRC32
-# elif defined(AT_HWCAP) && defined(HWCAP_CRC32)
-		getauxval(AT_HWCAP) & HWCAP_CRC32
-# elif defined(ANDROID_CPU_FAMILY_ARM) && defined(__aarch64__)
-		android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_CRC32
-# elif defined(ANDROID_CPU_FAMILY_ARM) /* aarch32 */
-		android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_CRC32
-# elif defined(_WIN32)
-		IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE)
-# elif defined(__APPLE__)
-		supported
-# elif defined(__ARM_FEATURE_CRC32)
-		true /* assume available if compiled as such */
+	int supports_crc = 0;
+	int supports_pmull = 0;
+	size_t len = sizeof(supports_crc);
+	if(sysctlbyname("hw.optional.armv8_crc32", &supports_crc, &len, NULL, 0))
+		supports_crc = 0;
+	if(sysctlbyname("hw.optional.arm.FEAT_PMULL", &supports_pmull, &len, NULL, 0))
+		supports_pmull = 0;
 # else
-		false
+	bool supports_crc = false;
+	bool supports_pmull = false;
+# if defined(AT_HWCAP2) && defined(HWCAP2_CRC32)
+	supports_crc = getauxval(AT_HWCAP2) & HWCAP2_CRC32;
+# elif defined(AT_HWCAP) && defined(HWCAP_CRC32)
+	supports_crc = getauxval(AT_HWCAP) & HWCAP_CRC32;
+# elif defined(ANDROID_CPU_FAMILY_ARM) && defined(__aarch64__)
+	supports_crc = android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_CRC32;
+	supports_pmull = android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_PMULL;
+# elif defined(ANDROID_CPU_FAMILY_ARM) /* aarch32 */
+	supports_crc = android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_CRC32;
+	supports_pmull = android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_PMULL;
+# elif defined(_WIN32)
+	supports_crc = IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE);
+	supports_pmull = IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
+# else
+#ifdef __ARM_FEATURE_CRC32
+	supports_crc = true; /* assume available if compiled as such */
+#endif
+#ifdef __ARM_FEATURE_CRYPTO
+	supports_pmull = true;
+#endif
+# endif
+# if defined(AT_HWCAP2) && defined(HWCAP2_PMULL)
+	supports_pmull = getauxval(AT_HWCAP2) & HWCAP2_PMULL;
+# elif defined(AT_HWCAP) && defined(HWCAP_PMULL)
+	supports_pmull = getauxval(AT_HWCAP) & HWCAP_PMULL;
+# endif
 # endif
-	) {
+
+	if(supports_crc) {
 		crc_arm_set_funcs();
+		if(supports_pmull) crc_pmull_set_funcs();
 	}
 #endif
 #ifdef __riscv
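
crc32_init() now probes CRC and PMULL support separately: sysctlbyname on Apple platforms, getauxval hwcaps on Linux, android_getCpuFeatures on Android, IsProcessorFeaturePresent on Windows, with compile-time macros as the last resort. A standalone sketch of just the Linux/AArch64 path, assuming <sys/auxv.h> and the kernel's <asm/hwcap.h> are available:

#include <sys/auxv.h>
#include <asm/hwcap.h> // HWCAP_CRC32 / HWCAP_PMULL on aarch64 Linux

static void probe_arm_features(bool* crc, bool* pmull) {
	unsigned long hw = getauxval(AT_HWCAP);
	*crc = (hw & HWCAP_CRC32) != 0;   // CRC32B/H/W/X instructions
	*pmull = (hw & HWCAP_PMULL) != 0; // 64x64 -> 128-bit carryless multiply
}
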
package/src/crc.h
CHANGED
@@ -2,25 +2,25 @@
 #define __YENC_CRC_H
 #include <stdlib.h> // for llabs
 
-#ifdef __cplusplus
-extern "C" {
+#if !defined(__GNUC__) && defined(_MSC_VER)
+# include <intrin.h>
 #endif
 
+namespace RapidYenc {
 
 
 typedef uint32_t (*crc_func)(const void*, size_t, uint32_t);
 extern crc_func _do_crc32_incremental;
 
 extern int _crc32_isa;
-
+static inline uint32_t crc32(const void* data, size_t length, uint32_t init) {
+	return (*_do_crc32_incremental)(data, length, init);
+}
 static inline int crc32_isa_level() {
 	return _crc32_isa;
 }
 
 
-#if !defined(__GNUC__) && defined(_MSC_VER)
-# include <intrin.h>
-#endif
 // computes `n % 0xffffffff` (well, almost), using some bit-hacks
 static inline uint32_t crc32_powmod(uint64_t n) {
 #ifdef __GNUC__
@@ -28,7 +28,7 @@ static inline uint32_t crc32_powmod(uint64_t n) {
 	unsigned carry = __builtin_uadd_overflow(n >> 32, n, &res);
 	res += carry;
 	return res;
-#elif defined(_MSC_VER)
+#elif defined(_MSC_VER) && defined(PLATFORM_X86)
 	unsigned res;
 	unsigned char carry = _addcarry_u32(0, n >> 32, n, &res);
 	_addcarry_u32(carry, res, 0, &res);
@@ -59,8 +59,12 @@ static inline uint32_t crc32_bytepow(uint64_t n) {
 typedef uint32_t (*crc_mul_func)(uint32_t, uint32_t);
 extern crc_mul_func _crc32_shift;
 extern crc_mul_func _crc32_multiply;
-
-
+static inline uint32_t crc32_shift(uint32_t a, uint32_t b) {
+	return (*_crc32_shift)(a, b);
+}
+static inline uint32_t crc32_multiply(uint32_t a, uint32_t b) {
+	return (*_crc32_multiply)(a, b);
+}
 
 static inline uint32_t crc32_combine(uint32_t crc1, uint32_t crc2, uint64_t len2) {
 	return crc32_shift(crc1, crc32_bytepow(len2)) ^ crc2;
@@ -79,11 +83,9 @@ static inline uint32_t crc32_256pow(uint64_t n) {
 	return crc32_shift(0x80000000, crc32_bytepow(n));
 }
 
-void crc_init();
+void crc32_init();
 
 
 
-#ifdef __cplusplus
-}
-#endif
-#endif
+} // namespace
+#endif // defined(__YENC_CRC_H)
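
The new inline wrappers make the combine helpers above callable without touching the function pointers directly; crc32_combine(crc1, crc2, len2) stitches together CRCs of independently checksummed chunks without re-reading the data. A hypothetical usage sketch:

// Assumes the declarations from crc.h above; buf points to len bytes.
uint32_t crc_of_concat(const unsigned char* buf, size_t len) {
	size_t half = len / 2;
	uint32_t a = RapidYenc::crc32(buf, half, 0);
	uint32_t b = RapidYenc::crc32(buf + half, len - half, 0);
	// yields the same value as RapidYenc::crc32(buf, len, 0)
	return RapidYenc::crc32_combine(a, b, len - half);
}
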
package/src/crc_arm.cc
CHANGED
@@ -61,7 +61,7 @@ HEDLEY_WARNING("CRC32 acceleration has been disabled due to missing arm_acle.h")
 
 
 #ifdef __aarch64__
-uint32_t crc32_multiply_arm(uint32_t a, uint32_t b) {
+static uint32_t crc32_multiply_arm(uint32_t a, uint32_t b) {
 	// perform PMULL
 	uint64_t res = 0;
 	uint64_t a64 = (uint64_t)a << 32;
@@ -86,8 +86,7 @@ uint32_t crc32_multiply_arm(uint32_t a, uint32_t b) {
 
 #ifdef ENABLE_PIPELINE_OPT
 #ifndef __aarch64__
-uint32_t crc32_multiply_generic(uint32_t a, uint32_t b);
-# define crc32_multiply_arm crc32_multiply_generic
+# define crc32_multiply_arm RapidYenc::crc32_multiply_generic
 #endif
 #endif
 
@@ -124,7 +123,7 @@ static uint32_t arm_crc_calc(uint32_t crc, const unsigned char *src, long len) {
 	// (this is a slightly less efficient, but much simpler implementation of the idea)
 	const unsigned SPLIT_WORDS_LOG = 10; // make sure it's at least 2
 	const unsigned SPLIT_WORDS = 1<<SPLIT_WORDS_LOG;
-	const unsigned blockCoeff = crc_power[SPLIT_WORDS_LOG + WORDSIZE_LOG + 3];
+	const unsigned blockCoeff = RapidYenc::crc_power[SPLIT_WORDS_LOG + WORDSIZE_LOG + 3];
 	while(len >= (long)(sizeof(WORD_T)*SPLIT_WORDS*2)) {
 		// compute 2x CRCs concurrently to leverage piplining
 		uint32_t crc2 = 0;
@@ -196,7 +195,7 @@ static uint32_t do_crc32_incremental_arm(const void* data, size_t length, uint32
 
 
 #if defined(__aarch64__) && (defined(__GNUC__) || defined(_MSC_VER))
-uint32_t crc32_shift_arm(uint32_t crc1, uint32_t n) {
+static uint32_t crc32_shift_arm(uint32_t crc1, uint32_t n) {
 	uint32_t result = crc1;
 	uint64_t prod = result;
 	prod <<= 32 - (n&31);
@@ -204,7 +203,7 @@ uint32_t crc32_shift_arm(uint32_t crc1, uint32_t n) {
 	n &= ~31;
 
 	while(n) {
-		result = crc32_multiply_arm(result, crc_power[ctz32(n)]);
+		result = crc32_multiply_arm(result, RapidYenc::crc_power[ctz32(n)]);
 		n &= n-1;
 	}
 	return result;
@@ -212,7 +211,7 @@ uint32_t crc32_shift_arm(uint32_t crc1, uint32_t n) {
 #endif
 
 
-void crc_arm_set_funcs() {
+void RapidYenc::crc_arm_set_funcs() {
 	_do_crc32_incremental = &do_crc32_incremental_arm;
 #ifdef __aarch64__
 	_crc32_multiply = &crc32_multiply_arm;
@@ -223,5 +222,5 @@ void crc_arm_set_funcs() {
 	_crc32_isa = ISA_FEATURE_CRC;
 }
 #else
-void crc_arm_set_funcs() {}
+void RapidYenc::crc_arm_set_funcs() {}
 #endif
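
The blockCoeff lines above belong to the pipelining trick in arm_crc_calc: the block is split in two, both halves are CRC'd in a single interleaved loop (hardware CRC instructions have multi-cycle latency, so two independent dependency chains keep the unit busy), and the first half's CRC is then advanced past the second half with one multiply by a precomputed power of x. A simplified sketch, assuming __crc32d from <arm_acle.h> and the crc32_multiply helper from crc.h:

#include <string.h>

static uint32_t crc_two_streams(const unsigned char* p, size_t half,
                                uint32_t crc, uint32_t blockCoeff) {
	uint32_t crc2 = 0;
	for(size_t i = 0; i + 8 <= half; i += 8) {
		uint64_t w1, w2;
		memcpy(&w1, p + i, 8);        // memcpy avoids unaligned-load UB
		memcpy(&w2, p + half + i, 8);
		crc  = __crc32d(crc,  w1);    // chain 1
		crc2 = __crc32d(crc2, w2);    // chain 2, independent of chain 1
	}
	// blockCoeff represents x^(8*half) mod P: shift crc past the second block
	return RapidYenc::crc32_multiply(crc, blockCoeff) ^ crc2;
}
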
package/src/crc_arm_pmull.cc
ADDED
@@ -0,0 +1,215 @@
+#include "crc_common.h"
+
+// exclude broken/missing arm_acle.h
+#if defined(__ARM_FEATURE_CRYPTO) && defined(HEDLEY_GCC_VERSION)
+# if !defined(__aarch64__) && HEDLEY_GCC_VERSION_CHECK(7,0,0) && !HEDLEY_GCC_VERSION_CHECK(8,1,1)
+#  undef __ARM_FEATURE_CRYPTO
+# endif
+# if defined(__aarch64__) && HEDLEY_GCC_VERSION_CHECK(9,4,0) && !HEDLEY_GCC_VERSION_CHECK(9,5,0)
+#  undef __ARM_FEATURE_CRYPTO
+# endif
+#endif
+#if defined(__ARM_FEATURE_CRYPTO) && defined(__has_include)
+# if !__has_include(<arm_acle.h>)
+#  undef __ARM_FEATURE_CRYPTO
+# endif
+#endif
+
+// ARM's intrinsics guide seems to suggest that vmull_p64 is available on A32, but neither Clang/GCC seem to support it on AArch32
+#if (defined(__ARM_FEATURE_CRYPTO) && defined(__ARM_FEATURE_CRC32) && defined(__aarch64__)) || (defined(_M_ARM64) && !defined(__clang__))
+
+#include <arm_neon.h>
+#if defined(_MSC_VER) && !defined(__clang__)
+# include <intrin.h>
+
+# ifdef _M_ARM64
+// MSVC may detect this pattern: https://devblogs.microsoft.com/cppblog/a-tour-of-4-msvc-backend-improvements/#byteswap-identification
+static HEDLEY_ALWAYS_INLINE uint64_t rbit64(uint64_t x) {
+	x = _byteswap_uint64(x);
+	x = (x & 0xaaaaaaaaaaaaaaaa) >> 1 | (x & 0x5555555555555555) << 1;
+	x = (x & 0xcccccccccccccccc) >> 2 | (x & 0x3333333333333333) << 2;
+	x = (x & 0xf0f0f0f0f0f0f0f0) >> 4 | (x & 0x0f0f0f0f0f0f0f0f) << 4;
+	return x;
+}
+// ...whilst this seems to work best for 32-bit RBIT
+static HEDLEY_ALWAYS_INLINE uint32_t rbit32(uint32_t x) {
+	uint64_t r = rbit64(x);
+	return r >> 32;
+}
+# else
+#  define rbit32 _arm_rbit
+# endif
+#else
+# include <arm_acle.h>
+// __rbit not present before GCC 11.4.0 or 12.2.0; for ARM32, requires GCC 14
+# if defined(HEDLEY_GCC_VERSION) && !HEDLEY_GCC_VERSION_CHECK(14,0,0) && (!defined(__aarch64__) || !HEDLEY_GCC_VERSION_CHECK(11,3,0) || (HEDLEY_GCC_VERSION_CHECK(12,0,0) && !HEDLEY_GCC_VERSION_CHECK(12,2,0)))
+#  ifdef __aarch64__
+static HEDLEY_ALWAYS_INLINE uint64_t rbit64(uint64_t x) {
+	uint64_t r;
+	__asm__ ("rbit %0,%1\n"
+		: "=r"(r) : "r"(x)
+		: /* No clobbers */);
+	return r;
+}
+#  endif
+static HEDLEY_ALWAYS_INLINE uint32_t rbit32(uint32_t x) {
+	uint32_t r;
+	__asm__ (
+#  ifdef __aarch64__
+		"rbit %w0,%w1\n"
+#  else
+		"rbit %0,%1\n"
+#  endif
+		: "=r"(r) : "r"(x)
+		: /* No clobbers */);
+	return r;
+}
+# else
+#  define rbit32 __rbit
+#  define rbit64 __rbitll
+# endif
+#endif
+
+
+// MSVC doesn't have poly64/poly128 types, so always use uint64 instead
+
+#ifdef __aarch64__
+# if defined(__GNUC__) || defined(__clang__)
+static HEDLEY_ALWAYS_INLINE uint64x2_t pmull_low(uint64x1_t a, uint64x1_t b) {
+	uint64x2_t result;
+	__asm__ ("pmull %0.1q,%1.1d,%2.1d"
+		: "=w"(result)
+		: "w"(a), "w"(b)
+		: /* No clobbers */);
+	return result;
+}
+static HEDLEY_ALWAYS_INLINE uint64x2_t pmull_high(uint64x2_t a, uint64x2_t b) {
+	uint64x2_t result;
+	__asm__ ("pmull2 %0.1q,%1.2d,%2.2d"
+		: "=w"(result)
+		: "w"(a), "w"(b)
+		: /* No clobbers */);
+	return result;
+}
+# elif defined(_MSC_VER) && !defined(__clang__)
+#  define pmull_low vmull_p64
+#  define pmull_high vmull_high_p64
+# else
+#  define pmull_low(x, y) vreinterpretq_u64_p128(vmull_p64(vreinterpret_p64_u64(x), vreinterpret_p64_u64(y)))
+#  define pmull_high(x, y) vreinterpretq_u64_p128(vmull_high_p64(vreinterpretq_p64_u64(x), vreinterpretq_p64_u64(y)))
+# endif
+#else
+# if defined(_MSC_VER) && !defined(__clang__)
+#  define pmull_low vmull_p64
+#  define pmull_high(x, y) vmull_p64(vget_high_u64(x), vget_high_u64(y))
+# else
+#  define pmull_low(x, y) vreinterpretq_u64_p128(vmull_p64(x, y))
+#  define pmull_high(x, y) vreinterpretq_u64_p128(vmull_p64(vget_high_p64(vreinterpretq_p64_u64(x)), vget_high_p64(vreinterpretq_p64_u64(y))))
+# endif
+#endif
+
+
+static uint32_t crc32_multiply_pmull(uint32_t a, uint32_t b) {
+	uint64x1_t prod = vget_low_u64(pmull_low(
+		vreinterpret_u64_u32(vset_lane_u32(a, vdup_n_u32(0), 0)),
+		vreinterpret_u64_u32(vset_lane_u32(b, vdup_n_u32(0), 0))
+	));
+#ifdef __aarch64__
+	uint64_t p = vget_lane_u64(prod, 0);
+	return __crc32w(0, p+p) ^ (p >> 31);
+#else
+	prod = vadd_u64(prod, prod);
+	uint32x2_t prod32 = vreinterpret_u32_u64(prod);
+	return __crc32w(0, vget_lane_u32(prod32, 0)) ^ vget_lane_u32(prod32, 1);
+#endif
+}
+
+
+
+static const uint32_t crc_power_rev[32] = { // bit-reversed crc_power
+	0x00000002, 0x00000004, 0x00000010, 0x00000100, 0x00010000, 0x04c11db7, 0x490d678d, 0xe8a45605,
+	0x75be46b7, 0xe6228b11, 0x567fddeb, 0x88fe2237, 0x0e857e71, 0x7001e426, 0x075de2b2, 0xf12a7f90,
+	0xf0b4a1c1, 0x58f46c0c, 0xc3395ade, 0x96837f8c, 0x544037f9, 0x23b7b136, 0xb2e16ba8, 0x725e7bfa,
+	0xec709b5d, 0xf77a7274, 0x2845d572, 0x034e2515, 0x79695942, 0x540cb128, 0x0b65d023, 0x3c344723
+};
+
+
+static HEDLEY_ALWAYS_INLINE uint64x1_t crc32_shift_pmull_mulred(uint64x1_t a, uint64x1_t b) {
+	uint64x2_t r = pmull_low(a, b);
+	uint64x2_t h = pmull_high(r, vdupq_n_u64(0x490d678d));
+	return veor_u64(vget_low_u64(r), vget_low_u64(h));
+}
+
+
+static uint32_t crc32_shift_pmull(uint32_t crc1, uint32_t n) {
+	crc1 = rbit32(crc1);
+
+	uint64x1_t res;
+#ifdef __aarch64__
+	uint64_t crc = (uint64_t)crc1 << (n & 31);
+	res = vset_lane_u64(crc, vdup_n_u64(0), 0);
+#else
+	res = vreinterpret_u64_u32(vset_lane_u32(crc1, vdup_n_u32(0), 0));
+	res = vshl_u64(res, vdup_n_u64(n&31));
+#endif
+	n &= ~31;
+
+	if(n) {
+#define LOAD_NEXT_POWER vreinterpret_u64_u32(vset_lane_u32(crc_power_rev[ctz32(n)], vdup_n_u32(0), 0))
+		uint64x1_t res2 = LOAD_NEXT_POWER;
+		n &= n-1;
+
+		if(n) {
+			// first multiply doesn't need reduction
+			res2 = vget_low_u64(pmull_low(res2, LOAD_NEXT_POWER));
+			n &= n-1;
+
+			while(n) {
+				res = crc32_shift_pmull_mulred(res, LOAD_NEXT_POWER);
+				n &= n-1;
+
+				if(n) {
+					res2 = crc32_shift_pmull_mulred(res2, LOAD_NEXT_POWER);
+					n &= n-1;
+				}
+			}
+		}
+#undef LOAD_NEXT_POWER
+
+		// merge two results
+		uint64x2_t prod = pmull_low(res, res2);
+		// weirdly, vrbitq_u8 is missing in ARM32 MSVC
+		prod = vreinterpretq_u64_u8(vrev64q_u8(vrbitq_u8(vreinterpretq_u8_u64(prod))));
+#ifdef __aarch64__
+		crc = __crc32d(0, vgetq_lane_u64(prod, 1));
+		uint64_t rem = vgetq_lane_u64(prod, 0);
+		crc = __crc32w(rem, crc) ^ (rem >> 32);
+#else
+		uint32x4_t prod32 = vreinterpretq_u32_u64(prod);
+		uint32_t crc = __crc32w(0, vgetq_lane_u32(prod32, 2));
+		crc = __crc32w(vgetq_lane_u32(prod32, 3), crc);
+		crc = __crc32w(vgetq_lane_u32(prod32, 0), crc) ^ vgetq_lane_u32(prod32, 1);
+#endif
+		return crc;
+	} else {
+#ifdef __aarch64__
+		crc = rbit64(crc);
+		crc = __crc32w(0, crc) ^ (crc >> 32);
+		return crc;
+#else
+		uint32x2_t r = vreinterpret_u32_u64(res);
+		return __crc32w(0, rbit32(vget_lane_u32(r, 1))) ^ rbit32(vget_lane_u32(r, 0));
+#endif
+	}
+}
+
+
+void RapidYenc::crc_pmull_set_funcs() {
+	_crc32_multiply = &crc32_multiply_pmull;
+	_crc32_shift = &crc32_shift_pmull;
+	_crc32_isa &= ISA_FEATURE_PMULL;
+}
+
+#else
+void RapidYenc::crc_pmull_set_funcs() {}
+#endif /* defined(__ARM_FEATURE_CRYPTO) && defined(__ARM_FEATURE_CRC32) */
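
crc32_shift_pmull works in the bit-reversed domain (hence rbit32 and the crc_power_rev table) so PMULL's carryless product lines up with the reflected CRC-32 polynomial, and it advances two chains (res, res2) to overlap multiply latencies. The underlying identity is the same one crc32_shift_arm uses: appending n zero bits multiplies the CRC by x^n mod P, and since x^n is the product of x^(2^k) over the set bits k of n, a 32-entry table of precomputed powers covers any n. A scalar sketch of that set-bit walk, where mul and power[] stand in for whichever multiply implementation and table are active, and ctz32 is the count-trailing-zeros helper declared in crc_common.h:

static uint32_t shift_by_bits(uint32_t crc, uint32_t n,
                              uint32_t (*mul)(uint32_t, uint32_t),
                              const uint32_t power[32]) { // power[k] = x^(2^k) mod P
	while(n) {
		crc = mul(crc, power[ctz32(n)]); // consume the lowest set bit of n
		n &= n - 1;                      // clear it
	}
	return crc;
}
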
package/src/crc_common.h
CHANGED
@@ -2,8 +2,6 @@
 #include <stddef.h> // for size_t
 #include "crc.h"
 
-extern const uint32_t crc_power[32];
-
 #ifdef __GNUC__
 # define ctz32 __builtin_ctz
 #elif defined(_MSC_VER)
@@ -13,3 +11,16 @@ static HEDLEY_ALWAYS_INLINE unsigned ctz32(uint32_t n) {
 	return r;
 }
 #endif
+
+namespace RapidYenc {
+	void crc_clmul_set_funcs();
+	void crc_clmul256_set_funcs();
+	void crc_arm_set_funcs();
+	void crc_pmull_set_funcs();
+	void crc_riscv_set_funcs();
+
+	extern const uint32_t crc_power[32];
+	uint32_t crc32_multiply_generic(uint32_t a, uint32_t b);
+	uint32_t crc32_shift_generic(uint32_t crc1, uint32_t n);
+
+}
package/src/crc_folding.cc
CHANGED
@@ -365,7 +365,7 @@ static HEDLEY_ALWAYS_INLINE __m128i crc32_reduce(__m128i prod) {
 	return t;
 }
 
-uint32_t crc32_multiply_clmul(uint32_t a, uint32_t b) {
+static uint32_t crc32_multiply_clmul(uint32_t a, uint32_t b) {
 	// do the actual multiply
 	__m128i prod = _mm_clmulepi64_si128(_mm_cvtsi32_si128(a), _mm_cvtsi32_si128(b), 0);
 
@@ -418,7 +418,7 @@ static HEDLEY_ALWAYS_INLINE __m128i reverse_bits_epi8(__m128i src) {
 
 
 
-const uint32_t crc_power_rev[32] = { // bit-reversed crc_power
+static const uint32_t crc_power_rev[32] = { // bit-reversed crc_power
 	0x00000002, 0x00000004, 0x00000010, 0x00000100, 0x00010000, 0x04c11db7, 0x490d678d, 0xe8a45605,
 	0x75be46b7, 0xe6228b11, 0x567fddeb, 0x88fe2237, 0x0e857e71, 0x7001e426, 0x075de2b2, 0xf12a7f90,
 	0xf0b4a1c1, 0x58f46c0c, 0xc3395ade, 0x96837f8c, 0x544037f9, 0x23b7b136, 0xb2e16ba8, 0x725e7bfa,
@@ -436,7 +436,7 @@ static HEDLEY_ALWAYS_INLINE __m128i crc32_shift_clmul_mulred(unsigned pos, __m12
 	return _mm_xor_si128(hi, prod);
 }
 
-uint32_t crc32_shift_clmul(uint32_t crc1, uint32_t n) {
+static uint32_t crc32_shift_clmul(uint32_t crc1, uint32_t n) {
 	if(!n) return crc1;
 
 	__m128i result = _mm_cvtsi32_si128(BSWAP32(crc1));
@@ -499,7 +499,7 @@ uint32_t crc32_shift_clmul(uint32_t crc1, uint32_t n) {
 #endif
 
 
-void crc_clmul_set_funcs() {
+void RapidYenc::crc_clmul_set_funcs() {
 	_do_crc32_incremental = &do_crc32_incremental_clmul;
 	_crc32_multiply = &crc32_multiply_clmul;
 #if defined(__GNUC__) || defined(_MSC_VER)
@@ -508,6 +508,6 @@ void crc_clmul_set_funcs() {
 	_crc32_isa = ISA_LEVEL_PCLMUL;
 }
 #else
-void crc_clmul_set_funcs() {}
+void RapidYenc::crc_clmul_set_funcs() {}
 #endif
 
package/src/crc_folding_256.cc
CHANGED
@@ -1,8 +1,6 @@
 // 256-bit version of crc_folding
 
 #include "crc_common.h"
-
-void crc_clmul_set_funcs();
 
 #if !defined(YENC_DISABLE_AVX256) && ((defined(__VPCLMULQDQ__) && defined(__AVX2__) && defined(__PCLMUL__)) || (defined(_MSC_VER) && _MSC_VER >= 1920 && defined(PLATFORM_X86) && !defined(__clang__)))
 #include <inttypes.h>
@@ -212,13 +210,13 @@ static uint32_t do_crc32_incremental_clmul(const void* data, size_t length, uint
 	return crc_fold((const unsigned char*)data, (long)length, init);
 }
 
-void crc_clmul256_set_funcs() {
+void RapidYenc::crc_clmul256_set_funcs() {
 	crc_clmul_set_funcs(); // set multiply/shift function
 	_do_crc32_incremental = &do_crc32_incremental_clmul;
 	_crc32_isa = ISA_LEVEL_VPCLMUL;
 }
 #else
-void crc_clmul256_set_funcs() {
+void RapidYenc::crc_clmul256_set_funcs() {
 	crc_clmul_set_funcs();
 }
 #endif