npm - yencode - Versions diffs - 1.0.8 → 1.1.2 - Mend

yencode 1.0.8 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49) hide show

package/README.md +339 -231
package/binding.gyp +292 -39
package/crcutil-1.0/code/multiword_64_64_gcc_amd64_asm.cc +7 -7
package/crcutil-1.0/code/multiword_64_64_gcc_i386_mmx.cc +14 -14
package/crcutil-1.0/code/multiword_64_64_intrinsic_i386_mmx.cc +1 -1
package/crcutil-1.0/code/uint128_sse2.h +2 -0
package/index.js +329 -22
package/package.json +2 -2
package/src/common.h +299 -0
package/src/crc.cc +95 -0
package/src/crc.h +23 -0
package/src/crc_arm.cc +175 -0
package/src/crc_common.h +4 -0
package/{crc_folding.c → src/crc_folding.cc} +175 -185
package/src/decoder.cc +61 -0
package/src/decoder.h +53 -0
package/src/decoder_avx.cc +18 -0
package/src/decoder_avx2.cc +18 -0
package/src/decoder_avx2_base.h +615 -0
package/src/decoder_common.h +512 -0
package/src/decoder_neon.cc +474 -0
package/src/decoder_neon64.cc +451 -0
package/src/decoder_sse2.cc +16 -0
package/src/decoder_sse_base.h +711 -0
package/src/decoder_ssse3.cc +18 -0
package/src/encoder.cc +170 -0
package/src/encoder.h +21 -0
package/src/encoder_avx.cc +16 -0
package/src/encoder_avx2.cc +16 -0
package/src/encoder_avx_base.h +564 -0
package/src/encoder_common.h +109 -0
package/src/encoder_neon.cc +547 -0
package/src/encoder_sse2.cc +13 -0
package/src/encoder_sse_base.h +724 -0
package/src/encoder_ssse3.cc +18 -0
package/src/hedley.h +1899 -0
package/src/platform.cc +147 -0
package/src/yencode.cc +449 -0
package/test/_maxsize.js +9 -0
package/test/_speedbase.js +147 -0
package/test/speedcrc.js +20 -0
package/test/speeddec.js +92 -0
package/test/speedenc.js +44 -0
package/{testcrc.js → test/testcrc.js} +53 -39
package/test/testdec.js +183 -0
package/test/testenc.js +163 -0
package/test/testpostdec.js +126 -0
package/test.js +0 -91
package/yencode.cc +0 -1622

package/src/common.h ADDED Viewed

@@ -0,0 +1,299 @@
+#ifndef __YENC_COMMON
+#define __YENC_COMMON
+#include "hedley.h"
+#if defined(__x86_64__) || \
+    defined(__amd64__ ) || \
+    defined(__LP64    ) || \
+    defined(_M_X64    ) || \
+    defined(_M_AMD64  ) || \
+    (defined(_WIN64) && !defined(_M_ARM64))
+	#define PLATFORM_AMD64 1
+#endif
+#if defined(PLATFORM_AMD64) || \
+    defined(__i386__  ) || \
+    defined(__i486__  ) || \
+    defined(__i586__  ) || \
+    defined(__i686__  ) || \
+    defined(_M_I86    ) || \
+    defined(_M_IX86   ) || \
+    (defined(_WIN32) && !defined(_M_ARM) && !defined(_M_ARM64))
+	#define PLATFORM_X86 1
+#endif
+#if defined(__aarch64__) || \
+    defined(__armv7__  ) || \
+    defined(__arm__    ) || \
+    defined(_M_ARM64   ) || \
+    defined(_M_ARM     ) || \
+    defined(__ARM_ARCH_6__ ) || \
+    defined(__ARM_ARCH_7__ ) || \
+    defined(__ARM_ARCH_7A__) || \
+    defined(__ARM_ARCH_8A__) || \
+    (defined(__ARM_ARCH    ) && __ARM_ARCH >= 6)
+	#define PLATFORM_ARM 1
+#endif
+#if defined(_MSC_VER) || defined(__MINGW32__) || defined(__MINGW64__)
+	#include <stdlib.h> // MSVC ARM64 seems to need this
+	#define ALIGN_ALLOC(buf, len, align) *(void**)&(buf) = _aligned_malloc((len), align)
+	#define ALIGN_FREE _aligned_free
+#elif defined(__cplusplus) && __cplusplus >= 201100 && !(defined(_MSC_VER) && (defined(__clang__) || defined(_M_ARM64) || defined(_M_ARM))) && !defined(__APPLE__)
+	// C++11 method
+	// len needs to be a multiple of alignment, although it sometimes works if it isn't...
+	#include <cstdlib>
+	#define ALIGN_ALLOC(buf, len, align) *(void**)&(buf) = aligned_alloc(align, ((len) + (align)-1) & ~((align)-1))
+	#define ALIGN_FREE free
+#else
+	#include <stdlib.h>
+	#define ALIGN_ALLOC(buf, len, align) if(posix_memalign((void**)&(buf), align, (len))) (buf) = NULL
+	#define ALIGN_FREE free
+#endif
+// MSVC compatibility
+#if ((defined(_M_IX86_FP) && _M_IX86_FP == 2) || defined(_M_X64)) && !defined(__clang__)
+	#define __SSE2__ 1
+	#define __SSSE3__ 1
+	#define __SSE4_1__ 1
+	#if defined(_MSC_VER) && _MSC_VER >= 1600
+		#define __POPCNT__ 1
+		#define __LZCNT__ 1
+	#endif
+	#if !defined(__AVX__) && (_MSC_VER >= 1700 && defined(__SSE2__))
+		#define __AVX__ 1
+	#endif
+	#if !defined(__AVX2__) && (_MSC_VER >= 1800 && defined(__SSE2__))
+		#define __AVX2__ 1
+		#define __BMI2__ 1
+	#endif
+	/* AVX512 requires VS 15.3 */
+	#if !defined(__AVX512F__) && (_MSC_VER >= 1911 && defined(__AVX__))
+		#define __AVX512BW__ 1
+		#define __AVX512F__ 1
+	#endif
+	/* AVX512VL not available until VS 15.5 */
+	#if defined(__AVX512F__) && _MSC_VER >= 1912
+		#define __AVX512VL__ 1
+	#endif
+	#if defined(__AVX512F__) && _MSC_VER >= 1920
+		#define __AVX512VBMI__ 1
+		#define __AVX512VBMI2__ 1
+	#endif
+#endif
+#if defined(_M_ARM64)
+	#define __aarch64__ 1
+	#define __ARM_NEON 1
+#endif
+#if defined(_M_ARM)
+	#define __ARM_NEON 1
+#endif
+#ifdef _MSC_VER
+# ifndef __BYTE_ORDER__
+#  define __BYTE_ORDER__ 1234
+# endif
+# ifndef __ORDER_BIG_ENDIAN__
+#  define __ORDER_BIG_ENDIAN__ 4321
+# endif
+# include <intrin.h>
+#endif
+// combine two 8-bit ints into a 16-bit one
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define UINT16_PACK(a, b) (((a) << 8) | (b))
+#define UINT32_PACK(a, b, c, d) (((a) << 24) | ((b) << 16) | ((c) << 8) | (d))
+#define UINT32_16_PACK(a, b) (((a) << 16) | (b))
+#else
+#define UINT16_PACK(a, b) ((a) | ((b) << 8))
+#define UINT32_PACK(a, b, c, d) ((a) | ((b) << 8) | ((c) << 16) | ((d) << 24))
+#define UINT32_16_PACK(a, b) ((a) | ((b) << 16))
+#endif
+#ifdef __SSE2__
+#include <emmintrin.h>
+#define XMM_SIZE 16 /*== (signed int)sizeof(__m128i)*/
+#ifdef __SSSE3__
+#include <tmmintrin.h>
+#endif
+#ifdef __POPCNT__
+#include <nmmintrin.h>
+// POPCNT can never return a negative result, but GCC doesn't seem to realise this, so typecast it to hint it better
+#define popcnt32 (unsigned int)_mm_popcnt_u32
+#endif
+#if defined(__AVX2__) || defined(__AVX512F__)
+#include <immintrin.h>
+#endif
+#if defined(__tune_core2__) || defined(__tune_atom__)
+/* on older Intel CPUs, plus first gen Atom, it is faster to store XMM registers in half */
+# define STOREU_XMM(dest, xmm) \
+  _mm_storel_epi64((__m128i*)(dest), xmm); \
+  _mm_storeh_pi(((__m64*)(dest) +1), _mm_castsi128_ps(xmm))
+#else
+# define STOREU_XMM(dest, xmm) \
+  _mm_storeu_si128((__m128i*)(dest), xmm)
+#endif
+#endif
+#ifdef __ARM_NEON
+# include <arm_neon.h>
+// ARM provides no standard way to inline define a vector :(
+// Builds a 64-bit NEON vector whose lanes 0..7 hold a..h, in that order.
+static HEDLEY_ALWAYS_INLINE uint8x8_t vmake_u8(
+	uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h
+) {
+# ifdef _MSC_VER
+	// no vector initialiser syntax on this compiler: spill the lanes to memory and load them back
+	uint8_t lanes[] = {a,b,c,d,e,f,g,h};
+	return vld1_u8(lanes);
+# else
+	uint8x8_t lanes = {a,b,c,d,e,f,g,h};
+	return lanes;
+# endif
+}
+// Builds a 128-bit NEON vector whose lanes 0..15 hold a..p, in that order.
+static HEDLEY_ALWAYS_INLINE uint8x16_t vmakeq_u8(
+	uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h,
+	uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p
+) {
+# ifdef _MSC_VER
+	// no vector initialiser syntax on this compiler: spill the lanes to memory and load them back
+	uint8_t lanes[] = {a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p};
+	return vld1q_u8(lanes);
+# else
+	uint8x16_t lanes = {a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p};
+	return lanes;
+# endif
+}
+// Signed counterpart of vmakeq_u8: builds a 128-bit NEON vector whose lanes 0..15 hold a..p.
+static HEDLEY_ALWAYS_INLINE int8x16_t vmakeq_s8(
+	int8_t a, int8_t b, int8_t c, int8_t d, int8_t e, int8_t f, int8_t g, int8_t h,
+	int8_t i, int8_t j, int8_t k, int8_t l, int8_t m, int8_t n, int8_t o, int8_t p
+) {
+# ifdef _MSC_VER
+	// no vector initialiser syntax on this compiler: spill the lanes to memory and load them back
+	int8_t lanes[] = {a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p};
+	return vld1q_s8(lanes);
+# else
+	int8x16_t lanes = {a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p};
+	return lanes;
+# endif
+}
+# ifdef _MSC_VER
+#  define _CREATE_TUPLE(type, ...) type{{ __VA_ARGS__ }}
+# else
+#  define _CREATE_TUPLE(type, ...) (type){{ __VA_ARGS__ }}
+# endif
+// Packs two 128-bit vectors into the two-element NEON tuple type (a -> val[0], b -> val[1]).
+static HEDLEY_ALWAYS_INLINE uint8x16x2_t vcreate2_u8(uint8x16_t a, uint8x16_t b) {
+	uint8x16x2_t tuple;
+	tuple.val[0] = a;
+	tuple.val[1] = b;
+	return tuple;
+}
+// Packs two signed 128-bit vectors into the two-element NEON tuple type (a -> val[0], b -> val[1]).
+static HEDLEY_ALWAYS_INLINE int8x16x2_t vcreate2_s8(int8x16_t a, int8x16_t b) {
+	int8x16x2_t tuple;
+	tuple.val[0] = a;
+	tuple.val[1] = b;
+	return tuple;
+}
+// Packs three 128-bit vectors into the three-element NEON tuple type (a, b, c -> val[0..2]).
+static HEDLEY_ALWAYS_INLINE uint8x16x3_t vcreate3_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
+	uint8x16x3_t tuple;
+	tuple.val[0] = a;
+	tuple.val[1] = b;
+	tuple.val[2] = c;
+	return tuple;
+}
+// Packs four 128-bit vectors into the four-element NEON tuple type (a, b, c, d -> val[0..3]).
+static HEDLEY_ALWAYS_INLINE uint8x16x4_t vcreate4_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c, uint8x16_t d) {
+	uint8x16x4_t tuple;
+	tuple.val[0] = a;
+	tuple.val[1] = b;
+	tuple.val[2] = c;
+	tuple.val[3] = d;
+	return tuple;
+}
+# undef _CREATE_TUPLE
+#endif
+#ifdef PLATFORM_ARM
+bool cpu_supports_neon();
+#endif
+#ifdef _MSC_VER
+#define ALIGN_TO(a, v) __declspec(align(a)) v
+#else
+#define ALIGN_TO(a, v) v __attribute__((aligned(a)))
+#endif
+#ifdef PLATFORM_X86
+enum YEncDecIsaLevel { // x86 instruction-set levels; bits 0x1/0x2 are the ISA_FEATURE_* flags that ISA_NATIVE ORs into a level
+	ISA_FEATURE_POPCNT = 0x1, // flag bit: POPCNT
+	ISA_FEATURE_LZCNT = 0x2, // flag bit: LZCNT
+	ISA_LEVEL_SSE2 = 0x100,
+	ISA_LEVEL_SSSE3 = 0x200,
+	ISA_LEVEL_SSE41 = 0x300,
+	ISA_LEVEL_SSE4_POPCNT = 0x301, // numerically ISA_LEVEL_SSE41 | ISA_FEATURE_POPCNT
+	ISA_LEVEL_AVX = 0x381, // same as above, just used as a differentiator for `cpu_supports_isa`
+	ISA_LEVEL_AVX2 = 0x383, // also includes BMI1/2 and LZCNT
+	ISA_LEVEL_AVX3 = 0x403, // SKX variant; AVX512VL + AVX512BW
+	ISA_LEVEL_VBMI2 = 0x503 // ICL
+};
+#ifdef _MSC_VER
+// native tuning not supported in MSVC
+# define ISA_NATIVE ISA_LEVEL_SSE2
+#else
+# if defined(__AVX512VBMI2__)
+#  define _ISA_NATIVE ISA_LEVEL_VBMI2
+# elif defined(__AVX512BW__)
+#  define _ISA_NATIVE ISA_LEVEL_AVX3
+# elif defined(__AVX2__)
+#  define _ISA_NATIVE ISA_LEVEL_AVX2
+# elif defined(__SSE4_1__)
+#  define _ISA_NATIVE ISA_LEVEL_SSE41
+# elif defined(__SSSE3__)
+#  define _ISA_NATIVE ISA_LEVEL_SSSE3
+# else
+#  define _ISA_NATIVE ISA_LEVEL_SSE2
+# endif
+# if defined(__POPCNT__)
+#  if defined(__LZCNT__)
+#   define ISA_NATIVE (enum YEncDecIsaLevel)(_ISA_NATIVE | ISA_FEATURE_POPCNT | ISA_FEATURE_LZCNT)
+#  else
+#   define ISA_NATIVE (enum YEncDecIsaLevel)(_ISA_NATIVE | ISA_FEATURE_POPCNT)
+#  endif
+# else
+#  define ISA_NATIVE _ISA_NATIVE
+# endif
+#endif
+#ifdef _MSC_VER
+# define _cpuid1(ar) __cpuid(ar, 1)
+#else
+# include <cpuid.h>
+# define _cpuid1(ar) __cpuid(1, ar[0], ar[1], ar[2], ar[3])
+#endif
+int cpu_supports_isa();
+#endif // PLATFORM_X86
+#include <string.h>
+#if !defined(_MSC_VER) || defined(_STDINT) || _MSC_VER >= 1900
+# include <stdint.h>
+# include <stddef.h>
+#else
+/* Workaround for older MSVC not supporting stdint.h - just pull it from V8 */
+# include <v8.h>
+#endif
+// GCC 8/9/10(dev) fails to optimize cases where KNOT should be used, so use intrinsic explicitly; Clang 6+ has no issue, but Clang 6/7 doesn't have the intrinsic; MSVC 2019 also fails and lacks the intrinsic
+#if defined(__GNUC__) && __GNUC__ >= 7
+# define KNOT16 _knot_mask16
+# define KNOT32 _knot_mask32
+#else
+# define KNOT16(x) ((__mmask16)~(x))
+# define KNOT32(x) ((__mmask32)~(x))
+#endif
+// weird thing with Apple's Clang; doesn't seem to always occur, so assume that Clang >= 9 is fine: https://github.com/animetosho/node-yencode/issues/8#issuecomment-583385864
+// seems that Clang < 3.6 also uses the old name
+#if defined(__clang__) && ((defined(__APPLE__) && __clang_major__ < 9) || __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ < 6))
+# define _lzcnt_u32 __lzcnt32
+#endif
+#ifdef __GNUC__
+# if __GNUC__ >= 9
+#  define LIKELIHOOD(p, c) (HEDLEY_PREDICT(!!(c), 1, p))
+# else
+#  define LIKELIHOOD(p, c) (p>0.3 && p<0.7 ? HEDLEY_UNPREDICTABLE(!!(c)) : __builtin_expect(!!(c), (p >= 0.5)))
+# endif
+#else
+# define LIKELIHOOD(p, c) (c)
+#endif
+#endif /* __YENC_COMMON */

package/src/crc.cc ADDED Viewed

@@ -0,0 +1,95 @@
+#include "crc_common.h"
+#include "interface.h"
+crcutil_interface::CRC* crc = NULL;
+// Fallback CRC32 routine: hands `length` bytes at `data` to the shared crcutil instance,
+// starting from `init`, and returns the resulting value narrowed to 32 bits.
+static uint32_t do_crc32_incremental_generic(const void* data, size_t length, uint32_t init) {
+	crcutil_interface::UINT64 running = static_cast<crcutil_interface::UINT64>(init);
+	crc->Compute(data, length, &running);
+	return static_cast<uint32_t>(running);
+}
+crc_func _do_crc32_incremental = &do_crc32_incremental_generic;
+// Delegates to crcutil's Concatenate(crc2, 0, len2, &crc1) and returns the updated crc1 narrowed to 32 bits.
+// NOTE(review): presumably this yields the CRC of two joined inputs, with len2 the byte length of the second - confirm against crcutil.
+uint32_t do_crc32_combine(uint32_t crc1, uint32_t crc2, size_t len2) {
+	crcutil_interface::UINT64 merged = crc1;
+	crcutil_interface::UINT64 second = crc2;
+	crc->Concatenate(second, 0, len2, &merged);
+	return static_cast<uint32_t>(merged);
+}
+// Delegates to crcutil's CrcOfZeroes(len, &crc), seeded with crc1, and returns the updated value narrowed to 32 bits.
+uint32_t do_crc32_zeros(uint32_t crc1, size_t len) {
+	crcutil_interface::UINT64 running = crc1;
+	crc->CrcOfZeroes(len, &running);
+	return static_cast<uint32_t>(running);
+}
+void crc_clmul_set_funcs(crc_func*);
+void crc_arm_set_funcs(crc_func*);
+#if defined(PLATFORM_ARM) && defined(_WIN32)
+# define WIN32_LEAN_AND_MEAN
+# include <Windows.h>
+#endif
+#ifdef PLATFORM_ARM
+# ifdef __ANDROID__
+#  include <cpu-features.h>
+# elif defined(__linux__) || (defined(__FreeBSD__) && __FreeBSD__ >= 12)
+#  include <sys/auxv.h>
+#  include <asm/hwcap.h>
+# elif (defined(__FreeBSD__) && __FreeBSD__ < 12)
+#  include <sys/sysctl.h>
+#  include <asm/hwcap.h>
+# elif defined(__APPLE__)
+#  include <sys/types.h>
+#  include <sys/sysctl.h>
+# endif
+# ifdef __FreeBSD__
+// FreeBSD shim for getauxval(), implemented with elf_aux_info().
+// Returns the auxiliary-vector entry `cap`, or 0 when the entry cannot be read.
+static unsigned long getauxval(unsigned long cap) {
+	// start from 0 so a failed lookup reports "no capability bits" instead of an indeterminate value
+	unsigned long ret = 0;
+	// elf_aux_info() returns 0 on success and an error code otherwise
+	if(elf_aux_info(cap, &ret, sizeof(ret)) != 0)
+		ret = 0;
+	return ret;
+}
+# endif
+#endif
+void crc_init() { // creates the crcutil instance used by the generic routines, then hands _do_crc32_incremental to the x86 CLMUL / ARM CRC32 backends when the CPU reports support
+	crc = crcutil_interface::CRC::Create(
+		0xEDB88320, 0, 32, true, 0, 0, 0, 0, NULL);
+	// instance never deleted... oh well...
+#ifdef PLATFORM_X86
+	int flags[4]; // filled by _cpuid1; only flags[2] is inspected below
+	_cpuid1(flags);
+	if((flags[2] & 0x80202) == 0x80202) // SSE4.1 + SSSE3 + CLMUL
+		crc_clmul_set_funcs(&_do_crc32_incremental);
+#endif
+#ifdef PLATFORM_ARM
+# ifdef __APPLE__
+	int supported = 0; // result of the hw.optional.armv8_crc32 sysctl query
+	size_t len = sizeof(supported);
+	if(sysctlbyname("hw.optional.armv8_crc32", &supported, &len, NULL, 0))
+		supported = 0; // query failed: treat as unsupported
+# endif
+	if( // CRC32 instruction check; the first detection method available at compile time supplies the condition
+# if defined(AT_HWCAP2) && defined(HWCAP2_CRC32)
+		getauxval(AT_HWCAP2) & HWCAP2_CRC32
+# elif defined(AT_HWCAP) && defined(HWCAP_CRC32)
+		getauxval(AT_HWCAP) & HWCAP_CRC32
+# elif defined(ANDROID_CPU_FAMILY_ARM) && defined(__aarch64__)
+		android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_CRC32
+# elif defined(ANDROID_CPU_FAMILY_ARM) /* aarch32 */
+		android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_CRC32
+# elif defined(_WIN32)
+		IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE)
+# elif defined(__APPLE__)
+		supported
+# elif defined(__ARM_FEATURE_CRC32)
+		true /* assume available if compiled as such */
+# else
+		false
+# endif
+	) {
+		crc_arm_set_funcs(&_do_crc32_incremental);
+	}
+#endif
+}

package/src/crc.h ADDED Viewed

@@ -0,0 +1,23 @@
+#ifndef __YENC_CRC_H
+#define __YENC_CRC_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+typedef uint32_t (*crc_func)(const void*, size_t, uint32_t);
+extern crc_func _do_crc32_incremental;
+#define do_crc32 (*_do_crc32_incremental)
+uint32_t do_crc32_combine(uint32_t crc1, const uint32_t crc2, size_t len2);
+uint32_t do_crc32_zeros(uint32_t crc1, size_t len);
+void crc_init();
+#ifdef __cplusplus
+}
+#endif
+#endif

package/src/crc_arm.cc ADDED Viewed

@@ -0,0 +1,175 @@
+#include "crc_common.h"
+#if defined(PLATFORM_ARM) && defined(_MSC_VER) && defined(__clang__) && !defined(__ARM_FEATURE_CRC32)
+// I don't think GYP provides a nice way to detect whether MSVC or clang-cl is being used, but it doesn't use clang-cl by default, so a warning here is probably sufficient
+HEDLEY_WARNING("CRC32 acceleration is not been enabled under ARM clang-cl by default; add `-march=armv8-a+crc` to additional compiler arguments to enable");
+#endif
+#if defined(__ARM_FEATURE_CRC32) || (defined(_M_ARM64) && !defined(__clang__)) // MSVC doesn't support CRC for ARM32
+/* ARMv8 accelerated CRC */
+#if defined(_MSC_VER) && !defined(__clang__)
+#include <intrin.h>
+#else
+#include <arm_acle.h>
+#endif
+#ifdef __aarch64__
+# define WORD_T uint64_t
+# define WORDSIZE_LOG 3  // sizeof(WORD_T) == 1<<WORDSIZE_LOG
+# define CRC_WORD __crc32d
+#else
+# define WORD_T uint32_t
+# define WORDSIZE_LOG 2  // sizeof(WORD_T) == 1<<WORDSIZE_LOG
+# define CRC_WORD __crc32w
+#endif
+// exploit CPU pipelining during CRC computation; unfortunately I haven't been able to measure any benefit
+// - Neoverse N1: no noticeable difference
+// - Cortex A53: actually runs a bit slower
+//#define ENABLE_PIPELINE_OPT 1
+#ifdef ENABLE_PIPELINE_OPT
+// workaround MSVC complaining "unary minus operator applied to unsigned type, result still unsigned"
+#define NEGATE(n) (uint32_t)(-((int32_t)(n)))
+// Walks the 32 bits of b from most to least significant; whenever the bit is set, the current
+// value of a is XORed into the result. Between bits, a is shifted right by one and 0xEDB88320
+// is XORed in if a bit was shifted out. Branch-free: NEGATE() turns a 0/1 bit into a 0/all-ones mask.
+static HEDLEY_ALWAYS_INLINE uint32_t crc_multiply(uint32_t a, uint32_t b) {
+	// contribution of the top bit of b
+	uint32_t product = NEGATE(b>>31) & a;
+	// remaining 31 bits: advance a and b first, then accumulate
+	for(int step=1; step<32; step++) {
+		a = (a >> 1) ^ (NEGATE(a&1) & 0xEDB88320);
+		b <<= 1;
+		product ^= NEGATE(b>>31) & a;
+	}
+	return product;
+}
+static const uint32_t crc_power[] = { // pre-computed 2^n, with first 3 entries removed (saves a shift)
+	0x00800000, 0x00008000, 0xedb88320, 0xb1e6b092, 0xa06a2517, 0xed627dae, 0x88d14467, 0xd7bbfe6a,
+	0xec447f11, 0x8e7ea170, 0x6427800e, 0x4d47bae0, 0x09fe548f, 0x83852d0f, 0x30362f1a, 0x7b5a9cc3,
+	0x31fec169, 0x9fec022a, 0x6c8dedc4, 0x15d6874d, 0x5fde7a4e, 0xbad90e37, 0x2e4e5eef, 0x4eaba214,
+	0xa8a472c0, 0x429a969e, 0x148d302a, 0xc40ba6d0, 0xc4e22c3c, 0x40000000, 0x20000000, 0x08000000
+};
+/* above table can be computed with
+	int main(void) {
+		uint32_t k = 0x80000000 >> 1;
+		for (size_t i = 0; i < 32+3; ++i) {
+			if(i>2) printf("0x%08x, ", k);
+			k = crc_multiply(k, k);
+		}
+		return 0;
+	}
+*/
+#endif
+// inspired/stolen off https://github.com/jocover/crc32_armv8/blob/master/crc32_armv8.c
+static uint32_t arm_crc_calc(uint32_t crc, const unsigned char *src, long len) { // folds len bytes at src into crc with the __crc32b/h/w/d intrinsics and returns the new value; no bit inversion happens here
+	// initial alignment: for longer inputs, consume 1/2/4 leading bytes as needed so that src becomes WORD_T-aligned
+	if (len >= 16) { // 16 is an arbitrary number; it just needs to be >=8
+		if ((uintptr_t)src & sizeof(uint8_t)) { // odd address: take one byte
+			crc = __crc32b(crc, *src);
+			src++;
+			len--;
+		}
+		if ((uintptr_t)src & sizeof(uint16_t)) { // address bit 1 set: take a halfword
+			crc = __crc32h(crc, *((uint16_t *)src));
+			src += sizeof(uint16_t);
+			len -= sizeof(uint16_t);
+		}
+#ifdef __aarch64__
+		if ((uintptr_t)src & sizeof(uint32_t)) { // address bit 2 set: take a 32-bit word
+			crc = __crc32w(crc, *((uint32_t *)src));
+			src += sizeof(uint32_t);
+			len -= sizeof(uint32_t);
+		}
+#endif
+	}
+	const WORD_T* srcW = (const WORD_T*)src; // word-sized view of the remaining input
+#ifdef ENABLE_PIPELINE_OPT
+	// uses ideas from https://github.com/komrad36/crc#option-13-golden
+	// (this is a slightly less efficient, but much simpler implementation of the idea)
+	const unsigned SPLIT_WORDS_LOG = 10;  // make sure it's at least 2
+	const unsigned SPLIT_WORDS = 1<<SPLIT_WORDS_LOG;
+	while(len >= (long)(sizeof(WORD_T)*SPLIT_WORDS*2)) {
+		// compute 2x CRCs concurrently to leverage pipelining
+		uint32_t crc2 = 0;
+		for(unsigned i=0; i<SPLIT_WORDS; i+=4) {
+			crc = CRC_WORD(crc, *srcW);
+			crc2 = CRC_WORD(crc2, *(srcW + SPLIT_WORDS));
+			srcW++;
+			crc = CRC_WORD(crc, *srcW);
+			crc2 = CRC_WORD(crc2, *(srcW + SPLIT_WORDS));
+			srcW++;
+			crc = CRC_WORD(crc, *srcW);
+			crc2 = CRC_WORD(crc2, *(srcW + SPLIT_WORDS));
+			srcW++;
+			crc = CRC_WORD(crc, *srcW);
+			crc2 = CRC_WORD(crc2, *(srcW + SPLIT_WORDS));
+			srcW++;
+		}
+		// merge the CRCs
+		// since we're multiplying by a fixed number, it could be sped up with some lookup tables
+		crc = crc_multiply(crc, crc_power[SPLIT_WORDS_LOG + WORDSIZE_LOG]) ^ crc2;
+		srcW += SPLIT_WORDS; // skip the second half, which crc2 already covered
+		len -= sizeof(WORD_T)*SPLIT_WORDS*2;
+	}
+#endif
+	while ((len -= sizeof(WORD_T)*8) >= 0) { // main loop: 8 words per iteration; exits with len negative
+		crc = CRC_WORD(crc, *(srcW++));
+		crc = CRC_WORD(crc, *(srcW++));
+		crc = CRC_WORD(crc, *(srcW++));
+		crc = CRC_WORD(crc, *(srcW++));
+		crc = CRC_WORD(crc, *(srcW++));
+		crc = CRC_WORD(crc, *(srcW++));
+		crc = CRC_WORD(crc, *(srcW++));
+		crc = CRC_WORD(crc, *(srcW++));
+	}
+	if (len & sizeof(WORD_T)*4) { // tail: the bits of len below sizeof(WORD_T)*8 were not changed by the subtractions above, so each one selects a fixed-size step
+		crc = CRC_WORD(crc, *(srcW++));
+		crc = CRC_WORD(crc, *(srcW++));
+		crc = CRC_WORD(crc, *(srcW++));
+		crc = CRC_WORD(crc, *(srcW++));
+	}
+	if (len & sizeof(WORD_T)*2) {
+		crc = CRC_WORD(crc, *(srcW++));
+		crc = CRC_WORD(crc, *(srcW++));
+	}
+	if (len & sizeof(WORD_T)) {
+		crc = CRC_WORD(crc, *(srcW++));
+	}
+	src = (const unsigned char*)srcW; // back to byte granularity for the sub-word remainder
+#ifdef __aarch64__
+	if (len & sizeof(uint32_t)) {
+		crc = __crc32w(crc, *((uint32_t *)src));
+		src += sizeof(uint32_t);
+	}
+#endif
+	if (len & sizeof(uint16_t)) {
+		crc = __crc32h(crc, *((uint16_t *)src));
+		src += sizeof(uint16_t);
+	}
+	if (len & sizeof(uint8_t))
+		crc = __crc32b(crc, *src);
+	return crc;
+}
+// ARMv8-accelerated CRC32 entry point: continues the checksum `init` over `length` bytes at `data`.
+// The value is bit-inverted on the way into and out of arm_crc_calc(), which works on the raw state.
+static uint32_t do_crc32_incremental_arm(const void* data, size_t length, uint32_t init) {
+	// arm_crc_calc() takes a signed long length. Where size_t is wider than long (e.g. 64-bit
+	// Windows, where long is 32 bits) a direct cast would truncate or turn negative for large
+	// buffers, so feed the input in pieces that always fit in a long.
+	const size_t MAX_CHUNK = (size_t)1 << 30;
+	const unsigned char* src = (const unsigned char*)data;
+	uint32_t crc = ~init;
+	while(length > MAX_CHUNK) {
+		crc = arm_crc_calc(crc, src, (long)MAX_CHUNK);
+		src += MAX_CHUNK;
+		length -= MAX_CHUNK;
+	}
+	return ~arm_crc_calc(crc, src, (long)length);
+}
+// Stores the ARMv8 CRC32 routine into the dispatch pointer supplied by the caller.
+void crc_arm_set_funcs(crc_func* _do_crc32_incremental) {
+	crc_func accelerated = &do_crc32_incremental_arm;
+	*_do_crc32_incremental = accelerated;
+}
+#else
+// Variant compiled when the ARMv8 CRC32 code above is excluded: the dispatch pointer is left untouched.
+void crc_arm_set_funcs(crc_func* _do_crc32_incremental) {
+	static_cast<void>(_do_crc32_incremental); // parameter intentionally unused
+}
+#endif

package/src/crc_common.h ADDED Viewed

@@ -0,0 +1,4 @@
+#include "common.h"
+#include <stddef.h> // for size_t
+#include "crc.h"