yencode 1.0.8 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. package/README.md +339 -231
  2. package/binding.gyp +292 -39
  3. package/crcutil-1.0/code/multiword_64_64_gcc_amd64_asm.cc +7 -7
  4. package/crcutil-1.0/code/multiword_64_64_gcc_i386_mmx.cc +14 -14
  5. package/crcutil-1.0/code/multiword_64_64_intrinsic_i386_mmx.cc +1 -1
  6. package/crcutil-1.0/code/uint128_sse2.h +2 -0
  7. package/index.js +329 -22
  8. package/package.json +2 -2
  9. package/src/common.h +299 -0
  10. package/src/crc.cc +95 -0
  11. package/src/crc.h +23 -0
  12. package/src/crc_arm.cc +175 -0
  13. package/src/crc_common.h +4 -0
  14. package/{crc_folding.c → src/crc_folding.cc} +175 -185
  15. package/src/decoder.cc +61 -0
  16. package/src/decoder.h +53 -0
  17. package/src/decoder_avx.cc +18 -0
  18. package/src/decoder_avx2.cc +18 -0
  19. package/src/decoder_avx2_base.h +615 -0
  20. package/src/decoder_common.h +512 -0
  21. package/src/decoder_neon.cc +474 -0
  22. package/src/decoder_neon64.cc +451 -0
  23. package/src/decoder_sse2.cc +16 -0
  24. package/src/decoder_sse_base.h +711 -0
  25. package/src/decoder_ssse3.cc +18 -0
  26. package/src/encoder.cc +170 -0
  27. package/src/encoder.h +21 -0
  28. package/src/encoder_avx.cc +16 -0
  29. package/src/encoder_avx2.cc +16 -0
  30. package/src/encoder_avx_base.h +564 -0
  31. package/src/encoder_common.h +109 -0
  32. package/src/encoder_neon.cc +547 -0
  33. package/src/encoder_sse2.cc +13 -0
  34. package/src/encoder_sse_base.h +724 -0
  35. package/src/encoder_ssse3.cc +18 -0
  36. package/src/hedley.h +1899 -0
  37. package/src/platform.cc +147 -0
  38. package/src/yencode.cc +449 -0
  39. package/test/_maxsize.js +9 -0
  40. package/test/_speedbase.js +147 -0
  41. package/test/speedcrc.js +20 -0
  42. package/test/speeddec.js +92 -0
  43. package/test/speedenc.js +44 -0
  44. package/{testcrc.js → test/testcrc.js} +53 -39
  45. package/test/testdec.js +183 -0
  46. package/test/testenc.js +163 -0
  47. package/test/testpostdec.js +126 -0
  48. package/test.js +0 -91
  49. package/yencode.cc +0 -1622
package/src/common.h ADDED
@@ -0,0 +1,299 @@
1
+ #ifndef __YENC_COMMON
2
+ #define __YENC_COMMON
3
+
4
+ #include "hedley.h"
5
+
6
+ #if defined(__x86_64__) || \
7
+ defined(__amd64__ ) || \
8
+ defined(__LP64 ) || \
9
+ defined(_M_X64 ) || \
10
+ defined(_M_AMD64 ) || \
11
+ (defined(_WIN64) && !defined(_M_ARM64))
12
+ #define PLATFORM_AMD64 1
13
+ #endif
14
+ #if defined(PLATFORM_AMD64) || \
15
+ defined(__i386__ ) || \
16
+ defined(__i486__ ) || \
17
+ defined(__i586__ ) || \
18
+ defined(__i686__ ) || \
19
+ defined(_M_I86 ) || \
20
+ defined(_M_IX86 ) || \
21
+ (defined(_WIN32) && !defined(_M_ARM) && !defined(_M_ARM64))
22
+ #define PLATFORM_X86 1
23
+ #endif
24
+ #if defined(__aarch64__) || \
25
+ defined(__armv7__ ) || \
26
+ defined(__arm__ ) || \
27
+ defined(_M_ARM64 ) || \
28
+ defined(_M_ARM ) || \
29
+ defined(__ARM_ARCH_6__ ) || \
30
+ defined(__ARM_ARCH_7__ ) || \
31
+ defined(__ARM_ARCH_7A__) || \
32
+ defined(__ARM_ARCH_8A__) || \
33
+ (defined(__ARM_ARCH ) && __ARM_ARCH >= 6)
34
+ #define PLATFORM_ARM 1
35
+ #endif
36
+
37
+
38
+ #if defined(_MSC_VER) || defined(__MINGW32__) || defined(__MINGW64__)
39
+ #include <stdlib.h> // MSVC ARM64 seems to need this
40
+ #define ALIGN_ALLOC(buf, len, align) *(void**)&(buf) = _aligned_malloc((len), align)
41
+ #define ALIGN_FREE _aligned_free
42
+ #elif defined(__cplusplus) && __cplusplus >= 201100 && !(defined(_MSC_VER) && (defined(__clang__) || defined(_M_ARM64) || defined(_M_ARM))) && !defined(__APPLE__)
43
+ // C++11 method
44
+ // len needs to be a multiple of alignment, although it sometimes works if it isn't...
45
+ #include <cstdlib>
46
+ #define ALIGN_ALLOC(buf, len, align) *(void**)&(buf) = aligned_alloc(align, ((len) + (align)-1) & ~((align)-1))
47
+ #define ALIGN_FREE free
48
+ #else
49
+ #include <stdlib.h>
50
+ #define ALIGN_ALLOC(buf, len, align) if(posix_memalign((void**)&(buf), align, (len))) (buf) = NULL
51
+ #define ALIGN_FREE free
52
+ #endif
53
+
54
+
55
+ // MSVC compatibility
56
+ #if ((defined(_M_IX86_FP) && _M_IX86_FP == 2) || defined(_M_X64)) && !defined(__clang__)
57
+ #define __SSE2__ 1
58
+ #define __SSSE3__ 1
59
+ #define __SSE4_1__ 1
60
+ #if defined(_MSC_VER) && _MSC_VER >= 1600
61
+ #define __POPCNT__ 1
62
+ #define __LZCNT__ 1
63
+ #endif
64
+ #if !defined(__AVX__) && (_MSC_VER >= 1700 && defined(__SSE2__))
65
+ #define __AVX__ 1
66
+ #endif
67
+ #if !defined(__AVX2__) && (_MSC_VER >= 1800 && defined(__SSE2__))
68
+ #define __AVX2__ 1
69
+ #define __BMI2__ 1
70
+ #endif
71
+ /* AVX512 requires VS 15.3 */
72
+ #if !defined(__AVX512F__) && (_MSC_VER >= 1911 && defined(__AVX__))
73
+ #define __AVX512BW__ 1
74
+ #define __AVX512F__ 1
75
+ #endif
76
+ /* AVX512VL not available until VS 15.5 */
77
+ #if defined(__AVX512F__) && _MSC_VER >= 1912
78
+ #define __AVX512VL__ 1
79
+ #endif
80
+ #if defined(__AVX512F__) && _MSC_VER >= 1920
81
+ #define __AVX512VBMI__ 1
82
+ #define __AVX512VBMI2__ 1
83
+ #endif
84
+ #endif
85
+ #if defined(_M_ARM64)
86
+ #define __aarch64__ 1
87
+ #define __ARM_NEON 1
88
+ #endif
89
+ #if defined(_M_ARM)
90
+ #define __ARM_NEON 1
91
+ #endif
92
+ #ifdef _MSC_VER
93
+ # ifndef __BYTE_ORDER__
94
+ # define __BYTE_ORDER__ 1234
95
+ # endif
96
+ # ifndef __ORDER_BIG_ENDIAN__
97
+ # define __ORDER_BIG_ENDIAN__ 4321
98
+ # endif
99
+ # include <intrin.h>
100
+ #endif
101
+
102
+
103
+ // combine two 8-bit ints into a 16-bit one
104
+ #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
105
+ #define UINT16_PACK(a, b) (((a) << 8) | (b))
106
+ #define UINT32_PACK(a, b, c, d) (((a) << 24) | ((b) << 16) | ((c) << 8) | (d))
107
+ #define UINT32_16_PACK(a, b) (((a) << 16) | (b))
108
+ #else
109
+ #define UINT16_PACK(a, b) ((a) | ((b) << 8))
110
+ #define UINT32_PACK(a, b, c, d) ((a) | ((b) << 8) | ((c) << 16) | ((d) << 24))
111
+ #define UINT32_16_PACK(a, b) ((a) | ((b) << 16))
112
+ #endif
113
+
114
+ #ifdef __SSE2__
115
+ #include <emmintrin.h>
116
+ #define XMM_SIZE 16 /*== (signed int)sizeof(__m128i)*/
117
+
118
+ #ifdef __SSSE3__
119
+ #include <tmmintrin.h>
120
+ #endif
121
+ #ifdef __POPCNT__
122
+ #include <nmmintrin.h>
123
+ // POPCNT can never return a negative result, but GCC doesn't seem to realise this, so typecast it to hint it better
124
+ #define popcnt32 (unsigned int)_mm_popcnt_u32
125
+ #endif
126
+
127
+ #if defined(__AVX2__) || defined(__AVX512F__)
128
+ #include <immintrin.h>
129
+ #endif
130
+
131
+
132
+ #if defined(__tune_core2__) || defined(__tune_atom__)
133
+ /* on older Intel CPUs, plus first gen Atom, it is faster to store XMM registers in half */
134
+ # define STOREU_XMM(dest, xmm) \
135
+ _mm_storel_epi64((__m128i*)(dest), xmm); \
136
+ _mm_storeh_pi(((__m64*)(dest) +1), _mm_castsi128_ps(xmm))
137
+ #else
138
+ # define STOREU_XMM(dest, xmm) \
139
+ _mm_storeu_si128((__m128i*)(dest), xmm)
140
+ #endif
141
+
142
+ #endif
143
+
144
+ #ifdef __ARM_NEON
145
+ # include <arm_neon.h>
146
+
147
+ // ARM provides no standard way to inline define a vector :(
148
+ static HEDLEY_ALWAYS_INLINE uint8x8_t vmake_u8( // builds a uint8x8_t from eight scalar bytes, kept in argument order
149
+ uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h
150
+ ) {
151
+ # if defined(_MSC_VER)
152
+ uint8_t t[] = {a,b,c,d,e,f,g,h}; // MSVC branch: stage the bytes in a local array...
153
+ return vld1_u8(t); // ...and load the vector from it
154
+ # else
155
+ return (uint8x8_t){a,b,c,d,e,f,g,h}; // other compilers: vector written as a brace-initialised literal
156
+ # endif
157
+ }
158
+ static HEDLEY_ALWAYS_INLINE uint8x16_t vmakeq_u8( // 16-byte counterpart of vmake_u8: builds a uint8x16_t from sixteen scalar bytes, kept in argument order
159
+ uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h,
160
+ uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p
161
+ ) {
162
+ # if defined(_MSC_VER)
163
+ uint8_t t[] = {a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p}; // MSVC branch: stage the bytes in a local array...
164
+ return vld1q_u8(t); // ...and load the vector from it
165
+ # else
166
+ return (uint8x16_t){a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p}; // other compilers: vector written as a brace-initialised literal
167
+ # endif
168
+ }
169
+ static HEDLEY_ALWAYS_INLINE int8x16_t vmakeq_s8( // signed variant of vmakeq_u8: builds an int8x16_t from sixteen int8_t values, kept in argument order
170
+ int8_t a, int8_t b, int8_t c, int8_t d, int8_t e, int8_t f, int8_t g, int8_t h,
171
+ int8_t i, int8_t j, int8_t k, int8_t l, int8_t m, int8_t n, int8_t o, int8_t p
172
+ ) {
173
+ # if defined(_MSC_VER)
174
+ int8_t t[] = {a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p}; // MSVC branch: stage the values in a local array...
175
+ return vld1q_s8(t); // ...and load the vector from it
176
+ # else
177
+ return (int8x16_t){a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p}; // other compilers: vector written as a brace-initialised literal
178
+ # endif
179
+ }
180
+
181
+ # ifdef _MSC_VER
182
+ # define _CREATE_TUPLE(type, ...) type{{ __VA_ARGS__ }}
183
+ # else
184
+ # define _CREATE_TUPLE(type, ...) (type){{ __VA_ARGS__ }}
185
+ # endif
186
+ static HEDLEY_ALWAYS_INLINE uint8x16x2_t vcreate2_u8(uint8x16_t a, uint8x16_t b) { // bundles two uint8x16_t vectors into a uint8x16x2_t, a first then b
187
+ return _CREATE_TUPLE(uint8x16x2_t, a, b); // _CREATE_TUPLE picks the brace-initialiser spelling for the current compiler
188
+ }
189
+ static HEDLEY_ALWAYS_INLINE int8x16x2_t vcreate2_s8(int8x16_t a, int8x16_t b) { // bundles two int8x16_t vectors into an int8x16x2_t, a first then b
190
+ return _CREATE_TUPLE(int8x16x2_t, a, b); // _CREATE_TUPLE picks the brace-initialiser spelling for the current compiler
191
+ }
192
+ static HEDLEY_ALWAYS_INLINE uint8x16x3_t vcreate3_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) { // bundles three uint8x16_t vectors into a uint8x16x3_t, in argument order
193
+ return _CREATE_TUPLE(uint8x16x3_t, a, b, c); // _CREATE_TUPLE picks the brace-initialiser spelling for the current compiler
194
+ }
195
+ static HEDLEY_ALWAYS_INLINE uint8x16x4_t vcreate4_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c, uint8x16_t d) { // bundles four uint8x16_t vectors into a uint8x16x4_t, in argument order
196
+ return _CREATE_TUPLE(uint8x16x4_t, a, b, c, d); // _CREATE_TUPLE picks the brace-initialiser spelling for the current compiler
197
+ }
198
+ # undef _CREATE_TUPLE
199
+ #endif
200
+ #ifdef PLATFORM_ARM
201
+ bool cpu_supports_neon();
202
+ #endif
203
+
204
+ #ifdef _MSC_VER
205
+ #define ALIGN_TO(a, v) __declspec(align(a)) v
206
+ #else
207
+ #define ALIGN_TO(a, v) v __attribute__((aligned(a)))
208
+ #endif
209
+
210
+
211
+ #ifdef PLATFORM_X86
212
+ enum YEncDecIsaLevel { // x86 ISA levels; the two lowest bits are feature flags that get OR-ed onto a level value (see ISA_NATIVE)
213
+ ISA_FEATURE_POPCNT = 0x1, // flag bit: POPCNT
214
+ ISA_FEATURE_LZCNT = 0x2, // flag bit: LZCNT
215
+ ISA_LEVEL_SSE2 = 0x100, // lowest level listed; also the ISA_NATIVE fallback
216
+ ISA_LEVEL_SSSE3 = 0x200,
217
+ ISA_LEVEL_SSE41 = 0x300,
218
+ ISA_LEVEL_SSE4_POPCNT = 0x301, // ISA_LEVEL_SSE41 | ISA_FEATURE_POPCNT
219
+ ISA_LEVEL_AVX = 0x381, // same as above, just used as a differentiator for `cpu_supports_isa`
220
+ ISA_LEVEL_AVX2 = 0x383, // also includes BMI1/2 and LZCNT
221
+ ISA_LEVEL_AVX3 = 0x403, // SKX variant; AVX512VL + AVX512BW
222
+ ISA_LEVEL_VBMI2 = 0x503 // ICL
223
+ };
224
+ #ifdef _MSC_VER
225
+ // native tuning not supported in MSVC
226
+ # define ISA_NATIVE ISA_LEVEL_SSE2
227
+ #else
228
+ # if defined(__AVX512VBMI2__)
229
+ # define _ISA_NATIVE ISA_LEVEL_VBMI2
230
+ # elif defined(__AVX512BW__)
231
+ # define _ISA_NATIVE ISA_LEVEL_AVX3
232
+ # elif defined(__AVX2__)
233
+ # define _ISA_NATIVE ISA_LEVEL_AVX2
234
+ # elif defined(__SSE4_1__)
235
+ # define _ISA_NATIVE ISA_LEVEL_SSE41
236
+ # elif defined(__SSSE3__)
237
+ # define _ISA_NATIVE ISA_LEVEL_SSSE3
238
+ # else
239
+ # define _ISA_NATIVE ISA_LEVEL_SSE2
240
+ # endif
241
+ # if defined(__POPCNT__)
242
+ # if defined(__LZCNT__)
243
+ # define ISA_NATIVE (enum YEncDecIsaLevel)(_ISA_NATIVE | ISA_FEATURE_POPCNT | ISA_FEATURE_LZCNT)
244
+ # else
245
+ # define ISA_NATIVE (enum YEncDecIsaLevel)(_ISA_NATIVE | ISA_FEATURE_POPCNT)
246
+ # endif
247
+ # else
248
+ # define ISA_NATIVE _ISA_NATIVE
249
+ # endif
250
+ #endif
251
+
252
+ #ifdef _MSC_VER
253
+ # define _cpuid1(ar) __cpuid(ar, 1)
254
+ #else
255
+ # include <cpuid.h>
256
+ # define _cpuid1(ar) __cpuid(1, ar[0], ar[1], ar[2], ar[3])
257
+ #endif
258
+
259
+ int cpu_supports_isa();
260
+ #endif // PLATFORM_X86
261
+
262
+ #include <string.h>
263
+ #if !defined(_MSC_VER) || defined(_STDINT) || _MSC_VER >= 1900
264
+ # include <stdint.h>
265
+ # include <stddef.h>
266
+ #else
267
+ /* Workaround for older MSVC not supporting stdint.h - just pull it from V8 */
268
+ # include <v8.h>
269
+ #endif
270
+
271
+
272
+ // GCC 8/9/10(dev) fails to optimize cases where KNOT should be used, so use intrinsic explicitly; Clang 6+ has no issue, but Clang 6/7 doesn't have the intrinsic; MSVC 2019 also fails and lacks the intrinsic
273
+ #if defined(__GNUC__) && __GNUC__ >= 7
274
+ # define KNOT16 _knot_mask16
275
+ # define KNOT32 _knot_mask32
276
+ #else
277
+ # define KNOT16(x) ((__mmask16)~(x))
278
+ # define KNOT32(x) ((__mmask32)~(x))
279
+ #endif
280
+
281
+ // weird thing with Apple's Clang; doesn't seem to always occur, so assume that Clang >= 9 is fine: https://github.com/animetosho/node-yencode/issues/8#issuecomment-583385864
282
+ // seems that Clang < 3.6 also uses the old name
283
+ #if defined(__clang__) && ((defined(__APPLE__) && __clang_major__ < 9) || __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ < 6))
284
+ # define _lzcnt_u32 __lzcnt32
285
+ #endif
286
+
287
+
288
+
289
+ #ifdef __GNUC__
290
+ # if __GNUC__ >= 9
291
+ # define LIKELIHOOD(p, c) (HEDLEY_PREDICT(!!(c), 1, p))
292
+ # else
293
+ # define LIKELIHOOD(p, c) (p>0.3 && p<0.7 ? HEDLEY_UNPREDICTABLE(!!(c)) : __builtin_expect(!!(c), (p >= 0.5)))
294
+ # endif
295
+ #else
296
+ # define LIKELIHOOD(p, c) (c)
297
+ #endif
298
+
299
+ #endif /* __YENC_COMMON */
package/src/crc.cc ADDED
@@ -0,0 +1,95 @@
1
+ #include "crc_common.h"
2
+
3
+ #include "interface.h"
4
+ crcutil_interface::CRC* crc = NULL;
5
+
6
+ static uint32_t do_crc32_incremental_generic(const void* data, size_t length, uint32_t init) { // default crc_func: continues CRC `init` over `length` bytes at `data` through the crcutil instance; `crc` starts out NULL, so crc_init() must have run first
7
+ crcutil_interface::UINT64 tmp = init; // crcutil takes and returns the CRC through a UINT64 in/out argument
8
+ crc->Compute(data, length, &tmp);
9
+ return (uint32_t)tmp; // narrow back to the 32-bit CRC
10
+ }
11
+ crc_func _do_crc32_incremental = &do_crc32_incremental_generic;
12
+
13
+
14
+
15
+ uint32_t do_crc32_combine(uint32_t crc1, uint32_t crc2, size_t len2) { // merges crc1 with crc2 (the CRC of a following block of len2 bytes) via crcutil's Concatenate; requires crc_init() to have set `crc`
16
+ crcutil_interface::UINT64 crc1_ = crc1, crc2_ = crc2; // widen to the UINT64 type the crcutil interface uses
17
+ crc->Concatenate(crc2_, 0, len2, &crc1_); // result is written back into crc1_
18
+ return (uint32_t)crc1_;
19
+ }
20
+
21
+ uint32_t do_crc32_zeros(uint32_t crc1, size_t len) { // passes crc1 and len to crcutil's CrcOfZeroes and returns the updated value; requires crc_init() to have set `crc`
22
+ crcutil_interface::UINT64 crc_ = crc1; // in/out argument for the crcutil call
23
+ crc->CrcOfZeroes(len, &crc_);
24
+ return (uint32_t)crc_;
25
+ }
26
+
27
+ void crc_clmul_set_funcs(crc_func*);
28
+ void crc_arm_set_funcs(crc_func*);
29
+
30
+ #if defined(PLATFORM_ARM) && defined(_WIN32)
31
+ # define WIN32_LEAN_AND_MEAN
32
+ # include <Windows.h>
33
+ #endif
34
+ #ifdef PLATFORM_ARM
35
+ # ifdef __ANDROID__
36
+ # include <cpu-features.h>
37
+ # elif defined(__linux__) || (defined(__FreeBSD__) && __FreeBSD__ >= 12)
38
+ # include <sys/auxv.h>
39
+ # include <asm/hwcap.h>
40
+ # elif (defined(__FreeBSD__) && __FreeBSD__ < 12)
41
+ # include <sys/sysctl.h>
42
+ # include <asm/hwcap.h>
43
+ # elif defined(__APPLE__)
44
+ # include <sys/types.h>
45
+ # include <sys/sysctl.h>
46
+ # endif
47
+ # ifdef __FreeBSD__
48
+ static unsigned long getauxval(unsigned long cap) { // FreeBSD shim giving crc_init() a Linux-style getauxval(); implemented with elf_aux_info()
49
+ unsigned long ret = 0; // start at 0 so a failed elf_aux_info() reports "no capability bits" instead of returning an indeterminate value
50
+ elf_aux_info(cap, &ret, sizeof(ret)); // on success this stores the requested auxiliary-vector entry in ret
51
+ return ret;
52
+ }
53
+ # endif
54
+ #endif
55
+ void crc_init() { // creates the crcutil instance used by the generic CRC routines, then swaps _do_crc32_incremental for an accelerated version when the CPU check below passes
56
+ crc = crcutil_interface::CRC::Create(
57
+ 0xEDB88320, 0, 32, true, 0, 0, 0, 0, NULL); // 0xEDB88320 is the polynomial and 32 the degree (same constant used by crc_multiply in crc_arm.cc)
58
+ // instance never deleted... oh well...
59
+
60
+ #ifdef PLATFORM_X86
61
+ int flags[4]; // receives the four registers returned by CPUID leaf 1
62
+ _cpuid1(flags);
63
+ if((flags[2] & 0x80202) == 0x80202) // SSE4.1 + SSSE3 + CLMUL
64
+ crc_clmul_set_funcs(&_do_crc32_incremental); // all three bits set: let the CLMUL implementation install itself
65
+ #endif
66
+ #ifdef PLATFORM_ARM
67
+ # ifdef __APPLE__
68
+ int supported = 0;
69
+ size_t len = sizeof(supported);
70
+ if(sysctlbyname("hw.optional.armv8_crc32", &supported, &len, NULL, 0))
71
+ supported = 0; // non-zero return means the query failed: treat as unsupported
72
+ # endif
73
+ if( // exactly one of the expressions below is compiled in, depending on which detection API's macros are visible
74
+ # if defined(AT_HWCAP2) && defined(HWCAP2_CRC32)
75
+ getauxval(AT_HWCAP2) & HWCAP2_CRC32
76
+ # elif defined(AT_HWCAP) && defined(HWCAP_CRC32)
77
+ getauxval(AT_HWCAP) & HWCAP_CRC32
78
+ # elif defined(ANDROID_CPU_FAMILY_ARM) && defined(__aarch64__)
79
+ android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_CRC32 // NOTE(review): confirm ANDROID_CPU_FAMILY_ARM is a preprocessor macro in cpu-features.h; if it is an enum constant, defined() is false and neither Android branch is ever selected
80
+ # elif defined(ANDROID_CPU_FAMILY_ARM) /* aarch32 */
81
+ android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_CRC32
82
+ # elif defined(_WIN32)
83
+ IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE)
84
+ # elif defined(__APPLE__)
85
+ supported
86
+ # elif defined(__ARM_FEATURE_CRC32)
87
+ true /* assume available if compiled as such */
88
+ # else
89
+ false
90
+ # endif
91
+ ) {
92
+ crc_arm_set_funcs(&_do_crc32_incremental); // installs the ARM CRC routine (a no-op stub when crc_arm.cc was built without CRC support)
93
+ }
94
+ #endif
95
+ }
package/src/crc.h ADDED
@@ -0,0 +1,23 @@
1
+ #ifndef __YENC_CRC_H
2
+ #define __YENC_CRC_H
3
+
4
+ #ifdef __cplusplus
5
+ extern "C" {
6
+ #endif
7
+
8
+
9
+
10
+ typedef uint32_t (*crc_func)(const void*, size_t, uint32_t);
11
+ extern crc_func _do_crc32_incremental;
12
+ #define do_crc32 (*_do_crc32_incremental)
13
+
14
+ uint32_t do_crc32_combine(uint32_t crc1, const uint32_t crc2, size_t len2);
15
+ uint32_t do_crc32_zeros(uint32_t crc1, size_t len);
16
+ void crc_init();
17
+
18
+
19
+
20
+ #ifdef __cplusplus
21
+ }
22
+ #endif
23
+ #endif
package/src/crc_arm.cc ADDED
@@ -0,0 +1,175 @@
1
+ #include "crc_common.h"
2
+
3
+ #if defined(PLATFORM_ARM) && defined(_MSC_VER) && defined(__clang__) && !defined(__ARM_FEATURE_CRC32)
4
+ // I don't think GYP provides a nice way to detect whether MSVC or clang-cl is being used, but it doesn't use clang-cl by default, so a warning here is probably sufficient
5
+ HEDLEY_WARNING("CRC32 acceleration is not enabled under ARM clang-cl by default; add `-march=armv8-a+crc` to additional compiler arguments to enable");
6
+ #endif
7
+
8
+ #if defined(__ARM_FEATURE_CRC32) || (defined(_M_ARM64) && !defined(__clang__)) // MSVC doesn't support CRC for ARM32
9
+
10
+ /* ARMv8 accelerated CRC */
11
+ #if defined(_MSC_VER) && !defined(__clang__)
12
+ #include <intrin.h>
13
+ #else
14
+ #include <arm_acle.h>
15
+ #endif
16
+
17
+ #ifdef __aarch64__
18
+ # define WORD_T uint64_t
19
+ # define WORDSIZE_LOG 3 // sizeof(WORD_T) == 1<<WORDSIZE_LOG
20
+ # define CRC_WORD __crc32d
21
+ #else
22
+ # define WORD_T uint32_t
23
+ # define WORDSIZE_LOG 2 // sizeof(WORD_T) == 1<<WORDSIZE_LOG
24
+ # define CRC_WORD __crc32w
25
+ #endif
26
+
27
+
28
+ // exploit CPU pipelining during CRC computation; unfortunately I haven't been able to measure any benefit
29
+ // - Neoverse N1: no noticeable difference
30
+ // - Cortex A53: actually runs a bit slower
31
+ //#define ENABLE_PIPELINE_OPT 1
32
+
33
+ #ifdef ENABLE_PIPELINE_OPT
34
+ // workaround MSVC complaining "unary minus operator applied to unsigned type, result still unsigned"
35
+ #define NEGATE(n) (uint32_t)(-((int32_t)(n)))
36
+
37
+ static HEDLEY_ALWAYS_INLINE uint32_t crc_multiply(uint32_t a, uint32_t b) { // bitwise carry-less multiply of a by b, reduced by the polynomial constant 0xEDB88320; used to build crc_power and to merge the two CRC streams in the pipelined path
38
+ uint32_t res = 0;
39
+ for(int i=0; i<31; i++) { // handles 31 of the 32 bits of b; the final bit is done after the loop so the last update of a is skipped
40
+ res ^= NEGATE(b>>31) & a; // NEGATE turns the top bit of b into an all-ones/all-zeros mask: add a if that bit is set
41
+ a = ((a >> 1) ^ (0xEDB88320 & NEGATE(a&1))); // shift a one position, folding in the polynomial when a bit drops off the low end
42
+ b <<= 1; // move the next bit of b into the top position
43
+ }
44
+ res ^= NEGATE(b>>31) & a; // 32nd and last bit of b
45
+ return res;
46
+ }
47
+
48
+ static const uint32_t crc_power[] = { // pre-computed 2^n, with first 3 entries removed (saves a shift)
49
+ 0x00800000, 0x00008000, 0xedb88320, 0xb1e6b092, 0xa06a2517, 0xed627dae, 0x88d14467, 0xd7bbfe6a,
50
+ 0xec447f11, 0x8e7ea170, 0x6427800e, 0x4d47bae0, 0x09fe548f, 0x83852d0f, 0x30362f1a, 0x7b5a9cc3,
51
+ 0x31fec169, 0x9fec022a, 0x6c8dedc4, 0x15d6874d, 0x5fde7a4e, 0xbad90e37, 0x2e4e5eef, 0x4eaba214,
52
+ 0xa8a472c0, 0x429a969e, 0x148d302a, 0xc40ba6d0, 0xc4e22c3c, 0x40000000, 0x20000000, 0x08000000
53
+ };
54
+ /* above table can be computed with
55
+ int main(void) {
56
+ uint32_t k = 0x80000000 >> 1;
57
+ for (size_t i = 0; i < 32+3; ++i) {
58
+ if(i>2) printf("0x%08x, ", k);
59
+ k = crc_multiply(k, k);
60
+ }
61
+ return 0;
62
+ }
63
+ */
64
+ #endif
65
+
66
+
67
+ // inspired/stolen off https://github.com/jocover/crc32_armv8/blob/master/crc32_armv8.c
68
+ static uint32_t arm_crc_calc(uint32_t crc, const unsigned char *src, ptrdiff_t len) { // runs len bytes at src through the ARM CRC32 intrinsics starting from state crc and returns the new state (the caller does the ~ inversions); len is signed because the main loop below lets it go negative, and ptrdiff_t (not long) so lengths >= 2GiB are not truncated where long is 32-bit, e.g. Windows ARM64
69
+
70
+ // initial alignment
71
+ if (len >= 16) { // 16 is an arbitrary number; it just needs to be >=8
72
+ if ((uintptr_t)src & sizeof(uint8_t)) { // odd address: consume one byte
73
+ crc = __crc32b(crc, *src);
74
+ src++;
75
+ len--;
76
+ }
77
+ if ((uintptr_t)src & sizeof(uint16_t)) { // not 4-byte aligned: consume a halfword
78
+ crc = __crc32h(crc, *((uint16_t *)src));
79
+ src += sizeof(uint16_t);
80
+ len -= sizeof(uint16_t);
81
+ }
82
+ #ifdef __aarch64__
83
+ if ((uintptr_t)src & sizeof(uint32_t)) { // not 8-byte aligned: consume a word (only needed when WORD_T is 64-bit)
84
+ crc = __crc32w(crc, *((uint32_t *)src));
85
+ src += sizeof(uint32_t);
86
+ len -= sizeof(uint32_t);
87
+ }
88
+ #endif
89
+ }
90
+
91
+ const WORD_T* srcW = (const WORD_T*)src; // from here on the data is read in WORD_T-sized units
92
+
93
+ #ifdef ENABLE_PIPELINE_OPT
94
+ // uses ideas from https://github.com/komrad36/crc#option-13-golden
95
+ // (this is a slightly less efficient, but much simpler implementation of the idea)
96
+ const unsigned SPLIT_WORDS_LOG = 10; // make sure it's at least 2
97
+ const unsigned SPLIT_WORDS = 1<<SPLIT_WORDS_LOG;
98
+ while(len >= (ptrdiff_t)(sizeof(WORD_T)*SPLIT_WORDS*2)) {
99
+ // compute 2x CRCs concurrently to leverage pipelining
100
+ uint32_t crc2 = 0;
101
+ for(unsigned i=0; i<SPLIT_WORDS; i+=4) {
102
+ crc = CRC_WORD(crc, *srcW);
103
+ crc2 = CRC_WORD(crc2, *(srcW + SPLIT_WORDS));
104
+ srcW++;
105
+ crc = CRC_WORD(crc, *srcW);
106
+ crc2 = CRC_WORD(crc2, *(srcW + SPLIT_WORDS));
107
+ srcW++;
108
+ crc = CRC_WORD(crc, *srcW);
109
+ crc2 = CRC_WORD(crc2, *(srcW + SPLIT_WORDS));
110
+ srcW++;
111
+ crc = CRC_WORD(crc, *srcW);
112
+ crc2 = CRC_WORD(crc2, *(srcW + SPLIT_WORDS));
113
+ srcW++;
114
+ }
115
+ // merge the CRCs
116
+ // since we're multiplying by a fixed number, it could be sped up with some lookup tables
117
+ crc = crc_multiply(crc, crc_power[SPLIT_WORDS_LOG + WORDSIZE_LOG]) ^ crc2;
118
+ srcW += SPLIT_WORDS; // skip the second half, which crc2 already covered
119
+ len -= sizeof(WORD_T)*SPLIT_WORDS*2;
120
+ }
121
+ #endif
122
+
123
+ while ((len -= sizeof(WORD_T)*8) >= 0) { // main loop, 8 words per pass; it exits with len negative, but the low bits of len still equal the remaining byte count, which the masks below rely on
124
+ crc = CRC_WORD(crc, *(srcW++));
125
+ crc = CRC_WORD(crc, *(srcW++));
126
+ crc = CRC_WORD(crc, *(srcW++));
127
+ crc = CRC_WORD(crc, *(srcW++));
128
+ crc = CRC_WORD(crc, *(srcW++));
129
+ crc = CRC_WORD(crc, *(srcW++));
130
+ crc = CRC_WORD(crc, *(srcW++));
131
+ crc = CRC_WORD(crc, *(srcW++));
132
+ }
133
+ if (len & sizeof(WORD_T)*4) { // 4 words left over
134
+ crc = CRC_WORD(crc, *(srcW++));
135
+ crc = CRC_WORD(crc, *(srcW++));
136
+ crc = CRC_WORD(crc, *(srcW++));
137
+ crc = CRC_WORD(crc, *(srcW++));
138
+ }
139
+ if (len & sizeof(WORD_T)*2) { // 2 words left over
140
+ crc = CRC_WORD(crc, *(srcW++));
141
+ crc = CRC_WORD(crc, *(srcW++));
142
+ }
143
+ if (len & sizeof(WORD_T)) { // 1 word left over
144
+ crc = CRC_WORD(crc, *(srcW++));
145
+ }
146
+ src = (const unsigned char*)srcW; // back to byte addressing for the sub-word tail
147
+
148
+ #ifdef __aarch64__
149
+ if (len & sizeof(uint32_t)) {
150
+ crc = __crc32w(crc, *((uint32_t *)src));
151
+ src += sizeof(uint32_t);
152
+ }
153
+ #endif
154
+ if (len & sizeof(uint16_t)) {
155
+ crc = __crc32h(crc, *((uint16_t *)src));
156
+ src += sizeof(uint16_t);
157
+ }
158
+ if (len & sizeof(uint8_t))
159
+ crc = __crc32b(crc, *src);
160
+
161
+ return crc;
162
+ }
163
+
164
+ static uint32_t do_crc32_incremental_arm(const void* data, size_t length, uint32_t init) { // crc_func wrapper: inverts the CRC on the way in and out around arm_crc_calc
165
+ return ~arm_crc_calc(~init, (const unsigned char*)data, (ptrdiff_t)length); // ptrdiff_t keeps the full width of length on every target
166
+ }
167
+
168
+ void crc_arm_set_funcs(crc_func* _do_crc32_incremental) { // variant built when ARM CRC32 support is compiled in: installs the accelerated routine
169
+ *_do_crc32_incremental = &do_crc32_incremental_arm; // overwrite the caller's function pointer
170
+ }
171
+ #else
172
+ void crc_arm_set_funcs(crc_func* _do_crc32_incremental) { // stub built when ARM CRC32 support is not compiled in: leaves the caller's function pointer untouched
173
+ (void)_do_crc32_incremental; // silences the unused-parameter warning
174
+ }
175
+ #endif
@@ -0,0 +1,4 @@
1
+ #include "common.h"
2
+ #include <stddef.h> // for size_t
3
+ #include "crc.h"
4
+