yencode 1.2.0 → 1.2.1

package/binding.gyp CHANGED
@@ -78,7 +78,7 @@
 "targets": [
   {
     "target_name": "yencode",
-    "dependencies": ["yencode_sse2", "yencode_ssse3", "yencode_clmul", "yencode_clmul256", "yencode_avx", "yencode_avx2", "yencode_vbmi2", "yencode_neon", "yencode_armcrc", "yencode_rvv", "yencode_zbkc"],
+    "dependencies": ["yencode_sse2", "yencode_ssse3", "yencode_clmul", "yencode_clmul256", "yencode_avx", "yencode_avx2", "yencode_vbmi2", "yencode_neon", "yencode_armcrc", "yencode_pmull", "yencode_rvv", "yencode_zbkc"],
     "sources": [
       "src/yencode.cc",
       "src/platform.cc",
@@ -416,6 +416,42 @@
      }]
    ]
  },
+ {
+   "target_name": "yencode_pmull",
+   "type": "static_library",
+   "sources": [
+     "src/crc_arm_pmull.cc"
+   ],
+   "cflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
+   "cxxflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
+   "xcode_settings": {
+     "OTHER_CFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
+     "OTHER_CXXFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"]
+   },
+   "msvs_settings": {"VCCLCompilerTool": {"BufferSecurityCheck": "false"}},
+   "conditions": [
+     ['target_arch in "arm arm64"', {
+       "cflags!": ["-march=native"],
+       "cxxflags!": ["-march=native"],
+       "cflags": ["-march=armv8-a+crc+crypto"],
+       "cxxflags": ["-march=armv8-a+crc+crypto"],
+       "xcode_settings": {
+         "OTHER_CFLAGS!": ["-march=native"],
+         "OTHER_CXXFLAGS!": ["-march=native"],
+         "OTHER_CFLAGS": ["-march=armv8-a+crc+crypto"],
+         "OTHER_CXXFLAGS": ["-march=armv8-a+crc+crypto"],
+       }
+     }],
+     ['OS!="win" and target_arch=="arm"', {
+       "cflags": ["-mfpu=neon","-fno-lto"],
+       "cxxflags": ["-mfpu=neon","-fno-lto"],
+       "xcode_settings": {
+         "OTHER_CFLAGS": ["-mfpu=neon","-fno-lto"],
+         "OTHER_CXXFLAGS": ["-mfpu=neon","-fno-lto"]
+       }
+     }]
+   ]
+ },
  {
    "target_name": "yencode_zbkc",
    "type": "static_library",
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "yencode",
-  "version": "1.2.0",
+  "version": "1.2.1",
   "description": "SIMD accelerated yEnc encoder/decoder and CRC32 calculator",
   "keywords": [
     "yenc",
package/src/common.h CHANGED
@@ -125,7 +125,7 @@
 #ifdef __POPCNT__
 #include <nmmintrin.h>
 // POPCNT can never return a negative result, but GCC doesn't seem to realise this, so typecast it to hint it better
-#define popcnt32 (unsigned int)_mm_popcnt_u32
+#define popcnt32 (unsigned int)_mm_popcnt_u32
 #endif
 
 #if defined(__AVX2__) || defined(__AVX512F__)
@@ -209,7 +209,9 @@ static HEDLEY_ALWAYS_INLINE uint8x16x4_t vcreate4_u8(uint8x16_t a, uint8x16_t b,
 # undef _CREATE_TUPLE
 #endif
 #ifdef PLATFORM_ARM
-bool cpu_supports_neon();
+namespace RapidYenc {
+	bool cpu_supports_neon();
+}
 #endif
 
 #ifdef _MSC_VER
@@ -240,6 +242,7 @@ enum YEncDecIsaLevel {
 enum YEncDecIsaLevel {
 	ISA_GENERIC = 0,
 	ISA_FEATURE_CRC = 8,
+	ISA_FEATURE_PMULL = 0x40,
 	ISA_LEVEL_NEON = 0x1000
 };
 #elif defined(__riscv)
@@ -274,7 +277,7 @@ enum YEncDecIsaLevel {
 # if defined(__POPCNT__)
 #  if defined(__LZCNT__)
 #   define ISA_NATIVE (enum YEncDecIsaLevel)(_ISA_NATIVE | ISA_FEATURE_POPCNT | ISA_FEATURE_LZCNT)
-#  else
+#  else
 #   define ISA_NATIVE (enum YEncDecIsaLevel)(_ISA_NATIVE | ISA_FEATURE_POPCNT)
 #  endif
 # else
@@ -282,12 +285,17 @@ enum YEncDecIsaLevel {
 # endif
 #endif
 
-int cpu_supports_isa();
+namespace RapidYenc {
+	int cpu_supports_isa();
+	int cpu_supports_crc_isa();
+}
 #endif // PLATFORM_X86
 
 
 #ifdef __riscv
-bool cpu_supports_rvv();
+namespace RapidYenc {
+	bool cpu_supports_rvv();
+}
 #endif
 #if defined(__riscv_vector) && defined(HEDLEY_GCC_VERSION) && !HEDLEY_GCC_VERSION_CHECK(13,0,0)
 // GCC added RVV intrinsics in GCC13
@@ -318,7 +326,11 @@ bool cpu_supports_rvv();
 # include <stddef.h>
 #else
 /* Workaround for older MSVC not supporting stdint.h - just pull it from V8 */
-# include <v8.h>
+# if defined(NODE_GYP_MODULE_NAME) || defined(V8_DEPRECATION_WARNINGS)
+#  include <v8.h>
+# else
+#  include "stdint.h"
+# endif
 #endif
 
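
Note: the ISA levels above are bit flags, so the new ISA_FEATURE_PMULL bit (0x40) can be tested with a simple mask against the value reported by crc32_isa_level(). An illustrative check (enum values copied from the hunk above; the surrounding program is hypothetical):

    #include <cstdio>

    enum YEncDecIsaLevel {
        ISA_GENERIC = 0,
        ISA_FEATURE_CRC = 8,
        ISA_FEATURE_PMULL = 0x40,
        ISA_LEVEL_NEON = 0x1000
    };

    int main() {
        // e.g. a value reported by crc32_isa_level() on a CPU with CRC+PMULL
        int level = ISA_FEATURE_CRC | ISA_FEATURE_PMULL;
        if (level & ISA_FEATURE_PMULL)
            std::printf("crc32 shift/multiply can use PMULL\n");
        return 0;
    }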
 
package/src/crc.cc CHANGED
@@ -133,6 +133,7 @@ static void generate_crc32_slice_table() {
 #endif
 
 
+namespace RapidYenc {
 
 // workaround MSVC complaining "unary minus operator applied to unsigned type, result still unsigned"
 #define NEGATE(n) (uint32_t)(-((int32_t)(n)))
@@ -180,9 +181,10 @@ uint32_t crc32_shift_generic(uint32_t crc1, uint32_t n) {
 #endif
 	return result;
 }
+} // namespace
 
 
-extern "C" {
+namespace RapidYenc {
 	crc_func _do_crc32_incremental = &do_crc32_incremental_generic;
 	crc_mul_func _crc32_shift = &crc32_shift_generic;
 	crc_mul_func _crc32_multiply = &crc32_multiply_generic;
@@ -191,15 +193,6 @@ extern "C" {
 
 
 
-void crc_clmul_set_funcs();
-void crc_clmul256_set_funcs();
-void crc_arm_set_funcs();
-void crc_riscv_set_funcs();
-
-#ifdef PLATFORM_X86
-int cpu_supports_crc_isa();
-#endif
-
 #if defined(PLATFORM_ARM) && defined(_WIN32)
 # define WIN32_LEAN_AND_MEAN
 # include <Windows.h>
@@ -234,7 +227,7 @@ static unsigned long getauxval(unsigned long cap) {
 # endif
 #endif
 
-void crc_init() {
+void RapidYenc::crc32_init() {
 	GENERIC_CRC_INIT;
 
 #ifdef PLATFORM_X86
@@ -246,31 +239,47 @@ void crc_init() {
 #endif
 #ifdef PLATFORM_ARM
 # ifdef __APPLE__
-	int supported = 0;
-	size_t len = sizeof(supported);
-	if(sysctlbyname("hw.optional.armv8_crc32", &supported, &len, NULL, 0))
-		supported = 0;
-# endif
-	if(
-# if defined(AT_HWCAP2) && defined(HWCAP2_CRC32)
-		getauxval(AT_HWCAP2) & HWCAP2_CRC32
-# elif defined(AT_HWCAP) && defined(HWCAP_CRC32)
-		getauxval(AT_HWCAP) & HWCAP_CRC32
-# elif defined(ANDROID_CPU_FAMILY_ARM) && defined(__aarch64__)
-		android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_CRC32
-# elif defined(ANDROID_CPU_FAMILY_ARM) /* aarch32 */
-		android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_CRC32
-# elif defined(_WIN32)
-		IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE)
-# elif defined(__APPLE__)
-		supported
-# elif defined(__ARM_FEATURE_CRC32)
-		true /* assume available if compiled as such */
+	int supports_crc = 0;
+	int supports_pmull = 0;
+	size_t len = sizeof(supports_crc);
+	if(sysctlbyname("hw.optional.armv8_crc32", &supports_crc, &len, NULL, 0))
+		supports_crc = 0;
+	if(sysctlbyname("hw.optional.arm.FEAT_PMULL", &supports_pmull, &len, NULL, 0))
+		supports_pmull = 0;
 # else
-		false
+	bool supports_crc = false;
+	bool supports_pmull = false;
+# if defined(AT_HWCAP2) && defined(HWCAP2_CRC32)
+	supports_crc = getauxval(AT_HWCAP2) & HWCAP2_CRC32;
+# elif defined(AT_HWCAP) && defined(HWCAP_CRC32)
+	supports_crc = getauxval(AT_HWCAP) & HWCAP_CRC32;
+# elif defined(ANDROID_CPU_FAMILY_ARM) && defined(__aarch64__)
+	supports_crc = android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_CRC32;
+	supports_pmull = android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_PMULL;
+# elif defined(ANDROID_CPU_FAMILY_ARM) /* aarch32 */
+	supports_crc = android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_CRC32;
+	supports_pmull = android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_PMULL;
+# elif defined(_WIN32)
+	supports_crc = IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE);
+	supports_pmull = IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
+# else
+#ifdef __ARM_FEATURE_CRC32
+	supports_crc = true; /* assume available if compiled as such */
+#endif
+#ifdef __ARM_FEATURE_CRYPTO
+	supports_pmull = true;
+#endif
+# endif
+# if defined(AT_HWCAP2) && defined(HWCAP2_PMULL)
+	supports_pmull = getauxval(AT_HWCAP2) & HWCAP2_PMULL;
+# elif defined(AT_HWCAP) && defined(HWCAP_PMULL)
+	supports_pmull = getauxval(AT_HWCAP) & HWCAP_PMULL;
+# endif
 # endif
-	) {
+
+	if(supports_crc) {
 		crc_arm_set_funcs();
+		if(supports_pmull) crc_pmull_set_funcs();
 	}
 #endif
 #ifdef __riscv
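
Note: crc32_init() probes the OS once (getauxval on Linux, sysctlbyname on macOS, IsProcessorFeaturePresent on Windows, android_getCpuFeatures on Android) and then repoints function pointers, so steady-state callers pay only an indirect call. A condensed sketch of that dispatch shape, with stand-in names rather than the package's:

    #include <cstddef>
    #include <cstdint>

    typedef uint32_t (*crc_func)(const void*, size_t, uint32_t);

    // stand-ins for do_crc32_incremental_generic / the hardware implementation
    static uint32_t crc32_generic(const void*, size_t, uint32_t init) { return init; }
    static uint32_t crc32_hw(const void*, size_t, uint32_t init) { return init; }

    static crc_func do_crc32 = &crc32_generic;  // safe default

    static bool detect_hw_crc() {
    #if defined(__ARM_FEATURE_CRC32)
        return true;   // compiled with CRC instructions unconditionally enabled
    #else
        return false;  // real code asks the OS, as in the hunk above
    #endif
    }

    void my_crc_init() {
        if (detect_hw_crc())
            do_crc32 = &crc32_hw;  // all later calls dispatch through the pointer
    }
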
package/src/crc.h CHANGED
@@ -2,25 +2,25 @@
 #define __YENC_CRC_H
 #include <stdlib.h> // for llabs
 
-#ifdef __cplusplus
-extern "C" {
+#if !defined(__GNUC__) && defined(_MSC_VER)
+# include <intrin.h>
 #endif
 
+namespace RapidYenc {
 
 
 typedef uint32_t (*crc_func)(const void*, size_t, uint32_t);
 extern crc_func _do_crc32_incremental;
 
 extern int _crc32_isa;
-#define do_crc32 (*_do_crc32_incremental)
+static inline uint32_t crc32(const void* data, size_t length, uint32_t init) {
+	return (*_do_crc32_incremental)(data, length, init);
+}
 static inline int crc32_isa_level() {
 	return _crc32_isa;
 }
 
 
-#if !defined(__GNUC__) && defined(_MSC_VER)
-# include <intrin.h>
-#endif
 // computes `n % 0xffffffff` (well, almost), using some bit-hacks
 static inline uint32_t crc32_powmod(uint64_t n) {
 #ifdef __GNUC__
@@ -28,7 +28,7 @@ static inline uint32_t crc32_powmod(uint64_t n) {
 	unsigned carry = __builtin_uadd_overflow(n >> 32, n, &res);
 	res += carry;
 	return res;
-#elif defined(_MSC_VER)
+#elif defined(_MSC_VER) && defined(PLATFORM_X86)
 	unsigned res;
 	unsigned char carry = _addcarry_u32(0, n >> 32, n, &res);
 	_addcarry_u32(carry, res, 0, &res);
@@ -59,8 +59,12 @@ static inline uint32_t crc32_bytepow(uint64_t n) {
 typedef uint32_t (*crc_mul_func)(uint32_t, uint32_t);
 extern crc_mul_func _crc32_shift;
 extern crc_mul_func _crc32_multiply;
-#define crc32_shift (*_crc32_shift)
-#define crc32_multiply (*_crc32_multiply)
+static inline uint32_t crc32_shift(uint32_t a, uint32_t b) {
+	return (*_crc32_shift)(a, b);
+}
+static inline uint32_t crc32_multiply(uint32_t a, uint32_t b) {
+	return (*_crc32_multiply)(a, b);
+}
 
 static inline uint32_t crc32_combine(uint32_t crc1, uint32_t crc2, uint64_t len2) {
 	return crc32_shift(crc1, crc32_bytepow(len2)) ^ crc2;
@@ -79,11 +83,9 @@ static inline uint32_t crc32_256pow(uint64_t n) {
 	return crc32_shift(0x80000000, crc32_bytepow(n));
 }
 
-void crc_init();
+void crc32_init();
 
 
 
-#ifdef __cplusplus
-}
-#endif
-#endif
+} // namespace
+#endif // defined(__YENC_CRC_H)
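
Note: crc32_combine() relies on CRC32 being linear over GF(2): the CRC of a concatenation A||B equals the CRC of A advanced by len(B)*8 zero bits (crc32_shift with crc32_bytepow), XORed with the CRC of B. Assuming crc32_init() has already run so the function pointers are set, usage would look like this hypothetical helper:

    #include "crc.h"

    // CRC a buffer in two pieces, then merge; the result should match one
    // crc32() pass over the whole buffer. Names are from RapidYenc's crc.h.
    uint32_t crc_of_parts(const unsigned char* buf, size_t split, size_t total) {
        uint32_t crcA = RapidYenc::crc32(buf, split, 0);
        uint32_t crcB = RapidYenc::crc32(buf + split, total - split, 0);
        // chaining should also work: RapidYenc::crc32(buf + split, total - split, crcA)
        return RapidYenc::crc32_combine(crcA, crcB, total - split);
    }
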
package/src/crc_arm.cc CHANGED
@@ -61,7 +61,7 @@ HEDLEY_WARNING("CRC32 acceleration has been disabled due to missing arm_acle.h")
 
 
 #ifdef __aarch64__
-uint32_t crc32_multiply_arm(uint32_t a, uint32_t b) {
+static uint32_t crc32_multiply_arm(uint32_t a, uint32_t b) {
 	// perform PMULL
 	uint64_t res = 0;
 	uint64_t a64 = (uint64_t)a << 32;
@@ -86,8 +86,7 @@ uint32_t crc32_multiply_arm(uint32_t a, uint32_t b) {
 
 #ifdef ENABLE_PIPELINE_OPT
 #ifndef __aarch64__
-uint32_t crc32_multiply_generic(uint32_t a, uint32_t b);
-# define crc32_multiply_arm crc32_multiply_generic
+# define crc32_multiply_arm RapidYenc::crc32_multiply_generic
 #endif
 #endif
 
@@ -124,7 +123,7 @@ static uint32_t arm_crc_calc(uint32_t crc, const unsigned char *src, long len) {
 	// (this is a slightly less efficient, but much simpler implementation of the idea)
 	const unsigned SPLIT_WORDS_LOG = 10; // make sure it's at least 2
 	const unsigned SPLIT_WORDS = 1<<SPLIT_WORDS_LOG;
-	const unsigned blockCoeff = crc_power[SPLIT_WORDS_LOG + WORDSIZE_LOG + 3];
+	const unsigned blockCoeff = RapidYenc::crc_power[SPLIT_WORDS_LOG + WORDSIZE_LOG + 3];
 	while(len >= (long)(sizeof(WORD_T)*SPLIT_WORDS*2)) {
 		// compute 2x CRCs concurrently to leverage piplining
 		uint32_t crc2 = 0;
@@ -196,7 +195,7 @@ static uint32_t do_crc32_incremental_arm(const void* data, size_t length, uint32
 
 
 #if defined(__aarch64__) && (defined(__GNUC__) || defined(_MSC_VER))
-uint32_t crc32_shift_arm(uint32_t crc1, uint32_t n) {
+static uint32_t crc32_shift_arm(uint32_t crc1, uint32_t n) {
 	uint32_t result = crc1;
 	uint64_t prod = result;
 	prod <<= 32 - (n&31);
@@ -204,7 +203,7 @@ uint32_t crc32_shift_arm(uint32_t crc1, uint32_t n) {
 	n &= ~31;
 
 	while(n) {
-		result = crc32_multiply_arm(result, crc_power[ctz32(n)]);
+		result = crc32_multiply_arm(result, RapidYenc::crc_power[ctz32(n)]);
 		n &= n-1;
 	}
 	return result;
@@ -212,7 +211,7 @@ uint32_t crc32_shift_arm(uint32_t crc1, uint32_t n) {
 #endif
 
 
-void crc_arm_set_funcs() {
+void RapidYenc::crc_arm_set_funcs() {
 	_do_crc32_incremental = &do_crc32_incremental_arm;
 #ifdef __aarch64__
 	_crc32_multiply = &crc32_multiply_arm;
@@ -223,5 +222,5 @@ void crc_arm_set_funcs() {
 	_crc32_isa = ISA_FEATURE_CRC;
 }
 #else
-void crc_arm_set_funcs() {}
+void RapidYenc::crc_arm_set_funcs() {}
 #endif
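
Note: crc32_shift_arm() advances a CRC by n zero bits using the binary decomposition of n: x^n is the product of x^(2^k) over the set bits k of n, and crc_power[] caches exactly those repeated squares, so the loop costs one multiply per set bit (ctz32 finds the bit, n &= n-1 clears it). The same loop shape with ordinary modular arithmetic, purely to illustrate the decomposition (nothing below is from the package):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint64_t MOD = 1000000007;
        uint64_t power[32];                 // power[k] = 3^(2^k) mod MOD
        power[0] = 3;
        for (int k = 1; k < 32; k++)
            power[k] = power[k-1] * power[k-1] % MOD;

        uint32_t n = 1000;
        uint64_t result = 1;
        while (n) {                         // one multiply per set bit of n
            result = result * power[__builtin_ctz(n)] % MOD;
            n &= n - 1;
        }
        std::printf("3^1000 mod p = %llu\n", (unsigned long long)result);
        return 0;
    }
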
package/src/crc_arm_pmull.cc ADDED
@@ -0,0 +1,215 @@
+#include "crc_common.h"
+
+// exclude broken/missing arm_acle.h
+#if defined(__ARM_FEATURE_CRYPTO) && defined(HEDLEY_GCC_VERSION)
+# if !defined(__aarch64__) && HEDLEY_GCC_VERSION_CHECK(7,0,0) && !HEDLEY_GCC_VERSION_CHECK(8,1,1)
+#  undef __ARM_FEATURE_CRYPTO
+# endif
+# if defined(__aarch64__) && HEDLEY_GCC_VERSION_CHECK(9,4,0) && !HEDLEY_GCC_VERSION_CHECK(9,5,0)
+#  undef __ARM_FEATURE_CRYPTO
+# endif
+#endif
+#if defined(__ARM_FEATURE_CRYPTO) && defined(__has_include)
+# if !__has_include(<arm_acle.h>)
+#  undef __ARM_FEATURE_CRYPTO
+# endif
+#endif
+
+// ARM's intrinsics guide seems to suggest that vmull_p64 is available on A32, but neither Clang/GCC seem to support it on AArch32
+#if (defined(__ARM_FEATURE_CRYPTO) && defined(__ARM_FEATURE_CRC32) && defined(__aarch64__)) || (defined(_M_ARM64) && !defined(__clang__))
+
+#include <arm_neon.h>
+#if defined(_MSC_VER) && !defined(__clang__)
+# include <intrin.h>
+
+# ifdef _M_ARM64
+// MSVC may detect this pattern: https://devblogs.microsoft.com/cppblog/a-tour-of-4-msvc-backend-improvements/#byteswap-identification
+static HEDLEY_ALWAYS_INLINE uint64_t rbit64(uint64_t x) {
+	x = _byteswap_uint64(x);
+	x = (x & 0xaaaaaaaaaaaaaaaa) >> 1 | (x & 0x5555555555555555) << 1;
+	x = (x & 0xcccccccccccccccc) >> 2 | (x & 0x3333333333333333) << 2;
+	x = (x & 0xf0f0f0f0f0f0f0f0) >> 4 | (x & 0x0f0f0f0f0f0f0f0f) << 4;
+	return x;
+}
+// ...whilst this seems to work best for 32-bit RBIT
+static HEDLEY_ALWAYS_INLINE uint32_t rbit32(uint32_t x) {
+	uint64_t r = rbit64(x);
+	return r >> 32;
+}
+# else
+#  define rbit32 _arm_rbit
+# endif
+#else
+# include <arm_acle.h>
+// __rbit not present before GCC 11.4.0 or 12.2.0; for ARM32, requires GCC 14
+# if defined(HEDLEY_GCC_VERSION) && !HEDLEY_GCC_VERSION_CHECK(14,0,0) && (!defined(__aarch64__) || !HEDLEY_GCC_VERSION_CHECK(11,3,0) || (HEDLEY_GCC_VERSION_CHECK(12,0,0) && !HEDLEY_GCC_VERSION_CHECK(12,2,0)))
+#  ifdef __aarch64__
+static HEDLEY_ALWAYS_INLINE uint64_t rbit64(uint64_t x) {
+	uint64_t r;
+	__asm__ ("rbit %0,%1\n"
+		: "=r"(r) : "r"(x)
+		: /* No clobbers */);
+	return r;
+}
+#  endif
+static HEDLEY_ALWAYS_INLINE uint32_t rbit32(uint32_t x) {
+	uint32_t r;
+	__asm__ (
+#  ifdef __aarch64__
+		"rbit %w0,%w1\n"
+#  else
+		"rbit %0,%1\n"
+#  endif
+		: "=r"(r) : "r"(x)
+		: /* No clobbers */);
+	return r;
+}
+# else
+#  define rbit32 __rbit
+#  define rbit64 __rbitll
+# endif
+#endif
+
+
+// MSVC doesn't have poly64/poly128 types, so always use uint64 instead
+
+#ifdef __aarch64__
+# if defined(__GNUC__) || defined(__clang__)
+static HEDLEY_ALWAYS_INLINE uint64x2_t pmull_low(uint64x1_t a, uint64x1_t b) {
+	uint64x2_t result;
+	__asm__ ("pmull %0.1q,%1.1d,%2.1d"
+		: "=w"(result)
+		: "w"(a), "w"(b)
+		: /* No clobbers */);
+	return result;
+}
+static HEDLEY_ALWAYS_INLINE uint64x2_t pmull_high(uint64x2_t a, uint64x2_t b) {
+	uint64x2_t result;
+	__asm__ ("pmull2 %0.1q,%1.2d,%2.2d"
+		: "=w"(result)
+		: "w"(a), "w"(b)
+		: /* No clobbers */);
+	return result;
+}
+# elif defined(_MSC_VER) && !defined(__clang__)
+#  define pmull_low vmull_p64
+#  define pmull_high vmull_high_p64
+# else
+#  define pmull_low(x, y) vreinterpretq_u64_p128(vmull_p64(vreinterpret_p64_u64(x), vreinterpret_p64_u64(y)))
+#  define pmull_high(x, y) vreinterpretq_u64_p128(vmull_high_p64(vreinterpretq_p64_u64(x), vreinterpretq_p64_u64(y)))
+# endif
+#else
+# if defined(_MSC_VER) && !defined(__clang__)
+#  define pmull_low vmull_p64
+#  define pmull_high(x, y) vmull_p64(vget_high_u64(x), vget_high_u64(y))
+# else
+#  define pmull_low(x, y) vreinterpretq_u64_p128(vmull_p64(x, y))
+#  define pmull_high(x, y) vreinterpretq_u64_p128(vmull_p64(vget_high_p64(vreinterpretq_p64_u64(x)), vget_high_p64(vreinterpretq_p64_u64(y))))
+# endif
+#endif
+
+
+static uint32_t crc32_multiply_pmull(uint32_t a, uint32_t b) {
+	uint64x1_t prod = vget_low_u64(pmull_low(
+		vreinterpret_u64_u32(vset_lane_u32(a, vdup_n_u32(0), 0)),
+		vreinterpret_u64_u32(vset_lane_u32(b, vdup_n_u32(0), 0))
+	));
+#ifdef __aarch64__
+	uint64_t p = vget_lane_u64(prod, 0);
+	return __crc32w(0, p+p) ^ (p >> 31);
+#else
+	prod = vadd_u64(prod, prod);
+	uint32x2_t prod32 = vreinterpret_u32_u64(prod);
+	return __crc32w(0, vget_lane_u32(prod32, 0)) ^ vget_lane_u32(prod32, 1);
+#endif
+}
+
+
+
+static const uint32_t crc_power_rev[32] = { // bit-reversed crc_power
+	0x00000002, 0x00000004, 0x00000010, 0x00000100, 0x00010000, 0x04c11db7, 0x490d678d, 0xe8a45605,
+	0x75be46b7, 0xe6228b11, 0x567fddeb, 0x88fe2237, 0x0e857e71, 0x7001e426, 0x075de2b2, 0xf12a7f90,
+	0xf0b4a1c1, 0x58f46c0c, 0xc3395ade, 0x96837f8c, 0x544037f9, 0x23b7b136, 0xb2e16ba8, 0x725e7bfa,
+	0xec709b5d, 0xf77a7274, 0x2845d572, 0x034e2515, 0x79695942, 0x540cb128, 0x0b65d023, 0x3c344723
+};
+
+
+static HEDLEY_ALWAYS_INLINE uint64x1_t crc32_shift_pmull_mulred(uint64x1_t a, uint64x1_t b) {
+	uint64x2_t r = pmull_low(a, b);
+	uint64x2_t h = pmull_high(r, vdupq_n_u64(0x490d678d));
+	return veor_u64(vget_low_u64(r), vget_low_u64(h));
+}
+
+
+static uint32_t crc32_shift_pmull(uint32_t crc1, uint32_t n) {
+	crc1 = rbit32(crc1);
+
+	uint64x1_t res;
+#ifdef __aarch64__
+	uint64_t crc = (uint64_t)crc1 << (n & 31);
+	res = vset_lane_u64(crc, vdup_n_u64(0), 0);
+#else
+	res = vreinterpret_u64_u32(vset_lane_u32(crc1, vdup_n_u32(0), 0));
+	res = vshl_u64(res, vdup_n_u64(n&31));
+#endif
+	n &= ~31;
+
+	if(n) {
+#define LOAD_NEXT_POWER vreinterpret_u64_u32(vset_lane_u32(crc_power_rev[ctz32(n)], vdup_n_u32(0), 0))
+		uint64x1_t res2 = LOAD_NEXT_POWER;
+		n &= n-1;
+
+		if(n) {
+			// first multiply doesn't need reduction
+			res2 = vget_low_u64(pmull_low(res2, LOAD_NEXT_POWER));
+			n &= n-1;
+
+			while(n) {
+				res = crc32_shift_pmull_mulred(res, LOAD_NEXT_POWER);
+				n &= n-1;
+
+				if(n) {
+					res2 = crc32_shift_pmull_mulred(res2, LOAD_NEXT_POWER);
+					n &= n-1;
+				}
+			}
+		}
+#undef LOAD_NEXT_POWER
+
+		// merge two results
+		uint64x2_t prod = pmull_low(res, res2);
+		// weirdly, vrbitq_u8 is missing in ARM32 MSVC
+		prod = vreinterpretq_u64_u8(vrev64q_u8(vrbitq_u8(vreinterpretq_u8_u64(prod))));
+#ifdef __aarch64__
+		crc = __crc32d(0, vgetq_lane_u64(prod, 1));
+		uint64_t rem = vgetq_lane_u64(prod, 0);
+		crc = __crc32w(rem, crc) ^ (rem >> 32);
+#else
+		uint32x4_t prod32 = vreinterpretq_u32_u64(prod);
+		uint32_t crc = __crc32w(0, vgetq_lane_u32(prod32, 2));
+		crc = __crc32w(vgetq_lane_u32(prod32, 3), crc);
+		crc = __crc32w(vgetq_lane_u32(prod32, 0), crc) ^ vgetq_lane_u32(prod32, 1);
+#endif
+		return crc;
+	} else {
+#ifdef __aarch64__
+		crc = rbit64(crc);
+		crc = __crc32w(0, crc) ^ (crc >> 32);
+		return crc;
+#else
+		uint32x2_t r = vreinterpret_u32_u64(res);
+		return __crc32w(0, rbit32(vget_lane_u32(r, 1))) ^ rbit32(vget_lane_u32(r, 0));
+#endif
+	}
+}
+
+
+void RapidYenc::crc_pmull_set_funcs() {
+	_crc32_multiply = &crc32_multiply_pmull;
+	_crc32_shift = &crc32_shift_pmull;
+	_crc32_isa &= ISA_FEATURE_PMULL;
+}
+
+#else
+void RapidYenc::crc_pmull_set_funcs() {}
+#endif /* defined(__ARM_FEATURE_CRYPTO) && defined(__ARM_FEATURE_CRC32) */
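
Note: crc32_multiply_pmull() performs the GF(2) (carry-less) product of two 32-bit polynomials with a single PMULL, then folds the 64-bit product back to 32 bits via the CRC instruction itself. A portable reference for the same operation, handy for testing, might look like the following (not from the package; the library's own fallback is crc32_multiply_generic in crc.cc). It assumes the reflected CRC-32 convention in which 0x80000000 is the multiplicative identity, consistent with crc_power_rev[0] == 0x00000002 being the bit-reversal of x^1 == 0x40000000:

    #include <cassert>
    #include <cstdint>

    // multiply by x in the reflected CRC-32 domain (polynomial 0xEDB88320)
    static uint32_t mulx(uint32_t c) {
        return (c >> 1) ^ (0xEDB88320u & (uint32_t)-(int32_t)(c & 1));
    }

    // reference GF(2) multiply mod the CRC-32 polynomial: bit 31 of b is the
    // x^0 coefficient, bit 30 is x^1, and so on
    static uint32_t crc32_multiply_ref(uint32_t a, uint32_t b) {
        uint32_t res = 0;
        for (int i = 0; i < 32; i++) {
            if (b & 0x80000000u) res ^= a;  // add a * x^i for each set bit of b
            b <<= 1;
            a = mulx(a);
        }
        return res;
    }

    int main() {
        assert(crc32_multiply_ref(0x80000000u, 0x12345678u) == 0x12345678u); // 1 * v == v
        assert(crc32_multiply_ref(0x40000000u, 0x80000000u) == 0x40000000u); // x * 1 == x
        return 0;
    }
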
package/src/crc_common.h CHANGED
@@ -2,8 +2,6 @@
 #include <stddef.h> // for size_t
 #include "crc.h"
 
-extern const uint32_t crc_power[32];
-
 #ifdef __GNUC__
 # define ctz32 __builtin_ctz
 #elif defined(_MSC_VER)
@@ -13,3 +11,16 @@ static HEDLEY_ALWAYS_INLINE unsigned ctz32(uint32_t n) {
 	return r;
 }
 #endif
+
+namespace RapidYenc {
+	void crc_clmul_set_funcs();
+	void crc_clmul256_set_funcs();
+	void crc_arm_set_funcs();
+	void crc_pmull_set_funcs();
+	void crc_riscv_set_funcs();
+
+	extern const uint32_t crc_power[32];
+	uint32_t crc32_multiply_generic(uint32_t a, uint32_t b);
+	uint32_t crc32_shift_generic(uint32_t crc1, uint32_t n);
+
+}
package/src/crc_folding.cc CHANGED
@@ -365,7 +365,7 @@ static HEDLEY_ALWAYS_INLINE __m128i crc32_reduce(__m128i prod) {
 	return t;
 }
 
-uint32_t crc32_multiply_clmul(uint32_t a, uint32_t b) {
+static uint32_t crc32_multiply_clmul(uint32_t a, uint32_t b) {
 	// do the actual multiply
 	__m128i prod = _mm_clmulepi64_si128(_mm_cvtsi32_si128(a), _mm_cvtsi32_si128(b), 0);
 
@@ -418,7 +418,7 @@ static HEDLEY_ALWAYS_INLINE __m128i reverse_bits_epi8(__m128i src) {
 
 
 
-const uint32_t crc_power_rev[32] = { // bit-reversed crc_power
+static const uint32_t crc_power_rev[32] = { // bit-reversed crc_power
 	0x00000002, 0x00000004, 0x00000010, 0x00000100, 0x00010000, 0x04c11db7, 0x490d678d, 0xe8a45605,
 	0x75be46b7, 0xe6228b11, 0x567fddeb, 0x88fe2237, 0x0e857e71, 0x7001e426, 0x075de2b2, 0xf12a7f90,
 	0xf0b4a1c1, 0x58f46c0c, 0xc3395ade, 0x96837f8c, 0x544037f9, 0x23b7b136, 0xb2e16ba8, 0x725e7bfa,
@@ -436,7 +436,7 @@ static HEDLEY_ALWAYS_INLINE __m128i crc32_shift_clmul_mulred(unsigned pos, __m12
 	return _mm_xor_si128(hi, prod);
 }
 
-uint32_t crc32_shift_clmul(uint32_t crc1, uint32_t n) {
+static uint32_t crc32_shift_clmul(uint32_t crc1, uint32_t n) {
 	if(!n) return crc1;
 
 	__m128i result = _mm_cvtsi32_si128(BSWAP32(crc1));
@@ -499,7 +499,7 @@ uint32_t crc32_shift_clmul(uint32_t crc1, uint32_t n) {
 #endif
 
 
-void crc_clmul_set_funcs() {
+void RapidYenc::crc_clmul_set_funcs() {
 	_do_crc32_incremental = &do_crc32_incremental_clmul;
 	_crc32_multiply = &crc32_multiply_clmul;
 #if defined(__GNUC__) || defined(_MSC_VER)
@@ -508,6 +508,6 @@ void crc_clmul_set_funcs() {
 	_crc32_isa = ISA_LEVEL_PCLMUL;
 }
 #else
-void crc_clmul_set_funcs() {}
+void RapidYenc::crc_clmul_set_funcs() {}
 #endif
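
Note: this file is the x86 counterpart of crc_arm_pmull.cc: PCLMULQDQ supplies the same carry-less multiply that PMULL does on ARM. A minimal guarded sketch of just that primitive (the fold from 64 back to 32 bits, done by crc32_reduce() in this file, is omitted; GCC/Clang on x86-64 assumed):

    #if defined(__x86_64__) && defined(__PCLMUL__)
    #include <cassert>
    #include <cstdint>
    #include <emmintrin.h>   // _mm_cvtsi32_si128 / _mm_cvtsi128_si64
    #include <wmmintrin.h>   // _mm_clmulepi64_si128

    // 32x32 -> 64-bit carry-less multiply, the first step of crc32_multiply_clmul()
    static inline uint64_t clmul32(uint32_t a, uint32_t b) {
        __m128i prod = _mm_clmulepi64_si128(_mm_cvtsi32_si128((int)a),
                                            _mm_cvtsi32_si128((int)b), 0);
        return (uint64_t)_mm_cvtsi128_si64(prod);
    }

    int main() {
        // (x+1) * (x^2+1) = x^3+x^2+x+1, i.e. 0b11 * 0b101 = 0b1111
        assert(clmul32(3, 5) == 15);
        return 0;
    }
    #endif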
 
package/src/crc_folding_256.cc CHANGED
@@ -1,8 +1,6 @@
 // 256-bit version of crc_folding
 
 #include "crc_common.h"
-
-void crc_clmul_set_funcs();
 
 #if !defined(YENC_DISABLE_AVX256) && ((defined(__VPCLMULQDQ__) && defined(__AVX2__) && defined(__PCLMUL__)) || (defined(_MSC_VER) && _MSC_VER >= 1920 && defined(PLATFORM_X86) && !defined(__clang__)))
 #include <inttypes.h>
@@ -212,13 +210,13 @@ static uint32_t do_crc32_incremental_clmul(const void* data, size_t length, uint
 	return crc_fold((const unsigned char*)data, (long)length, init);
 }
 
-void crc_clmul256_set_funcs() {
+void RapidYenc::crc_clmul256_set_funcs() {
 	crc_clmul_set_funcs(); // set multiply/shift function
 	_do_crc32_incremental = &do_crc32_incremental_clmul;
 	_crc32_isa = ISA_LEVEL_VPCLMUL;
 }
 #else
-void crc_clmul256_set_funcs() {
+void RapidYenc::crc_clmul256_set_funcs() {
 	crc_clmul_set_funcs();
 }
 #endif