yencode 1.2.0 → 1.2.1
This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between versions as they appear in the public registry.
- package/binding.gyp +37 -1
- package/package.json +1 -1
- package/src/common.h +18 -6
- package/src/crc.cc +42 -33
- package/src/crc.h +16 -14
- package/src/crc_arm.cc +7 -8
- package/src/crc_arm_pmull.cc +215 -0
- package/src/crc_common.h +13 -2
- package/src/crc_folding.cc +5 -5
- package/src/crc_folding_256.cc +2 -4
- package/src/crc_riscv.cc +7 -7
- package/src/decoder.cc +342 -12
- package/src/decoder.h +10 -14
- package/src/decoder_avx.cc +3 -4
- package/src/decoder_avx2.cc +7 -8
- package/src/decoder_avx2_base.h +6 -2
- package/src/decoder_common.h +34 -338
- package/src/decoder_neon.cc +10 -6
- package/src/decoder_neon64.cc +9 -5
- package/src/decoder_rvv.cc +47 -41
- package/src/decoder_sse2.cc +4 -4
- package/src/decoder_sse_base.h +20 -12
- package/src/decoder_ssse3.cc +3 -4
- package/src/decoder_vbmi2.cc +6 -8
- package/src/encoder.cc +19 -28
- package/src/encoder.h +5 -7
- package/src/encoder_avx.cc +3 -3
- package/src/encoder_avx2.cc +3 -3
- package/src/encoder_avx_base.h +3 -0
- package/src/encoder_common.h +26 -14
- package/src/encoder_neon.cc +6 -3
- package/src/encoder_rvv.cc +9 -7
- package/src/encoder_sse2.cc +3 -2
- package/src/encoder_sse_base.h +2 -0
- package/src/encoder_ssse3.cc +3 -3
- package/src/encoder_vbmi2.cc +6 -7
- package/src/platform.cc +24 -23
- package/src/yencode.cc +9 -8
- package/test/_speedbase.js +4 -2
- package/test/speeddec.js +25 -16
- package/test/speedenc.js +21 -17
package/binding.gyp
CHANGED
@@ -78,7 +78,7 @@
   "targets": [
     {
       "target_name": "yencode",
-      "dependencies": ["yencode_sse2", "yencode_ssse3", "yencode_clmul", "yencode_clmul256", "yencode_avx", "yencode_avx2", "yencode_vbmi2", "yencode_neon", "yencode_armcrc", "yencode_rvv", "yencode_zbkc"],
+      "dependencies": ["yencode_sse2", "yencode_ssse3", "yencode_clmul", "yencode_clmul256", "yencode_avx", "yencode_avx2", "yencode_vbmi2", "yencode_neon", "yencode_armcrc", "yencode_pmull", "yencode_rvv", "yencode_zbkc"],
       "sources": [
         "src/yencode.cc",
         "src/platform.cc",
@@ -416,6 +416,42 @@
       }]
     ]
   },
+  {
+    "target_name": "yencode_pmull",
+    "type": "static_library",
+    "sources": [
+      "src/crc_arm_pmull.cc"
+    ],
+    "cflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
+    "cxxflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
+    "xcode_settings": {
+      "OTHER_CFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
+      "OTHER_CXXFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"]
+    },
+    "msvs_settings": {"VCCLCompilerTool": {"BufferSecurityCheck": "false"}},
+    "conditions": [
+      ['target_arch in "arm arm64"', {
+        "cflags!": ["-march=native"],
+        "cxxflags!": ["-march=native"],
+        "cflags": ["-march=armv8-a+crc+crypto"],
+        "cxxflags": ["-march=armv8-a+crc+crypto"],
+        "xcode_settings": {
+          "OTHER_CFLAGS!": ["-march=native"],
+          "OTHER_CXXFLAGS!": ["-march=native"],
+          "OTHER_CFLAGS": ["-march=armv8-a+crc+crypto"],
+          "OTHER_CXXFLAGS": ["-march=armv8-a+crc+crypto"],
+        }
+      }],
+      ['OS!="win" and target_arch=="arm"', {
+        "cflags": ["-mfpu=neon","-fno-lto"],
+        "cxxflags": ["-mfpu=neon","-fno-lto"],
+        "xcode_settings": {
+          "OTHER_CFLAGS": ["-mfpu=neon","-fno-lto"],
+          "OTHER_CXXFLAGS": ["-mfpu=neon","-fno-lto"]
+        }
+      }]
+    ]
+  },
   {
     "target_name": "yencode_zbkc",
     "type": "static_library",
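
The new yencode_pmull target mirrors the other per-ISA static libraries: the PMULL code is compiled in isolation with -march=armv8-a+crc+crypto, so the rest of the addon needs no crypto flags and the accelerated path is only selected after a runtime CPU check. A minimal sketch of this compile-separately/dispatch-at-runtime pattern (all names here are illustrative, not the package's actual symbols):

#include <cstdint>
#include <cstddef>

typedef uint32_t (*crc_func)(const void*, size_t, uint32_t);

uint32_t crc32_generic(const void*, size_t, uint32_t); // plain build, always safe
uint32_t crc32_pmull(const void*, size_t, uint32_t);   // defined in the +crypto library
bool cpu_has_pmull();                                  // runtime probe (hwcaps/sysctl)

static crc_func active_crc32 = &crc32_generic;

void init_crc_dispatch() {
	// Linking the PMULL object unconditionally is fine; it must only be
	// *called* once the CPU is known to support the instructions.
	if(cpu_has_pmull())
		active_crc32 = &crc32_pmull;
}
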
package/package.json
CHANGED
package/src/common.h
CHANGED
@@ -125,7 +125,7 @@
 #ifdef __POPCNT__
 #include <nmmintrin.h>
 // POPCNT can never return a negative result, but GCC doesn't seem to realise this, so typecast it to hint it better
-#define popcnt32 (unsigned int)_mm_popcnt_u32
+#define popcnt32 (unsigned int)_mm_popcnt_u32
 #endif
 
 #if defined(__AVX2__) || defined(__AVX512F__)
@@ -209,7 +209,9 @@ static HEDLEY_ALWAYS_INLINE uint8x16x4_t vcreate4_u8(uint8x16_t a, uint8x16_t b,
 # undef _CREATE_TUPLE
 #endif
 #ifdef PLATFORM_ARM
-bool cpu_supports_neon();
+namespace RapidYenc {
+	bool cpu_supports_neon();
+}
 #endif
 
 #ifdef _MSC_VER
@@ -240,6 +242,7 @@ enum YEncDecIsaLevel {
 enum YEncDecIsaLevel {
 	ISA_GENERIC = 0,
 	ISA_FEATURE_CRC = 8,
+	ISA_FEATURE_PMULL = 0x40,
 	ISA_LEVEL_NEON = 0x1000
 };
 #elif defined(__riscv)
@@ -274,7 +277,7 @@ enum YEncDecIsaLevel {
 # if defined(__POPCNT__)
 #  if defined(__LZCNT__)
 #   define ISA_NATIVE (enum YEncDecIsaLevel)(_ISA_NATIVE | ISA_FEATURE_POPCNT | ISA_FEATURE_LZCNT)
-#  else
+#  else
 #   define ISA_NATIVE (enum YEncDecIsaLevel)(_ISA_NATIVE | ISA_FEATURE_POPCNT)
 #  endif
 # else
@@ -282,12 +285,17 @@ enum YEncDecIsaLevel {
 # endif
 #endif
 
-int cpu_supports_isa();
+namespace RapidYenc {
+	int cpu_supports_isa();
+	int cpu_supports_crc_isa();
+}
 #endif // PLATFORM_X86
 
 
 #ifdef __riscv
-bool cpu_supports_rvv();
+namespace RapidYenc {
+	bool cpu_supports_rvv();
+}
 #endif
 #if defined(__riscv_vector) && defined(HEDLEY_GCC_VERSION) && !HEDLEY_GCC_VERSION_CHECK(13,0,0)
 // GCC added RVV intrinsics in GCC13
@@ -318,7 +326,11 @@ bool cpu_supports_rvv();
 # include <stddef.h>
 #else
 /* Workaround for older MSVC not supporting stdint.h - just pull it from V8 */
-# include "stdint.h"
+# if defined(NODE_GYP_MODULE_NAME) || defined(V8_DEPRECATION_WARNINGS)
+#  include <v8.h>
+# else
+#  include "stdint.h"
+# endif
 #endif
 
 
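
ISA_FEATURE_PMULL = 0x40 gives PMULL its own bit in the ARM variant of YEncDecIsaLevel, alongside ISA_FEATURE_CRC (8) and ISA_LEVEL_NEON (0x1000); distinct bits let one integer describe several independent capabilities. Illustrative use of the mask (the enum comes from the diff above, the variable is hypothetical):

int isa = ISA_FEATURE_CRC | ISA_FEATURE_PMULL; // what runtime detection reported
if(isa & ISA_FEATURE_PMULL) {
	// CRC folding via carryless polynomial multiply is available
}
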
package/src/crc.cc
CHANGED
@@ -133,6 +133,7 @@ static void generate_crc32_slice_table() {
 #endif
 
 
+namespace RapidYenc {
 
 // workaround MSVC complaining "unary minus operator applied to unsigned type, result still unsigned"
 #define NEGATE(n) (uint32_t)(-((int32_t)(n)))
@@ -180,9 +181,10 @@ uint32_t crc32_shift_generic(uint32_t crc1, uint32_t n) {
 #endif
 	return result;
 }
+} // namespace
 
 
-extern "C" {
+namespace RapidYenc {
 crc_func _do_crc32_incremental = &do_crc32_incremental_generic;
 crc_mul_func _crc32_shift = &crc32_shift_generic;
 crc_mul_func _crc32_multiply = &crc32_multiply_generic;
@@ -191,15 +193,6 @@ extern "C" {
 
 
 
-void crc_clmul_set_funcs();
-void crc_clmul256_set_funcs();
-void crc_arm_set_funcs();
-void crc_riscv_set_funcs();
-
-#ifdef PLATFORM_X86
-int cpu_supports_crc_isa();
-#endif
-
 #if defined(PLATFORM_ARM) && defined(_WIN32)
 # define WIN32_LEAN_AND_MEAN
 # include <Windows.h>
@@ -234,7 +227,7 @@ static unsigned long getauxval(unsigned long cap) {
 # endif
 #endif
 
-void crc_init() {
+void RapidYenc::crc32_init() {
 	GENERIC_CRC_INIT;
 
 #ifdef PLATFORM_X86
@@ -246,31 +239,47 @@ void crc_init() {
 #endif
 #ifdef PLATFORM_ARM
 # ifdef __APPLE__
-	int supported = 0;
-	size_t len = sizeof(supported);
-	if(sysctlbyname("hw.optional.armv8_crc32", &supported, &len, NULL, 0))
-		supported = 0;
-# endif
-	if(
-# if defined(AT_HWCAP2) && defined(HWCAP2_CRC32)
-		getauxval(AT_HWCAP2) & HWCAP2_CRC32
-# elif defined(AT_HWCAP) && defined(HWCAP_CRC32)
-		getauxval(AT_HWCAP) & HWCAP_CRC32
-# elif defined(ANDROID_CPU_FAMILY_ARM) && defined(__aarch64__)
-		android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_CRC32
-# elif defined(ANDROID_CPU_FAMILY_ARM) /* aarch32 */
-		android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_CRC32
-# elif defined(_WIN32)
-		IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE)
-# elif defined(__APPLE__)
-		supported
-# elif defined(__ARM_FEATURE_CRC32)
-		true /* assume available if compiled as such */
+	int supports_crc = 0;
+	int supports_pmull = 0;
+	size_t len = sizeof(supports_crc);
+	if(sysctlbyname("hw.optional.armv8_crc32", &supports_crc, &len, NULL, 0))
+		supports_crc = 0;
+	if(sysctlbyname("hw.optional.arm.FEAT_PMULL", &supports_pmull, &len, NULL, 0))
+		supports_pmull = 0;
 # else
-		false
+	bool supports_crc = false;
+	bool supports_pmull = false;
+# if defined(AT_HWCAP2) && defined(HWCAP2_CRC32)
+	supports_crc = getauxval(AT_HWCAP2) & HWCAP2_CRC32;
+# elif defined(AT_HWCAP) && defined(HWCAP_CRC32)
+	supports_crc = getauxval(AT_HWCAP) & HWCAP_CRC32;
+# elif defined(ANDROID_CPU_FAMILY_ARM) && defined(__aarch64__)
+	supports_crc = android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_CRC32;
+	supports_pmull = android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_PMULL;
+# elif defined(ANDROID_CPU_FAMILY_ARM) /* aarch32 */
+	supports_crc = android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_CRC32;
+	supports_pmull = android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_PMULL;
+# elif defined(_WIN32)
+	supports_crc = IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE);
+	supports_pmull = IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
+# else
+#ifdef __ARM_FEATURE_CRC32
+	supports_crc = true; /* assume available if compiled as such */
+#endif
+#ifdef __ARM_FEATURE_CRYPTO
+	supports_pmull = true;
+#endif
+# endif
+# if defined(AT_HWCAP2) && defined(HWCAP2_PMULL)
+	supports_pmull = getauxval(AT_HWCAP2) & HWCAP2_PMULL;
+# elif defined(AT_HWCAP) && defined(HWCAP_PMULL)
+	supports_pmull = getauxval(AT_HWCAP) & HWCAP_PMULL;
+# endif
 # endif
-	) {
+
+	if(supports_crc) {
 		crc_arm_set_funcs();
+		if(supports_pmull) crc_pmull_set_funcs();
 	}
 #endif
 #ifdef __riscv
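
crc32_init() now probes CRC and PMULL support separately: sysctlbyname on Apple platforms, getauxval hwcaps on Linux, android_getCpuFeatures on Android, IsProcessorFeaturePresent on Windows, with compile-time macros as the last resort. A standalone sketch of just the Linux/AArch64 path, assuming <sys/auxv.h> and the kernel's <asm/hwcap.h> are available:

#include <sys/auxv.h>
#include <asm/hwcap.h> // HWCAP_CRC32 / HWCAP_PMULL on aarch64 Linux

static void probe_arm_features(bool* crc, bool* pmull) {
	unsigned long hw = getauxval(AT_HWCAP);
	*crc = (hw & HWCAP_CRC32) != 0;   // CRC32B/H/W/X instructions
	*pmull = (hw & HWCAP_PMULL) != 0; // 64x64 -> 128-bit carryless multiply
}
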
package/src/crc.h
CHANGED
@@ -2,25 +2,25 @@
 #define __YENC_CRC_H
 #include <stdlib.h> // for llabs
 
-#ifdef __cplusplus
-extern "C" {
+#if !defined(__GNUC__) && defined(_MSC_VER)
+# include <intrin.h>
 #endif
 
+namespace RapidYenc {
 
 
 typedef uint32_t (*crc_func)(const void*, size_t, uint32_t);
 extern crc_func _do_crc32_incremental;
 
 extern int _crc32_isa;
-
+static inline uint32_t crc32(const void* data, size_t length, uint32_t init) {
+	return (*_do_crc32_incremental)(data, length, init);
+}
 static inline int crc32_isa_level() {
 	return _crc32_isa;
 }
 
 
-#if !defined(__GNUC__) && defined(_MSC_VER)
-# include <intrin.h>
-#endif
 // computes `n % 0xffffffff` (well, almost), using some bit-hacks
 static inline uint32_t crc32_powmod(uint64_t n) {
 #ifdef __GNUC__
@@ -28,7 +28,7 @@ static inline uint32_t crc32_powmod(uint64_t n) {
 	unsigned carry = __builtin_uadd_overflow(n >> 32, n, &res);
 	res += carry;
 	return res;
-#elif defined(_MSC_VER)
+#elif defined(_MSC_VER) && defined(PLATFORM_X86)
 	unsigned res;
 	unsigned char carry = _addcarry_u32(0, n >> 32, n, &res);
 	_addcarry_u32(carry, res, 0, &res);
@@ -59,8 +59,12 @@ static inline uint32_t crc32_bytepow(uint64_t n) {
 typedef uint32_t (*crc_mul_func)(uint32_t, uint32_t);
 extern crc_mul_func _crc32_shift;
 extern crc_mul_func _crc32_multiply;
-
-
+static inline uint32_t crc32_shift(uint32_t a, uint32_t b) {
+	return (*_crc32_shift)(a, b);
+}
+static inline uint32_t crc32_multiply(uint32_t a, uint32_t b) {
+	return (*_crc32_multiply)(a, b);
+}
 
 static inline uint32_t crc32_combine(uint32_t crc1, uint32_t crc2, uint64_t len2) {
 	return crc32_shift(crc1, crc32_bytepow(len2)) ^ crc2;
@@ -79,11 +83,9 @@ static inline uint32_t crc32_256pow(uint64_t n) {
 	return crc32_shift(0x80000000, crc32_bytepow(n));
 }
 
-void crc_init();
+void crc32_init();
 
 
 
-#ifdef __cplusplus
-}
-#endif
-#endif
+} // namespace
+#endif // defined(__YENC_CRC_H)
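
The new inline wrappers make the combine helpers above callable without touching the function pointers directly; crc32_combine(crc1, crc2, len2) stitches together CRCs of independently checksummed chunks without re-reading the data. A hypothetical usage sketch:

// Assumes the declarations from crc.h above; buf points to len bytes.
uint32_t crc_of_concat(const unsigned char* buf, size_t len) {
	size_t half = len / 2;
	uint32_t a = RapidYenc::crc32(buf, half, 0);
	uint32_t b = RapidYenc::crc32(buf + half, len - half, 0);
	// yields the same value as RapidYenc::crc32(buf, len, 0)
	return RapidYenc::crc32_combine(a, b, len - half);
}
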
package/src/crc_arm.cc
CHANGED
@@ -61,7 +61,7 @@ HEDLEY_WARNING("CRC32 acceleration has been disabled due to missing arm_acle.h")
 
 
 #ifdef __aarch64__
-uint32_t crc32_multiply_arm(uint32_t a, uint32_t b) {
+static uint32_t crc32_multiply_arm(uint32_t a, uint32_t b) {
 	// perform PMULL
 	uint64_t res = 0;
 	uint64_t a64 = (uint64_t)a << 32;
@@ -86,8 +86,7 @@ uint32_t crc32_multiply_arm(uint32_t a, uint32_t b) {
 
 #ifdef ENABLE_PIPELINE_OPT
 #ifndef __aarch64__
-uint32_t crc32_multiply_generic(uint32_t a, uint32_t b);
-# define crc32_multiply_arm crc32_multiply_generic
+# define crc32_multiply_arm RapidYenc::crc32_multiply_generic
 #endif
 #endif
 
@@ -124,7 +123,7 @@ static uint32_t arm_crc_calc(uint32_t crc, const unsigned char *src, long len) {
 	// (this is a slightly less efficient, but much simpler implementation of the idea)
 	const unsigned SPLIT_WORDS_LOG = 10; // make sure it's at least 2
 	const unsigned SPLIT_WORDS = 1<<SPLIT_WORDS_LOG;
-	const unsigned blockCoeff = crc_power[SPLIT_WORDS_LOG + WORDSIZE_LOG + 3];
+	const unsigned blockCoeff = RapidYenc::crc_power[SPLIT_WORDS_LOG + WORDSIZE_LOG + 3];
 	while(len >= (long)(sizeof(WORD_T)*SPLIT_WORDS*2)) {
 		// compute 2x CRCs concurrently to leverage piplining
 		uint32_t crc2 = 0;
@@ -196,7 +195,7 @@ static uint32_t do_crc32_incremental_arm(const void* data, size_t length, uint32
 
 
 #if defined(__aarch64__) && (defined(__GNUC__) || defined(_MSC_VER))
-uint32_t crc32_shift_arm(uint32_t crc1, uint32_t n) {
+static uint32_t crc32_shift_arm(uint32_t crc1, uint32_t n) {
 	uint32_t result = crc1;
 	uint64_t prod = result;
 	prod <<= 32 - (n&31);
@@ -204,7 +203,7 @@ uint32_t crc32_shift_arm(uint32_t crc1, uint32_t n) {
 	n &= ~31;
 
 	while(n) {
-		result = crc32_multiply_arm(result, crc_power[ctz32(n)]);
+		result = crc32_multiply_arm(result, RapidYenc::crc_power[ctz32(n)]);
 		n &= n-1;
 	}
 	return result;
@@ -212,7 +211,7 @@ uint32_t crc32_shift_arm(uint32_t crc1, uint32_t n) {
 #endif
 
 
-void crc_arm_set_funcs() {
+void RapidYenc::crc_arm_set_funcs() {
 	_do_crc32_incremental = &do_crc32_incremental_arm;
 #ifdef __aarch64__
 	_crc32_multiply = &crc32_multiply_arm;
@@ -223,5 +222,5 @@ void crc_arm_set_funcs() {
 	_crc32_isa = ISA_FEATURE_CRC;
 }
 #else
-void crc_arm_set_funcs() {}
+void RapidYenc::crc_arm_set_funcs() {}
 #endif
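
The blockCoeff lines above belong to the pipelining trick in arm_crc_calc: the block is split in two, both halves are CRC'd in a single interleaved loop (hardware CRC instructions have multi-cycle latency, so two independent dependency chains keep the unit busy), and the first half's CRC is then advanced past the second half with one multiply by a precomputed power of x. A simplified sketch, assuming __crc32d from <arm_acle.h> and the crc32_multiply helper from crc.h:

#include <string.h>

static uint32_t crc_two_streams(const unsigned char* p, size_t half,
                                uint32_t crc, uint32_t blockCoeff) {
	uint32_t crc2 = 0;
	for(size_t i = 0; i + 8 <= half; i += 8) {
		uint64_t w1, w2;
		memcpy(&w1, p + i, 8);        // memcpy avoids unaligned-load UB
		memcpy(&w2, p + half + i, 8);
		crc  = __crc32d(crc,  w1);    // chain 1
		crc2 = __crc32d(crc2, w2);    // chain 2, independent of chain 1
	}
	// blockCoeff represents x^(8*half) mod P: shift crc past the second block
	return RapidYenc::crc32_multiply(crc, blockCoeff) ^ crc2;
}
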
package/src/crc_arm_pmull.cc
ADDED
@@ -0,0 +1,215 @@
+#include "crc_common.h"
+
+// exclude broken/missing arm_acle.h
+#if defined(__ARM_FEATURE_CRYPTO) && defined(HEDLEY_GCC_VERSION)
+# if !defined(__aarch64__) && HEDLEY_GCC_VERSION_CHECK(7,0,0) && !HEDLEY_GCC_VERSION_CHECK(8,1,1)
+#  undef __ARM_FEATURE_CRYPTO
+# endif
+# if defined(__aarch64__) && HEDLEY_GCC_VERSION_CHECK(9,4,0) && !HEDLEY_GCC_VERSION_CHECK(9,5,0)
+#  undef __ARM_FEATURE_CRYPTO
+# endif
+#endif
+#if defined(__ARM_FEATURE_CRYPTO) && defined(__has_include)
+# if !__has_include(<arm_acle.h>)
+#  undef __ARM_FEATURE_CRYPTO
+# endif
+#endif
+
+// ARM's intrinsics guide seems to suggest that vmull_p64 is available on A32, but neither Clang/GCC seem to support it on AArch32
+#if (defined(__ARM_FEATURE_CRYPTO) && defined(__ARM_FEATURE_CRC32) && defined(__aarch64__)) || (defined(_M_ARM64) && !defined(__clang__))
+
+#include <arm_neon.h>
+#if defined(_MSC_VER) && !defined(__clang__)
+# include <intrin.h>
+
+# ifdef _M_ARM64
+// MSVC may detect this pattern: https://devblogs.microsoft.com/cppblog/a-tour-of-4-msvc-backend-improvements/#byteswap-identification
+static HEDLEY_ALWAYS_INLINE uint64_t rbit64(uint64_t x) {
+	x = _byteswap_uint64(x);
+	x = (x & 0xaaaaaaaaaaaaaaaa) >> 1 | (x & 0x5555555555555555) << 1;
+	x = (x & 0xcccccccccccccccc) >> 2 | (x & 0x3333333333333333) << 2;
+	x = (x & 0xf0f0f0f0f0f0f0f0) >> 4 | (x & 0x0f0f0f0f0f0f0f0f) << 4;
+	return x;
+}
+// ...whilst this seems to work best for 32-bit RBIT
+static HEDLEY_ALWAYS_INLINE uint32_t rbit32(uint32_t x) {
+	uint64_t r = rbit64(x);
+	return r >> 32;
+}
+# else
+#  define rbit32 _arm_rbit
+# endif
+#else
+# include <arm_acle.h>
+// __rbit not present before GCC 11.4.0 or 12.2.0; for ARM32, requires GCC 14
+# if defined(HEDLEY_GCC_VERSION) && !HEDLEY_GCC_VERSION_CHECK(14,0,0) && (!defined(__aarch64__) || !HEDLEY_GCC_VERSION_CHECK(11,3,0) || (HEDLEY_GCC_VERSION_CHECK(12,0,0) && !HEDLEY_GCC_VERSION_CHECK(12,2,0)))
+#  ifdef __aarch64__
+static HEDLEY_ALWAYS_INLINE uint64_t rbit64(uint64_t x) {
+	uint64_t r;
+	__asm__ ("rbit %0,%1\n"
+		: "=r"(r) : "r"(x)
+		: /* No clobbers */);
+	return r;
+}
+#  endif
+static HEDLEY_ALWAYS_INLINE uint32_t rbit32(uint32_t x) {
+	uint32_t r;
+	__asm__ (
+#  ifdef __aarch64__
+		"rbit %w0,%w1\n"
+#  else
+		"rbit %0,%1\n"
+#  endif
+		: "=r"(r) : "r"(x)
+		: /* No clobbers */);
+	return r;
+}
+# else
+#  define rbit32 __rbit
+#  define rbit64 __rbitll
+# endif
+#endif
+
+
+// MSVC doesn't have poly64/poly128 types, so always use uint64 instead
+
+#ifdef __aarch64__
+# if defined(__GNUC__) || defined(__clang__)
+static HEDLEY_ALWAYS_INLINE uint64x2_t pmull_low(uint64x1_t a, uint64x1_t b) {
+	uint64x2_t result;
+	__asm__ ("pmull %0.1q,%1.1d,%2.1d"
+		: "=w"(result)
+		: "w"(a), "w"(b)
+		: /* No clobbers */);
+	return result;
+}
+static HEDLEY_ALWAYS_INLINE uint64x2_t pmull_high(uint64x2_t a, uint64x2_t b) {
+	uint64x2_t result;
+	__asm__ ("pmull2 %0.1q,%1.2d,%2.2d"
+		: "=w"(result)
+		: "w"(a), "w"(b)
+		: /* No clobbers */);
+	return result;
+}
+# elif defined(_MSC_VER) && !defined(__clang__)
+#  define pmull_low vmull_p64
+#  define pmull_high vmull_high_p64
+# else
+#  define pmull_low(x, y) vreinterpretq_u64_p128(vmull_p64(vreinterpret_p64_u64(x), vreinterpret_p64_u64(y)))
+#  define pmull_high(x, y) vreinterpretq_u64_p128(vmull_high_p64(vreinterpretq_p64_u64(x), vreinterpretq_p64_u64(y)))
+# endif
+#else
+# if defined(_MSC_VER) && !defined(__clang__)
+#  define pmull_low vmull_p64
+#  define pmull_high(x, y) vmull_p64(vget_high_u64(x), vget_high_u64(y))
+# else
+#  define pmull_low(x, y) vreinterpretq_u64_p128(vmull_p64(x, y))
+#  define pmull_high(x, y) vreinterpretq_u64_p128(vmull_p64(vget_high_p64(vreinterpretq_p64_u64(x)), vget_high_p64(vreinterpretq_p64_u64(y))))
+# endif
+#endif
+
+
+static uint32_t crc32_multiply_pmull(uint32_t a, uint32_t b) {
+	uint64x1_t prod = vget_low_u64(pmull_low(
+		vreinterpret_u64_u32(vset_lane_u32(a, vdup_n_u32(0), 0)),
+		vreinterpret_u64_u32(vset_lane_u32(b, vdup_n_u32(0), 0))
+	));
+#ifdef __aarch64__
+	uint64_t p = vget_lane_u64(prod, 0);
+	return __crc32w(0, p+p) ^ (p >> 31);
+#else
+	prod = vadd_u64(prod, prod);
+	uint32x2_t prod32 = vreinterpret_u32_u64(prod);
+	return __crc32w(0, vget_lane_u32(prod32, 0)) ^ vget_lane_u32(prod32, 1);
+#endif
+}
+
+
+
+static const uint32_t crc_power_rev[32] = { // bit-reversed crc_power
+	0x00000002, 0x00000004, 0x00000010, 0x00000100, 0x00010000, 0x04c11db7, 0x490d678d, 0xe8a45605,
+	0x75be46b7, 0xe6228b11, 0x567fddeb, 0x88fe2237, 0x0e857e71, 0x7001e426, 0x075de2b2, 0xf12a7f90,
+	0xf0b4a1c1, 0x58f46c0c, 0xc3395ade, 0x96837f8c, 0x544037f9, 0x23b7b136, 0xb2e16ba8, 0x725e7bfa,
+	0xec709b5d, 0xf77a7274, 0x2845d572, 0x034e2515, 0x79695942, 0x540cb128, 0x0b65d023, 0x3c344723
+};
+
+
+static HEDLEY_ALWAYS_INLINE uint64x1_t crc32_shift_pmull_mulred(uint64x1_t a, uint64x1_t b) {
+	uint64x2_t r = pmull_low(a, b);
+	uint64x2_t h = pmull_high(r, vdupq_n_u64(0x490d678d));
+	return veor_u64(vget_low_u64(r), vget_low_u64(h));
+}
+
+
+static uint32_t crc32_shift_pmull(uint32_t crc1, uint32_t n) {
+	crc1 = rbit32(crc1);
+
+	uint64x1_t res;
+#ifdef __aarch64__
+	uint64_t crc = (uint64_t)crc1 << (n & 31);
+	res = vset_lane_u64(crc, vdup_n_u64(0), 0);
+#else
+	res = vreinterpret_u64_u32(vset_lane_u32(crc1, vdup_n_u32(0), 0));
+	res = vshl_u64(res, vdup_n_u64(n&31));
+#endif
+	n &= ~31;
+
+	if(n) {
+#define LOAD_NEXT_POWER vreinterpret_u64_u32(vset_lane_u32(crc_power_rev[ctz32(n)], vdup_n_u32(0), 0))
+		uint64x1_t res2 = LOAD_NEXT_POWER;
+		n &= n-1;
+
+		if(n) {
+			// first multiply doesn't need reduction
+			res2 = vget_low_u64(pmull_low(res2, LOAD_NEXT_POWER));
+			n &= n-1;
+
+			while(n) {
+				res = crc32_shift_pmull_mulred(res, LOAD_NEXT_POWER);
+				n &= n-1;
+
+				if(n) {
+					res2 = crc32_shift_pmull_mulred(res2, LOAD_NEXT_POWER);
+					n &= n-1;
+				}
+			}
+		}
+#undef LOAD_NEXT_POWER
+
+		// merge two results
+		uint64x2_t prod = pmull_low(res, res2);
+		// weirdly, vrbitq_u8 is missing in ARM32 MSVC
+		prod = vreinterpretq_u64_u8(vrev64q_u8(vrbitq_u8(vreinterpretq_u8_u64(prod))));
+#ifdef __aarch64__
+		crc = __crc32d(0, vgetq_lane_u64(prod, 1));
+		uint64_t rem = vgetq_lane_u64(prod, 0);
+		crc = __crc32w(rem, crc) ^ (rem >> 32);
+#else
+		uint32x4_t prod32 = vreinterpretq_u32_u64(prod);
+		uint32_t crc = __crc32w(0, vgetq_lane_u32(prod32, 2));
+		crc = __crc32w(vgetq_lane_u32(prod32, 3), crc);
+		crc = __crc32w(vgetq_lane_u32(prod32, 0), crc) ^ vgetq_lane_u32(prod32, 1);
+#endif
+		return crc;
+	} else {
+#ifdef __aarch64__
+		crc = rbit64(crc);
+		crc = __crc32w(0, crc) ^ (crc >> 32);
+		return crc;
+#else
+		uint32x2_t r = vreinterpret_u32_u64(res);
+		return __crc32w(0, rbit32(vget_lane_u32(r, 1))) ^ rbit32(vget_lane_u32(r, 0));
+#endif
+	}
+}
+
+
+void RapidYenc::crc_pmull_set_funcs() {
+	_crc32_multiply = &crc32_multiply_pmull;
+	_crc32_shift = &crc32_shift_pmull;
+	_crc32_isa &= ISA_FEATURE_PMULL;
+}
+
+#else
+void RapidYenc::crc_pmull_set_funcs() {}
+#endif /* defined(__ARM_FEATURE_CRYPTO) && defined(__ARM_FEATURE_CRC32) */
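
crc32_shift_pmull works in the bit-reversed domain (hence rbit32 and the crc_power_rev table) so PMULL's carryless product lines up with the reflected CRC-32 polynomial, and it advances two chains (res, res2) to overlap multiply latencies. The underlying identity is the same one crc32_shift_arm uses: appending n zero bits multiplies the CRC by x^n mod P, and since x^n is the product of x^(2^k) over the set bits k of n, a 32-entry table of precomputed powers covers any n. A scalar sketch of that set-bit walk, where mul and power[] stand in for whichever multiply implementation and table are active, and ctz32 is the count-trailing-zeros helper declared in crc_common.h:

static uint32_t shift_by_bits(uint32_t crc, uint32_t n,
                              uint32_t (*mul)(uint32_t, uint32_t),
                              const uint32_t power[32]) { // power[k] = x^(2^k) mod P
	while(n) {
		crc = mul(crc, power[ctz32(n)]); // consume the lowest set bit of n
		n &= n - 1;                      // clear it
	}
	return crc;
}
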
package/src/crc_common.h
CHANGED
@@ -2,8 +2,6 @@
 #include <stddef.h> // for size_t
 #include "crc.h"
 
-extern const uint32_t crc_power[32];
-
 #ifdef __GNUC__
 # define ctz32 __builtin_ctz
 #elif defined(_MSC_VER)
@@ -13,3 +11,16 @@ static HEDLEY_ALWAYS_INLINE unsigned ctz32(uint32_t n) {
 	return r;
 }
 #endif
+
+namespace RapidYenc {
+	void crc_clmul_set_funcs();
+	void crc_clmul256_set_funcs();
+	void crc_arm_set_funcs();
+	void crc_pmull_set_funcs();
+	void crc_riscv_set_funcs();
+
+	extern const uint32_t crc_power[32];
+	uint32_t crc32_multiply_generic(uint32_t a, uint32_t b);
+	uint32_t crc32_shift_generic(uint32_t crc1, uint32_t n);
+
+}
package/src/crc_folding.cc
CHANGED
@@ -365,7 +365,7 @@ static HEDLEY_ALWAYS_INLINE __m128i crc32_reduce(__m128i prod) {
 	return t;
 }
 
-uint32_t crc32_multiply_clmul(uint32_t a, uint32_t b) {
+static uint32_t crc32_multiply_clmul(uint32_t a, uint32_t b) {
 	// do the actual multiply
 	__m128i prod = _mm_clmulepi64_si128(_mm_cvtsi32_si128(a), _mm_cvtsi32_si128(b), 0);
 
@@ -418,7 +418,7 @@ static HEDLEY_ALWAYS_INLINE __m128i reverse_bits_epi8(__m128i src) {
 
 
 
-const uint32_t crc_power_rev[32] = { // bit-reversed crc_power
+static const uint32_t crc_power_rev[32] = { // bit-reversed crc_power
 	0x00000002, 0x00000004, 0x00000010, 0x00000100, 0x00010000, 0x04c11db7, 0x490d678d, 0xe8a45605,
 	0x75be46b7, 0xe6228b11, 0x567fddeb, 0x88fe2237, 0x0e857e71, 0x7001e426, 0x075de2b2, 0xf12a7f90,
 	0xf0b4a1c1, 0x58f46c0c, 0xc3395ade, 0x96837f8c, 0x544037f9, 0x23b7b136, 0xb2e16ba8, 0x725e7bfa,
@@ -436,7 +436,7 @@ static HEDLEY_ALWAYS_INLINE __m128i crc32_shift_clmul_mulred(unsigned pos, __m12
 	return _mm_xor_si128(hi, prod);
 }
 
-uint32_t crc32_shift_clmul(uint32_t crc1, uint32_t n) {
+static uint32_t crc32_shift_clmul(uint32_t crc1, uint32_t n) {
 	if(!n) return crc1;
 
 	__m128i result = _mm_cvtsi32_si128(BSWAP32(crc1));
@@ -499,7 +499,7 @@ uint32_t crc32_shift_clmul(uint32_t crc1, uint32_t n) {
 #endif
 
 
-void crc_clmul_set_funcs() {
+void RapidYenc::crc_clmul_set_funcs() {
 	_do_crc32_incremental = &do_crc32_incremental_clmul;
 	_crc32_multiply = &crc32_multiply_clmul;
 #if defined(__GNUC__) || defined(_MSC_VER)
@@ -508,6 +508,6 @@ void crc_clmul_set_funcs() {
 	_crc32_isa = ISA_LEVEL_PCLMUL;
 }
 #else
-void crc_clmul_set_funcs() {}
+void RapidYenc::crc_clmul_set_funcs() {}
 #endif
 
package/src/crc_folding_256.cc
CHANGED
@@ -1,8 +1,6 @@
 // 256-bit version of crc_folding
 
 #include "crc_common.h"
-
-void crc_clmul_set_funcs();
 
 #if !defined(YENC_DISABLE_AVX256) && ((defined(__VPCLMULQDQ__) && defined(__AVX2__) && defined(__PCLMUL__)) || (defined(_MSC_VER) && _MSC_VER >= 1920 && defined(PLATFORM_X86) && !defined(__clang__)))
 #include <inttypes.h>
@@ -212,13 +210,13 @@ static uint32_t do_crc32_incremental_clmul(const void* data, size_t length, uint
 	return crc_fold((const unsigned char*)data, (long)length, init);
 }
 
-void crc_clmul256_set_funcs() {
+void RapidYenc::crc_clmul256_set_funcs() {
 	crc_clmul_set_funcs(); // set multiply/shift function
 	_do_crc32_incremental = &do_crc32_incremental_clmul;
 	_crc32_isa = ISA_LEVEL_VPCLMUL;
 }
 #else
-void crc_clmul256_set_funcs() {
+void RapidYenc::crc_clmul256_set_funcs() {
 	crc_clmul_set_funcs();
 }
 #endif