yencode 1.1.4 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +130 -189
- package/binding.gyp +79 -6
- package/index.js +2 -0
- package/package.json +1 -1
- package/src/common.h +39 -1
- package/src/crc.cc +89 -23
- package/src/crc.h +68 -2
- package/src/crc_arm.cc +54 -37
- package/src/crc_common.h +11 -0
- package/src/crc_folding.cc +155 -18
- package/src/crc_folding_256.cc +12 -16
- package/src/crc_riscv.cc +251 -0
- package/src/decoder.cc +37 -3
- package/src/decoder.h +4 -0
- package/src/decoder_avx.cc +3 -2
- package/src/decoder_avx2.cc +2 -1
- package/src/decoder_avx2_base.h +6 -24
- package/src/decoder_common.h +61 -49
- package/src/decoder_neon.cc +10 -26
- package/src/decoder_neon64.cc +10 -22
- package/src/decoder_rvv.cc +274 -0
- package/src/decoder_sse2.cc +24 -2
- package/src/decoder_sse_base.h +11 -45
- package/src/decoder_ssse3.cc +3 -2
- package/src/decoder_vbmi2.cc +2 -5
- package/src/encoder.cc +28 -0
- package/src/encoder.h +4 -0
- package/src/encoder_avx.cc +1 -0
- package/src/encoder_avx2.cc +1 -0
- package/src/encoder_common.h +2 -20
- package/src/encoder_neon.cc +1 -0
- package/src/encoder_rvv.cc +5 -19
- package/src/encoder_sse2.cc +1 -0
- package/src/encoder_ssse3.cc +1 -0
- package/src/encoder_vbmi2.cc +2 -0
- package/src/platform.cc +4 -4
- package/src/yencode.cc +45 -3
- package/test/testcrc.js +19 -3
- package/test/testcrcfuncs.c +53 -0
- package/test/testdec.js +2 -1
- package/test/testenc.js +1 -1
package/package.json
CHANGED
package/src/common.h
CHANGED
@@ -221,6 +221,7 @@ bool cpu_supports_neon();
 
 #ifdef PLATFORM_X86
 enum YEncDecIsaLevel {
+	ISA_GENERIC = 0,
 	ISA_FEATURE_POPCNT = 0x1,
 	ISA_FEATURE_LZCNT = 0x2,
 	ISA_FEATURE_EVEX512 = 0x4, // AVX512 support
@@ -228,11 +229,31 @@ enum YEncDecIsaLevel {
 	ISA_LEVEL_SSSE3 = 0x200,
 	ISA_LEVEL_SSE41 = 0x300,
 	ISA_LEVEL_SSE4_POPCNT = 0x301,
+	ISA_LEVEL_PCLMUL = 0x340,
 	ISA_LEVEL_AVX = 0x381, // same as above, just used as a differentiator for `cpu_supports_isa`
 	ISA_LEVEL_AVX2 = 0x403, // also includes BMI1/2 and LZCNT
+	ISA_LEVEL_VPCLMUL = 0x440,
 	ISA_LEVEL_AVX3 = 0x507, // SKX variant; AVX512VL + AVX512BW
 	ISA_LEVEL_VBMI2 = 0x603 // ICL, AVX10
 };
+#elif defined(PLATFORM_ARM)
+enum YEncDecIsaLevel {
+	ISA_GENERIC = 0,
+	ISA_FEATURE_CRC = 8,
+	ISA_LEVEL_NEON = 0x1000
+};
+#elif defined(__riscv)
+enum YEncDecIsaLevel {
+	ISA_GENERIC = 0,
+	ISA_FEATURE_ZBC = 16,
+	ISA_LEVEL_RVV = 0x10000
+};
+#else
+enum YEncDecIsaLevel {
+	ISA_GENERIC = 0
+};
+#endif
+#ifdef PLATFORM_X86
 #ifdef _MSC_VER
 // native tuning not supported in MSVC
 # define ISA_NATIVE ISA_LEVEL_SSE2
@@ -271,8 +292,25 @@ bool cpu_supports_rvv();
 #if defined(__riscv_vector) && defined(HEDLEY_GCC_VERSION) && !HEDLEY_GCC_VERSION_CHECK(13,0,0)
 // GCC added RVV intrinsics in GCC13
 # undef __riscv_vector
+#elif defined(__riscv_vector) && defined(HEDLEY_GCC_VERSION) && !HEDLEY_GCC_VERSION_CHECK(14,0,0)
+// ...however, GCC13 lacks necessary mask<>vector vreinterpret casts, and it crashes on type punning, so I can't be bothered trying to make it work
+# undef __riscv_vector
+#endif
+#ifdef __riscv_vector
+# include <riscv_vector.h>
+# ifdef __riscv_v_intrinsic
+# define RV(f) __riscv_##f
+# else
+# define RV(f) f
+# endif
+# if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 12000
+# define RV_MASK_CAST(masksz, vecsz, vec) RV(vreinterpret_v_u##vecsz##m1_b##masksz)(vec)
+# define RV_VEC_U8MF4_CAST(vec) RV(vlmul_trunc_v_u8m1_u8mf4)(RV(vreinterpret_v_b4_u8m1)(vec))
+# else
+# define RV_MASK_CAST(masksz, vecsz, vec) *(vbool##masksz##_t*)(&(vec))
+# define RV_VEC_U8MF4_CAST(vec) *(vuint8mf4_t*)(&(vec))
+# endif
 #endif
-
 
 #include <string.h>
 #if !defined(_MSC_VER) || defined(_STDINT) || _MSC_VER >= 1900
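
Note: the new per-platform ISA level enums keep their numeric values ordered (ISA_LEVEL_SSSE3 = 0x200 < ISA_LEVEL_PCLMUL = 0x340 < ISA_LEVEL_AVX2 = 0x403), so a detected level can be compared against the minimum a kernel requires. A minimal sketch of that idea follows; the constants are copied from the hunk above, while cpu_detect_level() is a hypothetical stand-in for the library's actual CPU detection.

#include <stdio.h>

/* Values copied from the diff above; cpu_detect_level() is a hypothetical
   stand-in for the library's CPU detection (e.g. cpu_supports_isa). */
enum YEncDecIsaLevel {
	ISA_GENERIC = 0,
	ISA_LEVEL_SSSE3 = 0x200,
	ISA_LEVEL_PCLMUL = 0x340,
	ISA_LEVEL_AVX2 = 0x403,
	ISA_LEVEL_VPCLMUL = 0x440
};

static enum YEncDecIsaLevel cpu_detect_level(void) {
	return ISA_LEVEL_PCLMUL; /* pretend the CPU tops out at SSE4.1 + PCLMULQDQ */
}

int main(void) {
	enum YEncDecIsaLevel level = cpu_detect_level();
	printf("SSSE3 kernels usable:  %s\n", level >= ISA_LEVEL_SSSE3 ? "yes" : "no");
	printf("PCLMUL kernels usable: %s\n", level >= ISA_LEVEL_PCLMUL ? "yes" : "no");
	printf("AVX2 kernels usable:   %s\n", level >= ISA_LEVEL_AVX2 ? "yes" : "no");
	return 0;
}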
package/src/crc.cc
CHANGED
@@ -1,16 +1,23 @@
 #include "crc_common.h"
 
+#if defined(PLATFORM_X86) && !defined(__ILP32__) && !defined(YENC_DISABLE_CRCUTIL)
+// Use crcutil for computing CRC32 (generic implementation)
+
 #include "interface.h"
 crcutil_interface::CRC* crc = NULL;
+#define GENERIC_CRC_INIT crc = crcutil_interface::CRC::Create(0xEDB88320, 0, 32, true, 0, 0, 0, 0, NULL)
+// instance never deleted... oh well...
 
-#if defined(PLATFORM_X86) && !defined(__ILP32__)
 static uint32_t do_crc32_incremental_generic(const void* data, size_t length, uint32_t init) {
 	// use optimised ASM on x86 platforms
 	crcutil_interface::UINT64 tmp = init;
 	crc->Compute(data, length, &tmp);
 	return (uint32_t)tmp;
 }
+
 #else
+// don't use crcutil
+
 static uint32_t* HEDLEY_RESTRICT crc_slice_table;
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
 # if defined(__GNUC__) || defined(__clang__)
@@ -121,28 +128,73 @@ static void generate_crc32_slice_table() {
 #endif
 	}
 }
+
+#define GENERIC_CRC_INIT generate_crc32_slice_table()
 #endif
 
 
-crc_func _do_crc32_incremental = &do_crc32_incremental_generic;
 
+// workaround MSVC complaining "unary minus operator applied to unsigned type, result still unsigned"
+#define NEGATE(n) (uint32_t)(-((int32_t)(n)))
+uint32_t crc32_multiply_generic(uint32_t a, uint32_t b) {
+	uint32_t res = 0;
+	for(int i=0; i<31; i++) {
+		res ^= NEGATE(b>>31) & a;
+		a = ((a >> 1) ^ (0xEDB88320 & NEGATE(a&1)));
+		b <<= 1;
+	}
+	res ^= NEGATE(b>>31) & a;
+	return res;
+}
+#undef NEGATE
 
+const uint32_t crc_power[32] = { // pre-computed 2^(2^n)
+	0x40000000, 0x20000000, 0x08000000, 0x00800000, 0x00008000, 0xedb88320, 0xb1e6b092, 0xa06a2517,
+	0xed627dae, 0x88d14467, 0xd7bbfe6a, 0xec447f11, 0x8e7ea170, 0x6427800e, 0x4d47bae0, 0x09fe548f,
+	0x83852d0f, 0x30362f1a, 0x7b5a9cc3, 0x31fec169, 0x9fec022a, 0x6c8dedc4, 0x15d6874d, 0x5fde7a4e,
+	0xbad90e37, 0x2e4e5eef, 0x4eaba214, 0xa8a472c0, 0x429a969e, 0x148d302a, 0xc40ba6d0, 0xc4e22c3c
+};
 
-uint32_t
-
-
-
+uint32_t crc32_shift_generic(uint32_t crc1, uint32_t n) {
+	uint32_t result = crc1;
+#ifdef __GNUC__
+	while(n) {
+		result = crc32_multiply_generic(result, crc_power[__builtin_ctz(n)]);
+		n &= n-1;
+	}
+#elif defined(_MSC_VER)
+	unsigned long power;
+	while(_BitScanForward(&power, n)) {
+		result = crc32_multiply_generic(result, crc_power[power]);
+		n &= n-1;
+	}
+#else
+	unsigned power = 0;
+	while(n) {
+		if(n & 1) {
+			result = crc32_multiply_generic(result, crc_power[power]);
+		}
+		n >>= 1;
+		power++;
+	}
+#endif
+	return result;
 }
 
-
-
-
-
+
+extern "C" {
+	crc_func _do_crc32_incremental = &do_crc32_incremental_generic;
+	crc_mul_func _crc32_shift = &crc32_shift_generic;
+	crc_mul_func _crc32_multiply = &crc32_multiply_generic;
+	int _crc32_isa = ISA_GENERIC;
 }
 
-
-
-void
+
+
+void crc_clmul_set_funcs();
+void crc_clmul256_set_funcs();
+void crc_arm_set_funcs();
+void crc_riscv_set_funcs();
 
 #ifdef PLATFORM_X86
 int cpu_supports_crc_isa();
@@ -174,21 +226,23 @@ static unsigned long getauxval(unsigned long cap) {
 # endif
 # endif
 #endif
-
-
-
-
-
-#
-	generate_crc32_slice_table();
+#if defined(__riscv) && defined(__has_include)
+# if __has_include(<asm/hwprobe.h>)
+# include <asm/hwprobe.h>
+# include <asm/unistd.h>
+# include <unistd.h>
+# endif
 #endif
+
+void crc_init() {
+	GENERIC_CRC_INIT;
 
 #ifdef PLATFORM_X86
 	int support = cpu_supports_crc_isa();
 	if(support == 2)
-		crc_clmul256_set_funcs(
+		crc_clmul256_set_funcs();
 	else if(support == 1)
-		crc_clmul_set_funcs(
+		crc_clmul_set_funcs();
 #endif
 #ifdef PLATFORM_ARM
 # ifdef __APPLE__
@@ -216,7 +270,19 @@ void crc_init() {
 		false
 # endif
 	) {
-		crc_arm_set_funcs(
+		crc_arm_set_funcs();
+	}
+#endif
+#ifdef __riscv
+# if defined(RISCV_HWPROBE_KEY_IMA_EXT_0) && defined(__NR_riscv_hwprobe)
+	const int rv_hwprobe_ext_zbc = 1 << 7, rv_hwprobe_ext_zbkc = 1 << 9;
+	struct riscv_hwprobe p;
+	p.key = RISCV_HWPROBE_KEY_IMA_EXT_0;
+	if(!syscall(__NR_riscv_hwprobe, &p, 1, 0, NULL, 0)) {
+		if(p.value & (rv_hwprobe_ext_zbc | rv_hwprobe_ext_zbkc)) {
+			crc_riscv_set_funcs();
+		}
 	}
+# endif
 #endif
 }
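
Note: crc_power[n] holds x^(2^n) modulo the reflected CRC-32 polynomial, and crc32_shift_generic walks the set bits of n, multiplying in the matching table entries. The table can be regenerated by repeated squaring; the standalone sketch below restates the generic multiply from the hunk above and should print the same 32 constants (it mirrors the helper program that the old comment in crc_arm.cc described).

#include <stdint.h>
#include <stdio.h>

/* Carry-less multiply modulo the reflected CRC-32 polynomial, restated from
   crc32_multiply_generic in the diff above. */
static uint32_t crc32_multiply(uint32_t a, uint32_t b) {
	uint32_t res = 0;
	for (int i = 0; i < 31; i++) {
		res ^= (uint32_t)(-(int32_t)(b >> 31)) & a;
		a = (a >> 1) ^ (0xEDB88320 & (uint32_t)(-(int32_t)(a & 1)));
		b <<= 1;
	}
	res ^= (uint32_t)(-(int32_t)(b >> 31)) & a;
	return res;
}

int main(void) {
	uint32_t k = 0x80000000u >> 1; /* x^1 in the reflected representation */
	for (int i = 0; i < 32; i++) {
		printf("0x%08x%s", k, (i % 8 == 7) ? ",\n" : ", ");
		k = crc32_multiply(k, k); /* squaring: x^(2^n) -> x^(2^(n+1)) */
	}
	return 0;
}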
package/src/crc.h
CHANGED
@@ -1,5 +1,6 @@
 #ifndef __YENC_CRC_H
 #define __YENC_CRC_H
+#include <stdlib.h> // for llabs
 
 #ifdef __cplusplus
 extern "C" {
@@ -9,10 +10,75 @@ extern "C" {
 
 typedef uint32_t (*crc_func)(const void*, size_t, uint32_t);
 extern crc_func _do_crc32_incremental;
+
+extern int _crc32_isa;
 #define do_crc32 (*_do_crc32_incremental)
+static inline int crc32_isa_level() {
+	return _crc32_isa;
+}
+
+
+#if !defined(__GNUC__) && defined(_MSC_VER)
+# include <intrin.h>
+#endif
+// computes `n % 0xffffffff` (well, almost), using some bit-hacks
+static inline uint32_t crc32_powmod(uint64_t n) {
+#ifdef __GNUC__
+	unsigned res;
+	unsigned carry = __builtin_uadd_overflow(n >> 32, n, &res);
+	res += carry;
+	return res;
+#elif defined(_MSC_VER)
+	unsigned res;
+	unsigned char carry = _addcarry_u32(0, n >> 32, n, &res);
+	_addcarry_u32(carry, res, 0, &res);
+	return res;
+#else
+	n = (n >> 32) + (n & 0xffffffff);
+	n += n >> 32;
+	return n;
+#endif
+}
+// computes `crc32_powmod(n*8)` avoiding overflow
+static inline uint32_t crc32_bytepow(uint64_t n) {
+#if defined(__GNUC__) || defined(_MSC_VER)
+	unsigned res = crc32_powmod(n);
+# ifdef _MSC_VER
+	return _rotl(res, 3);
+# else
+	return (res << 3) | (res >> 29);
+# endif
+#else
+	n = (n >> 32) + (n & 0xffffffff);
+	n <<= 3;
+	n += n >> 32;
+	return n;
+#endif
+}
+
+typedef uint32_t (*crc_mul_func)(uint32_t, uint32_t);
+extern crc_mul_func _crc32_shift;
+extern crc_mul_func _crc32_multiply;
+#define crc32_shift (*_crc32_shift)
+#define crc32_multiply (*_crc32_multiply)
+
+static inline uint32_t crc32_combine(uint32_t crc1, uint32_t crc2, uint64_t len2) {
+	return crc32_shift(crc1, crc32_bytepow(len2)) ^ crc2;
+}
+static inline uint32_t crc32_zeros(uint32_t crc1, uint64_t len) {
+	return ~crc32_shift(~crc1, crc32_bytepow(len));
+}
+static inline uint32_t crc32_unzero(uint32_t crc1, uint64_t len) {
+	return ~crc32_shift(~crc1, ~crc32_bytepow(len));
+}
+static inline uint32_t crc32_2pow(int64_t n) {
+	uint32_t sign = (uint32_t)(n >> 63);
+	return crc32_shift(0x80000000, crc32_powmod(llabs(n)) ^ sign);
+}
+static inline uint32_t crc32_256pow(uint64_t n) {
+	return crc32_shift(0x80000000, crc32_bytepow(n));
+}
 
-uint32_t do_crc32_combine(uint32_t crc1, const uint32_t crc2, size_t len2);
-uint32_t do_crc32_zeros(uint32_t crc1, size_t len);
 void crc_init();
 
 
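
Note: the new inline helpers make CRC combination part of the public header. A minimal usage sketch follows (assumptions: the header is included from package/src, crc_init() has been called, and an initial value of 0 yields a plain CRC32, as the crc_func signature suggests); combining the CRCs of two consecutive chunks should match the CRC of the concatenated buffer.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "crc.h" // from package/src

int main(void) {
	crc_init(); // selects an implementation (may swap in the _crc32_* function pointers)
	const char* a = "hello ";
	const char* b = "world";
	uint32_t crc_a = do_crc32(a, strlen(a), 0);
	uint32_t crc_b = do_crc32(b, strlen(b), 0);
	uint32_t crc_ab = do_crc32("hello world", 11, 0);
	// crc32_combine shifts crc_a forward by strlen(b) bytes and xors in crc_b
	printf("%08x == %08x\n", crc32_combine(crc_a, crc_b, strlen(b)), crc_ab);
	return 0;
}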
package/src/crc_arm.cc
CHANGED
@@ -59,42 +59,36 @@ HEDLEY_WARNING("CRC32 acceleration has been disabled due to missing arm_acle.h")
 #endif
 
 
+
+#ifdef __aarch64__
+uint32_t crc32_multiply_arm(uint32_t a, uint32_t b) {
+	// perform PMULL
+	uint64_t res = 0;
+	uint64_t a64 = (uint64_t)a << 32;
+	int64_t b64 = (int64_t)b << 32;
+	for(int i=0; i<32; i++) {
+		res ^= a64 & (b64 >> 63);
+		b64 += b64;
+		a64 >>= 1;
+	}
+	// reduction via CRC
+	res = __crc32w(0, res) ^ (res >> 32);
+	return res;
+}
+#endif
+// regular multiply is probably better for AArch32
+
+
 // exploit CPU pipelining during CRC computation; unfortunately I haven't been able to measure any benefit
 // - Neoverse N1: no noticeable difference
 // - Cortex A53: actually runs a bit slower
 //#define ENABLE_PIPELINE_OPT 1
 
 #ifdef ENABLE_PIPELINE_OPT
-
-
-
-
-	uint32_t res = 0;
-	for(int i=0; i<31; i++) {
-		res ^= NEGATE(b>>31) & a;
-		a = ((a >> 1) ^ (0xEDB88320 & NEGATE(a&1)));
-		b <<= 1;
-	}
-	res ^= NEGATE(b>>31) & a;
-	return res;
-}
-
-static const uint32_t crc_power[] = { // pre-computed 2^(2^n), with first 3 entries removed (saves a shift)
-	0x00800000, 0x00008000, 0xedb88320, 0xb1e6b092, 0xa06a2517, 0xed627dae, 0x88d14467, 0xd7bbfe6a,
-	0xec447f11, 0x8e7ea170, 0x6427800e, 0x4d47bae0, 0x09fe548f, 0x83852d0f, 0x30362f1a, 0x7b5a9cc3,
-	0x31fec169, 0x9fec022a, 0x6c8dedc4, 0x15d6874d, 0x5fde7a4e, 0xbad90e37, 0x2e4e5eef, 0x4eaba214,
-	0xa8a472c0, 0x429a969e, 0x148d302a, 0xc40ba6d0, 0xc4e22c3c, 0x40000000, 0x20000000, 0x08000000
-};
-/* above table can be computed with
-int main(void) {
-uint32_t k = 0x80000000 >> 1;
-for (size_t i = 0; i < 32+3; ++i) {
-if(i>2) printf("0x%08x, ", k);
-k = crc_multiply(k, k);
-}
-return 0;
-}
-*/
+#ifndef __aarch64__
+uint32_t crc32_multiply_generic(uint32_t a, uint32_t b);
+# define crc32_multiply_arm crc32_multiply_generic
+#endif
 #endif
 
 
@@ -130,6 +124,7 @@ static uint32_t arm_crc_calc(uint32_t crc, const unsigned char *src, long len) {
 	// (this is a slightly less efficient, but much simpler implementation of the idea)
 	const unsigned SPLIT_WORDS_LOG = 10; // make sure it's at least 2
 	const unsigned SPLIT_WORDS = 1<<SPLIT_WORDS_LOG;
+	const unsigned blockCoeff = crc_power[SPLIT_WORDS_LOG + WORDSIZE_LOG + 3];
 	while(len >= (long)(sizeof(WORD_T)*SPLIT_WORDS*2)) {
 		// compute 2x CRCs concurrently to leverage piplining
 		uint32_t crc2 = 0;
@@ -148,8 +143,7 @@ static uint32_t arm_crc_calc(uint32_t crc, const unsigned char *src, long len) {
 			srcW++;
 		}
 		// merge the CRCs
-
-		crc = crc_multiply(crc, crc_power[SPLIT_WORDS_LOG + WORDSIZE_LOG]) ^ crc2;
+		crc = crc32_multiply_arm(crc, blockCoeff) ^ crc2;
 		srcW += SPLIT_WORDS;
 		len -= sizeof(WORD_T)*SPLIT_WORDS*2;
 	}
@@ -200,11 +194,34 @@ static uint32_t do_crc32_incremental_arm(const void* data, size_t length, uint32
 	return ~arm_crc_calc(~init, (const unsigned char*)data, (long)length);
 }
 
-
-
+
+#if defined(__aarch64__) && (defined(__GNUC__) || defined(_MSC_VER))
+uint32_t crc32_shift_arm(uint32_t crc1, uint32_t n) {
+	uint32_t result = crc1;
+	uint64_t prod = result;
+	prod <<= 32 - (n&31);
+	result = __crc32w(0, prod) ^ (prod >> 32);
+	n &= ~31;
+
+	while(n) {
+		result = crc32_multiply_arm(result, crc_power[ctz32(n)]);
+		n &= n-1;
+	}
+	return result;
 }
-#
-
-
+#endif
+
+
+void crc_arm_set_funcs() {
+	_do_crc32_incremental = &do_crc32_incremental_arm;
+#ifdef __aarch64__
+	_crc32_multiply = &crc32_multiply_arm;
+# if defined(__GNUC__) || defined(_MSC_VER)
+	_crc32_shift = &crc32_shift_arm;
+# endif
+#endif
+	_crc32_isa = ISA_FEATURE_CRC;
 }
+#else
+void crc_arm_set_funcs() {}
 #endif
package/src/crc_common.h
CHANGED
@@ -2,3 +2,14 @@
 #include <stddef.h> // for size_t
 #include "crc.h"
 
+extern const uint32_t crc_power[32];
+
+#ifdef __GNUC__
+# define ctz32 __builtin_ctz
+#elif defined(_MSC_VER)
+static HEDLEY_ALWAYS_INLINE unsigned ctz32(uint32_t n) {
+	unsigned long r;
+	_BitScanForward(&r, n);
+	return r;
+}
+#endif
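
Note: ctz32 paired with `n &= n-1` is the set-bit iteration pattern the crc32_shift_* implementations use to pick entries of crc_power[]. A tiny standalone sketch of the pattern (assuming a GCC/Clang-style compiler, matching the __GNUC__ branch above):

#include <stdint.h>
#include <stdio.h>

#define ctz32 __builtin_ctz  /* as the __GNUC__ branch above defines it */

int main(void) {
	uint32_t n = 0x1030; /* bits 4, 5 and 12 set */
	while (n) {
		/* visit each set bit lowest-first, the way crc32_shift_* index crc_power[] */
		printf("bit %u\n", ctz32(n));
		n &= n - 1; /* clear the lowest set bit */
	}
	return 0;
}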
package/src/crc_folding.cc
CHANGED
@@ -140,20 +140,10 @@ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
 	unsigned long algn_diff;
 	__m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
 
-	//
-	//
-	//
-
-
-	xmm_t0 = _mm_clmulepi64_si128(xmm_t0, _mm_set_epi32(0, 0, 0xa273bc24, 0), 0); // reverse(0x487b9c8a)<<1 == 0xa273bc24
-	xmm_t2 = _mm_set_epi32( // polynomial reduction factors
-		1, 0xdb710640, // G* = 0x04c11db7
-		0, 0xf7011641 // Q+ = 0x04d101df (+1 to save an additional xor operation)
-	);
-	xmm_t1 = _mm_clmulepi64_si128(xmm_t0, xmm_t2, 0);
-	xmm_t1 = _mm_clmulepi64_si128(xmm_t1, xmm_t2, 0x10);
-
-	__m128i xmm_crc0 = _mm_srli_si128(_mm_xor_si128(xmm_t0, xmm_t1), 8);
+	// since the initial value will immediately be multiplied by around 2^512, we need to roll it backwards
+	// this is done by dividing the initial value by 2^480
+	// the constant used here is reverse(2^-480)<<1 == 0xdfded7ec
+	__m128i xmm_crc0 = _mm_clmulepi64_si128(_mm_cvtsi32_si128(~initial), _mm_cvtsi32_si128(0xdfded7ec), 0);
 
 	__m128i xmm_crc1 = _mm_setzero_si128();
 	__m128i xmm_crc2 = _mm_setzero_si128();
@@ -365,12 +355,159 @@ static uint32_t do_crc32_incremental_clmul(const void* data, size_t length, uint
 	return crc_fold((const unsigned char*)data, (long)length, init);
 }
 
-
-
+
+static HEDLEY_ALWAYS_INLINE __m128i crc32_reduce(__m128i prod) {
+	// do Barrett reduction back into 32-bit field
+	const __m128i reduction_const = _mm_load_si128((__m128i*)crc_k + 2);
+	__m128i t = _mm_clmulepi64_si128(prod, reduction_const, 0);
+	t = _mm_clmulepi64_si128(t, reduction_const, 0x10);
+	t = _mm_xor_si128(t, prod);
+	return t;
+}
+
+uint32_t crc32_multiply_clmul(uint32_t a, uint32_t b) {
+	// do the actual multiply
+	__m128i prod = _mm_clmulepi64_si128(_mm_cvtsi32_si128(a), _mm_cvtsi32_si128(b), 0);
+
+	// prepare product for reduction
+	prod = _mm_add_epi64(prod, prod); // bit alignment fix, due to CRC32 being bit-reversal
+	prod = _mm_slli_si128(prod, 4); // straddle low/high halves across 64-bit boundary - this provides automatic truncation during reduction
+
+	prod = crc32_reduce(prod);
+	return _mm_extract_epi32(prod, 2);
 }
+
+#if defined(__GNUC__) || defined(_MSC_VER)
+static HEDLEY_ALWAYS_INLINE __m128i reverse_bits_epi8(__m128i src) {
+#if defined(__GFNI__) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
+	return _mm_gf2p8affine_epi64_epi8(src, _mm_set_epi32(
+		0x80402010, 0x08040201,
+		0x80402010, 0x08040201
+	), 0);
+/*
+#elif defined(ENABLE_AVX512)
+	// !! this only processes the bottom 32 bits !!
+	src = _mm_maskz_mov_epi32(1, src);
+	src = _mm_ternarylogic_epi32(src, _mm_slli_epi64(src, 28), _mm_set1_epi8(0xf), 0xa8); // (a|b)&c
+	src = _mm_shuffle_epi8(_mm_set_epi8(
+		-16, 112, -80, 48, -48, 80, -112, 16, -32, 96, -96, 32, -64, 64, -128, 0
+	), src);
+	return _mm_maskz_or_epi32(1, src, _mm_srli_epi64(src, 36));
+*/
 #else
-
-	(
+	__m128i xmm_t0 = _mm_and_si128(src, _mm_set1_epi8(0x0f));
+	__m128i xmm_t1 = _mm_and_si128(_mm_srli_epi16(src, 4), _mm_set1_epi8(0x0f));
+	xmm_t0 = _mm_shuffle_epi8(_mm_set_epi8(
+		-16, 112, -80, 48, -48, 80, -112, 16, -32, 96, -96, 32, -64, 64, -128, 0
+		//0xf0, 0x70, 0xb0, 0x30, 0xd0, 0x50, 0x90, 0x10, 0xe0, 0x60, 0xa0, 0x20, 0xc0, 0x40, 0x80, 0
+	), xmm_t0);
+	xmm_t1 = _mm_shuffle_epi8(_mm_set_epi8(
+		15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0
+	), xmm_t1);
+	return _mm_or_si128(xmm_t0, xmm_t1);
+#endif
 }
+
+#ifdef _MSC_VER
+// because MSVC doesn't use BSWAP unless you specifically tell it to...
+# include <stdlib.h>
+# define BSWAP32 _byteswap_ulong
+#else
+# define BSWAP32(n) ((((n)&0xff)<<24) | (((n)&0xff00)<<8) | (((n)&0xff0000)>>8) | (((n)&0xff000000)>>24))
+#endif
+
+
+
+const uint32_t crc_power_rev[32] = { // bit-reversed crc_power
+	0x00000002, 0x00000004, 0x00000010, 0x00000100, 0x00010000, 0x04c11db7, 0x490d678d, 0xe8a45605,
+	0x75be46b7, 0xe6228b11, 0x567fddeb, 0x88fe2237, 0x0e857e71, 0x7001e426, 0x075de2b2, 0xf12a7f90,
+	0xf0b4a1c1, 0x58f46c0c, 0xc3395ade, 0x96837f8c, 0x544037f9, 0x23b7b136, 0xb2e16ba8, 0x725e7bfa,
+	0xec709b5d, 0xf77a7274, 0x2845d572, 0x034e2515, 0x79695942, 0x540cb128, 0x0b65d023, 0x3c344723
+};
+
+static HEDLEY_ALWAYS_INLINE __m128i crc32_shift_clmul_mulred(unsigned pos, __m128i prod) {
+	// this multiplies a 64-bit `prod` with a 32-bit CRC power
+	// compared with crc32_multiply_clmul, this only reduces the result to 64-bit, saving a multiply
+	__m128i coeff = _mm_cvtsi32_si128(crc_power_rev[pos]);
+
+	const __m128i fold_const = _mm_set_epi32(0, 0x490d678d, 0, 0xf200aa66);
+	prod = _mm_clmulepi64_si128(prod, coeff, 0);
+	__m128i hi = _mm_clmulepi64_si128(prod, fold_const, 0x11);
+	return _mm_xor_si128(hi, prod);
+}
+
+uint32_t crc32_shift_clmul(uint32_t crc1, uint32_t n) {
+	if(!n) return crc1;
+
+	__m128i result = _mm_cvtsi32_si128(BSWAP32(crc1));
+	result = reverse_bits_epi8(result);
+
+	// handle n < 32 with a shift
+	result = _mm_sll_epi64(result, _mm_cvtsi32_si128(n & 31));
+	n &= ~31;
+
+	__m128i t;
+	if(n) {
+		// use a second accumulator to leverage some IPC from slow CLMUL
+		__m128i result2 = _mm_cvtsi32_si128(crc_power_rev[ctz32(n)]);
+		n &= n-1;
+
+		if(n) {
+			// first multiply doesn't need reduction
+			result2 = _mm_clmulepi64_si128(result2, _mm_cvtsi32_si128(crc_power_rev[ctz32(n)]), 0);
+			n &= n-1;
+
+			while(n) {
+				result = crc32_shift_clmul_mulred(ctz32(n), result);
+				n &= n-1;
+
+				if(n) {
+					result2 = crc32_shift_clmul_mulred(ctz32(n), result2);
+					n &= n-1;
+				}
+			}
+		}
+
+		const __m128i fold_const = _mm_set_epi32(0, 0x490d678d, 0, 0xf200aa66);
+
+		// merge two results
+		result = _mm_clmulepi64_si128(result, result2, 0);
+
+		// do 128b reduction
+		t = _mm_unpackhi_epi32(result, _mm_setzero_si128());
+		// fold [127:96] -> [63:0]
+		__m128i hi = _mm_clmulepi64_si128(t, fold_const, 1);
+		// fold [95:64] -> [63:0]
+		__m128i lo = _mm_clmulepi64_si128(t, fold_const, 0x10);
+#ifdef ENABLE_AVX512
+		result = _mm_ternarylogic_epi32(result, hi, lo, 0x96);
+#else
+		result = _mm_xor_si128(result, hi);
+		result = _mm_xor_si128(result, lo);
+#endif
+	}
+
+	// do Barrett reduction back into 32-bit field
+	const __m128i reduction_const = _mm_set_epi32(0, 0x04c11db7, 1, 0x04d101df);
+	t = _mm_clmulepi64_si128(_mm_blend_epi16(_mm_setzero_si128(), result, 0x3c), reduction_const, 0);
+	t = _mm_clmulepi64_si128(t, reduction_const, 0x11);
+	result = _mm_xor_si128(t, result);
+
+	result = reverse_bits_epi8(result);
+	return BSWAP32(_mm_cvtsi128_si32(result));
+}
+#endif
+
+
+void crc_clmul_set_funcs() {
+	_do_crc32_incremental = &do_crc32_incremental_clmul;
+	_crc32_multiply = &crc32_multiply_clmul;
+#if defined(__GNUC__) || defined(_MSC_VER)
+	_crc32_shift = &crc32_shift_clmul;
+#endif
+	_crc32_isa = ISA_LEVEL_PCLMUL;
+}
+#else
+void crc_clmul_set_funcs() {}
 #endif
 
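
Note: the SSSE3 fallback in reverse_bits_epi8 reverses each byte's bits with two 16-entry nibble lookups; the two _mm_shuffle_epi8 constants above are exactly those tables. A scalar sketch with the tables transcribed from those constants, handy for checking the shuffle values:

#include <stdint.h>
#include <stdio.h>

/* Plain-C restatement of the nibble-table bit reversal used by reverse_bits_epi8;
   both tables are transcribed from the _mm_set_epi8 constants in the diff above. */
static uint8_t reverse_bits_u8(uint8_t b) {
	static const uint8_t lo_rev[16] = { /* low nibble, reversed into the high nibble */
		0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0,
		0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0
	};
	static const uint8_t hi_rev[16] = { /* high nibble, reversed into the low nibble */
		0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15
	};
	return lo_rev[b & 0x0f] | hi_rev[b >> 4];
}

int main(void) {
	printf("%02x %02x %02x\n",
		reverse_bits_u8(0x01),  /* 80 */
		reverse_bits_u8(0x0f),  /* f0 */
		reverse_bits_u8(0x12)); /* 48 */
	return 0;
}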