yencode 1.1.5 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +130 -189
- package/binding.gyp +115 -6
- package/index.js +2 -0
- package/package.json +1 -1
- package/src/common.h +37 -7
- package/src/crc.cc +121 -47
- package/src/crc.h +74 -10
- package/src/crc_arm.cc +51 -34
- package/src/crc_arm_pmull.cc +215 -0
- package/src/crc_common.h +22 -0
- package/src/crc_folding.cc +154 -16
- package/src/crc_folding_256.cc +7 -14
- package/src/crc_riscv.cc +251 -0
- package/src/decoder.cc +373 -13
- package/src/decoder.h +10 -14
- package/src/decoder_avx.cc +5 -6
- package/src/decoder_avx2.cc +8 -9
- package/src/decoder_avx2_base.h +7 -11
- package/src/decoder_common.h +56 -373
- package/src/decoder_neon.cc +13 -19
- package/src/decoder_neon64.cc +12 -15
- package/src/decoder_rvv.cc +280 -0
- package/src/decoder_sse2.cc +26 -5
- package/src/decoder_sse_base.h +20 -40
- package/src/decoder_ssse3.cc +5 -6
- package/src/decoder_vbmi2.cc +6 -13
- package/src/encoder.cc +42 -26
- package/src/encoder.h +5 -7
- package/src/encoder_avx.cc +3 -3
- package/src/encoder_avx2.cc +3 -3
- package/src/encoder_avx_base.h +3 -0
- package/src/encoder_common.h +26 -32
- package/src/encoder_neon.cc +6 -3
- package/src/encoder_rvv.cc +13 -26
- package/src/encoder_sse2.cc +3 -2
- package/src/encoder_sse_base.h +2 -0
- package/src/encoder_ssse3.cc +3 -3
- package/src/encoder_vbmi2.cc +6 -7
- package/src/platform.cc +24 -23
- package/src/yencode.cc +54 -11
- package/test/_speedbase.js +4 -2
- package/test/speeddec.js +25 -16
- package/test/speedenc.js +21 -17
- package/test/testcrc.js +17 -1
- package/test/testcrcfuncs.c +53 -0
- package/test/testdec.js +1 -0
package/src/crc_arm.cc
CHANGED
@@ -59,42 +59,35 @@ HEDLEY_WARNING("CRC32 acceleration has been disabled due to missing arm_acle.h")
 #endif
 
 
+
+#ifdef __aarch64__
+static uint32_t crc32_multiply_arm(uint32_t a, uint32_t b) {
+	// perform PMULL
+	uint64_t res = 0;
+	uint64_t a64 = (uint64_t)a << 32;
+	int64_t b64 = (int64_t)b << 32;
+	for(int i=0; i<32; i++) {
+		res ^= a64 & (b64 >> 63);
+		b64 += b64;
+		a64 >>= 1;
+	}
+	// reduction via CRC
+	res = __crc32w(0, res) ^ (res >> 32);
+	return res;
+}
+#endif
+// regular multiply is probably better for AArch32
+
+
 // exploit CPU pipelining during CRC computation; unfortunately I haven't been able to measure any benefit
 // - Neoverse N1: no noticeable difference
 // - Cortex A53: actually runs a bit slower
 //#define ENABLE_PIPELINE_OPT 1
 
 #ifdef ENABLE_PIPELINE_OPT
-
-#define
-
-static HEDLEY_ALWAYS_INLINE uint32_t crc_multiply(uint32_t a, uint32_t b) {
-	uint32_t res = 0;
-	for(int i=0; i<31; i++) {
-		res ^= NEGATE(b>>31) & a;
-		a = ((a >> 1) ^ (0xEDB88320 & NEGATE(a&1)));
-		b <<= 1;
-	}
-	res ^= NEGATE(b>>31) & a;
-	return res;
-}
-
-static const uint32_t crc_power[] = { // pre-computed 2^(2^n), with first 3 entries removed (saves a shift)
-	0x00800000, 0x00008000, 0xedb88320, 0xb1e6b092, 0xa06a2517, 0xed627dae, 0x88d14467, 0xd7bbfe6a,
-	0xec447f11, 0x8e7ea170, 0x6427800e, 0x4d47bae0, 0x09fe548f, 0x83852d0f, 0x30362f1a, 0x7b5a9cc3,
-	0x31fec169, 0x9fec022a, 0x6c8dedc4, 0x15d6874d, 0x5fde7a4e, 0xbad90e37, 0x2e4e5eef, 0x4eaba214,
-	0xa8a472c0, 0x429a969e, 0x148d302a, 0xc40ba6d0, 0xc4e22c3c, 0x40000000, 0x20000000, 0x08000000
-};
-/* above table can be computed with
-	int main(void) {
-		uint32_t k = 0x80000000 >> 1;
-		for (size_t i = 0; i < 32+3; ++i) {
-			if(i>2) printf("0x%08x, ", k);
-			k = crc_multiply(k, k);
-		}
-		return 0;
-	}
-*/
+#ifndef __aarch64__
+# define crc32_multiply_arm RapidYenc::crc32_multiply_generic
+#endif
 #endif
 
 
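The bitwise crc_multiply()/crc_power[] pair removed in the hunk above moves into shared code, referenced as RapidYenc::crc32_multiply_generic and RapidYenc::crc_power. As a reading aid, here is a minimal standalone sketch of that kind of generic carry-less multiply over the reflected CRC32 polynomial, mirroring the removed code; the name below is illustrative, not one of the package's exports:

```c
#include <stdint.h>

/* Multiply two values in GF(2)[x] mod the reflected CRC32 polynomial 0xEDB88320.
 * Mirrors the crc_multiply() removed above; the package's shared version is
 * RapidYenc::crc32_multiply_generic (not reproduced here). */
static uint32_t crc32_multiply_sketch(uint32_t a, uint32_t b) {
	uint32_t res = 0;
	for (int i = 0; i < 32; i++) {
		if (b & 0x80000000u)                      /* x^i coefficient of the original b */
			res ^= a;                             /* a currently holds (original a) * x^i */
		b <<= 1;
		a = (a >> 1) ^ (a & 1 ? 0xEDB88320u : 0); /* multiply a by x, reduced mod P */
	}
	return res;
}
```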
@@ -130,6 +123,7 @@ static uint32_t arm_crc_calc(uint32_t crc, const unsigned char *src, long len) {
 	// (this is a slightly less efficient, but much simpler implementation of the idea)
 	const unsigned SPLIT_WORDS_LOG = 10; // make sure it's at least 2
 	const unsigned SPLIT_WORDS = 1<<SPLIT_WORDS_LOG;
+	const unsigned blockCoeff = RapidYenc::crc_power[SPLIT_WORDS_LOG + WORDSIZE_LOG + 3];
 	while(len >= (long)(sizeof(WORD_T)*SPLIT_WORDS*2)) {
 		// compute 2x CRCs concurrently to leverage piplining
 		uint32_t crc2 = 0;
@@ -148,8 +142,7 @@ static uint32_t arm_crc_calc(uint32_t crc, const unsigned char *src, long len) {
 			srcW++;
 		}
 		// merge the CRCs
-
-		crc = crc_multiply(crc, crc_power[SPLIT_WORDS_LOG + WORDSIZE_LOG]) ^ crc2;
+		crc = crc32_multiply_arm(crc, blockCoeff) ^ crc2;
 		srcW += SPLIT_WORDS;
 		len -= sizeof(WORD_T)*SPLIT_WORDS*2;
 	}
@@ -200,10 +193,34 @@ static uint32_t do_crc32_incremental_arm(const void* data, size_t length, uint32
 	return ~arm_crc_calc(~init, (const unsigned char*)data, (long)length);
 }
 
-
+
+#if defined(__aarch64__) && (defined(__GNUC__) || defined(_MSC_VER))
+static uint32_t crc32_shift_arm(uint32_t crc1, uint32_t n) {
+	uint32_t result = crc1;
+	uint64_t prod = result;
+	prod <<= 32 - (n&31);
+	result = __crc32w(0, prod) ^ (prod >> 32);
+	n &= ~31;
+
+	while(n) {
+		result = crc32_multiply_arm(result, RapidYenc::crc_power[ctz32(n)]);
+		n &= n-1;
+	}
+	return result;
+}
+#endif
+
+
+
 void RapidYenc::crc_arm_set_funcs() {
 	_do_crc32_incremental = &do_crc32_incremental_arm;
+#ifdef __aarch64__
+	_crc32_multiply = &crc32_multiply_arm;
+# if defined(__GNUC__) || defined(_MSC_VER)
+	_crc32_shift = &crc32_shift_arm;
+# endif
+#endif
 	_crc32_isa = ISA_FEATURE_CRC;
 }
 #else
-void crc_arm_set_funcs() {}
+void RapidYenc::crc_arm_set_funcs() {}
 #endif
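The _crc32_multiply/_crc32_shift entry points registered above let a CRC computed over one chunk be advanced past another chunk and merged with it, which is what arm_crc_calc()'s two-lane merge (crc = crc32_multiply_arm(crc, blockCoeff) ^ crc2) relies on. A minimal sketch of that combine identity, assuming a crc32_shift(crc, n) primitive that multiplies by x^n mod the CRC polynomial as crc32_shift_arm() does; the names below are placeholders, not the package's public API:

```c
#include <stddef.h>
#include <stdint.h>

/* Assumed primitive: advance a CRC32 value past n zero *bits*, i.e. multiply it
 * by x^n mod the CRC polynomial (what crc32_shift_arm() above implements). */
uint32_t crc32_shift(uint32_t crc, uint32_t n);

/* CRC of the concatenation A||B, given only crc32(A), crc32(B) and len(B).
 * Same merge that arm_crc_calc() performs per block pair. */
static uint32_t crc32_combine_sketch(uint32_t crcA, uint32_t crcB, size_t lenB) {
	return crc32_shift(crcA, (uint32_t)(lenB * 8)) ^ crcB;
}
```

This is the same kind of combination that zlib's crc32_combine() performs; the accelerated shift/multiply functions in this release just make it cheap.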
package/src/crc_arm_pmull.cc
ADDED
@@ -0,0 +1,215 @@
+#include "crc_common.h"
+
+// exclude broken/missing arm_acle.h
+#if defined(__ARM_FEATURE_CRYPTO) && defined(HEDLEY_GCC_VERSION)
+# if !defined(__aarch64__) && HEDLEY_GCC_VERSION_CHECK(7,0,0) && !HEDLEY_GCC_VERSION_CHECK(8,1,1)
+#  undef __ARM_FEATURE_CRYPTO
+# endif
+# if defined(__aarch64__) && HEDLEY_GCC_VERSION_CHECK(9,4,0) && !HEDLEY_GCC_VERSION_CHECK(9,5,0)
+#  undef __ARM_FEATURE_CRYPTO
+# endif
+#endif
+#if defined(__ARM_FEATURE_CRYPTO) && defined(__has_include)
+# if !__has_include(<arm_acle.h>)
+#  undef __ARM_FEATURE_CRYPTO
+# endif
+#endif
+
+// ARM's intrinsics guide seems to suggest that vmull_p64 is available on A32, but neither Clang/GCC seem to support it on AArch32
+#if (defined(__ARM_FEATURE_CRYPTO) && defined(__ARM_FEATURE_CRC32) && defined(__aarch64__)) || (defined(_M_ARM64) && !defined(__clang__))
+
+#include <arm_neon.h>
+#if defined(_MSC_VER) && !defined(__clang__)
+# include <intrin.h>
+
+# ifdef _M_ARM64
+// MSVC may detect this pattern: https://devblogs.microsoft.com/cppblog/a-tour-of-4-msvc-backend-improvements/#byteswap-identification
+static HEDLEY_ALWAYS_INLINE uint64_t rbit64(uint64_t x) {
+	x = _byteswap_uint64(x);
+	x = (x & 0xaaaaaaaaaaaaaaaa) >> 1 | (x & 0x5555555555555555) << 1;
+	x = (x & 0xcccccccccccccccc) >> 2 | (x & 0x3333333333333333) << 2;
+	x = (x & 0xf0f0f0f0f0f0f0f0) >> 4 | (x & 0x0f0f0f0f0f0f0f0f) << 4;
+	return x;
+}
+// ...whilst this seems to work best for 32-bit RBIT
+static HEDLEY_ALWAYS_INLINE uint32_t rbit32(uint32_t x) {
+	uint64_t r = rbit64(x);
+	return r >> 32;
+}
+# else
+#  define rbit32 _arm_rbit
+# endif
+#else
+# include <arm_acle.h>
+// __rbit not present before GCC 11.4.0 or 12.2.0; for ARM32, requires GCC 14
+# if defined(HEDLEY_GCC_VERSION) && !HEDLEY_GCC_VERSION_CHECK(14,0,0) && (!defined(__aarch64__) || !HEDLEY_GCC_VERSION_CHECK(11,3,0) || (HEDLEY_GCC_VERSION_CHECK(12,0,0) && !HEDLEY_GCC_VERSION_CHECK(12,2,0)))
+#  ifdef __aarch64__
+static HEDLEY_ALWAYS_INLINE uint64_t rbit64(uint64_t x) {
+	uint64_t r;
+	__asm__ ("rbit %0,%1\n"
+		: "=r"(r) : "r"(x)
+		: /* No clobbers */);
+	return r;
+}
+#  endif
+static HEDLEY_ALWAYS_INLINE uint32_t rbit32(uint32_t x) {
+	uint32_t r;
+	__asm__ (
+#  ifdef __aarch64__
+		"rbit %w0,%w1\n"
+#  else
+		"rbit %0,%1\n"
+#  endif
+		: "=r"(r) : "r"(x)
+		: /* No clobbers */);
+	return r;
+}
+# else
+#  define rbit32 __rbit
+#  define rbit64 __rbitll
+# endif
+#endif
+
+
+// MSVC doesn't have poly64/poly128 types, so always use uint64 instead
+
+#ifdef __aarch64__
+# if defined(__GNUC__) || defined(__clang__)
+static HEDLEY_ALWAYS_INLINE uint64x2_t pmull_low(uint64x1_t a, uint64x1_t b) {
+	uint64x2_t result;
+	__asm__ ("pmull %0.1q,%1.1d,%2.1d"
+		: "=w"(result)
+		: "w"(a), "w"(b)
+		: /* No clobbers */);
+	return result;
+}
+static HEDLEY_ALWAYS_INLINE uint64x2_t pmull_high(uint64x2_t a, uint64x2_t b) {
+	uint64x2_t result;
+	__asm__ ("pmull2 %0.1q,%1.2d,%2.2d"
+		: "=w"(result)
+		: "w"(a), "w"(b)
+		: /* No clobbers */);
+	return result;
+}
+# elif defined(_MSC_VER) && !defined(__clang__)
+#  define pmull_low vmull_p64
+#  define pmull_high vmull_high_p64
+# else
+#  define pmull_low(x, y) vreinterpretq_u64_p128(vmull_p64(vreinterpret_p64_u64(x), vreinterpret_p64_u64(y)))
+#  define pmull_high(x, y) vreinterpretq_u64_p128(vmull_high_p64(vreinterpretq_p64_u64(x), vreinterpretq_p64_u64(y)))
+# endif
+#else
+# if defined(_MSC_VER) && !defined(__clang__)
+#  define pmull_low vmull_p64
+#  define pmull_high(x, y) vmull_p64(vget_high_u64(x), vget_high_u64(y))
+# else
+#  define pmull_low(x, y) vreinterpretq_u64_p128(vmull_p64(x, y))
+#  define pmull_high(x, y) vreinterpretq_u64_p128(vmull_p64(vget_high_p64(vreinterpretq_p64_u64(x)), vget_high_p64(vreinterpretq_p64_u64(y))))
+# endif
+#endif
+
+
+static uint32_t crc32_multiply_pmull(uint32_t a, uint32_t b) {
+	uint64x1_t prod = vget_low_u64(pmull_low(
+		vreinterpret_u64_u32(vset_lane_u32(a, vdup_n_u32(0), 0)),
+		vreinterpret_u64_u32(vset_lane_u32(b, vdup_n_u32(0), 0))
+	));
+#ifdef __aarch64__
+	uint64_t p = vget_lane_u64(prod, 0);
+	return __crc32w(0, p+p) ^ (p >> 31);
+#else
+	prod = vadd_u64(prod, prod);
+	uint32x2_t prod32 = vreinterpret_u32_u64(prod);
+	return __crc32w(0, vget_lane_u32(prod32, 0)) ^ vget_lane_u32(prod32, 1);
+#endif
+}
+
+
+
+static const uint32_t crc_power_rev[32] = { // bit-reversed crc_power
+	0x00000002, 0x00000004, 0x00000010, 0x00000100, 0x00010000, 0x04c11db7, 0x490d678d, 0xe8a45605,
+	0x75be46b7, 0xe6228b11, 0x567fddeb, 0x88fe2237, 0x0e857e71, 0x7001e426, 0x075de2b2, 0xf12a7f90,
+	0xf0b4a1c1, 0x58f46c0c, 0xc3395ade, 0x96837f8c, 0x544037f9, 0x23b7b136, 0xb2e16ba8, 0x725e7bfa,
+	0xec709b5d, 0xf77a7274, 0x2845d572, 0x034e2515, 0x79695942, 0x540cb128, 0x0b65d023, 0x3c344723
+};
+
+
+static HEDLEY_ALWAYS_INLINE uint64x1_t crc32_shift_pmull_mulred(uint64x1_t a, uint64x1_t b) {
+	uint64x2_t r = pmull_low(a, b);
+	uint64x2_t h = pmull_high(r, vdupq_n_u64(0x490d678d));
+	return veor_u64(vget_low_u64(r), vget_low_u64(h));
+}
+
+
+static uint32_t crc32_shift_pmull(uint32_t crc1, uint32_t n) {
+	crc1 = rbit32(crc1);
+
+	uint64x1_t res;
+#ifdef __aarch64__
+	uint64_t crc = (uint64_t)crc1 << (n & 31);
+	res = vset_lane_u64(crc, vdup_n_u64(0), 0);
+#else
+	res = vreinterpret_u64_u32(vset_lane_u32(crc1, vdup_n_u32(0), 0));
+	res = vshl_u64(res, vdup_n_u64(n&31));
+#endif
+	n &= ~31;
+
+	if(n) {
+#define LOAD_NEXT_POWER vreinterpret_u64_u32(vset_lane_u32(crc_power_rev[ctz32(n)], vdup_n_u32(0), 0))
+		uint64x1_t res2 = LOAD_NEXT_POWER;
+		n &= n-1;
+
+		if(n) {
+			// first multiply doesn't need reduction
+			res2 = vget_low_u64(pmull_low(res2, LOAD_NEXT_POWER));
+			n &= n-1;
+
+			while(n) {
+				res = crc32_shift_pmull_mulred(res, LOAD_NEXT_POWER);
+				n &= n-1;
+
+				if(n) {
+					res2 = crc32_shift_pmull_mulred(res2, LOAD_NEXT_POWER);
+					n &= n-1;
+				}
+			}
+		}
+#undef LOAD_NEXT_POWER
+
+		// merge two results
+		uint64x2_t prod = pmull_low(res, res2);
+		// weirdly, vrbitq_u8 is missing in ARM32 MSVC
+		prod = vreinterpretq_u64_u8(vrev64q_u8(vrbitq_u8(vreinterpretq_u8_u64(prod))));
+#ifdef __aarch64__
+		crc = __crc32d(0, vgetq_lane_u64(prod, 1));
+		uint64_t rem = vgetq_lane_u64(prod, 0);
+		crc = __crc32w(rem, crc) ^ (rem >> 32);
+#else
+		uint32x4_t prod32 = vreinterpretq_u32_u64(prod);
+		uint32_t crc = __crc32w(0, vgetq_lane_u32(prod32, 2));
+		crc = __crc32w(vgetq_lane_u32(prod32, 3), crc);
+		crc = __crc32w(vgetq_lane_u32(prod32, 0), crc) ^ vgetq_lane_u32(prod32, 1);
+#endif
+		return crc;
+	} else {
+#ifdef __aarch64__
+		crc = rbit64(crc);
+		crc = __crc32w(0, crc) ^ (crc >> 32);
+		return crc;
+#else
+		uint32x2_t r = vreinterpret_u32_u64(res);
+		return __crc32w(0, rbit32(vget_lane_u32(r, 1))) ^ rbit32(vget_lane_u32(r, 0));
+#endif
+	}
+}
+
+
+void RapidYenc::crc_pmull_set_funcs() {
+	_crc32_multiply = &crc32_multiply_pmull;
+	_crc32_shift = &crc32_shift_pmull;
+	_crc32_isa &= ISA_FEATURE_PMULL;
+}
+
+#else
+void RapidYenc::crc_pmull_set_funcs() {}
+#endif /* defined(__ARM_FEATURE_CRYPTO) && defined(__ARM_FEATURE_CRC32) */
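crc_power_rev[] above is described as the bit-reversed form of the shared crc_power[] table; reversing the constants moves them into the non-reflected domain that the PMULL-based routines work in (note the rbit32() call at the top of crc32_shift_pmull()). A standalone illustration of that relationship; the helper below is not part of the package:

```c
#include <stdint.h>
#include <stdio.h>

/* Plain 32-bit bit reversal (illustrative; the package uses RBIT / vrbitq_u8). */
static uint32_t bit_reverse32(uint32_t x) {
	x = (x >> 16) | (x << 16);
	x = ((x & 0xFF00FF00u) >> 8) | ((x & 0x00FF00FFu) << 8);
	x = ((x & 0xF0F0F0F0u) >> 4) | ((x & 0x0F0F0F0Fu) << 4);
	x = ((x & 0xCCCCCCCCu) >> 2) | ((x & 0x33333333u) << 2);
	x = ((x & 0xAAAAAAAAu) >> 1) | ((x & 0x55555555u) << 1);
	return x;
}

int main(void) {
	/* The first crc_power_rev entry, 0x00000002, bit-reverses to 0x40000000 --
	 * presumably x^1 in the reflected representation that the removed crc_power
	 * generator in crc_arm.cc (k = 0x80000000 >> 1, then repeated squaring) used. */
	printf("0x%08x\n", bit_reverse32(0x00000002u)); /* prints 0x40000000 */
	return 0;
}
```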
package/src/crc_common.h
CHANGED
@@ -2,3 +2,25 @@
 #include <stddef.h> // for size_t
 #include "crc.h"
 
+#ifdef __GNUC__
+# define ctz32 __builtin_ctz
+#elif defined(_MSC_VER)
+static HEDLEY_ALWAYS_INLINE unsigned ctz32(uint32_t n) {
+	unsigned long r;
+	_BitScanForward(&r, n);
+	return r;
+}
+#endif
+
+namespace RapidYenc {
+	void crc_clmul_set_funcs();
+	void crc_clmul256_set_funcs();
+	void crc_arm_set_funcs();
+	void crc_pmull_set_funcs();
+	void crc_riscv_set_funcs();
+
+	extern const uint32_t crc_power[32];
+	uint32_t crc32_multiply_generic(uint32_t a, uint32_t b);
+	uint32_t crc32_shift_generic(uint32_t crc1, uint32_t n);
+
+}
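The ctz32() shim added here is used throughout this release together with the `n &= n - 1` idiom to visit the set bits of n from lowest to highest (see crc32_shift_arm(), crc32_shift_pmull() and crc32_shift_clmul() elsewhere in this diff). A tiny standalone illustration, using the GCC/Clang branch of the shim:

```c
#include <stdint.h>
#include <stdio.h>

#define ctz32 __builtin_ctz   /* the GCC/Clang branch of the shim above */

int main(void) {
	uint32_t n = 0xA40;                /* bits 6, 9 and 11 set */
	while (n) {
		printf("bit %d\n", ctz32(n));  /* prints 6, then 9, then 11 */
		n &= n - 1;                    /* clear the lowest set bit */
	}
	return 0;
}
```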
package/src/crc_folding.cc
CHANGED
@@ -140,20 +140,10 @@ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
 	unsigned long algn_diff;
 	__m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
 
-	//
-	//
-	//
-
-
-	xmm_t0 = _mm_clmulepi64_si128(xmm_t0, _mm_set_epi32(0, 0, 0xa273bc24, 0), 0); // reverse(0x487b9c8a)<<1 == 0xa273bc24
-	xmm_t2 = _mm_set_epi32( // polynomial reduction factors
-		1, 0xdb710640, // G* = 0x04c11db7
-		0, 0xf7011641 // Q+ = 0x04d101df (+1 to save an additional xor operation)
-	);
-	xmm_t1 = _mm_clmulepi64_si128(xmm_t0, xmm_t2, 0);
-	xmm_t1 = _mm_clmulepi64_si128(xmm_t1, xmm_t2, 0x10);
-
-	__m128i xmm_crc0 = _mm_srli_si128(_mm_xor_si128(xmm_t0, xmm_t1), 8);
+	// since the initial value will immediately be multiplied by around 2^512, we need to roll it backwards
+	// this is done by dividing the initial value by 2^480
+	// the constant used here is reverse(2^-480)<<1 == 0xdfded7ec
+	__m128i xmm_crc0 = _mm_clmulepi64_si128(_mm_cvtsi32_si128(~initial), _mm_cvtsi32_si128(0xdfded7ec), 0);
 
 	__m128i xmm_crc1 = _mm_setzero_si128();
 	__m128i xmm_crc2 = _mm_setzero_si128();
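The constants in the comments above follow a reverse(K)<<1 convention (the removed reverse(0x487b9c8a)<<1 == 0xa273bc24 and the new reverse(2^-480)<<1 == 0xdfded7ec). A quick standalone check of that convention on the removed constant; illustrative only, not part of the package:

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
	uint32_t k = 0x487b9c8au, r = 0;
	for (int i = 0; i < 32; i++) {
		r = (r << 1) | ((k >> i) & 1);   /* plain 32-bit bit reversal */
	}
	printf("0x%08x\n", r << 1);          /* prints 0xa273bc24, as the removed comment states */
	return 0;
}
```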
@@ -365,11 +355,159 @@ static uint32_t do_crc32_incremental_clmul(const void* data, size_t length, uint
 	return crc_fold((const unsigned char*)data, (long)length, init);
 }
 
-
+
+static HEDLEY_ALWAYS_INLINE __m128i crc32_reduce(__m128i prod) {
+	// do Barrett reduction back into 32-bit field
+	const __m128i reduction_const = _mm_load_si128((__m128i*)crc_k + 2);
+	__m128i t = _mm_clmulepi64_si128(prod, reduction_const, 0);
+	t = _mm_clmulepi64_si128(t, reduction_const, 0x10);
+	t = _mm_xor_si128(t, prod);
+	return t;
+}
+
+static uint32_t crc32_multiply_clmul(uint32_t a, uint32_t b) {
+	// do the actual multiply
+	__m128i prod = _mm_clmulepi64_si128(_mm_cvtsi32_si128(a), _mm_cvtsi32_si128(b), 0);
+
+	// prepare product for reduction
+	prod = _mm_add_epi64(prod, prod); // bit alignment fix, due to CRC32 being bit-reversal
+	prod = _mm_slli_si128(prod, 4); // straddle low/high halves across 64-bit boundary - this provides automatic truncation during reduction
+
+	prod = crc32_reduce(prod);
+	return _mm_extract_epi32(prod, 2);
+}
+
+#if defined(__GNUC__) || defined(_MSC_VER)
+static HEDLEY_ALWAYS_INLINE __m128i reverse_bits_epi8(__m128i src) {
+#if defined(__GFNI__) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
+	return _mm_gf2p8affine_epi64_epi8(src, _mm_set_epi32(
+		0x80402010, 0x08040201,
+		0x80402010, 0x08040201
+	), 0);
+/*
+#elif defined(ENABLE_AVX512)
+	// !! this only processes the bottom 32 bits !!
+	src = _mm_maskz_mov_epi32(1, src);
+	src = _mm_ternarylogic_epi32(src, _mm_slli_epi64(src, 28), _mm_set1_epi8(0xf), 0xa8); // (a|b)&c
+	src = _mm_shuffle_epi8(_mm_set_epi8(
+		-16, 112, -80, 48, -48, 80, -112, 16, -32, 96, -96, 32, -64, 64, -128, 0
+	), src);
+	return _mm_maskz_or_epi32(1, src, _mm_srli_epi64(src, 36));
+*/
+#else
+	__m128i xmm_t0 = _mm_and_si128(src, _mm_set1_epi8(0x0f));
+	__m128i xmm_t1 = _mm_and_si128(_mm_srli_epi16(src, 4), _mm_set1_epi8(0x0f));
+	xmm_t0 = _mm_shuffle_epi8(_mm_set_epi8(
+		-16, 112, -80, 48, -48, 80, -112, 16, -32, 96, -96, 32, -64, 64, -128, 0
+		//0xf0, 0x70, 0xb0, 0x30, 0xd0, 0x50, 0x90, 0x10, 0xe0, 0x60, 0xa0, 0x20, 0xc0, 0x40, 0x80, 0
+	), xmm_t0);
+	xmm_t1 = _mm_shuffle_epi8(_mm_set_epi8(
+		15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0
+	), xmm_t1);
+	return _mm_or_si128(xmm_t0, xmm_t1);
+#endif
+}
+
+#ifdef _MSC_VER
+// because MSVC doesn't use BSWAP unless you specifically tell it to...
+# include <stdlib.h>
+# define BSWAP32 _byteswap_ulong
+#else
+# define BSWAP32(n) ((((n)&0xff)<<24) | (((n)&0xff00)<<8) | (((n)&0xff0000)>>8) | (((n)&0xff000000)>>24))
+#endif
+
+
+
+static const uint32_t crc_power_rev[32] = { // bit-reversed crc_power
+	0x00000002, 0x00000004, 0x00000010, 0x00000100, 0x00010000, 0x04c11db7, 0x490d678d, 0xe8a45605,
+	0x75be46b7, 0xe6228b11, 0x567fddeb, 0x88fe2237, 0x0e857e71, 0x7001e426, 0x075de2b2, 0xf12a7f90,
+	0xf0b4a1c1, 0x58f46c0c, 0xc3395ade, 0x96837f8c, 0x544037f9, 0x23b7b136, 0xb2e16ba8, 0x725e7bfa,
+	0xec709b5d, 0xf77a7274, 0x2845d572, 0x034e2515, 0x79695942, 0x540cb128, 0x0b65d023, 0x3c344723
+};
+
+static HEDLEY_ALWAYS_INLINE __m128i crc32_shift_clmul_mulred(unsigned pos, __m128i prod) {
+	// this multiplies a 64-bit `prod` with a 32-bit CRC power
+	// compared with crc32_multiply_clmul, this only reduces the result to 64-bit, saving a multiply
+	__m128i coeff = _mm_cvtsi32_si128(crc_power_rev[pos]);
+
+	const __m128i fold_const = _mm_set_epi32(0, 0x490d678d, 0, 0xf200aa66);
+	prod = _mm_clmulepi64_si128(prod, coeff, 0);
+	__m128i hi = _mm_clmulepi64_si128(prod, fold_const, 0x11);
+	return _mm_xor_si128(hi, prod);
+}
+
+static uint32_t crc32_shift_clmul(uint32_t crc1, uint32_t n) {
+	if(!n) return crc1;
+
+	__m128i result = _mm_cvtsi32_si128(BSWAP32(crc1));
+	result = reverse_bits_epi8(result);
+
+	// handle n < 32 with a shift
+	result = _mm_sll_epi64(result, _mm_cvtsi32_si128(n & 31));
+	n &= ~31;
+
+	__m128i t;
+	if(n) {
+		// use a second accumulator to leverage some IPC from slow CLMUL
+		__m128i result2 = _mm_cvtsi32_si128(crc_power_rev[ctz32(n)]);
+		n &= n-1;
+
+		if(n) {
+			// first multiply doesn't need reduction
+			result2 = _mm_clmulepi64_si128(result2, _mm_cvtsi32_si128(crc_power_rev[ctz32(n)]), 0);
+			n &= n-1;
+
+			while(n) {
+				result = crc32_shift_clmul_mulred(ctz32(n), result);
+				n &= n-1;
+
+				if(n) {
+					result2 = crc32_shift_clmul_mulred(ctz32(n), result2);
+					n &= n-1;
+				}
+			}
+		}
+
+		const __m128i fold_const = _mm_set_epi32(0, 0x490d678d, 0, 0xf200aa66);
+
+		// merge two results
+		result = _mm_clmulepi64_si128(result, result2, 0);
+
+		// do 128b reduction
+		t = _mm_unpackhi_epi32(result, _mm_setzero_si128());
+		// fold [127:96] -> [63:0]
+		__m128i hi = _mm_clmulepi64_si128(t, fold_const, 1);
+		// fold [95:64] -> [63:0]
+		__m128i lo = _mm_clmulepi64_si128(t, fold_const, 0x10);
+#ifdef ENABLE_AVX512
+		result = _mm_ternarylogic_epi32(result, hi, lo, 0x96);
+#else
+		result = _mm_xor_si128(result, hi);
+		result = _mm_xor_si128(result, lo);
+#endif
+	}
+
+	// do Barrett reduction back into 32-bit field
+	const __m128i reduction_const = _mm_set_epi32(0, 0x04c11db7, 1, 0x04d101df);
+	t = _mm_clmulepi64_si128(_mm_blend_epi16(_mm_setzero_si128(), result, 0x3c), reduction_const, 0);
+	t = _mm_clmulepi64_si128(t, reduction_const, 0x11);
+	result = _mm_xor_si128(t, result);
+
+	result = reverse_bits_epi8(result);
+	return BSWAP32(_mm_cvtsi128_si32(result));
+}
+#endif
+
+
+void RapidYenc::crc_clmul_set_funcs() {
 	_do_crc32_incremental = &do_crc32_incremental_clmul;
+	_crc32_multiply = &crc32_multiply_clmul;
+#if defined(__GNUC__) || defined(_MSC_VER)
+	_crc32_shift = &crc32_shift_clmul;
+#endif
 	_crc32_isa = ISA_LEVEL_PCLMUL;
 }
 #else
-void crc_clmul_set_funcs() {}
+void RapidYenc::crc_clmul_set_funcs() {}
 #endif
 
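reverse_bits_epi8() added above reverses the bits of every byte either with a single GFNI affine instruction or with two 16-entry nibble lookups via _mm_shuffle_epi8. A scalar equivalent of the nibble trick, for reference only (not part of the package):

```c
#include <stdint.h>

/* Reverse the bits of one byte with the same reversed-nibble table that the
 * SSSE3 path of reverse_bits_epi8() feeds to _mm_shuffle_epi8 (16 bytes at once). */
static uint8_t reverse_bits_u8(uint8_t b) {
	static const uint8_t rev_nibble[16] = {
		0x0, 0x8, 0x4, 0xC, 0x2, 0xA, 0x6, 0xE,
		0x1, 0x9, 0x5, 0xD, 0x3, 0xB, 0x7, 0xF
	};
	return (uint8_t)((rev_nibble[b & 0x0F] << 4) | rev_nibble[b >> 4]);
}
```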
package/src/crc_folding_256.cc
CHANGED
@@ -99,19 +99,12 @@ ALIGN_TO(16, static const unsigned crc_k[]) = {
 
 
 static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
-
-
-
-
-	xmm_t0 = _mm_clmulepi64_si128(xmm_t0, _mm_set_epi32(0, 0, 0xa273bc24, 0), 0); // reverse(0x487b9c8a)<<1 == 0xa273bc24
-	__m128i reduction = _mm_set_epi32( // polynomial reduction factors
-		1, 0xdb710640, // G* = 0x04c11db7
-		0, 0xf7011641 // Q+ = 0x04d101df (+1 to save an additional xor operation)
+	__m128i xmm_t0 = _mm_clmulepi64_si128(
+		_mm_cvtsi32_si128(~initial),
+		_mm_cvtsi32_si128(0xdfded7ec),
+		0
 	);
-	__m128i xmm_t1 = _mm_clmulepi64_si128(xmm_t0, reduction, 0);
-	xmm_t1 = _mm_clmulepi64_si128(xmm_t1, reduction, 0x10);
 
-	xmm_t0 = _mm_srli_si128(_mm_xor_si128(xmm_t0, xmm_t1), 8);
 	__m256i crc0 = zext128_256(xmm_t0);
 	__m256i crc1 = _mm256_setzero_si256();
 
@@ -217,13 +210,13 @@ static uint32_t do_crc32_incremental_clmul(const void* data, size_t length, uint
 	return crc_fold((const unsigned char*)data, (long)length, init);
 }
 
-void crc_clmul256_set_funcs() {
+void RapidYenc::crc_clmul256_set_funcs() {
+	crc_clmul_set_funcs(); // set multiply/shift function
 	_do_crc32_incremental = &do_crc32_incremental_clmul;
 	_crc32_isa = ISA_LEVEL_VPCLMUL;
 }
 #else
-void
-void crc_clmul256_set_funcs() {
+void RapidYenc::crc_clmul256_set_funcs() {
 	crc_clmul_set_funcs();
 }
 #endif