yencode 1.1.4 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +130 -189
- package/binding.gyp +79 -6
- package/index.js +2 -0
- package/package.json +1 -1
- package/src/common.h +39 -1
- package/src/crc.cc +89 -23
- package/src/crc.h +68 -2
- package/src/crc_arm.cc +54 -37
- package/src/crc_common.h +11 -0
- package/src/crc_folding.cc +155 -18
- package/src/crc_folding_256.cc +12 -16
- package/src/crc_riscv.cc +251 -0
- package/src/decoder.cc +37 -3
- package/src/decoder.h +4 -0
- package/src/decoder_avx.cc +3 -2
- package/src/decoder_avx2.cc +2 -1
- package/src/decoder_avx2_base.h +6 -24
- package/src/decoder_common.h +61 -49
- package/src/decoder_neon.cc +10 -26
- package/src/decoder_neon64.cc +10 -22
- package/src/decoder_rvv.cc +274 -0
- package/src/decoder_sse2.cc +24 -2
- package/src/decoder_sse_base.h +11 -45
- package/src/decoder_ssse3.cc +3 -2
- package/src/decoder_vbmi2.cc +2 -5
- package/src/encoder.cc +28 -0
- package/src/encoder.h +4 -0
- package/src/encoder_avx.cc +1 -0
- package/src/encoder_avx2.cc +1 -0
- package/src/encoder_common.h +2 -20
- package/src/encoder_neon.cc +1 -0
- package/src/encoder_rvv.cc +5 -19
- package/src/encoder_sse2.cc +1 -0
- package/src/encoder_ssse3.cc +1 -0
- package/src/encoder_vbmi2.cc +2 -0
- package/src/platform.cc +4 -4
- package/src/yencode.cc +45 -3
- package/test/testcrc.js +19 -3
- package/test/testcrcfuncs.c +53 -0
- package/test/testdec.js +2 -1
- package/test/testenc.js +1 -1
package/src/crc_folding_256.cc CHANGED

@@ -1,6 +1,8 @@
 // 256-bit version of crc_folding
 
 #include "crc_common.h"
+
+void crc_clmul_set_funcs();
 
 #if !defined(YENC_DISABLE_AVX256) && ((defined(__VPCLMULQDQ__) && defined(__AVX2__) && defined(__PCLMUL__)) || (defined(_MSC_VER) && _MSC_VER >= 1920 && defined(PLATFORM_X86) && !defined(__clang__)))
 #include <inttypes.h>
@@ -99,19 +101,12 @@ ALIGN_TO(16, static const unsigned crc_k[]) = {
 
 
 static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
-
-
-
-
-    xmm_t0 = _mm_clmulepi64_si128(xmm_t0, _mm_set_epi32(0, 0, 0xa273bc24, 0), 0); // reverse(0x487b9c8a)<<1 == 0xa273bc24
-    __m128i reduction = _mm_set_epi32( // polynomial reduction factors
-        1, 0xdb710640, // G* = 0x04c11db7
-        0, 0xf7011641 // Q+ = 0x04d101df (+1 to save an additional xor operation)
+    __m128i xmm_t0 = _mm_clmulepi64_si128(
+        _mm_cvtsi32_si128(~initial),
+        _mm_cvtsi32_si128(0xdfded7ec),
+        0
     );
-    __m128i xmm_t1 = _mm_clmulepi64_si128(xmm_t0, reduction, 0);
-    xmm_t1 = _mm_clmulepi64_si128(xmm_t1, reduction, 0x10);
 
-    xmm_t0 = _mm_srli_si128(_mm_xor_si128(xmm_t0, xmm_t1), 8);
     __m256i crc0 = zext128_256(xmm_t0);
     __m256i crc1 = _mm256_setzero_si256();
 
@@ -217,13 +212,14 @@ static uint32_t do_crc32_incremental_clmul(const void* data, size_t length, uint
     return crc_fold((const unsigned char*)data, (long)length, init);
 }
 
-void crc_clmul256_set_funcs(
-
+void crc_clmul256_set_funcs() {
+    crc_clmul_set_funcs(); // set multiply/shift function
+    _do_crc32_incremental = &do_crc32_incremental_clmul;
+    _crc32_isa = ISA_LEVEL_VPCLMUL;
 }
 #else
-void
-
-    crc_clmul_set_funcs(_do_crc32_incremental);
+void crc_clmul256_set_funcs() {
+    crc_clmul_set_funcs();
 }
 #endif
 
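A note for readers of the hunks above: every multiplication constant follows the convention the removed comment spells out, a bit-reflected polynomial term shifted left by one (the 0x1db710640 used in the new crc_riscv.cc below is reverse(0x04c11db7) << 1, carry bit included). The stated identities can be verified with a few lines of standalone C; this sketch is ours, not part of the package:

    #include <stdint.h>
    #include <stdio.h>

    /* bit-reverse a 32-bit value (bit 0 <-> bit 31, etc.) */
    static uint32_t reverse32(uint32_t v) {
        v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
        v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
        v = ((v >> 4) & 0x0f0f0f0f) | ((v & 0x0f0f0f0f) << 4);
        v = ((v >> 8) & 0x00ff00ff) | ((v & 0x00ff00ff) << 8);
        return (v >> 16) | (v << 16);
    }

    int main(void) {
        /* from the removed comment: reverse(0x487b9c8a)<<1 == 0xa273bc24 */
        printf("%08x\n", reverse32(0x487b9c8a) << 1); /* prints a273bc24 */
        /* G* = 0x04c11db7 is the CRC-32 polynomial; its reflection is the familiar 0xedb88320 */
        printf("%08x\n", reverse32(0x04c11db7));      /* prints edb88320 */
        return 0;
    }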
package/src/crc_riscv.cc ADDED

@@ -0,0 +1,251 @@
+#include "crc_common.h"
+
+#if defined(__riscv) && defined(__GNUC__) && (defined(__riscv_zbkc) || defined(__riscv_zbc))
+
+#if __has_include(<riscv_bitmanip.h>)
+# include <riscv_bitmanip.h>
+# if __riscv_xlen == 64
+#  define rv_clmul __riscv_clmul_64
+#  define rv_clmulh __riscv_clmulh_64
+# else
+#  define rv_clmul __riscv_clmul_32
+#  define rv_clmulh __riscv_clmulh_32
+# endif
+#else
+static HEDLEY_ALWAYS_INLINE uintptr_t rv_clmul(uintptr_t x, uintptr_t y) {
+    uintptr_t r;
+    __asm__("clmul %0, %1, %2\n"
+        : "=r"(r)
+        : "r"(x), "r"(y)
+        :);
+    return r;
+}
+static HEDLEY_ALWAYS_INLINE uintptr_t rv_clmulh(uintptr_t x, uintptr_t y) {
+    uintptr_t r;
+    __asm__("clmulh %0, %1, %2\n"
+        : "=r"(r)
+        : "r"(x), "r"(y)
+        :);
+    return r;
+}
+#endif
+
+// TODO: test big-endian
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+# if __riscv_xlen == 64
+#  define SWAP __builtin_bswap64
+# else
+#  define SWAP __builtin_bswap32
+# endif
+#else
+# define SWAP(d) (d)
+#endif
+static HEDLEY_ALWAYS_INLINE uintptr_t read_partial(const void* p, unsigned sz) {
+    uintptr_t data = 0;
+    memcpy(&data, p, sz);
+    return SWAP(data);
+}
+static HEDLEY_ALWAYS_INLINE uintptr_t read_full(const uintptr_t* p) {
+    return SWAP(*p);
+}
+#undef SWAP
+
+static uint32_t rv_crc_calc(uint32_t crc, const unsigned char *src, long len) {
+    uintptr_t accum[4] = {};
+
+    // note: constants here are bit-reflected and shifted left by 1
+    // Zbc does also have clmulr to avoid the shift, but:
+    // - there's no clmulhr, so for XLEN=64, just shift the constant instead to get the same result
+    // - it's unavailable in Zbkc
+    // - for XLEN=32, 2x constants is likely worth it to avoid the additional XORs in the loop
+
+#if __riscv_xlen == 64
+    const uint64_t MUL_HI = 0x15a546366 /*2^224*/, MUL_LO = 0xf1da05aa /*2^288*/;
+#define CLMULL rv_clmul
+#define CLMULH rv_clmulh
+
+    accum[3] = rv_clmul(crc, 0xb66b1fa6); // 2^-32
+#elif __riscv_xlen == 32
+    const uint64_t MUL_HI = 0x140d44a2e /*2^128*/, MUL_LO = 0x1751997d0 /*2^160*/;
+#define CLMULL(x, k) rv_clmul(x, k & 0xffffffff)
+#define CLMULH(x, k) (rv_clmulh(x, k & 0xffffffff) ^ (k > 0xffffffffULL ? (x) : 0))
+
+    accum[2] = rv_clmul(crc, 0xb66b1fa6);
+    accum[3] = rv_clmulh(crc, 0xb66b1fa6);
+#else
+#error "Unknown __riscv_xlen"
+#endif
+    const size_t WS = sizeof(uintptr_t);
+
+    // if src isn't word-aligned, process until it is so
+    long initial_alignment = ((uintptr_t)src & (WS-1));
+    long initial_process = WS - initial_alignment;
+    if(initial_alignment && len >= initial_process) {
+        unsigned shl = initial_alignment * 8, shr = initial_process * 8;
+#if __riscv_xlen == 64
+        accum[2] = accum[3] << shl;
+#else
+        accum[1] = accum[2] << shl;
+        accum[2] = (accum[3] << shl) | (accum[2] >> shr);
+#endif
+        accum[3] = (read_partial(src, initial_process) << shl) | (accum[3] >> shr);
+        src += initial_process;
+        len -= initial_process;
+    }
+
+    // main processing loop
+    const uintptr_t* srcW = (const uintptr_t*)src;
+    while((len -= WS*4) >= 0) {
+        uintptr_t tmpHi, tmpLo;
+        tmpLo = CLMULL(accum[0], MUL_LO) ^ CLMULL(accum[1], MUL_HI);
+        tmpHi = CLMULH(accum[0], MUL_LO) ^ CLMULH(accum[1], MUL_HI);
+        accum[0] = tmpLo ^ read_full(srcW++);
+        accum[1] = tmpHi ^ read_full(srcW++);
+
+        tmpLo = CLMULL(accum[2], MUL_LO) ^ CLMULL(accum[3], MUL_HI);
+        tmpHi = CLMULH(accum[2], MUL_LO) ^ CLMULH(accum[3], MUL_HI);
+        accum[2] = tmpLo ^ read_full(srcW++);
+        accum[3] = tmpHi ^ read_full(srcW++);
+    }
+
+    // process trailing bytes
+    if(len & (WS*2)) {
+        uintptr_t tmpLo = CLMULL(accum[0], MUL_LO) ^ CLMULL(accum[1], MUL_HI);
+        uintptr_t tmpHi = CLMULH(accum[0], MUL_LO) ^ CLMULH(accum[1], MUL_HI);
+        accum[0] = accum[2];
+        accum[1] = accum[3];
+        accum[2] = tmpLo ^ read_full(srcW++);
+        accum[3] = tmpHi ^ read_full(srcW++);
+    }
+    if(len & WS) {
+        uintptr_t tmpLo = CLMULL(accum[0], MUL_HI);
+        uintptr_t tmpHi = CLMULH(accum[0], MUL_HI);
+        accum[0] = accum[1];
+        accum[1] = accum[2];
+        accum[2] = accum[3] ^ tmpLo;
+        accum[3] = tmpHi ^ read_full(srcW++);
+    }
+
+    size_t tail = len & (WS-1);
+    if(tail) {
+        unsigned shl = ((WS - tail) * 8), shr = tail * 8;
+        uintptr_t tmp = accum[0] << shl;
+        uintptr_t tmpLo = CLMULL(tmp, MUL_HI);
+        uintptr_t tmpHi = CLMULH(tmp, MUL_HI);
+        accum[0] = (accum[0] >> shr) | (accum[1] << shl);
+        accum[1] = (accum[1] >> shr) | (accum[2] << shl);
+        accum[2] = (accum[2] >> shr) | (accum[3] << shl);
+        accum[3] = (accum[3] >> shr) | (read_partial(srcW, tail) << shl);
+        accum[2] ^= tmpLo;
+        accum[3] ^= tmpHi;
+    }
+
+
+    // done processing: fold everything down
+#if __riscv_xlen == 64
+    // fold 0,1 -> 2,3
+    accum[2] ^= rv_clmul(accum[0], 0x1751997d0) ^ rv_clmul(accum[1], 0xccaa009e);
+    accum[3] ^= rv_clmulh(accum[0], 0x1751997d0) ^ rv_clmulh(accum[1], 0xccaa009e);
+
+    // fold 2->3
+    accum[0] = rv_clmulh(accum[2], 0xccaa009e);
+    accum[3] ^= rv_clmul(accum[2], 0xccaa009e);
+
+    // fold 64b->32b
+    accum[1] = rv_clmul(accum[3] & 0xffffffff, 0x163cd6124);
+    accum[0] ^= accum[1] >> 32;
+    accum[3] = accum[1] ^ (accum[3] >> 32);
+    accum[3] <<= 32;
+#else
+    // fold 0,1 -> 2,3
+    accum[2] ^= rv_clmul(accum[0], 0xccaa009e) ^ CLMULL(accum[1], 0x163cd6124);
+    accum[3] ^= rv_clmulh(accum[0], 0xccaa009e) ^ CLMULH(accum[1], 0x163cd6124);
+
+    // fold 2->3
+    accum[0] = CLMULH(accum[2], 0x163cd6124);
+    accum[3] ^= CLMULL(accum[2], 0x163cd6124);
+#endif
+
+    // reduction
+    accum[3] = CLMULL(accum[3], 0xf7011641);
+    accum[3] = CLMULH(accum[3], 0x1db710640); // maybe consider clmulr for XLEN=32
+    crc = accum[0] ^ accum[3];
+    return crc;
+#undef CLMULL
+#undef CLMULH
+}
+
+static uint32_t do_crc32_incremental_rv_zbc(const void* data, size_t length, uint32_t init) {
+    return ~rv_crc_calc(~init, (const unsigned char*)data, (long)length);
+}
+
+
+#if __riscv_xlen == 64
+// note that prod is shifted by 1 place to the right, due to bit-reflection
+static uint32_t crc32_reduce_rv_zbc(uint64_t prod) {
+    uint64_t t = rv_clmul(prod << 33, 0xf7011641);
+    t = rv_clmulh(t, 0x1db710640);
+    t ^= prod >> 31;
+    return t;
+}
+#endif
+uint32_t crc32_multiply_rv_zbc(uint32_t a, uint32_t b) {
+#if __riscv_xlen == 64
+    uint64_t t = crc32_reduce_rv_zbc(rv_clmul(a, b));
+#else
+    uint32_t prodLo = rv_clmul(a, b);
+    uint32_t prodHi = rv_clmulh(a, b);
+
+    // fix prodHi for bit-reflection (clmulr would be ideal here)
+    prodHi += prodHi;
+    prodHi |= prodLo >> 31;
+    prodLo += prodLo;
+
+    uint32_t t = rv_clmul(prodLo, 0xf7011641);
+    t ^= rv_clmulh(t, 0xdb710640);
+    t ^= prodHi;
+#endif
+    return t;
+}
+
+#if defined(__GNUC__) || defined(_MSC_VER)
+uint32_t crc32_shift_rv_zbc(uint32_t crc1, uint32_t n) {
+    // TODO: require Zbb for ctz
+    uint32_t result = crc1;
+#if __riscv_xlen == 64
+    // for n<32, can shift directly
+    uint64_t prod = result;
+    prod <<= 31 ^ (n&31);
+    n &= ~31;
+    result = crc32_reduce_rv_zbc(prod);
+#endif
+    if(!n) return result;
+
+    uint32_t result2 = crc_power[ctz32(n)];
+    n &= n-1;
+
+    while(n) {
+        result = crc32_multiply_rv_zbc(result, crc_power[ctz32(n)]);
+        n &= n-1;
+
+        if(n) {
+            result2 = crc32_multiply_rv_zbc(result2, crc_power[ctz32(n)]);
+            n &= n-1;
+        }
+    }
+    return crc32_multiply_rv_zbc(result, result2);
+}
+#endif
+
+
+void crc_riscv_set_funcs() {
+    _do_crc32_incremental = &do_crc32_incremental_rv_zbc;
+    _crc32_multiply = &crc32_multiply_rv_zbc;
+#if defined(__GNUC__) || defined(_MSC_VER)
+    _crc32_shift = &crc32_shift_rv_zbc;
+#endif
+    _crc32_isa = ISA_FEATURE_ZBC;
+}
+#else
+void crc_riscv_set_funcs() {}
+#endif
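Since yencode's CRC is the standard reflected CRC-32, any of the folded kernels above can be validated against a bitwise reference implementation; the test/testcrcfuncs.c added in this release presumably fills that role for the real build. A minimal reference of our own for cross-checking small inputs:

    #include <stddef.h>
    #include <stdint.h>

    /* bitwise reflected CRC-32 (polynomial 0xEDB88320, the bit-reversed 0x04C11DB7);
       same complement convention as do_crc32_incremental_rv_zbc above */
    static uint32_t crc32_ref(const void* data, size_t len, uint32_t init) {
        const uint8_t* p = (const uint8_t*)data;
        uint32_t crc = ~init;
        while(len--) {
            crc ^= *p++;
            for(int b = 0; b < 8; b++)
                crc = (crc >> 1) ^ ((crc & 1) ? 0xEDB88320 : 0);
        }
        return ~crc;
    }
    /* well-known check value: crc32_ref("123456789", 9, 0) == 0xcbf43926 */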
package/src/decoder.cc CHANGED

@@ -7,6 +7,8 @@ extern "C" {
 YencDecoderEnd (*_do_decode)(const unsigned char**, unsigned char**, size_t, YencDecoderState*) = &do_decode_scalar<false, false>;
 YencDecoderEnd (*_do_decode_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*) = &do_decode_scalar<true, false>;
 YencDecoderEnd (*_do_decode_end_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*) = &do_decode_end_scalar<true>;
+
+int _decode_isa = ISA_GENERIC;
 }
 
 void decoder_set_sse2_funcs();
@@ -16,6 +18,7 @@ void decoder_set_avx2_funcs();
 void decoder_set_vbmi2_funcs();
 extern const bool decoder_has_avx10;
 void decoder_set_neon_funcs();
+void decoder_set_rvv_funcs();
 
 
 #if defined(PLATFORM_X86) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
@@ -23,23 +26,50 @@ void decoder_set_neon_funcs();
 # include "decoder_avx2_base.h"
 static inline void decoder_set_native_funcs() {
     ALIGN_ALLOC(lookups, sizeof(*lookups), 16);
-    decoder_init_lut(lookups->
+    decoder_init_lut(lookups->compact);
     _do_decode = &do_decode_simd<false, false, sizeof(__m256i)*2, do_decode_avx2<false, false, ISA_NATIVE> >;
     _do_decode_raw = &do_decode_simd<true, false, sizeof(__m256i)*2, do_decode_avx2<true, false, ISA_NATIVE> >;
     _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m256i)*2, do_decode_avx2<true, true, ISA_NATIVE> >;
+    _decode_isa = ISA_NATIVE;
 }
 # else
 # include "decoder_sse_base.h"
 static inline void decoder_set_native_funcs() {
-    decoder_sse_init();
-    decoder_init_lut(lookups->
+    decoder_sse_init(lookups);
+    decoder_init_lut(lookups->compact);
     _do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_NATIVE> >;
     _do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_NATIVE> >;
    _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_NATIVE> >;
+    _decode_isa = ISA_NATIVE;
 }
 # endif
 #endif
 
+
+#if defined(PLATFORM_X86) || defined(PLATFORM_ARM)
+void decoder_init_lut(void* compactLUT) {
+#ifdef YENC_DEC_USE_THINTABLE
+    const int tableSize = 8;
+#else
+    const int tableSize = 16;
+#endif
+    for(int i=0; i<(tableSize==8?256:32768); i++) {
+        int k = i;
+        uint8_t* res = (uint8_t*)compactLUT + i*tableSize;
+        int p = 0;
+        for(int j=0; j<tableSize; j++) {
+            if(!(k & 1)) {
+                res[p++] = j;
+            }
+            k >>= 1;
+        }
+        for(; p<tableSize; p++)
+            res[p] = 0x80;
+    }
+}
+#endif
+
+
 void decoder_init() {
 #ifdef PLATFORM_X86
 # if defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
@@ -62,4 +92,8 @@ void decoder_init() {
     if(cpu_supports_neon())
         decoder_set_neon_funcs();
 #endif
+#ifdef __riscv
+    if(cpu_supports_rvv())
+        decoder_set_rvv_funcs();
+#endif
 }
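The decoder_init_lut() that the per-ISA files now share builds a byte-compaction table: for each 8- or 16-bit discard mask i, row i holds the indices of the bytes whose mask bit is clear, padded out with 0x80, which is exactly the shuffle-index format pshufb (and NEON's tbl) consumes, since an index with the top bit set produces zero. A scalar sketch of one row, mirroring the loop above (our illustration, not package code):

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    /* build one LUT row: indices of bytes to keep for this discard mask */
    static void lut_row(unsigned mask, int tableSize, uint8_t* res) {
        int k = (int)mask, p = 0;
        for(int j = 0; j < tableSize; j++) {
            if(!(k & 1)) res[p++] = (uint8_t)j; /* bit clear -> byte j survives */
            k >>= 1;
        }
        for(; p < tableSize; p++) res[p] = 0x80; /* pshufb: top bit set -> output 0 */
    }

    int main(void) {
        uint8_t row[8];
        lut_row(0x05, 8, row); /* discard bytes 0 and 2 */
        const uint8_t expect[8] = {1, 3, 4, 5, 6, 7, 0x80, 0x80};
        assert(memcmp(row, expect, sizeof row) == 0);
        return 0;
    }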
package/src/decoder.h CHANGED

@@ -32,6 +32,7 @@ typedef enum {
 extern YencDecoderEnd (*_do_decode)(const unsigned char**, unsigned char**, size_t, YencDecoderState*);
 extern YencDecoderEnd (*_do_decode_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*);
 extern YencDecoderEnd (*_do_decode_end_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*);
+extern int _decode_isa;
 
 static inline size_t do_decode(int isRaw, const unsigned char* src, unsigned char* dest, size_t len, YencDecoderState* state) {
     unsigned char* ds = dest;
@@ -45,6 +46,9 @@ static inline YencDecoderEnd do_decode_end(const unsigned char** src, unsigned c
 
 void decoder_init();
 
+static inline int decode_isa_level() {
+    return _decode_isa;
+}
 
 
 #ifdef __cplusplus
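The new decode_isa_level() getter simply reports which kernel decoder_init() selected, since each decoder_set_*_funcs() now records its level in _decode_isa. A hypothetical caller, not taken from this diff:

    #include <stdio.h>
    #include "decoder.h"

    /* hypothetical usage sketch, not package code */
    void report_decoder(void) {
        decoder_init(); /* picks the best kernel for the running CPU */
        /* _decode_isa defaults to ISA_GENERIC and is overwritten by whichever
           decoder_set_*_funcs() ends up being called */
        printf("decoder ISA level: %d\n", decode_isa_level());
    }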
package/src/decoder_avx.cc CHANGED

@@ -4,11 +4,12 @@
 #include "decoder_common.h"
 #include "decoder_sse_base.h"
 void decoder_set_avx_funcs() {
-    decoder_sse_init();
-    decoder_init_lut(lookups->
+    decoder_sse_init(lookups);
+    decoder_init_lut(lookups->compact);
     _do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_SSE4_POPCNT> >;
     _do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_SSE4_POPCNT> >;
     _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_SSE4_POPCNT> >;
+    _decode_isa = ISA_LEVEL_AVX;
 }
 #else
 void decoder_set_ssse3_funcs();
package/src/decoder_avx2.cc CHANGED

@@ -5,10 +5,11 @@
 #include "decoder_avx2_base.h"
 void decoder_set_avx2_funcs() {
     ALIGN_ALLOC(lookups, sizeof(*lookups), 16);
-    decoder_init_lut(lookups->
+    decoder_init_lut(lookups->compact);
     _do_decode = &do_decode_simd<false, false, sizeof(__m256i)*2, do_decode_avx2<false, false, ISA_LEVEL_AVX2> >;
     _do_decode_raw = &do_decode_simd<true, false, sizeof(__m256i)*2, do_decode_avx2<true, false, ISA_LEVEL_AVX2> >;
     _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m256i)*2, do_decode_avx2<true, true, ISA_LEVEL_AVX2> >;
+    _decode_isa = ISA_LEVEL_AVX2;
 }
 #else
 void decoder_set_avx_funcs();
package/src/decoder_avx2_base.h CHANGED

@@ -15,7 +15,6 @@
 #pragma pack(16)
 static struct {
     /*align16*/ struct { char bytes[16]; } compact[32768];
-    uint8_t eqFix[256];
 } * HEDLEY_RESTRICT lookups;
 #pragma pack()
 
@@ -67,6 +66,8 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* src, long& len, unsigned
     );
 }
 
+    decoder_set_nextMask<isRaw>(src, len, _nextMask); // set this before the loop because we can't check src after it's been overwritten
+
     // for some reason, MSVC Win32 seems to crash when trying to compile _mm256_mask_cmpeq_epi8_mask
     // the crash can be fixed by switching the order of the last two arguments, but it seems to generate wrong code
     // so just disable the optimisation as it seems to be problematic there
@@ -320,6 +321,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* src, long& len, unsigned
             // terminator found
             // there's probably faster ways to do this, but reverting to scalar code should be good enough
             len += (long)i;
+            _nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
             break;
         }
     }
@@ -412,6 +414,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* src, long& len, unsigned
         }
         if(endFound) {
             len += (long)i;
+            _nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
             break;
         }
     }
@@ -427,16 +430,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* src, long& len, unsigned
         dataB = _mm256_add_epi8(oDataB, _mm256_set1_epi8(-42));
 
         if(LIKELIHOOD(0.0001, (mask & ((maskEq << 1) + escFirst)) != 0)) {
-
-            uint64_t maskEq2 = tmp;
-            for(int j=8; j<64; j+=8) {
-                tmp = lookups->eqFix[(unsigned)((maskEq>>j)&0xff) & ~(tmp>>7)];
-                maskEq2 |= (uint64_t)tmp<<j;
-            }
-            maskEq = maskEq2;
-
+            maskEq = fix_eqMask<uint64_t>(maskEq & ~(uint64_t)escFirst);
             mask &= ~(uint64_t)escFirst;
-            escFirst =
+            escFirst = maskEq>>63;
             // next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed
             maskEq <<= 1;
             mask &= ~maskEq;
@@ -613,20 +609,6 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* src, long& len, unsigned
         }
     }
     _escFirst = (unsigned char)escFirst;
-    if(isRaw) {
-        // this would be the trivial solution, but requires the compiler holding onto minMask throughout the loop:
-        //_nextMask = ~(uint16_t)_mm256_movemask_epi8(_mm256_cmpeq_epi8(minMask, _mm256_set1_epi8('.')));
-        // instead, just scan the memory to determine what to set nextMask to
-        if(len != 0) { // have to gone through at least one loop cycle
-            if(src[i-2] == '\r' && src[i-1] == '\n' && src[i] == '.')
-                _nextMask = 1;
-            else if(src[i-1] == '\r' && src[i] == '\n' && src[i+1] == '.')
-                _nextMask = 2;
-            else
-                _nextMask = 0;
-        }
-    } else
-        _nextMask = 0;
     _mm256_zeroupper();
 }
 #endif
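The _nextMask bookkeeping being moved around here handles yEnc dot-stuffing across SIMD block boundaries: a "\r\n." sequence can straddle two 32-byte blocks, so the decoder records how far into the sequence the previous block ended (1 = the next block starts at the '.', 2 = it starts at the '\n'). A standalone restatement of the lookback that the backtracking decoder_set_nextMask (defined in decoder_common.h below) performs, for illustration only:

    #include <assert.h>
    #include <stdint.h>

    /* `src` points at the first byte of the next SIMD block */
    static uint16_t next_mask(const uint8_t* src) {
        if(src[-2] == '\r' && src[-1] == '\n' && src[0] == '.')
            return 1; /* boundary fell right before the stuffed dot */
        if(src[-1] == '\r' && src[0] == '\n' && src[1] == '.')
            return 2; /* boundary split the \r\n pair */
        return 0;
    }

    int main(void) {
        const uint8_t* buf = (const uint8_t*)"data\r\n.more";
        assert(next_mask(buf + 6) == 1); /* next block starts at the '.' */
        assert(next_mask(buf + 5) == 2); /* next block starts at the '\n' */
        assert(next_mask(buf + 2) == 0); /* no dot-stuffing at this boundary */
        return 0;
    }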
package/src/decoder_common.h CHANGED

@@ -1,5 +1,9 @@
 #include "decoder.h"
 
+#if defined(PLATFORM_ARM) && !defined(__aarch64__)
+#define YENC_DEC_USE_THINTABLE 1
+#endif
+
 // TODO: need to support max output length somehow
 // TODO: add branch probabilities
 
@@ -178,24 +182,24 @@ YencDecoderEnd do_decode_end_scalar(const unsigned char** src, unsigned char** d
         if(es[i] == '.' && isRaw) {
             i++;
             YDEC_CHECK_END(YDEC_STATE_CRLFDT)
-            // fall-through
         } else if(es[i] == '=') {
             i++;
             YDEC_CHECK_END(YDEC_STATE_CRLFEQ)
             goto do_decode_endable_scalar_ceq;
         } else
             break;
+        // fall-through
     case YDEC_STATE_CRLFDT:
         if(isRaw && es[i] == '\r') {
             i++;
             YDEC_CHECK_END(YDEC_STATE_CRLFDTCR)
-            // fall-through
         } else if(isRaw && es[i] == '=') { // check for dot-stuffed ending: \r\n.=y
             i++;
             YDEC_CHECK_END(YDEC_STATE_CRLFEQ)
             goto do_decode_endable_scalar_ceq;
         } else
             break;
+        // fall-through
     case YDEC_STATE_CRLFDTCR:
         if(es[i] == '\n') {
             if(isRaw) {
@@ -331,8 +335,8 @@ YencDecoderEnd do_decode_scalar(const unsigned char** src, unsigned char** dest,
 
 
 
-template<bool isRaw, bool searchEnd,
-YencDecoderEnd
+template<bool isRaw, bool searchEnd, void(&kernel)(const uint8_t*, long&, unsigned char*&, unsigned char&, uint16_t&)>
+inline YencDecoderEnd _do_decode_simd(size_t width, const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
     if(len <= width*2) return do_decode_scalar<isRaw, searchEnd>(src, dest, len, state);
 
     YencDecoderState tState = YDEC_STATE_CRLF;
@@ -461,52 +465,60 @@ YencDecoderEnd do_decode_simd(const unsigned char** src, unsigned char** dest, s
     return YDEC_END_NONE;
 }
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            }
-            k >>= 1;
-        }
-        for(; p<8; p++)
-            res[p] = 0x80;
-#endif
-    }
-#ifndef YENC_DEC_USE_THINTABLE
-    for(int i=0; i<32768; i++) {
-        int k = i;
-        uint8_t* res = (uint8_t*)compactLUT + i*16;
-        int p = 0;
-
-        for(int j=0; j<16; j++) {
-            if(!(k & 1)) {
-                res[p++] = j;
-            }
-            k >>= 1;
+template<bool isRaw, bool searchEnd, size_t width, void(&kernel)(const uint8_t*, long&, unsigned char*&, unsigned char&, uint16_t&)>
+YencDecoderEnd do_decode_simd(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
+    return _do_decode_simd<isRaw, searchEnd, kernel>(width, src, dest, len, state);
+}
+template<bool isRaw, bool searchEnd, size_t(&getWidth)(), void(&kernel)(const uint8_t*, long&, unsigned char*&, unsigned char&, uint16_t&)>
+YencDecoderEnd do_decode_simd(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
+    return _do_decode_simd<isRaw, searchEnd, kernel>(getWidth(), src, dest, len, state);
+}
+
+
+#if defined(PLATFORM_X86) || defined(PLATFORM_ARM)
+void decoder_init_lut(void* compactLUT);
+#endif
+
+template<bool isRaw>
+static inline void decoder_set_nextMask(const uint8_t* src, size_t len, uint16_t& nextMask) {
+    if(isRaw) {
+        if(len != 0) { // have to gone through at least one loop cycle
+            if(src[-2] == '\r' && src[-1] == '\n' && src[0] == '.')
+                nextMask = 1;
+            else if(src[-1] == '\r' && src[0] == '\n' && src[1] == '.')
+                nextMask = 2;
+            else
+                nextMask = 0;
         }
-
-
+    } else
+        nextMask = 0;
+}
+
+// without backtracking
+template<bool isRaw>
+static inline uint16_t decoder_set_nextMask(const uint8_t* src, unsigned mask) {
+    if(isRaw) {
+        if(src[0] == '.')
+            return mask & 1;
+        if(src[1] == '.')
+            return mask & 2;
     }
-
+    return 0;
 }
 
+// resolve invalid sequences of = to deal with cases like '===='
+// bit hack inspired from simdjson: https://youtu.be/wlvKAT7SZIQ?t=33m38s
+template<typename T>
+static inline T fix_eqMask(T mask) {
+    // isolate the start of each consecutive bit group (e.g. 01011101 -> 01000101)
+    T start = mask & ~(mask << 1);
+
+    const T odd = (T)0xaaaaaaaaaaaaaaaa; // every odd bit (10101010...)
+
+    // obtain groups which start on an even bit (clear groups that start on an odd bit, but this leaves an unwanted trailing bit)
+    T evenGroups = mask + (start & odd);
+
+    // clear odd bits in even groups, whilst conversely preserving odd bits in odd groups
+    // the `& mask` also conveniently gets rid of unwanted trailing bits
+    return (evenGroups ^ odd) & mask;
+}