yencode 1.1.5 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +130 -189
- package/binding.gyp +115 -6
- package/index.js +2 -0
- package/package.json +1 -1
- package/src/common.h +37 -7
- package/src/crc.cc +121 -47
- package/src/crc.h +74 -10
- package/src/crc_arm.cc +51 -34
- package/src/crc_arm_pmull.cc +215 -0
- package/src/crc_common.h +22 -0
- package/src/crc_folding.cc +154 -16
- package/src/crc_folding_256.cc +7 -14
- package/src/crc_riscv.cc +251 -0
- package/src/decoder.cc +373 -13
- package/src/decoder.h +10 -14
- package/src/decoder_avx.cc +5 -6
- package/src/decoder_avx2.cc +8 -9
- package/src/decoder_avx2_base.h +7 -11
- package/src/decoder_common.h +56 -373
- package/src/decoder_neon.cc +13 -19
- package/src/decoder_neon64.cc +12 -15
- package/src/decoder_rvv.cc +280 -0
- package/src/decoder_sse2.cc +26 -5
- package/src/decoder_sse_base.h +20 -40
- package/src/decoder_ssse3.cc +5 -6
- package/src/decoder_vbmi2.cc +6 -13
- package/src/encoder.cc +42 -26
- package/src/encoder.h +5 -7
- package/src/encoder_avx.cc +3 -3
- package/src/encoder_avx2.cc +3 -3
- package/src/encoder_avx_base.h +3 -0
- package/src/encoder_common.h +26 -32
- package/src/encoder_neon.cc +6 -3
- package/src/encoder_rvv.cc +13 -26
- package/src/encoder_sse2.cc +3 -2
- package/src/encoder_sse_base.h +2 -0
- package/src/encoder_ssse3.cc +3 -3
- package/src/encoder_vbmi2.cc +6 -7
- package/src/platform.cc +24 -23
- package/src/yencode.cc +54 -11
- package/test/_speedbase.js +4 -2
- package/test/speeddec.js +25 -16
- package/test/speedenc.js +21 -17
- package/test/testcrc.js +17 -1
- package/test/testcrcfuncs.c +53 -0
- package/test/testdec.js +1 -0
package/src/crc_riscv.cc
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
#include "crc_common.h"
|
|
2
|
+
|
|
3
|
+
#if defined(__riscv) && defined(__GNUC__) && (defined(__riscv_zbkc) || defined(__riscv_zbc))
|
|
4
|
+
|
|
5
|
+
#if __has_include(<riscv_bitmanip.h>)
|
|
6
|
+
# include <riscv_bitmanip.h>
|
|
7
|
+
# if __riscv_xlen == 64
|
|
8
|
+
# define rv_clmul __riscv_clmul_64
|
|
9
|
+
# define rv_clmulh __riscv_clmulh_64
|
|
10
|
+
# else
|
|
11
|
+
# define rv_clmul __riscv_clmul_32
|
|
12
|
+
# define rv_clmulh __riscv_clmulh_32
|
|
13
|
+
# endif
|
|
14
|
+
#else
|
|
15
|
+
// Fallback for toolchains without <riscv_bitmanip.h>: emits the `clmul`
// instruction directly and returns its XLEN-wide result for operands x and y.
static HEDLEY_ALWAYS_INLINE uintptr_t rv_clmul(uintptr_t x, uintptr_t y) {
	uintptr_t product;
	__asm__(
		"clmul %0, %1, %2\n"
		: "=r"(product)
		: "r"(x), "r"(y)
		:
	);
	return product;
}
|
|
23
|
+
// Fallback for toolchains without <riscv_bitmanip.h>: emits the `clmulh`
// instruction directly and returns its XLEN-wide result for operands x and y.
static HEDLEY_ALWAYS_INLINE uintptr_t rv_clmulh(uintptr_t x, uintptr_t y) {
	uintptr_t product_hi;
	__asm__(
		"clmulh %0, %1, %2\n"
		: "=r"(product_hi)
		: "r"(x), "r"(y)
		:
	);
	return product_hi;
}
|
|
31
|
+
#endif
|
|
32
|
+
|
|
33
|
+
// TODO: test big-endian
|
|
34
|
+
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
|
35
|
+
# if __riscv_xlen == 64
|
|
36
|
+
# define SWAP __builtin_bswap64
|
|
37
|
+
# else
|
|
38
|
+
# define SWAP __builtin_bswap32
|
|
39
|
+
# endif
|
|
40
|
+
#else
|
|
41
|
+
# define SWAP(d) (d)
|
|
42
|
+
#endif
|
|
43
|
+
// Copies the first `sz` bytes at `p` into a zero-initialised machine word and
// passes it through SWAP (a byte swap on big-endian builds, identity otherwise).
// Callers in this file always pass sz < sizeof(uintptr_t).
static HEDLEY_ALWAYS_INLINE uintptr_t read_partial(const void* p, unsigned sz) {
	uintptr_t word = 0;
	memcpy(&word, p, sz);
	return SWAP(word);
}
|
|
48
|
+
// Loads one whole machine word from `p` and passes it through SWAP
// (a byte swap on big-endian builds, identity otherwise).
static HEDLEY_ALWAYS_INLINE uintptr_t read_full(const uintptr_t* p) {
	const uintptr_t word = *p;
	return SWAP(word);
}
|
|
51
|
+
#undef SWAP
|
|
52
|
+
|
|
53
|
+
// Core CRC32 routine built on the carry-less multiply helpers (rv_clmul / rv_clmulh).
//
//   crc - starting value; the caller in this file passes the bit-inverted running CRC
//   src - input bytes; need not be word-aligned
//   len - number of input bytes (signed: the main loop deliberately drives it negative)
//
// Returns the resulting CRC value, without any final inversion (the caller inverts it).
//
// Outline, as the code below does it:
//   1. seed accum[] from `crc`
//   2. consume leading bytes until `src` is word-aligned
//   3. main loop: four words per iteration, folded into accum[0..3]
//   4. leftover 2 words / 1 word / sub-word tail
//   5. fold accum[0..2] into accum[3] and reduce to 32 bits
static uint32_t rv_crc_calc(uint32_t crc, const unsigned char *src, long len) {
	// running state: four machine words, all zero until seeded below
	uintptr_t accum[4] = {};
	
	// note: constants here are bit-reflected and shifted left by 1
	// Zbc does also have clmulr to avoid the shift, but:
	// - there's no clmulhr, so for XLEN=64, just shift the constant instead to get the same result
	// - it's unavailable in Zbkc
	// - for XLEN=32, 2x constants is likely worth it to avoid the additional XORs in the loop
	
#if __riscv_xlen == 64
	const uint64_t MUL_HI = 0x15a546366 /*2^224*/, MUL_LO = 0xf1da05aa /*2^288*/;
	// on 64-bit, constants fit in a register so the helpers are used directly
	#define CLMULL rv_clmul
	#define CLMULH rv_clmulh
	
	accum[3] = rv_clmul(crc, 0xb66b1fa6); // 2^-32
#elif __riscv_xlen == 32
	const uint64_t MUL_HI = 0x140d44a2e /*2^128*/, MUL_LO = 0x1751997d0 /*2^160*/;
	// on 32-bit, a constant may be 33 bits wide: multiply by its low 32 bits, and for
	// the high half XOR in `x` once more when bit 32 of the constant is set
	#define CLMULL(x, k) rv_clmul(x, k & 0xffffffff)
	#define CLMULH(x, k) (rv_clmulh(x, k & 0xffffffff) ^ (k > 0xffffffffULL ? (x) : 0))
	
	// the seed product spans two 32-bit words here
	accum[2] = rv_clmul(crc, 0xb66b1fa6);
	accum[3] = rv_clmulh(crc, 0xb66b1fa6);
#else
	#error "Unknown __riscv_xlen"
#endif
	const size_t WS = sizeof(uintptr_t); // word size in bytes
	
	// if src isn't word-aligned, process until it is so
	long initial_alignment = ((uintptr_t)src & (WS-1));
	long initial_process = WS - initial_alignment;
	// skipped when the input is shorter than the distance to the next aligned address;
	// in that case only the memcpy-based tail read below touches `src`
	if(initial_alignment && len >= initial_process) {
		unsigned shl = initial_alignment * 8, shr = initial_process * 8;
		// shift the seeded state down by `initial_process` bytes and bring the new
		// bytes in at the top of accum[3]
#if __riscv_xlen == 64
		accum[2] = accum[3] << shl;
#else
		accum[1] = accum[2] << shl;
		accum[2] = (accum[3] << shl) | (accum[2] >> shr);
#endif
		accum[3] = (read_partial(src, initial_process) << shl) | (accum[3] >> shr);
		src += initial_process;
		len -= initial_process;
	}
	
	// main processing loop
	const uintptr_t* srcW = (const uintptr_t*)src;
	// len is decremented before the test, so it ends up negative once fewer than four
	// words remain; the bit tests after the loop read the leftover count from its low bits
	while((len -= WS*4) >= 0) {
		uintptr_t tmpHi, tmpLo;
		// fold the word pair accum[0..1] forward and mix in the next two input words
		tmpLo = CLMULL(accum[0], MUL_LO) ^ CLMULL(accum[1], MUL_HI);
		tmpHi = CLMULH(accum[0], MUL_LO) ^ CLMULH(accum[1], MUL_HI);
		accum[0] = tmpLo ^ read_full(srcW++);
		accum[1] = tmpHi ^ read_full(srcW++);
		
		// same for the word pair accum[2..3]
		tmpLo = CLMULL(accum[2], MUL_LO) ^ CLMULL(accum[3], MUL_HI);
		tmpHi = CLMULH(accum[2], MUL_LO) ^ CLMULH(accum[3], MUL_HI);
		accum[2] = tmpLo ^ read_full(srcW++);
		accum[3] = tmpHi ^ read_full(srcW++);
	}
	
	// process trailing bytes
	if(len & (WS*2)) {
		// two whole words left: fold the oldest pair, slide the state down by two words
		uintptr_t tmpLo = CLMULL(accum[0], MUL_LO) ^ CLMULL(accum[1], MUL_HI);
		uintptr_t tmpHi = CLMULH(accum[0], MUL_LO) ^ CLMULH(accum[1], MUL_HI);
		accum[0] = accum[2];
		accum[1] = accum[3];
		accum[2] = tmpLo ^ read_full(srcW++);
		accum[3] = tmpHi ^ read_full(srcW++);
	}
	if(len & WS) {
		// one whole word left: fold the oldest word, slide the state down by one word
		uintptr_t tmpLo = CLMULL(accum[0], MUL_HI);
		uintptr_t tmpHi = CLMULH(accum[0], MUL_HI);
		accum[0] = accum[1];
		accum[1] = accum[2];
		accum[2] = accum[3] ^ tmpLo;
		accum[3] = tmpHi ^ read_full(srcW++);
	}
	
	size_t tail = len & (WS-1);
	if(tail) {
		// fewer than WS bytes left: slide the state down by `tail` bytes, folding the
		// bytes that fall off the bottom of accum[0] back into the top two words
		unsigned shl = ((WS - tail) * 8), shr = tail * 8;
		uintptr_t tmp = accum[0] << shl;
		uintptr_t tmpLo = CLMULL(tmp, MUL_HI);
		uintptr_t tmpHi = CLMULH(tmp, MUL_HI);
		accum[0] = (accum[0] >> shr) | (accum[1] << shl);
		accum[1] = (accum[1] >> shr) | (accum[2] << shl);
		accum[2] = (accum[2] >> shr) | (accum[3] << shl);
		accum[3] = (accum[3] >> shr) | (read_partial(srcW, tail) << shl);
		accum[2] ^= tmpLo;
		accum[3] ^= tmpHi;
	}
	
	
	// done processing: fold everything down
#if __riscv_xlen == 64
	// fold 0,1 -> 2,3
	accum[2] ^= rv_clmul(accum[0], 0x1751997d0) ^ rv_clmul(accum[1], 0xccaa009e);
	accum[3] ^= rv_clmulh(accum[0], 0x1751997d0) ^ rv_clmulh(accum[1], 0xccaa009e);
	
	// fold 2->3
	accum[0] = rv_clmulh(accum[2], 0xccaa009e);
	accum[3] ^= rv_clmul(accum[2], 0xccaa009e);
	
	// fold 64b->32b
	accum[1] = rv_clmul(accum[3] & 0xffffffff, 0x163cd6124);
	accum[0] ^= accum[1] >> 32;
	accum[3] = accum[1] ^ (accum[3] >> 32);
	accum[3] <<= 32;
#else
	// fold 0,1 -> 2,3
	accum[2] ^= rv_clmul(accum[0], 0xccaa009e) ^ CLMULL(accum[1], 0x163cd6124);
	accum[3] ^= rv_clmulh(accum[0], 0xccaa009e) ^ CLMULH(accum[1], 0x163cd6124);
	
	// fold 2->3
	accum[0] = CLMULH(accum[2], 0x163cd6124);
	accum[3] ^= CLMULL(accum[2], 0x163cd6124);
#endif
	
	// reduction
	accum[3] = CLMULL(accum[3], 0xf7011641);
	accum[3] = CLMULH(accum[3], 0x1db710640); // maybe consider clmulr for XLEN=32
	// the return type truncates this to 32 bits
	crc = accum[0] ^ accum[3];
	return crc;
	// the helper macros are local to this function
	#undef CLMULL
	#undef CLMULH
}
|
|
177
|
+
|
|
178
|
+
static uint32_t do_crc32_incremental_rv_zbc(const void* data, size_t length, uint32_t init) {
|
|
179
|
+
return ~rv_crc_calc(~init, (const unsigned char*)data, (long)length);
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
#if __riscv_xlen == 64
|
|
184
|
+
// note that prod is shifted by 1 place to the right, due to bit-reflection
|
|
185
|
+
static uint32_t crc32_reduce_rv_zbc(uint64_t prod) {
|
|
186
|
+
uint64_t t = rv_clmul(prod << 33, 0xf7011641);
|
|
187
|
+
t = rv_clmulh(t, 0x1db710640);
|
|
188
|
+
t ^= prod >> 31;
|
|
189
|
+
return t;
|
|
190
|
+
}
|
|
191
|
+
#endif
|
|
192
|
+
static uint32_t crc32_multiply_rv_zbc(uint32_t a, uint32_t b) {
|
|
193
|
+
#if __riscv_xlen == 64
|
|
194
|
+
uint64_t t = crc32_reduce_rv_zbc(rv_clmul(a, b));
|
|
195
|
+
#else
|
|
196
|
+
uint32_t prodLo = rv_clmul(a, b);
|
|
197
|
+
uint32_t prodHi = rv_clmulh(a, b);
|
|
198
|
+
|
|
199
|
+
// fix prodHi for bit-reflection (clmulr would be ideal here)
|
|
200
|
+
prodHi += prodHi;
|
|
201
|
+
prodHi |= prodLo >> 31;
|
|
202
|
+
prodLo += prodLo;
|
|
203
|
+
|
|
204
|
+
uint32_t t = rv_clmul(prodLo, 0xf7011641);
|
|
205
|
+
t ^= rv_clmulh(t, 0xdb710640);
|
|
206
|
+
t ^= prodHi;
|
|
207
|
+
#endif
|
|
208
|
+
return t;
|
|
209
|
+
}
|
|
210
|
+
|
|
211
|
+
#if defined(__GNUC__) || defined(_MSC_VER)
|
|
212
|
+
static uint32_t crc32_shift_rv_zbc(uint32_t crc1, uint32_t n) {
|
|
213
|
+
// TODO: require Zbb for ctz
|
|
214
|
+
uint32_t result = crc1;
|
|
215
|
+
#if __riscv_xlen == 64
|
|
216
|
+
// for n<32, can shift directly
|
|
217
|
+
uint64_t prod = result;
|
|
218
|
+
prod <<= 31 ^ (n&31);
|
|
219
|
+
n &= ~31;
|
|
220
|
+
result = crc32_reduce_rv_zbc(prod);
|
|
221
|
+
#endif
|
|
222
|
+
if(!n) return result;
|
|
223
|
+
|
|
224
|
+
uint32_t result2 = RapidYenc::crc_power[ctz32(n)];
|
|
225
|
+
n &= n-1;
|
|
226
|
+
|
|
227
|
+
while(n) {
|
|
228
|
+
result = crc32_multiply_rv_zbc(result, RapidYenc::crc_power[ctz32(n)]);
|
|
229
|
+
n &= n-1;
|
|
230
|
+
|
|
231
|
+
if(n) {
|
|
232
|
+
result2 = crc32_multiply_rv_zbc(result2, RapidYenc::crc_power[ctz32(n)]);
|
|
233
|
+
n &= n-1;
|
|
234
|
+
}
|
|
235
|
+
}
|
|
236
|
+
return crc32_multiply_rv_zbc(result, result2);
|
|
237
|
+
}
|
|
238
|
+
#endif
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
// Points the CRC dispatch pointers at the Zbc/Zbkc implementations in this file
// and records the ISA level in use.
void RapidYenc::crc_riscv_set_funcs() {
	_crc32_isa = ISA_FEATURE_ZBC;
	
	_do_crc32_incremental = do_crc32_incremental_rv_zbc;
	_crc32_multiply = crc32_multiply_rv_zbc;
#if defined(__GNUC__) || defined(_MSC_VER)
	// the shift routine is only compiled under these compilers
	_crc32_shift = crc32_shift_rv_zbc;
#endif
}
|
|
249
|
+
#else
|
|
250
|
+
// Build without RISC-V carry-less multiply support: nothing to install.
void RapidYenc::crc_riscv_set_funcs() {
}
|
|
251
|
+
#endif
|
package/src/decoder.cc
CHANGED
|
@@ -3,29 +3,359 @@
|
|
|
3
3
|
#include "decoder_common.h"
|
|
4
4
|
#include "decoder.h"
|
|
5
5
|
|
|
6
|
-
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
// TODO: add branch probabilities
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
// state var: refers to the previous state - only used for incremental processing
|
|
12
|
+
template<bool isRaw>
|
|
13
|
+
static size_t do_decode_noend_scalar(const unsigned char* src, unsigned char* dest, size_t len, RapidYenc::YencDecoderState* state) {
|
|
14
|
+
using namespace RapidYenc;
|
|
15
|
+
|
|
16
|
+
const unsigned char *es = src + len; // end source pointer
|
|
17
|
+
unsigned char *p = dest; // destination pointer
|
|
18
|
+
long i = -(long)len; // input position
|
|
19
|
+
unsigned char c; // input character
|
|
20
|
+
|
|
21
|
+
if(len < 1) return 0;
|
|
22
|
+
|
|
23
|
+
if(isRaw) {
|
|
24
|
+
|
|
25
|
+
if(state) switch(*state) {
|
|
26
|
+
case YDEC_STATE_EQ:
|
|
27
|
+
c = es[i];
|
|
28
|
+
*p++ = c - 42 - 64;
|
|
29
|
+
i++;
|
|
30
|
+
if(c == '\r') {
|
|
31
|
+
*state = YDEC_STATE_CR;
|
|
32
|
+
if(i >= 0) return 0;
|
|
33
|
+
} else {
|
|
34
|
+
*state = YDEC_STATE_NONE;
|
|
35
|
+
break;
|
|
36
|
+
}
|
|
37
|
+
// fall-thru
|
|
38
|
+
case YDEC_STATE_CR:
|
|
39
|
+
if(es[i] != '\n') break;
|
|
40
|
+
i++;
|
|
41
|
+
*state = YDEC_STATE_CRLF;
|
|
42
|
+
if(i >= 0) return 0;
|
|
43
|
+
// Else fall-thru
|
|
44
|
+
case YDEC_STATE_CRLF:
|
|
45
|
+
// skip past first dot
|
|
46
|
+
if(es[i] == '.') i++;
|
|
47
|
+
// fall-thru
|
|
48
|
+
default: break; // silence compiler warnings
|
|
49
|
+
} else // treat as YDEC_STATE_CRLF
|
|
50
|
+
if(es[i] == '.') i++;
|
|
51
|
+
|
|
52
|
+
for(; i < -2; i++) {
|
|
53
|
+
c = es[i];
|
|
54
|
+
switch(c) {
|
|
55
|
+
case '\r':
|
|
56
|
+
// skip past \r\n. sequences
|
|
57
|
+
//i += (es[i+1] == '\n' && es[i+2] == '.') << 1;
|
|
58
|
+
if(es[i+1] == '\n' && es[i+2] == '.')
|
|
59
|
+
i += 2;
|
|
60
|
+
// fall-thru
|
|
61
|
+
case '\n':
|
|
62
|
+
continue;
|
|
63
|
+
case '=':
|
|
64
|
+
c = es[i+1];
|
|
65
|
+
*p++ = c - 42 - 64;
|
|
66
|
+
i += (c != '\r'); // if we have a \r, reprocess character to deal with \r\n. case
|
|
67
|
+
continue;
|
|
68
|
+
default:
|
|
69
|
+
*p++ = c - 42;
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
if(state) *state = YDEC_STATE_NONE;
|
|
73
|
+
|
|
74
|
+
if(i == -2) { // 2nd last char
|
|
75
|
+
c = es[i];
|
|
76
|
+
switch(c) {
|
|
77
|
+
case '\r':
|
|
78
|
+
if(state && es[i+1] == '\n') {
|
|
79
|
+
*state = YDEC_STATE_CRLF;
|
|
80
|
+
return p - dest;
|
|
81
|
+
}
|
|
82
|
+
// Else fall-thru
|
|
83
|
+
case '\n':
|
|
84
|
+
break;
|
|
85
|
+
case '=':
|
|
86
|
+
c = es[i+1];
|
|
87
|
+
*p++ = c - 42 - 64;
|
|
88
|
+
i += (c != '\r');
|
|
89
|
+
break;
|
|
90
|
+
default:
|
|
91
|
+
*p++ = c - 42;
|
|
92
|
+
}
|
|
93
|
+
i++;
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
// do final char; we process this separately to prevent an overflow if the final char is '='
|
|
97
|
+
if(i == -1) {
|
|
98
|
+
c = es[i];
|
|
99
|
+
if(c != '\n' && c != '\r' && c != '=') {
|
|
100
|
+
*p++ = c - 42;
|
|
101
|
+
} else if(state) {
|
|
102
|
+
if(c == '=') *state = YDEC_STATE_EQ;
|
|
103
|
+
else if(c == '\r') *state = YDEC_STATE_CR;
|
|
104
|
+
else *state = YDEC_STATE_NONE;
|
|
105
|
+
}
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
} else {
|
|
109
|
+
|
|
110
|
+
if(state && *state == YDEC_STATE_EQ) {
|
|
111
|
+
*p++ = es[i] - 42 - 64;
|
|
112
|
+
i++;
|
|
113
|
+
*state = YDEC_STATE_NONE;
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
/*for(i = 0; i < len - 1; i++) {
|
|
117
|
+
c = src[i];
|
|
118
|
+
if(c == '\n' || c == '\r') continue;
|
|
119
|
+
unsigned char isEquals = (c == '=');
|
|
120
|
+
i += isEquals;
|
|
121
|
+
*p++ = src[i] - (42 + (isEquals << 6));
|
|
122
|
+
}*/
|
|
123
|
+
for(; i < -1; i++) {
|
|
124
|
+
c = es[i];
|
|
125
|
+
switch(c) {
|
|
126
|
+
case '\n': case '\r': continue;
|
|
127
|
+
case '=':
|
|
128
|
+
i++;
|
|
129
|
+
c = es[i] - 64;
|
|
130
|
+
}
|
|
131
|
+
*p++ = c - 42;
|
|
132
|
+
}
|
|
133
|
+
if(state) *state = YDEC_STATE_NONE;
|
|
134
|
+
// do final char; we process this separately to prevent an overflow if the final char is '='
|
|
135
|
+
if(i == -1) {
|
|
136
|
+
c = es[i];
|
|
137
|
+
if(c != '\n' && c != '\r' && c != '=') {
|
|
138
|
+
*p++ = c - 42;
|
|
139
|
+
} else
|
|
140
|
+
if(state) *state = (c == '=' ? YDEC_STATE_EQ : YDEC_STATE_NONE);
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
return p - dest;
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
// Scalar yEnc decoder that also watches for an end-of-data marker.
//
// Decodes up to `len` bytes starting at `*src` into `*dest`. On return `*src` and
// `*dest` have been advanced past what was consumed / written.
//   isRaw - when true, dots following "\r\n" are dropped and "\r\n.\r\n" ends decoding
//   state - how the previous chunk ended on entry; how this chunk ended on return
//
// Returns:
//   YDEC_END_CONTROL - "=y" was found at the start of a line (after an optional stuffed
//                      dot in raw mode); *src points just past the 'y'
//   YDEC_END_ARTICLE - raw mode only: "\r\n.\r\n" was found; *src points just past it
//   YDEC_END_NONE    - all input consumed without finding either
//
// `es` points one past the end of the input and `i` is a negative offset from it that
// counts up towards 0, so `es[i]` is the current input byte.
//
// NOTE(review): `state` is checked for NULL in some places, but YDEC_CHECK_END and the
// end-marker returns write through it unconditionally — presumably callers always
// pass a non-NULL state; confirm.
template<bool isRaw>
static RapidYenc::YencDecoderEnd do_decode_end_scalar(const unsigned char** src, unsigned char** dest, size_t len, RapidYenc::YencDecoderState* state) {
	using namespace RapidYenc;
	
	const unsigned char *es = (*src) + len; // end source pointer
	unsigned char *p = *dest; // destination pointer
	long i = -(long)len; // input position
	unsigned char c; // input character
	
	if(len < 1) return YDEC_END_NONE;
	
	// If the input is exhausted (i == 0), record `s` as the resume state, publish the
	// pointers and return; otherwise falls through to the code after the macro use.
#define YDEC_CHECK_END(s) if(i == 0) { \
	*state = s; \
	*src = es; \
	*dest = p; \
	return YDEC_END_NONE; \
}
	// resume from wherever the previous chunk stopped; the labels inside the switch are
	// also jump targets for the cases further down
	if(state) switch(*state) {
		case YDEC_STATE_CRLFEQ: do_decode_endable_scalar_ceq:
			// line start followed by '=': a 'y' here means a control line
			if(es[i] == 'y') {
				*state = YDEC_STATE_NONE;
				*src = es+i+1;
				*dest = p;
				return YDEC_END_CONTROL;
			} // Else fall-thru
		case YDEC_STATE_EQ:
			// ordinary escape: this byte is the escaped character
			c = es[i];
			*p++ = c - 42 - 64;
			i++;
			if(c != '\r') break;
			YDEC_CHECK_END(YDEC_STATE_CR)
			// fall-through
		case YDEC_STATE_CR:
			if(es[i] != '\n') break;
			i++;
			YDEC_CHECK_END(YDEC_STATE_CRLF)
			// fall-through
		case YDEC_STATE_CRLF: do_decode_endable_scalar_c0:
			if(es[i] == '.' && isRaw) {
				i++;
				YDEC_CHECK_END(YDEC_STATE_CRLFDT)
			} else if(es[i] == '=') {
				i++;
				YDEC_CHECK_END(YDEC_STATE_CRLFEQ)
				goto do_decode_endable_scalar_ceq;
			} else
				break;
			// fall-through
		case YDEC_STATE_CRLFDT:
			if(isRaw && es[i] == '\r') {
				i++;
				YDEC_CHECK_END(YDEC_STATE_CRLFDTCR)
			} else if(isRaw && es[i] == '=') { // check for dot-stuffed ending: \r\n.=y
				i++;
				YDEC_CHECK_END(YDEC_STATE_CRLFEQ)
				goto do_decode_endable_scalar_ceq;
			} else
				break;
			// fall-through
		case YDEC_STATE_CRLFDTCR:
			if(es[i] == '\n') {
				if(isRaw) {
					// "\r\n.\r\n" completed across the chunk boundary
					*state = YDEC_STATE_CRLF;
					*src = es + i + 1;
					*dest = p;
					return YDEC_END_ARTICLE;
				} else {
					i++;
					YDEC_CHECK_END(YDEC_STATE_CRLF)
					goto do_decode_endable_scalar_c0; // handle as CRLF
				}
			} else
				break;
		case YDEC_STATE_NONE: break; // silence compiler warning
	} else // treat as YDEC_STATE_CRLF
		goto do_decode_endable_scalar_c0;
	
	// bulk of the input; stops three bytes early so es[i+2] is always readable
	for(; i < -2; i++) {
		c = es[i];
		switch(c) {
			case '\r': if(es[i+1] == '\n') {
				if(isRaw && es[i+2] == '.') {
					// skip past \r\n. sequences
					i += 3;
					YDEC_CHECK_END(YDEC_STATE_CRLFDT)
					// check for end
					if(es[i] == '\r') {
						i++;
						YDEC_CHECK_END(YDEC_STATE_CRLFDTCR)
						if(es[i] == '\n') {
							*src = es + i + 1;
							*dest = p;
							*state = YDEC_STATE_CRLF;
							return YDEC_END_ARTICLE;
						} else i--; // not an end marker: step back so the loop's i++ lands on this byte
					} else if(es[i] == '=') {
						i++;
						YDEC_CHECK_END(YDEC_STATE_CRLFEQ)
						if(es[i] == 'y') {
							*src = es + i + 1;
							*dest = p;
							*state = YDEC_STATE_NONE;
							return YDEC_END_CONTROL;
						} else {
							// escape char & continue
							c = es[i];
							*p++ = c - 42 - 64;
							i -= (c == '\r'); // an escaped \r is reprocessed by the loop
						}
					} else i--; // step back so the loop's i++ lands on the byte after the dot
				}
				else if(es[i+2] == '=') {
					i += 3;
					YDEC_CHECK_END(YDEC_STATE_CRLFEQ)
					if(es[i] == 'y') {
						// ended
						*src = es + i + 1;
						*dest = p;
						*state = YDEC_STATE_NONE;
						return YDEC_END_CONTROL;
					} else {
						// escape char & continue
						c = es[i];
						*p++ = c - 42 - 64;
						i -= (c == '\r'); // an escaped \r is reprocessed by the loop
					}
				}
			} // fall-thru
			case '\n':
				continue;
			case '=':
				c = es[i+1];
				*p++ = c - 42 - 64;
				i += (c != '\r'); // if we have a \r, reprocess character to deal with \r\n. case
				continue;
			default:
				*p++ = c - 42;
		}
	}
	if(state) *state = YDEC_STATE_NONE;
	
	if(i == -2) { // 2nd last char
		c = es[i];
		switch(c) {
			case '\r':
				if(state && es[i+1] == '\n') {
					// chunk ends exactly on CRLF: remember it for the next call
					*state = YDEC_STATE_CRLF;
					*src = es;
					*dest = p;
					return YDEC_END_NONE;
				}
				// Else fall-thru
			case '\n':
				break;
			case '=':
				c = es[i+1];
				*p++ = c - 42 - 64;
				i += (c != '\r');
				break;
			default:
				*p++ = c - 42;
		}
		i++;
	}
	
	// do final char; we process this separately to prevent an overflow if the final char is '='
	if(i == -1) {
		c = es[i];
		if(c != '\n' && c != '\r' && c != '=') {
			*p++ = c - 42;
		} else if(state) {
			if(c == '=') *state = YDEC_STATE_EQ;
			else if(c == '\r') *state = YDEC_STATE_CR;
			else *state = YDEC_STATE_NONE;
		}
	}
#undef YDEC_CHECK_END
	
	// whole input consumed without finding an end marker
	*src = es;
	*dest = p;
	return YDEC_END_NONE;
}
|
|
330
|
+
|
|
331
|
+
template<bool isRaw, bool searchEnd>
|
|
332
|
+
RapidYenc::YencDecoderEnd RapidYenc::do_decode_scalar(const unsigned char** src, unsigned char** dest, size_t len, RapidYenc::YencDecoderState* state) {
|
|
333
|
+
if(searchEnd)
|
|
334
|
+
return do_decode_end_scalar<isRaw>(src, dest, len, state);
|
|
335
|
+
*dest += do_decode_noend_scalar<isRaw>(*src, *dest, len, state);
|
|
336
|
+
*src += len;
|
|
337
|
+
return YDEC_END_NONE;
|
|
338
|
+
}
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
namespace RapidYenc {
	// Decoder dispatch pointers. They start out pointing at the scalar implementations
	// in this file; the decoder_set_*_funcs routines reassign them.
	YencDecoderEnd (*_do_decode)(const unsigned char**, unsigned char**, size_t, YencDecoderState*) = &do_decode_scalar<false, false>;
	YencDecoderEnd (*_do_decode_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*) = &do_decode_scalar<true, false>;
	YencDecoderEnd (*_do_decode_end_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*) = &do_decode_end_scalar<true>;
	
	// ISA level of the currently installed decoder
	int _decode_isa = ISA_GENERIC;
	
	// explicit instantiation of the raw, end-searching scalar decoder
	// (presumably referenced from other translation units — confirm)
	template YencDecoderEnd do_decode_scalar<true, true>(const unsigned char**, unsigned char**, size_t, YencDecoderState*);
}
|
|
13
350
|
|
|
14
|
-
void decoder_set_sse2_funcs();
|
|
15
|
-
void decoder_set_ssse3_funcs();
|
|
16
|
-
void decoder_set_avx_funcs();
|
|
17
|
-
void decoder_set_avx2_funcs();
|
|
18
|
-
void decoder_set_vbmi2_funcs();
|
|
19
|
-
extern const bool decoder_has_avx10;
|
|
20
|
-
void decoder_set_neon_funcs();
|
|
21
|
-
|
|
22
351
|
|
|
23
352
|
#if defined(PLATFORM_X86) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
|
|
24
353
|
# if defined(__AVX2__) && !defined(YENC_DISABLE_AVX256)
|
|
25
354
|
# include "decoder_avx2_base.h"
|
|
26
355
|
static inline void decoder_set_native_funcs() {
|
|
27
356
|
ALIGN_ALLOC(lookups, sizeof(*lookups), 16);
|
|
28
|
-
|
|
357
|
+
using namespace RapidYenc;
|
|
358
|
+
decoder_init_lut(lookups->compact);
|
|
29
359
|
_do_decode = &do_decode_simd<false, false, sizeof(__m256i)*2, do_decode_avx2<false, false, ISA_NATIVE> >;
|
|
30
360
|
_do_decode_raw = &do_decode_simd<true, false, sizeof(__m256i)*2, do_decode_avx2<true, false, ISA_NATIVE> >;
|
|
31
361
|
_do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m256i)*2, do_decode_avx2<true, true, ISA_NATIVE> >;
|
|
@@ -34,8 +364,9 @@ static inline void decoder_set_native_funcs() {
|
|
|
34
364
|
# else
|
|
35
365
|
# include "decoder_sse_base.h"
|
|
36
366
|
static inline void decoder_set_native_funcs() {
|
|
37
|
-
|
|
38
|
-
|
|
367
|
+
using namespace RapidYenc;
|
|
368
|
+
decoder_sse_init(lookups);
|
|
369
|
+
decoder_init_lut(lookups->compact);
|
|
39
370
|
_do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_NATIVE> >;
|
|
40
371
|
_do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_NATIVE> >;
|
|
41
372
|
_do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_NATIVE> >;
|
|
@@ -44,7 +375,32 @@ static inline void decoder_set_native_funcs() {
|
|
|
44
375
|
# endif
|
|
45
376
|
#endif
|
|
46
377
|
|
|
47
|
-
|
|
378
|
+
|
|
379
|
+
#if defined(PLATFORM_X86) || defined(PLATFORM_ARM)
|
|
380
|
+
// Fills the compaction lookup table at `compactLUT`.
//
// The table has one row of `tableSize` bytes per mask value (256 rows of 8 bytes with
// YENC_DEC_USE_THINTABLE, otherwise 32768 rows of 16 bytes). Each row lists, in
// ascending order, the positions of the zero bits of its mask; unused trailing
// slots are set to 0x80.
void RapidYenc::decoder_init_lut(void* compactLUT) {
#ifdef YENC_DEC_USE_THINTABLE
	const int tableSize = 8;
#else
	const int tableSize = 16;
#endif
	const int numRows = (tableSize == 8) ? 256 : 32768;
	uint8_t* const table = (uint8_t*)compactLUT;
	
	for(int mask = 0; mask < numRows; mask++) {
		uint8_t* row = table + mask*tableSize;
		int used = 0;
		
		// collect the positions whose mask bit is clear
		for(int bit = 0; bit < tableSize; bit++) {
			if(((mask >> bit) & 1) == 0)
				row[used++] = bit;
		}
		// pad the remainder of the row
		while(used < tableSize)
			row[used++] = 0x80;
	}
}
|
|
400
|
+
#endif
|
|
401
|
+
|
|
402
|
+
|
|
403
|
+
void RapidYenc::decoder_init() {
|
|
48
404
|
#ifdef PLATFORM_X86
|
|
49
405
|
# if defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
|
|
50
406
|
decoder_set_native_funcs();
|
|
@@ -66,4 +422,8 @@ void decoder_init() {
|
|
|
66
422
|
if(cpu_supports_neon())
|
|
67
423
|
decoder_set_neon_funcs();
|
|
68
424
|
#endif
|
|
425
|
+
#ifdef __riscv
|
|
426
|
+
if(cpu_supports_rvv())
|
|
427
|
+
decoder_set_rvv_funcs();
|
|
428
|
+
#endif
|
|
69
429
|
}
|