yencode 1.2.0 → 1.2.1
This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- package/binding.gyp +37 -1
- package/package.json +1 -1
- package/src/common.h +18 -6
- package/src/crc.cc +42 -33
- package/src/crc.h +16 -14
- package/src/crc_arm.cc +7 -8
- package/src/crc_arm_pmull.cc +215 -0
- package/src/crc_common.h +13 -2
- package/src/crc_folding.cc +5 -5
- package/src/crc_folding_256.cc +2 -4
- package/src/crc_riscv.cc +7 -7
- package/src/decoder.cc +342 -12
- package/src/decoder.h +10 -14
- package/src/decoder_avx.cc +3 -4
- package/src/decoder_avx2.cc +7 -8
- package/src/decoder_avx2_base.h +6 -2
- package/src/decoder_common.h +34 -338
- package/src/decoder_neon.cc +10 -6
- package/src/decoder_neon64.cc +9 -5
- package/src/decoder_rvv.cc +47 -41
- package/src/decoder_sse2.cc +4 -4
- package/src/decoder_sse_base.h +20 -12
- package/src/decoder_ssse3.cc +3 -4
- package/src/decoder_vbmi2.cc +6 -8
- package/src/encoder.cc +19 -28
- package/src/encoder.h +5 -7
- package/src/encoder_avx.cc +3 -3
- package/src/encoder_avx2.cc +3 -3
- package/src/encoder_avx_base.h +3 -0
- package/src/encoder_common.h +26 -14
- package/src/encoder_neon.cc +6 -3
- package/src/encoder_rvv.cc +9 -7
- package/src/encoder_sse2.cc +3 -2
- package/src/encoder_sse_base.h +2 -0
- package/src/encoder_ssse3.cc +3 -3
- package/src/encoder_vbmi2.cc +6 -7
- package/src/platform.cc +24 -23
- package/src/yencode.cc +9 -8
- package/test/_speedbase.js +4 -2
- package/test/speeddec.js +25 -16
- package/test/speedenc.js +21 -17
package/src/decoder_common.h
CHANGED
|
@@ -1,342 +1,32 @@
|
|
|
1
1
|
#include "decoder.h"
|
|
2
2
|
|
|
3
|
+
namespace RapidYenc {
|
|
4
|
+
void decoder_set_sse2_funcs();
|
|
5
|
+
void decoder_set_ssse3_funcs();
|
|
6
|
+
void decoder_set_avx_funcs();
|
|
7
|
+
void decoder_set_avx2_funcs();
|
|
8
|
+
void decoder_set_vbmi2_funcs();
|
|
9
|
+
extern const bool decoder_has_avx10;
|
|
10
|
+
void decoder_set_neon_funcs();
|
|
11
|
+
void decoder_set_rvv_funcs();
|
|
12
|
+
|
|
13
|
+
template<bool isRaw, bool searchEnd>
|
|
14
|
+
YencDecoderEnd do_decode_scalar(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state);
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
|
|
3
18
|
#if defined(PLATFORM_ARM) && !defined(__aarch64__)
|
|
4
19
|
#define YENC_DEC_USE_THINTABLE 1
|
|
5
20
|
#endif
|
|
6
21
|
|
|
7
22
|
// TODO: need to support max output length somehow
|
|
8
|
-
// TODO: add branch probabilities
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
// state var: refers to the previous state - only used for incremental processing
|
|
12
|
-
template<bool isRaw>
|
|
13
|
-
size_t do_decode_noend_scalar(const unsigned char* src, unsigned char* dest, size_t len, YencDecoderState* state) {
|
|
14
|
-
const unsigned char *es = src + len; // end source pointer
|
|
15
|
-
unsigned char *p = dest; // destination pointer
|
|
16
|
-
long i = -(long)len; // input position
|
|
17
|
-
unsigned char c; // input character
|
|
18
|
-
|
|
19
|
-
if(len < 1) return 0;
|
|
20
|
-
|
|
21
|
-
if(isRaw) {
|
|
22
|
-
|
|
23
|
-
if(state) switch(*state) {
|
|
24
|
-
case YDEC_STATE_EQ:
|
|
25
|
-
c = es[i];
|
|
26
|
-
*p++ = c - 42 - 64;
|
|
27
|
-
i++;
|
|
28
|
-
if(c == '\r') {
|
|
29
|
-
*state = YDEC_STATE_CR;
|
|
30
|
-
if(i >= 0) return 0;
|
|
31
|
-
} else {
|
|
32
|
-
*state = YDEC_STATE_NONE;
|
|
33
|
-
break;
|
|
34
|
-
}
|
|
35
|
-
// fall-thru
|
|
36
|
-
case YDEC_STATE_CR:
|
|
37
|
-
if(es[i] != '\n') break;
|
|
38
|
-
i++;
|
|
39
|
-
*state = YDEC_STATE_CRLF;
|
|
40
|
-
if(i >= 0) return 0;
|
|
41
|
-
// Else fall-thru
|
|
42
|
-
case YDEC_STATE_CRLF:
|
|
43
|
-
// skip past first dot
|
|
44
|
-
if(es[i] == '.') i++;
|
|
45
|
-
// fall-thru
|
|
46
|
-
default: break; // silence compiler warnings
|
|
47
|
-
} else // treat as YDEC_STATE_CRLF
|
|
48
|
-
if(es[i] == '.') i++;
|
|
49
|
-
|
|
50
|
-
for(; i < -2; i++) {
|
|
51
|
-
c = es[i];
|
|
52
|
-
switch(c) {
|
|
53
|
-
case '\r':
|
|
54
|
-
// skip past \r\n. sequences
|
|
55
|
-
//i += (es[i+1] == '\n' && es[i+2] == '.') << 1;
|
|
56
|
-
if(es[i+1] == '\n' && es[i+2] == '.')
|
|
57
|
-
i += 2;
|
|
58
|
-
// fall-thru
|
|
59
|
-
case '\n':
|
|
60
|
-
continue;
|
|
61
|
-
case '=':
|
|
62
|
-
c = es[i+1];
|
|
63
|
-
*p++ = c - 42 - 64;
|
|
64
|
-
i += (c != '\r'); // if we have a \r, reprocess character to deal with \r\n. case
|
|
65
|
-
continue;
|
|
66
|
-
default:
|
|
67
|
-
*p++ = c - 42;
|
|
68
|
-
}
|
|
69
|
-
}
|
|
70
|
-
if(state) *state = YDEC_STATE_NONE;
|
|
71
|
-
|
|
72
|
-
if(i == -2) { // 2nd last char
|
|
73
|
-
c = es[i];
|
|
74
|
-
switch(c) {
|
|
75
|
-
case '\r':
|
|
76
|
-
if(state && es[i+1] == '\n') {
|
|
77
|
-
*state = YDEC_STATE_CRLF;
|
|
78
|
-
return p - dest;
|
|
79
|
-
}
|
|
80
|
-
// Else fall-thru
|
|
81
|
-
case '\n':
|
|
82
|
-
break;
|
|
83
|
-
case '=':
|
|
84
|
-
c = es[i+1];
|
|
85
|
-
*p++ = c - 42 - 64;
|
|
86
|
-
i += (c != '\r');
|
|
87
|
-
break;
|
|
88
|
-
default:
|
|
89
|
-
*p++ = c - 42;
|
|
90
|
-
}
|
|
91
|
-
i++;
|
|
92
|
-
}
|
|
93
|
-
|
|
94
|
-
// do final char; we process this separately to prevent an overflow if the final char is '='
|
|
95
|
-
if(i == -1) {
|
|
96
|
-
c = es[i];
|
|
97
|
-
if(c != '\n' && c != '\r' && c != '=') {
|
|
98
|
-
*p++ = c - 42;
|
|
99
|
-
} else if(state) {
|
|
100
|
-
if(c == '=') *state = YDEC_STATE_EQ;
|
|
101
|
-
else if(c == '\r') *state = YDEC_STATE_CR;
|
|
102
|
-
else *state = YDEC_STATE_NONE;
|
|
103
|
-
}
|
|
104
|
-
}
|
|
105
|
-
|
|
106
|
-
} else {
|
|
107
|
-
|
|
108
|
-
if(state && *state == YDEC_STATE_EQ) {
|
|
109
|
-
*p++ = es[i] - 42 - 64;
|
|
110
|
-
i++;
|
|
111
|
-
*state = YDEC_STATE_NONE;
|
|
112
|
-
}
|
|
113
|
-
|
|
114
|
-
/*for(i = 0; i < len - 1; i++) {
|
|
115
|
-
c = src[i];
|
|
116
|
-
if(c == '\n' || c == '\r') continue;
|
|
117
|
-
unsigned char isEquals = (c == '=');
|
|
118
|
-
i += isEquals;
|
|
119
|
-
*p++ = src[i] - (42 + (isEquals << 6));
|
|
120
|
-
}*/
|
|
121
|
-
for(; i < -1; i++) {
|
|
122
|
-
c = es[i];
|
|
123
|
-
switch(c) {
|
|
124
|
-
case '\n': case '\r': continue;
|
|
125
|
-
case '=':
|
|
126
|
-
i++;
|
|
127
|
-
c = es[i] - 64;
|
|
128
|
-
}
|
|
129
|
-
*p++ = c - 42;
|
|
130
|
-
}
|
|
131
|
-
if(state) *state = YDEC_STATE_NONE;
|
|
132
|
-
// do final char; we process this separately to prevent an overflow if the final char is '='
|
|
133
|
-
if(i == -1) {
|
|
134
|
-
c = es[i];
|
|
135
|
-
if(c != '\n' && c != '\r' && c != '=') {
|
|
136
|
-
*p++ = c - 42;
|
|
137
|
-
} else
|
|
138
|
-
if(state) *state = (c == '=' ? YDEC_STATE_EQ : YDEC_STATE_NONE);
|
|
139
|
-
}
|
|
140
|
-
|
|
141
|
-
}
|
|
142
|
-
|
|
143
|
-
return p - dest;
|
|
144
|
-
}
|
|
145
|
-
|
|
146
|
-
template<bool isRaw>
|
|
147
|
-
YencDecoderEnd do_decode_end_scalar(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
|
|
148
|
-
const unsigned char *es = (*src) + len; // end source pointer
|
|
149
|
-
unsigned char *p = *dest; // destination pointer
|
|
150
|
-
long i = -(long)len; // input position
|
|
151
|
-
unsigned char c; // input character
|
|
152
|
-
|
|
153
|
-
if(len < 1) return YDEC_END_NONE;
|
|
154
|
-
|
|
155
|
-
#define YDEC_CHECK_END(s) if(i == 0) { \
|
|
156
|
-
*state = s; \
|
|
157
|
-
*src = es; \
|
|
158
|
-
*dest = p; \
|
|
159
|
-
return YDEC_END_NONE; \
|
|
160
|
-
}
|
|
161
|
-
if(state) switch(*state) {
|
|
162
|
-
case YDEC_STATE_CRLFEQ: do_decode_endable_scalar_ceq:
|
|
163
|
-
if(es[i] == 'y') {
|
|
164
|
-
*state = YDEC_STATE_NONE;
|
|
165
|
-
*src = es+i+1;
|
|
166
|
-
*dest = p;
|
|
167
|
-
return YDEC_END_CONTROL;
|
|
168
|
-
} // Else fall-thru
|
|
169
|
-
case YDEC_STATE_EQ:
|
|
170
|
-
c = es[i];
|
|
171
|
-
*p++ = c - 42 - 64;
|
|
172
|
-
i++;
|
|
173
|
-
if(c != '\r') break;
|
|
174
|
-
YDEC_CHECK_END(YDEC_STATE_CR)
|
|
175
|
-
// fall-through
|
|
176
|
-
case YDEC_STATE_CR:
|
|
177
|
-
if(es[i] != '\n') break;
|
|
178
|
-
i++;
|
|
179
|
-
YDEC_CHECK_END(YDEC_STATE_CRLF)
|
|
180
|
-
// fall-through
|
|
181
|
-
case YDEC_STATE_CRLF: do_decode_endable_scalar_c0:
|
|
182
|
-
if(es[i] == '.' && isRaw) {
|
|
183
|
-
i++;
|
|
184
|
-
YDEC_CHECK_END(YDEC_STATE_CRLFDT)
|
|
185
|
-
} else if(es[i] == '=') {
|
|
186
|
-
i++;
|
|
187
|
-
YDEC_CHECK_END(YDEC_STATE_CRLFEQ)
|
|
188
|
-
goto do_decode_endable_scalar_ceq;
|
|
189
|
-
} else
|
|
190
|
-
break;
|
|
191
|
-
// fall-through
|
|
192
|
-
case YDEC_STATE_CRLFDT:
|
|
193
|
-
if(isRaw && es[i] == '\r') {
|
|
194
|
-
i++;
|
|
195
|
-
YDEC_CHECK_END(YDEC_STATE_CRLFDTCR)
|
|
196
|
-
} else if(isRaw && es[i] == '=') { // check for dot-stuffed ending: \r\n.=y
|
|
197
|
-
i++;
|
|
198
|
-
YDEC_CHECK_END(YDEC_STATE_CRLFEQ)
|
|
199
|
-
goto do_decode_endable_scalar_ceq;
|
|
200
|
-
} else
|
|
201
|
-
break;
|
|
202
|
-
// fall-through
|
|
203
|
-
case YDEC_STATE_CRLFDTCR:
|
|
204
|
-
if(es[i] == '\n') {
|
|
205
|
-
if(isRaw) {
|
|
206
|
-
*state = YDEC_STATE_CRLF;
|
|
207
|
-
*src = es + i + 1;
|
|
208
|
-
*dest = p;
|
|
209
|
-
return YDEC_END_ARTICLE;
|
|
210
|
-
} else {
|
|
211
|
-
i++;
|
|
212
|
-
YDEC_CHECK_END(YDEC_STATE_CRLF)
|
|
213
|
-
goto do_decode_endable_scalar_c0; // handle as CRLF
|
|
214
|
-
}
|
|
215
|
-
} else
|
|
216
|
-
break;
|
|
217
|
-
case YDEC_STATE_NONE: break; // silence compiler warning
|
|
218
|
-
} else // treat as YDEC_STATE_CRLF
|
|
219
|
-
goto do_decode_endable_scalar_c0;
|
|
220
|
-
|
|
221
|
-
for(; i < -2; i++) {
|
|
222
|
-
c = es[i];
|
|
223
|
-
switch(c) {
|
|
224
|
-
case '\r': if(es[i+1] == '\n') {
|
|
225
|
-
if(isRaw && es[i+2] == '.') {
|
|
226
|
-
// skip past \r\n. sequences
|
|
227
|
-
i += 3;
|
|
228
|
-
YDEC_CHECK_END(YDEC_STATE_CRLFDT)
|
|
229
|
-
// check for end
|
|
230
|
-
if(es[i] == '\r') {
|
|
231
|
-
i++;
|
|
232
|
-
YDEC_CHECK_END(YDEC_STATE_CRLFDTCR)
|
|
233
|
-
if(es[i] == '\n') {
|
|
234
|
-
*src = es + i + 1;
|
|
235
|
-
*dest = p;
|
|
236
|
-
*state = YDEC_STATE_CRLF;
|
|
237
|
-
return YDEC_END_ARTICLE;
|
|
238
|
-
} else i--;
|
|
239
|
-
} else if(es[i] == '=') {
|
|
240
|
-
i++;
|
|
241
|
-
YDEC_CHECK_END(YDEC_STATE_CRLFEQ)
|
|
242
|
-
if(es[i] == 'y') {
|
|
243
|
-
*src = es + i + 1;
|
|
244
|
-
*dest = p;
|
|
245
|
-
*state = YDEC_STATE_NONE;
|
|
246
|
-
return YDEC_END_CONTROL;
|
|
247
|
-
} else {
|
|
248
|
-
// escape char & continue
|
|
249
|
-
c = es[i];
|
|
250
|
-
*p++ = c - 42 - 64;
|
|
251
|
-
i -= (c == '\r');
|
|
252
|
-
}
|
|
253
|
-
} else i--;
|
|
254
|
-
}
|
|
255
|
-
else if(es[i+2] == '=') {
|
|
256
|
-
i += 3;
|
|
257
|
-
YDEC_CHECK_END(YDEC_STATE_CRLFEQ)
|
|
258
|
-
if(es[i] == 'y') {
|
|
259
|
-
// ended
|
|
260
|
-
*src = es + i + 1;
|
|
261
|
-
*dest = p;
|
|
262
|
-
*state = YDEC_STATE_NONE;
|
|
263
|
-
return YDEC_END_CONTROL;
|
|
264
|
-
} else {
|
|
265
|
-
// escape char & continue
|
|
266
|
-
c = es[i];
|
|
267
|
-
*p++ = c - 42 - 64;
|
|
268
|
-
i -= (c == '\r');
|
|
269
|
-
}
|
|
270
|
-
}
|
|
271
|
-
} // fall-thru
|
|
272
|
-
case '\n':
|
|
273
|
-
continue;
|
|
274
|
-
case '=':
|
|
275
|
-
c = es[i+1];
|
|
276
|
-
*p++ = c - 42 - 64;
|
|
277
|
-
i += (c != '\r'); // if we have a \r, reprocess character to deal with \r\n. case
|
|
278
|
-
continue;
|
|
279
|
-
default:
|
|
280
|
-
*p++ = c - 42;
|
|
281
|
-
}
|
|
282
|
-
}
|
|
283
|
-
if(state) *state = YDEC_STATE_NONE;
|
|
284
|
-
|
|
285
|
-
if(i == -2) { // 2nd last char
|
|
286
|
-
c = es[i];
|
|
287
|
-
switch(c) {
|
|
288
|
-
case '\r':
|
|
289
|
-
if(state && es[i+1] == '\n') {
|
|
290
|
-
*state = YDEC_STATE_CRLF;
|
|
291
|
-
*src = es;
|
|
292
|
-
*dest = p;
|
|
293
|
-
return YDEC_END_NONE;
|
|
294
|
-
}
|
|
295
|
-
// Else fall-thru
|
|
296
|
-
case '\n':
|
|
297
|
-
break;
|
|
298
|
-
case '=':
|
|
299
|
-
c = es[i+1];
|
|
300
|
-
*p++ = c - 42 - 64;
|
|
301
|
-
i += (c != '\r');
|
|
302
|
-
break;
|
|
303
|
-
default:
|
|
304
|
-
*p++ = c - 42;
|
|
305
|
-
}
|
|
306
|
-
i++;
|
|
307
|
-
}
|
|
308
|
-
|
|
309
|
-
// do final char; we process this separately to prevent an overflow if the final char is '='
|
|
310
|
-
if(i == -1) {
|
|
311
|
-
c = es[i];
|
|
312
|
-
if(c != '\n' && c != '\r' && c != '=') {
|
|
313
|
-
*p++ = c - 42;
|
|
314
|
-
} else if(state) {
|
|
315
|
-
if(c == '=') *state = YDEC_STATE_EQ;
|
|
316
|
-
else if(c == '\r') *state = YDEC_STATE_CR;
|
|
317
|
-
else *state = YDEC_STATE_NONE;
|
|
318
|
-
}
|
|
319
|
-
}
|
|
320
|
-
#undef YDEC_CHECK_END
|
|
321
|
-
|
|
322
|
-
*src = es;
|
|
323
|
-
*dest = p;
|
|
324
|
-
return YDEC_END_NONE;
|
|
325
|
-
}
|
|
326
|
-
|
|
327
|
-
template<bool isRaw, bool searchEnd>
|
|
328
|
-
YencDecoderEnd do_decode_scalar(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
|
|
329
|
-
if(searchEnd)
|
|
330
|
-
return do_decode_end_scalar<isRaw>(src, dest, len, state);
|
|
331
|
-
*dest += do_decode_noend_scalar<isRaw>(*src, *dest, len, state);
|
|
332
|
-
*src += len;
|
|
333
|
-
return YDEC_END_NONE;
|
|
334
|
-
}
|
|
335
23
|
|
|
336
24
|
|
|
337
25
|
|
|
338
26
|
template<bool isRaw, bool searchEnd, void(&kernel)(const uint8_t*, long&, unsigned char*&, unsigned char&, uint16_t&)>
|
|
339
|
-
inline YencDecoderEnd _do_decode_simd(size_t width, const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
|
|
27
|
+
static inline RapidYenc::YencDecoderEnd _do_decode_simd(size_t width, const unsigned char** src, unsigned char** dest, size_t len, RapidYenc::YencDecoderState* state) {
|
|
28
|
+
using namespace RapidYenc;
|
|
29
|
+
|
|
340
30
|
if(len <= width*2) return do_decode_scalar<isRaw, searchEnd>(src, dest, len, state);
|
|
341
31
|
|
|
342
32
|
YencDecoderState tState = YDEC_STATE_CRLF;
|
|
@@ -466,17 +156,19 @@ inline YencDecoderEnd _do_decode_simd(size_t width, const unsigned char** src, u
|
|
|
466
156
|
}
|
|
467
157
|
|
|
468
158
|
template<bool isRaw, bool searchEnd, size_t width, void(&kernel)(const uint8_t*, long&, unsigned char*&, unsigned char&, uint16_t&)>
|
|
469
|
-
YencDecoderEnd do_decode_simd(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
|
|
159
|
+
static RapidYenc::YencDecoderEnd do_decode_simd(const unsigned char** src, unsigned char** dest, size_t len, RapidYenc::YencDecoderState* state) {
|
|
470
160
|
return _do_decode_simd<isRaw, searchEnd, kernel>(width, src, dest, len, state);
|
|
471
161
|
}
|
|
472
162
|
template<bool isRaw, bool searchEnd, size_t(&getWidth)(), void(&kernel)(const uint8_t*, long&, unsigned char*&, unsigned char&, uint16_t&)>
|
|
473
|
-
YencDecoderEnd do_decode_simd(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
|
|
163
|
+
static RapidYenc::YencDecoderEnd do_decode_simd(const unsigned char** src, unsigned char** dest, size_t len, RapidYenc::YencDecoderState* state) {
|
|
474
164
|
return _do_decode_simd<isRaw, searchEnd, kernel>(getWidth(), src, dest, len, state);
|
|
475
165
|
}
|
|
476
166
|
|
|
477
167
|
|
|
478
168
|
#if defined(PLATFORM_X86) || defined(PLATFORM_ARM)
|
|
479
|
-
|
|
169
|
+
namespace RapidYenc {
|
|
170
|
+
void decoder_init_lut(void* compactLUT);
|
|
171
|
+
}
|
|
480
172
|
#endif
|
|
481
173
|
|
|
482
174
|
template<bool isRaw>
|
|
@@ -509,16 +201,20 @@ static inline uint16_t decoder_set_nextMask(const uint8_t* src, unsigned mask) {
|
|
|
509
201
|
// resolve invalid sequences of = to deal with cases like '===='
|
|
510
202
|
// bit hack inspired from simdjson: https://youtu.be/wlvKAT7SZIQ?t=33m38s
|
|
511
203
|
template<typename T>
|
|
512
|
-
static inline T fix_eqMask(T mask) {
|
|
204
|
+
static inline T fix_eqMask(T mask, T maskShift1) {
|
|
513
205
|
// isolate the start of each consecutive bit group (e.g. 01011101 -> 01000101)
|
|
514
|
-
T start = mask & ~
|
|
206
|
+
T start = mask & ~maskShift1;
|
|
207
|
+
|
|
208
|
+
// this strategy works by firstly separating groups that start on even/odd bits
|
|
209
|
+
// generally, it doesn't matter which one (even/odd) we pick, but clearing even groups specifically allows the escFirst bit in maskShift1 to work
|
|
210
|
+
// (this is because the start of the escFirst group is at index -1, an odd bit, but we can't clear it due to being < 0, so we just retain all odd groups instead)
|
|
515
211
|
|
|
516
|
-
const T
|
|
212
|
+
const T even = (T)0x5555555555555555; // every even bit (01010101...)
|
|
517
213
|
|
|
518
|
-
// obtain groups which start on an
|
|
519
|
-
T
|
|
214
|
+
// obtain groups which start on an odd bit (clear groups that start on an even bit, but this leaves an unwanted trailing bit)
|
|
215
|
+
T oddGroups = mask + (start & even);
|
|
520
216
|
|
|
521
|
-
// clear
|
|
217
|
+
// clear even bits in odd groups, whilst conversely preserving even bits in even groups
|
|
522
218
|
// the `& mask` also conveniently gets rid of unwanted trailing bits
|
|
523
|
-
return (
|
|
219
|
+
return (oddGroups ^ even) & mask;
|
|
524
220
|
}
|
package/src/decoder_neon.cc
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
#include "common.h"
|
|
2
|
-
#ifdef __ARM_NEON
|
|
3
|
-
|
|
4
2
|
#include "decoder_common.h"
|
|
5
3
|
|
|
4
|
+
#ifdef __ARM_NEON
|
|
5
|
+
|
|
6
6
|
|
|
7
7
|
#if defined(_MSC_VER) && !defined(__clang__)
|
|
8
8
|
# define vld1_u8_align(p, a) vld1_u8_ex(p, a*8)
|
|
@@ -53,6 +53,8 @@ static bool neon_vect_is_nonzero(uint8x16_t v) {
|
|
|
53
53
|
}
|
|
54
54
|
|
|
55
55
|
|
|
56
|
+
namespace RapidYenc {
|
|
57
|
+
|
|
56
58
|
template<bool isRaw, bool searchEnd>
|
|
57
59
|
HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned char*& p, unsigned char& escFirst, uint16_t& nextMask) {
|
|
58
60
|
HEDLEY_ASSUME(escFirst == 0 || escFirst == 1);
|
|
@@ -322,8 +324,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
|
|
|
322
324
|
// a spec compliant encoder should never generate sequences: ==, =\n and =\r, but we'll handle them to be spec compliant
|
|
323
325
|
// the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that
|
|
324
326
|
// firstly, check for invalid sequences of = (we assume that these are rare, as a spec compliant yEnc encoder should not generate these)
|
|
325
|
-
|
|
326
|
-
|
|
327
|
+
uint32_t maskEqShift1 = (maskEq << 1) | escFirst;
|
|
328
|
+
if(LIKELIHOOD(0.0001, (mask & maskEqShift1) != 0)) {
|
|
329
|
+
maskEq = fix_eqMask<uint32_t>(maskEq, maskEqShift1);
|
|
327
330
|
|
|
328
331
|
unsigned char nextEscFirst = maskEq>>31;
|
|
329
332
|
// next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed
|
|
@@ -445,8 +448,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
|
|
|
445
448
|
}
|
|
446
449
|
}
|
|
447
450
|
}
|
|
451
|
+
} // namespace
|
|
448
452
|
|
|
449
|
-
void decoder_set_neon_funcs() {
|
|
453
|
+
void RapidYenc::decoder_set_neon_funcs() {
|
|
450
454
|
decoder_init_lut(compactLUT);
|
|
451
455
|
_do_decode = &do_decode_simd<false, false, sizeof(uint8x16_t)*2, do_decode_neon<false, false> >;
|
|
452
456
|
_do_decode_raw = &do_decode_simd<true, false, sizeof(uint8x16_t)*2, do_decode_neon<true, false> >;
|
|
@@ -454,5 +458,5 @@ void decoder_set_neon_funcs() {
|
|
|
454
458
|
_decode_isa = ISA_LEVEL_NEON;
|
|
455
459
|
}
|
|
456
460
|
#else
|
|
457
|
-
void decoder_set_neon_funcs() {}
|
|
461
|
+
void RapidYenc::decoder_set_neon_funcs() {}
|
|
458
462
|
#endif
|
package/src/decoder_neon64.cc
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
#include "common.h"
|
|
2
|
+
#include "decoder_common.h"
|
|
2
3
|
#if defined(__ARM_NEON) && defined(__aarch64__)
|
|
3
4
|
|
|
4
|
-
#include "decoder_common.h"
|
|
5
5
|
|
|
6
6
|
#pragma pack(16)
|
|
7
7
|
static struct { char bytes[16]; } ALIGN_TO(16, compactLUT[32768]);
|
|
@@ -44,6 +44,8 @@ static HEDLEY_ALWAYS_INLINE uint8x16_t mergeCompares(uint8x16_t a, uint8x16_t b,
|
|
|
44
44
|
}
|
|
45
45
|
|
|
46
46
|
|
|
47
|
+
namespace RapidYenc {
|
|
48
|
+
|
|
47
49
|
template<bool isRaw, bool searchEnd>
|
|
48
50
|
HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned char*& p, unsigned char& escFirst, uint16_t& nextMask) {
|
|
49
51
|
HEDLEY_ASSUME(escFirst == 0 || escFirst == 1);
|
|
@@ -290,8 +292,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
|
|
|
290
292
|
// a spec compliant encoder should never generate sequences: ==, =\n and =\r, but we'll handle them to be spec compliant
|
|
291
293
|
// the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that
|
|
292
294
|
// firstly, check for invalid sequences of = (we assume that these are rare, as a spec compliant yEnc encoder should not generate these)
|
|
293
|
-
|
|
294
|
-
|
|
295
|
+
uint64_t maskEqShift1 = (maskEq << 1) | escFirst;
|
|
296
|
+
if(LIKELIHOOD(0.0001, (mask & maskEqShift1) != 0)) {
|
|
297
|
+
maskEq = fix_eqMask<uint64_t>(maskEq, maskEqShift1);
|
|
295
298
|
|
|
296
299
|
unsigned char nextEscFirst = maskEq>>63;
|
|
297
300
|
// next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed
|
|
@@ -429,8 +432,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
|
|
|
429
432
|
}
|
|
430
433
|
}
|
|
431
434
|
}
|
|
435
|
+
} // namespace
|
|
432
436
|
|
|
433
|
-
void decoder_set_neon_funcs() {
|
|
437
|
+
void RapidYenc::decoder_set_neon_funcs() {
|
|
434
438
|
decoder_init_lut(compactLUT);
|
|
435
439
|
_do_decode = &do_decode_simd<false, false, sizeof(uint8x16_t)*4, do_decode_neon<false, false> >;
|
|
436
440
|
_do_decode_raw = &do_decode_simd<true, false, sizeof(uint8x16_t)*4, do_decode_neon<true, false> >;
|
|
@@ -438,5 +442,5 @@ void decoder_set_neon_funcs() {
|
|
|
438
442
|
_decode_isa = ISA_LEVEL_NEON;
|
|
439
443
|
}
|
|
440
444
|
#else
|
|
441
|
-
void decoder_set_neon_funcs() {}
|
|
445
|
+
void RapidYenc::decoder_set_neon_funcs() {}
|
|
442
446
|
#endif
|
package/src/decoder_rvv.cc
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
#include "common.h"
|
|
2
|
-
#ifdef __riscv_vector
|
|
3
2
|
#include "decoder_common.h"
|
|
3
|
+
#ifdef __riscv_vector
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
#ifdef __riscv_v_intrinsic
|
|
@@ -29,6 +29,17 @@ static inline vbool4_t mask_lshift(vbool4_t m, unsigned shiftIn, size_t vl) {
|
|
|
29
29
|
RV_MASK_CAST(4, 8, mvl), RV_MASK_CAST(4, 8, mvr), vl
|
|
30
30
|
);
|
|
31
31
|
}
|
|
32
|
+
template<int shift>
|
|
33
|
+
static inline vbool64_t mask_lshift(vbool64_t m, unsigned shiftIn, size_t vl) {
|
|
34
|
+
vuint8m1_t mv = RV_VEC_CAST(64, 8, m);
|
|
35
|
+
vuint8m1_t mvl = RV(vsll_vx_u8m1)(mv, shift, vl/8);
|
|
36
|
+
vuint8m1_t mvr = RV(vsrl_vx_u8m1)(mv, 8-shift, vl/8);
|
|
37
|
+
mvr = RV(vslide1up_vx_u8m1)(mvr, shiftIn, vl/8);
|
|
38
|
+
|
|
39
|
+
return RV(vmor_mm_b64)(
|
|
40
|
+
RV_MASK_CAST(64, 8, mvl), RV_MASK_CAST(64, 8, mvr), vl
|
|
41
|
+
);
|
|
42
|
+
}
|
|
32
43
|
|
|
33
44
|
static inline vuint8m2_t set_first_vu8(vuint8m2_t src, uint8_t item, size_t vl) {
|
|
34
45
|
#ifdef __riscv_v_intrinsic
|
|
@@ -48,6 +59,7 @@ static inline vuint16m2_t set_first_vu16(vuint16m2_t src, uint16_t item, size_t
|
|
|
48
59
|
}
|
|
49
60
|
|
|
50
61
|
|
|
62
|
+
namespace RapidYenc {
|
|
51
63
|
|
|
52
64
|
template<bool isRaw, bool searchEnd>
|
|
53
65
|
HEDLEY_ALWAYS_INLINE void do_decode_rvv(const uint8_t* src, long& len, unsigned char*& outp, unsigned char& escFirst, uint16_t& nextMask) {
|
|
@@ -195,48 +207,41 @@ HEDLEY_ALWAYS_INLINE void do_decode_rvv(const uint8_t* src, long& len, unsigned
|
|
|
195
207
|
// the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that
|
|
196
208
|
// firstly, check for invalid sequences of = (we assume that these are rare, as a spec compliant yEnc encoder should not generate these)
|
|
197
209
|
if(LIKELIHOOD(0.0001, RV(vcpop_m_b4)(RV(vmandn_mm_b4)(cmpEqShift1, cmp, vl2), vl2) != 0)) {
|
|
198
|
-
//
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
// slide the new value in from the top
|
|
223
|
-
#if __riscv_xlen == 64
|
|
224
|
-
cmpEqW = RV(vslide1down_vx_u64m1)(cmpEqW, maskW, wvl);
|
|
225
|
-
#else
|
|
226
|
-
cmpEqW = RV(vslide1down_vx_u32m1)(cmpEqW, maskW, wvl);
|
|
227
|
-
#endif
|
|
210
|
+
// replicate fix_eqMask, but in vector form
|
|
211
|
+
vbool4_t groupStart = RV(vmandn_mm_b4)(cmpEq, cmpEqShift1, vl2);
|
|
212
|
+
vbool4_t evenBits = RV_MASK_CAST(4, 8, RV(vmv_v_x_u8m1)(0x55, vl2));
|
|
213
|
+
vbool4_t evenStart = RV(vmand_mm_b4)(groupStart, evenBits, vl2);
|
|
214
|
+
|
|
215
|
+
// compute `cmpEq + evenStart` to obtain oddGroups
|
|
216
|
+
vbool4_t oddGroups;
|
|
217
|
+
vuint64m1_t cmpEq64 = RV_VEC_CAST(4, 64, cmpEq);
|
|
218
|
+
vuint64m1_t evenStart64 = RV_VEC_CAST(4, 64, evenStart);
|
|
219
|
+
vuint64m1_t oddGroups64;
|
|
220
|
+
if(vl2 <= 64) {
|
|
221
|
+
// no loop needed - single 64b add will work
|
|
222
|
+
oddGroups64 = RV(vadd_vv_u64m1)(cmpEq64, evenStart64, 1);
|
|
223
|
+
} else {
|
|
224
|
+
// need to loop whilst the add causes a carry
|
|
225
|
+
unsigned vl64 = vl2/64;
|
|
226
|
+
vbool64_t carry = RV(vmadc_vv_u64m1_b64)(cmpEq64, evenStart64, vl64);
|
|
227
|
+
carry = mask_lshift<1>(carry, 0, vl64);
|
|
228
|
+
oddGroups64 = RV(vadd_vv_u64m1)(cmpEq64, evenStart64, 1);
|
|
229
|
+
while(RV(vcpop_m_b64)(carry, vl64)) {
|
|
230
|
+
vbool64_t nextCarry = RV(vmadc_vx_u64m1_b64)(oddGroups64, 1, vl64);
|
|
231
|
+
oddGroups64 = RV(vadd_vx_u64m1_mu)(carry, oddGroups64, oddGroups64, 1, vl64);
|
|
232
|
+
carry = mask_lshift<1>(nextCarry, 0, vl64);
|
|
233
|
+
}
|
|
228
234
|
}
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
235
|
+
oddGroups = RV_MASK_CAST(4, 64, oddGroups64);
|
|
236
|
+
|
|
237
|
+
cmpEq = RV(vmand_mm_b4)(RV(vmxor_mm_b4)(oddGroups, evenBits, vl2), cmpEq, vl2);
|
|
238
|
+
|
|
239
|
+
cmpEqShift1 = mask_lshift<1>(cmpEq, escFirst, vl2);
|
|
234
240
|
cmp = RV(vmor_mm_b4)(cmpEqShift1, cmp, vl2); // ~(~cmp & ~cmpEqShift1)
|
|
235
241
|
numOutputChars = RV(vcpop_m_b4)(cmp, vl2);
|
|
236
|
-
} else {
|
|
237
|
-
// no invalid = sequences found - don't need to fix up cmpEq
|
|
238
|
-
escFirst = RV(vcpop_m_b4)(RV(vmand_mm_b4)(cmpEq, lastBit, vl2), vl2);
|
|
239
242
|
}
|
|
243
|
+
escFirst = RV(vcpop_m_b4)(RV(vmand_mm_b4)(cmpEq, lastBit, vl2), vl2);
|
|
244
|
+
|
|
240
245
|
data = RV(vsub_vv_u8m2)(data, RV_vmerge_vxm_u8m2(yencOffset, 64+42, cmpEqShift1, vl2), vl2);
|
|
241
246
|
yencOffset = set_first_vu8(yencOffset, 42 | (escFirst<<6), vl2);
|
|
242
247
|
|
|
@@ -262,13 +267,14 @@ HEDLEY_ALWAYS_INLINE void do_decode_rvv(const uint8_t* src, long& len, unsigned
|
|
|
262
267
|
size_t decoder_rvv_width() {
|
|
263
268
|
return RV(vsetvlmax_e8m2)();
|
|
264
269
|
}
|
|
270
|
+
} // namespace
|
|
265
271
|
|
|
266
|
-
void decoder_set_rvv_funcs() {
|
|
272
|
+
void RapidYenc::decoder_set_rvv_funcs() {
|
|
267
273
|
_do_decode = &do_decode_simd<false, false, decoder_rvv_width, do_decode_rvv<false, false> >;
|
|
268
274
|
_do_decode_raw = &do_decode_simd<true, false, decoder_rvv_width, do_decode_rvv<true, false> >;
|
|
269
275
|
_do_decode_end_raw = &do_decode_simd<true, true, decoder_rvv_width, do_decode_rvv<true, true> >;
|
|
270
276
|
_decode_isa = ISA_LEVEL_RVV;
|
|
271
277
|
}
|
|
272
278
|
#else
|
|
273
|
-
void decoder_set_rvv_funcs() {}
|
|
279
|
+
void RapidYenc::decoder_set_rvv_funcs() {}
|
|
274
280
|
#endif
|
package/src/decoder_sse2.cc
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
#include "common.h"
|
|
2
2
|
|
|
3
|
-
#ifdef __SSE2__
|
|
4
3
|
#include "decoder_common.h"
|
|
4
|
+
#ifdef __SSE2__
|
|
5
5
|
#include "decoder_sse_base.h"
|
|
6
6
|
|
|
7
|
-
void decoder_sse_init(SSELookups* HEDLEY_RESTRICT& lookups) {
|
|
7
|
+
void RapidYenc::decoder_sse_init(RapidYenc::SSELookups* HEDLEY_RESTRICT& lookups) {
|
|
8
8
|
ALIGN_ALLOC(lookups, sizeof(SSELookups), 16);
|
|
9
9
|
for(int i=0; i<256; i++) {
|
|
10
10
|
lookups->BitsSetTable256inv[i] = 8 - (
|
|
@@ -25,7 +25,7 @@ void decoder_sse_init(SSELookups* HEDLEY_RESTRICT& lookups) {
|
|
|
25
25
|
}
|
|
26
26
|
}
|
|
27
27
|
|
|
28
|
-
void decoder_set_sse2_funcs() {
|
|
28
|
+
void RapidYenc::decoder_set_sse2_funcs() {
|
|
29
29
|
decoder_sse_init(lookups);
|
|
30
30
|
decoder_init_lut(lookups->compact);
|
|
31
31
|
_do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_SSE2> >;
|
|
@@ -34,5 +34,5 @@ void decoder_set_sse2_funcs() {
|
|
|
34
34
|
_decode_isa = ISA_LEVEL_SSE2;
|
|
35
35
|
}
|
|
36
36
|
#else
|
|
37
|
-
void decoder_set_sse2_funcs() {}
|
|
37
|
+
void RapidYenc::decoder_set_sse2_funcs() {}
|
|
38
38
|
#endif
|