yencode 1.1.5 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. package/README.md +130 -189
  2. package/binding.gyp +115 -6
  3. package/index.js +2 -0
  4. package/package.json +1 -1
  5. package/src/common.h +37 -7
  6. package/src/crc.cc +121 -47
  7. package/src/crc.h +74 -10
  8. package/src/crc_arm.cc +51 -34
  9. package/src/crc_arm_pmull.cc +215 -0
  10. package/src/crc_common.h +22 -0
  11. package/src/crc_folding.cc +154 -16
  12. package/src/crc_folding_256.cc +7 -14
  13. package/src/crc_riscv.cc +251 -0
  14. package/src/decoder.cc +373 -13
  15. package/src/decoder.h +10 -14
  16. package/src/decoder_avx.cc +5 -6
  17. package/src/decoder_avx2.cc +8 -9
  18. package/src/decoder_avx2_base.h +7 -11
  19. package/src/decoder_common.h +56 -373
  20. package/src/decoder_neon.cc +13 -19
  21. package/src/decoder_neon64.cc +12 -15
  22. package/src/decoder_rvv.cc +280 -0
  23. package/src/decoder_sse2.cc +26 -5
  24. package/src/decoder_sse_base.h +20 -40
  25. package/src/decoder_ssse3.cc +5 -6
  26. package/src/decoder_vbmi2.cc +6 -13
  27. package/src/encoder.cc +42 -26
  28. package/src/encoder.h +5 -7
  29. package/src/encoder_avx.cc +3 -3
  30. package/src/encoder_avx2.cc +3 -3
  31. package/src/encoder_avx_base.h +3 -0
  32. package/src/encoder_common.h +26 -32
  33. package/src/encoder_neon.cc +6 -3
  34. package/src/encoder_rvv.cc +13 -26
  35. package/src/encoder_sse2.cc +3 -2
  36. package/src/encoder_sse_base.h +2 -0
  37. package/src/encoder_ssse3.cc +3 -3
  38. package/src/encoder_vbmi2.cc +6 -7
  39. package/src/platform.cc +24 -23
  40. package/src/yencode.cc +54 -11
  41. package/test/_speedbase.js +4 -2
  42. package/test/speeddec.js +25 -16
  43. package/test/speedenc.js +21 -17
  44. package/test/testcrc.js +17 -1
  45. package/test/testcrcfuncs.c +53 -0
  46. package/test/testdec.js +1 -0
@@ -0,0 +1,251 @@
1
+ #include "crc_common.h"
2
+
3
+ #if defined(__riscv) && defined(__GNUC__) && (defined(__riscv_zbkc) || defined(__riscv_zbc))
4
+
5
+ #if __has_include(<riscv_bitmanip.h>)
6
+ # include <riscv_bitmanip.h>
7
+ # if __riscv_xlen == 64
8
+ # define rv_clmul __riscv_clmul_64
9
+ # define rv_clmulh __riscv_clmulh_64
10
+ # else
11
+ # define rv_clmul __riscv_clmul_32
12
+ # define rv_clmulh __riscv_clmulh_32
13
+ # endif
14
+ #else
15
+ static HEDLEY_ALWAYS_INLINE uintptr_t rv_clmul(uintptr_t x, uintptr_t y) {
16
+ uintptr_t r;
17
+ __asm__("clmul %0, %1, %2\n"
18
+ : "=r"(r)
19
+ : "r"(x), "r"(y)
20
+ :);
21
+ return r;
22
+ }
23
+ static HEDLEY_ALWAYS_INLINE uintptr_t rv_clmulh(uintptr_t x, uintptr_t y) {
24
+ uintptr_t r;
25
+ __asm__("clmulh %0, %1, %2\n"
26
+ : "=r"(r)
27
+ : "r"(x), "r"(y)
28
+ :);
29
+ return r;
30
+ }
31
+ #endif
32
+
33
+ // TODO: test big-endian
34
+ #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
35
+ # if __riscv_xlen == 64
36
+ # define SWAP __builtin_bswap64
37
+ # else
38
+ # define SWAP __builtin_bswap32
39
+ # endif
40
+ #else
41
+ # define SWAP(d) (d)
42
+ #endif
43
+ static HEDLEY_ALWAYS_INLINE uintptr_t read_partial(const void* p, unsigned sz) {
44
+ uintptr_t data = 0;
45
+ memcpy(&data, p, sz);
46
+ return SWAP(data);
47
+ }
48
+ static HEDLEY_ALWAYS_INLINE uintptr_t read_full(const uintptr_t* p) {
49
+ return SWAP(*p);
50
+ }
51
+ #undef SWAP
52
+
53
// Core CRC32 routine built on the carry-less multiply helpers (rv_clmul / rv_clmulh).
//
//   crc - starting CRC register value (the caller in this file passes the inverted
//         initial CRC and inverts the result again)
//   src - input bytes
//   len - number of input bytes (signed: the main loop relies on it going negative)
//
// Four machine words of running state are kept in accum[0..3]. Input is folded in
// four words at a time, then leftover words/bytes are shifted in, and finally the
// four words are folded down to one and reduced to a 32-bit value.
static uint32_t rv_crc_calc(uint32_t crc, const unsigned char *src, long len) {
	uintptr_t accum[4] = {};
	
	// note: constants here are bit-reflected and shifted left by 1
	// Zbc does also have clmulr to avoid the shift, but:
	// - there's no clmulhr, so for XLEN=64, just shift the constant instead to get the same result
	// - it's unavailable in Zbkc
	// - for XLEN=32, 2x constants is likely worth it to avoid the additional XORs in the loop
	
#if __riscv_xlen == 64
	const uint64_t MUL_HI = 0x15a546366 /*2^224*/, MUL_LO = 0xf1da05aa /*2^288*/;
	#define CLMULL rv_clmul
	#define CLMULH rv_clmulh
	
	// seed the state with the incoming CRC (pre-multiplied by the 2^-32 constant)
	accum[3] = rv_clmul(crc, 0xb66b1fa6); // 2^-32
#elif __riscv_xlen == 32
	// the folding constants do not fit in 32 bits: CLMULL/CLMULH multiply by the low
	// 32 bits, and CLMULH additionally XORs in x when the constant exceeds 32 bits
	const uint64_t MUL_HI = 0x140d44a2e /*2^128*/, MUL_LO = 0x1751997d0 /*2^160*/;
	#define CLMULL(x, k) rv_clmul(x, k & 0xffffffff)
	#define CLMULH(x, k) (rv_clmulh(x, k & 0xffffffff) ^ (k > 0xffffffffULL ? (x) : 0))
	
	// seed the state: the product of crc and the constant spans two 32-bit words
	accum[2] = rv_clmul(crc, 0xb66b1fa6);
	accum[3] = rv_clmulh(crc, 0xb66b1fa6);
#else
	#error "Unknown __riscv_xlen"
#endif
	const size_t WS = sizeof(uintptr_t); // word size in bytes
	
	// if src isn't word-aligned, process until it is so
	long initial_alignment = ((uintptr_t)src & (WS-1));
	long initial_process = WS - initial_alignment;
	if(initial_alignment && len >= initial_process) {
		// shift the seeded state towards lower-indexed words by initial_process bytes,
		// and place the leading input bytes into the vacated top of accum[3]
		unsigned shl = initial_alignment * 8, shr = initial_process * 8;
#if __riscv_xlen == 64
		accum[2] = accum[3] << shl;
#else
		accum[1] = accum[2] << shl;
		accum[2] = (accum[3] << shl) | (accum[2] >> shr);
#endif
		accum[3] = (read_partial(src, initial_process) << shl) | (accum[3] >> shr);
		src += initial_process;
		len -= initial_process;
	}
	
	// main processing loop
	// each iteration folds the two word pairs forward and XORs in four new words;
	// len is decremented first, so it is negative once the loop exits
	const uintptr_t* srcW = (const uintptr_t*)src;
	while((len -= WS*4) >= 0) {
		uintptr_t tmpHi, tmpLo;
		tmpLo = CLMULL(accum[0], MUL_LO) ^ CLMULL(accum[1], MUL_HI);
		tmpHi = CLMULH(accum[0], MUL_LO) ^ CLMULH(accum[1], MUL_HI);
		accum[0] = tmpLo ^ read_full(srcW++);
		accum[1] = tmpHi ^ read_full(srcW++);
		
		tmpLo = CLMULL(accum[2], MUL_LO) ^ CLMULL(accum[3], MUL_HI);
		tmpHi = CLMULH(accum[2], MUL_LO) ^ CLMULH(accum[3], MUL_HI);
		accum[2] = tmpLo ^ read_full(srcW++);
		accum[3] = tmpHi ^ read_full(srcW++);
	}
	
	// process trailing bytes
	// the tests below look at individual low bits of the (now negative) len
	if(len & (WS*2)) {
		// two more whole words: fold the oldest pair, slide the state down by two words
		uintptr_t tmpLo = CLMULL(accum[0], MUL_LO) ^ CLMULL(accum[1], MUL_HI);
		uintptr_t tmpHi = CLMULH(accum[0], MUL_LO) ^ CLMULH(accum[1], MUL_HI);
		accum[0] = accum[2];
		accum[1] = accum[3];
		accum[2] = tmpLo ^ read_full(srcW++);
		accum[3] = tmpHi ^ read_full(srcW++);
	}
	if(len & WS) {
		// one more whole word: fold the oldest word, slide the state down by one word
		uintptr_t tmpLo = CLMULL(accum[0], MUL_HI);
		uintptr_t tmpHi = CLMULH(accum[0], MUL_HI);
		accum[0] = accum[1];
		accum[1] = accum[2];
		accum[2] = accum[3] ^ tmpLo;
		accum[3] = tmpHi ^ read_full(srcW++);
	}
	
	size_t tail = len & (WS-1);
	if(tail) {
		// fewer than WS bytes remain: slide the whole state down by `tail` bytes,
		// fold the bytes shifted out of accum[0], and append the final partial word
		unsigned shl = ((WS - tail) * 8), shr = tail * 8;
		uintptr_t tmp = accum[0] << shl;
		uintptr_t tmpLo = CLMULL(tmp, MUL_HI);
		uintptr_t tmpHi = CLMULH(tmp, MUL_HI);
		accum[0] = (accum[0] >> shr) | (accum[1] << shl);
		accum[1] = (accum[1] >> shr) | (accum[2] << shl);
		accum[2] = (accum[2] >> shr) | (accum[3] << shl);
		accum[3] = (accum[3] >> shr) | (read_partial(srcW, tail) << shl);
		accum[2] ^= tmpLo;
		accum[3] ^= tmpHi;
	}
	
	
	// done processing: fold everything down
#if __riscv_xlen == 64
	// fold 0,1 -> 2,3
	accum[2] ^= rv_clmul(accum[0], 0x1751997d0) ^ rv_clmul(accum[1], 0xccaa009e);
	accum[3] ^= rv_clmulh(accum[0], 0x1751997d0) ^ rv_clmulh(accum[1], 0xccaa009e);
	
	// fold 2->3
	accum[0] = rv_clmulh(accum[2], 0xccaa009e);
	accum[3] ^= rv_clmul(accum[2], 0xccaa009e);
	
	// fold 64b->32b
	accum[1] = rv_clmul(accum[3] & 0xffffffff, 0x163cd6124);
	accum[0] ^= accum[1] >> 32;
	accum[3] = accum[1] ^ (accum[3] >> 32);
	accum[3] <<= 32;
#else
	// fold 0,1 -> 2,3
	accum[2] ^= rv_clmul(accum[0], 0xccaa009e) ^ CLMULL(accum[1], 0x163cd6124);
	accum[3] ^= rv_clmulh(accum[0], 0xccaa009e) ^ CLMULH(accum[1], 0x163cd6124);
	
	// fold 2->3
	accum[0] = CLMULH(accum[2], 0x163cd6124);
	accum[3] ^= CLMULL(accum[2], 0x163cd6124);
#endif
	
	// reduction
	// two multiplies bring accum[3] down to a value that is combined with accum[0];
	// the assignment to the 32-bit `crc` keeps only the low 32 bits
	accum[3] = CLMULL(accum[3], 0xf7011641);
	accum[3] = CLMULH(accum[3], 0x1db710640); // maybe consider clmulr for XLEN=32
	crc = accum[0] ^ accum[3];
	return crc;
	#undef CLMULL
	#undef CLMULH
}
177
+
178
+ static uint32_t do_crc32_incremental_rv_zbc(const void* data, size_t length, uint32_t init) {
179
+ return ~rv_crc_calc(~init, (const unsigned char*)data, (long)length);
180
+ }
181
+
182
+
183
+ #if __riscv_xlen == 64
184
+ // note that prod is shifted by 1 place to the right, due to bit-reflection
185
+ static uint32_t crc32_reduce_rv_zbc(uint64_t prod) {
186
+ uint64_t t = rv_clmul(prod << 33, 0xf7011641);
187
+ t = rv_clmulh(t, 0x1db710640);
188
+ t ^= prod >> 31;
189
+ return t;
190
+ }
191
+ #endif
192
+ static uint32_t crc32_multiply_rv_zbc(uint32_t a, uint32_t b) {
193
+ #if __riscv_xlen == 64
194
+ uint64_t t = crc32_reduce_rv_zbc(rv_clmul(a, b));
195
+ #else
196
+ uint32_t prodLo = rv_clmul(a, b);
197
+ uint32_t prodHi = rv_clmulh(a, b);
198
+
199
+ // fix prodHi for bit-reflection (clmulr would be ideal here)
200
+ prodHi += prodHi;
201
+ prodHi |= prodLo >> 31;
202
+ prodLo += prodLo;
203
+
204
+ uint32_t t = rv_clmul(prodLo, 0xf7011641);
205
+ t ^= rv_clmulh(t, 0xdb710640);
206
+ t ^= prodHi;
207
+ #endif
208
+ return t;
209
+ }
210
+
211
+ #if defined(__GNUC__) || defined(_MSC_VER)
212
+ static uint32_t crc32_shift_rv_zbc(uint32_t crc1, uint32_t n) {
213
+ // TODO: require Zbb for ctz
214
+ uint32_t result = crc1;
215
+ #if __riscv_xlen == 64
216
+ // for n<32, can shift directly
217
+ uint64_t prod = result;
218
+ prod <<= 31 ^ (n&31);
219
+ n &= ~31;
220
+ result = crc32_reduce_rv_zbc(prod);
221
+ #endif
222
+ if(!n) return result;
223
+
224
+ uint32_t result2 = RapidYenc::crc_power[ctz32(n)];
225
+ n &= n-1;
226
+
227
+ while(n) {
228
+ result = crc32_multiply_rv_zbc(result, RapidYenc::crc_power[ctz32(n)]);
229
+ n &= n-1;
230
+
231
+ if(n) {
232
+ result2 = crc32_multiply_rv_zbc(result2, RapidYenc::crc_power[ctz32(n)]);
233
+ n &= n-1;
234
+ }
235
+ }
236
+ return crc32_multiply_rv_zbc(result, result2);
237
+ }
238
+ #endif
239
+
240
+
241
+ void RapidYenc::crc_riscv_set_funcs() {
242
+ _do_crc32_incremental = &do_crc32_incremental_rv_zbc;
243
+ _crc32_multiply = &crc32_multiply_rv_zbc;
244
+ #if defined(__GNUC__) || defined(_MSC_VER)
245
+ _crc32_shift = &crc32_shift_rv_zbc;
246
+ #endif
247
+ _crc32_isa = ISA_FEATURE_ZBC;
248
+ }
249
+ #else
250
// Stub for builds where the RISC-V carry-less multiply path is compiled out:
// does nothing, leaving the CRC function pointers as they were.
void RapidYenc::crc_riscv_set_funcs() {}
251
+ #endif
package/src/decoder.cc CHANGED
@@ -3,29 +3,359 @@
3
3
  #include "decoder_common.h"
4
4
  #include "decoder.h"
5
5
 
6
- extern "C" {
6
+
7
+
8
+ // TODO: add branch probabilities
9
+
10
+
11
+ // state var: refers to the previous state - only used for incremental processing
12
+ template<bool isRaw>
13
+ static size_t do_decode_noend_scalar(const unsigned char* src, unsigned char* dest, size_t len, RapidYenc::YencDecoderState* state) {
14
+ using namespace RapidYenc;
15
+
16
+ const unsigned char *es = src + len; // end source pointer
17
+ unsigned char *p = dest; // destination pointer
18
+ long i = -(long)len; // input position
19
+ unsigned char c; // input character
20
+
21
+ if(len < 1) return 0;
22
+
23
+ if(isRaw) {
24
+
25
+ if(state) switch(*state) {
26
+ case YDEC_STATE_EQ:
27
+ c = es[i];
28
+ *p++ = c - 42 - 64;
29
+ i++;
30
+ if(c == '\r') {
31
+ *state = YDEC_STATE_CR;
32
+ if(i >= 0) return 0;
33
+ } else {
34
+ *state = YDEC_STATE_NONE;
35
+ break;
36
+ }
37
+ // fall-thru
38
+ case YDEC_STATE_CR:
39
+ if(es[i] != '\n') break;
40
+ i++;
41
+ *state = YDEC_STATE_CRLF;
42
+ if(i >= 0) return 0;
43
+ // Else fall-thru
44
+ case YDEC_STATE_CRLF:
45
+ // skip past first dot
46
+ if(es[i] == '.') i++;
47
+ // fall-thru
48
+ default: break; // silence compiler warnings
49
+ } else // treat as YDEC_STATE_CRLF
50
+ if(es[i] == '.') i++;
51
+
52
+ for(; i < -2; i++) {
53
+ c = es[i];
54
+ switch(c) {
55
+ case '\r':
56
+ // skip past \r\n. sequences
57
+ //i += (es[i+1] == '\n' && es[i+2] == '.') << 1;
58
+ if(es[i+1] == '\n' && es[i+2] == '.')
59
+ i += 2;
60
+ // fall-thru
61
+ case '\n':
62
+ continue;
63
+ case '=':
64
+ c = es[i+1];
65
+ *p++ = c - 42 - 64;
66
+ i += (c != '\r'); // if we have a \r, reprocess character to deal with \r\n. case
67
+ continue;
68
+ default:
69
+ *p++ = c - 42;
70
+ }
71
+ }
72
+ if(state) *state = YDEC_STATE_NONE;
73
+
74
+ if(i == -2) { // 2nd last char
75
+ c = es[i];
76
+ switch(c) {
77
+ case '\r':
78
+ if(state && es[i+1] == '\n') {
79
+ *state = YDEC_STATE_CRLF;
80
+ return p - dest;
81
+ }
82
+ // Else fall-thru
83
+ case '\n':
84
+ break;
85
+ case '=':
86
+ c = es[i+1];
87
+ *p++ = c - 42 - 64;
88
+ i += (c != '\r');
89
+ break;
90
+ default:
91
+ *p++ = c - 42;
92
+ }
93
+ i++;
94
+ }
95
+
96
+ // do final char; we process this separately to prevent an overflow if the final char is '='
97
+ if(i == -1) {
98
+ c = es[i];
99
+ if(c != '\n' && c != '\r' && c != '=') {
100
+ *p++ = c - 42;
101
+ } else if(state) {
102
+ if(c == '=') *state = YDEC_STATE_EQ;
103
+ else if(c == '\r') *state = YDEC_STATE_CR;
104
+ else *state = YDEC_STATE_NONE;
105
+ }
106
+ }
107
+
108
+ } else {
109
+
110
+ if(state && *state == YDEC_STATE_EQ) {
111
+ *p++ = es[i] - 42 - 64;
112
+ i++;
113
+ *state = YDEC_STATE_NONE;
114
+ }
115
+
116
+ /*for(i = 0; i < len - 1; i++) {
117
+ c = src[i];
118
+ if(c == '\n' || c == '\r') continue;
119
+ unsigned char isEquals = (c == '=');
120
+ i += isEquals;
121
+ *p++ = src[i] - (42 + (isEquals << 6));
122
+ }*/
123
+ for(; i < -1; i++) {
124
+ c = es[i];
125
+ switch(c) {
126
+ case '\n': case '\r': continue;
127
+ case '=':
128
+ i++;
129
+ c = es[i] - 64;
130
+ }
131
+ *p++ = c - 42;
132
+ }
133
+ if(state) *state = YDEC_STATE_NONE;
134
+ // do final char; we process this separately to prevent an overflow if the final char is '='
135
+ if(i == -1) {
136
+ c = es[i];
137
+ if(c != '\n' && c != '\r' && c != '=') {
138
+ *p++ = c - 42;
139
+ } else
140
+ if(state) *state = (c == '=' ? YDEC_STATE_EQ : YDEC_STATE_NONE);
141
+ }
142
+
143
+ }
144
+
145
+ return p - dest;
146
+ }
147
+
148
// Scalar yEnc decoder which also watches for end markers.
//
//   isRaw - when true, "\r\n." dot-stuffing is handled and "\r\n.\r\n" ends the article
//   src   - in/out: input pointer; on return points just past the consumed input
//   dest  - in/out: output pointer; on return points just past the written output
//   len   - number of input bytes available at *src
//   state - in/out decoder state carried between calls
//
// Returns YDEC_END_CONTROL when "=y" is found at the start of a line,
// YDEC_END_ARTICLE when "\r\n.\r\n" is found in raw mode, and YDEC_END_NONE when
// all `len` bytes were consumed without hitting either.
//
// NOTE(review): YDEC_CHECK_END and several branches below assign through `state`
// without a null check, even though other places test `if(state)` - confirm that
// callers always pass a non-null state.
template<bool isRaw>
static RapidYenc::YencDecoderEnd do_decode_end_scalar(const unsigned char** src, unsigned char** dest, size_t len, RapidYenc::YencDecoderState* state) {
	using namespace RapidYenc;
	
	const unsigned char *es = (*src) + len; // end source pointer
	unsigned char *p = *dest; // destination pointer
	long i = -(long)len; // input position
	unsigned char c; // input character
	
	if(len < 1) return YDEC_END_NONE;
	
	// if the input has been fully consumed, record state `s`, publish the pointers and return
#define YDEC_CHECK_END(s) if(i == 0) { \
	*state = s; \
	*src = es; \
	*dest = p; \
	return YDEC_END_NONE; \
}
	// resume from wherever the previous call stopped; cases deliberately fall through
	// into each other, and the labels allow later code to jump back into the chain
	if(state) switch(*state) {
		case YDEC_STATE_CRLFEQ: do_decode_endable_scalar_ceq:
			// "\r\n=" seen: a following 'y' marks a control line
			if(es[i] == 'y') {
				*state = YDEC_STATE_NONE;
				*src = es+i+1;
				*dest = p;
				return YDEC_END_CONTROL;
			} // Else fall-thru
		case YDEC_STATE_EQ:
			// '=' seen: this byte is an escaped character
			c = es[i];
			*p++ = c - 42 - 64;
			i++;
			if(c != '\r') break;
			YDEC_CHECK_END(YDEC_STATE_CR)
			// fall-through
		case YDEC_STATE_CR:
			if(es[i] != '\n') break;
			i++;
			YDEC_CHECK_END(YDEC_STATE_CRLF)
			// fall-through
		case YDEC_STATE_CRLF: do_decode_endable_scalar_c0:
			if(es[i] == '.' && isRaw) {
				i++;
				YDEC_CHECK_END(YDEC_STATE_CRLFDT)
			} else if(es[i] == '=') {
				i++;
				YDEC_CHECK_END(YDEC_STATE_CRLFEQ)
				goto do_decode_endable_scalar_ceq;
			} else
				break;
			// fall-through
		case YDEC_STATE_CRLFDT:
			if(isRaw && es[i] == '\r') {
				i++;
				YDEC_CHECK_END(YDEC_STATE_CRLFDTCR)
			} else if(isRaw && es[i] == '=') { // check for dot-stuffed ending: \r\n.=y
				i++;
				YDEC_CHECK_END(YDEC_STATE_CRLFEQ)
				goto do_decode_endable_scalar_ceq;
			} else
				break;
			// fall-through
		case YDEC_STATE_CRLFDTCR:
			if(es[i] == '\n') {
				if(isRaw) {
					// "\r\n.\r\n": end of article
					*state = YDEC_STATE_CRLF;
					*src = es + i + 1;
					*dest = p;
					return YDEC_END_ARTICLE;
				} else {
					i++;
					YDEC_CHECK_END(YDEC_STATE_CRLF)
					goto do_decode_endable_scalar_c0; // handle as CRLF
				}
			} else
				break;
		case YDEC_STATE_NONE: break; // silence compiler warning
	} else // treat as YDEC_STATE_CRLF
		goto do_decode_endable_scalar_c0;
	
	// main loop stops 2 bytes short so that es[i+1] and es[i+2] are always in bounds
	for(; i < -2; i++) {
		c = es[i];
		switch(c) {
			case '\r': if(es[i+1] == '\n') {
				if(isRaw && es[i+2] == '.') {
					// skip past \r\n. sequences
					i += 3;
					YDEC_CHECK_END(YDEC_STATE_CRLFDT)
					// check for end
					if(es[i] == '\r') {
						i++;
						YDEC_CHECK_END(YDEC_STATE_CRLFDTCR)
						if(es[i] == '\n') {
							*src = es + i + 1;
							*dest = p;
							*state = YDEC_STATE_CRLF;
							return YDEC_END_ARTICLE;
						} else i--;
					} else if(es[i] == '=') {
						i++;
						YDEC_CHECK_END(YDEC_STATE_CRLFEQ)
						if(es[i] == 'y') {
							*src = es + i + 1;
							*dest = p;
							*state = YDEC_STATE_NONE;
							return YDEC_END_CONTROL;
						} else {
							// escape char & continue
							c = es[i];
							*p++ = c - 42 - 64;
							i -= (c == '\r');
						}
					} else i--;
				}
				else if(es[i+2] == '=') {
					i += 3;
					YDEC_CHECK_END(YDEC_STATE_CRLFEQ)
					if(es[i] == 'y') {
						// ended
						*src = es + i + 1;
						*dest = p;
						*state = YDEC_STATE_NONE;
						return YDEC_END_CONTROL;
					} else {
						// escape char & continue
						c = es[i];
						*p++ = c - 42 - 64;
						i -= (c == '\r');
					}
				}
			} // fall-thru
			case '\n':
				continue;
			case '=':
				c = es[i+1];
				*p++ = c - 42 - 64;
				i += (c != '\r'); // if we have a \r, reprocess character to deal with \r\n. case
				continue;
			default:
				*p++ = c - 42;
		}
	}
	if(state) *state = YDEC_STATE_NONE;
	
	if(i == -2) { // 2nd last char
		c = es[i];
		switch(c) {
			case '\r':
				if(state && es[i+1] == '\n') {
					// chunk ends exactly on "\r\n": remember that for the next call
					*state = YDEC_STATE_CRLF;
					*src = es;
					*dest = p;
					return YDEC_END_NONE;
				}
				// Else fall-thru
			case '\n':
				break;
			case '=':
				c = es[i+1];
				*p++ = c - 42 - 64;
				i += (c != '\r');
				break;
			default:
				*p++ = c - 42;
		}
		i++;
	}
	
	// do final char; we process this separately to prevent an overflow if the final char is '='
	if(i == -1) {
		c = es[i];
		if(c != '\n' && c != '\r' && c != '=') {
			*p++ = c - 42;
		} else if(state) {
			if(c == '=') *state = YDEC_STATE_EQ;
			else if(c == '\r') *state = YDEC_STATE_CR;
			else *state = YDEC_STATE_NONE;
		}
	}
	#undef YDEC_CHECK_END
	
	// everything consumed without finding an end marker
	*src = es;
	*dest = p;
	return YDEC_END_NONE;
}
330
+
331
+ template<bool isRaw, bool searchEnd>
332
+ RapidYenc::YencDecoderEnd RapidYenc::do_decode_scalar(const unsigned char** src, unsigned char** dest, size_t len, RapidYenc::YencDecoderState* state) {
333
+ if(searchEnd)
334
+ return do_decode_end_scalar<isRaw>(src, dest, len, state);
335
+ *dest += do_decode_noend_scalar<isRaw>(*src, *dest, len, state);
336
+ *src += len;
337
+ return YDEC_END_NONE;
338
+ }
339
+
340
+
341
namespace RapidYenc {
	// Decoder dispatch pointers, initialised to the scalar implementations above.
	// decoder_set_native_funcs() in this file reassigns them to SIMD variants.
	YencDecoderEnd (*_do_decode)(const unsigned char**, unsigned char**, size_t, YencDecoderState*) = &do_decode_scalar<false, false>;
	YencDecoderEnd (*_do_decode_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*) = &do_decode_scalar<true, false>;
	YencDecoderEnd (*_do_decode_end_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*) = &do_decode_end_scalar<true>;
	
	// ISA level of the currently selected decoder; generic until a setter changes it
	int _decode_isa = ISA_GENERIC;
	
	// explicit instantiation of the raw + end-searching scalar decoder
	// (presumably so that code outside this file can link against it - TODO confirm)
	template YencDecoderEnd do_decode_scalar<true, true>(const unsigned char**, unsigned char**, size_t, YencDecoderState*);
}
13
350
 
14
- void decoder_set_sse2_funcs();
15
- void decoder_set_ssse3_funcs();
16
- void decoder_set_avx_funcs();
17
- void decoder_set_avx2_funcs();
18
- void decoder_set_vbmi2_funcs();
19
- extern const bool decoder_has_avx10;
20
- void decoder_set_neon_funcs();
21
-
22
351
 
23
352
  #if defined(PLATFORM_X86) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
24
353
  # if defined(__AVX2__) && !defined(YENC_DISABLE_AVX256)
25
354
  # include "decoder_avx2_base.h"
26
355
  static inline void decoder_set_native_funcs() {
27
356
  ALIGN_ALLOC(lookups, sizeof(*lookups), 16);
28
- decoder_init_lut(lookups->eqFix, lookups->compact);
357
+ using namespace RapidYenc;
358
+ decoder_init_lut(lookups->compact);
29
359
  _do_decode = &do_decode_simd<false, false, sizeof(__m256i)*2, do_decode_avx2<false, false, ISA_NATIVE> >;
30
360
  _do_decode_raw = &do_decode_simd<true, false, sizeof(__m256i)*2, do_decode_avx2<true, false, ISA_NATIVE> >;
31
361
  _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m256i)*2, do_decode_avx2<true, true, ISA_NATIVE> >;
@@ -34,8 +364,9 @@ static inline void decoder_set_native_funcs() {
34
364
  # else
35
365
  # include "decoder_sse_base.h"
36
366
  static inline void decoder_set_native_funcs() {
37
- decoder_sse_init();
38
- decoder_init_lut(lookups->eqFix, lookups->compact);
367
+ using namespace RapidYenc;
368
+ decoder_sse_init(lookups);
369
+ decoder_init_lut(lookups->compact);
39
370
  _do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_NATIVE> >;
40
371
  _do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_NATIVE> >;
41
372
  _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_NATIVE> >;
@@ -44,7 +375,32 @@ static inline void decoder_set_native_funcs() {
44
375
  # endif
45
376
  #endif
46
377
 
47
- void decoder_init() {
378
+
379
+ #if defined(PLATFORM_X86) || defined(PLATFORM_ARM)
380
+ void RapidYenc::decoder_init_lut(void* compactLUT) {
381
+ #ifdef YENC_DEC_USE_THINTABLE
382
+ const int tableSize = 8;
383
+ #else
384
+ const int tableSize = 16;
385
+ #endif
386
+ for(int i=0; i<(tableSize==8?256:32768); i++) {
387
+ int k = i;
388
+ uint8_t* res = (uint8_t*)compactLUT + i*tableSize;
389
+ int p = 0;
390
+ for(int j=0; j<tableSize; j++) {
391
+ if(!(k & 1)) {
392
+ res[p++] = j;
393
+ }
394
+ k >>= 1;
395
+ }
396
+ for(; p<tableSize; p++)
397
+ res[p] = 0x80;
398
+ }
399
+ }
400
+ #endif
401
+
402
+
403
+ void RapidYenc::decoder_init() {
48
404
  #ifdef PLATFORM_X86
49
405
  # if defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
50
406
  decoder_set_native_funcs();
@@ -66,4 +422,8 @@ void decoder_init() {
66
422
  if(cpu_supports_neon())
67
423
  decoder_set_neon_funcs();
68
424
  #endif
425
+ #ifdef __riscv
426
+ if(cpu_supports_rvv())
427
+ decoder_set_rvv_funcs();
428
+ #endif
69
429
  }