yencode 1.2.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,342 +1,32 @@
1
1
  #include "decoder.h"
2
2
 
3
+ namespace RapidYenc {
4
+ void decoder_set_sse2_funcs();
5
+ void decoder_set_ssse3_funcs();
6
+ void decoder_set_avx_funcs();
7
+ void decoder_set_avx2_funcs();
8
+ void decoder_set_vbmi2_funcs();
9
+ extern const bool decoder_has_avx10;
10
+ void decoder_set_neon_funcs();
11
+ void decoder_set_rvv_funcs();
12
+
13
+ template<bool isRaw, bool searchEnd>
14
+ YencDecoderEnd do_decode_scalar(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state);
15
+ }
16
+
17
+
3
18
  #if defined(PLATFORM_ARM) && !defined(__aarch64__)
4
19
  #define YENC_DEC_USE_THINTABLE 1
5
20
  #endif
6
21
 
7
22
  // TODO: need to support max output length somehow
8
- // TODO: add branch probabilities
9
-
10
-
11
- // state var: refers to the previous state - only used for incremental processing
12
- template<bool isRaw>
13
- size_t do_decode_noend_scalar(const unsigned char* src, unsigned char* dest, size_t len, YencDecoderState* state) {
14
- const unsigned char *es = src + len; // end source pointer
15
- unsigned char *p = dest; // destination pointer
16
- long i = -(long)len; // input position
17
- unsigned char c; // input character
18
-
19
- if(len < 1) return 0;
20
-
21
- if(isRaw) {
22
-
23
- if(state) switch(*state) {
24
- case YDEC_STATE_EQ:
25
- c = es[i];
26
- *p++ = c - 42 - 64;
27
- i++;
28
- if(c == '\r') {
29
- *state = YDEC_STATE_CR;
30
- if(i >= 0) return 0;
31
- } else {
32
- *state = YDEC_STATE_NONE;
33
- break;
34
- }
35
- // fall-thru
36
- case YDEC_STATE_CR:
37
- if(es[i] != '\n') break;
38
- i++;
39
- *state = YDEC_STATE_CRLF;
40
- if(i >= 0) return 0;
41
- // Else fall-thru
42
- case YDEC_STATE_CRLF:
43
- // skip past first dot
44
- if(es[i] == '.') i++;
45
- // fall-thru
46
- default: break; // silence compiler warnings
47
- } else // treat as YDEC_STATE_CRLF
48
- if(es[i] == '.') i++;
49
-
50
- for(; i < -2; i++) {
51
- c = es[i];
52
- switch(c) {
53
- case '\r':
54
- // skip past \r\n. sequences
55
- //i += (es[i+1] == '\n' && es[i+2] == '.') << 1;
56
- if(es[i+1] == '\n' && es[i+2] == '.')
57
- i += 2;
58
- // fall-thru
59
- case '\n':
60
- continue;
61
- case '=':
62
- c = es[i+1];
63
- *p++ = c - 42 - 64;
64
- i += (c != '\r'); // if we have a \r, reprocess character to deal with \r\n. case
65
- continue;
66
- default:
67
- *p++ = c - 42;
68
- }
69
- }
70
- if(state) *state = YDEC_STATE_NONE;
71
-
72
- if(i == -2) { // 2nd last char
73
- c = es[i];
74
- switch(c) {
75
- case '\r':
76
- if(state && es[i+1] == '\n') {
77
- *state = YDEC_STATE_CRLF;
78
- return p - dest;
79
- }
80
- // Else fall-thru
81
- case '\n':
82
- break;
83
- case '=':
84
- c = es[i+1];
85
- *p++ = c - 42 - 64;
86
- i += (c != '\r');
87
- break;
88
- default:
89
- *p++ = c - 42;
90
- }
91
- i++;
92
- }
93
-
94
- // do final char; we process this separately to prevent an overflow if the final char is '='
95
- if(i == -1) {
96
- c = es[i];
97
- if(c != '\n' && c != '\r' && c != '=') {
98
- *p++ = c - 42;
99
- } else if(state) {
100
- if(c == '=') *state = YDEC_STATE_EQ;
101
- else if(c == '\r') *state = YDEC_STATE_CR;
102
- else *state = YDEC_STATE_NONE;
103
- }
104
- }
105
-
106
- } else {
107
-
108
- if(state && *state == YDEC_STATE_EQ) {
109
- *p++ = es[i] - 42 - 64;
110
- i++;
111
- *state = YDEC_STATE_NONE;
112
- }
113
-
114
- /*for(i = 0; i < len - 1; i++) {
115
- c = src[i];
116
- if(c == '\n' || c == '\r') continue;
117
- unsigned char isEquals = (c == '=');
118
- i += isEquals;
119
- *p++ = src[i] - (42 + (isEquals << 6));
120
- }*/
121
- for(; i < -1; i++) {
122
- c = es[i];
123
- switch(c) {
124
- case '\n': case '\r': continue;
125
- case '=':
126
- i++;
127
- c = es[i] - 64;
128
- }
129
- *p++ = c - 42;
130
- }
131
- if(state) *state = YDEC_STATE_NONE;
132
- // do final char; we process this separately to prevent an overflow if the final char is '='
133
- if(i == -1) {
134
- c = es[i];
135
- if(c != '\n' && c != '\r' && c != '=') {
136
- *p++ = c - 42;
137
- } else
138
- if(state) *state = (c == '=' ? YDEC_STATE_EQ : YDEC_STATE_NONE);
139
- }
140
-
141
- }
142
-
143
- return p - dest;
144
- }
145
-
146
- template<bool isRaw>
147
- YencDecoderEnd do_decode_end_scalar(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
148
- const unsigned char *es = (*src) + len; // end source pointer
149
- unsigned char *p = *dest; // destination pointer
150
- long i = -(long)len; // input position
151
- unsigned char c; // input character
152
-
153
- if(len < 1) return YDEC_END_NONE;
154
-
155
- #define YDEC_CHECK_END(s) if(i == 0) { \
156
- *state = s; \
157
- *src = es; \
158
- *dest = p; \
159
- return YDEC_END_NONE; \
160
- }
161
- if(state) switch(*state) {
162
- case YDEC_STATE_CRLFEQ: do_decode_endable_scalar_ceq:
163
- if(es[i] == 'y') {
164
- *state = YDEC_STATE_NONE;
165
- *src = es+i+1;
166
- *dest = p;
167
- return YDEC_END_CONTROL;
168
- } // Else fall-thru
169
- case YDEC_STATE_EQ:
170
- c = es[i];
171
- *p++ = c - 42 - 64;
172
- i++;
173
- if(c != '\r') break;
174
- YDEC_CHECK_END(YDEC_STATE_CR)
175
- // fall-through
176
- case YDEC_STATE_CR:
177
- if(es[i] != '\n') break;
178
- i++;
179
- YDEC_CHECK_END(YDEC_STATE_CRLF)
180
- // fall-through
181
- case YDEC_STATE_CRLF: do_decode_endable_scalar_c0:
182
- if(es[i] == '.' && isRaw) {
183
- i++;
184
- YDEC_CHECK_END(YDEC_STATE_CRLFDT)
185
- } else if(es[i] == '=') {
186
- i++;
187
- YDEC_CHECK_END(YDEC_STATE_CRLFEQ)
188
- goto do_decode_endable_scalar_ceq;
189
- } else
190
- break;
191
- // fall-through
192
- case YDEC_STATE_CRLFDT:
193
- if(isRaw && es[i] == '\r') {
194
- i++;
195
- YDEC_CHECK_END(YDEC_STATE_CRLFDTCR)
196
- } else if(isRaw && es[i] == '=') { // check for dot-stuffed ending: \r\n.=y
197
- i++;
198
- YDEC_CHECK_END(YDEC_STATE_CRLFEQ)
199
- goto do_decode_endable_scalar_ceq;
200
- } else
201
- break;
202
- // fall-through
203
- case YDEC_STATE_CRLFDTCR:
204
- if(es[i] == '\n') {
205
- if(isRaw) {
206
- *state = YDEC_STATE_CRLF;
207
- *src = es + i + 1;
208
- *dest = p;
209
- return YDEC_END_ARTICLE;
210
- } else {
211
- i++;
212
- YDEC_CHECK_END(YDEC_STATE_CRLF)
213
- goto do_decode_endable_scalar_c0; // handle as CRLF
214
- }
215
- } else
216
- break;
217
- case YDEC_STATE_NONE: break; // silence compiler warning
218
- } else // treat as YDEC_STATE_CRLF
219
- goto do_decode_endable_scalar_c0;
220
-
221
- for(; i < -2; i++) {
222
- c = es[i];
223
- switch(c) {
224
- case '\r': if(es[i+1] == '\n') {
225
- if(isRaw && es[i+2] == '.') {
226
- // skip past \r\n. sequences
227
- i += 3;
228
- YDEC_CHECK_END(YDEC_STATE_CRLFDT)
229
- // check for end
230
- if(es[i] == '\r') {
231
- i++;
232
- YDEC_CHECK_END(YDEC_STATE_CRLFDTCR)
233
- if(es[i] == '\n') {
234
- *src = es + i + 1;
235
- *dest = p;
236
- *state = YDEC_STATE_CRLF;
237
- return YDEC_END_ARTICLE;
238
- } else i--;
239
- } else if(es[i] == '=') {
240
- i++;
241
- YDEC_CHECK_END(YDEC_STATE_CRLFEQ)
242
- if(es[i] == 'y') {
243
- *src = es + i + 1;
244
- *dest = p;
245
- *state = YDEC_STATE_NONE;
246
- return YDEC_END_CONTROL;
247
- } else {
248
- // escape char & continue
249
- c = es[i];
250
- *p++ = c - 42 - 64;
251
- i -= (c == '\r');
252
- }
253
- } else i--;
254
- }
255
- else if(es[i+2] == '=') {
256
- i += 3;
257
- YDEC_CHECK_END(YDEC_STATE_CRLFEQ)
258
- if(es[i] == 'y') {
259
- // ended
260
- *src = es + i + 1;
261
- *dest = p;
262
- *state = YDEC_STATE_NONE;
263
- return YDEC_END_CONTROL;
264
- } else {
265
- // escape char & continue
266
- c = es[i];
267
- *p++ = c - 42 - 64;
268
- i -= (c == '\r');
269
- }
270
- }
271
- } // fall-thru
272
- case '\n':
273
- continue;
274
- case '=':
275
- c = es[i+1];
276
- *p++ = c - 42 - 64;
277
- i += (c != '\r'); // if we have a \r, reprocess character to deal with \r\n. case
278
- continue;
279
- default:
280
- *p++ = c - 42;
281
- }
282
- }
283
- if(state) *state = YDEC_STATE_NONE;
284
-
285
- if(i == -2) { // 2nd last char
286
- c = es[i];
287
- switch(c) {
288
- case '\r':
289
- if(state && es[i+1] == '\n') {
290
- *state = YDEC_STATE_CRLF;
291
- *src = es;
292
- *dest = p;
293
- return YDEC_END_NONE;
294
- }
295
- // Else fall-thru
296
- case '\n':
297
- break;
298
- case '=':
299
- c = es[i+1];
300
- *p++ = c - 42 - 64;
301
- i += (c != '\r');
302
- break;
303
- default:
304
- *p++ = c - 42;
305
- }
306
- i++;
307
- }
308
-
309
- // do final char; we process this separately to prevent an overflow if the final char is '='
310
- if(i == -1) {
311
- c = es[i];
312
- if(c != '\n' && c != '\r' && c != '=') {
313
- *p++ = c - 42;
314
- } else if(state) {
315
- if(c == '=') *state = YDEC_STATE_EQ;
316
- else if(c == '\r') *state = YDEC_STATE_CR;
317
- else *state = YDEC_STATE_NONE;
318
- }
319
- }
320
- #undef YDEC_CHECK_END
321
-
322
- *src = es;
323
- *dest = p;
324
- return YDEC_END_NONE;
325
- }
326
-
327
- template<bool isRaw, bool searchEnd>
328
- YencDecoderEnd do_decode_scalar(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
329
- if(searchEnd)
330
- return do_decode_end_scalar<isRaw>(src, dest, len, state);
331
- *dest += do_decode_noend_scalar<isRaw>(*src, *dest, len, state);
332
- *src += len;
333
- return YDEC_END_NONE;
334
- }
335
23
 
336
24
 
337
25
 
338
26
  template<bool isRaw, bool searchEnd, void(&kernel)(const uint8_t*, long&, unsigned char*&, unsigned char&, uint16_t&)>
339
- inline YencDecoderEnd _do_decode_simd(size_t width, const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
27
+ static inline RapidYenc::YencDecoderEnd _do_decode_simd(size_t width, const unsigned char** src, unsigned char** dest, size_t len, RapidYenc::YencDecoderState* state) {
28
+ using namespace RapidYenc;
29
+
340
30
  if(len <= width*2) return do_decode_scalar<isRaw, searchEnd>(src, dest, len, state);
341
31
 
342
32
  YencDecoderState tState = YDEC_STATE_CRLF;
@@ -466,17 +156,19 @@ inline YencDecoderEnd _do_decode_simd(size_t width, const unsigned char** src, u
466
156
  }
467
157
 
468
158
  template<bool isRaw, bool searchEnd, size_t width, void(&kernel)(const uint8_t*, long&, unsigned char*&, unsigned char&, uint16_t&)>
469
- YencDecoderEnd do_decode_simd(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
159
+ static RapidYenc::YencDecoderEnd do_decode_simd(const unsigned char** src, unsigned char** dest, size_t len, RapidYenc::YencDecoderState* state) {
470
160
  return _do_decode_simd<isRaw, searchEnd, kernel>(width, src, dest, len, state);
471
161
  }
472
162
  template<bool isRaw, bool searchEnd, size_t(&getWidth)(), void(&kernel)(const uint8_t*, long&, unsigned char*&, unsigned char&, uint16_t&)>
473
- YencDecoderEnd do_decode_simd(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
163
+ static RapidYenc::YencDecoderEnd do_decode_simd(const unsigned char** src, unsigned char** dest, size_t len, RapidYenc::YencDecoderState* state) {
474
164
  return _do_decode_simd<isRaw, searchEnd, kernel>(getWidth(), src, dest, len, state);
475
165
  }
476
166
 
477
167
 
478
168
  #if defined(PLATFORM_X86) || defined(PLATFORM_ARM)
479
- void decoder_init_lut(void* compactLUT);
169
+ namespace RapidYenc {
170
+ void decoder_init_lut(void* compactLUT);
171
+ }
480
172
  #endif
481
173
 
482
174
  template<bool isRaw>
@@ -509,16 +201,20 @@ static inline uint16_t decoder_set_nextMask(const uint8_t* src, unsigned mask) {
509
201
  // resolve invalid sequences of = to deal with cases like '===='
510
202
  // bit hack inspired from simdjson: https://youtu.be/wlvKAT7SZIQ?t=33m38s
511
203
  template<typename T>
512
- static inline T fix_eqMask(T mask) {
204
+ static inline T fix_eqMask(T mask, T maskShift1) {
513
205
  // isolate the start of each consecutive bit group (e.g. 01011101 -> 01000101)
514
- T start = mask & ~(mask << 1);
206
+ T start = mask & ~maskShift1;
207
+
208
+ // this strategy works by firstly separating groups that start on even/odd bits
209
+ // generally, it doesn't matter which one (even/odd) we pick, but clearing even groups specifically allows the escFirst bit in maskShift1 to work
210
+ // (this is because the start of the escFirst group is at index -1, an odd bit, but we can't clear it due to being < 0, so we just retain all odd groups instead)
515
211
 
516
- const T odd = (T)0xaaaaaaaaaaaaaaaa; // every odd bit (10101010...)
212
+ const T even = (T)0x5555555555555555; // every even bit (01010101...)
517
213
 
518
- // obtain groups which start on an even bit (clear groups that start on an odd bit, but this leaves an unwanted trailing bit)
519
- T evenGroups = mask + (start & odd);
214
+ // obtain groups which start on an odd bit (clear groups that start on an even bit, but this leaves an unwanted trailing bit)
215
+ T oddGroups = mask + (start & even);
520
216
 
521
- // clear odd bits in even groups, whilst conversely preserving odd bits in odd groups
217
+ // clear even bits in odd groups, whilst conversely preserving even bits in even groups
522
218
  // the `& mask` also conveniently gets rid of unwanted trailing bits
523
- return (evenGroups ^ odd) & mask;
219
+ return (oddGroups ^ even) & mask;
524
220
  }
@@ -1,8 +1,8 @@
1
1
  #include "common.h"
2
- #ifdef __ARM_NEON
3
-
4
2
  #include "decoder_common.h"
5
3
 
4
+ #ifdef __ARM_NEON
5
+
6
6
 
7
7
  #if defined(_MSC_VER) && !defined(__clang__)
8
8
  # define vld1_u8_align(p, a) vld1_u8_ex(p, a*8)
@@ -53,6 +53,8 @@ static bool neon_vect_is_nonzero(uint8x16_t v) {
53
53
  }
54
54
 
55
55
 
56
+ namespace RapidYenc {
57
+
56
58
  template<bool isRaw, bool searchEnd>
57
59
  HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned char*& p, unsigned char& escFirst, uint16_t& nextMask) {
58
60
  HEDLEY_ASSUME(escFirst == 0 || escFirst == 1);
@@ -322,8 +324,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
322
324
  // a spec compliant encoder should never generate sequences: ==, =\n and =\r, but we'll handle them to be spec compliant
323
325
  // the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that
324
326
  // firstly, check for invalid sequences of = (we assume that these are rare, as a spec compliant yEnc encoder should not generate these)
325
- if(LIKELIHOOD(0.0001, (mask & ((maskEq << 1) | escFirst)) != 0)) {
326
- maskEq = fix_eqMask<uint32_t>(maskEq & ~escFirst);
327
+ uint32_t maskEqShift1 = (maskEq << 1) | escFirst;
328
+ if(LIKELIHOOD(0.0001, (mask & maskEqShift1) != 0)) {
329
+ maskEq = fix_eqMask<uint32_t>(maskEq, maskEqShift1);
327
330
 
328
331
  unsigned char nextEscFirst = maskEq>>31;
329
332
  // next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed
@@ -445,8 +448,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
445
448
  }
446
449
  }
447
450
  }
451
+ } // namespace
448
452
 
449
- void decoder_set_neon_funcs() {
453
+ void RapidYenc::decoder_set_neon_funcs() {
450
454
  decoder_init_lut(compactLUT);
451
455
  _do_decode = &do_decode_simd<false, false, sizeof(uint8x16_t)*2, do_decode_neon<false, false> >;
452
456
  _do_decode_raw = &do_decode_simd<true, false, sizeof(uint8x16_t)*2, do_decode_neon<true, false> >;
@@ -454,5 +458,5 @@ void decoder_set_neon_funcs() {
454
458
  _decode_isa = ISA_LEVEL_NEON;
455
459
  }
456
460
  #else
457
- void decoder_set_neon_funcs() {}
461
+ void RapidYenc::decoder_set_neon_funcs() {}
458
462
  #endif
@@ -1,7 +1,7 @@
1
1
  #include "common.h"
2
+ #include "decoder_common.h"
2
3
  #if defined(__ARM_NEON) && defined(__aarch64__)
3
4
 
4
- #include "decoder_common.h"
5
5
 
6
6
  #pragma pack(16)
7
7
  static struct { char bytes[16]; } ALIGN_TO(16, compactLUT[32768]);
@@ -44,6 +44,8 @@ static HEDLEY_ALWAYS_INLINE uint8x16_t mergeCompares(uint8x16_t a, uint8x16_t b,
44
44
  }
45
45
 
46
46
 
47
+ namespace RapidYenc {
48
+
47
49
  template<bool isRaw, bool searchEnd>
48
50
  HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned char*& p, unsigned char& escFirst, uint16_t& nextMask) {
49
51
  HEDLEY_ASSUME(escFirst == 0 || escFirst == 1);
@@ -290,8 +292,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
290
292
  // a spec compliant encoder should never generate sequences: ==, =\n and =\r, but we'll handle them to be spec compliant
291
293
  // the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that
292
294
  // firstly, check for invalid sequences of = (we assume that these are rare, as a spec compliant yEnc encoder should not generate these)
293
- if(LIKELIHOOD(0.0001, (mask & ((maskEq << 1) | escFirst)) != 0)) {
294
- maskEq = fix_eqMask<uint64_t>(maskEq & ~(uint64_t)escFirst);
295
+ uint64_t maskEqShift1 = (maskEq << 1) | escFirst;
296
+ if(LIKELIHOOD(0.0001, (mask & maskEqShift1) != 0)) {
297
+ maskEq = fix_eqMask<uint64_t>(maskEq, maskEqShift1);
295
298
 
296
299
  unsigned char nextEscFirst = maskEq>>63;
297
300
  // next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed
@@ -429,8 +432,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
429
432
  }
430
433
  }
431
434
  }
435
+ } // namespace
432
436
 
433
- void decoder_set_neon_funcs() {
437
+ void RapidYenc::decoder_set_neon_funcs() {
434
438
  decoder_init_lut(compactLUT);
435
439
  _do_decode = &do_decode_simd<false, false, sizeof(uint8x16_t)*4, do_decode_neon<false, false> >;
436
440
  _do_decode_raw = &do_decode_simd<true, false, sizeof(uint8x16_t)*4, do_decode_neon<true, false> >;
@@ -438,5 +442,5 @@ void decoder_set_neon_funcs() {
438
442
  _decode_isa = ISA_LEVEL_NEON;
439
443
  }
440
444
  #else
441
- void decoder_set_neon_funcs() {}
445
+ void RapidYenc::decoder_set_neon_funcs() {}
442
446
  #endif
@@ -1,6 +1,6 @@
1
1
  #include "common.h"
2
- #ifdef __riscv_vector
3
2
  #include "decoder_common.h"
3
+ #ifdef __riscv_vector
4
4
 
5
5
 
6
6
  #ifdef __riscv_v_intrinsic
@@ -29,6 +29,17 @@ static inline vbool4_t mask_lshift(vbool4_t m, unsigned shiftIn, size_t vl) {
29
29
  RV_MASK_CAST(4, 8, mvl), RV_MASK_CAST(4, 8, mvr), vl
30
30
  );
31
31
  }
32
+ template<int shift>
33
+ static inline vbool64_t mask_lshift(vbool64_t m, unsigned shiftIn, size_t vl) {
34
+ vuint8m1_t mv = RV_VEC_CAST(64, 8, m);
35
+ vuint8m1_t mvl = RV(vsll_vx_u8m1)(mv, shift, vl/8);
36
+ vuint8m1_t mvr = RV(vsrl_vx_u8m1)(mv, 8-shift, vl/8);
37
+ mvr = RV(vslide1up_vx_u8m1)(mvr, shiftIn, vl/8);
38
+
39
+ return RV(vmor_mm_b64)(
40
+ RV_MASK_CAST(64, 8, mvl), RV_MASK_CAST(64, 8, mvr), vl
41
+ );
42
+ }
32
43
 
33
44
  static inline vuint8m2_t set_first_vu8(vuint8m2_t src, uint8_t item, size_t vl) {
34
45
  #ifdef __riscv_v_intrinsic
@@ -48,6 +59,7 @@ static inline vuint16m2_t set_first_vu16(vuint16m2_t src, uint16_t item, size_t
48
59
  }
49
60
 
50
61
 
62
+ namespace RapidYenc {
51
63
 
52
64
  template<bool isRaw, bool searchEnd>
53
65
  HEDLEY_ALWAYS_INLINE void do_decode_rvv(const uint8_t* src, long& len, unsigned char*& outp, unsigned char& escFirst, uint16_t& nextMask) {
@@ -195,48 +207,41 @@ HEDLEY_ALWAYS_INLINE void do_decode_rvv(const uint8_t* src, long& len, unsigned
195
207
  // the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that
196
208
  // firstly, check for invalid sequences of = (we assume that these are rare, as a spec compliant yEnc encoder should not generate these)
197
209
  if(LIKELIHOOD(0.0001, RV(vcpop_m_b4)(RV(vmandn_mm_b4)(cmpEqShift1, cmp, vl2), vl2) != 0)) {
198
- // note: we assume that uintptr_t corresponds with __riscv_xlen
199
- #if __riscv_xlen == 64
200
- vuint64m1_t cmpEqW = RV_VEC_CAST(4, 64, cmpEq);
201
- #else
202
- vuint32m1_t cmpEqW = RV_VEC_CAST(4, 32, cmpEq);
203
- #endif
204
- size_t nextShiftDown = (vl2 > sizeof(uintptr_t)*8 ? sizeof(uintptr_t)*8 : vl2) - 1;
205
- size_t wvl = (vl2 + sizeof(uintptr_t)*8 -1) / (sizeof(uintptr_t)*8);
206
- for(size_t w=0; w<vl2; w+=sizeof(uintptr_t)*8) {
207
- // extract bottom word
208
- #if __riscv_xlen == 64
209
- uintptr_t maskW = RV(vmv_x_s_u64m1_u64)(cmpEqW);
210
- #else
211
- uintptr_t maskW = RV(vmv_x_s_u32m1_u32)(cmpEqW);
212
- #endif
213
-
214
- // fix it
215
- maskW = fix_eqMask<uintptr_t>(maskW & ~(uintptr_t)escFirst);
216
- uint8_t nextEscFirst = (maskW >> nextShiftDown) & 1;
217
-
218
- // shift it up (will be used for cmpEqShift1)
219
- maskW = (maskW<<1) | escFirst; // TODO: should this be done using mask_lshift<1> instead?
220
- escFirst = nextEscFirst;
221
-
222
- // slide the new value in from the top
223
- #if __riscv_xlen == 64
224
- cmpEqW = RV(vslide1down_vx_u64m1)(cmpEqW, maskW, wvl);
225
- #else
226
- cmpEqW = RV(vslide1down_vx_u32m1)(cmpEqW, maskW, wvl);
227
- #endif
210
+ // replicate fix_eqMask, but in vector form
211
+ vbool4_t groupStart = RV(vmandn_mm_b4)(cmpEq, cmpEqShift1, vl2);
212
+ vbool4_t evenBits = RV_MASK_CAST(4, 8, RV(vmv_v_x_u8m1)(0x55, vl2));
213
+ vbool4_t evenStart = RV(vmand_mm_b4)(groupStart, evenBits, vl2);
214
+
215
+ // compute `cmpEq + evenStart` to obtain oddGroups
216
+ vbool4_t oddGroups;
217
+ vuint64m1_t cmpEq64 = RV_VEC_CAST(4, 64, cmpEq);
218
+ vuint64m1_t evenStart64 = RV_VEC_CAST(4, 64, evenStart);
219
+ vuint64m1_t oddGroups64;
220
+ if(vl2 <= 64) {
221
+ // no loop needed - single 64b add will work
222
+ oddGroups64 = RV(vadd_vv_u64m1)(cmpEq64, evenStart64, 1);
223
+ } else {
224
+ // need to loop whilst the add causes a carry
225
+ unsigned vl64 = vl2/64;
226
+ vbool64_t carry = RV(vmadc_vv_u64m1_b64)(cmpEq64, evenStart64, vl64);
227
+ carry = mask_lshift<1>(carry, 0, vl64);
228
+ oddGroups64 = RV(vadd_vv_u64m1)(cmpEq64, evenStart64, 1);
229
+ while(RV(vcpop_m_b64)(carry, vl64)) {
230
+ vbool64_t nextCarry = RV(vmadc_vx_u64m1_b64)(oddGroups64, 1, vl64);
231
+ oddGroups64 = RV(vadd_vx_u64m1_mu)(carry, oddGroups64, oddGroups64, 1, vl64);
232
+ carry = mask_lshift<1>(nextCarry, 0, vl64);
233
+ }
228
234
  }
229
- #if __riscv_xlen == 64
230
- cmpEqShift1 = RV_MASK_CAST(4, 64, cmpEqW);
231
- #else
232
- cmpEqShift1 = RV_MASK_CAST(4, 32, cmpEqW);
233
- #endif
235
+ oddGroups = RV_MASK_CAST(4, 64, oddGroups64);
236
+
237
+ cmpEq = RV(vmand_mm_b4)(RV(vmxor_mm_b4)(oddGroups, evenBits, vl2), cmpEq, vl2);
238
+
239
+ cmpEqShift1 = mask_lshift<1>(cmpEq, escFirst, vl2);
234
240
  cmp = RV(vmor_mm_b4)(cmpEqShift1, cmp, vl2); // ~(~cmp & ~cmpEqShift1)
235
241
  numOutputChars = RV(vcpop_m_b4)(cmp, vl2);
236
- } else {
237
- // no invalid = sequences found - don't need to fix up cmpEq
238
- escFirst = RV(vcpop_m_b4)(RV(vmand_mm_b4)(cmpEq, lastBit, vl2), vl2);
239
242
  }
243
+ escFirst = RV(vcpop_m_b4)(RV(vmand_mm_b4)(cmpEq, lastBit, vl2), vl2);
244
+
240
245
  data = RV(vsub_vv_u8m2)(data, RV_vmerge_vxm_u8m2(yencOffset, 64+42, cmpEqShift1, vl2), vl2);
241
246
  yencOffset = set_first_vu8(yencOffset, 42 | (escFirst<<6), vl2);
242
247
 
@@ -262,13 +267,14 @@ HEDLEY_ALWAYS_INLINE void do_decode_rvv(const uint8_t* src, long& len, unsigned
262
267
  size_t decoder_rvv_width() {
263
268
  return RV(vsetvlmax_e8m2)();
264
269
  }
270
+ } // namespace
265
271
 
266
- void decoder_set_rvv_funcs() {
272
+ void RapidYenc::decoder_set_rvv_funcs() {
267
273
  _do_decode = &do_decode_simd<false, false, decoder_rvv_width, do_decode_rvv<false, false> >;
268
274
  _do_decode_raw = &do_decode_simd<true, false, decoder_rvv_width, do_decode_rvv<true, false> >;
269
275
  _do_decode_end_raw = &do_decode_simd<true, true, decoder_rvv_width, do_decode_rvv<true, true> >;
270
276
  _decode_isa = ISA_LEVEL_RVV;
271
277
  }
272
278
  #else
273
- void decoder_set_rvv_funcs() {}
279
+ void RapidYenc::decoder_set_rvv_funcs() {}
274
280
  #endif
@@ -1,10 +1,10 @@
1
1
  #include "common.h"
2
2
 
3
- #ifdef __SSE2__
4
3
  #include "decoder_common.h"
4
+ #ifdef __SSE2__
5
5
  #include "decoder_sse_base.h"
6
6
 
7
- void decoder_sse_init(SSELookups* HEDLEY_RESTRICT& lookups) {
7
+ void RapidYenc::decoder_sse_init(RapidYenc::SSELookups* HEDLEY_RESTRICT& lookups) {
8
8
  ALIGN_ALLOC(lookups, sizeof(SSELookups), 16);
9
9
  for(int i=0; i<256; i++) {
10
10
  lookups->BitsSetTable256inv[i] = 8 - (
@@ -25,7 +25,7 @@ void decoder_sse_init(SSELookups* HEDLEY_RESTRICT& lookups) {
25
25
  }
26
26
  }
27
27
 
28
- void decoder_set_sse2_funcs() {
28
+ void RapidYenc::decoder_set_sse2_funcs() {
29
29
  decoder_sse_init(lookups);
30
30
  decoder_init_lut(lookups->compact);
31
31
  _do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_SSE2> >;
@@ -34,5 +34,5 @@ void decoder_set_sse2_funcs() {
34
34
  _decode_isa = ISA_LEVEL_SSE2;
35
35
  }
36
36
  #else
37
- void decoder_set_sse2_funcs() {}
37
+ void RapidYenc::decoder_set_sse2_funcs() {}
38
38
  #endif