yencode 1.1.5 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,251 @@
+ #include "crc_common.h"
+
+ #if defined(__riscv) && defined(__GNUC__) && (defined(__riscv_zbkc) || defined(__riscv_zbc))
+
+ #if __has_include(<riscv_bitmanip.h>)
+ # include <riscv_bitmanip.h>
+ # if __riscv_xlen == 64
+ # define rv_clmul __riscv_clmul_64
+ # define rv_clmulh __riscv_clmulh_64
+ # else
+ # define rv_clmul __riscv_clmul_32
+ # define rv_clmulh __riscv_clmulh_32
+ # endif
+ #else
+ static HEDLEY_ALWAYS_INLINE uintptr_t rv_clmul(uintptr_t x, uintptr_t y) {
+ uintptr_t r;
+ __asm__("clmul %0, %1, %2\n"
+ : "=r"(r)
+ : "r"(x), "r"(y)
+ :);
+ return r;
+ }
+ static HEDLEY_ALWAYS_INLINE uintptr_t rv_clmulh(uintptr_t x, uintptr_t y) {
+ uintptr_t r;
+ __asm__("clmulh %0, %1, %2\n"
+ : "=r"(r)
+ : "r"(x), "r"(y)
+ :);
+ return r;
+ }
+ #endif
+
+ // TODO: test big-endian
+ #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+ # if __riscv_xlen == 64
+ # define SWAP __builtin_bswap64
+ # else
+ # define SWAP __builtin_bswap32
+ # endif
+ #else
+ # define SWAP(d) (d)
+ #endif
+ static HEDLEY_ALWAYS_INLINE uintptr_t read_partial(const void* p, unsigned sz) {
+ uintptr_t data = 0;
+ memcpy(&data, p, sz);
+ return SWAP(data);
+ }
+ static HEDLEY_ALWAYS_INLINE uintptr_t read_full(const uintptr_t* p) {
+ return SWAP(*p);
+ }
+ #undef SWAP
+
+ static uint32_t rv_crc_calc(uint32_t crc, const unsigned char *src, long len) {
+ uintptr_t accum[4] = {};
+
+ // note: constants here are bit-reflected and shifted left by 1
+ // Zbc does also have clmulr to avoid the shift, but:
+ // - there's no clmulhr, so for XLEN=64, just shift the constant instead to get the same result
+ // - it's unavailable in Zbkc
+ // - for XLEN=32, 2x constants is likely worth it to avoid the additional XORs in the loop
+
+ #if __riscv_xlen == 64
+ const uint64_t MUL_HI = 0x15a546366 /*2^224*/, MUL_LO = 0xf1da05aa /*2^288*/;
+ #define CLMULL rv_clmul
+ #define CLMULH rv_clmulh
+
+ accum[3] = rv_clmul(crc, 0xb66b1fa6); // 2^-32
+ #elif __riscv_xlen == 32
+ const uint64_t MUL_HI = 0x140d44a2e /*2^128*/, MUL_LO = 0x1751997d0 /*2^160*/;
+ #define CLMULL(x, k) rv_clmul(x, k & 0xffffffff)
+ #define CLMULH(x, k) (rv_clmulh(x, k & 0xffffffff) ^ (k > 0xffffffffULL ? (x) : 0))
+
+ accum[2] = rv_clmul(crc, 0xb66b1fa6);
+ accum[3] = rv_clmulh(crc, 0xb66b1fa6);
+ #else
+ #error "Unknown __riscv_xlen"
+ #endif
+ const size_t WS = sizeof(uintptr_t);
+
+ // if src isn't word-aligned, process until it is so
+ long initial_alignment = ((uintptr_t)src & (WS-1));
+ long initial_process = WS - initial_alignment;
+ if(initial_alignment && len >= initial_process) {
+ unsigned shl = initial_alignment * 8, shr = initial_process * 8;
+ #if __riscv_xlen == 64
+ accum[2] = accum[3] << shl;
+ #else
+ accum[1] = accum[2] << shl;
+ accum[2] = (accum[3] << shl) | (accum[2] >> shr);
+ #endif
+ accum[3] = (read_partial(src, initial_process) << shl) | (accum[3] >> shr);
+ src += initial_process;
+ len -= initial_process;
+ }
+
+ // main processing loop
+ const uintptr_t* srcW = (const uintptr_t*)src;
+ while((len -= WS*4) >= 0) {
+ uintptr_t tmpHi, tmpLo;
+ tmpLo = CLMULL(accum[0], MUL_LO) ^ CLMULL(accum[1], MUL_HI);
+ tmpHi = CLMULH(accum[0], MUL_LO) ^ CLMULH(accum[1], MUL_HI);
+ accum[0] = tmpLo ^ read_full(srcW++);
+ accum[1] = tmpHi ^ read_full(srcW++);
+
+ tmpLo = CLMULL(accum[2], MUL_LO) ^ CLMULL(accum[3], MUL_HI);
+ tmpHi = CLMULH(accum[2], MUL_LO) ^ CLMULH(accum[3], MUL_HI);
+ accum[2] = tmpLo ^ read_full(srcW++);
+ accum[3] = tmpHi ^ read_full(srcW++);
+ }
+
+ // process trailing bytes
+ if(len & (WS*2)) {
+ uintptr_t tmpLo = CLMULL(accum[0], MUL_LO) ^ CLMULL(accum[1], MUL_HI);
+ uintptr_t tmpHi = CLMULH(accum[0], MUL_LO) ^ CLMULH(accum[1], MUL_HI);
+ accum[0] = accum[2];
+ accum[1] = accum[3];
+ accum[2] = tmpLo ^ read_full(srcW++);
+ accum[3] = tmpHi ^ read_full(srcW++);
+ }
+ if(len & WS) {
+ uintptr_t tmpLo = CLMULL(accum[0], MUL_HI);
+ uintptr_t tmpHi = CLMULH(accum[0], MUL_HI);
+ accum[0] = accum[1];
+ accum[1] = accum[2];
+ accum[2] = accum[3] ^ tmpLo;
+ accum[3] = tmpHi ^ read_full(srcW++);
+ }
+
+ size_t tail = len & (WS-1);
+ if(tail) {
+ unsigned shl = ((WS - tail) * 8), shr = tail * 8;
+ uintptr_t tmp = accum[0] << shl;
+ uintptr_t tmpLo = CLMULL(tmp, MUL_HI);
+ uintptr_t tmpHi = CLMULH(tmp, MUL_HI);
+ accum[0] = (accum[0] >> shr) | (accum[1] << shl);
+ accum[1] = (accum[1] >> shr) | (accum[2] << shl);
+ accum[2] = (accum[2] >> shr) | (accum[3] << shl);
+ accum[3] = (accum[3] >> shr) | (read_partial(srcW, tail) << shl);
+ accum[2] ^= tmpLo;
+ accum[3] ^= tmpHi;
+ }
+
+
+ // done processing: fold everything down
+ #if __riscv_xlen == 64
+ // fold 0,1 -> 2,3
+ accum[2] ^= rv_clmul(accum[0], 0x1751997d0) ^ rv_clmul(accum[1], 0xccaa009e);
+ accum[3] ^= rv_clmulh(accum[0], 0x1751997d0) ^ rv_clmulh(accum[1], 0xccaa009e);
+
+ // fold 2->3
+ accum[0] = rv_clmulh(accum[2], 0xccaa009e);
+ accum[3] ^= rv_clmul(accum[2], 0xccaa009e);
+
+ // fold 64b->32b
+ accum[1] = rv_clmul(accum[3] & 0xffffffff, 0x163cd6124);
+ accum[0] ^= accum[1] >> 32;
+ accum[3] = accum[1] ^ (accum[3] >> 32);
+ accum[3] <<= 32;
+ #else
+ // fold 0,1 -> 2,3
+ accum[2] ^= rv_clmul(accum[0], 0xccaa009e) ^ CLMULL(accum[1], 0x163cd6124);
+ accum[3] ^= rv_clmulh(accum[0], 0xccaa009e) ^ CLMULH(accum[1], 0x163cd6124);
+
+ // fold 2->3
+ accum[0] = CLMULH(accum[2], 0x163cd6124);
+ accum[3] ^= CLMULL(accum[2], 0x163cd6124);
+ #endif
+
+ // reduction
+ accum[3] = CLMULL(accum[3], 0xf7011641);
+ accum[3] = CLMULH(accum[3], 0x1db710640); // maybe consider clmulr for XLEN=32
+ crc = accum[0] ^ accum[3];
+ return crc;
+ #undef CLMULL
+ #undef CLMULH
+ }
+
+ static uint32_t do_crc32_incremental_rv_zbc(const void* data, size_t length, uint32_t init) {
+ return ~rv_crc_calc(~init, (const unsigned char*)data, (long)length);
+ }
+
+
+ #if __riscv_xlen == 64
+ // note that prod is shifted by 1 place to the right, due to bit-reflection
+ static uint32_t crc32_reduce_rv_zbc(uint64_t prod) {
+ uint64_t t = rv_clmul(prod << 33, 0xf7011641);
+ t = rv_clmulh(t, 0x1db710640);
+ t ^= prod >> 31;
+ return t;
+ }
+ #endif
+ uint32_t crc32_multiply_rv_zbc(uint32_t a, uint32_t b) {
+ #if __riscv_xlen == 64
+ uint64_t t = crc32_reduce_rv_zbc(rv_clmul(a, b));
+ #else
+ uint32_t prodLo = rv_clmul(a, b);
+ uint32_t prodHi = rv_clmulh(a, b);
+
+ // fix prodHi for bit-reflection (clmulr would be ideal here)
+ prodHi += prodHi;
+ prodHi |= prodLo >> 31;
+ prodLo += prodLo;
+
+ uint32_t t = rv_clmul(prodLo, 0xf7011641);
+ t ^= rv_clmulh(t, 0xdb710640);
+ t ^= prodHi;
+ #endif
+ return t;
+ }
+
+ #if defined(__GNUC__) || defined(_MSC_VER)
+ uint32_t crc32_shift_rv_zbc(uint32_t crc1, uint32_t n) {
+ // TODO: require Zbb for ctz
+ uint32_t result = crc1;
+ #if __riscv_xlen == 64
+ // for n<32, can shift directly
+ uint64_t prod = result;
+ prod <<= 31 ^ (n&31);
+ n &= ~31;
+ result = crc32_reduce_rv_zbc(prod);
+ #endif
+ if(!n) return result;
+
+ uint32_t result2 = crc_power[ctz32(n)];
+ n &= n-1;
+
+ while(n) {
+ result = crc32_multiply_rv_zbc(result, crc_power[ctz32(n)]);
+ n &= n-1;
+
+ if(n) {
+ result2 = crc32_multiply_rv_zbc(result2, crc_power[ctz32(n)]);
+ n &= n-1;
+ }
+ }
+ return crc32_multiply_rv_zbc(result, result2);
+ }
+ #endif
+
+
+ void crc_riscv_set_funcs() {
+ _do_crc32_incremental = &do_crc32_incremental_rv_zbc;
+ _crc32_multiply = &crc32_multiply_rv_zbc;
+ #if defined(__GNUC__) || defined(_MSC_VER)
+ _crc32_shift = &crc32_shift_rv_zbc;
+ #endif
+ _crc32_isa = ISA_FEATURE_ZBC;
+ }
+ #else
+ void crc_riscv_set_funcs() {}
+ #endif
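
For readers checking the folding math above off-target: the Zbc clmul/clmulh pair returns the low and high halves of a 2×XLEN-bit carry-less (GF(2)) product. A minimal portable model for XLEN=64, an editorial sketch with illustrative names, not part of the released file:

#include <stdint.h>

// Software stand-ins for the Zbc instructions, usable to sanity-check the
// folding constants on non-RISC-V hosts.
static uint64_t model_clmul(uint64_t x, uint64_t y) {
    uint64_t lo = 0;
    for (int i = 0; i < 64; i++)
        if ((y >> i) & 1)
            lo ^= x << i;        // GF(2) multiply: XOR partial products (low half)
    return lo;
}
static uint64_t model_clmulh(uint64_t x, uint64_t y) {
    uint64_t hi = 0;
    for (int i = 1; i < 64; i++)
        if ((y >> i) & 1)
            hi ^= x >> (64 - i); // high half of the 128-bit carry-less product
    return hi;
}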
package/src/decoder.cc CHANGED
@@ -18,6 +18,7 @@ void decoder_set_avx2_funcs();
 void decoder_set_vbmi2_funcs();
 extern const bool decoder_has_avx10;
 void decoder_set_neon_funcs();
+ void decoder_set_rvv_funcs();


 #if defined(PLATFORM_X86) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
@@ -25,7 +26,7 @@ void decoder_set_neon_funcs();
 # include "decoder_avx2_base.h"
 static inline void decoder_set_native_funcs() {
 ALIGN_ALLOC(lookups, sizeof(*lookups), 16);
- decoder_init_lut(lookups->eqFix, lookups->compact);
+ decoder_init_lut(lookups->compact);
 _do_decode = &do_decode_simd<false, false, sizeof(__m256i)*2, do_decode_avx2<false, false, ISA_NATIVE> >;
 _do_decode_raw = &do_decode_simd<true, false, sizeof(__m256i)*2, do_decode_avx2<true, false, ISA_NATIVE> >;
 _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m256i)*2, do_decode_avx2<true, true, ISA_NATIVE> >;
@@ -34,8 +35,8 @@ static inline void decoder_set_native_funcs() {
 # else
 # include "decoder_sse_base.h"
 static inline void decoder_set_native_funcs() {
- decoder_sse_init();
- decoder_init_lut(lookups->eqFix, lookups->compact);
+ decoder_sse_init(lookups);
+ decoder_init_lut(lookups->compact);
 _do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_NATIVE> >;
 _do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_NATIVE> >;
 _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_NATIVE> >;
@@ -44,6 +45,31 @@ static inline void decoder_set_native_funcs() {
 # endif
 #endif

+
+ #if defined(PLATFORM_X86) || defined(PLATFORM_ARM)
+ void decoder_init_lut(void* compactLUT) {
+ #ifdef YENC_DEC_USE_THINTABLE
+ const int tableSize = 8;
+ #else
+ const int tableSize = 16;
+ #endif
+ for(int i=0; i<(tableSize==8?256:32768); i++) {
+ int k = i;
+ uint8_t* res = (uint8_t*)compactLUT + i*tableSize;
+ int p = 0;
+ for(int j=0; j<tableSize; j++) {
+ if(!(k & 1)) {
+ res[p++] = j;
+ }
+ k >>= 1;
+ }
+ for(; p<tableSize; p++)
+ res[p] = 0x80;
+ }
+ }
+ #endif
+
+
 void decoder_init() {
 #ifdef PLATFORM_X86
 # if defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
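
The consolidated decoder_init_lut above maps a bitmask of bytes-to-discard to a pshufb/tbl-style shuffle index list, padded with 0x80 so discarded lanes read as zero in the shuffle. A standalone sketch of one 16-wide entry, with an assumed example mask, for illustration only:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    int mask = 0x24; // hypothetical mask: discard bytes 2 and 5
    uint8_t res[16];
    int p = 0;
    for (int j = 0; j < 16; j++)
        if (!((mask >> j) & 1))
            res[p++] = (uint8_t)j; // keep this byte: emit its index
    for (; p < 16; p++)
        res[p] = 0x80;             // pad: 0x80 zeroes the lane in pshufb/tbl
    for (int j = 0; j < 16; j++)
        printf("%02x ", res[j]);   // prints: 00 01 03 04 06 07 08 ... 0f 80 80
    printf("\n");
    return 0;
}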
@@ -66,4 +92,8 @@ void decoder_init() {
 if(cpu_supports_neon())
 decoder_set_neon_funcs();
 #endif
+ #ifdef __riscv
+ if(cpu_supports_rvv())
+ decoder_set_rvv_funcs();
+ #endif
 }
@@ -4,8 +4,8 @@
 #include "decoder_common.h"
 #include "decoder_sse_base.h"
 void decoder_set_avx_funcs() {
- decoder_sse_init();
- decoder_init_lut(lookups->eqFix, lookups->compact);
+ decoder_sse_init(lookups);
+ decoder_init_lut(lookups->compact);
 _do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_SSE4_POPCNT> >;
 _do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_SSE4_POPCNT> >;
 _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_SSE4_POPCNT> >;
@@ -5,7 +5,7 @@
 #include "decoder_avx2_base.h"
 void decoder_set_avx2_funcs() {
 ALIGN_ALLOC(lookups, sizeof(*lookups), 16);
- decoder_init_lut(lookups->eqFix, lookups->compact);
+ decoder_init_lut(lookups->compact);
 _do_decode = &do_decode_simd<false, false, sizeof(__m256i)*2, do_decode_avx2<false, false, ISA_LEVEL_AVX2> >;
 _do_decode_raw = &do_decode_simd<true, false, sizeof(__m256i)*2, do_decode_avx2<true, false, ISA_LEVEL_AVX2> >;
 _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m256i)*2, do_decode_avx2<true, true, ISA_LEVEL_AVX2> >;
@@ -15,7 +15,6 @@
 #pragma pack(16)
 static struct {
 /*align16*/ struct { char bytes[16]; } compact[32768];
- uint8_t eqFix[256];
 } * HEDLEY_RESTRICT lookups;
 #pragma pack()

@@ -431,16 +430,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* src, long& len, unsigned
 dataB = _mm256_add_epi8(oDataB, _mm256_set1_epi8(-42));

 if(LIKELIHOOD(0.0001, (mask & ((maskEq << 1) + escFirst)) != 0)) {
- unsigned tmp = lookups->eqFix[(maskEq&0xff) & ~(uint64_t)escFirst];
- uint64_t maskEq2 = tmp;
- for(int j=8; j<64; j+=8) {
- tmp = lookups->eqFix[(unsigned)((maskEq>>j)&0xff) & ~(tmp>>7)];
- maskEq2 |= (uint64_t)tmp<<j;
- }
- maskEq = maskEq2;
-
+ maskEq = fix_eqMask<uint64_t>(maskEq & ~(uint64_t)escFirst);
 mask &= ~(uint64_t)escFirst;
- escFirst = tmp>>7;
+ escFirst = maskEq>>63;
 // next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed
 maskEq <<= 1;
 mask &= ~maskEq;
@@ -1,5 +1,9 @@
 #include "decoder.h"

+ #if defined(PLATFORM_ARM) && !defined(__aarch64__)
+ #define YENC_DEC_USE_THINTABLE 1
+ #endif
+
 // TODO: need to support max output length somehow
 // TODO: add branch probabilities

@@ -178,24 +182,24 @@ YencDecoderEnd do_decode_end_scalar(const unsigned char** src, unsigned char** d
 if(es[i] == '.' && isRaw) {
 i++;
 YDEC_CHECK_END(YDEC_STATE_CRLFDT)
- // fall-through
 } else if(es[i] == '=') {
 i++;
 YDEC_CHECK_END(YDEC_STATE_CRLFEQ)
 goto do_decode_endable_scalar_ceq;
 } else
 break;
+ // fall-through
 case YDEC_STATE_CRLFDT:
 if(isRaw && es[i] == '\r') {
 i++;
 YDEC_CHECK_END(YDEC_STATE_CRLFDTCR)
- // fall-through
 } else if(isRaw && es[i] == '=') { // check for dot-stuffed ending: \r\n.=y
 i++;
 YDEC_CHECK_END(YDEC_STATE_CRLFEQ)
 goto do_decode_endable_scalar_ceq;
 } else
 break;
+ // fall-through
 case YDEC_STATE_CRLFDTCR:
 if(es[i] == '\n') {
 if(isRaw) {
@@ -331,8 +335,8 @@ YencDecoderEnd do_decode_scalar(const unsigned char** src, unsigned char** dest,



- template<bool isRaw, bool searchEnd, int width, void(&kernel)(const uint8_t*, long&, unsigned char*&, unsigned char&, uint16_t&)>
- YencDecoderEnd do_decode_simd(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
+ template<bool isRaw, bool searchEnd, void(&kernel)(const uint8_t*, long&, unsigned char*&, unsigned char&, uint16_t&)>
+ inline YencDecoderEnd _do_decode_simd(size_t width, const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
 if(len <= width*2) return do_decode_scalar<isRaw, searchEnd>(src, dest, len, state);

 YencDecoderState tState = YDEC_STATE_CRLF;
@@ -461,54 +465,20 @@ YencDecoderEnd do_decode_simd(const unsigned char** src, unsigned char** dest, s
 return YDEC_END_NONE;
 }

- static inline void decoder_init_lut(uint8_t* eqFixLUT, void* compactLUT) {
- for(int i=0; i<256; i++) {
- int k = i;
- int p = 0;
-
- // fix LUT
- k = i;
- p = 0;
- for(int j=0; j<8; j++) {
- k = i >> j;
- if(k & 1) {
- p |= 1 << j;
- j++;
- }
- }
- eqFixLUT[i] = p;
-
- #ifdef YENC_DEC_USE_THINTABLE
- uint8_t* res = (uint8_t*)compactLUT + i*8;
- k = i;
- p = 0;
- for(int j=0; j<8; j++) {
- if(!(k & 1)) {
- res[p++] = j;
- }
- k >>= 1;
- }
- for(; p<8; p++)
- res[p] = 0x80;
- #endif
- }
- #ifndef YENC_DEC_USE_THINTABLE
- for(int i=0; i<32768; i++) {
- int k = i;
- uint8_t* res = (uint8_t*)compactLUT + i*16;
- int p = 0;
-
- for(int j=0; j<16; j++) {
- if(!(k & 1)) {
- res[p++] = j;
- }
- k >>= 1;
- }
- for(; p<16; p++)
- res[p] = 0x80;
- }
- #endif
+ template<bool isRaw, bool searchEnd, size_t width, void(&kernel)(const uint8_t*, long&, unsigned char*&, unsigned char&, uint16_t&)>
+ YencDecoderEnd do_decode_simd(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
+ return _do_decode_simd<isRaw, searchEnd, kernel>(width, src, dest, len, state);
 }
+ template<bool isRaw, bool searchEnd, size_t(&getWidth)(), void(&kernel)(const uint8_t*, long&, unsigned char*&, unsigned char&, uint16_t&)>
+ YencDecoderEnd do_decode_simd(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
+ return _do_decode_simd<isRaw, searchEnd, kernel>(getWidth(), src, dest, len, state);
+ }
+
+
+ #if defined(PLATFORM_X86) || defined(PLATFORM_ARM)
+ void decoder_init_lut(void* compactLUT);
+ #endif
+
 template<bool isRaw>
 static inline void decoder_set_nextMask(const uint8_t* src, size_t len, uint16_t& nextMask) {
 if(isRaw) {
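
The split into a fixed-width overload and a getWidth() overload lets kernels whose vector width is only known at runtime (RVV, where VLEN is implementation-defined) share one dispatch path with the fixed-width SSE/AVX2/NEON kernels. A minimal model of the two overloads, with simplified signatures and an illustrative width probe, not the library's actual API:

#include <cstddef>
#include <cstdio>

// Fixed width: the value is baked in at compile time, as with SSE/AVX2/NEON.
template<size_t width>
void decode_fixed() { std::printf("fixed width: %zu\n", width); }

// Runtime width: a function reference is the template parameter, and the
// width is queried when the decoder is invoked, as an RVV kernel would need.
template<size_t(&getWidth)()>
void decode_runtime() { std::printf("runtime width: %zu\n", getWidth()); }

static size_t rvv_width() { return 32; } // hypothetical stand-in for a VLEN probe

int main() {
    decode_fixed<32>();          // compile-time constant, e.g. sizeof(__m128i)*2
    decode_runtime<rvv_width>(); // resolved at call time
}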
@@ -535,3 +505,20 @@ static inline uint16_t decoder_set_nextMask(const uint8_t* src, unsigned mask) {
 }
 return 0;
 }
+
+ // resolve invalid sequences of = to deal with cases like '===='
+ // bit hack inspired from simdjson: https://youtu.be/wlvKAT7SZIQ?t=33m38s
+ template<typename T>
+ static inline T fix_eqMask(T mask) {
+ // isolate the start of each consecutive bit group (e.g. 01011101 -> 01000101)
+ T start = mask & ~(mask << 1);
+
+ const T odd = (T)0xaaaaaaaaaaaaaaaa; // every odd bit (10101010...)
+
+ // obtain groups which start on an even bit (clear groups that start on an odd bit, but this leaves an unwanted trailing bit)
+ T evenGroups = mask + (start & odd);
+
+ // clear odd bits in even groups, whilst conversely preserving odd bits in odd groups
+ // the `& mask` also conveniently gets rid of unwanted trailing bits
+ return (evenGroups ^ odd) & mask;
+ }
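
A quick way to convince yourself of the bit hack: in a run of '=' characters, only every other one (starting from the first in the run) is a real escape. A standalone check, an editorial test scaffold that duplicates fix_eqMask for self-containment:

#include <cassert>
#include <cstdint>

// Copy of fix_eqMask from the hunk above, so this compiles on its own.
template<typename T>
static inline T fix_eqMask(T mask) {
    T start = mask & ~(mask << 1);
    const T odd = (T)0xaaaaaaaaaaaaaaaa;
    T evenGroups = mask + (start & odd);
    return (evenGroups ^ odd) & mask;
}

int main() {
    // "====" -> bits 0..3 set; only bits 0 and 2 are true escape characters
    assert(fix_eqMask<uint32_t>(0x0Fu) == 0x05u);
    // "a==b" -> bits 1,2 set; the '=' at bit 1 escapes the one at bit 2
    assert(fix_eqMask<uint32_t>(0x06u) == 0x02u);
    return 0;
}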
@@ -1,9 +1,6 @@
 #include "common.h"
 #ifdef __ARM_NEON

- #ifndef __aarch64__
- #define YENC_DEC_USE_THINTABLE 1
- #endif
 #include "decoder_common.h"


@@ -43,8 +40,6 @@ static struct { char bytes[16]; } ALIGN_TO(16, compactLUT[32768]);
 # pragma pack()
 #endif

- static uint8_t eqFixLUT[256];
-


 static bool neon_vect_is_nonzero(uint8x16_t v) {
@@ -328,18 +323,13 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
 // the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that
 // firstly, check for invalid sequences of = (we assume that these are rare, as a spec compliant yEnc encoder should not generate these)
 if(LIKELIHOOD(0.0001, (mask & ((maskEq << 1) | escFirst)) != 0)) {
- uint8_t tmp = eqFixLUT[(maskEq&0xff) & ~escFirst];
- uint32_t maskEq2 = tmp;
- for(int j=8; j<32; j+=8) {
- tmp = eqFixLUT[((maskEq>>j)&0xff) & ~(tmp>>7)];
- maskEq2 |= tmp<<j;
- }
- maskEq = maskEq2;
+ maskEq = fix_eqMask<uint32_t>(maskEq & ~escFirst);

+ unsigned char nextEscFirst = maskEq>>31;
 // next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed
 maskEq = (maskEq<<1) | escFirst;
 mask &= ~maskEq;
- escFirst = tmp>>7;
+ escFirst = nextEscFirst;

 // unescape chars following `=`
 uint8x8_t maskEqTemp = vreinterpret_u8_u32(vmov_n_u32(maskEq));
@@ -457,7 +447,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
 }

 void decoder_set_neon_funcs() {
- decoder_init_lut(eqFixLUT, compactLUT);
+ decoder_init_lut(compactLUT);
 _do_decode = &do_decode_simd<false, false, sizeof(uint8x16_t)*2, do_decode_neon<false, false> >;
 _do_decode_raw = &do_decode_simd<true, false, sizeof(uint8x16_t)*2, do_decode_neon<true, false> >;
 _do_decode_end_raw = &do_decode_simd<true, true, sizeof(uint8x16_t)*2, do_decode_neon<true, true> >;
@@ -7,8 +7,6 @@
 static struct { char bytes[16]; } ALIGN_TO(16, compactLUT[32768]);
 #pragma pack()

- static uint8_t eqFixLUT[256];
-

 // AArch64 GCC lacks these functions until 8.5, 9.4 and 10.1 (10.0 unknown)
 #if !defined(__clang__) && !defined(_MSC_VER) && (!defined(__aarch64__) || !(HEDLEY_GCC_VERSION_CHECK(9,4,0) || (!HEDLEY_GCC_VERSION_CHECK(9,0,0) && HEDLEY_GCC_VERSION_CHECK(8,5,0))))
@@ -293,18 +291,13 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
 // the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that
 // firstly, check for invalid sequences of = (we assume that these are rare, as a spec compliant yEnc encoder should not generate these)
 if(LIKELIHOOD(0.0001, (mask & ((maskEq << 1) | escFirst)) != 0)) {
- uint8_t tmp = eqFixLUT[(maskEq&0xff) & ~escFirst];
- uint64_t maskEq2 = tmp;
- for(int j=8; j<64; j+=8) {
- tmp = eqFixLUT[((maskEq>>j)&0xff) & ~(tmp>>7)];
- maskEq2 |= ((uint64_t)tmp)<<j;
- }
- maskEq = maskEq2;
+ maskEq = fix_eqMask<uint64_t>(maskEq & ~(uint64_t)escFirst);

+ unsigned char nextEscFirst = maskEq>>63;
 // next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed
 maskEq = (maskEq<<1) | escFirst;
 mask &= ~maskEq;
- escFirst = tmp>>7;
+ escFirst = nextEscFirst;

 // unescape chars following `=`
 #if defined(__GNUC__) && !defined(__clang__)
@@ -438,7 +431,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
 }

 void decoder_set_neon_funcs() {
- decoder_init_lut(eqFixLUT, compactLUT);
+ decoder_init_lut(compactLUT);
 _do_decode = &do_decode_simd<false, false, sizeof(uint8x16_t)*4, do_decode_neon<false, false> >;
 _do_decode_raw = &do_decode_simd<true, false, sizeof(uint8x16_t)*4, do_decode_neon<true, false> >;
 _do_decode_end_raw = &do_decode_simd<true, true, sizeof(uint8x16_t)*4, do_decode_neon<true, true> >;