yencode 1.1.4 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,8 @@
  // 256-bit version of crc_folding

  #include "crc_common.h"
+
+ void crc_clmul_set_funcs();

  #if !defined(YENC_DISABLE_AVX256) && ((defined(__VPCLMULQDQ__) && defined(__AVX2__) && defined(__PCLMUL__)) || (defined(_MSC_VER) && _MSC_VER >= 1920 && defined(PLATFORM_X86) && !defined(__clang__)))
  #include <inttypes.h>
@@ -99,19 +101,12 @@ ALIGN_TO(16, static const unsigned crc_k[]) = {


  static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
- // info from https://www.reddit.com/r/ReverseEngineering/comments/2zwhl3/mystery_constant_0x9db42487_in_intels_crc32ieee/
- // firstly, calculate: xmm_crc0 = (intial * 0x487b9c8a) mod 0x104c11db7, where 0x487b9c8a = inverse(1<<512) mod 0x104c11db7
- __m128i xmm_t0 = _mm_cvtsi32_si128(~initial);
-
- xmm_t0 = _mm_clmulepi64_si128(xmm_t0, _mm_set_epi32(0, 0, 0xa273bc24, 0), 0); // reverse(0x487b9c8a)<<1 == 0xa273bc24
- __m128i reduction = _mm_set_epi32( // polynomial reduction factors
- 1, 0xdb710640, // G* = 0x04c11db7
- 0, 0xf7011641 // Q+ = 0x04d101df (+1 to save an additional xor operation)
+ __m128i xmm_t0 = _mm_clmulepi64_si128(
+ _mm_cvtsi32_si128(~initial),
+ _mm_cvtsi32_si128(0xdfded7ec),
+ 0
  );
- __m128i xmm_t1 = _mm_clmulepi64_si128(xmm_t0, reduction, 0);
- xmm_t1 = _mm_clmulepi64_si128(xmm_t1, reduction, 0x10);

- xmm_t0 = _mm_srli_si128(_mm_xor_si128(xmm_t0, xmm_t1), 8);

  __m256i crc0 = zext128_256(xmm_t0);
  __m256i crc1 = _mm256_setzero_si256();

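The deleted comments document the identity behind this seed value: the (complemented) initial CRC is multiplied by the inverse of x^512 modulo the CRC-32 polynomial 0x104C11DB7, presumably so the factor cancels as the folding rounds multiply the accumulators back up. The new version appears to pre-combine that multiply and the reduction steps into the single bit-reflected constant 0xdfded7ec, leaving one carry-less multiply. A stand-alone C sketch (illustrative only, not package code) that can be used to sanity-check constants of this kind:

#include <stdint.h>

// carry-less (GF(2)) multiply of two 32-bit polynomials -> 64-bit product
static uint64_t clmul32(uint32_t a, uint32_t b) {
    uint64_t r = 0;
    for (int i = 0; i < 32; i++)
        if ((b >> i) & 1) r ^= (uint64_t)a << i;
    return r;
}

// reduce a 64-bit polynomial modulo P(x) = 0x104C11DB7 (the CRC-32 polynomial)
static uint32_t mod_P(uint64_t v) {
    for (int i = 63; i >= 32; i--)
        if ((v >> i) & 1) v ^= 0x104C11DB7ULL << (i - 32);
    return (uint32_t)v;
}

// mod_P(clmul32(crc, K)) computes (crc * K) mod P, the quantity the removed
// comment describes; the SIMD code itself works on bit-reflected values, so the
// constants it embeds are bit-reversed (and shifted) forms of such K values.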
@@ -217,13 +212,14 @@ static uint32_t do_crc32_incremental_clmul(const void* data, size_t length, uint
  return crc_fold((const unsigned char*)data, (long)length, init);
  }

- void crc_clmul256_set_funcs(crc_func* _do_crc32_incremental) {
- *_do_crc32_incremental = &do_crc32_incremental_clmul;
+ void crc_clmul256_set_funcs() {
+ crc_clmul_set_funcs(); // set multiply/shift function
+ _do_crc32_incremental = &do_crc32_incremental_clmul;
+ _crc32_isa = ISA_LEVEL_VPCLMUL;
  }
  #else
- void crc_clmul_set_funcs(crc_func* _do_crc32_incremental);
- void crc_clmul256_set_funcs(crc_func* _do_crc32_incremental) {
- crc_clmul_set_funcs(_do_crc32_incremental);
+ void crc_clmul256_set_funcs() {
+ crc_clmul_set_funcs();
  }
  #endif

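Note that the registration functions above no longer receive a pointer to fill in: they chain to crc_clmul_set_funcs() for the multiply/shift helpers and write process-wide globals. A minimal sketch of that dispatch pattern (names taken from the diff; the real declarations presumably live in crc_common.h, which is not shown here):

#include <stddef.h>
#include <stdint.h>

typedef uint32_t (*crc_func)(const void* data, size_t length, uint32_t init);

crc_func _do_crc32_incremental;   // selected implementation
int _crc32_isa;                   // record of which ISA level was picked

void crc_clmul256_set_funcs(void); // defined above: fills in the globals itself

static uint32_t crc32_update(uint32_t crc, const void* data, size_t len) {
    // hypothetical caller: goes through the global, whatever backend was registered
    return _do_crc32_incremental(data, len, crc);
}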
@@ -0,0 +1,251 @@
+ #include "crc_common.h"
+
+ #if defined(__riscv) && defined(__GNUC__) && (defined(__riscv_zbkc) || defined(__riscv_zbc))
+
+ #if __has_include(<riscv_bitmanip.h>)
+ # include <riscv_bitmanip.h>
+ # if __riscv_xlen == 64
+ # define rv_clmul __riscv_clmul_64
+ # define rv_clmulh __riscv_clmulh_64
+ # else
+ # define rv_clmul __riscv_clmul_32
+ # define rv_clmulh __riscv_clmulh_32
+ # endif
+ #else
+ static HEDLEY_ALWAYS_INLINE uintptr_t rv_clmul(uintptr_t x, uintptr_t y) {
+ uintptr_t r;
+ __asm__("clmul %0, %1, %2\n"
+ : "=r"(r)
+ : "r"(x), "r"(y)
+ :);
+ return r;
+ }
+ static HEDLEY_ALWAYS_INLINE uintptr_t rv_clmulh(uintptr_t x, uintptr_t y) {
+ uintptr_t r;
+ __asm__("clmulh %0, %1, %2\n"
+ : "=r"(r)
+ : "r"(x), "r"(y)
+ :);
+ return r;
+ }
+ #endif
+
+ // TODO: test big-endian
+ #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+ # if __riscv_xlen == 64
+ # define SWAP __builtin_bswap64
+ # else
+ # define SWAP __builtin_bswap32
+ # endif
+ #else
+ # define SWAP(d) (d)
+ #endif
+ static HEDLEY_ALWAYS_INLINE uintptr_t read_partial(const void* p, unsigned sz) {
+ uintptr_t data = 0;
+ memcpy(&data, p, sz);
+ return SWAP(data);
+ }
+ static HEDLEY_ALWAYS_INLINE uintptr_t read_full(const uintptr_t* p) {
+ return SWAP(*p);
+ }
+ #undef SWAP
+
+ static uint32_t rv_crc_calc(uint32_t crc, const unsigned char *src, long len) {
+ uintptr_t accum[4] = {};
+
+ // note: constants here are bit-reflected and shifted left by 1
+ // Zbc does also have clmulr to avoid the shift, but:
+ // - there's no clmulhr, so for XLEN=64, just shift the constant instead to get the same result
+ // - it's unavailable in Zbkc
+ // - for XLEN=32, 2x constants is likely worth it to avoid the additional XORs in the loop
+
+ #if __riscv_xlen == 64
+ const uint64_t MUL_HI = 0x15a546366 /*2^224*/, MUL_LO = 0xf1da05aa /*2^288*/;
+ #define CLMULL rv_clmul
+ #define CLMULH rv_clmulh
+
+ accum[3] = rv_clmul(crc, 0xb66b1fa6); // 2^-32
+ #elif __riscv_xlen == 32
+ const uint64_t MUL_HI = 0x140d44a2e /*2^128*/, MUL_LO = 0x1751997d0 /*2^160*/;
+ #define CLMULL(x, k) rv_clmul(x, k & 0xffffffff)
+ #define CLMULH(x, k) (rv_clmulh(x, k & 0xffffffff) ^ (k > 0xffffffffULL ? (x) : 0))
+
+ accum[2] = rv_clmul(crc, 0xb66b1fa6);
+ accum[3] = rv_clmulh(crc, 0xb66b1fa6);
+ #else
+ #error "Unknown __riscv_xlen"
+ #endif
+ const size_t WS = sizeof(uintptr_t);
+
+ // if src isn't word-aligned, process until it is so
+ long initial_alignment = ((uintptr_t)src & (WS-1));
+ long initial_process = WS - initial_alignment;
+ if(initial_alignment && len >= initial_process) {
+ unsigned shl = initial_alignment * 8, shr = initial_process * 8;
+ #if __riscv_xlen == 64
+ accum[2] = accum[3] << shl;
+ #else
+ accum[1] = accum[2] << shl;
+ accum[2] = (accum[3] << shl) | (accum[2] >> shr);
+ #endif
+ accum[3] = (read_partial(src, initial_process) << shl) | (accum[3] >> shr);
+ src += initial_process;
+ len -= initial_process;
+ }
+
+ // main processing loop
+ const uintptr_t* srcW = (const uintptr_t*)src;
+ while((len -= WS*4) >= 0) {
+ uintptr_t tmpHi, tmpLo;
+ tmpLo = CLMULL(accum[0], MUL_LO) ^ CLMULL(accum[1], MUL_HI);
+ tmpHi = CLMULH(accum[0], MUL_LO) ^ CLMULH(accum[1], MUL_HI);
+ accum[0] = tmpLo ^ read_full(srcW++);
+ accum[1] = tmpHi ^ read_full(srcW++);
+
+ tmpLo = CLMULL(accum[2], MUL_LO) ^ CLMULL(accum[3], MUL_HI);
+ tmpHi = CLMULH(accum[2], MUL_LO) ^ CLMULH(accum[3], MUL_HI);
+ accum[2] = tmpLo ^ read_full(srcW++);
+ accum[3] = tmpHi ^ read_full(srcW++);
+ }
+
+ // process trailing bytes
+ if(len & (WS*2)) {
+ uintptr_t tmpLo = CLMULL(accum[0], MUL_LO) ^ CLMULL(accum[1], MUL_HI);
+ uintptr_t tmpHi = CLMULH(accum[0], MUL_LO) ^ CLMULH(accum[1], MUL_HI);
+ accum[0] = accum[2];
+ accum[1] = accum[3];
+ accum[2] = tmpLo ^ read_full(srcW++);
+ accum[3] = tmpHi ^ read_full(srcW++);
+ }
+ if(len & WS) {
+ uintptr_t tmpLo = CLMULL(accum[0], MUL_HI);
+ uintptr_t tmpHi = CLMULH(accum[0], MUL_HI);
+ accum[0] = accum[1];
+ accum[1] = accum[2];
+ accum[2] = accum[3] ^ tmpLo;
+ accum[3] = tmpHi ^ read_full(srcW++);
+ }
+
+ size_t tail = len & (WS-1);
+ if(tail) {
+ unsigned shl = ((WS - tail) * 8), shr = tail * 8;
+ uintptr_t tmp = accum[0] << shl;
+ uintptr_t tmpLo = CLMULL(tmp, MUL_HI);
+ uintptr_t tmpHi = CLMULH(tmp, MUL_HI);
+ accum[0] = (accum[0] >> shr) | (accum[1] << shl);
+ accum[1] = (accum[1] >> shr) | (accum[2] << shl);
+ accum[2] = (accum[2] >> shr) | (accum[3] << shl);
+ accum[3] = (accum[3] >> shr) | (read_partial(srcW, tail) << shl);
+ accum[2] ^= tmpLo;
+ accum[3] ^= tmpHi;
+ }
+
+
+ // done processing: fold everything down
+ #if __riscv_xlen == 64
+ // fold 0,1 -> 2,3
+ accum[2] ^= rv_clmul(accum[0], 0x1751997d0) ^ rv_clmul(accum[1], 0xccaa009e);
+ accum[3] ^= rv_clmulh(accum[0], 0x1751997d0) ^ rv_clmulh(accum[1], 0xccaa009e);
+
+ // fold 2->3
+ accum[0] = rv_clmulh(accum[2], 0xccaa009e);
+ accum[3] ^= rv_clmul(accum[2], 0xccaa009e);
+
+ // fold 64b->32b
+ accum[1] = rv_clmul(accum[3] & 0xffffffff, 0x163cd6124);
+ accum[0] ^= accum[1] >> 32;
+ accum[3] = accum[1] ^ (accum[3] >> 32);
+ accum[3] <<= 32;
+ #else
+ // fold 0,1 -> 2,3
+ accum[2] ^= rv_clmul(accum[0], 0xccaa009e) ^ CLMULL(accum[1], 0x163cd6124);
+ accum[3] ^= rv_clmulh(accum[0], 0xccaa009e) ^ CLMULH(accum[1], 0x163cd6124);
+
+ // fold 2->3
+ accum[0] = CLMULH(accum[2], 0x163cd6124);
+ accum[3] ^= CLMULL(accum[2], 0x163cd6124);
+ #endif
+
+ // reduction
+ accum[3] = CLMULL(accum[3], 0xf7011641);
+ accum[3] = CLMULH(accum[3], 0x1db710640); // maybe consider clmulr for XLEN=32
+ crc = accum[0] ^ accum[3];
+ return crc;
+ #undef CLMULL
+ #undef CLMULH
+ }
+
+ static uint32_t do_crc32_incremental_rv_zbc(const void* data, size_t length, uint32_t init) {
+ return ~rv_crc_calc(~init, (const unsigned char*)data, (long)length);
+ }
+
+
+ #if __riscv_xlen == 64
+ // note that prod is shifted by 1 place to the right, due to bit-reflection
+ static uint32_t crc32_reduce_rv_zbc(uint64_t prod) {
+ uint64_t t = rv_clmul(prod << 33, 0xf7011641);
+ t = rv_clmulh(t, 0x1db710640);
+ t ^= prod >> 31;
+ return t;
+ }
+ #endif
+ uint32_t crc32_multiply_rv_zbc(uint32_t a, uint32_t b) {
+ #if __riscv_xlen == 64
+ uint64_t t = crc32_reduce_rv_zbc(rv_clmul(a, b));
+ #else
+ uint32_t prodLo = rv_clmul(a, b);
+ uint32_t prodHi = rv_clmulh(a, b);
+
+ // fix prodHi for bit-reflection (clmulr would be ideal here)
+ prodHi += prodHi;
+ prodHi |= prodLo >> 31;
+ prodLo += prodLo;
+
+ uint32_t t = rv_clmul(prodLo, 0xf7011641);
+ t ^= rv_clmulh(t, 0xdb710640);
+ t ^= prodHi;
+ #endif
+ return t;
+ }
+
+ #if defined(__GNUC__) || defined(_MSC_VER)
+ uint32_t crc32_shift_rv_zbc(uint32_t crc1, uint32_t n) {
+ // TODO: require Zbb for ctz
+ uint32_t result = crc1;
+ #if __riscv_xlen == 64
+ // for n<32, can shift directly
+ uint64_t prod = result;
+ prod <<= 31 ^ (n&31);
+ n &= ~31;
+ result = crc32_reduce_rv_zbc(prod);
+ #endif
+ if(!n) return result;
+
+ uint32_t result2 = crc_power[ctz32(n)];
+ n &= n-1;
+
+ while(n) {
+ result = crc32_multiply_rv_zbc(result, crc_power[ctz32(n)]);
+ n &= n-1;
+
+ if(n) {
+ result2 = crc32_multiply_rv_zbc(result2, crc_power[ctz32(n)]);
+ n &= n-1;
+ }
+ }
+ return crc32_multiply_rv_zbc(result, result2);
+ }
+ #endif
+
+
+ void crc_riscv_set_funcs() {
+ _do_crc32_incremental = &do_crc32_incremental_rv_zbc;
+ _crc32_multiply = &crc32_multiply_rv_zbc;
+ #if defined(__GNUC__) || defined(_MSC_VER)
+ _crc32_shift = &crc32_shift_rv_zbc;
+ #endif
+ _crc32_isa = ISA_FEATURE_ZBC;
+ }
+ #else
+ void crc_riscv_set_funcs() {}
+ #endif
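The new file above leans entirely on the Zbc/Zbkc carry-less multiply instructions. A portable reference model of the two (illustration only, not package code) is handy for unit-testing rv_crc_calc on a non-RISC-V host: clmul returns the low XLEN bits of the GF(2) product, clmulh the high bits.

#include <stdint.h>

static uint64_t ref_clmul64(uint64_t a, uint64_t b) {
    uint64_t lo = 0;
    for (int i = 0; i < 64; i++)
        if ((b >> i) & 1) lo ^= a << i;         // low half of the carry-less product
    return lo;
}

static uint64_t ref_clmulh64(uint64_t a, uint64_t b) {
    uint64_t hi = 0;
    for (int i = 1; i < 64; i++)
        if ((b >> i) & 1) hi ^= a >> (64 - i);  // high half of the carry-less product
    return hi;
}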
package/src/decoder.cc CHANGED
@@ -7,6 +7,8 @@ extern "C" {
  YencDecoderEnd (*_do_decode)(const unsigned char**, unsigned char**, size_t, YencDecoderState*) = &do_decode_scalar<false, false>;
  YencDecoderEnd (*_do_decode_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*) = &do_decode_scalar<true, false>;
  YencDecoderEnd (*_do_decode_end_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*) = &do_decode_end_scalar<true>;
+
+ int _decode_isa = ISA_GENERIC;
  }

  void decoder_set_sse2_funcs();
@@ -16,6 +18,7 @@ void decoder_set_avx2_funcs();
  void decoder_set_vbmi2_funcs();
  extern const bool decoder_has_avx10;
  void decoder_set_neon_funcs();
+ void decoder_set_rvv_funcs();


  #if defined(PLATFORM_X86) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
@@ -23,23 +26,50 @@ void decoder_set_neon_funcs();
  # include "decoder_avx2_base.h"
  static inline void decoder_set_native_funcs() {
  ALIGN_ALLOC(lookups, sizeof(*lookups), 16);
- decoder_init_lut(lookups->eqFix, lookups->compact);
+ decoder_init_lut(lookups->compact);
  _do_decode = &do_decode_simd<false, false, sizeof(__m256i)*2, do_decode_avx2<false, false, ISA_NATIVE> >;
  _do_decode_raw = &do_decode_simd<true, false, sizeof(__m256i)*2, do_decode_avx2<true, false, ISA_NATIVE> >;
  _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m256i)*2, do_decode_avx2<true, true, ISA_NATIVE> >;
+ _decode_isa = ISA_NATIVE;
  }
  # else
  # include "decoder_sse_base.h"
  static inline void decoder_set_native_funcs() {
- decoder_sse_init();
- decoder_init_lut(lookups->eqFix, lookups->compact);
+ decoder_sse_init(lookups);
+ decoder_init_lut(lookups->compact);
  _do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_NATIVE> >;
  _do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_NATIVE> >;
  _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_NATIVE> >;
+ _decode_isa = ISA_NATIVE;
  }
  # endif
  #endif

+
+ #if defined(PLATFORM_X86) || defined(PLATFORM_ARM)
+ void decoder_init_lut(void* compactLUT) {
+ #ifdef YENC_DEC_USE_THINTABLE
+ const int tableSize = 8;
+ #else
+ const int tableSize = 16;
+ #endif
+ for(int i=0; i<(tableSize==8?256:32768); i++) {
+ int k = i;
+ uint8_t* res = (uint8_t*)compactLUT + i*tableSize;
+ int p = 0;
+ for(int j=0; j<tableSize; j++) {
+ if(!(k & 1)) {
+ res[p++] = j;
+ }
+ k >>= 1;
+ }
+ for(; p<tableSize; p++)
+ res[p] = 0x80;
+ }
+ }
+ #endif
+
+
  void decoder_init() {
  #ifdef PLATFORM_X86
  # if defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
@@ -62,4 +92,8 @@ void decoder_init() {
  if(cpu_supports_neon())
  decoder_set_neon_funcs();
  #endif
+ #ifdef __riscv
+ if(cpu_supports_rvv())
+ decoder_set_rvv_funcs();
+ #endif
  }
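The decoder_init_lut definition added here (and removed from the shared decoder header later in this diff) builds the byte-compaction table: for every bitmask it stores the indices of the clear bits (the bytes to keep), padded with 0x80, presumably for use as a pshufb-style shuffle control where an index with the high bit set yields zero. A small illustrative check of that layout (not package code):

#include <assert.h>
#include <stdint.h>
#include <string.h>

// mirror of the loop body above, for one mask and one table width
static void build_entry(unsigned mask, int tableSize, uint8_t* res) {
    int p = 0;
    for (int j = 0; j < tableSize; j++)
        if (!((mask >> j) & 1)) res[p++] = (uint8_t)j;
    for (; p < tableSize; p++) res[p] = 0x80;
}

int main(void) {
    uint8_t e[8];
    build_entry(0x05 /* bits 0 and 2 set: drop bytes 0 and 2 */, 8, e);
    const uint8_t expect[8] = {1, 3, 4, 5, 6, 7, 0x80, 0x80};
    assert(memcmp(e, expect, sizeof expect) == 0);
    return 0;
}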
package/src/decoder.h CHANGED
@@ -32,6 +32,7 @@ typedef enum {
  extern YencDecoderEnd (*_do_decode)(const unsigned char**, unsigned char**, size_t, YencDecoderState*);
  extern YencDecoderEnd (*_do_decode_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*);
  extern YencDecoderEnd (*_do_decode_end_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*);
+ extern int _decode_isa;

  static inline size_t do_decode(int isRaw, const unsigned char* src, unsigned char* dest, size_t len, YencDecoderState* state) {
  unsigned char* ds = dest;
@@ -45,6 +46,9 @@ static inline YencDecoderEnd do_decode_end(const unsigned c

  void decoder_init();

+ static inline int decode_isa_level() {
+ return _decode_isa;
+ }


  #ifdef __cplusplus
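With _decode_isa exported and the inline accessor added, callers can report which kernel decoder_init() selected. A hypothetical caller (illustrative only, not part of the package):

#include "decoder.h"
#include <cstdio>

int main() {
    decoder_init();
    std::printf("decoder ISA level: %d\n", decode_isa_level());
    return 0;
}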
@@ -4,11 +4,12 @@
  #include "decoder_common.h"
  #include "decoder_sse_base.h"
  void decoder_set_avx_funcs() {
- decoder_sse_init();
- decoder_init_lut(lookups->eqFix, lookups->compact);
+ decoder_sse_init(lookups);
+ decoder_init_lut(lookups->compact);
  _do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_SSE4_POPCNT> >;
  _do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_SSE4_POPCNT> >;
  _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_SSE4_POPCNT> >;
+ _decode_isa = ISA_LEVEL_AVX;
  }
  #else
  void decoder_set_ssse3_funcs();
@@ -5,10 +5,11 @@
  #include "decoder_avx2_base.h"
  void decoder_set_avx2_funcs() {
  ALIGN_ALLOC(lookups, sizeof(*lookups), 16);
- decoder_init_lut(lookups->eqFix, lookups->compact);
+ decoder_init_lut(lookups->compact);
  _do_decode = &do_decode_simd<false, false, sizeof(__m256i)*2, do_decode_avx2<false, false, ISA_LEVEL_AVX2> >;
  _do_decode_raw = &do_decode_simd<true, false, sizeof(__m256i)*2, do_decode_avx2<true, false, ISA_LEVEL_AVX2> >;
  _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m256i)*2, do_decode_avx2<true, true, ISA_LEVEL_AVX2> >;
+ _decode_isa = ISA_LEVEL_AVX2;
  }
  #else
  void decoder_set_avx_funcs();
@@ -15,7 +15,6 @@
  #pragma pack(16)
  static struct {
  /*align16*/ struct { char bytes[16]; } compact[32768];
- uint8_t eqFix[256];
  } * HEDLEY_RESTRICT lookups;
  #pragma pack()

@@ -67,6 +66,8 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* src, long& len, unsigned
  );
  }

+ decoder_set_nextMask<isRaw>(src, len, _nextMask); // set this before the loop because we can't check src after it's been overwritten
+
  // for some reason, MSVC Win32 seems to crash when trying to compile _mm256_mask_cmpeq_epi8_mask
  // the crash can be fixed by switching the order of the last two arguments, but it seems to generate wrong code
  // so just disable the optimisation as it seems to be problematic there
@@ -320,6 +321,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* src, long& len, unsigned
  // terminator found
  // there's probably faster ways to do this, but reverting to scalar code should be good enough
  len += (long)i;
+ _nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
  break;
  }
  }
@@ -412,6 +414,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* src, long& len, unsigned
  }
  if(endFound) {
  len += (long)i;
+ _nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
  break;
  }
  }
@@ -427,16 +430,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* src, long& len, unsigned
  dataB = _mm256_add_epi8(oDataB, _mm256_set1_epi8(-42));

  if(LIKELIHOOD(0.0001, (mask & ((maskEq << 1) + escFirst)) != 0)) {
- unsigned tmp = lookups->eqFix[(maskEq&0xff) & ~(uint64_t)escFirst];
- uint64_t maskEq2 = tmp;
- for(int j=8; j<64; j+=8) {
- tmp = lookups->eqFix[(unsigned)((maskEq>>j)&0xff) & ~(tmp>>7)];
- maskEq2 |= (uint64_t)tmp<<j;
- }
- maskEq = maskEq2;
-
+ maskEq = fix_eqMask<uint64_t>(maskEq & ~(uint64_t)escFirst);
  mask &= ~(uint64_t)escFirst;
- escFirst = tmp>>7;
+ escFirst = maskEq>>63;
  // next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed
  maskEq <<= 1;
  mask &= ~maskEq;
@@ -613,20 +609,6 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* src, long& len, unsigned
  }
  }
  _escFirst = (unsigned char)escFirst;
- if(isRaw) {
- // this would be the trivial solution, but requires the compiler holding onto minMask throughout the loop:
- //_nextMask = ~(uint16_t)_mm256_movemask_epi8(_mm256_cmpeq_epi8(minMask, _mm256_set1_epi8('.')));
- // instead, just scan the memory to determine what to set nextMask to
- if(len != 0) { // have to gone through at least one loop cycle
- if(src[i-2] == '\r' && src[i-1] == '\n' && src[i] == '.')
- _nextMask = 1;
- else if(src[i-1] == '\r' && src[i] == '\n' && src[i+1] == '.')
- _nextMask = 2;
- else
- _nextMask = 0;
- }
- } else
- _nextMask = 0;
  _mm256_zeroupper();
  }
  #endif
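Two things worth noting about the rewritten block above: after fix_eqMask, the top bit of the 64-byte maskEq directly indicates whether the block's final byte is an active escape, so escFirst can simply be taken from maskEq>>63 instead of the old per-byte eqFix table walk; and the end-of-kernel tail scan that used to recompute _nextMask has been dropped in favour of the shared decoder_set_nextMask helpers, which are now called once before the loop and at the two early-exit points.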
@@ -1,5 +1,9 @@
  #include "decoder.h"

+ #if defined(PLATFORM_ARM) && !defined(__aarch64__)
+ #define YENC_DEC_USE_THINTABLE 1
+ #endif
+
  // TODO: need to support max output length somehow
  // TODO: add branch probabilities

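For context on the new YENC_DEC_USE_THINTABLE guard: judging from decoder_init_lut earlier in this diff, the thin variant shrinks the compaction table from 32768 entries of 16 bytes (512 KiB) to 256 entries of 8 bytes (2 KiB), presumably a better fit for 32-bit ARM targets, at the cost of compacting 8 bytes at a time instead of 16.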
@@ -178,24 +182,24 @@ YencDecoderEnd do_decode_end_scalar(const unsigned char** src, unsigned char** d
  if(es[i] == '.' && isRaw) {
  i++;
  YDEC_CHECK_END(YDEC_STATE_CRLFDT)
- // fall-through
  } else if(es[i] == '=') {
  i++;
  YDEC_CHECK_END(YDEC_STATE_CRLFEQ)
  goto do_decode_endable_scalar_ceq;
  } else
  break;
+ // fall-through
  case YDEC_STATE_CRLFDT:
  if(isRaw && es[i] == '\r') {
  i++;
  YDEC_CHECK_END(YDEC_STATE_CRLFDTCR)
- // fall-through
  } else if(isRaw && es[i] == '=') { // check for dot-stuffed ending: \r\n.=y
  i++;
  YDEC_CHECK_END(YDEC_STATE_CRLFEQ)
  goto do_decode_endable_scalar_ceq;
  } else
  break;
+ // fall-through
  case YDEC_STATE_CRLFDTCR:
  if(es[i] == '\n') {
  if(isRaw) {
@@ -331,8 +335,8 @@ YencDecoderEnd do_decode_scalar(const unsigned char** src, unsigned char** dest,



- template<bool isRaw, bool searchEnd, int width, void(&kernel)(const uint8_t*, long&, unsigned char*&, unsigned char&, uint16_t&)>
- YencDecoderEnd do_decode_simd(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
+ template<bool isRaw, bool searchEnd, void(&kernel)(const uint8_t*, long&, unsigned char*&, unsigned char&, uint16_t&)>
+ inline YencDecoderEnd _do_decode_simd(size_t width, const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
  if(len <= width*2) return do_decode_scalar<isRaw, searchEnd>(src, dest, len, state);

  YencDecoderState tState = YDEC_STATE_CRLF;
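This refactor turns the SIMD block width from a compile-time template parameter into a run-time argument of _do_decode_simd; the next hunk adds two thin do_decode_simd wrappers, one keeping the old constant-width form and one taking a size_t(&getWidth)() callback. Presumably this is what lets the new RISC-V Vector decoder, whose register width is only known at run time, reuse the same driver. A hypothetical instantiation of the callback form (illustrative only; rvv_width and do_decode_rvv are stand-ins, the real RVV kernel is not part of this diff):

static size_t rvv_width() {
    // a real implementation would query the hardware vector length here
    return 32;
}
static void do_decode_rvv(const uint8_t*, long&, unsigned char*&, unsigned char&, uint16_t&) {
    // ... vector kernel body ...
}

// registered the same way the x86/NEON kernels are:
// _do_decode_raw = &do_decode_simd<true, false, rvv_width, do_decode_rvv>;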
@@ -461,52 +465,60 @@ YencDecoderEnd do_decode_simd(const unsigned char** src, unsigned char** dest, s
  return YDEC_END_NONE;
  }

- static inline void decoder_init_lut(uint8_t* eqFixLUT, void* compactLUT) {
- for(int i=0; i<256; i++) {
- int k = i;
- int p = 0;
-
- // fix LUT
- k = i;
- p = 0;
- for(int j=0; j<8; j++) {
- k = i >> j;
- if(k & 1) {
- p |= 1 << j;
- j++;
- }
- }
- eqFixLUT[i] = p;
-
- #ifdef YENC_DEC_USE_THINTABLE
- uint8_t* res = (uint8_t*)compactLUT + i*8;
- k = i;
- p = 0;
- for(int j=0; j<8; j++) {
- if(!(k & 1)) {
- res[p++] = j;
- }
- k >>= 1;
- }
- for(; p<8; p++)
- res[p] = 0x80;
- #endif
- }
- #ifndef YENC_DEC_USE_THINTABLE
- for(int i=0; i<32768; i++) {
- int k = i;
- uint8_t* res = (uint8_t*)compactLUT + i*16;
- int p = 0;
-
- for(int j=0; j<16; j++) {
- if(!(k & 1)) {
- res[p++] = j;
- }
- k >>= 1;
+ template<bool isRaw, bool searchEnd, size_t width, void(&kernel)(const uint8_t*, long&, unsigned char*&, unsigned char&, uint16_t&)>
+ YencDecoderEnd do_decode_simd(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
+ return _do_decode_simd<isRaw, searchEnd, kernel>(width, src, dest, len, state);
+ }
+ template<bool isRaw, bool searchEnd, size_t(&getWidth)(), void(&kernel)(const uint8_t*, long&, unsigned char*&, unsigned char&, uint16_t&)>
+ YencDecoderEnd do_decode_simd(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
+ return _do_decode_simd<isRaw, searchEnd, kernel>(getWidth(), src, dest, len, state);
+ }
+
+
+ #if defined(PLATFORM_X86) || defined(PLATFORM_ARM)
+ void decoder_init_lut(void* compactLUT);
+ #endif
+
+ template<bool isRaw>
+ static inline void decoder_set_nextMask(const uint8_t* src, size_t len, uint16_t& nextMask) {
+ if(isRaw) {
+ if(len != 0) { // have to gone through at least one loop cycle
+ if(src[-2] == '\r' && src[-1] == '\n' && src[0] == '.')
+ nextMask = 1;
+ else if(src[-1] == '\r' && src[0] == '\n' && src[1] == '.')
+ nextMask = 2;
+ else
+ nextMask = 0;
  }
- for(; p<16; p++)
- res[p] = 0x80;
+ } else
+ nextMask = 0;
+ }
+
+ // without backtracking
+ template<bool isRaw>
+ static inline uint16_t decoder_set_nextMask(const uint8_t* src, unsigned mask) {
+ if(isRaw) {
+ if(src[0] == '.')
+ return mask & 1;
+ if(src[1] == '.')
+ return mask & 2;
  }
- #endif
+ return 0;
  }

+ // resolve invalid sequences of = to deal with cases like '===='
+ // bit hack inspired from simdjson: https://youtu.be/wlvKAT7SZIQ?t=33m38s
+ template<typename T>
+ static inline T fix_eqMask(T mask) {
+ // isolate the start of each consecutive bit group (e.g. 01011101 -> 01000101)
+ T start = mask & ~(mask << 1);
+
+ const T odd = (T)0xaaaaaaaaaaaaaaaa; // every odd bit (10101010...)
+
+ // obtain groups which start on an even bit (clear groups that start on an odd bit, but this leaves an unwanted trailing bit)
+ T evenGroups = mask + (start & odd);
+
+ // clear odd bits in even groups, whilst conversely preserving odd bits in odd groups
+ // the `& mask` also conveniently gets rid of unwanted trailing bits
+ return (evenGroups ^ odd) & mask;
+ }
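fix_eqMask is the replacement for the old eqFix byte table: within each run of consecutive '=' bits it keeps the 1st, 3rd, 5th... bit (the characters that really act as escapes) and clears the rest (the escaped data bytes). A small illustrative test against a straightforward scalar reference, assuming the template above is in scope (not package code):

#include <cassert>
#include <cstdint>

static uint64_t fix_eqMask_ref(uint64_t mask) {
    uint64_t out = 0;
    bool prevKept = false;
    for (int i = 0; i < 64; i++) {
        bool set = (mask >> i) & 1;
        bool keep = set && !prevKept; // '=' escapes only if the previous byte wasn't itself an escape
        if (keep) out |= 1ULL << i;
        prevKept = keep;
    }
    return out;
}

int main() {
    // 0b01011101: runs at {0}, {2,3,4}, {6} -> keep bits 0, 2, 4, 6 = 0b01010101
    assert(fix_eqMask<uint64_t>(0x5D) == 0x55);
    assert(fix_eqMask_ref(0x5D) == 0x55);
    for (uint64_t m = 0; m < 65536; m++)
        assert(fix_eqMask<uint64_t>(m) == fix_eqMask_ref(m));
    return 0;
}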