yencode 1.0.8 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/README.md +339 -231
  2. package/binding.gyp +292 -39
  3. package/crcutil-1.0/code/multiword_64_64_gcc_amd64_asm.cc +7 -7
  4. package/crcutil-1.0/code/multiword_64_64_gcc_i386_mmx.cc +14 -14
  5. package/crcutil-1.0/code/multiword_64_64_intrinsic_i386_mmx.cc +1 -1
  6. package/crcutil-1.0/code/uint128_sse2.h +2 -0
  7. package/index.js +329 -22
  8. package/package.json +2 -2
  9. package/src/common.h +299 -0
  10. package/src/crc.cc +95 -0
  11. package/src/crc.h +23 -0
  12. package/src/crc_arm.cc +175 -0
  13. package/src/crc_common.h +4 -0
  14. package/{crc_folding.c → src/crc_folding.cc} +175 -185
  15. package/src/decoder.cc +61 -0
  16. package/src/decoder.h +53 -0
  17. package/src/decoder_avx.cc +18 -0
  18. package/src/decoder_avx2.cc +18 -0
  19. package/src/decoder_avx2_base.h +615 -0
  20. package/src/decoder_common.h +512 -0
  21. package/src/decoder_neon.cc +474 -0
  22. package/src/decoder_neon64.cc +451 -0
  23. package/src/decoder_sse2.cc +16 -0
  24. package/src/decoder_sse_base.h +711 -0
  25. package/src/decoder_ssse3.cc +18 -0
  26. package/src/encoder.cc +170 -0
  27. package/src/encoder.h +21 -0
  28. package/src/encoder_avx.cc +16 -0
  29. package/src/encoder_avx2.cc +16 -0
  30. package/src/encoder_avx_base.h +564 -0
  31. package/src/encoder_common.h +109 -0
  32. package/src/encoder_neon.cc +547 -0
  33. package/src/encoder_sse2.cc +13 -0
  34. package/src/encoder_sse_base.h +724 -0
  35. package/src/encoder_ssse3.cc +18 -0
  36. package/src/hedley.h +1899 -0
  37. package/src/platform.cc +147 -0
  38. package/src/yencode.cc +449 -0
  39. package/test/_maxsize.js +9 -0
  40. package/test/_speedbase.js +147 -0
  41. package/test/speedcrc.js +20 -0
  42. package/test/speeddec.js +92 -0
  43. package/test/speedenc.js +44 -0
  44. package/{testcrc.js → test/testcrc.js} +53 -39
  45. package/test/testdec.js +183 -0
  46. package/test/testenc.js +163 -0
  47. package/test/testpostdec.js +126 -0
  48. package/test.js +0 -91
  49. package/yencode.cc +0 -1622
package/src/decoder_neon.cc
@@ -0,0 +1,474 @@
+ #include "common.h"
+ #ifdef __ARM_NEON
+
+ #ifndef __aarch64__
+ #define YENC_DEC_USE_THINTABLE 1
+ #endif
+ #include "decoder_common.h"
+
+
+ #if defined(_MSC_VER) && !defined(__clang__)
+ # define vld1_u8_align(p, a) vld1_u8_ex(p, a*8)
+ # define vld1q_u8_align(p, a) vld1q_u8_ex(p, a*8)
+ #elif defined(__GNUC__)
+ # define vld1_u8_align(p, n) vld1_u8((uint8_t*)__builtin_assume_aligned(p, n))
+ # define vld1q_u8_align(p, n) vld1q_u8((uint8_t*)__builtin_assume_aligned(p, n))
+ #else
+ # define vld1_u8_align(p, n) vld1_u8(p)
+ # define vld1q_u8_align(p, n) vld1q_u8(p)
+ #endif
+
+
+ // for compilers that lack these functions
+ #if defined(__clang__) || (defined(__GNUC__) && (defined(__aarch64__) && __GNUC__ >= 8))
+ # define vld1q_u8_x2_align(p, n) vld1q_u8_x2((uint8_t*)__builtin_assume_aligned(p, n))
+ #else
+ # define vld1q_u8_x2_align(p, n) vcreate2_u8(vld1q_u8_align(p, (n)/2), vld1q_u8_align((p)+16, (n)/2))
+ #endif
+ // Clang wrongly assumes alignment on vld1q_u8_x2, and ARMv7 GCC doesn't support the function, so effectively, it can only be used in ARMv8 compilers
+ #if defined(__aarch64__) && (defined(__clang__) || (defined(__GNUC__) && __GNUC__ >= 9))
+ # define vst1q_u8_x2_unaligned vst1q_u8_x2
+ #else
+ static HEDLEY_ALWAYS_INLINE void vst1q_u8_x2_unaligned(uint8_t* p, uint8x16x2_t data) {
+ vst1q_u8(p, data.val[0]);
+ vst1q_u8(p+16, data.val[1]);
+ }
+ #endif
+
+ #ifdef YENC_DEC_USE_THINTABLE
+ static uint64_t ALIGN_TO(8, compactLUT[256]);
+ #else
+ # pragma pack(16)
+ static struct { char bytes[16]; } ALIGN_TO(16, compactLUT[32768]);
+ # pragma pack()
+ #endif
+
+ static uint8_t eqFixLUT[256];
+
+
+
+ static bool neon_vect_is_nonzero(uint8x16_t v) {
+ # ifdef __aarch64__
+ return !!(vget_lane_u64(vreinterpret_u64_u32(vqmovn_u64(vreinterpretq_u64_u8(v))), 0));
+ # else
+ uint32x4_t tmp1 = vreinterpretq_u32_u8(v);
+ uint32x2_t tmp2 = vorr_u32(vget_low_u32(tmp1), vget_high_u32(tmp1));
+ return !!(vget_lane_u32(vpmax_u32(tmp2, tmp2), 0));
+ # endif
+ }
+
+
+ template<bool isRaw, bool searchEnd>
+ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, long& len, unsigned char* HEDLEY_RESTRICT & p, unsigned char& escFirst, uint16_t& nextMask) {
+ HEDLEY_ASSUME(escFirst == 0 || escFirst == 1);
+ HEDLEY_ASSUME(nextMask == 0 || nextMask == 1 || nextMask == 2);
+ uint8x16_t yencOffset = escFirst ? vmakeq_u8(42+64,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42) : vdupq_n_u8(42);
+ #ifdef __aarch64__
+ uint8x16_t nextMaskMix = vdupq_n_u8(0);
+ if(nextMask == 1)
+ nextMaskMix = vsetq_lane_u8(1, nextMaskMix, 0);
+ if(nextMask == 2)
+ nextMaskMix = vsetq_lane_u8(2, nextMaskMix, 1);
+ #else
+ uint8x16_t lfCompare = vdupq_n_u8('\n');
+ if(isRaw) {
+ if(nextMask == 1)
+ lfCompare = vsetq_lane_u8('.', lfCompare, 0);
+ if(nextMask == 2)
+ lfCompare = vsetq_lane_u8('.', lfCompare, 1);
+ }
+ #endif
+ long i;
+ for(i = -len; i; i += sizeof(uint8x16_t)*2) {
+ uint8x16x2_t data = vld1q_u8_x2_align(src+i, 32);
+ uint8x16_t dataA = data.val[0];
+ uint8x16_t dataB = data.val[1];
+
+ // search for special chars
+ uint8x16_t cmpEqA = vceqq_u8(dataA, vdupq_n_u8('=')),
+ cmpEqB = vceqq_u8(dataB, vdupq_n_u8('=')),
+ #ifdef __aarch64__
+ cmpA = vqtbx1q_u8(
+ cmpEqA,
+ // \n \r
+ vmakeq_u8(0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
+ dataA
+ ),
+ cmpB = vqtbx1q_u8(
+ cmpEqB,
+ vmakeq_u8(0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
+ dataB
+ );
+ if(isRaw) cmpA = vorrq_u8(cmpA, nextMaskMix);
+ #else
+ cmpCrA = vceqq_u8(dataA, vdupq_n_u8('\r')),
+ cmpCrB = vceqq_u8(dataB, vdupq_n_u8('\r')),
+ cmpA = vorrq_u8(
+ vorrq_u8(
+ cmpCrA,
+ vceqq_u8(dataA, lfCompare)
+ ),
+ cmpEqA
+ ),
+ cmpB = vorrq_u8(
+ vorrq_u8(
+ cmpCrB,
+ vceqq_u8(dataB, vdupq_n_u8('\n'))
+ ),
+ cmpEqB
+ );
+ #endif
+
+
+ #ifdef __aarch64__
+ if (LIKELIHOOD(0.42 /*guess*/, neon_vect_is_nonzero(vorrq_u8(cmpA, cmpB)))) {
+ cmpA = vandq_u8(cmpA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+ cmpB = vandq_u8(cmpB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+ uint8x16_t cmpMerge = vpaddq_u8(cmpA, cmpB);
+ uint8x16_t cmpEqMerge = vpaddq_u8(
+ vandq_u8(cmpEqA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
+ vandq_u8(cmpEqB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128))
+ );
+
+ uint8x16_t cmpCombined = vpaddq_u8(cmpMerge, cmpEqMerge);
+ cmpCombined = vpaddq_u8(cmpCombined, cmpCombined);
+ uint8x8_t cmpPacked = vget_low_u8(cmpCombined);
+ uint32_t mask = vgetq_lane_u32(vreinterpretq_u32_u8(cmpCombined), 0);
+ uint32_t maskEq = vgetq_lane_u32(vreinterpretq_u32_u8(cmpCombined), 1);
+ #else
+ cmpA = vandq_u8(cmpA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+ cmpB = vandq_u8(cmpB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+ // no vpaddq_u8 in ARMv7, so need extra 64-bit VPADD
+ uint8x8_t cmpPacked = vpadd_u8(
+ vpadd_u8(
+ vget_low_u8(cmpA), vget_high_u8(cmpA)
+ ),
+ vpadd_u8(
+ vget_low_u8(cmpB), vget_high_u8(cmpB)
+ )
+ );
+ cmpPacked = vpadd_u8(cmpPacked, cmpPacked);
+ uint32_t mask = vget_lane_u32(vreinterpret_u32_u8(cmpPacked), 0);
+ if(LIKELIHOOD(0.42, mask != 0)) {
+ uint8x16_t cmpEqMaskedA = vandq_u8(cmpEqA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+ uint8x16_t cmpEqMaskedB = vandq_u8(cmpEqB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+ uint8x8_t cmpEqPacked = vpadd_u8(
+ vpadd_u8(
+ vget_low_u8(cmpEqMaskedA), vget_high_u8(cmpEqMaskedA)
+ ),
+ vpadd_u8(
+ vget_low_u8(cmpEqMaskedB), vget_high_u8(cmpEqMaskedB)
+ )
+ );
+ cmpEqPacked = vpadd_u8(cmpEqPacked, cmpEqPacked);
+ uint32_t maskEq = vget_lane_u32(vreinterpret_u32_u8(cmpEqPacked), 0);
+ #endif
+
+ // handle \r\n. sequences
+ // RFC3977 requires the first dot on a line to be stripped, due to dot-stuffing
+ if((isRaw || searchEnd) && LIKELIHOOD(0.15, mask != maskEq)) {
+ // vext seems to be a cheap operation on ARM, relative to loads, so only avoid it if there's only one load (isRaw only)
+ uint8x16_t tmpData2, nextData;
+ if(isRaw && !searchEnd) {
+ tmpData2 = vld1q_u8(src+i + 2 + sizeof(uint8x16_t));
+ } else {
+ nextData = vld1q_u8_align(src+i + sizeof(uint8x16_t)*2, 16); // only 32-bits needed, but there doesn't appear to be a nice way to do this via intrinsics: https://stackoverflow.com/questions/46910799/arm-neon-intrinsics-convert-d-64-bit-register-to-low-half-of-q-128-bit-regis
+ tmpData2 = vextq_u8(dataB, nextData, 2);
+ }
+ #ifdef __aarch64__
+ uint8x16_t cmpCrA = vceqq_u8(dataA, vdupq_n_u8('\r'));
+ uint8x16_t cmpCrB = vceqq_u8(dataB, vdupq_n_u8('\r'));
+ # define NEXT_DATA(n) vextq_u8(dataB, nextData, n)
+ #else
+ // on ARMv7, prefer loading over VEXT to avoid holding onto nextData reference; this reduces register spills. Shouldn't be an issue on ARMv8 due to 32x 128-bit registers
+ # define NEXT_DATA(n) vld1q_u8(src+i + n+sizeof(uint8x16_t))
+ #endif
+ uint8x16_t match2EqA, match2Cr_DotA;
+ uint8x16_t match2EqB, match2Cr_DotB;
+ if(searchEnd) {
+ match2EqB = vceqq_u8(tmpData2, vdupq_n_u8('='));
+ }
+ if(isRaw) {
+ match2Cr_DotA = vandq_u8(cmpCrA, vceqq_u8(vextq_u8(dataA, dataB, 2), vdupq_n_u8('.')));
+ match2Cr_DotB = vandq_u8(cmpCrB, vceqq_u8(tmpData2, vdupq_n_u8('.')));
+ }
+
+ // find patterns of \r_.
+ if(isRaw && LIKELIHOOD(0.001, neon_vect_is_nonzero(
+ vorrq_u8(match2Cr_DotA, match2Cr_DotB)
+ ))) {
+ uint8x16_t match1LfA = vceqq_u8(vextq_u8(dataA, dataB, 1), vdupq_n_u8('\n'));
+ uint8x16_t match1LfB;
+ if(searchEnd)
+ match1LfB = vceqq_u8(NEXT_DATA(1), vdupq_n_u8('\n'));
+ else
+ match1LfB = vceqq_u8(vld1q_u8(src+i + 1+sizeof(uint8x16_t)), vdupq_n_u8('\n'));
+ // merge matches of \r_. with those for \n
+ uint8x16_t match2NlDotA = vandq_u8(match2Cr_DotA, match1LfA);
+ uint8x16_t match2NlDotB = vandq_u8(match2Cr_DotB, match1LfB);
+ if(searchEnd) {
+ uint8x16_t match1NlA = vandq_u8(match1LfA, cmpCrA);
+ uint8x16_t match1NlB = vandq_u8(match1LfB, cmpCrB);
+
+ uint8x16_t tmpData3 = NEXT_DATA(3);
+ uint8x16_t tmpData4 = NEXT_DATA(4);
+ // match instances of \r\n.\r\n and \r\n.=y
+ uint8x16_t match3CrB = vceqq_u8(tmpData3, vdupq_n_u8('\r'));
+ uint8x16_t match4LfB = vceqq_u8(tmpData4, vdupq_n_u8('\n'));
+ uint8x16_t match4Nl = vbslq_u8(vdupq_n_u8('\r'), // exact VBSL vector doesn't matter, so reuse the '\r' vector
+ vextq_u8(match1NlA, match1NlB, 3),
+ vandq_u8(match3CrB, match4LfB)
+ );
+ uint8x16_t match4EqY = vbslq_u8(vdupq_n_u8('\r'),
+ // match =y
+ vreinterpretq_u8_u16(vceqq_u16(vreinterpretq_u16_u8(vextq_u8(dataA, dataB, 4)), vdupq_n_u16(0x793d))),
+ vreinterpretq_u8_u16(vceqq_u16(vreinterpretq_u16_u8(tmpData4), vdupq_n_u16(0x793d)))
+ );
+
+ match2EqA = vextq_u8(cmpEqA, cmpEqB, 2);
+ uint8x16_t match3EqY = vbslq_u8(vdupq_n_u8('\r'),
+ vandq_u8(
+ vceqq_u8(vextq_u8(dataA, dataB, 3), vdupq_n_u8('y')),
+ match2EqA
+ ), vandq_u8(
+ vceqq_u8(tmpData3, vdupq_n_u8('y')),
+ match2EqB
+ )
+ );
+ // merge \r\n and =y matches for tmpData4
+ uint8x16_t match4End = vorrq_u8(
+ match4Nl,
+ vreinterpretq_u8_u16(vsriq_n_u16(vreinterpretq_u16_u8(match4EqY), vreinterpretq_u16_u8(match3EqY), 8))
+ );
+ // merge with \r\n.
+ uint8x16_t match2NlDot = vbslq_u8(vdupq_n_u8('\r'), match2NlDotA, match2NlDotB);
+ match4End = vandq_u8(match4End, match2NlDot);
+ // match \r\n=y
+ uint8x16_t match1Nl = vbslq_u8(vdupq_n_u8('\r'), match1NlA, match1NlB);
+ uint8x16_t match3End = vandq_u8(match3EqY, match1Nl);
+ // combine match sequences
+ if(LIKELIHOOD(0.001, neon_vect_is_nonzero(vorrq_u8(match4End, match3End)))) {
+ // terminator found
+ // there are probably faster ways to do this, but reverting to scalar code should be good enough
+ len += i;
+ break;
+ }
+ }
+ #ifdef __aarch64__
+ uint8x16_t match2NlDotBMasked = vandq_u8(match2NlDotB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+ uint8x16_t mergeKillDots = vpaddq_u8(
+ vandq_u8(match2NlDotA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
+ match2NlDotBMasked
+ );
+ uint8x8_t mergeKillDots2 = vget_low_u8(vpaddq_u8(mergeKillDots, mergeKillDots));
+ #else
+ uint8x16_t match2NlDotMaskedA = vandq_u8(match2NlDotA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+ uint8x16_t match2NlDotMaskedB = vandq_u8(match2NlDotB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+ uint8x8_t mergeKillDots2 = vpadd_u8(
+ vpadd_u8(
+ vget_low_u8(match2NlDotMaskedA), vget_high_u8(match2NlDotMaskedA)
+ ),
+ vpadd_u8(
+ vget_low_u8(match2NlDotMaskedB), vget_high_u8(match2NlDotMaskedB)
+ )
+ );
+ #endif
+ mergeKillDots2 = vpadd_u8(mergeKillDots2, mergeKillDots2);
+ uint32x2_t mergeKillDotsShifted = vshl_n_u32(vreinterpret_u32_u8(mergeKillDots2), 2);
+ mask |= vget_lane_u32(mergeKillDotsShifted, 0);
+ cmpPacked = vorr_u8(cmpPacked, vreinterpret_u8_u32(mergeKillDotsShifted));
+ #ifdef __aarch64__
+ nextMaskMix = vextq_u8(match2NlDotB, vdupq_n_u8(0), 14);
+ #else
+ lfCompare = vcombine_u8(vbsl_u8(
+ vext_u8(vget_high_u8(match2NlDotB), vdup_n_u8('\n'), 6),
+ vdup_n_u8('.'),
+ vget_high_u8(lfCompare)
+ ), vget_high_u8(lfCompare));
+ #endif
+ } else if(searchEnd) {
+ match2EqA = vextq_u8(cmpEqA, cmpEqB, 2);
+ uint8x16_t match3EqYA = vandq_u8(match2EqA, vceqq_u8(vextq_u8(dataA, dataB, 3), vdupq_n_u8('y')));
+ uint8x16_t match3EqYB = vandq_u8(match2EqB, vceqq_u8(NEXT_DATA(3), vdupq_n_u8('y')));
+ if(LIKELIHOOD(0.001, neon_vect_is_nonzero(vorrq_u8(
+ match3EqYA, match3EqYB
+ )))) {
+ uint8x16_t match1LfA = vceqq_u8(vextq_u8(dataA, dataB, 1), vdupq_n_u8('\n'));
+ uint8x16_t match1LfB = vceqq_u8(NEXT_DATA(1), vdupq_n_u8('\n'));
+ uint8x16_t matchEnd = vorrq_u8(
+ vandq_u8(match3EqYA, vandq_u8(match1LfA, cmpCrA)),
+ vandq_u8(match3EqYB, vandq_u8(match1LfB, cmpCrB))
+ );
+ if(LIKELIHOOD(0.001, neon_vect_is_nonzero(matchEnd))) {
+ len += i;
+ break;
+ }
+ }
+ #undef NEXT_DATA
+ if(isRaw)
+ #ifdef __aarch64__
+ nextMaskMix = vdupq_n_u8(0);
+ #else
+ lfCompare = vcombine_u8(vget_high_u8(lfCompare), vget_high_u8(lfCompare));
+ #endif
+ } else if(isRaw) // no \r_. found
+ #ifdef __aarch64__
+ nextMaskMix = vdupq_n_u8(0);
+ #else
+ lfCompare = vcombine_u8(vget_high_u8(lfCompare), vget_high_u8(lfCompare));
+ #endif
+ }
+
+ // a spec compliant encoder should never generate sequences: ==, =\n and =\r, but we'll handle them to be spec compliant
+ // the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that
+ // firstly, check for invalid sequences of = (we assume that these are rare, as a spec compliant yEnc encoder should not generate these)
+ if(LIKELIHOOD(0.0001, (mask & ((maskEq << 1) | escFirst)) != 0)) {
+ uint8_t tmp = eqFixLUT[(maskEq&0xff) & ~escFirst];
+ uint32_t maskEq2 = tmp;
+ for(int j=8; j<32; j+=8) {
+ tmp = eqFixLUT[((maskEq>>j)&0xff) & ~(tmp>>7)];
+ maskEq2 |= tmp<<j;
+ }
+ maskEq = maskEq2;
+
+ // next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed
+ maskEq = (maskEq<<1) | escFirst;
+ mask &= ~maskEq;
+ escFirst = tmp>>7;
+
+ // unescape chars following `=`
+ uint8x8_t maskEqTemp = vreinterpret_u8_u32(vmov_n_u32(maskEq));
+ cmpPacked = vbic_u8(cmpPacked, maskEqTemp); // `mask &= ~maskEq` in vector form
+ #ifdef __aarch64__
+ uint8x16_t vMaskEqA = vqtbl1q_u8(
+ vcombine_u8(maskEqTemp, vdup_n_u8(0)),
+ vmakeq_u8(0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1)
+ );
+ uint8x16_t vMaskEqB = vqtbl1q_u8(
+ vcombine_u8(maskEqTemp, vdup_n_u8(0)),
+ vmakeq_u8(2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3)
+ );
+ #else
+ uint8x16_t vMaskEqA = vcombine_u8(
+ vdup_lane_u8(maskEqTemp, 0),
+ vdup_lane_u8(maskEqTemp, 1)
+ );
+ uint8x16_t vMaskEqB = vcombine_u8(
+ vdup_lane_u8(maskEqTemp, 2),
+ vdup_lane_u8(maskEqTemp, 3)
+ );
+ #endif
+ vMaskEqA = vtstq_u8(vMaskEqA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+ vMaskEqB = vtstq_u8(vMaskEqB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+
+ dataA = vsubq_u8(
+ dataA,
+ vbslq_u8(vMaskEqA, vdupq_n_u8(64+42), vdupq_n_u8(42))
+ );
+ dataB = vsubq_u8(
+ dataB,
+ vbslq_u8(vMaskEqB, vdupq_n_u8(64+42), vdupq_n_u8(42))
+ );
+ } else {
+ // no invalid = sequences found - we can cut out some things from above
+ // this code path is a shortened version of above; it's here because it's faster, and what we'll be dealing with most of the time
+ escFirst = (maskEq >> 31);
+
+ dataA = vsubq_u8(
+ dataA,
+ vbslq_u8(
+ vextq_u8(vdupq_n_u8(42), cmpEqA, 15),
+ vdupq_n_u8(64+42),
+ yencOffset
+ )
+ );
+ dataB = vsubq_u8(
+ dataB,
+ vbslq_u8(
+ vextq_u8(cmpEqA, cmpEqB, 15),
+ vdupq_n_u8(64+42),
+ vdupq_n_u8(42)
+ )
+ );
+ }
+ yencOffset = vsetq_lane_u8((escFirst << 6) | 42, yencOffset, 0);
+
+ // all that's left is to 'compress' the data (skip over masked chars)
+ uint32_t counts = 0x08080808 - vget_lane_u32(vreinterpret_u32_u8(vcnt_u8(cmpPacked)), 0);
+ #ifdef __aarch64__
+ counts += counts >> 8;
+ vst1q_u8(p, vqtbl1q_u8(
+ dataA,
+ vld1q_u8_align((uint8_t*)(compactLUT + (mask&0x7fff)), 16)
+ ));
+ p += counts & 0xff;
+ mask >>= 16;
+ vst1q_u8(p, vqtbl1q_u8(
+ dataB,
+ vld1q_u8_align((uint8_t*)(compactLUT + (mask&0x7fff)), 16)
+ ));
+ p += (counts>>16) & 0xff;
+ #else
+ // lookup compress masks and shuffle
+ vst1_u8(p, vtbl1_u8(
+ vget_low_u8(dataA),
+ vld1_u8_align((uint8_t*)(compactLUT + (mask&0xff)), 8)
+ ));
+ p += counts & 0xff;
+ mask >>= 8;
+ vst1_u8(p, vtbl1_u8(
+ vget_high_u8(dataA),
+ vld1_u8_align((uint8_t*)(compactLUT + (mask&0xff)), 8)
+ ));
+ p += (counts>>8) & 0xff;
+ mask >>= 8;
+ vst1_u8(p, vtbl1_u8(
+ vget_low_u8(dataB),
+ vld1_u8_align((uint8_t*)(compactLUT + (mask&0xff)), 8)
+ ));
+ p += (counts>>16) & 0xff;
+ mask >>= 8;
+ vst1_u8(p, vtbl1_u8(
+ vget_high_u8(dataB),
+ vld1_u8_align((uint8_t*)(compactLUT + (mask&0xff)), 8)
+ ));
+ p += (counts>>24) & 0xff;
+
+ #endif
+
+ } else {
+ dataA = vsubq_u8(dataA, yencOffset);
+ dataB = vsubq_u8(dataB, vdupq_n_u8(42));
+ vst1q_u8_x2_unaligned(p, vcreate2_u8(dataA, dataB));
+ p += sizeof(uint8x16_t)*2;
+ escFirst = 0;
+ #ifdef __aarch64__
+ yencOffset = vdupq_n_u8(42);
+ #else
+ yencOffset = vcombine_u8(vdup_n_u8(42), vget_high_u8(yencOffset));
+ #endif
+ }
+ }
+
+ if(isRaw) {
+ if(len != 0) { // must have gone through at least one loop cycle
+ if(src[i-2] == '\r' && src[i-1] == '\n' && src[i] == '.')
+ nextMask = 1;
+ else if(src[i-1] == '\r' && src[i] == '\n' && src[i+1] == '.')
+ nextMask = 2;
+ else
+ nextMask = 0;
+ }
+ } else
+ nextMask = 0;
+ }
+
+ void decoder_set_neon_funcs() {
+ decoder_init_lut(eqFixLUT, compactLUT);
+ _do_decode = &do_decode_simd<false, false, sizeof(uint8x16_t)*2, do_decode_neon<false, false> >;
+ _do_decode_raw = &do_decode_simd<true, false, sizeof(uint8x16_t)*2, do_decode_neon<true, false> >;
+ _do_decode_end_raw = &do_decode_simd<true, true, sizeof(uint8x16_t)*2, do_decode_neon<true, true> >;
+ }
+ #else
+ void decoder_set_neon_funcs() {}
+ #endif
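
For orientation, here is a minimal scalar sketch (not part of the package; decode_scalar_sketch is a hypothetical name) of the transformation the NEON routine above vectorises: ordinary bytes are shifted back by 42, a '=' marks the following byte as shifted by a further 64, CR/LF carry no data, and in raw mode a '.' immediately following CRLF is dropped per RFC 3977 dot-stuffing.

// Scalar reference for illustration only -- not from the yencode package.
#include <cstddef>
#include <cstdint>

static size_t decode_scalar_sketch(const uint8_t* src, size_t len, uint8_t* dst) {
    uint8_t* out = dst;
    bool escaped = false;
    for (size_t i = 0; i < len; i++) {
        uint8_t c = src[i];
        if (escaped) {
            // any byte after '=' is unescaped: subtract the extra 64, then the usual 42
            *out++ = (uint8_t)(c - 64 - 42);
            escaped = false;
        } else if (c == '=') {
            escaped = true;                 // escape marker itself produces no output
        } else if (c == '\r' || c == '\n') {
            // line breaks carry no data
        } else if (c == '.' && i >= 2 && src[i-1] == '\n' && src[i-2] == '\r') {
            // dot-stuffed '.' at the start of a line is dropped (raw mode)
        } else {
            *out++ = (uint8_t)(c - 42);     // ordinary byte: undo the +42 offset
        }
    }
    return (size_t)(out - dst);             // number of decoded bytes written
}

The vector code reaches the same result by building a per-byte "discard" bitmask (CR, LF, '=', stuffed dots), repairing invalid '=' runs via eqFixLUT, and then left-packing each chunk with a table-lookup shuffle (vqtbl1q_u8 / vtbl1_u8) indexed into the precomputed compactLUT; when searchEnd is set it additionally watches for the \r\n=y and \r\n.=y terminator patterns.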