yencode 1.0.8 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/README.md +339 -231
  2. package/binding.gyp +292 -39
  3. package/crcutil-1.0/code/multiword_64_64_gcc_amd64_asm.cc +7 -7
  4. package/crcutil-1.0/code/multiword_64_64_gcc_i386_mmx.cc +14 -14
  5. package/crcutil-1.0/code/multiword_64_64_intrinsic_i386_mmx.cc +1 -1
  6. package/crcutil-1.0/code/uint128_sse2.h +2 -0
  7. package/index.js +329 -22
  8. package/package.json +2 -2
  9. package/src/common.h +299 -0
  10. package/src/crc.cc +95 -0
  11. package/src/crc.h +23 -0
  12. package/src/crc_arm.cc +175 -0
  13. package/src/crc_common.h +4 -0
  14. package/{crc_folding.c → src/crc_folding.cc} +175 -185
  15. package/src/decoder.cc +61 -0
  16. package/src/decoder.h +53 -0
  17. package/src/decoder_avx.cc +18 -0
  18. package/src/decoder_avx2.cc +18 -0
  19. package/src/decoder_avx2_base.h +615 -0
  20. package/src/decoder_common.h +512 -0
  21. package/src/decoder_neon.cc +474 -0
  22. package/src/decoder_neon64.cc +451 -0
  23. package/src/decoder_sse2.cc +16 -0
  24. package/src/decoder_sse_base.h +711 -0
  25. package/src/decoder_ssse3.cc +18 -0
  26. package/src/encoder.cc +170 -0
  27. package/src/encoder.h +21 -0
  28. package/src/encoder_avx.cc +16 -0
  29. package/src/encoder_avx2.cc +16 -0
  30. package/src/encoder_avx_base.h +564 -0
  31. package/src/encoder_common.h +109 -0
  32. package/src/encoder_neon.cc +547 -0
  33. package/src/encoder_sse2.cc +13 -0
  34. package/src/encoder_sse_base.h +724 -0
  35. package/src/encoder_ssse3.cc +18 -0
  36. package/src/hedley.h +1899 -0
  37. package/src/platform.cc +147 -0
  38. package/src/yencode.cc +449 -0
  39. package/test/_maxsize.js +9 -0
  40. package/test/_speedbase.js +147 -0
  41. package/test/speedcrc.js +20 -0
  42. package/test/speeddec.js +92 -0
  43. package/test/speedenc.js +44 -0
  44. package/{testcrc.js → test/testcrc.js} +53 -39
  45. package/test/testdec.js +183 -0
  46. package/test/testenc.js +163 -0
  47. package/test/testpostdec.js +126 -0
  48. package/test.js +0 -91
  49. package/yencode.cc +0 -1622
package/src/decoder_neon64.cc
@@ -0,0 +1,451 @@
+ #include "common.h"
+ #if defined(__ARM_NEON) && defined(__aarch64__)
+
+ #include "decoder_common.h"
+
+ #pragma pack(16)
+ static struct { char bytes[16]; } ALIGN_TO(16, compactLUT[32768]);
+ #pragma pack()
+
+ static uint8_t eqFixLUT[256];
+
+
+
+ #if !defined(__clang__) && !defined(_MSC_VER) && (!defined(__aarch64__) || !HEDLEY_GCC_VERSION_CHECK(10,0,0))
+ static HEDLEY_ALWAYS_INLINE uint8x16x4_t vld1q_u8_x4(const uint8_t* p) {
+     uint8x16x4_t ret;
+     ret.val[0] = vld1q_u8(p);
+     ret.val[1] = vld1q_u8(p+16);
+     ret.val[2] = vld1q_u8(p+32);
+     ret.val[3] = vld1q_u8(p+48);
+     return ret;
+ }
+ static HEDLEY_ALWAYS_INLINE void vst1q_u8_x4(uint8_t* p, uint8x16x4_t data) {
+     vst1q_u8(p, data.val[0]);
+     vst1q_u8(p+16, data.val[1]);
+     vst1q_u8(p+32, data.val[2]);
+     vst1q_u8(p+48, data.val[3]);
+ }
+ #endif
+
+
+ static bool neon_vect_is_nonzero(uint8x16_t v) {
+     return !!(vget_lane_u64(vreinterpret_u64_u32(vqmovn_u64(vreinterpretq_u64_u8(v))), 0));
+ }
+
+ static HEDLEY_ALWAYS_INLINE uint8x16_t mergeCompares(uint8x16_t a, uint8x16_t b, uint8x16_t c, uint8x16_t d) {
+     // constant vectors arbitrarily chosen from ones that can be reused; exact ordering of bits doesn't matter, we just need to mix them in
+     return vbslq_u8(
+         vdupq_n_u8('='),
+         vbslq_u8(vdupq_n_u8('y'), a, b),
+         vbslq_u8(vdupq_n_u8('y'), c, d)
+     );
+ }
+
+
+ template<bool isRaw, bool searchEnd>
+ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, long& len, unsigned char* HEDLEY_RESTRICT & p, unsigned char& escFirst, uint16_t& nextMask) {
+     HEDLEY_ASSUME(escFirst == 0 || escFirst == 1);
+     HEDLEY_ASSUME(nextMask == 0 || nextMask == 1 || nextMask == 2);
+     uint8x16_t nextMaskMix = vdupq_n_u8(0);
+     if(nextMask == 1)
+         nextMaskMix = vsetq_lane_u8(1, nextMaskMix, 0);
+     if(nextMask == 2)
+         nextMaskMix = vsetq_lane_u8(2, nextMaskMix, 1);
+     uint8x16_t yencOffset = escFirst ? vmakeq_u8(42+64,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42) : vdupq_n_u8(42);
+     long i;
+     for(i = -len; i; i += sizeof(uint8x16_t)*4) {
+         uint8x16x4_t data = vld1q_u8_x4(src+i);
+         uint8x16_t dataA = data.val[0];
+         uint8x16_t dataB = data.val[1];
+         uint8x16_t dataC = data.val[2];
+         uint8x16_t dataD = data.val[3];
+
+         // search for special chars
+         uint8x16_t cmpEqA = vceqq_u8(dataA, vdupq_n_u8('=')),
+             cmpEqB = vceqq_u8(dataB, vdupq_n_u8('=')),
+             cmpEqC = vceqq_u8(dataC, vdupq_n_u8('=')),
+             cmpEqD = vceqq_u8(dataD, vdupq_n_u8('=')),
+             cmpA = vqtbx1q_u8(
+                 cmpEqA,
+                 // \n \r
+                 vmakeq_u8(0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
+                 dataA
+             ),
+             cmpB = vqtbx1q_u8(
+                 cmpEqB,
+                 vmakeq_u8(0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
+                 dataB
+             ),
+             cmpC = vqtbx1q_u8(
+                 cmpEqC,
+                 vmakeq_u8(0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
+                 dataC
+             ),
+             cmpD = vqtbx1q_u8(
+                 cmpEqD,
+                 vmakeq_u8(0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
+                 dataD
+             );
+         if(isRaw) cmpA = vorrq_u8(cmpA, nextMaskMix);
+
+         if (LIKELIHOOD(0.42 /*guess*/, neon_vect_is_nonzero(vorrq_u8(
+             vorrq_u8(cmpA, cmpB),
+             vorrq_u8(cmpC, cmpD)
+         )))) {
+             uint8x16_t cmpMerge = vpaddq_u8(
+                 vpaddq_u8(
+                     vandq_u8(cmpA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
+                     vandq_u8(cmpB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128))
+                 ),
+                 vpaddq_u8(
+                     vandq_u8(cmpC, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
+                     vandq_u8(cmpD, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128))
+                 )
+             );
+             uint8x16_t cmpEqMerge = vpaddq_u8(
+                 vpaddq_u8(
+                     vandq_u8(cmpEqA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
+                     vandq_u8(cmpEqB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128))
+                 ),
+                 vpaddq_u8(
+                     vandq_u8(cmpEqC, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
+                     vandq_u8(cmpEqD, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128))
+                 )
+             );
+
+             uint8x16_t cmpCombined = vpaddq_u8(cmpMerge, cmpEqMerge);
+             uint64_t mask = vgetq_lane_u64(vreinterpretq_u64_u8(cmpCombined), 0);
+             uint64_t maskEq = vgetq_lane_u64(vreinterpretq_u64_u8(cmpCombined), 1);
+
+             // handle \r\n. sequences
+             // RFC3977 requires the first dot on a line to be stripped, due to dot-stuffing
+             if((isRaw || searchEnd) && LIKELIHOOD(0.15, mask != maskEq)) {
+                 // vext seems to be a cheap operation on ARM, relative to loads, so only avoid it if there's only one load (isRaw only)
+                 uint8x16_t tmpData2, nextData;
+                 if(isRaw && !searchEnd) {
+                     tmpData2 = vld1q_u8(src+i + 2 + sizeof(uint8x16_t)*3);
+                 } else {
+                     nextData = vld1q_u8(src+i + sizeof(uint8x16_t)*4); // only 32 bits needed, but there doesn't appear to be a nice way to do this via intrinsics: https://stackoverflow.com/questions/46910799/arm-neon-intrinsics-convert-d-64-bit-register-to-low-half-of-q-128-bit-regis
+                     tmpData2 = vextq_u8(dataD, nextData, 2);
+                 }
+                 uint8x16_t cmpCrA = vceqq_u8(dataA, vdupq_n_u8('\r'));
+                 uint8x16_t cmpCrB = vceqq_u8(dataB, vdupq_n_u8('\r'));
+                 uint8x16_t cmpCrC = vceqq_u8(dataC, vdupq_n_u8('\r'));
+                 uint8x16_t cmpCrD = vceqq_u8(dataD, vdupq_n_u8('\r'));
+                 uint8x16_t match2EqA, match2Cr_DotA;
+                 uint8x16_t match2EqB, match2Cr_DotB;
+                 uint8x16_t match2EqC, match2Cr_DotC;
+                 uint8x16_t match2EqD, match2Cr_DotD;
+                 if(searchEnd) {
+                     match2EqD = vceqq_u8(tmpData2, vdupq_n_u8('='));
+                 }
+                 if(isRaw) {
+                     match2Cr_DotA = vandq_u8(cmpCrA, vceqq_u8(vextq_u8(dataA, dataB, 2), vdupq_n_u8('.')));
+                     match2Cr_DotB = vandq_u8(cmpCrB, vceqq_u8(vextq_u8(dataB, dataC, 2), vdupq_n_u8('.')));
+                     match2Cr_DotC = vandq_u8(cmpCrC, vceqq_u8(vextq_u8(dataC, dataD, 2), vdupq_n_u8('.')));
+                     match2Cr_DotD = vandq_u8(cmpCrD, vceqq_u8(tmpData2, vdupq_n_u8('.')));
+                 }
+
+                 // find patterns of \r_.
+                 if(isRaw && LIKELIHOOD(0.001, neon_vect_is_nonzero(vorrq_u8(
+                     vorrq_u8(match2Cr_DotA, match2Cr_DotB),
+                     vorrq_u8(match2Cr_DotC, match2Cr_DotD)
+                 )))) {
+                     uint8x16_t match1LfA = vceqq_u8(vextq_u8(dataA, dataB, 1), vdupq_n_u8('\n'));
+                     uint8x16_t match1LfB = vceqq_u8(vextq_u8(dataB, dataC, 1), vdupq_n_u8('\n'));
+                     uint8x16_t match1LfC = vceqq_u8(vextq_u8(dataC, dataD, 1), vdupq_n_u8('\n'));
+                     uint8x16_t match1LfD;
+                     if(searchEnd)
+                         match1LfD = vceqq_u8(vextq_u8(dataD, nextData, 1), vdupq_n_u8('\n'));
+                     else
+                         match1LfD = vceqq_u8(vld1q_u8(src+i + 1+sizeof(uint8x16_t)*3), vdupq_n_u8('\n'));
+                     // merge matches of \r_. with those for \n
+                     uint8x16_t match2NlDotA = vandq_u8(match2Cr_DotA, match1LfA);
+                     uint8x16_t match2NlDotB = vandq_u8(match2Cr_DotB, match1LfB);
+                     uint8x16_t match2NlDotC = vandq_u8(match2Cr_DotC, match1LfC);
+                     uint8x16_t match2NlDotD = vandq_u8(match2Cr_DotD, match1LfD);
+                     if(searchEnd) {
+                         uint8x16_t match1NlA = vandq_u8(match1LfA, cmpCrA);
+                         uint8x16_t match1NlB = vandq_u8(match1LfB, cmpCrB);
+                         uint8x16_t match1NlC = vandq_u8(match1LfC, cmpCrC);
+                         uint8x16_t match1NlD = vandq_u8(match1LfD, cmpCrD);
+
+                         uint8x16_t tmpData3 = vextq_u8(dataD, nextData, 3);
+                         uint8x16_t tmpData4 = vextq_u8(dataD, nextData, 4);
+                         // match instances of \r\n.\r\n and \r\n.=y
+                         uint8x16_t match3CrD = vceqq_u8(tmpData3, vdupq_n_u8('\r'));
+                         uint8x16_t match4LfD = vceqq_u8(tmpData4, vdupq_n_u8('\n'));
+                         uint8x16_t match4Nl = mergeCompares(
+                             vextq_u8(match1NlA, match1NlB, 3),
+                             vextq_u8(match1NlB, match1NlC, 3),
+                             vextq_u8(match1NlC, match1NlD, 3),
+                             vandq_u8(match3CrD, match4LfD)
+                         );
+                         uint8x16_t match4EqY = mergeCompares(
+                             // match with =y
+                             vreinterpretq_u8_u16(vceqq_u16(vreinterpretq_u16_u8(vextq_u8(dataA, dataB, 4)), vdupq_n_u16(0x793d))),
+                             vreinterpretq_u8_u16(vceqq_u16(vreinterpretq_u16_u8(vextq_u8(dataB, dataC, 4)), vdupq_n_u16(0x793d))),
+                             vreinterpretq_u8_u16(vceqq_u16(vreinterpretq_u16_u8(vextq_u8(dataC, dataD, 4)), vdupq_n_u16(0x793d))),
+                             vreinterpretq_u8_u16(vceqq_u16(vreinterpretq_u16_u8(tmpData4), vdupq_n_u16(0x793d)))
+                         );
+                         match2EqA = vextq_u8(cmpEqA, cmpEqB, 2);
+                         match2EqB = vextq_u8(cmpEqB, cmpEqC, 2);
+                         match2EqC = vextq_u8(cmpEqC, cmpEqD, 2);
+                         uint8x16_t match3EqY = mergeCompares(
+                             vandq_u8(
+                                 vceqq_u8(vextq_u8(dataA, dataB, 3), vdupq_n_u8('y')),
+                                 match2EqA
+                             ), vandq_u8(
+                                 vceqq_u8(vextq_u8(dataB, dataC, 3), vdupq_n_u8('y')),
+                                 match2EqB
+                             ), vandq_u8(
+                                 vceqq_u8(vextq_u8(dataC, dataD, 3), vdupq_n_u8('y')),
+                                 match2EqC
+                             ), vandq_u8(
+                                 vceqq_u8(tmpData3, vdupq_n_u8('y')),
+                                 match2EqD
+                             )
+                         );
+
+                         // merge \r\n and =y matches for tmpData4
+                         uint8x16_t match4End = vorrq_u8(
+                             match4Nl,
+                             vreinterpretq_u8_u16(vsriq_n_u16(vreinterpretq_u16_u8(match4EqY), vreinterpretq_u16_u8(match3EqY), 8))
+                         );
+                         // merge with \r\n.
+                         uint8x16_t match2NlDot = mergeCompares(match2NlDotA, match2NlDotB, match2NlDotC, match2NlDotD);
+                         match4End = vandq_u8(match4End, match2NlDot);
+                         // match \r\n=y
+                         uint8x16_t match1Nl = mergeCompares(match1NlA, match1NlB, match1NlC, match1NlD);
+                         uint8x16_t match3End = vandq_u8(match3EqY, match1Nl);
+                         // combine match sequences
+                         if(LIKELIHOOD(0.001, neon_vect_is_nonzero(vorrq_u8(match4End, match3End)))) {
+                             // terminator found
+                             // there's probably faster ways to do this, but reverting to scalar code should be good enough
+                             len += i;
+                             break;
+                         }
+                     }
+                     uint8x16_t match2NlDotDMasked = vandq_u8(match2NlDotD, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+                     uint8x16_t mergeKillDots = vpaddq_u8(
+                         vpaddq_u8(
+                             vandq_u8(match2NlDotA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
+                             vandq_u8(match2NlDotB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128))
+                         ),
+                         vpaddq_u8(
+                             vandq_u8(match2NlDotC, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
+                             match2NlDotDMasked
+                         )
+                     );
+                     mergeKillDots = vpaddq_u8(mergeKillDots, mergeKillDots);
+                     uint64x2_t mergeKillDotsShifted = vshlq_n_u64(vreinterpretq_u64_u8(mergeKillDots), 2);
+                     mask |= vgetq_lane_u64(mergeKillDotsShifted, 0);
+                     cmpCombined = vorrq_u8(cmpCombined, vreinterpretq_u8_u64(mergeKillDotsShifted));
+                     nextMaskMix = vextq_u8(match2NlDotD, vdupq_n_u8(0), 14);
+                 } else if(searchEnd) {
+                     match2EqA = vextq_u8(cmpEqA, cmpEqB, 2);
+                     match2EqB = vextq_u8(cmpEqB, cmpEqC, 2);
+                     match2EqC = vextq_u8(cmpEqC, cmpEqD, 2);
+
+                     uint8x16_t match3EqYA = vandq_u8(match2EqA, vceqq_u8(vextq_u8(dataA, dataB, 3), vdupq_n_u8('y')));
+                     uint8x16_t match3EqYB = vandq_u8(match2EqB, vceqq_u8(vextq_u8(dataB, dataC, 3), vdupq_n_u8('y')));
+                     uint8x16_t match3EqYC = vandq_u8(match2EqC, vceqq_u8(vextq_u8(dataC, dataD, 3), vdupq_n_u8('y')));
+                     uint8x16_t match3EqYD = vandq_u8(match2EqD, vceqq_u8(vextq_u8(dataD, nextData, 3), vdupq_n_u8('y')));
+                     if(LIKELIHOOD(0.001, neon_vect_is_nonzero(vorrq_u8(
+                         vorrq_u8(match3EqYA, match3EqYB),
+                         vorrq_u8(match3EqYC, match3EqYD)
+                     )))) {
+                         uint8x16_t match1LfA = vceqq_u8(vextq_u8(dataA, dataB, 1), vdupq_n_u8('\n'));
+                         uint8x16_t match1LfB = vceqq_u8(vextq_u8(dataB, dataC, 1), vdupq_n_u8('\n'));
+                         uint8x16_t match1LfC = vceqq_u8(vextq_u8(dataC, dataD, 1), vdupq_n_u8('\n'));
+                         uint8x16_t match1LfD = vceqq_u8(vextq_u8(dataD, nextData, 1), vdupq_n_u8('\n'));
+                         uint8x16_t matchEnd = vorrq_u8(
+                             vorrq_u8(
+                                 vandq_u8(match3EqYA, vandq_u8(match1LfA, cmpCrA)),
+                                 vandq_u8(match3EqYB, vandq_u8(match1LfB, cmpCrB))
+                             ),
+                             vorrq_u8(
+                                 vandq_u8(match3EqYC, vandq_u8(match1LfC, cmpCrC)),
+                                 vandq_u8(match3EqYD, vandq_u8(match1LfD, cmpCrD))
+                             )
+                         );
+                         if(LIKELIHOOD(0.001, neon_vect_is_nonzero(matchEnd))) {
+                             len += i;
+                             break;
+                         }
+                     }
+                     if(isRaw)
+                         nextMaskMix = vdupq_n_u8(0);
+                 } else if(isRaw) // no \r_. found
+                     nextMaskMix = vdupq_n_u8(0);
+             }
+
+             // a spec compliant encoder should never generate sequences: ==, =\n and =\r, but we'll handle them to be spec compliant
+             // the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that
+             // firstly, check for invalid sequences of = (we assume that these are rare, as a spec compliant yEnc encoder should not generate these)
+             if(LIKELIHOOD(0.0001, (mask & ((maskEq << 1) | escFirst)) != 0)) {
+                 uint8_t tmp = eqFixLUT[(maskEq&0xff) & ~escFirst];
+                 uint64_t maskEq2 = tmp;
+                 for(int j=8; j<64; j+=8) {
+                     tmp = eqFixLUT[((maskEq>>j)&0xff) & ~(tmp>>7)];
+                     maskEq2 |= ((uint64_t)tmp)<<j;
+                 }
+                 maskEq = maskEq2;
+
+                 // next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed
+                 maskEq = (maskEq<<1) | escFirst;
+                 mask &= ~maskEq;
+                 escFirst = tmp>>7;
+
+                 // unescape chars following `=`
+ #if defined(__GNUC__) && !defined(__clang__)
+                 // this seems to stop GCC9 producing slow code, for some reason... TODO: investigate why
+                 uint8x8_t _maskEqTemp = vreinterpret_u8_u64(vmov_n_u64(maskEq));
+                 uint8x16_t maskEqTemp = vcombine_u8(_maskEqTemp, vdup_n_u8(0));
+ #else
+                 uint8x16_t maskEqTemp = vreinterpretq_u8_u64(vmovq_n_u64(maskEq));
+ #endif
+                 cmpCombined = vbicq_u8(cmpCombined, maskEqTemp); // `mask &= ~maskEq` in vector form
+
+                 uint8x16_t vMaskEqA = vqtbl1q_u8(
+                     maskEqTemp,
+                     vmakeq_u8(0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1)
+                 );
+                 maskEqTemp = vextq_u8(maskEqTemp, maskEqTemp, 2);
+                 uint8x16_t vMaskEqB = vqtbl1q_u8(
+                     maskEqTemp,
+                     vmakeq_u8(0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1)
+                 );
+                 maskEqTemp = vextq_u8(maskEqTemp, maskEqTemp, 2);
+                 uint8x16_t vMaskEqC = vqtbl1q_u8(
+                     maskEqTemp,
+                     vmakeq_u8(0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1)
+                 );
+                 maskEqTemp = vextq_u8(maskEqTemp, maskEqTemp, 2);
+                 uint8x16_t vMaskEqD = vqtbl1q_u8(
+                     maskEqTemp,
+                     vmakeq_u8(0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1)
+                 );
+                 vMaskEqA = vtstq_u8(vMaskEqA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+                 vMaskEqB = vtstq_u8(vMaskEqB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+                 vMaskEqC = vtstq_u8(vMaskEqC, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+                 vMaskEqD = vtstq_u8(vMaskEqD, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+
+                 dataA = vsubq_u8(
+                     dataA,
+                     vbslq_u8(vMaskEqA, vdupq_n_u8(64+42), vdupq_n_u8(42))
+                 );
+                 dataB = vsubq_u8(
+                     dataB,
+                     vbslq_u8(vMaskEqB, vdupq_n_u8(64+42), vdupq_n_u8(42))
+                 );
+                 dataC = vsubq_u8(
+                     dataC,
+                     vbslq_u8(vMaskEqC, vdupq_n_u8(64+42), vdupq_n_u8(42))
+                 );
+                 dataD = vsubq_u8(
+                     dataD,
+                     vbslq_u8(vMaskEqD, vdupq_n_u8(64+42), vdupq_n_u8(42))
+                 );
+             } else {
+                 // no invalid = sequences found - we can cut out some things from above
+                 // this code path is a shortened version of above; it's here because it's faster, and what we'll be dealing with most of the time
+                 escFirst = (maskEq >> 63);
+
+                 dataA = vsubq_u8(
+                     dataA,
+                     vbslq_u8(
+                         vextq_u8(vdupq_n_u8(42), cmpEqA, 15),
+                         vdupq_n_u8(64+42),
+                         yencOffset
+                     )
+                 );
+                 dataB = vsubq_u8(
+                     dataB,
+                     vbslq_u8(
+                         vextq_u8(cmpEqA, cmpEqB, 15),
+                         vdupq_n_u8(64+42),
+                         vdupq_n_u8(42)
+                     )
+                 );
+                 dataC = vsubq_u8(
+                     dataC,
+                     vbslq_u8(
+                         vextq_u8(cmpEqB, cmpEqC, 15),
+                         vdupq_n_u8(64+42),
+                         vdupq_n_u8(42)
+                     )
+                 );
+                 dataD = vsubq_u8(
+                     dataD,
+                     vbslq_u8(
+                         vextq_u8(cmpEqC, cmpEqD, 15),
+                         vdupq_n_u8(64+42),
+                         vdupq_n_u8(42)
+                     )
+                 );
+             }
+             yencOffset = vsetq_lane_u8((escFirst << 6) | 42, yencOffset, 0);
+
+             // all that's left is to 'compress' the data (skip over masked chars)
+             uint64_t counts = vget_lane_u64(vreinterpret_u64_u8(vcnt_u8(vget_low_u8(cmpCombined))), 0);
+             counts = 0x0808080808080808ULL - counts;
+             counts += counts>>8;
+
+             vst1q_u8(p, vqtbl1q_u8(
+                 dataA,
+                 vld1q_u8((uint8_t*)(compactLUT + (mask&0x7fff)))
+             ));
+             p += counts & 0xff;
+             mask >>= 16;
+             vst1q_u8(p, vqtbl1q_u8(
+                 dataB,
+                 vld1q_u8((uint8_t*)(compactLUT + (mask&0x7fff)))
+             ));
+             p += (counts>>16) & 0xff;
+             mask >>= 16;
+             vst1q_u8(p, vqtbl1q_u8(
+                 dataC,
+                 vld1q_u8((uint8_t*)(compactLUT + (mask&0x7fff)))
+             ));
+             p += (counts>>32) & 0xff;
+             mask >>= 16;
+             vst1q_u8(p, vqtbl1q_u8(
+                 dataD,
+                 vld1q_u8((uint8_t*)(compactLUT + (mask&0x7fff)))
+             ));
+             p += (counts>>48) & 0xff;
+         } else {
+             dataA = vsubq_u8(dataA, yencOffset);
+             dataB = vsubq_u8(dataB, vdupq_n_u8(42));
+             dataC = vsubq_u8(dataC, vdupq_n_u8(42));
+             dataD = vsubq_u8(dataD, vdupq_n_u8(42));
+             vst1q_u8_x4(p, vcreate4_u8(dataA, dataB, dataC, dataD));
+             p += sizeof(uint8x16_t)*4;
+             escFirst = 0;
+             yencOffset = vdupq_n_u8(42);
+         }
+     }
+     if(isRaw) {
+         if(len != 0) { // must have gone through at least one loop cycle
+             if(src[i-2] == '\r' && src[i-1] == '\n' && src[i] == '.')
+                 nextMask = 1;
+             else if(src[i-1] == '\r' && src[i] == '\n' && src[i+1] == '.')
+                 nextMask = 2;
+             else
+                 nextMask = 0;
+         }
+     } else
+         nextMask = 0;
+ }
+
+ void decoder_set_neon_funcs() {
+     decoder_init_lut(eqFixLUT, compactLUT);
+     _do_decode = &do_decode_simd<false, false, sizeof(uint8x16_t)*4, do_decode_neon<false, false> >;
+     _do_decode_raw = &do_decode_simd<true, false, sizeof(uint8x16_t)*4, do_decode_neon<true, false> >;
+     _do_decode_end_raw = &do_decode_simd<true, true, sizeof(uint8x16_t)*4, do_decode_neon<true, true> >;
+ }
+ #else
+ void decoder_set_neon_funcs() {}
+ #endif
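
Note: do_decode_neon above is a vectorised form of the basic yEnc decode transform: every byte has 42 subtracted, '=' acts as an escape that means the following byte had 64 added by the encoder, CR/LF are not data, and in raw mode a '.' immediately after CR LF is removed per the RFC 3977 dot-stuffing rule. As a point of reference only, a minimal scalar sketch of the per-byte logic the NEON code parallelises is shown below (this is not code from the package, and yenc_decode_scalar is a made-up name; dot-stuffing and "=y" end-of-article detection are omitted):

    #include <cstddef>
    #include <cstdint>

    // Illustrative scalar yEnc decode: subtract 42; a byte following '=' gets an
    // extra 64 subtracted; CR/LF are skipped. uint8_t arithmetic wraps mod 256.
    static size_t yenc_decode_scalar(const uint8_t* src, size_t len, uint8_t* dst) {
        size_t out = 0;
        bool esc = false;                      // was the previous byte an '='?
        for(size_t j = 0; j < len; j++) {
            uint8_t c = src[j];
            if(esc) {
                dst[out++] = (uint8_t)(c - 64 - 42);
                esc = false;
            } else if(c == '=') {
                esc = true;                    // escape char itself produces no output
            } else if(c != '\r' && c != '\n') {
                dst[out++] = (uint8_t)(c - 42);
            }
        }
        return out;
    }

The extra masking logic in the SIMD version exists precisely for the parts this sketch leaves out: stripping dot-stuffed '.' characters after \r\n when isRaw is set, and spotting the \r\n=y terminator when searchEnd is set.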
package/src/decoder_sse2.cc
@@ -0,0 +1,16 @@
+ #include "common.h"
+
+ #ifdef __SSE2__
+ #include "decoder_common.h"
+ #include "decoder_sse_base.h"
+
+ void decoder_set_sse2_funcs() {
+     decoder_sse_init();
+     decoder_init_lut(lookups->eqFix, lookups->compact);
+     _do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_SSE2> >;
+     _do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_SSE2> >;
+     _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_SSE2> >;
+ }
+ #else
+ void decoder_set_sse2_funcs() {}
+ #endif
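
Note: both files follow the same dispatch pattern as the other new decoder variants (SSSE3, AVX, AVX2): a decoder_set_*_funcs() setter fills the _do_decode, _do_decode_raw and _do_decode_end_raw function pointers with the matching do_decode_simd instantiation, or does nothing when that ISA isn't available to the build. How the best variant is chosen is not shown in these hunks; presumably package/src/platform.cc calls the appropriate setter once at startup after CPU detection, roughly along these lines (illustrative only; decoder_init is a made-up name, and the real selection covers more ISA levels):

    // Hypothetical init-time selection; the actual logic lives in package/src/platform.cc.
    void decoder_init() {
    #ifdef __aarch64__
        decoder_set_neon_funcs();   // installs the NEON decoder from decoder_neon64.cc
    #else
        decoder_set_sse2_funcs();   // x86 baseline; wider ISA levels would be picked
                                    // here after runtime CPU feature detection
    #endif
    }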