yencode 1.0.8 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/README.md +339 -231
  2. package/binding.gyp +292 -39
  3. package/crcutil-1.0/code/multiword_64_64_gcc_amd64_asm.cc +7 -7
  4. package/crcutil-1.0/code/multiword_64_64_gcc_i386_mmx.cc +14 -14
  5. package/crcutil-1.0/code/multiword_64_64_intrinsic_i386_mmx.cc +1 -1
  6. package/crcutil-1.0/code/uint128_sse2.h +2 -0
  7. package/index.js +329 -22
  8. package/package.json +2 -2
  9. package/src/common.h +299 -0
  10. package/src/crc.cc +95 -0
  11. package/src/crc.h +23 -0
  12. package/src/crc_arm.cc +175 -0
  13. package/src/crc_common.h +4 -0
  14. package/{crc_folding.c → src/crc_folding.cc} +175 -185
  15. package/src/decoder.cc +61 -0
  16. package/src/decoder.h +53 -0
  17. package/src/decoder_avx.cc +18 -0
  18. package/src/decoder_avx2.cc +18 -0
  19. package/src/decoder_avx2_base.h +615 -0
  20. package/src/decoder_common.h +512 -0
  21. package/src/decoder_neon.cc +474 -0
  22. package/src/decoder_neon64.cc +451 -0
  23. package/src/decoder_sse2.cc +16 -0
  24. package/src/decoder_sse_base.h +711 -0
  25. package/src/decoder_ssse3.cc +18 -0
  26. package/src/encoder.cc +170 -0
  27. package/src/encoder.h +21 -0
  28. package/src/encoder_avx.cc +16 -0
  29. package/src/encoder_avx2.cc +16 -0
  30. package/src/encoder_avx_base.h +564 -0
  31. package/src/encoder_common.h +109 -0
  32. package/src/encoder_neon.cc +547 -0
  33. package/src/encoder_sse2.cc +13 -0
  34. package/src/encoder_sse_base.h +724 -0
  35. package/src/encoder_ssse3.cc +18 -0
  36. package/src/hedley.h +1899 -0
  37. package/src/platform.cc +147 -0
  38. package/src/yencode.cc +449 -0
  39. package/test/_maxsize.js +9 -0
  40. package/test/_speedbase.js +147 -0
  41. package/test/speedcrc.js +20 -0
  42. package/test/speeddec.js +92 -0
  43. package/test/speedenc.js +44 -0
  44. package/{testcrc.js → test/testcrc.js} +53 -39
  45. package/test/testdec.js +183 -0
  46. package/test/testenc.js +163 -0
  47. package/test/testpostdec.js +126 -0
  48. package/test.js +0 -91
  49. package/yencode.cc +0 -1622
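
Most of what changed in this release is new hand-written SIMD code under `package/src/`: per-ISA encoder and decoder kernels (SSE2/SSSE3/AVX/AVX2/NEON), CRC32 routines, platform detection, and a reorganized test suite. The diff reproduced below is the new AVX2 decoder base. As a point of reference for reading it, here is a minimal scalar sketch of the yEnc decoding rules that kernel vectorizes; this is not code from the package (the function name is invented), and the real decoders additionally carry `escFirst`/`nextMask` state across blocks and search for the trailing `=y` end marker.

```cpp
#include <stddef.h>
#include <stdint.h>

// Reference-only scalar sketch (not part of the package): applies the same
// per-byte rules as the AVX2 kernel below, minus end-of-data ("=y") detection.
// Returns the number of decoded bytes written to dst.
static size_t decode_scalar_sketch(const uint8_t* src, size_t len, uint8_t* dst) {
    size_t out = 0;
    bool escaped = false;  // corresponds to escFirst carried across blocks
    for(size_t i = 0; i < len; i++) {
        uint8_t c = src[i];
        if(escaped) {
            // the byte after '=' gets an extra -64 on top of the usual -42
            dst[out++] = (uint8_t)(c - 42 - 64);
            escaped = false;
        } else if(c == '=') {
            escaped = true;  // the escape character itself is dropped
        } else if(c == '\r' || c == '\n') {
            // line breaks are not data
        } else if(c == '.' && i >= 2 && src[i-1] == '\n' && src[i-2] == '\r') {
            // NNTP dot-stuffing (RFC 3977): a '.' directly after CRLF is stripped
            // (the "\r\n." handling in the raw-mode SIMD path)
        } else {
            dst[out++] = (uint8_t)(c - 42);
        }
    }
    return out;
}
```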
package/src/decoder_avx2_base.h
@@ -0,0 +1,615 @@
+
+ #ifdef __AVX2__
+
+ // GCC (ver 6-10(dev)) fails to optimize pure C version of mask testing, but has this intrinsic; Clang >= 7 optimizes C version fine
+ #if defined(__GNUC__) && __GNUC__ >= 7
+ # define KORTEST32(a, b) !_kortestz_mask32_u8((a), (b))
+ # define KAND32(a, b) _kand_mask32((a), (b))
+ # define KOR32(a, b) _kor_mask32((a), (b))
+ #else
+ # define KORTEST32(a, b) ((a) | (b))
+ # define KAND32(a, b) ((a) & (b))
+ # define KOR32(a, b) ((a) | (b))
+ #endif
+
+ #pragma pack(16)
+ static struct {
+ /*align16*/ struct { char bytes[16]; } compact[32768];
+ uint8_t eqFix[256];
+ } * HEDLEY_RESTRICT lookups;
+ #pragma pack()
+
+
+ static HEDLEY_ALWAYS_INLINE __m256i force_align_read_256(const void* p) {
+ #ifdef _MSC_VER
+ // MSVC complains about casting away volatile
+ return *(__m256i *)(p);
+ #else
+ return *(volatile __m256i *)(p);
+ #endif
+ }
+
+ // _mm256_castsi128_si256, but upper is defined to be 0
+ #if (defined(__clang__) && __clang_major__ >= 5 && (!defined(__APPLE__) || __clang_major__ >= 7)) || (defined(__GNUC__) && __GNUC__ >= 10)
+ // intrinsic unsupported in GCC 9 and MSVC < 2017
+ # define zext128_256 _mm256_zextsi128_si256
+ #else
+ // technically a cast is incorrect, due to upper 128 bits being undefined, but should usually work fine
+ // alternative may be `_mm256_set_m128i(_mm_setzero_si128(), v)` but unsupported on GCC < 7, and most compilers generate a VINSERTF128 instruction for it
+ # ifdef __OPTIMIZE__
+ # define zext128_256 _mm256_castsi128_si256
+ # else
+ # define zext128_256(x) _mm256_inserti128_si256(_mm256_setzero_si256(), x, 0)
+ # endif
+ #endif
+
+
+ template<bool isRaw, bool searchEnd, enum YEncDecIsaLevel use_isa>
+ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, long& len, unsigned char* HEDLEY_RESTRICT & p, unsigned char& _escFirst, uint16_t& _nextMask) {
+ HEDLEY_ASSUME(_escFirst == 0 || _escFirst == 1);
+ HEDLEY_ASSUME(_nextMask == 0 || _nextMask == 1 || _nextMask == 2);
+ uintptr_t escFirst = _escFirst;
+ __m256i yencOffset = escFirst ? _mm256_set_epi8(
+ -42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,
+ -42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42-64
+ ) : _mm256_set1_epi8(-42);
+ __m256i minMask = _mm256_set1_epi8('.');
+ if(_nextMask && isRaw) {
+ minMask = _mm256_set_epi8(
+ '.','.','.','.','.','.','.','.','.','.','.','.','.','.','.','.',
+ '.','.','.','.','.','.','.','.','.','.','.','.','.','.',_nextMask==2?0:'.',_nextMask==1?0:'.'
+ );
+ }
+ intptr_t i;
+ for(i = -len; i; i += sizeof(__m256i)*2) {
+ __m256i oDataA = _mm256_load_si256((__m256i *)(src+i));
+ __m256i oDataB = _mm256_load_si256((__m256i *)(src+i) + 1);
+
+ // search for special chars
+ __m256i cmpA = _mm256_cmpeq_epi8(oDataA, _mm256_shuffle_epi8(
+ _mm256_set_epi8(
+ -1,'=','\r',-1,-1,'\n',-1,-1,-1,-1,-1,-1,-1,-1,-1,'.',
+ -1,'=','\r',-1,-1,'\n',-1,-1,-1,-1,-1,-1,-1,-1,-1,'.'
+ ),
+ _mm256_min_epu8(oDataA, minMask)
+ ));
+ __m256i cmpB = _mm256_cmpeq_epi8(oDataB, _mm256_shuffle_epi8(
+ _mm256_set_epi8(
+ -1,'=','\r',-1,-1,'\n',-1,-1,-1,-1,-1,-1,-1,-1,-1,'.',
+ -1,'=','\r',-1,-1,'\n',-1,-1,-1,-1,-1,-1,-1,-1,-1,'.'
+ ),
+ _mm256_min_epu8(oDataB, _mm256_set1_epi8('.'))
+ ));
+
+ // TODO: can OR the vectors together to save generating a mask, but may not be worth it
+ uint64_t mask = (uint32_t)_mm256_movemask_epi8(cmpB); // not the most accurate mask if we have invalid sequences; we fix this up later
+ mask = (mask << 32) | (uint32_t)_mm256_movemask_epi8(cmpA);
+ __m256i dataA, dataB;
+ if(use_isa >= ISA_LEVEL_AVX3)
+ dataA = _mm256_add_epi8(oDataA, yencOffset);
+
+ if (mask != 0) {
+ __m256i cmpEqA = _mm256_cmpeq_epi8(oDataA, _mm256_set1_epi8('='));
+ __m256i cmpEqB = _mm256_cmpeq_epi8(oDataB, _mm256_set1_epi8('='));
+ uint64_t maskEq = (uint32_t)_mm256_movemask_epi8(cmpEqB);
+ maskEq = (maskEq << 32) | (uint32_t)_mm256_movemask_epi8(cmpEqA);
+
+ // handle \r\n. sequences
+ // RFC3977 requires the first dot on a line to be stripped, due to dot-stuffing
+ if((isRaw || searchEnd) && LIKELIHOOD(0.45, mask != maskEq)) {
+ #if 0
+ // prefer shuffling data over unaligned loads on Zen (unknown if worth it on Zen2/Excavator)
+ // unfortunately not beneficial, probably due to available register pressure; this is left here because it could be beneficial if we figure out how to use fewer registers
+ __m256i nextDataA, nextDataB;
+ if(searchEnd) {
+ nextDataA = _mm256_inserti128_si256(
+ _mm256_castsi128_si256(_mm256_extracti128_si256(oDataA, 1)),
+ _mm256_castsi256_si128(oDataB),
+ 1
+ );
+ nextDataB = _mm256_inserti128_si256(
+ _mm256_castsi128_si256(_mm256_extracti128_si256(oDataB, 1)),
+ _mm_load_si128((__m128i*)(src+i+sizeof(__m256i)*2)),
+ 1
+ );
+ }
+ # define SHIFT_DATA_A(offs) (searchEnd ? _mm256_alignr_epi8(nextDataA, oDataA, offs) : _mm256_loadu_si256((__m256i *)(src+i+offs)))
+ # define SHIFT_DATA_B(offs) (searchEnd ? _mm256_alignr_epi8(nextDataB, oDataB, offs) : _mm256_loadu_si256((__m256i *)(src+i+offs) + 1))
+ #else
+ # define SHIFT_DATA_A(offs) _mm256_loadu_si256((__m256i *)(src+i+offs))
+ # define SHIFT_DATA_B(offs) _mm256_loadu_si256((__m256i *)(src+i+offs) + 1)
+ #endif
+ __m256i tmpData2A = SHIFT_DATA_A(2);
+ __m256i tmpData2B = SHIFT_DATA_B(2);
+ __m256i match2EqA, match2EqB;
+ #if defined(__AVX512VL__) && defined(__AVX512BW__)
+ __mmask32 match2EqMaskA, match2EqMaskB;
+ __mmask32 match0CrMaskA, match0CrMaskB;
+ __mmask32 match2CrXDtMaskA, match2CrXDtMaskB;
+ if(use_isa >= ISA_LEVEL_AVX3 && searchEnd) {
+ match2EqMaskA = _mm256_cmpeq_epi8_mask(_mm256_set1_epi8('='), tmpData2A);
+ match2EqMaskB = _mm256_cmpeq_epi8_mask(_mm256_set1_epi8('='), tmpData2B);
+ } else
+ #endif
+ if(searchEnd) {
+ match2EqA = _mm256_cmpeq_epi8(_mm256_set1_epi8('='), tmpData2A);
+ match2EqB = _mm256_cmpeq_epi8(_mm256_set1_epi8('='), tmpData2B);
+ }
+
+ int partialKillDotFound;
+ __m256i match2CrXDtA, match2CrXDtB;
+ if(isRaw) {
+ // find patterns of \r_.
+
+ #if defined(__AVX512VL__) && defined(__AVX512BW__)
+ if(use_isa >= ISA_LEVEL_AVX3) {
+ match0CrMaskA = _mm256_cmpeq_epi8_mask(oDataA, _mm256_set1_epi8('\r'));
+ match0CrMaskB = _mm256_cmpeq_epi8_mask(oDataB, _mm256_set1_epi8('\r'));
+ match2CrXDtMaskA = _mm256_mask_cmpeq_epi8_mask(match0CrMaskA, tmpData2A, _mm256_set1_epi8('.'));
+ match2CrXDtMaskB = _mm256_mask_cmpeq_epi8_mask(match0CrMaskB, tmpData2B, _mm256_set1_epi8('.'));
+ partialKillDotFound = KORTEST32(match2CrXDtMaskA, match2CrXDtMaskB);
+ } else
+ #endif
+ {
+ match2CrXDtA = _mm256_and_si256(
+ _mm256_cmpeq_epi8(oDataA, _mm256_set1_epi8('\r')),
+ _mm256_cmpeq_epi8(tmpData2A, _mm256_set1_epi8('.'))
+ );
+ match2CrXDtB = _mm256_and_si256(
+ _mm256_cmpeq_epi8(oDataB, _mm256_set1_epi8('\r')),
+ _mm256_cmpeq_epi8(tmpData2B, _mm256_set1_epi8('.'))
+ );
+ partialKillDotFound = _mm256_movemask_epi8(_mm256_or_si256(
+ match2CrXDtA, match2CrXDtB
+ ));
+ }
+ }
+
+ if(isRaw && LIKELIHOOD(0.002, partialKillDotFound)) {
+ // merge matches for \r\n.
+ __m256i match2NlDotA, match1NlA;
+ __m256i match2NlDotB, match1NlB;
+ #if defined(__AVX512VL__) && defined(__AVX512BW__)
+ __mmask32 match1NlMaskA, match1NlMaskB;
+ __mmask32 match2NlDotMaskA, match2NlDotMaskB;
+ if(use_isa >= ISA_LEVEL_AVX3) {
+ match1NlMaskA = _mm256_mask_cmpeq_epi8_mask(
+ match0CrMaskA,
+ _mm256_set1_epi8('\n'),
+ SHIFT_DATA_A(1)
+ );
+ match1NlMaskB = _mm256_mask_cmpeq_epi8_mask(
+ match0CrMaskB,
+ _mm256_set1_epi8('\n'),
+ SHIFT_DATA_B(1)
+ );
+ match2NlDotMaskA = KAND32(match2CrXDtMaskA, match1NlMaskA);
+ match2NlDotMaskB = KAND32(match2CrXDtMaskB, match1NlMaskB);
+ } else
+ #endif
+ {
+ __m256i match1LfA = _mm256_cmpeq_epi8(
+ _mm256_set1_epi8('\n'),
+ SHIFT_DATA_A(1)
+ );
+ __m256i match1LfB = _mm256_cmpeq_epi8(
+ _mm256_set1_epi8('\n'),
+ SHIFT_DATA_B(1)
+ );
+ // force re-computing these to avoid register spills elsewhere
+ match1NlA = _mm256_and_si256(match1LfA, _mm256_cmpeq_epi8(force_align_read_256(src+i), _mm256_set1_epi8('\r')));
+ match1NlB = _mm256_and_si256(match1LfB, _mm256_cmpeq_epi8(force_align_read_256(src+i + sizeof(__m256i)), _mm256_set1_epi8('\r')));
+ match2NlDotA = _mm256_and_si256(match2CrXDtA, match1NlA);
+ match2NlDotB = _mm256_and_si256(match2CrXDtB, match1NlB);
+ }
+ if(searchEnd) {
+ __m256i tmpData4A;
+ #if defined(__AVX512VL__) && defined(PLATFORM_AMD64)
+ if(use_isa >= ISA_LEVEL_AVX3)
+ // AVX512 with 32 registers shouldn't have any issue with holding onto oData* in registers
+ tmpData4A = _mm256_alignr_epi32(oDataB, oDataA, 1);
+ else
+ #endif
+ tmpData4A = SHIFT_DATA_A(4);
+ __m256i tmpData4B = SHIFT_DATA_B(4);
+ // match instances of \r\n.\r\n and \r\n.=y
+ __m256i match3CrA = _mm256_cmpeq_epi8(
+ _mm256_set1_epi8('\r'),
+ SHIFT_DATA_A(3)
+ );
+ __m256i match3CrB = _mm256_cmpeq_epi8(
+ _mm256_set1_epi8('\r'),
+ SHIFT_DATA_B(3)
+ );
+ __m256i match4LfA = _mm256_cmpeq_epi8(tmpData4A, _mm256_set1_epi8('\n'));
+ __m256i match4LfB = _mm256_cmpeq_epi8(tmpData4B, _mm256_set1_epi8('\n'));
+ __m256i match4EqYA = _mm256_cmpeq_epi16(tmpData4A, _mm256_set1_epi16(0x793d)); // =y
+ __m256i match4EqYB = _mm256_cmpeq_epi16(tmpData4B, _mm256_set1_epi16(0x793d)); // =y
+
+ int matchEnd;
+ #if defined(__AVX512VL__) && defined(__AVX512BW__)
+ if(use_isa >= ISA_LEVEL_AVX3) {
+ __mmask32 match3EqYMaskA = _mm256_mask_cmpeq_epi8_mask(
+ match2EqMaskA,
+ _mm256_set1_epi8('y'),
+ SHIFT_DATA_A(3)
+ );
+ __mmask32 match3EqYMaskB = _mm256_mask_cmpeq_epi8_mask(
+ match2EqMaskB,
+ _mm256_set1_epi8('y'),
+ SHIFT_DATA_B(3)
+ );
+ __m256i match34EqYA, match34EqYB;
+ # ifdef __AVX512VBMI2__
+ if(use_isa >= ISA_LEVEL_VBMI2) {
+ match34EqYA = _mm256_shrdi_epi16(_mm256_movm_epi8(match3EqYMaskA), match4EqYA, 8);
+ match34EqYB = _mm256_shrdi_epi16(_mm256_movm_epi8(match3EqYMaskB), match4EqYB, 8);
+ } else
+ # endif
+ {
+ // (match4EqY & 0xff00) | (match3EqY >> 8)
+ match34EqYA = _mm256_mask_blend_epi8(match3EqYMaskA>>1, _mm256_and_si256(match4EqYA, _mm256_set1_epi16(-0xff)), _mm256_set1_epi8(-1));
+ match34EqYB = _mm256_mask_blend_epi8(match3EqYMaskB>>1, _mm256_and_si256(match4EqYB, _mm256_set1_epi16(-0xff)), _mm256_set1_epi8(-1));
+ }
+ // merge \r\n and =y matches for tmpData4
+ __m256i match4EndA = _mm256_ternarylogic_epi32(match34EqYA, match3CrA, match4LfA, 0xF8); // (match3Cr & match4Lf) | match34EqY
+ __m256i match4EndB = _mm256_ternarylogic_epi32(match34EqYB, match3CrB, match4LfB, 0xF8);
+ // merge with \r\n. and combine
+ matchEnd = KORTEST32(
+ KOR32(
+ _mm256_mask_test_epi8_mask(match2NlDotMaskA, match4EndA, match4EndA),
+ KAND32(match3EqYMaskA, match1NlMaskA)
+ ),
+ KOR32(
+ _mm256_mask_test_epi8_mask(match2NlDotMaskB, match4EndB, match4EndB),
+ KAND32(match3EqYMaskB, match1NlMaskB)
+ )
+ );
+ } else
+ #endif
+ {
+ __m256i match3EqYA = _mm256_and_si256(match2EqA, _mm256_cmpeq_epi8(
+ _mm256_set1_epi8('y'),
+ SHIFT_DATA_A(3)
+ ));
+ __m256i match3EqYB = _mm256_and_si256(match2EqB, _mm256_cmpeq_epi8(
+ _mm256_set1_epi8('y'),
+ SHIFT_DATA_B(3)
+ ));
+ match4EqYA = _mm256_slli_epi16(match4EqYA, 8); // TODO: also consider using PBLENDVB here with shifted match3EqY instead
+ match4EqYB = _mm256_slli_epi16(match4EqYB, 8);
+ // merge \r\n and =y matches for tmpData4
+ __m256i match4EndA = _mm256_or_si256(
+ _mm256_and_si256(match3CrA, match4LfA),
+ _mm256_or_si256(match4EqYA, _mm256_srli_epi16(match3EqYA, 8)) // _mm256_srli_si256 by 1 also works
+ );
+ __m256i match4EndB = _mm256_or_si256(
+ _mm256_and_si256(match3CrB, match4LfB),
+ _mm256_or_si256(match4EqYB, _mm256_srli_epi16(match3EqYB, 8))
+ );
+ // merge with \r\n.
+ match4EndA = _mm256_and_si256(match4EndA, match2NlDotA);
+ match4EndB = _mm256_and_si256(match4EndB, match2NlDotB);
+ // match \r\n=y
+ __m256i match3EndA = _mm256_and_si256(match3EqYA, match1NlA);
+ __m256i match3EndB = _mm256_and_si256(match3EqYB, match1NlB);
+ // combine match sequences
+ matchEnd = _mm256_movemask_epi8(_mm256_or_si256(
+ _mm256_or_si256(match4EndA, match3EndA),
+ _mm256_or_si256(match4EndB, match3EndB)
+ ));
+ }
+ if(LIKELIHOOD(0.002, matchEnd)) {
+ // terminator found
+ // there's probably faster ways to do this, but reverting to scalar code should be good enough
+ len += (long)i;
+ break;
+ }
+ }
+ #if defined(__AVX512VL__) && defined(__AVX512BW__)
+ if(use_isa >= ISA_LEVEL_AVX3) {
+ mask |= (uint64_t)match2NlDotMaskA << 2;
+ mask |= (uint64_t)match2NlDotMaskB << 34;
+ minMask = _mm256_maskz_mov_epi8(~(match2NlDotMaskB>>30), _mm256_set1_epi8('.'));
+ } else
+ #endif
+ {
+ mask |= (uint64_t)((uint32_t)_mm256_movemask_epi8(match2NlDotA)) << 2;
+ mask |= (uint64_t)((uint32_t)_mm256_movemask_epi8(match2NlDotB)) << 34;
+ match2NlDotB = zext128_256(_mm_srli_si128(_mm256_extracti128_si256(match2NlDotB, 1), 14));
+ minMask = _mm256_subs_epu8(_mm256_set1_epi8('.'), match2NlDotB);
+ }
+ }
+ else if(searchEnd) {
+ bool partialEndFound;
+ __m256i match3EqYA, match3EqYB;
+ #if defined(__AVX512VL__) && defined(__AVX512BW__)
+ __mmask32 match3EqYMaskA, match3EqYMaskB;
+ if(use_isa >= ISA_LEVEL_AVX3) {
+ match3EqYMaskA = _mm256_mask_cmpeq_epi8_mask(
+ match2EqMaskA,
+ _mm256_set1_epi8('y'),
+ SHIFT_DATA_A(3)
+ );
+ match3EqYMaskB = _mm256_mask_cmpeq_epi8_mask(
+ match2EqMaskB,
+ _mm256_set1_epi8('y'),
+ SHIFT_DATA_B(3)
+ );
+ partialEndFound = KORTEST32(match3EqYMaskA, match3EqYMaskB);
+ } else
+ #endif
+ {
+ __m256i match3YA = _mm256_cmpeq_epi8(
+ _mm256_set1_epi8('y'),
+ SHIFT_DATA_A(3)
+ );
+ __m256i match3YB = _mm256_cmpeq_epi8(
+ _mm256_set1_epi8('y'),
+ SHIFT_DATA_B(3)
+ );
+ match3EqYA = _mm256_and_si256(match2EqA, match3YA);
+ match3EqYB = _mm256_and_si256(match2EqB, match3YB);
+ partialEndFound = _mm256_movemask_epi8(_mm256_or_si256(match3EqYA, match3EqYB));
+ }
+ if(LIKELIHOOD(0.002, partialEndFound)) {
+ bool endFound;
+ #if defined(__AVX512VL__) && defined(__AVX512BW__)
+ if(use_isa >= ISA_LEVEL_AVX3) {
+ __mmask32 match3LfEqYMaskA = _mm256_mask_cmpeq_epi8_mask(
+ match3EqYMaskA,
+ _mm256_set1_epi8('\n'),
+ SHIFT_DATA_A(1)
+ );
+ __mmask32 match3LfEqYMaskB = _mm256_mask_cmpeq_epi8_mask(
+ match3EqYMaskB,
+ _mm256_set1_epi8('\n'),
+ SHIFT_DATA_B(1)
+ );
+
+ endFound = KORTEST32(
+ _mm256_mask_cmpeq_epi8_mask(match3LfEqYMaskA, oDataA, _mm256_set1_epi8('\r')),
+ _mm256_mask_cmpeq_epi8_mask(match3LfEqYMaskB, oDataB, _mm256_set1_epi8('\r'))
+ );
+ } else
+ #endif
+ {
+ __m256i match1LfA = _mm256_cmpeq_epi8(
+ _mm256_set1_epi8('\n'),
+ SHIFT_DATA_A(1)
+ );
+ __m256i match1LfB = _mm256_cmpeq_epi8(
+ _mm256_set1_epi8('\n'),
+ SHIFT_DATA_B(1)
+ );
+ endFound = _mm256_movemask_epi8(_mm256_or_si256(
+ _mm256_and_si256(
+ match3EqYA,
+ _mm256_and_si256(match1LfA, _mm256_cmpeq_epi8(force_align_read_256(src+i), _mm256_set1_epi8('\r')))
+ ),
+ _mm256_and_si256(
+ match3EqYB,
+ _mm256_and_si256(match1LfB, _mm256_cmpeq_epi8(force_align_read_256(src+i + sizeof(__m256i)), _mm256_set1_epi8('\r')))
+ )
+ ));
+ }
+ if(endFound) {
+ len += (long)i;
+ break;
+ }
+ }
+ if(isRaw) minMask = _mm256_set1_epi8('.');
+ }
+ else if(isRaw) // no \r_. found
+ minMask = _mm256_set1_epi8('.');
+ }
+ #undef SHIFT_DATA_A
+ #undef SHIFT_DATA_B
+
+ if(use_isa >= ISA_LEVEL_AVX3)
+ dataB = _mm256_add_epi8(oDataB, _mm256_set1_epi8(-42));
+
+ if(LIKELIHOOD(0.0001, (mask & ((maskEq << 1) + escFirst)) != 0)) {
+ unsigned tmp = lookups->eqFix[(maskEq&0xff) & ~(uint64_t)escFirst];
+ uint64_t maskEq2 = tmp;
+ for(int j=8; j<64; j+=8) {
+ tmp = lookups->eqFix[(unsigned)((maskEq>>j)&0xff) & ~(tmp>>7)];
+ maskEq2 |= (uint64_t)tmp<<j;
+ }
+ maskEq = maskEq2;
+
+ mask &= ~(uint64_t)escFirst;
+ escFirst = tmp>>7;
+ // next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed
+ maskEq <<= 1;
+ mask &= ~maskEq;
+
+ // unescape chars following `=`
+ #if defined(__AVX512VL__) && defined(__AVX512BW__)
+ if(use_isa >= ISA_LEVEL_AVX3) {
+ // GCC < 7 seems to generate rubbish assembly for this
+ dataA = _mm256_mask_add_epi8(
+ dataA,
+ (__mmask32)maskEq,
+ dataA,
+ _mm256_set1_epi8(-64)
+ );
+ dataB = _mm256_mask_add_epi8(
+ dataB,
+ (__mmask32)(maskEq>>32),
+ dataB,
+ _mm256_set1_epi8(-64)
+ );
+ } else
+ #endif
+ {
+ // convert maskEq into vector form (i.e. reverse pmovmskb)
+ #ifdef PLATFORM_AMD64
+ __m256i vMaskEq = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(maskEq));
+ #else
+ __m256i vMaskEq = _mm256_permute4x64_epi64(_mm256_insert_epi32(
+ _mm256_set_epi32(0,0,0,0, 0,0,0, maskEq & 0xffffffff),
+ maskEq >> 32,
+ 1
+ ), 0);
+ #endif
+ __m256i vMaskEqA = _mm256_shuffle_epi8(vMaskEq, _mm256_set_epi32(
+ 0x03030303, 0x03030303, 0x02020202, 0x02020202,
+ 0x01010101, 0x01010101, 0x00000000, 0x00000000
+ ));
+ __m256i vMaskEqB = _mm256_shuffle_epi8(vMaskEq, _mm256_set_epi32(
+ 0x07070707, 0x07070707, 0x06060606, 0x06060606,
+ 0x05050505, 0x05050505, 0x04040404, 0x04040404
+ ));
+ vMaskEqA = _mm256_cmpeq_epi8(
+ _mm256_and_si256(vMaskEqA, _mm256_set1_epi64x(0x8040201008040201ULL)),
+ _mm256_set1_epi64x(0x8040201008040201ULL)
+ );
+ vMaskEqB = _mm256_cmpeq_epi8(
+ _mm256_and_si256(vMaskEqB, _mm256_set1_epi64x(0x8040201008040201ULL)),
+ _mm256_set1_epi64x(0x8040201008040201ULL)
+ );
+ dataA = _mm256_add_epi8(oDataA, _mm256_blendv_epi8(yencOffset, _mm256_set1_epi8(-42-64), vMaskEqA));
+ dataB = _mm256_add_epi8(oDataB, _mm256_blendv_epi8(_mm256_set1_epi8(-42), _mm256_set1_epi8(-42-64), vMaskEqB));
+ }
+ } else {
+ escFirst = (maskEq >> 63);
+
+ #if defined(__AVX512VL__) && defined(__AVX512BW__)
+ if(use_isa >= ISA_LEVEL_AVX3) {
+ dataA = _mm256_mask_add_epi8(
+ dataA,
+ (__mmask32)(maskEq << 1),
+ dataA,
+ _mm256_set1_epi8(-64)
+ );
+ dataB = _mm256_mask_add_epi8(
+ dataB,
+ (__mmask32)(maskEq >> 31),
+ dataB,
+ _mm256_set1_epi8(-64)
+ );
+ } else
+ #endif
+ {
+ // << 1 byte
+ cmpEqA = _mm256_alignr_epi8(cmpEqA, _mm256_inserti128_si256(
+ _mm256_set1_epi8('='), _mm256_castsi256_si128(cmpEqA), 1
+ ), 15);
+ cmpEqB = _mm256_cmpeq_epi8(_mm256_set1_epi8('='), _mm256_loadu_si256((__m256i *)(src+i-1) + 1));
+ dataA = _mm256_add_epi8(
+ oDataA,
+ _mm256_blendv_epi8(
+ yencOffset,
+ _mm256_set1_epi8(-42-64),
+ cmpEqA
+ )
+ );
+ dataB = _mm256_add_epi8(
+ oDataB,
+ _mm256_blendv_epi8(
+ _mm256_set1_epi8(-42),
+ _mm256_set1_epi8(-42-64),
+ cmpEqB
+ )
+ );
+ }
+ }
+ // subtract 64 from first element if escFirst == 1
+ #if defined(__AVX512VL__) && defined(__AVX512BW__)
+ if(use_isa >= ISA_LEVEL_AVX3) {
+ yencOffset = _mm256_mask_add_epi8(_mm256_set1_epi8(-42), (__mmask32)escFirst, _mm256_set1_epi8(-42), _mm256_set1_epi8(-64));
+ } else
+ #endif
+ {
+ yencOffset = _mm256_xor_si256(_mm256_set1_epi8(-42), zext128_256(
+ _mm_slli_epi16(_mm_cvtsi32_si128((int)escFirst), 6)
+ ));
+ }
+
+ // all that's left is to 'compress' the data (skip over masked chars)
+ #if defined(__AVX512VBMI2__) && defined(__AVX512VL__)
+ if(use_isa >= ISA_LEVEL_VBMI2) {
+ _mm256_mask_compressstoreu_epi8(p, KNOT32(mask), dataA);
+ p -= popcnt32(mask & 0xffffffff);
+ _mm256_mask_compressstoreu_epi8((p + XMM_SIZE*2), KNOT32(mask>>32), dataB);
+ p += XMM_SIZE*4 - popcnt32(mask >> 32);
+ } else
+ #endif
+ {
+ // lookup compress masks and shuffle
+ __m256i shuf = _mm256_inserti128_si256(
+ _mm256_castsi128_si256(_mm_load_si128((__m128i*)(lookups->compact + (mask & 0x7fff)))),
+ *(__m128i*)((char*)lookups->compact + ((mask >> 12) & 0x7fff0)),
+ 1
+ );
+ dataA = _mm256_shuffle_epi8(dataA, shuf);
+
+ _mm_storeu_si128((__m128i*)p, _mm256_castsi256_si128(dataA));
+ // increment output position
+ p -= popcnt32(mask & 0xffff);
+
+ _mm_storeu_si128((__m128i*)(p + XMM_SIZE), _mm256_extracti128_si256(dataA, 1));
+ p -= popcnt32(mask & 0xffff0000);
+
+ #ifdef PLATFORM_AMD64
+ mask >>= 28;
+ shuf = _mm256_inserti128_si256(
+ _mm256_castsi128_si256(_mm_load_si128((__m128i*)((char*)lookups->compact + (mask & 0x7fff0)))),
+ *(__m128i*)((char*)lookups->compact + ((mask >> 16) & 0x7fff0)),
+ 1
+ );
+ dataB = _mm256_shuffle_epi8(dataB, shuf);
+
+ _mm_storeu_si128((__m128i*)(p + XMM_SIZE*2), _mm256_castsi256_si128(dataB));
+ p -= popcnt32(mask & 0xffff0);
+
+ _mm_storeu_si128((__m128i*)(p + XMM_SIZE*3), _mm256_extracti128_si256(dataB, 1));
+ p -= popcnt32((unsigned int)(mask >> 20));
+ #else
+ mask >>= 32;
+ shuf = _mm256_inserti128_si256(
+ _mm256_castsi128_si256(_mm_load_si128((__m128i*)(lookups->compact + (mask & 0x7fff)))),
+ *(__m128i*)((char*)lookups->compact + ((mask >> 12) & 0x7fff0)),
+ 1
+ );
+ dataB = _mm256_shuffle_epi8(dataB, shuf);
+
+ _mm_storeu_si128((__m128i*)(p + XMM_SIZE*2), _mm256_castsi256_si128(dataB));
+ p -= popcnt32(mask & 0xffff);
+
+ _mm_storeu_si128((__m128i*)(p + XMM_SIZE*3), _mm256_extracti128_si256(dataB, 1));
+ p -= popcnt32(mask & 0xffff0000);
+ #endif
+ p += XMM_SIZE*4;
+ }
+ } else {
+ if(use_isa < ISA_LEVEL_AVX3)
+ dataA = _mm256_add_epi8(oDataA, yencOffset);
+ dataB = _mm256_add_epi8(oDataB, _mm256_set1_epi8(-42));
+
+ _mm256_storeu_si256((__m256i*)p, dataA);
+ _mm256_storeu_si256((__m256i*)p + 1, dataB);
+ p += sizeof(__m256i)*2;
+ escFirst = 0;
+ yencOffset = _mm256_set1_epi8(-42);
+ }
+ }
+ _escFirst = (unsigned char)escFirst;
+ if(isRaw) {
+ // this would be the trivial solution, but requires the compiler holding onto minMask throughout the loop:
+ //_nextMask = ~(uint16_t)_mm256_movemask_epi8(_mm256_cmpeq_epi8(minMask, _mm256_set1_epi8('.')));
+ // instead, just scan the memory to determine what to set nextMask to
+ if(len != 0) { // must have gone through at least one loop cycle
+ if(src[i-2] == '\r' && src[i-1] == '\n' && src[i] == '.')
+ _nextMask = 1;
+ else if(src[i-1] == '\r' && src[i] == '\n' && src[i+1] == '.')
+ _nextMask = 2;
+ else
+ _nextMask = 0;
+ }
+ } else
+ _nextMask = 0;
+ _mm256_zeroupper();
+ }
+ #endif
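
The non-VBMI2 output path above packs each 16-byte half with `_mm256_shuffle_epi8`, fetching the shuffle pattern from the 32768-entry `lookups->compact` table, indexed by 15 bits of the special-character mask. The table's initialization is not part of this hunk; the sketch below shows one plausible way such a table could be built and is an assumption rather than the package's actual initializer (`build_compact_lut` is a hypothetical name, the real entries are 16-byte aligned via the `compact` struct, and the exact layout may differ).

```cpp
#include <stdint.h>

// Hypothetical initializer for a compress-shuffle LUT in the style of
// lookups->compact: entry m lists, in order, the byte positions whose bit in m
// is clear (the bytes to keep), so a byte shuffle packs kept bytes to the
// front.  Only 15 mask bits are needed per 16-byte lane: a flagged 16th byte
// simply lands past the advanced output pointer (which moves by popcount of
// the full 16-bit mask) and is overwritten by the next store.
static void build_compact_lut(uint8_t table[32768][16]) {
    for(uint32_t m = 0; m < 32768; m++) {
        int j = 0;
        for(int bit = 0; bit < 16; bit++)
            if(!(m & (1u << bit)))
                table[m][j++] = (uint8_t)bit;
        // pad the unused tail; padded bytes fall beyond the kept data and are
        // overwritten by subsequent stores
        while(j < 16)
            table[m][j++] = 0;
    }
}
```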