yencode 1.0.8 → 1.1.2

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between these versions as they appear in their public registries.
Files changed (49)
  1. package/README.md +339 -231
  2. package/binding.gyp +292 -39
  3. package/crcutil-1.0/code/multiword_64_64_gcc_amd64_asm.cc +7 -7
  4. package/crcutil-1.0/code/multiword_64_64_gcc_i386_mmx.cc +14 -14
  5. package/crcutil-1.0/code/multiword_64_64_intrinsic_i386_mmx.cc +1 -1
  6. package/crcutil-1.0/code/uint128_sse2.h +2 -0
  7. package/index.js +329 -22
  8. package/package.json +2 -2
  9. package/src/common.h +299 -0
  10. package/src/crc.cc +95 -0
  11. package/src/crc.h +23 -0
  12. package/src/crc_arm.cc +175 -0
  13. package/src/crc_common.h +4 -0
  14. package/{crc_folding.c → src/crc_folding.cc} +175 -185
  15. package/src/decoder.cc +61 -0
  16. package/src/decoder.h +53 -0
  17. package/src/decoder_avx.cc +18 -0
  18. package/src/decoder_avx2.cc +18 -0
  19. package/src/decoder_avx2_base.h +615 -0
  20. package/src/decoder_common.h +512 -0
  21. package/src/decoder_neon.cc +474 -0
  22. package/src/decoder_neon64.cc +451 -0
  23. package/src/decoder_sse2.cc +16 -0
  24. package/src/decoder_sse_base.h +711 -0
  25. package/src/decoder_ssse3.cc +18 -0
  26. package/src/encoder.cc +170 -0
  27. package/src/encoder.h +21 -0
  28. package/src/encoder_avx.cc +16 -0
  29. package/src/encoder_avx2.cc +16 -0
  30. package/src/encoder_avx_base.h +564 -0
  31. package/src/encoder_common.h +109 -0
  32. package/src/encoder_neon.cc +547 -0
  33. package/src/encoder_sse2.cc +13 -0
  34. package/src/encoder_sse_base.h +724 -0
  35. package/src/encoder_ssse3.cc +18 -0
  36. package/src/hedley.h +1899 -0
  37. package/src/platform.cc +147 -0
  38. package/src/yencode.cc +449 -0
  39. package/test/_maxsize.js +9 -0
  40. package/test/_speedbase.js +147 -0
  41. package/test/speedcrc.js +20 -0
  42. package/test/speeddec.js +92 -0
  43. package/test/speedenc.js +44 -0
  44. package/{testcrc.js → test/testcrc.js} +53 -39
  45. package/test/testdec.js +183 -0
  46. package/test/testenc.js +163 -0
  47. package/test/testpostdec.js +126 -0
  48. package/test.js +0 -91
  49. package/yencode.cc +0 -1622
package/src/decoder_sse_base.h (new file)
@@ -0,0 +1,711 @@
+
+ #ifdef __SSE2__
+
+ #if defined(__clang__) && __clang_major__ == 6 && __clang_minor__ == 0
+ // VBMI2 introduced in clang 6.0, but 128-bit functions misnamed there; fixed in clang 7.0, but we'll handle those on 6.0
+ # define _mm_mask_compressstoreu_epi8 _mm128_mask_compressstoreu_epi8
+ # define _mm_shrdi_epi16 _mm128_shrdi_epi16
+ #endif
+
+ // GCC (ver 6-10(dev)) fails to optimize pure C version of mask testing, but has this intrinsic; Clang >= 7 optimizes C version fine
+ #if defined(__GNUC__) && __GNUC__ >= 7
+ # define KORTEST16(a, b) !_kortestz_mask16_u8((a), (b))
+ # define KAND16(a, b) _kand_mask16((a), (b))
+ # define KOR16(a, b) _kor_mask16((a), (b))
+ #else
+ # define KORTEST16(a, b) ((a) | (b))
+ # define KAND16(a, b) ((a) & (b))
+ # define KOR16(a, b) ((a) | (b))
+ #endif
+
+ #pragma pack(16)
+ static struct {
+ unsigned char BitsSetTable256inv[256];
+ /*align16*/ struct { char bytes[16]; } compact[32768];
+ uint8_t eqFix[256];
+ /*align8*/ uint64_t eqAdd[256];
+ /*align16*/ int8_t unshufMask[32*16];
+ } * HEDLEY_RESTRICT lookups;
+ #pragma pack()
+
+
+ static HEDLEY_ALWAYS_INLINE __m128i force_align_read_128(const void* p) {
+ #ifdef _MSC_VER
+ // MSVC complains about casting away volatile
+ return *(__m128i *)(p);
+ #else
+ return *(volatile __m128i *)(p);
+ #endif
+ }
+
+
+ static void decoder_sse_init() {
+ ALIGN_ALLOC(lookups, sizeof(*lookups), 16);
+ for(int i=0; i<256; i++) {
+ lookups->BitsSetTable256inv[i] = 8 - (
+ (i & 1) + ((i>>1) & 1) + ((i>>2) & 1) + ((i>>3) & 1) + ((i>>4) & 1) + ((i>>5) & 1) + ((i>>6) & 1) + ((i>>7) & 1)
+ );
+
+ #define _X(n, k) ((((n) & (1<<k)) ? 192ULL : 0ULL) << (k*8))
+ lookups->eqAdd[i] = _X(i, 0) | _X(i, 1) | _X(i, 2) | _X(i, 3) | _X(i, 4) | _X(i, 5) | _X(i, 6) | _X(i, 7);
+ #undef _X
+ }
+ for(int i=0; i<32; i++) {
+ for(int j=0; j<16; j++) {
+ if(i >= 16) // only used for LZCNT
+ lookups->unshufMask[i*16 + j] = ((31-i)>j ? -1 : 0);
+ else // only used for BSR
+ lookups->unshufMask[i*16 + j] = (i>j ? -1 : 0);
+ }
+ }
+ }
+
+
+ // for LZCNT/BSR
+ #ifdef _MSC_VER
+ # include <intrin.h>
+ # include <ammintrin.h>
+ static HEDLEY_ALWAYS_INLINE unsigned BSR32(unsigned src) {
+ unsigned long result;
+ _BitScanReverse((unsigned long*)&result, src);
+ return result;
+ }
+ #elif defined(__GNUC__)
+ // have seen Clang not like _bit_scan_reverse
+ # include <x86intrin.h> // for lzcnt
+ # define BSR32(src) (31^__builtin_clz(src))
+ #else
+ # include <x86intrin.h>
+ # define BSR32 _bit_scan_reverse
+ #endif
+
+ template<enum YEncDecIsaLevel use_isa>
+ static HEDLEY_ALWAYS_INLINE __m128i sse2_compact_vect(uint32_t mask, __m128i data) {
+ while(mask) {
+ unsigned bitIndex;
+ #if defined(__LZCNT__)
+ if(use_isa & ISA_FEATURE_LZCNT) {
+ // lzcnt is always at least as fast as bsr, so prefer it if it's available
+ bitIndex = _lzcnt_u32(mask);
+ mask &= 0x7fffffffU>>bitIndex;
+ } else
+ #endif
+ {
+ bitIndex = BSR32(mask);
+ mask ^= 1<<bitIndex;
+ }
+ __m128i mergeMask = _mm_load_si128((__m128i*)lookups->unshufMask + bitIndex);
+ data = _mm_or_si128(
+ _mm_and_si128(mergeMask, data),
+ _mm_andnot_si128(mergeMask, _mm_srli_si128(data, 1))
+ );
+ }
+ return data;
+ }
+
+ template<bool isRaw, bool searchEnd, enum YEncDecIsaLevel use_isa>
+ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long& len, unsigned char* HEDLEY_RESTRICT & p, unsigned char& _escFirst, uint16_t& _nextMask) {
+ HEDLEY_ASSUME(_escFirst == 0 || _escFirst == 1);
+ HEDLEY_ASSUME(_nextMask == 0 || _nextMask == 1 || _nextMask == 2);
+ uintptr_t escFirst = _escFirst;
+ __m128i yencOffset = escFirst ? _mm_set_epi8(
+ -42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42-64
+ ) : _mm_set1_epi8(-42);
+
+ #if defined(__SSSE3__) && !defined(__tune_atom__) && !defined(__tune_slm__) && !defined(__tune_btver1__)
+ const bool _USING_FAST_MATCH = (use_isa >= ISA_LEVEL_SSSE3);
+ #else
+ const bool _USING_FAST_MATCH = false;
+ #endif
+ #if defined(__SSE4_1__) && !defined(__tune_slm__) && !defined(__tune_goldmont__) && !defined(__tune_goldmont_plus__) && !defined(__tune_tremont__)
+ const bool _USING_BLEND_ADD = (use_isa >= ISA_LEVEL_SSE41);
+ #else
+ const bool _USING_BLEND_ADD = false;
+ #endif
+
+ __m128i lfCompare = _mm_set1_epi8('\n');
+ __m128i minMask = _mm_set1_epi8('.');
+ if(_nextMask && isRaw) {
+ if(_USING_FAST_MATCH)
+ minMask = _mm_insert_epi16(minMask, _nextMask == 1 ? 0x2e00 : 0x002e, 0);
+ else
+ lfCompare = _mm_insert_epi16(lfCompare, _nextMask == 1 ? 0x0a2e /*".\n"*/ : 0x2e0a /*"\n."*/, 0);
+ }
+ intptr_t i;
+ for(i = -len; i; i += sizeof(__m128i)*2) {
+ __m128i oDataA = _mm_load_si128((__m128i *)(src+i));
+ __m128i oDataB = _mm_load_si128((__m128i *)(src+i) + 1);
+
+ // search for special chars
+ __m128i cmpEqA, cmpEqB, cmpCrA, cmpCrB;
+ __m128i cmpA, cmpB;
+ #if defined(__SSSE3__)
+ if(_USING_FAST_MATCH) {
+ cmpA = _mm_cmpeq_epi8(oDataA, _mm_shuffle_epi8(
+ _mm_set_epi8(-1,'=','\r',-1,-1,'\n',-1,-1,-1,-1,-1,-1,-1,-1,-1,'.'),
+ _mm_min_epu8(oDataA, minMask)
+ ));
+ cmpB = _mm_cmpeq_epi8(oDataB, _mm_shuffle_epi8(
+ _mm_set_epi8(-1,'=','\r',-1,-1,'\n',-1,-1,-1,-1,-1,-1,-1,-1,-1,'.'),
+ _mm_min_epu8(oDataB, _mm_set1_epi8('.'))
+ ));
+ } else
+ #endif
+ {
+ cmpEqA = _mm_cmpeq_epi8(oDataA, _mm_set1_epi8('='));
+ cmpEqB = _mm_cmpeq_epi8(oDataB, _mm_set1_epi8('='));
+ cmpCrA = _mm_cmpeq_epi8(oDataA, _mm_set1_epi8('\r'));
+ cmpCrB = _mm_cmpeq_epi8(oDataB, _mm_set1_epi8('\r'));
+ cmpA = _mm_or_si128(
+ _mm_or_si128(
+ _mm_cmpeq_epi8(oDataA, lfCompare), cmpCrA
+ ),
+ cmpEqA
+ );
+ cmpB = _mm_or_si128(
+ _mm_or_si128(
+ _mm_cmpeq_epi8(oDataB, _mm_set1_epi8('\n')), cmpCrB
+ ),
+ cmpEqB
+ );
+ }
+
+ __m128i dataA, dataB;
+ if(!_USING_BLEND_ADD)
+ dataA = _mm_add_epi8(oDataA, yencOffset);
+ uint32_t mask = (unsigned)_mm_movemask_epi8(cmpA) | ((unsigned)_mm_movemask_epi8(cmpB) << 16); // not the most accurate mask if we have invalid sequences; we fix this up later
+
+ if (LIKELIHOOD(0.42 /* rough guess */, mask != 0)) {
+ if(_USING_FAST_MATCH) {
+ cmpEqA = _mm_cmpeq_epi8(oDataA, _mm_set1_epi8('='));
+ cmpEqB = _mm_cmpeq_epi8(oDataB, _mm_set1_epi8('='));
+ }
+
+ #define LOAD_HALVES(a, b) _mm_castps_si128(_mm_loadh_pi( \
+ _mm_castsi128_ps(_mm_loadl_epi64((__m128i*)(a))), \
+ (__m64*)(b) \
+ ))
+
+ // a spec compliant encoder should never generate sequences: ==, =\n and =\r, but we'll handle them to be spec compliant
+ // the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that
+ // firstly, check for invalid sequences of = (we assume that these are rare, as a spec compliant yEnc encoder should not generate these)
+ uint32_t maskEq = (unsigned)_mm_movemask_epi8(cmpEqA) | ((unsigned)_mm_movemask_epi8(cmpEqB) << 16);
+
+ // handle \r\n. sequences
+ // RFC3977 requires the first dot on a line to be stripped, due to dot-stuffing
+ if((isRaw || searchEnd) && LIKELIHOOD(0.25, mask != maskEq)) {
+ #if 0
+ // for experimentation: prefer shifting data over unaligned loads on CPUs with slow unaligned handling
+ // haven't ever seen this be beneficial though
+ __m128i nextDataB;
+ if(searchEnd && _USING_BLEND_ADD)
+ nextDataB = _mm_cvtsi32_si128(*(uint32_t*)(src+i+sizeof(__m128i)*2));
+ # define SHIFT_DATA_A(offs) (searchEnd && _USING_BLEND_ADD ? _mm_alignr_epi8(oDataB, oDataA, offs) : _mm_loadu_si128((__m128i *)(src+i+offs)))
+ # define SHIFT_DATA_B(offs) (searchEnd && _USING_BLEND_ADD ? _mm_alignr_epi8(nextDataB, oDataB, offs) : _mm_loadu_si128((__m128i *)(src+i+offs) + 1))
+ #else
+ # define SHIFT_DATA_A(offs) _mm_loadu_si128((__m128i *)(src+i+offs))
+ # define SHIFT_DATA_B(offs) _mm_loadu_si128((__m128i *)(src+i+offs) + 1)
+ #endif
+ __m128i tmpData2A = SHIFT_DATA_A(2);
+ __m128i tmpData2B = SHIFT_DATA_B(2);
+ __m128i match2EqA, match2EqB;
+
+ #if defined(__AVX512VL__) && defined(__AVX512BW__)
+ __mmask16 match2EqMaskA, match2EqMaskB;
+ __mmask16 match0CrMaskA, match0CrMaskB;
+ __mmask16 match2CrXDtMaskA, match2CrXDtMaskB;
+ if(use_isa >= ISA_LEVEL_AVX3 && searchEnd) {
+ match2EqMaskA = _mm_cmpeq_epi8_mask(_mm_set1_epi8('='), tmpData2A);
+ match2EqMaskB = _mm_cmpeq_epi8_mask(_mm_set1_epi8('='), tmpData2B);
+ } else
+ #endif
+ if(searchEnd) {
+ #if !defined(__tune_btver1__)
+ if(use_isa < ISA_LEVEL_SSSE3)
+ #endif
+ match2EqA = _mm_cmpeq_epi8(_mm_set1_epi8('='), tmpData2A);
+ match2EqB = _mm_cmpeq_epi8(_mm_set1_epi8('='), tmpData2B);
+ }
+ int partialKillDotFound;
+ __m128i match2CrXDtA, match2CrXDtB;
+ if(isRaw) {
+ #if defined(__AVX512VL__) && defined(__AVX512BW__)
+ if(use_isa >= ISA_LEVEL_AVX3) {
+ match0CrMaskA = _mm_cmpeq_epi8_mask(oDataA, _mm_set1_epi8('\r'));
+ match0CrMaskB = _mm_cmpeq_epi8_mask(oDataB, _mm_set1_epi8('\r'));
+ match2CrXDtMaskA = _mm_mask_cmpeq_epi8_mask(match0CrMaskA, tmpData2A, _mm_set1_epi8('.'));
+ match2CrXDtMaskB = _mm_mask_cmpeq_epi8_mask(match0CrMaskB, tmpData2B, _mm_set1_epi8('.'));
+ partialKillDotFound = KORTEST16(match2CrXDtMaskA, match2CrXDtMaskB);
+ } else
+ #endif
+ {
+ if(_USING_FAST_MATCH) {
+ cmpCrA = _mm_cmpeq_epi8(oDataA, _mm_set1_epi8('\r'));
+ cmpCrB = _mm_cmpeq_epi8(oDataB, _mm_set1_epi8('\r'));
+ }
+ match2CrXDtA = _mm_and_si128(cmpCrA, _mm_cmpeq_epi8(tmpData2A, _mm_set1_epi8('.')));
+ match2CrXDtB = _mm_and_si128(cmpCrB, _mm_cmpeq_epi8(tmpData2B, _mm_set1_epi8('.')));
+ partialKillDotFound = _mm_movemask_epi8(_mm_or_si128(match2CrXDtA, match2CrXDtB));
+ }
+ }
+
+ if(isRaw && LIKELIHOOD(0.001, partialKillDotFound)) {
+ __m128i match2NlDotA, match1NlA;
+ __m128i match2NlDotB, match1NlB;
+ // merge matches for \r\n.
+ #if defined(__AVX512VL__) && defined(__AVX512BW__)
+ __mmask16 match1NlMaskA, match1NlMaskB;
+ __mmask16 match2NlDotMaskA, match2NlDotMaskB;
+ if(use_isa >= ISA_LEVEL_AVX3) {
+ match1NlMaskA = _mm_mask_cmpeq_epi8_mask(
+ match0CrMaskA,
+ _mm_set1_epi8('\n'),
+ SHIFT_DATA_A(1)
+ );
+ match1NlMaskB = _mm_mask_cmpeq_epi8_mask(
+ match0CrMaskB,
+ _mm_set1_epi8('\n'),
+ SHIFT_DATA_B(1)
+ );
+ match2NlDotMaskA = KAND16(match2CrXDtMaskA, match1NlMaskA);
+ match2NlDotMaskB = KAND16(match2CrXDtMaskB, match1NlMaskB);
+ } else
+ #endif
+ {
+ __m128i match1LfA = _mm_cmpeq_epi8(_mm_set1_epi8('\n'), SHIFT_DATA_A(1));
+ __m128i match1LfB = _mm_cmpeq_epi8(_mm_set1_epi8('\n'), SHIFT_DATA_B(1));
+
+ // always recompute cmpCr to avoid register spills above
+ cmpCrA = _mm_cmpeq_epi8(force_align_read_128(src+i), _mm_set1_epi8('\r'));
+ cmpCrB = _mm_cmpeq_epi8(force_align_read_128(src+i + sizeof(__m128i)), _mm_set1_epi8('\r'));
+ match1NlA = _mm_and_si128(match1LfA, cmpCrA);
+ match1NlB = _mm_and_si128(match1LfB, cmpCrB);
+ match2NlDotA = _mm_and_si128(match2CrXDtA, match1NlA);
+ match2NlDotB = _mm_and_si128(match2CrXDtB, match1NlB);
+ }
+ if(searchEnd) {
+ __m128i tmpData3A = SHIFT_DATA_A(3);
+ __m128i tmpData3B = SHIFT_DATA_B(3);
+ __m128i tmpData4A = SHIFT_DATA_A(4);
+ __m128i tmpData4B = SHIFT_DATA_B(4);
+ // match instances of \r\n.\r\n and \r\n.=y
+ // TODO: consider doing a PALIGNR using match1Nl for match4NlA
+ __m128i match3CrA = _mm_cmpeq_epi8(_mm_set1_epi8('\r'), tmpData3A);
+ __m128i match3CrB = _mm_cmpeq_epi8(_mm_set1_epi8('\r'), tmpData3B);
+ __m128i match4LfA = _mm_cmpeq_epi8(tmpData4A, _mm_set1_epi8('\n'));
+ __m128i match4LfB = _mm_cmpeq_epi8(tmpData4B, _mm_set1_epi8('\n'));
+ __m128i match4EqYA = _mm_cmpeq_epi16(tmpData4A, _mm_set1_epi16(0x793d)); // =y
+ __m128i match4EqYB = _mm_cmpeq_epi16(tmpData4B, _mm_set1_epi16(0x793d)); // =y
+
+ int matchEnd;
+ #if defined(__AVX512VL__) && defined(__AVX512BW__)
+ if(use_isa >= ISA_LEVEL_AVX3) {
+ __mmask16 match3EqYMaskA = _mm_mask_cmpeq_epi8_mask(
+ match2EqMaskA, _mm_set1_epi8('y'), tmpData3A
+ );
+ __mmask16 match3EqYMaskB = _mm_mask_cmpeq_epi8_mask(
+ match2EqMaskB, _mm_set1_epi8('y'), tmpData3B
+ );
+ __m128i match34EqYA, match34EqYB;
+ # ifdef __AVX512VBMI2__
+ if(use_isa >= ISA_LEVEL_VBMI2) {
+ match34EqYA = _mm_shrdi_epi16(_mm_movm_epi8(match3EqYMaskA), match4EqYA, 8);
+ match34EqYB = _mm_shrdi_epi16(_mm_movm_epi8(match3EqYMaskB), match4EqYB, 8);
+ } else
+ # endif
+ {
+ // (match4EqY & 0xff00) | (match3EqY >> 8)
+ match34EqYA = _mm_mask_blend_epi8(match3EqYMaskA>>1, _mm_and_si128(match4EqYA, _mm_set1_epi16(-0xff)), _mm_set1_epi8(-1));
+ match34EqYB = _mm_mask_blend_epi8(match3EqYMaskB>>1, _mm_and_si128(match4EqYB, _mm_set1_epi16(-0xff)), _mm_set1_epi8(-1));
+ }
+ // merge \r\n and =y matches for tmpData4
+ __m128i match4EndA = _mm_ternarylogic_epi32(match34EqYA, match3CrA, match4LfA, 0xF8); // (match3Cr & match4Lf) | match34EqY
+ __m128i match4EndB = _mm_ternarylogic_epi32(match34EqYB, match3CrB, match4LfB, 0xF8);
+ // merge with \r\n. and combine
+ matchEnd = KORTEST16(
+ KOR16(
+ _mm_mask_test_epi8_mask(match2NlDotMaskA, match4EndA, match4EndA),
+ KAND16(match3EqYMaskA, match1NlMaskA)
+ ),
+ KOR16(
+ _mm_mask_test_epi8_mask(match2NlDotMaskB, match4EndB, match4EndB),
+ KAND16(match3EqYMaskB, match1NlMaskB)
+ )
+ );
+ } else
+ #endif
+ {
+ #if defined(__SSSE3__) && !defined(__tune_btver1__)
+ if(use_isa >= ISA_LEVEL_SSSE3)
+ match2EqA = _mm_alignr_epi8(cmpEqB, cmpEqA, 2);
+ #endif
+ __m128i match3EqYA = _mm_and_si128(match2EqA, _mm_cmpeq_epi8(_mm_set1_epi8('y'), tmpData3A));
+ __m128i match3EqYB = _mm_and_si128(match2EqB, _mm_cmpeq_epi8(_mm_set1_epi8('y'), tmpData3B));
+ match4EqYA = _mm_slli_epi16(match4EqYA, 8); // TODO: also consider using PBLENDVB here with shifted match3EqY instead
+ match4EqYB = _mm_slli_epi16(match4EqYB, 8);
+ // merge \r\n and =y matches for tmpData4
+ __m128i match4EndA = _mm_or_si128(
+ _mm_and_si128(match3CrA, match4LfA),
+ _mm_or_si128(match4EqYA, _mm_srli_epi16(match3EqYA, 8)) // _mm_srli_si128 by 1 also works
+ );
+ __m128i match4EndB = _mm_or_si128(
+ _mm_and_si128(match3CrB, match4LfB),
+ _mm_or_si128(match4EqYB, _mm_srli_epi16(match3EqYB, 8)) // _mm_srli_si128 by 1 also works
+ );
+ // merge with \r\n.
+ match4EndA = _mm_and_si128(match4EndA, match2NlDotA);
+ match4EndB = _mm_and_si128(match4EndB, match2NlDotB);
+ // match \r\n=y
+ __m128i match3EndA = _mm_and_si128(match3EqYA, match1NlA);
+ __m128i match3EndB = _mm_and_si128(match3EqYB, match1NlB);
+ // combine match sequences
+ matchEnd = _mm_movemask_epi8(_mm_or_si128(
+ _mm_or_si128(match4EndA, match3EndA),
+ _mm_or_si128(match4EndB, match3EndB)
+ ));
+ }
+
+ if(LIKELIHOOD(0.001, matchEnd)) {
+ // terminator found
+ // there's probably faster ways to do this, but reverting to scalar code should be good enough
+ len += (long)i;
+ break;
+ }
+ }
+ #if defined(__AVX512VL__) && defined(__AVX512BW__)
+ if(use_isa >= ISA_LEVEL_AVX3) {
+ mask |= match2NlDotMaskA << 2;
+ mask |= (match2NlDotMaskB << 18) & 0xffffffff;
+ minMask = _mm_maskz_mov_epi8(~(match2NlDotMaskB>>14), _mm_set1_epi8('.'));
+ } else
+ #endif
+ {
+ mask |= (_mm_movemask_epi8(match2NlDotA) << 2);
+ mask |= (_mm_movemask_epi8(match2NlDotB) << 18) & 0xffffffff;
+ match2NlDotB = _mm_srli_si128(match2NlDotB, 14);
+ if(_USING_FAST_MATCH)
+ minMask = _mm_subs_epu8(_mm_set1_epi8('.'), match2NlDotB);
+ else
+ // this bitiwse trick works because '.'|'\n' == '.'
+ lfCompare = _mm_or_si128(
+ _mm_and_si128(match2NlDotB, _mm_set1_epi8('.')),
+ _mm_set1_epi8('\n')
+ );
+ }
+ }
+ else if(searchEnd) {
+ bool partialEndFound;
+ __m128i match3EqYA, match3EqYB;
+ #if defined(__AVX512VL__) && defined(__AVX512BW__)
+ __mmask16 match3EqYMaskA, match3EqYMaskB;
+ if(use_isa >= ISA_LEVEL_AVX3) {
+ match3EqYMaskA = _mm_mask_cmpeq_epi8_mask(
+ match2EqMaskA,
+ _mm_set1_epi8('y'),
+ SHIFT_DATA_A(3)
+ );
+ match3EqYMaskB = _mm_mask_cmpeq_epi8_mask(
+ match2EqMaskB,
+ _mm_set1_epi8('y'),
+ SHIFT_DATA_B(3)
+ );
+ partialEndFound = KORTEST16(match3EqYMaskA, match3EqYMaskB);
+ } else
+ #endif
+ {
+ __m128i match3YA = _mm_cmpeq_epi8(
+ _mm_set1_epi8('y'),
+ SHIFT_DATA_A(3)
+ );
+ __m128i match3YB = _mm_cmpeq_epi8(
+ _mm_set1_epi8('y'),
+ SHIFT_DATA_B(3)
+ );
+ #if defined(__SSSE3__) && !defined(__tune_btver1__)
+ if(use_isa >= ISA_LEVEL_SSSE3)
+ match2EqA = _mm_alignr_epi8(cmpEqB, cmpEqA, 2);
+ #endif
+ match3EqYA = _mm_and_si128(match2EqA, match3YA);
+ match3EqYB = _mm_and_si128(match2EqB, match3YB);
+ partialEndFound = _mm_movemask_epi8(_mm_or_si128(match3EqYA, match3EqYB));
+ }
+ if(LIKELIHOOD(0.001, partialEndFound)) {
+ // if the rare case of '=y' is found, do a more precise check
+ bool endFound;
+
+ #if defined(__AVX512VL__) && defined(__AVX512BW__)
+ if(use_isa >= ISA_LEVEL_AVX3) {
+ __mmask16 match3LfEqYMaskA = _mm_mask_cmpeq_epi8_mask(
+ match3EqYMaskA,
+ _mm_set1_epi8('\n'),
+ SHIFT_DATA_A(1)
+ );
+ __mmask16 match3LfEqYMaskB = _mm_mask_cmpeq_epi8_mask(
+ match3EqYMaskB,
+ _mm_set1_epi8('\n'),
+ SHIFT_DATA_B(1)
+ );
+
+ endFound = KORTEST16(
+ _mm_mask_cmpeq_epi8_mask(match3LfEqYMaskA, oDataA, _mm_set1_epi8('\r')),
+ _mm_mask_cmpeq_epi8_mask(match3LfEqYMaskB, oDataB, _mm_set1_epi8('\r'))
+ );
+ } else
+ #endif
+ {
+ // always recompute cmpCr to avoid register spills above
+ cmpCrA = _mm_cmpeq_epi8(force_align_read_128(src+i), _mm_set1_epi8('\r'));
+ cmpCrB = _mm_cmpeq_epi8(force_align_read_128(src+i + sizeof(__m128i)), _mm_set1_epi8('\r'));
+ __m128i match1LfA = _mm_cmpeq_epi8(
+ _mm_set1_epi8('\n'),
+ SHIFT_DATA_A(1)
+ );
+ __m128i match1LfB = _mm_cmpeq_epi8(
+ _mm_set1_epi8('\n'),
+ SHIFT_DATA_B(1)
+ );
+ endFound = _mm_movemask_epi8(_mm_or_si128(
+ _mm_and_si128(
+ match3EqYA,
+ _mm_and_si128(match1LfA, cmpCrA)
+ ),
+ _mm_and_si128(
+ match3EqYB,
+ _mm_and_si128(match1LfB, cmpCrB)
+ )
+ ));
+ }
+
+ if(endFound) {
+ len += (long)i;
+ break;
+ }
+ }
+ if(isRaw) {
+ if(_USING_FAST_MATCH)
+ minMask = _mm_set1_epi8('.');
+ else
+ lfCompare = _mm_set1_epi8('\n');
+ }
+ }
+ else if(isRaw) { // no \r_. found
+ if(_USING_FAST_MATCH)
+ minMask = _mm_set1_epi8('.');
+ else
+ lfCompare = _mm_set1_epi8('\n');
+ }
+ }
+ #undef SHIFT_DATA_A
+ #undef SHIFT_DATA_B
+
+ if(!_USING_BLEND_ADD)
+ dataB = _mm_add_epi8(oDataB, _mm_set1_epi8(-42));
+
+ if(LIKELIHOOD(0.0001, (mask & ((maskEq << 1) + escFirst)) != 0)) {
+ // resolve invalid sequences of = to deal with cases like '===='
+ unsigned tmp = lookups->eqFix[(maskEq&0xff) & ~escFirst];
+ uint32_t maskEq2 = tmp;
+ for(int j=8; j<32; j+=8) {
+ tmp = lookups->eqFix[((maskEq>>j)&0xff) & ~(tmp>>7)];
+ maskEq2 |= tmp<<j;
+ }
+ maskEq = maskEq2;
+
+ mask &= ~escFirst;
+ escFirst = (maskEq >> 31);
+ // next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed
+ maskEq <<= 1;
+ mask &= ~maskEq;
+
+ if(_USING_BLEND_ADD) {
+ dataA = _mm_add_epi8(oDataA, yencOffset);
+ dataB = _mm_add_epi8(oDataB, _mm_set1_epi8(-42));
+ }
+
+ // unescape chars following `=`
+ #if defined(__AVX512VL__) && defined(__AVX512BW__)
+ if(use_isa >= ISA_LEVEL_AVX3) {
+ // GCC < 7 seems to generate rubbish assembly for this
+ dataA = _mm_mask_add_epi8(
+ dataA,
+ (__mmask16)maskEq,
+ dataA,
+ _mm_set1_epi8(-64)
+ );
+ dataB = _mm_mask_add_epi8(
+ dataB,
+ (__mmask16)(maskEq>>16),
+ dataB,
+ _mm_set1_epi8(-64)
+ );
+ } else
+ #endif
+ {
+ dataA = _mm_add_epi8(
+ dataA,
+ LOAD_HALVES(
+ lookups->eqAdd + (maskEq&0xff),
+ lookups->eqAdd + ((maskEq>>8)&0xff)
+ )
+ );
+ maskEq >>= 16;
+ dataB = _mm_add_epi8(
+ dataB,
+ LOAD_HALVES(
+ lookups->eqAdd + (maskEq&0xff),
+ lookups->eqAdd + ((maskEq>>8)&0xff)
+ )
+ );
+
+ yencOffset = _mm_xor_si128(_mm_set1_epi8(-42),
+ _mm_slli_epi16(_mm_cvtsi32_si128((int)escFirst), 6)
+ );
+ }
+ } else {
+ // no invalid = sequences found - we can cut out some things from above
+ // this code path is a shortened version of above; it's here because it's faster, and what we'll be dealing with most of the time
+ escFirst = (maskEq >> 31);
+
+ #if defined(__AVX512VL__) && defined(__AVX512BW__)
+ if(use_isa >= ISA_LEVEL_AVX3) {
+ dataA = _mm_add_epi8(
+ oDataA,
+ _mm_ternarylogic_epi32(
+ _mm_slli_si128(cmpEqA, 1), yencOffset, _mm_set1_epi8(-42-64), 0xac
+ )
+ );
+ dataB = _mm_add_epi8(
+ oDataB,
+ _mm_ternarylogic_epi32(
+ _mm_alignr_epi8(cmpEqB, cmpEqA, 15), _mm_set1_epi8(-42), _mm_set1_epi8(-42-64), 0xac
+ )
+ );
+ } else
+ #endif
+ #if defined(__SSE4_1__)
+ if(_USING_BLEND_ADD) {
+ /* // the following strategy seems more ideal, however, both GCC and Clang go bonkers over it and spill more registers
+ cmpEqA = _mm_blendv_epi8(_mm_set1_epi8(-42), _mm_set1_epi8(-42-64), cmpEqA);
+ cmpEqB = _mm_blendv_epi8(_mm_set1_epi8(-42), _mm_set1_epi8(-42-64), cmpEqB);
+ dataB = _mm_add_epi8(oDataB, _mm_alignr_epi8(cmpEqB, cmpEqA, 15));
+ dataA = _mm_add_epi8(oDataA, _mm_and_si128(
+ _mm_alignr_epi8(cmpEqA, _mm_set1_epi8(-42), 15),
+ yencOffset
+ ));
+ yencOffset = _mm_alignr_epi8(_mm_set1_epi8(-42), cmpEqB, 15);
+ */
+
+ dataA = _mm_add_epi8(
+ oDataA,
+ _mm_blendv_epi8(
+ yencOffset, _mm_set1_epi8(-42-64), _mm_slli_si128(cmpEqA, 1)
+ )
+ );
+ dataB = _mm_add_epi8(
+ oDataB,
+ _mm_blendv_epi8(
+ _mm_set1_epi8(-42), _mm_set1_epi8(-42-64), _mm_alignr_epi8(cmpEqB, cmpEqA, 15)
+ )
+ );
+ yencOffset = _mm_xor_si128(_mm_set1_epi8(-42),
+ _mm_slli_epi16(_mm_cvtsi32_si128((int)escFirst), 6)
+ );
+ } else
+ #endif
+ {
+ cmpEqA = _mm_and_si128(cmpEqA, _mm_set1_epi8(-64));
+ cmpEqB = _mm_and_si128(cmpEqB, _mm_set1_epi8(-64));
+ yencOffset = _mm_add_epi8(_mm_set1_epi8(-42), _mm_srli_si128(cmpEqB, 15));
+ #if defined(__SSSE3__) && !defined(__tune_btver1__)
+ if(use_isa >= ISA_LEVEL_SSSE3)
+ cmpEqB = _mm_alignr_epi8(cmpEqB, cmpEqA, 15);
+ else
+ #endif
+ cmpEqB = _mm_or_si128(
+ _mm_slli_si128(cmpEqB, 1),
+ _mm_srli_si128(cmpEqA, 15)
+ );
+ cmpEqA = _mm_slli_si128(cmpEqA, 1);
+ dataA = _mm_add_epi8(dataA, cmpEqA);
+ dataB = _mm_add_epi8(dataB, cmpEqB);
+ }
+ }
+ // subtract 64 from first element if escFirst == 1
+ #if defined(__AVX512VL__) && defined(__AVX512BW__)
+ if(use_isa >= ISA_LEVEL_AVX3) {
+ yencOffset = _mm_mask_add_epi8(_mm_set1_epi8(-42), (__mmask16)escFirst, _mm_set1_epi8(-42), _mm_set1_epi8(-64));
+ }
+ #endif
+
+ // all that's left is to 'compress' the data (skip over masked chars)
+ #ifdef __SSSE3__
+ if(use_isa >= ISA_LEVEL_SSSE3) {
+ # if defined(__AVX512VBMI2__) && defined(__AVX512VL__) && defined(__POPCNT__)
+ if(use_isa >= ISA_LEVEL_VBMI2) {
+ _mm_mask_compressstoreu_epi8(p, KNOT16(mask), dataA);
+ p -= popcnt32(mask & 0xffff);
+ _mm_mask_compressstoreu_epi8(p+XMM_SIZE, KNOT16(mask>>16), dataB);
+ p -= popcnt32(mask>>16);
+ p += XMM_SIZE*2;
+ } else
+ # endif
+ {
+
+ dataA = _mm_shuffle_epi8(dataA, _mm_load_si128((__m128i*)(lookups->compact + (mask&0x7fff))));
+ STOREU_XMM(p, dataA);
+
+ dataB = _mm_shuffle_epi8(dataB, _mm_load_si128((__m128i*)((char*)lookups->compact + ((mask >> 12) & 0x7fff0))));
+
+ # if defined(__POPCNT__) && !defined(__tune_btver1__)
+ if(use_isa & ISA_FEATURE_POPCNT) {
+ p -= popcnt32(mask & 0xffff);
+ STOREU_XMM(p+XMM_SIZE, dataB);
+ p -= popcnt32(mask & 0xffff0000);
+ p += XMM_SIZE*2;
+ } else
+ # endif
+ {
+ p += lookups->BitsSetTable256inv[mask & 0xff] + lookups->BitsSetTable256inv[(mask >> 8) & 0xff];
+ STOREU_XMM(p, dataB);
+ mask >>= 16;
+ p += lookups->BitsSetTable256inv[mask & 0xff] + lookups->BitsSetTable256inv[(mask >> 8) & 0xff];
+ }
+ }
+ } else
+ #endif
+ {
+ dataA = sse2_compact_vect<use_isa>(mask & 0xffff, dataA);
+ STOREU_XMM(p, dataA);
+ p += lookups->BitsSetTable256inv[mask & 0xff] + lookups->BitsSetTable256inv[(mask >> 8) & 0xff];
+ mask >>= 16;
+ dataB = sse2_compact_vect<use_isa>(mask, dataB);
+ STOREU_XMM(p, dataB);
+ p += lookups->BitsSetTable256inv[mask & 0xff] + lookups->BitsSetTable256inv[(mask >> 8) & 0xff];
+ }
+ #undef LOAD_HALVES
+ } else {
+ if(_USING_BLEND_ADD)
+ dataA = _mm_add_epi8(oDataA, yencOffset);
+ dataB = _mm_add_epi8(oDataB, _mm_set1_epi8(-42));
+
+ STOREU_XMM(p, dataA);
+ STOREU_XMM(p+XMM_SIZE, dataB);
+ p += XMM_SIZE*2;
+ escFirst = 0;
+ yencOffset = _mm_set1_epi8(-42);
+ }
+ }
+ _escFirst = (unsigned char)escFirst;
+ if(isRaw) {
+ if(len != 0) { // have to gone through at least one loop cycle
+ if(src[i-2] == '\r' && src[i-1] == '\n' && src[i] == '.')
+ _nextMask = 1;
+ else if(src[i-1] == '\r' && src[i] == '\n' && src[i+1] == '.')
+ _nextMask = 2;
+ else
+ _nextMask = 0;
+ }
+ } else
+ _nextMask = 0;
+ }
+ #endif
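
For orientation only (not part of the package or the diff above): the new decoder_sse_base.h vectorises ordinary yEnc decoding. The following minimal scalar sketch shows the same rules the SIMD routine implements, assuming raw CRLF-terminated input with RFC3977 dot-stuffing and ignoring the "=y" end-of-data detection handled by the searchEnd template path; the function name yenc_decode_scalar and its signature are hypothetical.

// Illustrative sketch only -- not part of the yencode package.
#include <cstdint>
#include <cstddef>

static size_t yenc_decode_scalar(const uint8_t* src, size_t len, uint8_t* dst) {
    size_t out = 0;
    bool escaped = false;      // previous byte was '='
    size_t lineStart = 0;      // index just past the most recent '\n'
    for (size_t i = 0; i < len; i++) {
        uint8_t c = src[i];
        if (escaped) {
            // byte after '=' is a critical character: stored as value+64+42
            dst[out++] = (uint8_t)(c - 64 - 42);
            escaped = false;
        } else if (c == '=') {
            escaped = true;    // defer to the next byte
        } else if (c == '\r' || c == '\n') {
            if (c == '\n') lineStart = i + 1;
            // line breaks carry no data
        } else if (c == '.' && i == lineStart) {
            // RFC3977 dot-stuffing: drop the first '.' on a line
        } else {
            dst[out++] = (uint8_t)(c - 42);  // plain byte: stored as value+42
        }
    }
    return out;
}

The SIMD code above does the same work in 32-byte blocks: it adds -42 to every byte, builds bitmasks of '=', '\r', '\n' and leading-dot positions, applies the extra -64 to bytes following '=', and then compacts away the masked characters.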