yencode 1.0.8 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/README.md +339 -231
  2. package/binding.gyp +292 -39
  3. package/crcutil-1.0/code/multiword_64_64_gcc_amd64_asm.cc +7 -7
  4. package/crcutil-1.0/code/multiword_64_64_gcc_i386_mmx.cc +14 -14
  5. package/crcutil-1.0/code/multiword_64_64_intrinsic_i386_mmx.cc +1 -1
  6. package/crcutil-1.0/code/uint128_sse2.h +2 -0
  7. package/index.js +329 -22
  8. package/package.json +2 -2
  9. package/src/common.h +299 -0
  10. package/src/crc.cc +95 -0
  11. package/src/crc.h +23 -0
  12. package/src/crc_arm.cc +175 -0
  13. package/src/crc_common.h +4 -0
  14. package/{crc_folding.c → src/crc_folding.cc} +175 -185
  15. package/src/decoder.cc +61 -0
  16. package/src/decoder.h +53 -0
  17. package/src/decoder_avx.cc +18 -0
  18. package/src/decoder_avx2.cc +18 -0
  19. package/src/decoder_avx2_base.h +615 -0
  20. package/src/decoder_common.h +512 -0
  21. package/src/decoder_neon.cc +474 -0
  22. package/src/decoder_neon64.cc +451 -0
  23. package/src/decoder_sse2.cc +16 -0
  24. package/src/decoder_sse_base.h +711 -0
  25. package/src/decoder_ssse3.cc +18 -0
  26. package/src/encoder.cc +170 -0
  27. package/src/encoder.h +21 -0
  28. package/src/encoder_avx.cc +16 -0
  29. package/src/encoder_avx2.cc +16 -0
  30. package/src/encoder_avx_base.h +564 -0
  31. package/src/encoder_common.h +109 -0
  32. package/src/encoder_neon.cc +547 -0
  33. package/src/encoder_sse2.cc +13 -0
  34. package/src/encoder_sse_base.h +724 -0
  35. package/src/encoder_ssse3.cc +18 -0
  36. package/src/hedley.h +1899 -0
  37. package/src/platform.cc +147 -0
  38. package/src/yencode.cc +449 -0
  39. package/test/_maxsize.js +9 -0
  40. package/test/_speedbase.js +147 -0
  41. package/test/speedcrc.js +20 -0
  42. package/test/speeddec.js +92 -0
  43. package/test/speedenc.js +44 -0
  44. package/{testcrc.js → test/testcrc.js} +53 -39
  45. package/test/testdec.js +183 -0
  46. package/test/testenc.js +163 -0
  47. package/test/testpostdec.js +126 -0
  48. package/test.js +0 -91
  49. package/yencode.cc +0 -1622
package/src/encoder_sse_base.h (new file)
@@ -0,0 +1,724 @@
+ #include "common.h"
+
+ #include "encoder.h"
+ #include "encoder_common.h"
+
+ #if defined(__clang__) && __clang_major__ == 6 && __clang_minor__ == 0
+ // VBMI2 introduced in clang 6.0, but 128-bit functions misnamed there; fixed in clang 7.0, but we'll handle those on 6.0
+ # define _mm_mask_expand_epi8 _mm128_mask_expand_epi8
+ #endif
+
+ #if defined(__GNUC__) && __GNUC__ >= 7
+ # define KLOAD16(a, offs) _load_mask16((__mmask16*)(a) + (offs))
+ #else
+ # define KLOAD16(a, offs) (((uint16_t*)(a))[(offs)])
+ #endif
+
+ #pragma pack(16)
+ static struct {
+ 	/*align16*/ struct { __m128i shuf, mix; } shufMix[256];
+ 	unsigned char BitsSetTable256plus8[256];
+ 	uint32_t eolLastChar[256];
+ 	uint16_t eolFirstMask[256];
+ 	uint16_t expandMask[256];
+ 	/*align16*/ int8_t expandMaskmix[33*2*32];
+ 	/*align16*/ int8_t expandShufmaskmix[33*2*32];
+ } * HEDLEY_RESTRICT lookups;
+ #pragma pack()
+
+ template<enum YEncDecIsaLevel use_isa>
+ static void encoder_sse_lut() {
+ 	ALIGN_ALLOC(lookups, sizeof(*lookups), 16);
+ 	for(int i=0; i<256; i++) {
+ 		int k = i;
+ 		uint8_t* res = (uint8_t*)(&(lookups->shufMix[i].shuf));
+ 		uint16_t expand = 0;
+ 		int p = 0;
+ 		for(int j=0; j<8; j++) {
+ 			if(k & 1) {
+ 				res[j+p] = 0xf0 + j;
+ 				p++;
+ 			}
+ 			expand |= 1<<(j+p);
+ 			res[j+p] = j;
+ 			k >>= 1;
+ 		}
+ 		for(; p<8; p++)
+ 			res[8+p] = 8+p +0x40; // +0x40 is an arbitrary value to make debugging slightly easier? the top bit cannot be set
+
+ 		lookups->expandMask[i] = expand;
+
+ 		// calculate add mask for mixing escape chars in
+ 		__m128i shuf = _mm_load_si128((__m128i*)res);
+ 		__m128i maskEsc = _mm_cmpeq_epi8(_mm_and_si128(shuf, _mm_set1_epi8(-16)), _mm_set1_epi8(-16)); // -16 == 0xf0
+ 		__m128i addMask = _mm_and_si128(_mm_slli_si128(maskEsc, 1), _mm_set1_epi8(64));
+ 		addMask = _mm_or_si128(addMask, _mm_and_si128(maskEsc, _mm_set1_epi8('='-42)));
+ 		addMask = _mm_add_epi8(addMask, _mm_set1_epi8(42));
+
+ 		_mm_store_si128(&(lookups->shufMix[i].mix), addMask);
+
+
+ 		lookups->eolLastChar[i] = ((i == 214+'\t' || i == 214+' ' || i == 214+'\0' || i == 214+'\n' || i == 214+'\r' || i == '='-42) ? (((i+42+64)&0xff)<<8)+0x0a0d003d : ((i+42)&0xff)+0x0a0d00);
+ 		lookups->eolFirstMask[i] = ((i == 214+'\t' || i == 214+' ' || i == '.'-42) ? 1 : 0);
+
+ 		lookups->BitsSetTable256plus8[i] = 8 + (
+ 			(i & 1) + ((i>>1) & 1) + ((i>>2) & 1) + ((i>>3) & 1) + ((i>>4) & 1) + ((i>>5) & 1) + ((i>>6) & 1) + ((i>>7) & 1)
+ 		);
+ 	}
+ 	for(int i=0; i<33; i++) {
+ 		int n = (use_isa & ISA_FEATURE_LZCNT) ? (i == 32 ? 32 : 31-i) : (i == 0 ? 32 : i-1);
+ 		for(int j=0; j<32; j++) {
+ 			lookups->expandMaskmix[i*64 + j] = (n>j ? -1 : 0);
+ 			if(j > 15) // mask part
+ 				lookups->expandShufmaskmix[i*64 + j] = (n>=j ? -1 : 0);
+ 			else // shuffle part
+ 				lookups->expandShufmaskmix[i*64 + j] = (n==j ? -1 : (j-(n<j)));
+ 			lookups->expandMaskmix[i*64 + j + 32] = (n==j ? '=' : 42+64*(n==j-1));
+ 			lookups->expandShufmaskmix[i*64 + j + 32] = (n==j ? '=' : 42+64*(n==j-1));
+ 		}
+ 	}
+ }
+
+
+ // for LZCNT/BSF
+ #ifdef _MSC_VER
+ # include <intrin.h>
+ # include <ammintrin.h>
+ static HEDLEY_ALWAYS_INLINE unsigned BSR32(unsigned src) {
+ 	unsigned long result;
+ 	_BitScanReverse((unsigned long*)&result, src);
+ 	return result;
+ }
+ #elif defined(__GNUC__)
+ // have seen Clang not like _bit_scan_reverse
+ # include <x86intrin.h> // for lzcnt
+ # define BSR32(src) (31^__builtin_clz(src))
+ #else
+ # include <x86intrin.h>
+ # define BSR32 _bit_scan_reverse
+ #endif
+
+ template<enum YEncDecIsaLevel use_isa>
+ static HEDLEY_ALWAYS_INLINE __m128i sse2_expand_bytes(unsigned mask, __m128i data) {
+ 	while(mask) {
+ 		// get highest bit
+ 		unsigned bitIndex;
+ 		__m128i mergeMask;
+ #if defined(__LZCNT__)
+ 		if(use_isa & ISA_FEATURE_LZCNT) {
+ 			bitIndex = _lzcnt_u32(mask);
+ 			mergeMask = _mm_load_si128((const __m128i*)lookups->expandMaskmix + bitIndex*4);
+ 			mask &= 0x7fffffffU>>bitIndex;
+ 		} else
+ #endif
+ 		{
+ 			// TODO: consider LUT for when BSR is slow
+ 			bitIndex = BSR32(mask);
+ 			mergeMask = _mm_load_si128((const __m128i*)lookups->expandMaskmix + (bitIndex+1)*4);
+ 			mask ^= 1<<bitIndex;
+ 		}
+ 		// perform expansion
+ 		data = _mm_or_si128(
+ 			_mm_and_si128(mergeMask, data),
+ 			_mm_slli_si128(_mm_andnot_si128(mergeMask, data), 1)
+ 		);
+ 	}
+ 	return data;
+ }
+
+ template<enum YEncDecIsaLevel use_isa>
+ static HEDLEY_ALWAYS_INLINE uintptr_t sse2_expand_store_vector(__m128i data, unsigned int mask, unsigned maskP1, unsigned maskP2, uint8_t* p, unsigned int& shufLenP1, unsigned int& shufLenP2) {
+ 	// TODO: consider 1 bit shortcut (slightly tricky with needing bit counts though)
+ 	if(mask) {
+ 		__m128i dataA = sse2_expand_bytes<use_isa>(maskP1, data);
+ 		__m128i dataB = sse2_expand_bytes<use_isa>(maskP2, _mm_srli_si128(data, 8));
+ 		dataA = _mm_add_epi8(dataA, _mm_load_si128(&(lookups->shufMix[maskP1].mix)));
+ 		dataB = _mm_add_epi8(dataB, _mm_load_si128(&(lookups->shufMix[maskP2].mix)));
+ 		shufLenP1 = lookups->BitsSetTable256plus8[maskP1];
+ 		shufLenP2 = shufLenP1 + lookups->BitsSetTable256plus8[maskP2];
+ 		STOREU_XMM(p, dataA);
+ 		STOREU_XMM(p+shufLenP1, dataB);
+ 		return shufLenP2;
+ 	} else {
+ 		STOREU_XMM(p, _mm_sub_epi8(data, _mm_set1_epi8(-42)));
+ 		shufLenP1 = 8;
+ 		shufLenP2 = 16;
+ 		return XMM_SIZE;
+ 	}
+ }
+
+
+ template<enum YEncDecIsaLevel use_isa>
+ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uint8_t* HEDLEY_RESTRICT srcEnd, uint8_t* HEDLEY_RESTRICT& dest, size_t& len) {
+ 	// offset position to enable simpler loop condition checking
+ 	const int INPUT_OFFSET = XMM_SIZE*4+1 -1; // EOL handling reads an additional byte, -1 to change <= to <
+ 	if(len <= INPUT_OFFSET || line_size < XMM_SIZE) return;
+
+ 	// slower CPUs prefer to branch as mispredict penalty is probably small relative to general execution
+ #if defined(__tune_atom__) || defined(__tune_slm__) || defined(__tune_btver1__)
+ 	const bool _PREFER_BRANCHING = true;
+ #else
+ 	const bool _PREFER_BRANCHING = (use_isa < ISA_LEVEL_SSSE3);
+ #endif
+
+ 	uint8_t *p = dest; // destination pointer
+ 	intptr_t i = -(intptr_t)len; // input position
+ 	intptr_t lineSizeOffset = -line_size +1;
+ 	//intptr_t col = *colOffset - line_size +1; // for some reason, this causes GCC-8 to spill an extra register, causing the main loop to run ~5% slower, so use the alternative version below
+ 	intptr_t col = *colOffset + lineSizeOffset;
+
+ 	i += INPUT_OFFSET;
+ 	const uint8_t* es = srcEnd - INPUT_OFFSET;
+
+ 	if(HEDLEY_UNLIKELY(col >= 0)) {
+ 		uint8_t c = es[i++];
+ 		if(col == 0) {
+ 			uint32_t eolChar = lookups->eolLastChar[c];
+ 			*(uint32_t*)p = eolChar;
+ 			p += 3 + (eolChar>>27);
+ 			col = -line_size+1;
+ 		} else {
+ 			if (LIKELIHOOD(0.0273, escapedLUT[c]!=0)) {
+ 				*(uint32_t*)p = UINT32_16_PACK(UINT16_PACK('\r', '\n'), (uint32_t)escapedLUT[c]);
+ 				p += 4;
+ 				col = 2-line_size + 1;
+ 			} else {
+ 				*(uint32_t*)p = UINT32_PACK('\r', '\n', (uint32_t)(c+42), 0);
+ 				p += 3;
+ 				col = 2-line_size;
+ 			}
+ 		}
+ 	}
+ 	if (HEDLEY_LIKELY(col == -line_size+1)) {
+ 		uint8_t c = es[i++];
+ 		if (LIKELIHOOD(0.0273, escapedLUT[c] != 0)) {
+ 			*(uint16_t*)p = escapedLUT[c];
+ 			p += 2;
+ 			col += 2;
+ 		} else {
+ 			*(p++) = c + 42;
+ 			col += 1;
+ 		}
+ 	}
+ 	do {
+ 		__m128i dataA = _mm_loadu_si128((__m128i *)(es + i)); // probably not worth the effort to align
+ 		__m128i dataB = _mm_loadu_si128((__m128i *)(es + i) +1);
+
+ 		i += XMM_SIZE*2;
+ 		// search for special chars
+ 		__m128i cmpA, cmpB;
+ #if defined(__SSSE3__) && !defined(__tune_atom__) && !defined(__tune_slm__) && !defined(__tune_btver1__)
+ 		if(use_isa >= ISA_LEVEL_SSSE3) {
+ 			cmpA = _mm_cmpeq_epi8(
+ 				_mm_shuffle_epi8(_mm_set_epi8(
+ 					'\0'-42,-42,'\r'-42,'.'-42,'='-42,'\0'-42,'\t'-42,'\n'-42,-42,-42,'\r'-42,-42,'='-42,' '-42,-42,'\n'-42
+ 				), _mm_abs_epi8(dataA)),
+ 				dataA
+ 			);
+ 		} else
+ #endif
+ 		{
+ 			cmpA = _mm_or_si128(
+ 				_mm_or_si128(
+ 					_mm_cmpeq_epi8(dataA, _mm_set1_epi8(-42)),
+ 					_mm_cmpeq_epi8(dataA, _mm_set1_epi8('\n'-42))
+ 				),
+ 				_mm_or_si128(
+ 					_mm_cmpeq_epi8(dataA, _mm_set1_epi8('='-42)),
+ 					_mm_cmpeq_epi8(dataA, _mm_set1_epi8('\r'-42))
+ 				)
+ 			);
+ 		}
+
+ 		_encode_loop_branchA:
+ 		unsigned int maskA = _mm_movemask_epi8(cmpA);
+ 		_encode_loop_branchB:
+
+ #if defined(__SSSE3__) && !defined(__tune_atom__) && !defined(__tune_slm__) && !defined(__tune_btver1__)
+ 		if(use_isa >= ISA_LEVEL_SSSE3) {
+ 			cmpB = _mm_cmpeq_epi8(
+ 				_mm_shuffle_epi8(_mm_set_epi8(
+ 					'\0'-42,-42,'\r'-42,'.'-42,'='-42,'\0'-42,'\t'-42,'\n'-42,-42,-42,'\r'-42,-42,'='-42,' '-42,-42,'\n'-42
+ 				), _mm_abs_epi8(dataB)),
+ 				dataB
+ 			);
+ 		} else
+ #endif
+ 		{
+ 			cmpB = _mm_or_si128(
+ 				_mm_or_si128(
+ 					_mm_cmpeq_epi8(dataB, _mm_set1_epi8(-42)),
+ 					_mm_cmpeq_epi8(dataB, _mm_set1_epi8('\n'-42))
+ 				),
+ 				_mm_or_si128(
+ 					_mm_cmpeq_epi8(dataB, _mm_set1_epi8('='-42)),
+ 					_mm_cmpeq_epi8(dataB, _mm_set1_epi8('\r'-42))
+ 				)
+ 			);
+ 		}
+ 		unsigned int maskB = _mm_movemask_epi8(cmpB);
+
+ 		uint32_t mask = (maskB<<16) | maskA;
+ 		intptr_t bitIndex; // because you can't goto past variable declarations...
+ 		intptr_t maskBits, outputBytes;
+
+ 		bool manyBitsSet;
+ #if defined(__POPCNT__) && !defined(__tune_btver1__)
+ 		if(use_isa & ISA_FEATURE_POPCNT) {
+ 			maskBits = popcnt32(mask);
+ 			outputBytes = maskBits + XMM_SIZE*2;
+ 			manyBitsSet = maskBits > 1;
+ 		} else
+ #endif
+ 		{
+ 			manyBitsSet = (mask & (mask-1)) != 0;
+ 		}
+
+ 		if (LIKELIHOOD(0.089, manyBitsSet)) {
+ 			_encode_loop_branch_slow:
+ 			unsigned m1 = maskA & 0xFF;
+ 			unsigned m2 = maskA >> 8;
+ 			unsigned m3 = maskB & 0xFF;
+ 			unsigned m4 = maskB >> 8;
+ 			unsigned int shuf1Len, shuf2Len, shuf3Len;
+ 			__m128i shuf1A, shuf1B, shuf2A, shuf2B; // only used for SSSE3 path
+ 			__m128i data1A, data1B, data2A, data2B;
+
+ #if defined(__AVX512VBMI2__) && defined(__AVX512VL__) && defined(__AVX512BW__)
+ 			if(use_isa >= ISA_LEVEL_VBMI2) {
+ 				// do +42 and +64 to data
+ 				dataA = _mm_sub_epi8(dataA, _mm_set1_epi8(-42));
+ 				dataA = _mm_ternarylogic_epi32(dataA, cmpA, _mm_set1_epi8(64), 0xf8); // data | (cmp & 64)
+ 				dataB = _mm_sub_epi8(dataB, _mm_set1_epi8(-42));
+ 				dataB = _mm_ternarylogic_epi32(dataB, cmpB, _mm_set1_epi8(64), 0xf8);
+
+ 				/* alternative no-LUT 64-bit only version
+ 				 * LUT generally seems to be faster though
+ 				//uint64_t expandMask = _pdep_u64(mask, 0x5555555555555555); // expand bits
+ 				//expandMask = ~_pext_u64(expandMask, expandMask|~0x5555555555555555);
+ 				uint64_t expandMask = ~_pdep_u64(~mask, 0x5555555555555555); // expand bits, with bits set
+ 				expandMask = _pext_u64(expandMask^0x5555555555555555, expandMask);
+ 				data2A = _mm_mask_expand_epi8(_mm_set1_epi8('='), expandMask>>16, _mm_srli_si128(dataA, 8));
+ 				data1A = _mm_mask_expand_epi8(_mm_set1_epi8('='), expandMask , dataA);
+ 				data2B = _mm_mask_expand_epi8(_mm_set1_epi8('='), expandMask>>48, _mm_srli_si128(dataB, 8));
+ 				data1B = _mm_mask_expand_epi8(_mm_set1_epi8('='), expandMask>>32, dataB);
+ 				*/
+ 				data2A = _mm_mask_expand_epi8(_mm_set1_epi8('='), KLOAD16(lookups->expandMask, m2), _mm_srli_si128(dataA, 8));
+ 				data1A = _mm_mask_expand_epi8(_mm_set1_epi8('='), KLOAD16(lookups->expandMask, m1), dataA);
+ 				data2B = _mm_mask_expand_epi8(_mm_set1_epi8('='), KLOAD16(lookups->expandMask, m4), _mm_srli_si128(dataB, 8));
+ 				data1B = _mm_mask_expand_epi8(_mm_set1_epi8('='), KLOAD16(lookups->expandMask, m3), dataB);
+ 			} else
+ #endif
+ #ifdef __SSSE3__
+ 			if(use_isa >= ISA_LEVEL_SSSE3) {
+ 				// perform lookup for shuffle mask
+ 				shuf1A = _mm_load_si128(&(lookups->shufMix[m1].shuf));
+ 				shuf2A = _mm_load_si128(&(lookups->shufMix[m2].shuf));
+ 				shuf1B = _mm_load_si128(&(lookups->shufMix[m3].shuf));
+ 				shuf2B = _mm_load_si128(&(lookups->shufMix[m4].shuf));
+
+ 				// second mask processes on second half, so add to the offsets
+ 				shuf2A = _mm_or_si128(shuf2A, _mm_set1_epi8(8));
+ 				shuf2B = _mm_or_si128(shuf2B, _mm_set1_epi8(8));
+
+ 				// expand halves
+ 				data2A = _mm_shuffle_epi8(dataA, shuf2A);
+ 				data1A = _mm_shuffle_epi8(dataA, shuf1A);
+ 				data2B = _mm_shuffle_epi8(dataB, shuf2B);
+ 				data1B = _mm_shuffle_epi8(dataB, shuf1B);
+
+ 				// add in escaped chars
+ 				data1A = _mm_add_epi8(data1A, _mm_load_si128(&(lookups->shufMix[m1].mix)));
+ 				data2A = _mm_add_epi8(data2A, _mm_load_si128(&(lookups->shufMix[m2].mix)));
+ 				data1B = _mm_add_epi8(data1B, _mm_load_si128(&(lookups->shufMix[m3].mix)));
+ 				data2B = _mm_add_epi8(data2B, _mm_load_si128(&(lookups->shufMix[m4].mix)));
+ 			} else
+ #endif
+ 			{
+ 				p += sse2_expand_store_vector<use_isa>(dataA, maskA, m1, m2, p, shuf1Len, shuf2Len);
+ 				unsigned int shuf4Len;
+ 				p += sse2_expand_store_vector<use_isa>(dataB, maskB, m3, m4, p, shuf3Len, shuf4Len);
+ 				shuf3Len += shuf2Len;
+ #if !defined(__tune_btver1__)
+ 				if(!(use_isa & ISA_FEATURE_POPCNT))
+ #endif
+ 					outputBytes = shuf2Len + shuf4Len;
+ 			}
+
+ 			if(use_isa >= ISA_LEVEL_SSSE3) {
+ 				// store out
+ #if defined(__POPCNT__) && !defined(__tune_btver1__)
+ 				if(use_isa & ISA_FEATURE_POPCNT) {
+ 					shuf2Len = popcnt32(maskA) + 16;
+ # if defined(__tune_znver3__) || defined(__tune_znver2__) || defined(__tune_znver1__) || defined(__tune_btver2__)
+ 					shuf1Len = popcnt32(m1) + 8;
+ 					shuf3Len = popcnt32(m3) + shuf2Len + 8;
+ # else
+ 					shuf1Len = lookups->BitsSetTable256plus8[m1];
+ 					shuf3Len = lookups->BitsSetTable256plus8[m3] + shuf2Len;
+ # endif
+ 				} else
+ #endif
+ 				{
+ 					shuf1Len = lookups->BitsSetTable256plus8[m1];
+ 					shuf2Len = shuf1Len + lookups->BitsSetTable256plus8[m2];
+ 					shuf3Len = shuf2Len + lookups->BitsSetTable256plus8[m3];
+ 					outputBytes = shuf3Len + lookups->BitsSetTable256plus8[m4];
+ 				}
+ 				STOREU_XMM(p, data1A);
+ 				STOREU_XMM(p+shuf1Len, data2A);
+ 				STOREU_XMM(p+shuf2Len, data1B);
+ 				STOREU_XMM(p+shuf3Len, data2B);
+ 				p += outputBytes;
+ 			}
+ 			col += outputBytes;
+
+ 			if(LIKELIHOOD(0.3 /*guess, using 128b lines*/, col >= 0)) {
+ 				uintptr_t bitCount;
+ 				intptr_t shiftAmt = (outputBytes - shuf2Len) - col -1;
+ 				uint32_t eqMask;
+ 				if(HEDLEY_UNLIKELY(shiftAmt < 0)) {
+ 					shiftAmt += shuf2Len;
+ 					i -= 16;
+ 					if(use_isa >= ISA_LEVEL_VBMI2 || use_isa < ISA_LEVEL_SSSE3) {
+ 						eqMask =
+ 							((uint32_t)lookups->expandMask[m2] << shuf1Len)
+ 							| (uint32_t)lookups->expandMask[m1];
+ 					} else {
+ 						eqMask =
+ 							((uint32_t)_mm_movemask_epi8(shuf2A) << shuf1Len)
+ 							| (uint32_t)_mm_movemask_epi8(shuf1A);
+ 						i += outputBytes - shuf2Len;
+ 					}
+ 				} else {
+ 					if(use_isa >= ISA_LEVEL_VBMI2 || use_isa < ISA_LEVEL_SSSE3) {
+ 						eqMask =
+ 							((uint32_t)lookups->expandMask[m4] << (shuf3Len-shuf2Len))
+ 							| (uint32_t)lookups->expandMask[m3];
+ 					} else {
+ 						eqMask =
+ 							((uint32_t)_mm_movemask_epi8(shuf2B) << (shuf3Len-shuf2Len))
+ 							| (uint32_t)_mm_movemask_epi8(shuf1B);
+ 					}
+ 				}
+
+ 				if(use_isa >= ISA_LEVEL_VBMI2 || use_isa < ISA_LEVEL_SSSE3) {
+ #if defined(__GNUC__)
+ 					// be careful to avoid partial flag stalls on Intel P6 CPUs (SHR+ADC will likely stall)
+ # if !(defined(__tune_amdfam10__) || defined(__tune_k8__))
+ 					if(use_isa >= ISA_LEVEL_VBMI2)
+ # endif
+ 					{
+ 						asm(
+ 							"shrl $1, %[eqMask] \n"
+ 							"shrl %%cl, %[eqMask] \n" // TODO: can use shrq to avoid above shift?
+ # if defined(PLATFORM_AMD64)
+ 							"adcq %[col], %[p] \n"
+ # else
+ 							"adcl %[col], %[p] \n"
+ # endif
+ 							: [eqMask]"+r"(eqMask), [p]"+r"(p)
+ 							: "c"(shiftAmt), [col]"r"(~col)
+ 						);
+ 					}
+ # if !(defined(__tune_amdfam10__) || defined(__tune_k8__))
+ 					else
+ # else
+ 					if(0)
+ # endif
+ #endif
+ 					{
+ 						eqMask >>= shiftAmt;
+ 						p -= col;
+ 						if(LIKELIHOOD(0.98, (eqMask & 1) != 1))
+ 							p--;
+ 						else
+ 							i++;
+ 					}
+ 				} else {
+ 					eqMask >>= shiftAmt;
+ 					col += eqMask & 1; // revert if escape char
+ 				}
+
+ #if defined(__POPCNT__)
+ 				if(use_isa & ISA_FEATURE_POPCNT) {
+ 					bitCount = popcnt32(eqMask);
+ 				} else
+ #endif
+ 				{
+ 					unsigned char cnt = lookups->BitsSetTable256plus8[eqMask & 0xff];
+ 					cnt += lookups->BitsSetTable256plus8[(eqMask>>8) & 0xff];
+ 					cnt += lookups->BitsSetTable256plus8[(eqMask>>16) & 0xff];
+ 					cnt += lookups->BitsSetTable256plus8[(eqMask>>24) & 0xff];
+ 					bitCount = (uintptr_t)cnt - 32;
+ 				}
+
+ 				if(use_isa >= ISA_LEVEL_VBMI2 || use_isa < ISA_LEVEL_SSSE3) {
+ 					i -= bitCount;
+ 					goto _encode_eol_handle_pre;
+ 				} else {
+ 					i += bitCount;
+ 					goto _encode_eol_handle_pre_adjust;
+ 				}
+ 			}
+ 		} else {
+ 			if(_PREFER_BRANCHING && LIKELIHOOD(0.663, !mask)) {
+ 				_encode_loop_branch_fast_noesc:
+ 				dataA = _mm_sub_epi8(dataA, _mm_set1_epi8(-42));
+ 				dataB = _mm_sub_epi8(dataB, _mm_set1_epi8(-42));
+ 				STOREU_XMM(p, dataA);
+ 				STOREU_XMM(p+XMM_SIZE, dataB);
+ 				p += XMM_SIZE*2;
+ 				col += XMM_SIZE*2;
+ 				if(LIKELIHOOD(0.15, col >= 0))
+ 					goto _encode_eol_handle_pre_adjust;
+ 				continue;
+ 			}
+ 			// shortcut for common case of only 1 bit set
+ 			_encode_loop_branch_fast_1ch:
+ #if defined(__AVX512VL__) && defined(__AVX512BW__)
+ 			if(use_isa >= ISA_LEVEL_AVX3) {
+ 				dataA = _mm_sub_epi8(dataA, _mm_set1_epi8(-42));
+ 				dataA = _mm_ternarylogic_epi32(dataA, cmpA, _mm_set1_epi8(64), 0xf8); // data | (cmp & 64)
+ 				dataB = _mm_sub_epi8(dataB, _mm_set1_epi8(-42));
+ 				dataB = _mm_ternarylogic_epi32(dataB, cmpB, _mm_set1_epi8(64), 0xf8);
+
+ 				// store last char
+ 				_mm_mask_storeu_epi8(p+XMM_SIZE+1, 1<<15, dataB);
+
+ 				uint32_t blendMask = (uint32_t)(-(int32_t)mask);
+ 				dataB = _mm_mask_alignr_epi8(dataB, blendMask>>16, dataB, dataA, 15);
+ 				dataB = _mm_ternarylogic_epi32(dataB, cmpB, _mm_set1_epi8('='), 0xb8); // (data & ~cmp) | (cmp & '=')
+
+ # if defined(__AVX512VBMI2__)
+ 				if(use_isa >= ISA_LEVEL_VBMI2)
+ 					dataA = _mm_mask_expand_epi8(_mm_set1_epi8('='), ~mask, dataA);
+ 				else
+ # endif
+ 				{
+ 					dataA = _mm_mask_alignr_epi8(dataA, blendMask, dataA, dataA, 15); // there's no masked shift, so use ALIGNR instead
+ 					dataA = _mm_ternarylogic_epi32(dataA, cmpA, _mm_set1_epi8('='), 0xb8);
+ 				}
+ 			} else
+ #endif
+ 			{
+
+ #if !defined(__tune_btver1__)
+ 				if(!(use_isa & ISA_FEATURE_POPCNT))
+ #endif
+ 					maskBits = (mask != 0);
+ 				if(_PREFER_BRANCHING) maskBits = 1;
+ #if !defined(__tune_btver1__)
+ 				if(!(use_isa & ISA_FEATURE_POPCNT))
+ #endif
+ 					outputBytes = XMM_SIZE*2 + maskBits;
+
+ #if defined(__LZCNT__)
+ 				if(use_isa & ISA_FEATURE_LZCNT)
+ 					bitIndex = _lzcnt_u32(mask);
+ 				else
+ #endif
+ 				{
+ 					bitIndex = BSR32(mask);
+ 					bitIndex |= maskBits-1; // if(mask == 0) bitIndex = -1;
+ 				}
+ 				const __m128i* entries;
+
+ #if defined(__SSSE3__) && !defined(__tune_atom__) && !defined(__tune_slm__) && !defined(__tune_btver1__)
+ 				if(use_isa >= ISA_LEVEL_SSSE3) {
+ 					entries = (const __m128i*)lookups->expandShufmaskmix;
+ 					if(!(use_isa & ISA_FEATURE_LZCNT))
+ 						entries += 4;
+ 					entries += bitIndex*4;
+
+ 					__m128i shufMaskA = _mm_load_si128(entries+0);
+ 					__m128i mergeMaskB = _mm_load_si128(entries+1);
+ 					__m128i dataBShifted = _mm_alignr_epi8(dataB, dataA, 15);
+ 					dataB = _mm_andnot_si128(cmpB, dataB);
+
+ 					dataA = _mm_shuffle_epi8(dataA, shufMaskA);
+
+ # if defined(__SSE4_1__) && !defined(__tune_slm__) && !defined(__tune_goldmont__) && !defined(__tune_goldmont_plus__) && !defined(__tune_tremont__)
+ 					// unsure if worth on: Jaguar/Puma (3|2), Core2 (2|2)
+ 					if(use_isa >= ISA_LEVEL_SSE41) {
+ 						dataB = _mm_blendv_epi8(dataBShifted, dataB, mergeMaskB);
+ 					} else
+ # endif
+ 					{
+ 						dataB = _mm_or_si128(
+ 							_mm_and_si128(mergeMaskB, dataB),
+ 							_mm_andnot_si128(mergeMaskB, dataBShifted)
+ 						);
+ 					}
+ 				} else
+ #endif
+ 				{
+
+ 					entries = (const __m128i*)lookups->expandMaskmix;
+ 					if(!(use_isa & ISA_FEATURE_LZCNT))
+ 						entries += 4;
+ 					entries += bitIndex*4;
+
+ 					__m128i mergeMaskA = _mm_load_si128(entries+0);
+ 					__m128i mergeMaskB = _mm_load_si128(entries+1);
+ 					// TODO: consider deferring mask operation? (does require an extra ANDN but may help with L1 latency)
+ 					__m128i dataAMasked = _mm_andnot_si128(mergeMaskA, dataA);
+ 					__m128i dataBMasked = _mm_andnot_si128(mergeMaskB, dataB);
+ 					__m128i dataAShifted = _mm_slli_si128(dataAMasked, 1);
+ 					__m128i dataBShifted;
+
+ #if defined(__SSSE3__) && !defined(__tune_btver1__)
+ 					if(use_isa >= ISA_LEVEL_SSSE3)
+ 						dataBShifted = _mm_alignr_epi8(dataBMasked, dataAMasked, 15);
+ 					else
+ #endif
+ 						dataBShifted = _mm_or_si128(
+ 							_mm_slli_si128(dataBMasked, 1),
+ 							_mm_srli_si128(dataAMasked, 15)
+ 						);
+
+ 					// alternatively `_mm_xor_si128(dataAMasked, dataA)` if compiler wants to load mergeMask* again
+ 					dataB = _mm_or_si128(
+ 						_mm_and_si128(mergeMaskB, dataB), dataBShifted
+ 					);
+ 					dataA = _mm_or_si128(
+ 						_mm_and_si128(mergeMaskA, dataA), dataAShifted
+ 					);
+ 				}
+ 				// add escape chars
+ 				dataA = _mm_add_epi8(dataA, _mm_load_si128(entries+2));
+ 				dataB = _mm_add_epi8(dataB, _mm_load_si128(entries+3));
+
+ 				// store final char
+ 				p[XMM_SIZE*2] = es[i-1] + 42 + (64 & (mask>>(XMM_SIZE*2-1-6)));
+ 			}
+
+ 			// store main part
+ 			STOREU_XMM(p, dataA);
+ 			STOREU_XMM(p+XMM_SIZE, dataB);
+
+ 			p += outputBytes;
+ 			col += outputBytes;
+
+ 			if(LIKELIHOOD(0.3, col >= 0)) {
+ #if defined(__AVX512VL__)
+ 				if(use_isa >= ISA_LEVEL_AVX3)
+ 					bitIndex = _lzcnt_u32(mask) +1;
+ 				else
+ #endif
+ 				if(use_isa & ISA_FEATURE_LZCNT)
+ 					bitIndex = bitIndex +1;
+ 				else
+ 					bitIndex = 31-bitIndex +1;
+ 				if(HEDLEY_UNLIKELY(col == bitIndex)) {
+ 					// this is an escape character, so line will need to overflow
+ 					p--;
+ 				} else {
+ 					i += (col > bitIndex);
+ 				}
+ 				_encode_eol_handle_pre_adjust:
+ 				p -= col;
+ 				i -= col;
+
+ 				_encode_eol_handle_pre:
+ 				uint32_t eolChar = lookups->eolLastChar[es[i]];
+ 				*(uint32_t*)p = eolChar;
+ 				p += 3 + (eolChar>>27);
+ 				col = lineSizeOffset;
+
+ 				if(HEDLEY_UNLIKELY(i >= 0)) { // this isn't really a proper check - it's only needed to support short lines; basically, if the line is too short, `i` never gets checked, so we need one somewhere
+ 					i++;
+ 					break;
+ 				}
+
+ 				dataA = _mm_loadu_si128((__m128i *)(es + i + 1));
+ 				dataB = _mm_loadu_si128((__m128i *)(es + i + 1) + 1);
+ 				// search for special chars (EOL)
+ #if defined(__SSSE3__) && !defined(__tune_atom__) && !defined(__tune_slm__) && !defined(__tune_btver1__)
+ 				if(use_isa >= ISA_LEVEL_SSSE3) {
+ 					cmpA = _mm_cmpeq_epi8(
+ 						_mm_shuffle_epi8(_mm_set_epi8(
+ 							'\0'-42,-42,'\r'-42,'.'-42,'='-42,'\0'-42,'\t'-42,'\n'-42,-42,-42,'\r'-42,-42,'='-42,' '-42,-42,'\n'-42
+ 						), _mm_adds_epi8(
+ 							_mm_abs_epi8(dataA), _mm_cvtsi32_si128(88)
+ 						)),
+ 						dataA
+ 					);
+ 					i += XMM_SIZE*2 + 1;
+ # if defined(__GNUC__) && !defined(__clang__)
+ 					// GCC seems to have trouble keeping track of variable usage and spills many of them if we goto after declarations; Clang9 seems to be fine, or if _PREFER_BRANCHING is used
+ 					if(!_PREFER_BRANCHING)
+ 						goto _encode_loop_branchA;
+ # endif
+ 					maskA = _mm_movemask_epi8(cmpA);
+ 					cmpB = _mm_cmpeq_epi8(
+ 						_mm_shuffle_epi8(_mm_set_epi8(
+ 							'\0'-42,-42,'\r'-42,'.'-42,'='-42,'\0'-42,'\t'-42,'\n'-42,-42,-42,'\r'-42,-42,'='-42,' '-42,-42,'\n'-42
+ 						), _mm_abs_epi8(dataB)),
+ 						dataB
+ 					);
+ 				} else
+ #endif
+ 				{
+ 					cmpA = _mm_or_si128(
+ 						_mm_or_si128(
+ 							_mm_cmpeq_epi8(dataA, _mm_set1_epi8(-42)),
+ 							_mm_cmpeq_epi8(dataA, _mm_set1_epi8('\n'-42))
+ 						),
+ 						_mm_or_si128(
+ 							_mm_cmpeq_epi8(dataA, _mm_set1_epi8('='-42)),
+ 							_mm_cmpeq_epi8(dataA, _mm_set1_epi8('\r'-42))
+ 						)
+ 					);
+ 					maskA = _mm_movemask_epi8(cmpA);
+ 					maskA |= lookups->eolFirstMask[es[i+1]];
+ 					i += XMM_SIZE*2 + 1;
+ #if defined(__GNUC__) && !defined(__clang__)
+ 					if(!_PREFER_BRANCHING)
+ 						goto _encode_loop_branchB;
+ #endif
+ 					cmpB = _mm_or_si128(
+ 						_mm_or_si128(
+ 							_mm_cmpeq_epi8(dataB, _mm_set1_epi8(-42)),
+ 							_mm_cmpeq_epi8(dataB, _mm_set1_epi8('\n'-42))
+ 						),
+ 						_mm_or_si128(
+ 							_mm_cmpeq_epi8(dataB, _mm_set1_epi8('='-42)),
+ 							_mm_cmpeq_epi8(dataB, _mm_set1_epi8('\r'-42))
+ 						)
+ 					);
+ 				}
+ 				maskB = _mm_movemask_epi8(cmpB);
+
+ 				mask = (maskB<<16) | maskA;
+ 				bool manyBitsSet; // don't retain this across loop cycles
+ #if defined(__POPCNT__) && !defined(__tune_btver1__)
+ 				if(use_isa & ISA_FEATURE_POPCNT) {
+ 					maskBits = popcnt32(mask);
+ 					outputBytes = maskBits + XMM_SIZE*2;
+ 					manyBitsSet = maskBits > 1;
+ 				} else
+ #endif
+ 				{
+ 					manyBitsSet = (mask & (mask-1)) != 0;
+ 				}
+
+ 				if (LIKELIHOOD(0.089, manyBitsSet))
+ 					goto _encode_loop_branch_slow;
+ 				if(_PREFER_BRANCHING && LIKELIHOOD(0.663, !mask))
+ 					goto _encode_loop_branch_fast_noesc;
+ 				goto _encode_loop_branch_fast_1ch;
+ 				if(0) { // silence unused label warnings
+ 					goto _encode_loop_branchA;
+ 					goto _encode_loop_branchB;
+ 				}
+ 			}
+
+ 		}
+ 	} while(i < 0);
+
+ 	*colOffset = (int)(col + line_size -1);
+ 	dest = p;
+ 	len = -(i - INPUT_OFFSET);
+ }
+
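For readers unfamiliar with yEnc, the transform the kernel above vectorizes is the standard yEnc rule already visible in its constants: every byte is offset by +42, and the bytes that collide with protocol characters (NUL, LF, CR, '=', compared above as '\0'-42, '\n'-42, '\r'-42 and '='-42) are escaped as '=' followed by the byte offset by a further +64. A minimal scalar sketch of that per-byte rule, written for illustration only (it is not part of the package and omits line wrapping and the leading dot/space/tab edge cases):

#include <cstddef>
#include <cstdint>

// Scalar reference for the yEnc byte transform: offset every byte by +42 and
// escape NUL, LF, CR and '=' as '=' followed by the byte offset by a further +64.
// The caller must size dst for the worst case of 2*len output bytes.
static size_t yenc_encode_scalar(const uint8_t* src, size_t len, uint8_t* dst) {
	size_t out = 0;
	for(size_t i = 0; i < len; i++) {
		uint8_t c = src[i] + 42;   // wraps mod 256, as yEnc requires
		if(c == '\0' || c == '\n' || c == '\r' || c == '=') {
			dst[out++] = '=';      // escape marker
			c += 64;               // escaped bytes get an extra +64
		}
		dst[out++] = c;
	}
	return out;                    // number of bytes written
}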