yencode 1.0.8 → 1.1.2

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (49):
  1. package/README.md +339 -231
  2. package/binding.gyp +292 -39
  3. package/crcutil-1.0/code/multiword_64_64_gcc_amd64_asm.cc +7 -7
  4. package/crcutil-1.0/code/multiword_64_64_gcc_i386_mmx.cc +14 -14
  5. package/crcutil-1.0/code/multiword_64_64_intrinsic_i386_mmx.cc +1 -1
  6. package/crcutil-1.0/code/uint128_sse2.h +2 -0
  7. package/index.js +329 -22
  8. package/package.json +2 -2
  9. package/src/common.h +299 -0
  10. package/src/crc.cc +95 -0
  11. package/src/crc.h +23 -0
  12. package/src/crc_arm.cc +175 -0
  13. package/src/crc_common.h +4 -0
  14. package/{crc_folding.c → src/crc_folding.cc} +175 -185
  15. package/src/decoder.cc +61 -0
  16. package/src/decoder.h +53 -0
  17. package/src/decoder_avx.cc +18 -0
  18. package/src/decoder_avx2.cc +18 -0
  19. package/src/decoder_avx2_base.h +615 -0
  20. package/src/decoder_common.h +512 -0
  21. package/src/decoder_neon.cc +474 -0
  22. package/src/decoder_neon64.cc +451 -0
  23. package/src/decoder_sse2.cc +16 -0
  24. package/src/decoder_sse_base.h +711 -0
  25. package/src/decoder_ssse3.cc +18 -0
  26. package/src/encoder.cc +170 -0
  27. package/src/encoder.h +21 -0
  28. package/src/encoder_avx.cc +16 -0
  29. package/src/encoder_avx2.cc +16 -0
  30. package/src/encoder_avx_base.h +564 -0
  31. package/src/encoder_common.h +109 -0
  32. package/src/encoder_neon.cc +547 -0
  33. package/src/encoder_sse2.cc +13 -0
  34. package/src/encoder_sse_base.h +724 -0
  35. package/src/encoder_ssse3.cc +18 -0
  36. package/src/hedley.h +1899 -0
  37. package/src/platform.cc +147 -0
  38. package/src/yencode.cc +449 -0
  39. package/test/_maxsize.js +9 -0
  40. package/test/_speedbase.js +147 -0
  41. package/test/speedcrc.js +20 -0
  42. package/test/speeddec.js +92 -0
  43. package/test/speedenc.js +44 -0
  44. package/{testcrc.js → test/testcrc.js} +53 -39
  45. package/test/testdec.js +183 -0
  46. package/test/testenc.js +163 -0
  47. package/test/testpostdec.js +126 -0
  48. package/test.js +0 -91
  49. package/yencode.cc +0 -1622
package/yencode.cc DELETED
@@ -1,1622 +0,0 @@
1
-
2
- #include <node.h>
3
- #include <node_buffer.h>
4
- #include <node_version.h>
5
- #include <v8.h>
6
- #include <stdlib.h>
7
-
8
- using namespace v8;
9
-
10
- // MSVC compatibility
11
- #if (defined(_M_IX86_FP) && _M_IX86_FP == 2) || defined(_M_X64)
12
- #define __SSE2__ 1
13
- #define __SSSE3__ 1
14
- //#define __SSE4_1__ 1
15
- #if defined(_MSC_VER) && _MSC_VER >= 1600
16
- #define X86_PCLMULQDQ_CRC 1
17
- #endif
18
- #endif
19
- #ifdef _MSC_VER
20
- #define __BYTE_ORDER__ 1234
21
- #define __ORDER_BIG_ENDIAN__ 4321
22
- #include <intrin.h>
23
- #endif
24
-
25
- #if defined(__x86_64__) || defined(__i386__)
26
- #if !defined(X86_PCLMULQDQ_CRC) && defined(__PCLMUL__) && defined(__SSSE3__) && defined(__SSE4_1__)
27
- #define X86_PCLMULQDQ_CRC 1
28
- #endif
29
- #endif
30
-
31
- static unsigned char escapeLUT[256]; // whether or not the character is critical
32
- static uint16_t escapedLUT[256]; // escaped sequences for characters that need escaping
33
- // combine two 8-bit ints into a 16-bit one
34
- #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
35
- #define UINT16_PACK(a, b) (((a) << 8) | (b))
36
- #define UINT32_PACK(a, b, c, d) (((a) << 24) | ((b) << 16) | ((c) << 8) | (d))
37
- #define UINT32_16_PACK(a, b) (((a) << 16) | (b))
38
- #else
39
- #define UINT16_PACK(a, b) ((a) | ((b) << 8))
40
- #define UINT32_PACK(a, b, c, d) ((a) | ((b) << 8) | ((c) << 16) | ((d) << 24))
41
- #define UINT32_16_PACK(a, b) ((a) | ((b) << 16))
42
- #endif
43
-
44
- #ifdef __SSE2__
45
- #include <emmintrin.h>
46
- #define XMM_SIZE 16 /*== (signed int)sizeof(__m128i)*/
47
-
48
- #ifdef _MSC_VER
49
- #define ALIGN_32(v) __declspec(align(32)) v
50
- #else
51
- #define ALIGN_32(v) v __attribute__((aligned(32)))
52
- #endif
53
-
54
- #ifdef __SSSE3__
55
- #include <tmmintrin.h>
56
- #endif
57
- #ifdef __SSE4_1__
58
- #include <smmintrin.h>
59
- #endif
60
- #ifdef __POPCNT__
61
- #include <nmmintrin.h>
62
- #endif
63
- /*
64
- #ifdef __AVX2__
65
- #include <immintrin.h>
66
- #endif
67
- */
68
-
69
- #if defined(__tune_core2__) || defined(__tune_atom__)
70
- /* on older Intel CPUs, plus first gen Atom, it is faster to store XMM registers in half */
71
- # define STOREU_XMM(dest, xmm) \
72
- _mm_storel_epi64((__m128i*)(dest), xmm); \
73
- _mm_storeh_pi(((__m64*)(dest) +1), _mm_castsi128_ps(xmm))
74
- #else
75
- # define STOREU_XMM(dest, xmm) \
76
- _mm_storeu_si128((__m128i*)(dest), xmm)
77
- #endif
78
-
79
- #endif
80
-
81
- // runs at around 380MB/s on 2.4GHz Silvermont (worst: 125MB/s, best: 440MB/s)
82
- static size_t do_encode_slow(int line_size, int col, const unsigned char* src, unsigned char* dest, size_t len) {
83
- unsigned char *p = dest; // destination pointer
84
- unsigned long i = 0; // input position
85
- unsigned char c, escaped; // input character; escaped input character
86
-
87
- if (col == 0) {
88
- c = src[i++];
89
- if (escapedLUT[c]) {
90
- *(uint16_t*)p = escapedLUT[c];
91
- p += 2;
92
- col = 2;
93
- } else {
94
- *(p++) = c + 42;
95
- col = 1;
96
- }
97
- }
98
- while(i < len) {
99
- unsigned char* sp = NULL;
100
- // main line
101
- #ifdef __SSE2__
102
- while (len-i-1 > XMM_SIZE && col < line_size-1) {
103
- __m128i data = _mm_add_epi8(
104
- _mm_loadu_si128((__m128i *)(src + i)), // probably not worth the effort to align
105
- _mm_set1_epi8(42)
106
- );
107
- // search for special chars
108
- // TODO: for some reason, GCC feels obliged to spill `data` onto the stack, then _load_ from it!
109
- __m128i cmp = _mm_or_si128(
110
- _mm_or_si128(
111
- _mm_cmpeq_epi8(data, _mm_setzero_si128()),
112
- _mm_cmpeq_epi8(data, _mm_set1_epi8('\n'))
113
- ),
114
- _mm_or_si128(
115
- _mm_cmpeq_epi8(data, _mm_set1_epi8('\r')),
116
- _mm_cmpeq_epi8(data, _mm_set1_epi8('='))
117
- )
118
- );
119
-
120
- unsigned int mask = _mm_movemask_epi8(cmp);
121
- if (mask != 0) {
122
- sp = p;
123
- ALIGN_32(uint32_t mmTmp[4]);
124
- // special characters exist
125
- _mm_store_si128((__m128i*)mmTmp, data);
126
- #define DO_THING(n) \
127
- c = src[i+n], escaped = escapeLUT[c]; \
128
- if (escaped) \
129
- *(p+n) = escaped; \
130
- else { \
131
- *(uint16_t*)(p+n) = escapedLUT[c]; \
132
- p++; \
133
- }
134
- #define DO_THING_4(n) \
135
- if(mask & (0xF << n)) { \
136
- DO_THING(n); \
137
- DO_THING(n+1); \
138
- DO_THING(n+2); \
139
- DO_THING(n+3); \
140
- } else { \
141
- *(uint32_t*)(p+n) = mmTmp[n>>2]; \
142
- }
143
- DO_THING_4(0);
144
- DO_THING_4(4);
145
- DO_THING_4(8);
146
- DO_THING_4(12);
147
- p += XMM_SIZE;
148
- col += (int)(p - sp);
149
-
150
- if(col > line_size-1) {
151
- // TODO: consider revert optimisation from do_encode_fast
152
- // we overflowed - need to revert and use slower method :(
153
- col -= (int)(p - sp);
154
- p = sp;
155
- break;
156
- }
157
- } else {
158
- STOREU_XMM(p, data);
159
- p += XMM_SIZE;
160
- col += XMM_SIZE;
161
- if(col > line_size-1) {
162
- p -= col - (line_size-1);
163
- i += XMM_SIZE - (col - (line_size-1));
164
- //col = line_size-1; // never read again, doesn't need to be set
165
- goto last_char;
166
- }
167
- }
168
-
169
- i += XMM_SIZE;
170
- }
171
- #else
172
- while (len-i-1 > 8 && line_size-col-1 > 8) {
173
- // 8 cycle unrolled version
174
- sp = p;
175
- #define DO_THING(n) \
176
- c = src[i+n], escaped = escapeLUT[c]; \
177
- if (escaped) \
178
- *(p++) = escaped; \
179
- else { \
180
- *(uint16_t*)p = escapedLUT[c]; \
181
- p += 2; \
182
- }
183
- DO_THING(0);
184
- DO_THING(1);
185
- DO_THING(2);
186
- DO_THING(3);
187
- DO_THING(4);
188
- DO_THING(5);
189
- DO_THING(6);
190
- DO_THING(7);
191
-
192
- i += 8;
193
- col += (int)(p - sp);
194
- }
195
- if(sp && col >= line_size-1) {
196
- // TODO: consider revert optimisation from do_encode_fast
197
- // we overflowed - need to revert and use slower method :(
198
- col -= (int)(p - sp);
199
- p = sp;
200
- i -= 8;
201
- }
202
- #endif
203
- // handle remaining chars
204
- while(col < line_size-1) {
205
- c = src[i++], escaped = escapeLUT[c];
206
- if (escaped) {
207
- *(p++) = escaped;
208
- col++;
209
- }
210
- else {
211
- *(uint16_t*)p = escapedLUT[c];
212
- p += 2;
213
- col += 2;
214
- }
215
- /* experimental branchless version
216
- *p = '=';
217
- c = (src[i++] + 42) & 0xFF;
218
- int cond = (c=='\0' || c=='=' || c=='\r' || c=='\n');
219
- *(p+cond) = c + (cond << 6);
220
- p += 1+cond;
221
- col += 1+cond;
222
- */
223
- if (i >= len) goto end;
224
- }
225
-
226
- // last line char
227
- if(col < line_size) { // this can only be false if the last character was an escape sequence (or line_size is horribly small), in which case, we don't need to handle space/tab cases
228
- last_char:
229
- c = src[i++];
230
- if (escapedLUT[c] && c != '.'-42) {
231
- *(uint16_t*)p = escapedLUT[c];
232
- p += 2;
233
- } else {
234
- *(p++) = c + 42;
235
- }
236
- }
237
-
238
- if (i >= len) break;
239
-
240
- c = src[i++];
241
- if (escapedLUT[c]) {
242
- *(uint32_t*)p = UINT32_16_PACK(UINT16_PACK('\r', '\n'), (uint32_t)escapedLUT[c]);
243
- p += 4;
244
- col = 2;
245
- } else {
246
- // another option may be to just write the EOL and let the first char be handled by the faster methods above, but it appears that writing the extra byte here is generally faster...
247
- *(uint32_t*)p = UINT32_PACK('\r', '\n', (uint32_t)(c+42), 0);
248
- p += 3;
249
- col = 1;
250
- }
251
- }
252
-
253
- end:
254
- // special case: if the last character is a space/tab, it needs to be escaped as it's the final character on the line
255
- unsigned char lc = *(p-1);
256
- if(lc == '\t' || lc == ' ') {
257
- *(uint16_t*)(p-1) = UINT16_PACK('=', lc+64);
258
- p++;
259
- //col++;
260
- }
261
- return p - dest;
262
- }
263
-
264
-
265
- // slightly faster version which improves the worst case scenario significantly; since worst case doesn't happen often, overall speedup is relatively minor
266
- // requires PSHUFB (SSSE3) instruction, but will use PBLENDV (SSE4.1) and POPCNT (SSE4.2 (or AMD's ABM, but Phenom doesn't support SSSE3 so doesn't matter)) if available (these only seem to give minor speedups, so considered optional)
267
- #ifdef __SSSE3__
268
- size_t (*_do_encode)(int, int, const unsigned char*, unsigned char*, size_t) = &do_encode_slow;
269
- #define do_encode (*_do_encode)
270
- ALIGN_32(__m128i _shufLUT[258]); // +2 for underflow guard entry
271
- __m128i* shufLUT = _shufLUT+2;
272
- #ifndef __POPCNT__
273
- // table from http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetTable
274
- static const unsigned char BitsSetTable256[256] =
275
- {
276
- # define B2(n) n, n+1, n+1, n+2
277
- # define B4(n) B2(n), B2(n+1), B2(n+1), B2(n+2)
278
- # define B6(n) B4(n), B4(n+1), B4(n+1), B4(n+2)
279
- B6(0), B6(1), B6(1), B6(2)
280
- #undef B2
281
- #undef B4
282
- #undef B6
283
- };
284
- #endif
285
- static size_t do_encode_fast(int line_size, int col, const unsigned char* src, unsigned char* dest, size_t len) {
286
- unsigned char *p = dest; // destination pointer
287
- unsigned long i = 0; // input position
288
- unsigned char c, escaped; // input character; escaped input character
289
-
290
- __m128i equals = _mm_set1_epi8('=');
291
-
292
- if (col == 0) {
293
- c = src[i++];
294
- if (escapedLUT[c]) {
295
- *(uint16_t*)p = escapedLUT[c];
296
- p += 2;
297
- col = 2;
298
- } else {
299
- *(p++) = c + 42;
300
- col = 1;
301
- }
302
- }
303
- while(i < len) {
304
- // main line
305
- while (len-i-1 > XMM_SIZE && col < line_size-1) {
306
- __m128i data = _mm_add_epi8(
307
- _mm_loadu_si128((__m128i *)(src + i)), // probably not worth the effort to align
308
- _mm_set1_epi8(42)
309
- );
310
- i += XMM_SIZE;
311
- // search for special chars
312
- __m128i cmp = _mm_or_si128(
313
- _mm_or_si128(
314
- _mm_cmpeq_epi8(data, _mm_setzero_si128()),
315
- _mm_cmpeq_epi8(data, _mm_set1_epi8('\n'))
316
- ),
317
- _mm_or_si128(
318
- _mm_cmpeq_epi8(data, _mm_set1_epi8('\r')),
319
- _mm_cmpeq_epi8(data, equals)
320
- )
321
- );
322
-
323
- unsigned int mask = _mm_movemask_epi8(cmp);
324
- if (mask != 0) { // seems to always be faster than _mm_test_all_zeros, possibly because http://stackoverflow.com/questions/34155897/simd-sse-how-to-check-that-all-vector-elements-are-non-zero#comment-62475316
325
- uint8_t m1 = mask & 0xFF;
326
- uint8_t m2 = mask >> 8;
327
-
328
- // perform lookup for shuffle mask
329
- __m128i shufMA = _mm_load_si128(shufLUT + m1);
330
- __m128i shufMB = _mm_load_si128(shufLUT + m2);
331
-
332
- // second mask processes on second half, so add to the offsets
333
- // this seems to be faster than right-shifting data by 8 bytes on Intel chips, maybe due to psrldq always running on port5? may be different on AMD
334
- shufMB = _mm_add_epi8(shufMB, _mm_set1_epi8(8));
335
-
336
- // expand halves
337
- //shuf = _mm_or_si128(_mm_cmpgt_epi8(shuf, _mm_set1_epi8(15)), shuf);
338
- __m128i data2 = _mm_shuffle_epi8(data, shufMB);
339
- data = _mm_shuffle_epi8(data, shufMA);
340
-
341
- // get the maskEsc for the escaped chars
342
- __m128i maskEscA = _mm_cmpeq_epi8(shufMA, _mm_srli_si128(shufMA, 1));
343
- __m128i maskEscB = _mm_cmpeq_epi8(shufMB, _mm_srli_si128(shufMB, 1));
344
-
345
- // blend escape chars in
346
- __m128i tmp1 = _mm_add_epi8(data, _mm_set1_epi8(64));
347
- __m128i tmp2 = _mm_add_epi8(data2, _mm_set1_epi8(64));
348
- #ifdef __SSE4_1__
349
- #define BLENDV _mm_blendv_epi8
350
- #else
351
- #define BLENDV(v1, v2, m) _mm_or_si128(_mm_andnot_si128(m, v1), _mm_and_si128(m, v2))
352
- #endif
353
- data = BLENDV(data, equals, maskEscA);
354
- data2 = BLENDV(data2, equals, maskEscB);
355
- maskEscA = _mm_slli_si128(maskEscA, 1);
356
- maskEscB = _mm_slli_si128(maskEscB, 1);
357
- data = BLENDV(data, tmp1, maskEscA);
358
- data2 = BLENDV(data2, tmp2, maskEscB);
359
- #undef BLENDV
360
- // store out
361
- #ifdef __POPCNT__
362
- unsigned char shufALen = _mm_popcnt_u32(m1) + 8;
363
- unsigned char shufBLen = _mm_popcnt_u32(m2) + 8;
364
- #else
365
- unsigned char shufALen = BitsSetTable256[m1] + 8;
366
- unsigned char shufBLen = BitsSetTable256[m2] + 8;
367
- #endif
368
- STOREU_XMM(p, data);
369
- p += shufALen;
370
- STOREU_XMM(p, data2);
371
- p += shufBLen;
372
- col += shufALen + shufBLen;
373
-
374
- int ovrflowAmt = col - (line_size-1);
375
- if(ovrflowAmt > 0) {
376
- // we overflowed - find correct position to revert back to
377
- p -= ovrflowAmt;
378
- if(ovrflowAmt == shufBLen) {
379
- i -= 8;
380
- goto last_char_fast;
381
- } else {
382
- int isEsc;
383
- uint16_t tst;
384
- int offs = shufBLen - ovrflowAmt -1;
385
- if(ovrflowAmt > shufBLen) {
386
- tst = *(uint16_t*)((char*)(shufLUT+m1) + shufALen+offs);
387
- i -= 8;
388
- } else {
389
- tst = *(uint16_t*)((char*)(shufLUT+m2) + offs);
390
- }
391
- isEsc = ((tst>>8) == (tst&0xFF));
392
- p += isEsc;
393
- i -= 8 - (tst>>8) - isEsc;
394
- //col = line_size-1 + isEsc; // doesn't need to be set, since it's never read again
395
- if(isEsc)
396
- goto after_last_char_fast;
397
- else
398
- goto last_char_fast;
399
- }
400
- }
401
- } else {
402
- STOREU_XMM(p, data);
403
- p += XMM_SIZE;
404
- col += XMM_SIZE;
405
- if(col > line_size-1) {
406
- p -= col - (line_size-1);
407
- i -= col - (line_size-1);
408
- //col = line_size-1; // doesn't need to be set, since it's never read again
409
- goto last_char_fast;
410
- }
411
- }
412
- }
413
- // handle remaining chars
414
- while(col < line_size-1) {
415
- c = src[i++], escaped = escapeLUT[c];
416
- if (escaped) {
417
- *(p++) = escaped;
418
- col++;
419
- }
420
- else {
421
- *(uint16_t*)p = escapedLUT[c];
422
- p += 2;
423
- col += 2;
424
- }
425
- if (i >= len) goto end;
426
- }
427
-
428
- // last line char
429
- if(col < line_size) { // this can only be false if the last character was an escape sequence (or line_size is horribly small), in which case, we don't need to handle space/tab cases
430
- last_char_fast:
431
- c = src[i++];
432
- if (escapedLUT[c] && c != '.'-42) {
433
- *(uint16_t*)p = escapedLUT[c];
434
- p += 2;
435
- } else {
436
- *(p++) = c + 42;
437
- }
438
- }
439
-
440
- after_last_char_fast:
441
- if (i >= len) break;
442
-
443
- c = src[i++];
444
- if (escapedLUT[c]) {
445
- *(uint32_t*)p = UINT32_16_PACK(UINT16_PACK('\r', '\n'), (uint32_t)escapedLUT[c]);
446
- p += 4;
447
- col = 2;
448
- } else {
449
- *(uint32_t*)p = UINT32_PACK('\r', '\n', (uint32_t)(c+42), 0);
450
- p += 3;
451
- col = 1;
452
- }
453
- }
454
-
455
- end:
456
- // special case: if the last character is a space/tab, it needs to be escaped as it's the final character on the line
457
- unsigned char lc = *(p-1);
458
- if(lc == '\t' || lc == ' ') {
459
- *(uint16_t*)(p-1) = UINT16_PACK('=', lc+64);
460
- p++;
461
- }
462
- return p - dest;
463
- }
464
-
465
- /*
466
- // experimental AVX2 version
467
- // seems to be slower than SSSE3 variant, so not used at the moment; with experimental optimisations, is faster on Haswell, but only mildly so
468
- #ifdef __AVX2__
469
- #define YMM_SIZE 32
470
- static size_t do_encode_avx2(int line_size, int col, const unsigned char* src, unsigned char* dest, size_t len) {
471
- unsigned char *p = dest; // destination pointer
472
- unsigned long i = 0; // input position
473
- unsigned char c, escaped; // input character; escaped input character
474
-
475
- __m256i equals = _mm256_set1_epi8('=');
476
-
477
- if (col == 0) {
478
- c = src[i++];
479
- if (escapedLUT[c]) {
480
- *(uint16_t*)p = escapedLUT[c];
481
- p += 2;
482
- col = 2;
483
- } else {
484
- *(p++) = c + 42;
485
- col = 1;
486
- }
487
- }
488
- while(i < len) {
489
- // main line
490
- while (len-i-1 > YMM_SIZE && col < line_size-1) {
491
- __m256i data = _mm256_add_epi8(
492
- _mm256_loadu_si256((__m256i *)(src + i)),
493
- _mm256_set1_epi8(42)
494
- );
495
- i += YMM_SIZE;
496
- // search for special chars
497
- __m256i cmp = _mm256_or_si256(
498
- _mm256_or_si256(
499
- _mm256_cmpeq_epi8(data, _mm256_setzero_si256()),
500
- _mm256_cmpeq_epi8(data, _mm256_set1_epi8('\n'))
501
- ),
502
- _mm256_or_si256(
503
- _mm256_cmpeq_epi8(data, _mm256_set1_epi8('\r')),
504
- _mm256_cmpeq_epi8(data, equals)
505
- )
506
- );
507
-
508
- unsigned int mask = _mm256_movemask_epi8(cmp);
509
- if (mask != 0) {
510
- uint8_t m1 = mask & 0xFF;
511
- uint8_t m2 = (mask >> 8) & 0xFF;
512
- uint8_t m3 = (mask >> 16) & 0xFF;
513
- uint8_t m4 = mask >> 24;
514
-
515
- // perform lookup for shuffle mask
516
- // note that we interlave 1/3, 2/4 to make processing easier
517
- // TODO: any way to ensure that these loads use AVX?
518
- __m256i shufMA = _mm256_inserti128_si256(
519
- _mm256_castsi128_si256(shufLUT[m1]),
520
- shufLUT[m3],
521
- 1
522
- );
523
- __m256i shufMB = _mm256_inserti128_si256(
524
- _mm256_castsi128_si256(shufLUT[m2]),
525
- shufLUT[m4],
526
- 1
527
- );
528
-
529
- // offset second mask
530
- shufMB = _mm256_add_epi8(shufMB, _mm256_set1_epi8(8));
531
-
532
- // expand halves
533
- __m256i data1 = _mm256_shuffle_epi8(data, shufMA);
534
- __m256i data2 = _mm256_shuffle_epi8(data, shufMB);
535
-
536
- // get the maskEsc for the escaped chars
537
- __m256i maskEscA = _mm256_cmpeq_epi8(shufMA, _mm256_srli_si256(shufMA, 1));
538
- __m256i maskEscB = _mm256_cmpeq_epi8(shufMB, _mm256_srli_si256(shufMB, 1));
539
-
540
- // blend escape chars in
541
- __m256i tmp1 = _mm256_add_epi8(data1, _mm256_set1_epi8(64));
542
- __m256i tmp2 = _mm256_add_epi8(data2, _mm256_set1_epi8(64));
543
- data1 = _mm256_blendv_epi8(data1, equals, maskEscA);
544
- data2 = _mm256_blendv_epi8(data2, equals, maskEscB);
545
- data1 = _mm256_blendv_epi8(data1, tmp1, _mm256_slli_si256(maskEscA, 1));
546
- data2 = _mm256_blendv_epi8(data2, tmp2, _mm256_slli_si256(maskEscB, 1));
547
- // store out
548
- unsigned char shuf1Len = _mm_popcnt_u32(m1) + 8;
549
- unsigned char shuf2Len = _mm_popcnt_u32(m2) + 8;
550
- unsigned char shuf3Len = _mm_popcnt_u32(m3) + 8;
551
- unsigned char shuf4Len = _mm_popcnt_u32(m4) + 8;
552
- // TODO: do these stores always use AVX?
553
- // TODO: this can overflow since we only give +32 chars for over-allocation
554
- _mm_storeu_si128((__m128i*)p, _mm256_castsi256_si128(data1));
555
- p += shuf1Len;
556
- _mm_storeu_si128((__m128i*)p, _mm256_castsi256_si128(data2));
557
- p += shuf2Len;
558
- _mm_storeu_si128((__m128i*)p, _mm256_extracti128_si256(data1, 1));
559
- p += shuf3Len;
560
- _mm_storeu_si128((__m128i*)p, _mm256_extracti128_si256(data2, 1));
561
- p += shuf4Len;
562
- col += shuf1Len + shuf2Len + shuf3Len + shuf4Len;
563
-
564
- if(col > line_size-1) {
565
- // we overflowed - may need to revert and use slower method :(
566
- // TODO: optimize this
567
- col -= shuf1Len + shuf2Len + shuf3Len + shuf4Len;
568
- p -= shuf1Len + shuf2Len + shuf3Len + shuf4Len;
569
- i -= YMM_SIZE;
570
- break;
571
- }
572
- } else {
573
- _mm256_storeu_si256((__m256i*)p, data);
574
- p += YMM_SIZE;
575
- col += YMM_SIZE;
576
- if(col > line_size-1) {
577
- p -= col - (line_size-1);
578
- i -= col - (line_size-1);
579
- col = line_size-1;
580
- goto last_char_avx2;
581
- }
582
- }
583
- }
584
- // handle remaining chars
585
- while(col < line_size-1) {
586
- c = src[i++], escaped = escapeLUT[c];
587
- if (escaped) {
588
- *(p++) = escaped;
589
- col++;
590
- }
591
- else {
592
- *(uint16_t*)p = escapedLUT[c];
593
- p += 2;
594
- col += 2;
595
- }
596
- if (i >= len) goto end;
597
- }
598
-
599
- // last line char
600
- if(col < line_size) { // this can only be false if the last character was an escape sequence (or line_size is horribly small), in which case, we don't need to handle space/tab cases
601
- last_char_avx2:
602
- c = src[i++];
603
- if (escapedLUT[c] && c != '.'-42) {
604
- *(uint16_t*)p = escapedLUT[c];
605
- p += 2;
606
- } else {
607
- *(p++) = c + 42;
608
- }
609
- }
610
-
611
- if (i >= len) break;
612
-
613
- c = src[i++];
614
- if (escapedLUT[c]) {
615
- *(uint32_t*)p = UINT32_16_PACK(UINT16_PACK('\r', '\n'), (uint32_t)escapedLUT[c]);
616
- p += 4;
617
- col = 2;
618
- } else {
619
- *(uint32_t*)p = UINT32_PACK('\r', '\n', (uint32_t)(c+42), 0);
620
- p += 3;
621
- col = 1;
622
- }
623
- }
624
-
625
- _mm256_zeroupper();
626
-
627
- end:
628
- // special case: if the last character is a space/tab, it needs to be escaped as it's the final character on the line
629
- unsigned char lc = *(p-1);
630
- if(lc == '\t' || lc == ' ') {
631
- *(uint16_t*)(p-1) = UINT16_PACK('=', lc+64);
632
- p++;
633
- }
634
- return p - dest;
635
- }
636
- #endif
637
- */
638
-
639
-
640
-
641
- ALIGN_32(static const uint8_t _pshufb_shift_table[272]) = {
642
- 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,
643
- 0x0f,0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,
644
- 0x0e,0x0f,0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,0x8d,
645
- 0x0d,0x0e,0x0f,0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,
646
- 0x0c,0x0d,0x0e,0x0f,0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,
647
- 0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,
648
- 0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,
649
- 0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,
650
- 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
651
- 0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x81,0x82,0x83,0x84,0x85,0x86,
652
- 0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x81,0x82,0x83,0x84,0x85,
653
- 0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x81,0x82,0x83,0x84,
654
- 0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x81,0x82,0x83,
655
- 0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x81,0x82,
656
- 0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x81,
657
- 0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,
658
- 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
659
- };
660
- static const __m128i* pshufb_shift_table = (const __m128i*)_pshufb_shift_table;
661
-
662
- // assumes line_size is reasonably large (probably >32)
663
- static size_t do_encode_fast2(int line_size, int col, const unsigned char* src, unsigned char* dest, size_t len) {
664
- unsigned char *p = dest; // destination pointer
665
- unsigned long i = 0; // input position
666
- unsigned char c; // input character; escaped input character
667
-
668
- __m128i equals = _mm_set1_epi8('=');
669
-
670
- // firstly, align reader
671
- for (; (uintptr_t)(src+i) & 0xF; i++) {
672
- if(i >= len) goto encode_fast2_end;
673
- c = (src[i] + 42) & 0xFF;
674
- switch(c) {
675
- case '.':
676
- if(col > 0) break;
677
- case '\t': case ' ':
678
- if(col > 0 && col < line_size-1) break;
679
- case '\0': case '\r': case '\n': case '=':
680
- *(p++) = '=';
681
- c += 64;
682
- col++;
683
- }
684
- *(p++) = c;
685
- col++;
686
- if(col >= line_size && i+1 < len) {
687
- *(uint16_t*)p = UINT16_PACK('\r', '\n');
688
- p += 2;
689
- col = 0;
690
- }
691
- }
692
-
693
- if(len-i-1 > XMM_SIZE) {
694
- __m128i input = _mm_load_si128((__m128i *)(src + i));
695
-
696
- if (col == 0) {
697
- // first char in line
698
- c = src[i];
699
- if (escapedLUT[c]) {
700
- *p++ = '=';
701
- col = 1;
702
-
703
- #ifdef __SSE4_1__
704
- input = _mm_insert_epi8(input, c-(214-64)-42, 0);
705
- #else
706
- input = _mm_insert_epi16(input, (uint16_t)(c-(214-64)-42) + (((uint16_t)src[i+1])<<8), 0);
707
- #endif
708
- }
709
- }
710
- do {
711
- __m128i data = _mm_add_epi8(input, _mm_set1_epi8(42));
712
- i += XMM_SIZE;
713
- // search for special chars
714
- __m128i cmp = _mm_or_si128(
715
- _mm_or_si128(
716
- _mm_cmpeq_epi8(data, _mm_setzero_si128()),
717
- _mm_cmpeq_epi8(data, _mm_set1_epi8('\n'))
718
- ),
719
- _mm_or_si128(
720
- _mm_cmpeq_epi8(data, _mm_set1_epi8('\r')),
721
- _mm_cmpeq_epi8(data, equals)
722
- )
723
- );
724
-
725
- unsigned int mask = _mm_movemask_epi8(cmp);
726
- if (mask != 0) { // seems to always be faster than _mm_test_all_zeros, possibly because http://stackoverflow.com/questions/34155897/simd-sse-how-to-check-that-all-vector-elements-are-non-zero#comment-62475316
727
- uint8_t m1 = mask & 0xFF;
728
- uint8_t m2 = mask >> 8;
729
-
730
- // perform lookup for shuffle mask
731
- __m128i shufMA = _mm_load_si128(shufLUT + m1);
732
- __m128i shufMB = _mm_load_si128(shufLUT + m2);
733
-
734
- // second mask processes on second half, so add to the offsets
735
- // this seems to be faster than right-shifting data by 8 bytes on Intel chips, maybe due to psrldq always running on port5? may be different on AMD
736
- shufMB = _mm_add_epi8(shufMB, _mm_set1_epi8(8));
737
-
738
- // expand halves
739
- __m128i data2 = _mm_shuffle_epi8(data, shufMB);
740
- data = _mm_shuffle_epi8(data, shufMA);
741
-
742
- // get the maskEsc for the escaped chars
743
- __m128i maskEscA = _mm_cmpeq_epi8(shufMA, _mm_srli_si128(shufMA, 1));
744
- __m128i maskEscB = _mm_cmpeq_epi8(shufMB, _mm_srli_si128(shufMB, 1));
745
-
746
- // blend escape chars in
747
- __m128i tmp1 = _mm_add_epi8(data, _mm_set1_epi8(64));
748
- __m128i tmp2 = _mm_add_epi8(data2, _mm_set1_epi8(64));
749
- #ifdef __SSE4_1__
750
- #define BLENDV _mm_blendv_epi8
751
- #else
752
- #define BLENDV(v1, v2, m) _mm_or_si128(_mm_andnot_si128(m, v1), _mm_and_si128(m, v2))
753
- #endif
754
- data = BLENDV(data, equals, maskEscA);
755
- data2 = BLENDV(data2, equals, maskEscB);
756
- maskEscA = _mm_slli_si128(maskEscA, 1);
757
- maskEscB = _mm_slli_si128(maskEscB, 1);
758
- data = BLENDV(data, tmp1, maskEscA);
759
- data2 = BLENDV(data2, tmp2, maskEscB);
760
- #undef BLENDV
761
- // store out
762
- #ifdef __POPCNT__
763
- unsigned char shufALen = _mm_popcnt_u32(m1) + 8;
764
- unsigned char shufBLen = _mm_popcnt_u32(m2) + 8;
765
- #else
766
- unsigned char shufALen = BitsSetTable256[m1] + 8;
767
- unsigned char shufBLen = BitsSetTable256[m2] + 8;
768
- #endif
769
- STOREU_XMM(p, data);
770
- p += shufALen;
771
- STOREU_XMM(p, data2);
772
- p += shufBLen;
773
- col += shufALen + shufBLen;
774
-
775
- int ovrflowAmt = col - (line_size-1);
776
- if(ovrflowAmt > 0) {
777
- // we overflowed - find correct position to revert back to
778
- p -= ovrflowAmt;
779
- if(ovrflowAmt == shufBLen) {
780
- c = src[i-8];
781
- // TODO: consider doing direct comparisons instead of lookup
782
- if (escapedLUT[c] && c != '.'-42) {
783
- // if data2's version is escaped, we shift out by 2, otherwise only by 1
784
- if(m2 & 1) {
785
- data2 = _mm_srli_si128(data2, 1);
786
- ovrflowAmt--;
787
- } else
788
- *(uint16_t*)p = escapedLUT[c];
789
- p += 2;
790
- } else {
791
- p++;
792
- // shift data2 by one (actually will be combined below)
793
- }
794
- ovrflowAmt--;
795
-
796
- c = src[i-7];
797
- if (escapedLUT[c] && !(m2 & 2)) { // if the character was originally escaped, we can just fallback to storing it straight out
798
- col = ovrflowAmt+1;
799
- data2 = _mm_srli_si128(data2, 1+1);
800
-
801
- *(uint32_t*)p = UINT32_16_PACK(UINT16_PACK('\r', '\n'), (uint32_t)escapedLUT[c]);
802
- p += 4;
803
- ovrflowAmt--;
804
- } else {
805
- *(uint16_t*)p = UINT16_PACK('\r', '\n');
806
- col = ovrflowAmt;
807
- data2 = _mm_srli_si128(data2, 1);
808
- p += 2;
809
- }
810
- STOREU_XMM(p, data2);
811
- p += ovrflowAmt;
812
- } else {
813
- int isEsc, lastIsEsc;
814
- uint16_t tst;
815
- int offs = shufBLen - ovrflowAmt -1;
816
- unsigned long tmpInPos = i;
817
- if(ovrflowAmt > shufBLen) {
818
- // ! Note that it's possible for shufALen+offs == -1 to be true !!
819
- // although the way the lookup tables are constructed (with the additional guard entry), this isn't a problem, but it's not ideal; TODO: consider proper fix
820
- tst = *(uint16_t*)((char*)(shufLUT+m1) + shufALen+offs);
821
- tmpInPos -= 8;
822
- } else {
823
- tst = *(uint16_t*)((char*)(shufLUT+m2) + offs);
824
- }
825
- isEsc = ((tst>>8) == (tst&0xFF));
826
- tmpInPos -= 8 - (tst>>8) - isEsc;
827
-
828
- lastIsEsc = 0;
829
- if(!isEsc) {
830
- //lastIsEsc = (mask & (1 << (16-(i-tmpInPos)))) ? 1:0; // TODO: use this?
831
- c = src[tmpInPos++];
832
- // TODO: consider doing direct comparisons instead of lookup
833
- if (escapedLUT[c] && c != '.'-42) {
834
- *(uint16_t*)p = escapedLUT[c];
835
- p++;
836
-
837
- lastIsEsc = escapeLUT[c] ? 0:1;
838
- }
839
- }
840
- p++;
841
-
842
-
843
- //offs = offs + 1 - (1+lastIsEsc);
844
- if(ovrflowAmt > shufBLen) {
845
- ovrflowAmt -= 1+lastIsEsc;
846
- __m128i shiftThing = _mm_load_si128(&pshufb_shift_table[16 - (shufALen+shufBLen - ovrflowAmt)]);
847
- data = _mm_shuffle_epi8(data, shiftThing);
848
- shufALen = ovrflowAmt-shufBLen;
849
- } else {
850
- // TODO: theoretically, everything in the 2nd half can be optimized better, but requires a lot more code paths :|
851
- ovrflowAmt -= 1+lastIsEsc;
852
- __m128i shiftThing = _mm_load_si128(&pshufb_shift_table[16 - (shufBLen - ovrflowAmt)]);
853
- data2 = _mm_shuffle_epi8(data2, shiftThing);
854
- shufBLen = ovrflowAmt;
855
- }
856
-
857
- if(tmpInPos >= len) goto encode_fast2_end; // TODO: remove conditional by pre-checking this
858
-
859
- c = src[tmpInPos];
860
- if(ovrflowAmt > 0) {
861
- isEsc = mask & (1 << (16-(i-tmpInPos)));
862
- if(ovrflowAmt > shufBLen) {
863
- if (escapedLUT[c] && !isEsc) {
864
- *(uint32_t*)p = UINT32_16_PACK(UINT16_PACK('\r', '\n'), (uint32_t)escapedLUT[c]);
865
- col = 2/*escaped char*/+shufALen-1/*previously not escaped char*/;
866
- data = _mm_srli_si128(data, 1);
867
- p += 4;
868
- shufALen--;
869
- } else {
870
- *(uint16_t*)p = UINT16_PACK('\r', '\n');
871
- col = shufALen;
872
- p += 2;
873
- }
874
-
875
- STOREU_XMM(p, data);
876
- p += shufALen;
877
- } else {
878
- if (escapedLUT[c] && !isEsc) {
879
- *(uint32_t*)p = UINT32_16_PACK(UINT16_PACK('\r', '\n'), (uint32_t)escapedLUT[c]);
880
- col = 2;
881
- p += 4;
882
- shufBLen--;
883
- data2 = _mm_srli_si128(data2, 1);
884
- } else {
885
- *(uint16_t*)p = UINT16_PACK('\r', '\n');
886
- col = 0;
887
- p += 2;
888
- }
889
- }
890
- STOREU_XMM(p, data2);
891
- p += shufBLen;
892
- col += shufBLen;
893
- } else {
894
- if (escapedLUT[c]) {
895
- // check if we have enough bytes to read
896
- if(len-i-1 <= XMM_SIZE) {
897
- *(uint16_t*)p = UINT16_PACK('\r', '\n');
898
- col = 0;
899
- p += 2;
900
- break;
901
- }
902
-
903
- // we've now got a rather problematic case to handle... :|
904
- *(uint32_t*)p = UINT32_PACK('\r', '\n', '=', 0);
905
- p += 3;
906
- col = 1;
907
-
908
- // ewww....
909
- input = _mm_load_si128((__m128i *)(src + i));
910
- // hack XMM input to fool regular code into writing the correct character
911
- #ifdef __SSE4_1__
912
- input = _mm_insert_epi8(input, c-(214-64)-42, 0);
913
- #else
914
- input = _mm_insert_epi16(input, (uint16_t)(c-(214-64)-42) + (((uint16_t)src[i+1])<<8), 0);
915
- #endif
916
- continue;
917
- } else {
918
- *(uint16_t*)p = UINT16_PACK('\r', '\n');
919
- col = 0;
920
- p += 2;
921
- }
922
- }
923
- }
924
- }
925
- } else {
926
- STOREU_XMM(p, data);
927
- p += XMM_SIZE;
928
- col += XMM_SIZE;
929
- int ovrflowAmt = col - (line_size-1);
930
- if(ovrflowAmt > 0) {
931
- // optimisation: check last char here
932
- c = src[i - ovrflowAmt];
933
- ovrflowAmt--;
934
- // TODO: consider doing direct comparisons instead of lookup
935
- if (escapedLUT[c] && c != '.'-42) {
936
- p -= ovrflowAmt-1;
937
- *(uint16_t*)(p-2) = escapedLUT[c];
938
- } else {
939
- p -= ovrflowAmt;
940
- }
941
-
942
- if(i-ovrflowAmt >= len) goto encode_fast2_end; // TODO: remove conditional by pre-checking this
943
-
944
- c = src[i - ovrflowAmt];
945
- if(ovrflowAmt != 0) {
946
- int dataLen;
947
- if (escapedLUT[c]) {
948
- *(uint32_t*)p = UINT32_16_PACK(UINT16_PACK('\r', '\n'), (uint32_t)escapedLUT[c]);
949
- col = 2+ovrflowAmt-1;
950
- dataLen = ovrflowAmt-1;
951
- data = _mm_srli_si128(data, 1);
952
- p += 4;
953
- } else {
954
- *(uint16_t*)p = UINT16_PACK('\r', '\n');
955
- col = ovrflowAmt;
956
- dataLen = ovrflowAmt;
957
- p += 2;
958
- }
959
-
960
- // shuffle remaining elements across
961
- __m128i shiftThing = _mm_load_si128(&pshufb_shift_table[ovrflowAmt]);
962
- data = _mm_shuffle_epi8(data, shiftThing);
963
- // store out data
964
- STOREU_XMM(p, data);
965
- p += dataLen;
966
- } else {
967
- if (escapedLUT[c]) { // will also handle case which would be handled fine normally, but since we're checking it...
968
- // ugly hacky code
969
- // check if we have enough bytes to read
970
- if(len-i-1 <= XMM_SIZE) {
971
- *(uint16_t*)p = UINT16_PACK('\r', '\n');
972
- col = 0;
973
- p += 2;
974
- break;
975
- }
976
-
977
- *(uint32_t*)p = UINT32_PACK('\r', '\n', '=', 0);
978
- col = 1;
979
- p += 3;
980
-
981
- // ewww....
982
- input = _mm_load_si128((__m128i *)(src + i));
983
- // hack XMM input to fool regular code into writing the correct character
984
- #ifdef __SSE4_1__
985
- input = _mm_insert_epi8(input, c-(214-64)-42, 0);
986
- #else
987
- input = _mm_insert_epi16(input, (uint16_t)(c-(214-64)-42) + (((uint16_t)src[i+1])<<8), 0);
988
- #endif
989
- continue;
990
-
991
- } else {
992
- *(uint16_t*)p = UINT16_PACK('\r', '\n');
993
- col = 0;
994
- p += 2;
995
- }
996
- }
997
- }
998
- }
999
- input = _mm_load_si128((__m128i *)(src + i));
1000
- } while(len-i-1 > XMM_SIZE);
1001
- }
1002
-
1003
- if(col == 0) {
1004
- c = src[i++];
1005
- if (escapedLUT[c]) {
1006
- *(uint16_t*)p = escapedLUT[c];
1007
- p += 2;
1008
- col = 2;
1009
- } else {
1010
- *(p++) = c + 42;
1011
- col = 1;
1012
- }
1013
- }
1014
- while(i < len) {
1015
- while(col < line_size-1) {
1016
- c = src[i++];
1017
- if (escapeLUT[c]) {
1018
- *(p++) = escapeLUT[c];
1019
- col++;
1020
- }
1021
- else {
1022
- *(uint16_t*)p = escapedLUT[c];
1023
- p += 2;
1024
- col += 2;
1025
- }
1026
- if (i >= len) goto encode_fast2_end;
1027
- }
1028
-
1029
- // last line char
1030
- // TODO: consider combining with above
1031
- if(col < line_size) { // this can only be false if the last character was an escape sequence (or line_size is horribly small), in which case, we don't need to handle space/tab cases
1032
- c = src[i++];
1033
- if (escapedLUT[c] && c != '.'-42) {
1034
- *(uint16_t*)p = escapedLUT[c];
1035
- p += 2;
1036
- } else {
1037
- *(p++) = c + 42;
1038
- }
1039
- }
1040
-
1041
- if (i >= len) break;
1042
-
1043
- c = src[i++];
1044
- if (escapedLUT[c]) {
1045
- *(uint32_t*)p = UINT32_16_PACK(UINT16_PACK('\r', '\n'), (uint32_t)escapedLUT[c]);
1046
- p += 4;
1047
- col = 2;
1048
- } else {
1049
- *(uint32_t*)p = UINT32_PACK('\r', '\n', (uint32_t)(c+42), 0);
1050
- p += 3;
1051
- col = 1;
1052
- }
1053
- }
1054
-
1055
-
1056
- encode_fast2_end:
1057
- // special case: if the last character is a space/tab, it needs to be escaped as it's the final character on the line
1058
- unsigned char lc = *(p-1);
1059
- if(lc == '\t' || lc == ' ') {
1060
- *(uint16_t*)(p-1) = UINT16_PACK('=', lc+64);
1061
- p++;
1062
- }
1063
- return p - dest;
1064
- }
1065
-
1066
- #else
1067
- #define do_encode do_encode_slow
1068
- #endif
1069
-
1070
-
1071
- /*
1072
- // simple naive implementation - most yEnc encoders I've seen do something like the following
1073
- // runs at around 145MB/s on 2.4GHz Silvermont (worst: 135MB/s, best: 158MB/s)
1074
- static inline unsigned long do_encode(int line_size, int col, const unsigned char* src, unsigned char* dest, size_t len) {
1075
- unsigned char *p = dest;
1076
-
1077
- for (unsigned long i = 0; i < len; i++) {
1078
- unsigned char c = (src[i] + 42) & 0xFF;
1079
- switch(c) {
1080
- case '.':
1081
- if(col > 0) break;
1082
- case '\t': case ' ':
1083
- if(col > 0 && col < line_size-1) break;
1084
- case '\0': case '\r': case '\n': case '=':
1085
- *(p++) = '=';
1086
- c += 64;
1087
- col++;
1088
- }
1089
- *(p++) = c;
1090
- col++;
1091
- if(col >= line_size && i+1 < len) {
1092
- *(uint16_t*)p = UINT16_PACK('\r', '\n');
1093
- p += 2;
1094
- col = 0;
1095
- }
1096
- }
1097
-
1098
- // special case: if the last character is a space/tab, it needs to be escaped as it's the final character on the line
1099
- unsigned char lc = *(p-1);
1100
- if(lc == '\t' || lc == ' ') {
1101
- *(uint16_t*)(p-1) = UINT16_PACK('=', lc+64);
1102
- p++;
1103
- }
1104
- return p - dest;
1105
- }
1106
- */
1107
-
1108
-
1109
/* A CRC32 value, addressable either as one 32-bit word or as its four raw bytes. */
union crc32 {
	uint32_t u32;
	unsigned char u8a[4];
};

/*
 * Assemble a 32-bit value from four bytes; arr[0] supplies the most
 * significant byte and arr[3] the least significant one.
 * `arr` is parenthesized so pointer expressions such as `buf + 1` work.
 */
#define PACK_4(arr) ( \
	((uint_fast32_t)(arr)[0] << 24) | \
	((uint_fast32_t)(arr)[1] << 16) | \
	((uint_fast32_t)(arr)[2] <<  8) | \
	 (uint_fast32_t)(arr)[3] \
)

/*
 * Store the low 32 bits of `val` into arr[0..3], most significant byte first.
 * Both arguments are parenthesized so that expressions like `a | b` are
 * shifted as a whole; the do/while(0) wrapper makes the macro behave as a
 * single statement (safe in an unbraced if/else). `val` is evaluated four times.
 */
#define UNPACK_4(arr, val) do { \
	(arr)[0] = (unsigned char)(((val) >> 24) & 0xFF); \
	(arr)[1] = (unsigned char)(((val) >> 16) & 0xFF); \
	(arr)[2] = (unsigned char)(((val) >>  8) & 0xFF); \
	(arr)[3] = (unsigned char)( (val)        & 0xFF); \
} while(0)
1121
-
1122
- #include "interface.h"
1123
- crcutil_interface::CRC* crc = NULL;
1124
-
1125
- #ifdef X86_PCLMULQDQ_CRC
1126
- bool x86_cpu_has_pclmulqdq = false;
1127
- #include "crc_folding.c"
1128
- #else
1129
- #define x86_cpu_has_pclmulqdq false
1130
- #define crc_fold(a, b) 0
1131
- #endif
1132
-
1133
- static inline void do_crc32(const void* data, size_t length, unsigned char out[4]) {
1134
- // if we have the pclmulqdq instruction, use the insanely fast folding method
1135
- if(x86_cpu_has_pclmulqdq) {
1136
- uint32_t tmp = crc_fold((const unsigned char*)data, (long)length);
1137
- UNPACK_4(out, tmp);
1138
- } else {
1139
- if(!crc) {
1140
- crc = crcutil_interface::CRC::Create(
1141
- 0xEDB88320, 0, 32, true, 0, 0, 0, 0, NULL);
1142
- // instance never deleted... oh well...
1143
- }
1144
- crcutil_interface::UINT64 tmp = 0;
1145
- crc->Compute(data, length, &tmp);
1146
- UNPACK_4(out, tmp);
1147
- }
1148
- }
1149
-
1150
- crcutil_interface::CRC* crcI = NULL;
1151
- static inline void do_crc32_incremental(const void* data, size_t length, unsigned char init[4]) {
1152
- if(!crcI) {
1153
- crcI = crcutil_interface::CRC::Create(
1154
- 0xEDB88320, 0, 32, false, 0, 0, 0, 0, NULL);
1155
- // instance never deleted... oh well...
1156
- }
1157
-
1158
- if(x86_cpu_has_pclmulqdq) {
1159
- // TODO: think of a nicer way to do this than a combine
1160
- crcutil_interface::UINT64 crc1_ = PACK_4(init);
1161
- crcutil_interface::UINT64 crc2_ = crc_fold((const unsigned char*)data, (long)length);
1162
- crcI->Concatenate(crc2_, 0, length, &crc1_);
1163
- UNPACK_4(init, crc1_);
1164
- } else {
1165
- crcutil_interface::UINT64 tmp = PACK_4(init) ^ 0xffffffff;
1166
- crcI->Compute(data, length, &tmp);
1167
- tmp ^= 0xffffffff;
1168
- UNPACK_4(init, tmp);
1169
- }
1170
- }
1171
-
1172
- static inline void do_crc32_combine(unsigned char crc1[4], const unsigned char crc2[4], size_t len2) {
1173
- if(!crc) {
1174
- crc = crcutil_interface::CRC::Create(
1175
- 0xEDB88320, 0, 32, true, 0, 0, 0, 0, NULL);
1176
- // instance never deleted... oh well...
1177
- }
1178
- crcutil_interface::UINT64 crc1_ = PACK_4(crc1), crc2_ = PACK_4(crc2);
1179
- crc->Concatenate(crc2_, 0, len2, &crc1_);
1180
- UNPACK_4(crc1, crc1_);
1181
- }
1182
-
1183
- static inline void do_crc32_zeros(unsigned char crc1[4], size_t len) {
1184
- if(!crc) {
1185
- crc = crcutil_interface::CRC::Create(
1186
- 0xEDB88320, 0, 32, true, 0, 0, 0, 0, NULL);
1187
- // instance never deleted... oh well...
1188
- }
1189
- crcutil_interface::UINT64 crc_ = 0;
1190
- crc->CrcOfZeroes(len, &crc_);
1191
- UNPACK_4(crc1, crc_);
1192
- }
1193
-
1194
- void free_buffer(char* data, void* _size) {
1195
- #if !NODE_VERSION_AT_LEAST(0, 11, 0)
1196
- int size = (int)(size_t)_size;
1197
- V8::AdjustAmountOfExternalAllocatedMemory(-size);
1198
- #endif
1199
- //Isolate::GetCurrent()->AdjustAmountOfExternalAllocatedMemory(-size);
1200
- free(data);
1201
- }
1202
-
1203
- // TODO: encode should return col num for incremental processing
1204
- // line limit + return input consumed
1205
- // async processing?
1206
-
1207
/*
 * Upper bound on the number of bytes needed to encode `len` input bytes with
 * lines of `line_size` characters. Arguments are parenthesized so that
 * expression arguments (e.g. `a + b`) are sized correctly; each argument is
 * evaluated more than once, so do not pass expressions with side effects.
 */
#define YENC_MAX_SIZE(len, line_size) ( \
	  (len) * 2                     /* all characters escaped */ \
	+ (((len) * 4) / (line_size))   /* newlines, considering the possibility of all chars escaped */ \
	+ 2                             /* allocation for offset and that a newline may occur early */ \
	+ 32                            /* allocation for XMM overflowing */ \
)
1213
-
1214
-
1215
- // encode(str, line_size, col)
1216
- // crc32(str, init)
1217
- #if NODE_VERSION_AT_LEAST(0, 11, 0)
1218
-
1219
- #if NODE_VERSION_AT_LEAST(3, 0, 0) // iojs3
1220
- #define BUFFER_NEW(...) node::Buffer::New(isolate, __VA_ARGS__).ToLocalChecked()
1221
- #else
1222
- #define BUFFER_NEW(...) node::Buffer::New(isolate, __VA_ARGS__)
1223
- #endif
1224
-
1225
- // node 0.12 version
1226
- static void Encode(const FunctionCallbackInfo<Value>& args) {
1227
- Isolate* isolate = Isolate::GetCurrent();
1228
- HandleScope scope(isolate);
1229
-
1230
- if (args.Length() == 0 || !node::Buffer::HasInstance(args[0])) {
1231
- isolate->ThrowException(Exception::Error(
1232
- String::NewFromUtf8(isolate, "You must supply a Buffer"))
1233
- );
1234
- return;
1235
- }
1236
-
1237
- size_t arg_len = node::Buffer::Length(args[0]);
1238
- if (arg_len == 0) {
1239
- args.GetReturnValue().Set( BUFFER_NEW(0) );
1240
- return;
1241
- }
1242
-
1243
- int line_size = 128, col = 0;
1244
- if (args.Length() >= 2) {
1245
- // TODO: probably should throw errors instead of transparently fixing these...
1246
- line_size = args[1]->ToInteger()->Value();
1247
- if (line_size < 1) line_size = 128;
1248
- if (args.Length() >= 3) {
1249
- col = args[2]->ToInteger()->Value();
1250
- if (col >= line_size) col = 0;
1251
- }
1252
- }
1253
-
1254
- // allocate enough memory to handle worst case requirements
1255
- size_t dest_len = YENC_MAX_SIZE(arg_len, line_size);
1256
-
1257
- unsigned char *result = (unsigned char*) malloc(dest_len);
1258
- size_t len = do_encode(line_size, col, (const unsigned char*)node::Buffer::Data(args[0]), result, arg_len);
1259
- result = (unsigned char*)realloc(result, len);
1260
- //isolate->AdjustAmountOfExternalAllocatedMemory(len);
1261
- args.GetReturnValue().Set( BUFFER_NEW((char*)result, len, free_buffer, (void*)len) );
1262
- }
1263
-
1264
- static void EncodeTo(const FunctionCallbackInfo<Value>& args) {
1265
- Isolate* isolate = Isolate::GetCurrent();
1266
- HandleScope scope(isolate);
1267
-
1268
- if (args.Length() < 2 || !node::Buffer::HasInstance(args[0]) || !node::Buffer::HasInstance(args[1])) {
1269
- isolate->ThrowException(Exception::Error(
1270
- String::NewFromUtf8(isolate, "You must supply two Buffers"))
1271
- );
1272
- return;
1273
- }
1274
-
1275
- size_t arg_len = node::Buffer::Length(args[0]);
1276
- if (arg_len == 0) {
1277
- args.GetReturnValue().Set( Integer::New(isolate, 0) );
1278
- return;
1279
- }
1280
-
1281
- int line_size = 128, col = 0;
1282
- if (args.Length() >= 3) {
1283
- // TODO: probably should throw errors instead of transparently fixing these...
1284
- line_size = args[2]->ToInteger()->Value();
1285
- if (line_size < 1) line_size = 128;
1286
- if (args.Length() >= 4) {
1287
- col = args[3]->ToInteger()->Value();
1288
- if (col >= line_size) col = 0;
1289
- }
1290
- }
1291
-
1292
- // check that destination buffer has enough space
1293
- size_t dest_len = YENC_MAX_SIZE(arg_len, line_size);
1294
- if(node::Buffer::Length(args[1]) < dest_len) {
1295
- args.GetReturnValue().Set( Integer::New(isolate, 0) );
1296
- return;
1297
- }
1298
-
1299
- size_t len = do_encode(line_size, col, (const unsigned char*)node::Buffer::Data(args[0]), (unsigned char*)node::Buffer::Data(args[1]), arg_len);
1300
- args.GetReturnValue().Set( Integer::New(isolate, len) );
1301
- }
1302
-
1303
- #if NODE_VERSION_AT_LEAST(3, 0, 0)
1304
- // for whatever reason, iojs 3 gives buffer corruption if you pass in a pointer without a free function
1305
- #define RETURN_CRC(x) do { \
1306
- Local<Object> buff = BUFFER_NEW(4); \
1307
- *(uint32_t*)node::Buffer::Data(buff) = x.u32; \
1308
- args.GetReturnValue().Set( buff ); \
1309
- } while(0)
1310
- #else
1311
- #define RETURN_CRC(x) args.GetReturnValue().Set( BUFFER_NEW((char*)x.u8a, 4) )
1312
- #endif
1313
-
1314
- static void CRC32(const FunctionCallbackInfo<Value>& args) {
1315
- Isolate* isolate = Isolate::GetCurrent();
1316
- HandleScope scope(isolate);
1317
-
1318
- if (args.Length() == 0 || !node::Buffer::HasInstance(args[0])) {
1319
- isolate->ThrowException(Exception::Error(
1320
- String::NewFromUtf8(isolate, "You must supply a Buffer"))
1321
- );
1322
- return;
1323
- }
1324
- // TODO: support string args??
1325
-
1326
- union crc32 init;
1327
- init.u32 = 0;
1328
- if (args.Length() >= 2) {
1329
- if (!node::Buffer::HasInstance(args[1]) || node::Buffer::Length(args[1]) != 4) {
1330
- isolate->ThrowException(Exception::Error(
1331
- String::NewFromUtf8(isolate, "Second argument must be a 4 byte buffer"))
1332
- );
1333
- return;
1334
- }
1335
- init.u32 = *(uint32_t*)node::Buffer::Data(args[1]);
1336
- do_crc32_incremental(
1337
- (const void*)node::Buffer::Data(args[0]),
1338
- node::Buffer::Length(args[0]),
1339
- init.u8a
1340
- );
1341
- } else {
1342
- do_crc32(
1343
- (const void*)node::Buffer::Data(args[0]),
1344
- node::Buffer::Length(args[0]),
1345
- init.u8a
1346
- );
1347
- }
1348
- RETURN_CRC(init);
1349
- }
1350
-
1351
- static void CRC32Combine(const FunctionCallbackInfo<Value>& args) {
1352
- Isolate* isolate = Isolate::GetCurrent();
1353
- HandleScope scope(isolate);
1354
-
1355
- if (args.Length() < 3) {
1356
- isolate->ThrowException(Exception::Error(
1357
- String::NewFromUtf8(isolate, "At least 3 arguments required"))
1358
- );
1359
- return;
1360
- }
1361
- if (!node::Buffer::HasInstance(args[0]) || node::Buffer::Length(args[0]) != 4
1362
- || !node::Buffer::HasInstance(args[1]) || node::Buffer::Length(args[1]) != 4) {
1363
- isolate->ThrowException(Exception::Error(
1364
- String::NewFromUtf8(isolate, "You must supply a 4 byte Buffer for the first two arguments"))
1365
- );
1366
- return;
1367
- }
1368
-
1369
- union crc32 crc1, crc2;
1370
- size_t len = (size_t)args[2]->ToInteger()->Value();
1371
-
1372
- crc1.u32 = *(uint32_t*)node::Buffer::Data(args[0]);
1373
- crc2.u32 = *(uint32_t*)node::Buffer::Data(args[1]);
1374
-
1375
- do_crc32_combine(crc1.u8a, crc2.u8a, len);
1376
- RETURN_CRC(crc1);
1377
- }
1378
-
1379
- static void CRC32Zeroes(const FunctionCallbackInfo<Value>& args) {
1380
- Isolate* isolate = Isolate::GetCurrent();
1381
- HandleScope scope(isolate);
1382
-
1383
- if (args.Length() < 1) {
1384
- isolate->ThrowException(Exception::Error(
1385
- String::NewFromUtf8(isolate, "At least 1 argument required"))
1386
- );
1387
- return;
1388
- }
1389
-
1390
- union crc32 crc1;
1391
- size_t len = (size_t)args[0]->ToInteger()->Value();
1392
- do_crc32_zeros(crc1.u8a, len);
1393
- RETURN_CRC(crc1);
1394
- }
1395
- #else
1396
- // node 0.10 version
1397
- #define ReturnBuffer(buffer, size, offset) return scope.Close(Local<Object>::New((buffer)->handle_))
1398
-
1399
- static Handle<Value> Encode(const Arguments& args) {
1400
- HandleScope scope;
1401
-
1402
- if (args.Length() == 0 || !node::Buffer::HasInstance(args[0])) {
1403
- return ThrowException(Exception::Error(
1404
- String::New("You must supply a Buffer"))
1405
- );
1406
- }
1407
-
1408
- size_t arg_len = node::Buffer::Length(args[0]);
1409
- if (arg_len == 0) {
1410
- ReturnBuffer(node::Buffer::New(0), 0, 0);
1411
- }
1412
-
1413
- int line_size = 128, col = 0;
1414
- if (args.Length() >= 2) {
1415
- // TODO: probably should throw errors instead of transparently fixing these...
1416
- line_size = args[1]->ToInteger()->Value();
1417
- if (line_size < 1) line_size = 128;
1418
- if (args.Length() >= 3) {
1419
- col = args[2]->ToInteger()->Value();
1420
- if (col >= line_size) col = 0;
1421
- }
1422
- }
1423
-
1424
- // allocate enough memory to handle worst case requirements
1425
- size_t dest_len = YENC_MAX_SIZE(arg_len, line_size);
1426
-
1427
- unsigned char *result = (unsigned char*) malloc(dest_len);
1428
- size_t len = do_encode(line_size, col, (const unsigned char*)node::Buffer::Data(args[0]), result, arg_len);
1429
- result = (unsigned char*)realloc(result, len);
1430
- V8::AdjustAmountOfExternalAllocatedMemory(len);
1431
- ReturnBuffer(node::Buffer::New((char*)result, len, free_buffer, (void*)len), len, 0);
1432
- }
1433
-
1434
- static Handle<Value> EncodeTo(const Arguments& args) {
1435
- HandleScope scope;
1436
-
1437
- if (args.Length() < 2 || !node::Buffer::HasInstance(args[0]) || !node::Buffer::HasInstance(args[1])) {
1438
- return ThrowException(Exception::Error(
1439
- String::New("You must supply two Buffers"))
1440
- );
1441
- }
1442
-
1443
- size_t arg_len = node::Buffer::Length(args[0]);
1444
- if (arg_len == 0) {
1445
- return scope.Close(Integer::New(0));
1446
- }
1447
-
1448
- int line_size = 128, col = 0;
1449
- if (args.Length() >= 3) {
1450
- // TODO: probably should throw errors instead of transparently fixing these...
1451
- line_size = args[2]->ToInteger()->Value();
1452
- if (line_size < 1) line_size = 128;
1453
- if (args.Length() >= 4) {
1454
- col = args[3]->ToInteger()->Value();
1455
- if (col >= line_size) col = 0;
1456
- }
1457
- }
1458
-
1459
- // check that destination buffer has enough space
1460
- size_t dest_len = YENC_MAX_SIZE(arg_len, line_size);
1461
- if(node::Buffer::Length(args[1]) < dest_len) {
1462
- return scope.Close(Integer::New(0));
1463
- }
1464
-
1465
- size_t len = do_encode(line_size, col, (const unsigned char*)node::Buffer::Data(args[0]), (unsigned char*)node::Buffer::Data(args[1]), arg_len);
1466
- return scope.Close(Integer::New(len));
1467
- }
1468
-
1469
- static Handle<Value> CRC32(const Arguments& args) {
1470
- HandleScope scope;
1471
-
1472
- if (args.Length() == 0 || !node::Buffer::HasInstance(args[0])) {
1473
- return ThrowException(Exception::Error(
1474
- String::New("You must supply a Buffer"))
1475
- );
1476
- }
1477
- // TODO: support string args??
1478
-
1479
- union crc32 init;
1480
- init.u32 = 0;
1481
- if (args.Length() >= 2) {
1482
- if (!node::Buffer::HasInstance(args[1]) || node::Buffer::Length(args[1]) != 4)
1483
- return ThrowException(Exception::Error(
1484
- String::New("Second argument must be a 4 byte buffer"))
1485
- );
1486
-
1487
- init.u32 = *(uint32_t*)node::Buffer::Data(args[1]);
1488
- do_crc32_incremental(
1489
- (const void*)node::Buffer::Data(args[0]),
1490
- node::Buffer::Length(args[0]),
1491
- init.u8a
1492
- );
1493
- } else {
1494
- do_crc32(
1495
- (const void*)node::Buffer::Data(args[0]),
1496
- node::Buffer::Length(args[0]),
1497
- init.u8a
1498
- );
1499
- }
1500
- ReturnBuffer(node::Buffer::New((char*)init.u8a, 4), 4, 0);
1501
- }
1502
-
1503
- static Handle<Value> CRC32Combine(const Arguments& args) {
1504
- HandleScope scope;
1505
-
1506
- if (args.Length() < 3) {
1507
- return ThrowException(Exception::Error(
1508
- String::New("At least 3 arguments required"))
1509
- );
1510
- }
1511
- if (!node::Buffer::HasInstance(args[0]) || node::Buffer::Length(args[0]) != 4
1512
- || !node::Buffer::HasInstance(args[1]) || node::Buffer::Length(args[1]) != 4) {
1513
- return ThrowException(Exception::Error(
1514
- String::New("You must supply a 4 byte Buffer for the first two arguments"))
1515
- );
1516
- }
1517
-
1518
- union crc32 crc1, crc2;
1519
- size_t len = (size_t)args[2]->ToInteger()->Value();
1520
-
1521
- crc1.u32 = *(uint32_t*)node::Buffer::Data(args[0]);
1522
- crc2.u32 = *(uint32_t*)node::Buffer::Data(args[1]);
1523
-
1524
- do_crc32_combine(crc1.u8a, crc2.u8a, len);
1525
- ReturnBuffer(node::Buffer::New((char*)crc1.u8a, 4), 4, 0);
1526
- }
1527
-
1528
- static Handle<Value> CRC32Zeroes(const Arguments& args) {
1529
- HandleScope scope;
1530
-
1531
- if (args.Length() < 1) {
1532
- return ThrowException(Exception::Error(
1533
- String::New("At least 1 argument required"))
1534
- );
1535
- }
1536
- union crc32 crc1;
1537
- size_t len = (size_t)args[0]->ToInteger()->Value();
1538
-
1539
- do_crc32_zeros(crc1.u8a, len);
1540
- ReturnBuffer(node::Buffer::New((char*)crc1.u8a, 4), 4, 0);
1541
- }
1542
- #endif
1543
-
1544
- void init(Handle<Object> target) {
1545
- for (int i=0; i<256; i++) {
1546
- escapeLUT[i] = (i+42) & 0xFF;
1547
- escapedLUT[i] = 0;
1548
- }
1549
- escapeLUT[214 + '\0'] = 0;
1550
- escapeLUT[214 + '\r'] = 0;
1551
- escapeLUT[214 + '\n'] = 0;
1552
- escapeLUT['=' - 42 ] = 0;
1553
-
1554
- escapedLUT[214 + '\0'] = UINT16_PACK('=', '\0'+64);
1555
- escapedLUT[214 + '\r'] = UINT16_PACK('=', '\r'+64);
1556
- escapedLUT[214 + '\n'] = UINT16_PACK('=', '\n'+64);
1557
- escapedLUT['=' - 42 ] = UINT16_PACK('=', '='+64);
1558
- escapedLUT[214 + '\t'] = UINT16_PACK('=', '\t'+64);
1559
- escapedLUT[214 + ' ' ] = UINT16_PACK('=', ' '+64);
1560
- escapedLUT['.' - 42 ] = UINT16_PACK('=', '.'+64);
1561
- NODE_SET_METHOD(target, "encode", Encode);
1562
- NODE_SET_METHOD(target, "encodeTo", EncodeTo);
1563
- NODE_SET_METHOD(target, "crc32", CRC32);
1564
- NODE_SET_METHOD(target, "crc32_combine", CRC32Combine);
1565
- NODE_SET_METHOD(target, "crc32_zeroes", CRC32Zeroes);
1566
-
1567
-
1568
-
1569
- #ifdef __SSSE3__
1570
- uint32_t flags;
1571
- #ifdef _MSC_VER
1572
- int cpuInfo[4];
1573
- __cpuid(cpuInfo, 1);
1574
- flags = cpuInfo[2];
1575
- #else
1576
- // conveniently stolen from zlib-ng
1577
- __asm__ __volatile__ (
1578
- "cpuid"
1579
- : "=c" (flags)
1580
- : "a" (1)
1581
- : "%edx", "%ebx"
1582
- );
1583
- #endif
1584
- #ifdef X86_PCLMULQDQ_CRC
1585
- x86_cpu_has_pclmulqdq = (flags & 0x80202) == 0x80202; // SSE4.1 + SSSE3 + CLMUL
1586
- #endif
1587
-
1588
- uint32_t fastYencMask = 0x200;
1589
- #ifdef __POPCNT__
1590
- fastYencMask |= 0x800000;
1591
- #endif
1592
- #ifdef __SSE4_1__
1593
- fastYencMask |= 0x80000;
1594
- #endif
1595
- _do_encode = ((flags & fastYencMask) == fastYencMask) ? &do_encode_fast : &do_encode_slow; // SSSE3 + required stuff based on compiler flags
1596
-
1597
- if((flags & fastYencMask) == fastYencMask) {
1598
- // generate shuf LUT
1599
- for(int i=0; i<256; i++) {
1600
- int k = i;
1601
- uint8_t res[16];
1602
- int p = 0;
1603
- for(int j=0; j<8; j++) {
1604
- res[j+p] = j;
1605
- if(k & 1) {
1606
- p++;
1607
- res[j+p] = j;
1608
- }
1609
- k >>= 1;
1610
- }
1611
- for(; p<8; p++)
1612
- res[8+p] = 8+p +0x80; // +0x80 causes PSHUFB to 0 discarded entries; has no effect other than to ease debugging
1613
- _mm_store_si128(shufLUT + i, _mm_loadu_si128((__m128i*)res));
1614
- }
1615
- // underflow guard entries; this may occur when checking for escaped characters, when the shufLUT[0] and shufLUT[-1] are used for testing
1616
- _mm_store_si128(_shufLUT +0, _mm_set1_epi8(0xFF));
1617
- _mm_store_si128(_shufLUT +1, _mm_set1_epi8(0xFF));
1618
- }
1619
- #endif
1620
- }
1621
-
1622
- NODE_MODULE(yencode, init);