yencode 1.0.8 → 1.1.2

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (49)
  1. package/README.md +339 -231
  2. package/binding.gyp +292 -39
  3. package/crcutil-1.0/code/multiword_64_64_gcc_amd64_asm.cc +7 -7
  4. package/crcutil-1.0/code/multiword_64_64_gcc_i386_mmx.cc +14 -14
  5. package/crcutil-1.0/code/multiword_64_64_intrinsic_i386_mmx.cc +1 -1
  6. package/crcutil-1.0/code/uint128_sse2.h +2 -0
  7. package/index.js +329 -22
  8. package/package.json +2 -2
  9. package/src/common.h +299 -0
  10. package/src/crc.cc +95 -0
  11. package/src/crc.h +23 -0
  12. package/src/crc_arm.cc +175 -0
  13. package/src/crc_common.h +4 -0
  14. package/{crc_folding.c → src/crc_folding.cc} +175 -185
  15. package/src/decoder.cc +61 -0
  16. package/src/decoder.h +53 -0
  17. package/src/decoder_avx.cc +18 -0
  18. package/src/decoder_avx2.cc +18 -0
  19. package/src/decoder_avx2_base.h +615 -0
  20. package/src/decoder_common.h +512 -0
  21. package/src/decoder_neon.cc +474 -0
  22. package/src/decoder_neon64.cc +451 -0
  23. package/src/decoder_sse2.cc +16 -0
  24. package/src/decoder_sse_base.h +711 -0
  25. package/src/decoder_ssse3.cc +18 -0
  26. package/src/encoder.cc +170 -0
  27. package/src/encoder.h +21 -0
  28. package/src/encoder_avx.cc +16 -0
  29. package/src/encoder_avx2.cc +16 -0
  30. package/src/encoder_avx_base.h +564 -0
  31. package/src/encoder_common.h +109 -0
  32. package/src/encoder_neon.cc +547 -0
  33. package/src/encoder_sse2.cc +13 -0
  34. package/src/encoder_sse_base.h +724 -0
  35. package/src/encoder_ssse3.cc +18 -0
  36. package/src/hedley.h +1899 -0
  37. package/src/platform.cc +147 -0
  38. package/src/yencode.cc +449 -0
  39. package/test/_maxsize.js +9 -0
  40. package/test/_speedbase.js +147 -0
  41. package/test/speedcrc.js +20 -0
  42. package/test/speeddec.js +92 -0
  43. package/test/speedenc.js +44 -0
  44. package/{testcrc.js → test/testcrc.js} +53 -39
  45. package/test/testdec.js +183 -0
  46. package/test/testenc.js +163 -0
  47. package/test/testpostdec.js +126 -0
  48. package/test.js +0 -91
  49. package/yencode.cc +0 -1622
package/src/encoder_neon.cc (new file)
@@ -0,0 +1,547 @@
+ #include "common.h"
+
+ #ifdef __ARM_NEON
+ #include "encoder.h"
+ #include "encoder_common.h"
+
+ // Clang wrongly assumes alignment on vst1q_u8_x2, and ARMv7 GCC doesn't support the function, so effectively, it can only be used in ARMv8 compilers
+ #if defined(__aarch64__) && (defined(__clang__) || (defined(__GNUC__) && __GNUC__ >= 9))
+ # define vst1q_u8_x2_unaligned vst1q_u8_x2
+ #else
+ static HEDLEY_ALWAYS_INLINE void vst1q_u8_x2_unaligned(uint8_t* p, uint8x16x2_t data) {
+     vst1q_u8(p, data.val[0]);
+     vst1q_u8(p+16, data.val[1]);
+ }
+ #endif
+
+
+ static uint8x16_t ALIGN_TO(16, shufLUT[256]);
+ static uint16_t expandLUT[256];
+
+ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RESTRICT es, long& i, uint8_t*& p, long& col, long lineSizeOffset) {
+     uint8x16_t oDataA = vld1q_u8(es + i);
+     uint8x16_t oDataB = vld1q_u8(es + i + sizeof(uint8x16_t));
+     uint8x16_t dataA = oDataA;
+     uint8x16_t dataB = oDataB;
+ #ifdef __aarch64__
+     uint8x16_t cmpA = vreinterpretq_u8_s8(vqtbx2q_s8(
+         vdupq_n_s8('='-42),
+         vcreate2_s8(vmakeq_s8('\0'-42,-128,-128,'\0'-42,'\t'-42,'\n'-42,'\r'-42,'\t'-42,'\n'-42,'\r'-42,-128,-128,'\0'-42,-128,-128,-128), vmakeq_s8(' '-42,'\n'-42,'\r'-42,' '-42,-128,-128,-128,-128,-128,-128,'.'-42,-128,-128,-128,'='-42,-128)),
+         vreinterpretq_u8_s8(vhaddq_s8(vreinterpretq_s8_u8(dataA), vmakeq_s8(42,48,66,66, 66,66,66,66, 66,66,66,66, 66,66,66,66)))
+     ));
+     cmpA = vceqq_u8(cmpA, dataA);
+
+     dataB = vaddq_u8(oDataB, vdupq_n_u8(42));
+     uint8x16_t cmpB = vqtbx1q_u8(
+         vceqq_u8(oDataB, vdupq_n_u8('='-42)),
+         // \0 \n \r
+         vmakeq_u8(255,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
+         dataB
+     );
+     dataA = vaddq_u8(dataA, vbslq_u8(cmpA, vdupq_n_u8(64+42), vdupq_n_u8(42)));
+     dataB = vorrq_u8(dataB, vandq_u8(cmpB, vdupq_n_u8(64)));
+ #else
+     uint8x16_t cmpA = vorrq_u8(
+         vorrq_u8(
+             vceqq_u8(oDataA, vdupq_n_u8(-42)),
+             vceqq_u8(oDataA, vdupq_n_u8('='-42))
+         ),
+         vorrq_u8(
+             vceqq_u8(oDataA, vdupq_n_u8('\r'-42)),
+             vceqq_u8(oDataA, vdupq_n_u8('\n'-42))
+         )
+     );
+     uint8x16_t cmpB = vorrq_u8(
+         vorrq_u8(
+             vceqq_u8(oDataB, vdupq_n_u8(-42)),
+             vceqq_u8(oDataB, vdupq_n_u8('='-42))
+         ),
+         vorrq_u8(
+             vceqq_u8(oDataB, vdupq_n_u8('\r'-42)),
+             vceqq_u8(oDataB, vdupq_n_u8('\n'-42))
+         )
+     );
+
+     // dup low 2 bytes & compare
+     uint8x8_t firstTwoChars = vreinterpret_u8_u16(vdup_lane_u16(vreinterpret_u16_u8(vget_low_u8(oDataA)), 0));
+     uint8x8_t cmpNl = vceq_u8(firstTwoChars, vmake_u8(
+         ' '+214,' '+214,'\t'+214,'\t'+214,'\r'+214,'.'-42,'='-42,'='-42
+     ));
+     // use padd to merge comparisons
+     uint16x4_t cmpNl2 = vreinterpret_u16_u8(cmpNl);
+     cmpNl2 = vpadd_u16(cmpNl2, vdup_n_u16(0));
+     cmpNl2 = vpadd_u16(cmpNl2, vdup_n_u16(0));
+     cmpA = vcombine_u8(
+         vorr_u8(vget_low_u8(cmpA), vreinterpret_u8_u16(cmpNl2)),
+         vget_high_u8(cmpA)
+     );
+     dataA = vsubq_u8(dataA, vbslq_u8(cmpA, vdupq_n_u8(-64-42), vdupq_n_u8(-42)));
+     dataB = vsubq_u8(dataB, vbslq_u8(cmpB, vdupq_n_u8(-64-42), vdupq_n_u8(-42)));
+ #endif
+
+
+     uint8x16_t cmpAMasked = vandq_u8(cmpA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+     uint8x16_t cmpBMasked = vandq_u8(cmpB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+ #ifdef __aarch64__
+     uint8x16_t cmpMerge = vpaddq_u8(cmpAMasked, cmpBMasked);
+     cmpMerge = vpaddq_u8(cmpMerge, cmpMerge);
+     uint64_t mask = vgetq_lane_u64(vreinterpretq_u64_u8(cmpMerge), 0);
+
+     // write out first char + newline
+     uint32_t firstChar = vgetq_lane_u8(dataA, 0);
+     if(LIKELIHOOD(0.0234, mask & 1)) {
+         firstChar <<= 8;
+         firstChar |= 0x0a0d003d;
+         memcpy(p, &firstChar, sizeof(firstChar));
+         p += 4;
+         mask ^= 1;
+         cmpMerge = vbicq_u8(cmpMerge, vmakeq_u8(1,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0));
+     } else {
+         firstChar |= 0x0a0d00;
+         memcpy(p, &firstChar, sizeof(firstChar));
+         p += 3;
+     }
+
+     if(LIKELIHOOD(0.09, (mask & (mask-1)) != 0)) {
+         mask |= mask >> 8;
+         uint8x8_t cmpPacked = vpadd_u8(vget_low_u8(cmpMerge), vget_low_u8(cmpMerge));
+         uint8_t m1 = (mask & 0xff);
+         uint8_t m2 = ((mask >> 16) & 0xff);
+         uint8_t m3 = ((mask >> 32) & 0xff);
+         uint8_t m4 = ((mask >> 48) & 0xff);
+ #else
+     // no vpaddq_u8 in ARMv7, so need extra 64-bit VPADD
+     uint8x8_t cmpPacked = vpadd_u8(
+         vpadd_u8(
+             vget_low_u8(cmpAMasked), vget_high_u8(cmpAMasked)
+         ),
+         vpadd_u8(
+             vget_low_u8(cmpBMasked), vget_high_u8(cmpBMasked)
+         )
+     );
+     cmpPacked = vpadd_u8(cmpPacked, cmpPacked);
+     uint32_t mask = vget_lane_u32(vreinterpret_u32_u8(cmpPacked), 0);
+
+     // write out first char + newline
+     uint32_t firstChar = vgetq_lane_u8(dataA, 0);
+     if(LIKELIHOOD(0.0234, mask & 1)) {
+         firstChar <<= 8;
+         firstChar |= 0x0a0d003d;
+         memcpy(p, &firstChar, sizeof(firstChar));
+         p += 4;
+         mask ^= 1;
+         cmpPacked = vbic_u8(cmpPacked, vmake_u8(1,0,0,0, 0,0,0,0));
+     } else {
+         firstChar |= 0x0a0d00;
+         memcpy(p, &firstChar, sizeof(firstChar));
+         p += 3;
+     }
+
+     if(LIKELIHOOD(0.09, (mask & (mask-1)) != 0)) {
+         uint8_t m1 = (mask & 0xff);
+         uint8_t m2 = ((mask >> 8) & 0xff);
+         uint8_t m3 = ((mask >> 16) & 0xff);
+         uint8_t m4 = ((mask >> 24) & 0xff);
+ #endif
+
+         // perform lookup for shuffle mask
+         uint8x16_t shuf1 = vld1q_u8((uint8_t*)(shufLUT + m1));
+         uint8x16_t shuf2 = vld1q_u8((uint8_t*)(shufLUT + m2));
+         uint8x16_t shuf3 = vld1q_u8((uint8_t*)(shufLUT + m3));
+         uint8x16_t shuf4 = vld1q_u8((uint8_t*)(shufLUT + m4));
+ #ifdef __aarch64__
+         uint8x16_t data1A = vqtbx1q_u8(shuf1, dataA, shuf1);
+         uint8x16_t data2A = vqtbx1q_u8(shuf2, vextq_u8(dataA, dataA, 8), shuf2);
+         uint8x16_t data1B = vqtbx1q_u8(shuf3, dataB, shuf3);
+         uint8x16_t data2B = vqtbx1q_u8(shuf4, vextq_u8(dataB, dataB, 8), shuf4);
+ #else
+         uint8x8_t shuf1l = vget_low_u8(shuf1);
+         uint8x8_t shuf1h = vget_high_u8(shuf1);
+         uint8x8_t shuf2l = vget_low_u8(shuf2);
+         uint8x8_t shuf2h = vget_high_u8(shuf2);
+         uint8x8_t shuf3l = vget_low_u8(shuf3);
+         uint8x8_t shuf3h = vget_high_u8(shuf3);
+         uint8x8_t shuf4l = vget_low_u8(shuf4);
+         uint8x8_t shuf4h = vget_high_u8(shuf4);
+         uint8x16_t data1A = vcombine_u8(vtbx1_u8(shuf1l, vget_low_u8(dataA), shuf1l),
+             vtbx1_u8(shuf1h, vget_low_u8(dataA), shuf1h));
+         uint8x16_t data2A = vcombine_u8(vtbx1_u8(shuf2l, vget_high_u8(dataA), shuf2l),
+             vtbx1_u8(shuf2h, vget_high_u8(dataA), shuf2h));
+         uint8x16_t data1B = vcombine_u8(vtbx1_u8(shuf3l, vget_low_u8(dataB), shuf3l),
+             vtbx1_u8(shuf3h, vget_low_u8(dataB), shuf3h));
+         uint8x16_t data2B = vcombine_u8(vtbx1_u8(shuf4l, vget_high_u8(dataB), shuf4l),
+             vtbx1_u8(shuf4h, vget_high_u8(dataB), shuf4h));
+ #endif
+         data1A = vextq_u8(data1A, data1A, 1); // shift out processed byte (last char of line)
+
+         uint32_t counts = vget_lane_u32(vreinterpret_u32_u8(vcnt_u8(cmpPacked)), 0);
+         counts += 0x08080807;
+
+         unsigned char shuf1Len = counts & 0xff;
+         unsigned char shuf2Len = (counts>>8) & 0xff;
+         unsigned char shuf3Len = (counts>>16) & 0xff;
+         unsigned char shuf4Len = (counts>>24) & 0xff;
+         uint32_t shufTotalLen = counts * 0x1010101;
+         shufTotalLen >>= 24;
+
+         vst1q_u8(p, data1A);
+         p += shuf1Len;
+         vst1q_u8(p, data2A);
+         p += shuf2Len;
+         vst1q_u8(p, data1B);
+         p += shuf3Len;
+         vst1q_u8(p, data2B);
+         p += shuf4Len;
+         col = shufTotalLen+1 + lineSizeOffset-32;
+     } else {
+         // shuffle stuff up
+ #ifdef __aarch64__
+ # ifdef _MSC_VER
+         long bitIndex;
+         if(_BitScanReverse64((unsigned long*)&bitIndex, mask))
+             bitIndex ^= 63;
+         else
+             bitIndex = 64;
+ # else
+         long bitIndex = __builtin_clzll(mask);
+ # endif
+ #else
+ # ifdef __GNUC__
+         long bitIndex = __builtin_clz(mask); // TODO: is the 'undefined if 0' case problematic here?
+ # elif defined(_MSC_VER)
+         long bitIndex = _arm_clz(mask);
+ # else
+         long bitIndex = __clz(mask); // ARM compiler?
+ # endif
+ #endif
+
+         uint8x16_t vClz = vdupq_n_u8(bitIndex & ~(sizeof(mask)*8));
+ #ifdef __aarch64__
+         uint8x16_t blendA = vcgtq_u8(vmakeq_u8(63,62,61,60,51,50,49,48,47,46,45,44,35,34,33,32), vClz);
+         uint8x16_t blendB = vcgtq_u8(vmakeq_u8(31,30,29,28,19,18,17,16,15,14,13,12, 3, 2, 1, 0), vClz);
+ #else
+         uint8x16_t blendA = vcgtq_u8(vmakeq_u8(31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16), vClz);
+         uint8x16_t blendB = vcgtq_u8(vmakeq_u8(15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), vClz);
+ #endif
+         uint8x16_t dataAShifted = vbslq_u8(cmpA, vdupq_n_u8('='), dataA);
+         uint8x16_t dataBShifted = vbslq_u8(cmpB, vdupq_n_u8('='), dataB);
+         dataAShifted = vextq_u8(dataAShifted, dataBShifted, 1);
+         dataBShifted = vextq_u8(dataBShifted, dataBShifted, 1);
+         dataA = vbslq_u8(blendA, dataAShifted, dataA);
+         dataB = vbslq_u8(blendB, dataBShifted, dataB);
+
+         vst1q_u8_x2_unaligned(p, vcreate2_u8(dataA, dataB));
+         p += sizeof(uint8x16_t)*2 - 1;
+         p += (mask != 0);
+         col = lineSizeOffset + (mask != 0);
+     }
+
+     i += sizeof(uint8x16_t)*2;
+     // TODO: check col >= 0 if we want to support short lines
+ }
+
+
+ HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const uint8_t* HEDLEY_RESTRICT srcEnd, uint8_t* HEDLEY_RESTRICT& dest, size_t& len) {
+     // offset position to enable simpler loop condition checking
+     const int INPUT_OFFSET = sizeof(uint8x16_t)*4 -1; // extra chars for EOL handling, -1 to change <= to <
+     if(len <= INPUT_OFFSET || line_size < (int)sizeof(uint8x16_t)*4) return;
+
+     uint8_t *p = dest; // destination pointer
+     long i = -(long)len; // input position
+     long lineSizeOffset = -line_size +32; // line size plus vector length
+     long col = *colOffset - line_size +1;
+
+     i += INPUT_OFFSET;
+     const uint8_t* es = srcEnd - INPUT_OFFSET;
+
+     if (HEDLEY_LIKELY(col == -line_size+1)) {
+         uint8_t c = es[i++];
+         if (LIKELIHOOD(0.0273, escapedLUT[c] != 0)) {
+             memcpy(p, escapedLUT + c, 2);
+             p += 2;
+             col += 2;
+         } else {
+             *(p++) = c + 42;
+             col += 1;
+         }
+     }
+     if(HEDLEY_UNLIKELY(col >= 0)) {
+         if(col == 0)
+             encode_eol_handle_pre(es, i, p, col, lineSizeOffset);
+         else {
+             uint8_t c = es[i++];
+             if (LIKELIHOOD(0.0273, escapedLUT[c]!=0)) {
+                 uint32_t v = UINT32_16_PACK(UINT16_PACK('\r', '\n'), (uint32_t)escapedLUT[c]);
+                 memcpy(p, &v, sizeof(v));
+                 p += 4;
+                 col = 2-line_size + 1;
+             } else {
+                 uint32_t v = UINT32_PACK('\r', '\n', (uint32_t)(c+42), 0);
+                 memcpy(p, &v, sizeof(v));
+                 p += 3;
+                 col = 2-line_size;
+             }
+         }
+     }
+     while(i < 0) {
+         // for unaligned loads, separate loads seem to be faster than vld1q_u8_x2 on Cortex A53; unsure if this applies elsewhere
+         uint8x16_t dataA = vld1q_u8(es + i);
+         uint8x16_t dataB = vld1q_u8(es + i + sizeof(uint8x16_t));
+         i += sizeof(uint8x16_t)*2;
+         // search for special chars
+ #ifdef __aarch64__
+         uint8x16_t cmpEqA = vceqq_u8(dataA, vdupq_n_u8('='-42));
+         uint8x16_t cmpEqB = vceqq_u8(dataB, vdupq_n_u8('='-42));
+         dataA = vaddq_u8(dataA, vdupq_n_u8(42));
+         dataB = vaddq_u8(dataB, vdupq_n_u8(42));
+         uint8x16_t cmpA = vqtbx1q_u8(
+             cmpEqA,
+             // \0 \n \r
+             vmakeq_u8(255,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
+             dataA
+         );
+         uint8x16_t cmpB = vqtbx1q_u8(
+             cmpEqB,
+             // \0 \n \r
+             vmakeq_u8(255,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
+             dataB
+         );
+
+         dataA = vorrq_u8(dataA, vandq_u8(cmpA, vdupq_n_u8(64)));
+         dataB = vorrq_u8(dataB, vandq_u8(cmpB, vdupq_n_u8(64)));
+ #else
+         // the ARMv8 strategy may be worth it here with 2x vtbx2's, but both GCC-9 and Clang-9 generate poor assembly for it, so it performs worse than the following
+         uint8x16_t cmpA = vorrq_u8(
+             vorrq_u8(
+                 vceqq_u8(dataA, vdupq_n_u8(-42)),
+                 vceqq_u8(dataA, vdupq_n_u8('='-42))
+             ),
+             vorrq_u8(
+                 vceqq_u8(dataA, vdupq_n_u8('\r'-42)),
+                 vceqq_u8(dataA, vdupq_n_u8('\n'-42))
+             )
+         );
+         uint8x16_t cmpB = vorrq_u8(
+             vorrq_u8(
+                 vceqq_u8(dataB, vdupq_n_u8(-42)),
+                 vceqq_u8(dataB, vdupq_n_u8('='-42))
+             ),
+             vorrq_u8(
+                 vceqq_u8(dataB, vdupq_n_u8('\r'-42)),
+                 vceqq_u8(dataB, vdupq_n_u8('\n'-42))
+             )
+         );
+
+         dataA = vsubq_u8(dataA, vbslq_u8(cmpA, vdupq_n_u8(-64-42), vdupq_n_u8(-42)));
+         dataB = vsubq_u8(dataB, vbslq_u8(cmpB, vdupq_n_u8(-64-42), vdupq_n_u8(-42)));
+ #endif
+
+
+         long bitIndex; // prevent compiler whining
+         uint8x16_t cmpAMasked = vandq_u8(cmpA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+         uint8x16_t cmpBMasked = vandq_u8(cmpB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+ #ifdef __aarch64__
+         uint8x16_t cmpMerge = vpaddq_u8(cmpAMasked, cmpBMasked);
+         cmpMerge = vpaddq_u8(cmpMerge, cmpMerge);
+         uint64_t mask = vgetq_lane_u64(vreinterpretq_u64_u8(cmpMerge), 0);
+         if(LIKELIHOOD(0.09, (mask & (mask-1)) != 0)) {
+             mask |= mask >> 8;
+             uint8x8_t cmpPacked = vpadd_u8(vget_low_u8(cmpMerge), vget_low_u8(cmpMerge));
+             uint8_t m1 = (mask & 0xff);
+             uint8_t m2 = ((mask >> 16) & 0xff);
+             uint8_t m3 = ((mask >> 32) & 0xff);
+             uint8_t m4 = ((mask >> 48) & 0xff);
+ #else
+         // no vpaddq_u8 in ARMv7, so need extra 64-bit VPADD
+         uint8x8_t cmpPacked = vpadd_u8(
+             vpadd_u8(
+                 vget_low_u8(cmpAMasked), vget_high_u8(cmpAMasked)
+             ),
+             vpadd_u8(
+                 vget_low_u8(cmpBMasked), vget_high_u8(cmpBMasked)
+             )
+         );
+         cmpPacked = vpadd_u8(cmpPacked, cmpPacked);
+         uint32_t mask = vget_lane_u32(vreinterpret_u32_u8(cmpPacked), 0);
+         if(LIKELIHOOD(0.09, (mask & (mask-1)) != 0)) {
+             uint8_t m1 = (mask & 0xff);
+             uint8_t m2 = ((mask >> 8) & 0xff);
+             uint8_t m3 = ((mask >> 16) & 0xff);
+             uint8_t m4 = ((mask >> 24) & 0xff);
+ #endif
+
+             // perform lookup for shuffle mask
+             uint8x16_t shuf1 = vld1q_u8((uint8_t*)(shufLUT + m1));
+             uint8x16_t shuf2 = vld1q_u8((uint8_t*)(shufLUT + m2));
+             uint8x16_t shuf3 = vld1q_u8((uint8_t*)(shufLUT + m3));
+             uint8x16_t shuf4 = vld1q_u8((uint8_t*)(shufLUT + m4));
+
+             // expand halves
+ #ifdef __aarch64__
+             uint8x16_t data1A = vqtbx1q_u8(shuf1, dataA, shuf1);
+             uint8x16_t data2A = vqtbx1q_u8(shuf2, vextq_u8(dataA, dataA, 8), shuf2);
+             uint8x16_t data1B = vqtbx1q_u8(shuf3, dataB, shuf3);
+             uint8x16_t data2B = vqtbx1q_u8(shuf4, vextq_u8(dataB, dataB, 8), shuf4);
+ #else
+             uint8x8_t shuf1l = vget_low_u8(shuf1);
+             uint8x8_t shuf1h = vget_high_u8(shuf1);
+             uint8x8_t shuf2l = vget_low_u8(shuf2);
+             uint8x8_t shuf2h = vget_high_u8(shuf2);
+             uint8x8_t shuf3l = vget_low_u8(shuf3);
+             uint8x8_t shuf3h = vget_high_u8(shuf3);
+             uint8x8_t shuf4l = vget_low_u8(shuf4);
+             uint8x8_t shuf4h = vget_high_u8(shuf4);
+             uint8x16_t data1A = vcombine_u8(vtbx1_u8(shuf1l, vget_low_u8(dataA), shuf1l),
+                 vtbx1_u8(shuf1h, vget_low_u8(dataA), shuf1h));
+             uint8x16_t data2A = vcombine_u8(vtbx1_u8(shuf2l, vget_high_u8(dataA), shuf2l),
+                 vtbx1_u8(shuf2h, vget_high_u8(dataA), shuf2h));
+             uint8x16_t data1B = vcombine_u8(vtbx1_u8(shuf3l, vget_low_u8(dataB), shuf3l),
+                 vtbx1_u8(shuf3h, vget_low_u8(dataB), shuf3h));
+             uint8x16_t data2B = vcombine_u8(vtbx1_u8(shuf4l, vget_high_u8(dataB), shuf4l),
+                 vtbx1_u8(shuf4h, vget_high_u8(dataB), shuf4h));
+ #endif
+
+             // store out
+             uint32_t counts = vget_lane_u32(vreinterpret_u32_u8(vcnt_u8(cmpPacked)), 0);
+             counts += 0x08080808;
+
+             unsigned char shuf1Len = counts & 0xff;
+             unsigned char shuf2Len = (counts>>8) & 0xff;
+             unsigned char shuf3Len = (counts>>16) & 0xff;
+             unsigned char shuf4Len = (counts>>24) & 0xff;
+             uint32_t shufTotalLen = counts * 0x1010101;
+             shufTotalLen >>= 24;
+
+             vst1q_u8(p, data1A);
+             p += shuf1Len;
+             vst1q_u8(p, data2A);
+             p += shuf2Len;
+             vst1q_u8(p, data1B);
+             p += shuf3Len;
+             vst1q_u8(p, data2B);
+             p += shuf4Len;
+             col += shufTotalLen;
+
+             if(LIKELIHOOD(0.3, col >= 0)) {
+                 // we overflowed - find correct position to revert back to
+                 long revert = col;
+                 long len2ndHalf = shuf3Len+shuf4Len;
+                 long shiftAmt = len2ndHalf - col -1;
+                 uint32_t eqMaskHalf;
+                 if(HEDLEY_UNLIKELY(shiftAmt < 0)) {
+                     eqMaskHalf = (expandLUT[m2] << shuf1Len) | expandLUT[m1];
+                     eqMaskHalf >>= shufTotalLen - col -1;
+                     i += len2ndHalf - 16;
+                 } else {
+                     eqMaskHalf = (expandLUT[m4] << shuf3Len) | expandLUT[m3];
+                     eqMaskHalf >>= shiftAmt;
+                 }
+                 revert += eqMaskHalf & 1;
+
+                 // count bits in eqMask
+                 uint8x8_t vCnt = vcnt_u8(vreinterpret_u8_u32(vmov_n_u32(eqMaskHalf)));
+                 uint32_t cnt = vget_lane_u32(vreinterpret_u32_u8(vCnt), 0);
+                 cnt *= 0x1010101;
+                 i += cnt >> 24;
+
+                 p -= revert;
+                 i -= revert;
+                 goto _encode_eol_handle_pre;
+             }
+         } else {
+             {
+ #ifdef __aarch64__
+ # ifdef _MSC_VER
+                 // does this work?
+                 if(_BitScanReverse64((unsigned long*)&bitIndex, mask))
+                     bitIndex ^= 63;
+                 else
+                     bitIndex = 64;
+ # else
+                 bitIndex = __builtin_clzll(mask); // TODO: is the 'undefined if 0' case problematic here?
+ # endif
+ #else
+ # ifdef __GNUC__
+                 bitIndex = __builtin_clz(mask);
+ # elif defined(_MSC_VER)
+                 bitIndex = _arm_clz(mask);
+ # else
+                 bitIndex = __clz(mask); // ARM compiler?
+ # endif
+ #endif
+
+                 uint8x16_t vClz = vdupq_n_u8(bitIndex & ~(sizeof(mask)*8));
+ #ifdef __aarch64__
+                 uint8x16_t blendA = vcgeq_u8(vmakeq_u8(63,62,61,60,51,50,49,48,47,46,45,44,35,34,33,32), vClz);
+                 uint8x16_t blendB = vcgeq_u8(vmakeq_u8(31,30,29,28,19,18,17,16,15,14,13,12, 3, 2, 1, 0), vClz);
+ #else
+                 uint8x16_t blendA = vcgeq_u8(vmakeq_u8(31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16), vClz);
+                 uint8x16_t blendB = vcgeq_u8(vmakeq_u8(15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), vClz);
+ #endif
+                 uint8x16_t dataAShifted = vextq_u8(dataA, dataA, 15);
+                 uint8x16_t dataBShifted = vextq_u8(dataA, dataB, 15);
+                 dataA = vbslq_u8(cmpA, vdupq_n_u8('='), dataA);
+                 uint8x16_t outDataB = vbslq_u8(cmpB, vdupq_n_u8('='), dataB);
+                 dataA = vbslq_u8(blendA, dataA, dataAShifted);
+                 outDataB = vbslq_u8(blendB, outDataB, dataBShifted);
+
+                 vst1q_u8_x2_unaligned(p, vcreate2_u8(dataA, outDataB));
+                 p += sizeof(uint8x16_t)*2;
+                 // write last byte
+                 *p = vgetq_lane_u8(dataB, 15);
+                 p += (mask != 0);
+                 col += (mask != 0) + sizeof(uint8x16_t)*2;
+             }
+
+             if(HEDLEY_UNLIKELY(col >= 0)) {
+ #ifdef __aarch64__
+                 // fixup bitIndex
+                 bitIndex -= ((bitIndex+4)>>4)<<3;
+ #endif
+                 bitIndex = bitIndex +1;
+                 if(HEDLEY_UNLIKELY(col == bitIndex)) {
+                     // this is an escape character, so line will need to overflow
+                     p--;
+                 } else {
+                     i += (col > bitIndex);
+                 }
+                 p -= col;
+                 i -= col;
+
+                 _encode_eol_handle_pre:
+                 encode_eol_handle_pre(es, i, p, col, lineSizeOffset);
+             }
+         }
+     }
+
+     *colOffset = col + line_size -1;
+     dest = p;
+     len = -(i - INPUT_OFFSET);
+ }
+
+ void encoder_neon_init() {
+     _do_encode = &do_encode_simd<do_encode_neon>;
+     // generate shuf LUT
+     for(int i=0; i<256; i++) {
+         int k = i;
+         uint16_t expand = 0;
+         uint8_t* res = (uint8_t*)(shufLUT + i);
+         int p = 0;
+         for(int j=0; j<8; j++) {
+             if(k & 1) {
+                 res[j+p] = '=';
+                 expand |= 1<<(j+p);
+                 p++;
+             }
+             res[j+p] = j;
+             k >>= 1;
+         }
+         for(; p<8; p++)
+             res[8+p] = 8+p +0x80; // +0x80 => 0 discarded entries; has no effect other than to ease debugging
+
+         expandLUT[i] = expand;
+     }
+ }
+ #else
+ void encoder_neon_init() {}
+ #endif /* defined(__ARM_NEON) */
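
The kernels above vectorize the plain yEnc byte transform 32 bytes at a time: add 42 to each byte (mod 256); escape NUL, CR, LF and '=' anywhere, plus TAB/space (and a leading '.') around line boundaries, by emitting '=' followed by the byte plus a further 64; and wrap lines with CRLF at line_size columns. As a reading aid, here is a minimal scalar sketch of that transform; the escape rules are a simplified reading of the LUT constants in the code above, not the package's own scalar path:

#include <cstddef>
#include <cstdint>
#include <string>

// Scalar illustration of the transform do_encode_neon performs in 32-byte blocks.
// Escape rules are inferred from the constants above, not copied from yencode.
static std::string yenc_encode_scalar(const uint8_t* src, size_t len, int line_size) {
    std::string out;
    int col = 0;
    for (size_t n = 0; n < len; n++) {
        uint8_t c = src[n] + 42;                       // core yEnc offset (wraps mod 256)
        bool critical = (c == 0 || c == '\r' || c == '\n' || c == '=');
        bool atLineEdge = (col == 0 || col == line_size - 1);
        if (critical || (atLineEdge && (c == '\t' || c == ' ')) || (col == 0 && c == '.')) {
            out += '=';                                // escape marker
            c += 64;                                   // escaped byte gets a further +64
            col++;
        }
        out += (char)c;
        if (++col >= line_size) { out += "\r\n"; col = 0; }
    }
    return out;
}

The NEON code reaches the same result without per-byte branches: cmpA/cmpB mark bytes that need escaping, shufLUT supplies a shuffle pattern that splats an '=' in front of each marked byte within an 8-byte half, and expandLUT records where those inserted '=' land so the line-wrap position can be corrected afterwards.
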
package/src/encoder_sse2.cc (new file)
@@ -0,0 +1,13 @@
+ #include "common.h"
+
+ #ifdef __SSE2__
+ #include "encoder_sse_base.h"
+
+ void encoder_sse2_init() {
+     _do_encode = &do_encode_simd< do_encode_sse<ISA_LEVEL_SSE2> >;
+     encoder_sse_lut<ISA_LEVEL_SSE2>();
+ }
+ #else
+ void encoder_sse2_init() {}
+ #endif
+
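
The new encoder_*.cc and decoder_*.cc units all follow the pattern visible in this file: the ISA-specific kernel is compiled only behind a feature-test macro (__SSE2__ here, __ARM_NEON above), and the exported *_init() function does nothing more than point the shared function pointer (_do_encode) at the best kernel, so a runtime dispatcher (src/platform.cc appears to play this role) can pick an implementation on the host CPU. A self-contained sketch of that dispatch pattern, using made-up names (encode_fn, g_do_encode, encode_sse2_kernel) rather than the package's actual declarations:

#include <cstddef>
#include <cstdint>
#include <cstring>

// Hypothetical illustration of the init/dispatch pattern; none of these names
// are the package's own API.
typedef size_t (*encode_fn)(const uint8_t* src, size_t len, uint8_t* dst);

// Baseline kernel: always compiled, always correct.
static size_t encode_scalar(const uint8_t* src, size_t len, uint8_t* dst) {
    memcpy(dst, src, len);   // stands in for the real per-byte transform
    return len;
}

static encode_fn g_do_encode = &encode_scalar;  // safe default until an init() upgrades it

#ifdef __SSE2__
// In the package this slot is filled by do_encode_simd< do_encode_sse<ISA_LEVEL_SSE2> >.
static size_t encode_sse2_kernel(const uint8_t* src, size_t len, uint8_t* dst) {
    return encode_scalar(src, len, dst);  // placeholder body
}
void encoder_sse2_init_example() { g_do_encode = &encode_sse2_kernel; }
#else
void encoder_sse2_init_example() {}  // mirrors the empty fallback above
#endif

Callers never name a kernel directly; everything goes through the function pointer, which is presumably why binding.gyp grew so much in this release: each ISA-specific source file can be built with its own compiler flags while the rest of the addon stays ISA-agnostic.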