yencode 1.1.1 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -19,44 +19,29 @@
 
  #include "crc_common.h"
 
- #if !defined(_MSC_VER) || defined(_STDINT) || _MSC_VER >= 1900
- # include <stdint.h>
- #else
- /* Workaround for older MSVC not supporting stdint.h - just pull it from V8 */
- # include <v8.h>
- #endif
-
- #if (defined(__PCLMUL__) && defined(__SSSE3__) && defined(__SSE4_1__)) || (defined(_MSC_VER) && _MSC_VER >= 1600)
+ #if (defined(__PCLMUL__) && defined(__SSSE3__) && defined(__SSE4_1__)) || (defined(_MSC_VER) && _MSC_VER >= 1600 && defined(PLATFORM_X86))
  #include <inttypes.h>
  #include <immintrin.h>
  #include <wmmintrin.h>
 
- #define local static
 
- #ifdef _MSC_VER
- # define ALIGN(_a, v) __declspec(align(_a)) v
- /* Because we don't have dynamic dispatch for AVX, disable it for MSVC builds (only use AVX for -march=native style builds) */
- # undef __AVX__
- # undef __AVX512F__
- # undef __AVX512VL__
- # undef __GFNI__
- #else
- # define ALIGN(_a, v) v __attribute__((aligned(_a)))
+ #if defined(__AVX512VL__) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
+ # define ENABLE_AVX512 1
  #endif
 
 
  // interestingly, MSVC seems to generate better code if using VXORPS over VPXOR
  // original Intel code uses XORPS for many XOR operations, but PXOR is pretty much always better (more port freedom on Intel CPUs). The only advantage of XORPS is that it's 1 byte shorter, an advantage which disappears under AVX as both instructions have the same length
- #ifdef __AVX__
+ #if defined(__AVX__) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
  # define fold_xor _mm_xor_si128
  #else
- local __m128i fold_xor(__m128i a, __m128i b) {
+ static __m128i fold_xor(__m128i a, __m128i b) {
  return _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b)));
  }
  #endif
 
- #ifdef __AVX512VL__
- local __m128i do_one_fold_merge(__m128i src, __m128i data) {
+ #ifdef ENABLE_AVX512
+ static __m128i do_one_fold_merge(__m128i src, __m128i data) {
  const __m128i xmm_fold4 = _mm_set_epi32(
  0x00000001, 0x54442bd4,
  0x00000001, 0xc6e41596);
@@ -68,7 +53,7 @@ local __m128i do_one_fold_merge(__m128i src, __m128i data) {
  );
  }
  #else
- local __m128i do_one_fold(__m128i src) {
+ static __m128i do_one_fold(__m128i src) {
  const __m128i xmm_fold4 = _mm_set_epi32(
  0x00000001, 0x54442bd4,
  0x00000001, 0xc6e41596);
@@ -79,7 +64,7 @@ local __m128i do_one_fold(__m128i src) {
  }
  #endif
 
- ALIGN(32, local const unsigned pshufb_shf_table[60]) = {
+ ALIGN_TO(32, static const unsigned pshufb_shf_table[60]) = {
  0x84838281, 0x88878685, 0x8c8b8a89, 0x008f8e8d, /* shl 15 (16 - 1)/shr1 */
  0x85848382, 0x89888786, 0x8d8c8b8a, 0x01008f8e, /* shl 14 (16 - 3)/shr2 */
  0x86858483, 0x8a898887, 0x8e8d8c8b, 0x0201008f, /* shl 13 (16 - 4)/shr3 */
@@ -97,7 +82,7 @@ ALIGN(32, local const unsigned pshufb_shf_table[60]) = {
  0x0201008f, 0x06050403, 0x0a090807, 0x0e0d0c0b /* shl 1 (16 -15)/shr15*/
  };
 
- local void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1,
+ static void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1,
  __m128i *xmm_crc2, __m128i *xmm_crc3, __m128i *xmm_crc_part) {
 
  const __m128i xmm_mask3 = _mm_set1_epi32(0x80808080);
@@ -127,7 +112,7 @@ local void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1,
  *xmm_crc_part = _mm_shuffle_epi8(*xmm_crc_part, xmm_shl);
  *xmm_crc3 = _mm_or_si128(*xmm_crc3, *xmm_crc_part);
 
- #ifdef __AVX512VL__
+ #ifdef ENABLE_AVX512
  *xmm_crc3 = do_one_fold_merge(xmm_a0_0, *xmm_crc3);
  #else
  *xmm_crc3 = fold_xor(
@@ -137,25 +122,21 @@ local void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1,
  #endif
  }
 
- ALIGN(16, local const unsigned crc_k[]) = {
+ ALIGN_TO(16, static const unsigned crc_k[]) = {
  0xccaa009e, 0x00000000, /* rk1 */
  0x751997d0, 0x00000001, /* rk2 */
  0xccaa009e, 0x00000000, /* rk5 */
  0x63cd6124, 0x00000001, /* rk6 */
- 0xf7011640, 0x00000001, /* rk7 */
+ 0xf7011641, 0x00000000, /* rk7 */
  0xdb710640, 0x00000001 /* rk8 */
  };
 
- ALIGN(16, local const unsigned crc_mask[4]) = {
- 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000
- };
-
- ALIGN(16, local const unsigned crc_mask2[4]) = {
+ ALIGN_TO(16, static const unsigned crc_mask[4]) = {
  0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
  };
 
- local __m128i reverse_bits_epi8(__m128i src) {
- #ifdef __GFNI__
+ static __m128i reverse_bits_epi8(__m128i src) {
+ #if defined(__GFNI__) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
  return _mm_gf2p8affine_epi64_epi8(src, _mm_set_epi32(
  0x80402010, 0x08040201,
  0x80402010, 0x08040201
@@ -164,7 +145,8 @@ local __m128i reverse_bits_epi8(__m128i src) {
  __m128i xmm_t0 = _mm_and_si128(src, _mm_set1_epi8(0x0f));
  __m128i xmm_t1 = _mm_and_si128(_mm_srli_epi16(src, 4), _mm_set1_epi8(0x0f));
  xmm_t0 = _mm_shuffle_epi8(_mm_set_epi8(
- 0xf0, 0x70, 0xb0, 0x30, 0xd0, 0x50, 0x90, 0x10, 0xe0, 0x60, 0xa0, 0x20, 0xc0, 0x40, 0x80, 0
+ -16, 112, -80, 48, -48, 80, -112, 16, -32, 96, -96, 32, -64, 64, -128, 0
+ //0xf0, 0x70, 0xb0, 0x30, 0xd0, 0x50, 0x90, 0x10, 0xe0, 0x60, 0xa0, 0x20, 0xc0, 0x40, 0x80, 0
  ), xmm_t0);
  xmm_t1 = _mm_shuffle_epi8(_mm_set_epi8(
  15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0
@@ -181,7 +163,7 @@ local __m128i reverse_bits_epi8(__m128i src) {
  # define BSWAP32(n) ((((n)&0xff)<<24) | (((n)&0xff00)<<8) | (((n)&0xff0000)>>8) | (((n)&0xff000000)>>24))
  #endif
 
- local uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
+ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
  unsigned long algn_diff;
  __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
 
@@ -235,7 +217,7 @@ local uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
  xmm_t2 = _mm_load_si128((__m128i *)src + 2);
  xmm_t3 = _mm_load_si128((__m128i *)src + 3);
 
- #ifdef __AVX512VL__
+ #ifdef ENABLE_AVX512
  xmm_crc0 = do_one_fold_merge(xmm_crc0, xmm_t0);
  xmm_crc1 = do_one_fold_merge(xmm_crc1, xmm_t1);
  xmm_crc2 = do_one_fold_merge(xmm_crc2, xmm_t2);
@@ -266,7 +248,7 @@ local uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
  xmm_t2 = _mm_load_si128((__m128i *)src + 2);
 
  xmm_t3 = xmm_crc3;
- #ifdef __AVX512VL__
+ #ifdef ENABLE_AVX512
  xmm_crc3 = do_one_fold_merge(xmm_crc2, xmm_t2);
  xmm_crc2 = do_one_fold_merge(xmm_crc1, xmm_t1);
  xmm_crc1 = do_one_fold_merge(xmm_crc0, xmm_t0);
@@ -292,7 +274,7 @@ local uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
 
  xmm_t2 = xmm_crc2;
  xmm_t3 = xmm_crc3;
- #ifdef __AVX512VL__
+ #ifdef ENABLE_AVX512
  xmm_crc3 = do_one_fold_merge(xmm_crc1, xmm_t1);
  xmm_crc2 = do_one_fold_merge(xmm_crc0, xmm_t0);
  #else
@@ -314,7 +296,7 @@ local uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
  xmm_t0 = _mm_load_si128((__m128i *)src);
 
  xmm_t3 = xmm_crc3;
- #ifdef __AVX512VL__
+ #ifdef ENABLE_AVX512
  xmm_crc3 = do_one_fold_merge(xmm_crc0, xmm_t0);
  #else
  xmm_crc3 = _mm_xor_si128(do_one_fold(xmm_crc0), xmm_t0);
@@ -339,8 +321,7 @@ partial:
  &xmm_crc_part);
  done:
  {
- const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask);
- const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
+ const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask);
  __m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;
 
  /*
@@ -350,7 +331,7 @@ done:
 
  x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
  xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
- #ifdef __AVX512VL__
+ #ifdef ENABLE_AVX512
  xmm_crc1 = _mm_ternarylogic_epi32(xmm_crc1, x_tmp0, xmm_crc0, 0x96);
  #else
  xmm_crc1 = _mm_xor_si128(xmm_crc1, x_tmp0);
@@ -359,7 +340,7 @@ done:
 
  x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10);
  xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01);
- #ifdef __AVX512VL__
+ #ifdef ENABLE_AVX512
  xmm_crc2 = _mm_ternarylogic_epi32(xmm_crc2, x_tmp1, xmm_crc1, 0x96);
  #else
  xmm_crc2 = _mm_xor_si128(xmm_crc2, x_tmp1);
@@ -368,7 +349,7 @@ done:
 
  x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10);
  xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01);
- #ifdef __AVX512VL__
+ #ifdef ENABLE_AVX512
  xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, x_tmp2, xmm_crc2, 0x96);
  #else
  xmm_crc3 = _mm_xor_si128(xmm_crc3, x_tmp2);
@@ -388,58 +369,43 @@ done:
  xmm_crc0 = xmm_crc3;
  xmm_crc3 = _mm_slli_si128(xmm_crc3, 4);
  xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
- #ifdef __AVX512VL__
+ #ifdef ENABLE_AVX512
  //xmm_crc3 = _mm_maskz_xor_epi32(14, xmm_crc3, xmm_crc0);
- xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, xmm_crc0, xmm_mask2, 0x28);
+ xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, xmm_crc0, xmm_mask, 0x28);
  #else
+ xmm_crc0 = _mm_and_si128(xmm_crc0, xmm_mask);
  xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
- xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask2);
  #endif
 
  /*
  * k7
  */
  xmm_crc1 = xmm_crc3;
- xmm_crc2 = xmm_crc3;
  crc_fold = _mm_load_si128((__m128i *)crc_k + 2);
 
  xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
- #ifdef __AVX512VL__
- //xmm_crc3 = _mm_maskz_xor_epi32(3, xmm_crc3, xmm_crc2);
- xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, xmm_crc2, xmm_mask, 0x28);
- #else
- xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
- xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask);
- #endif
-
- xmm_crc2 = xmm_crc3;
  xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
- #ifdef __AVX512VL__
- xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, xmm_crc2, xmm_crc1, 0x69); // NOT(double-XOR)
- return _mm_extract_epi32(xmm_crc3, 2);
+ #ifdef ENABLE_AVX512
+ xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, xmm_crc1, xmm_crc1, 0xC3); // NOT(xmm_crc3 ^ xmm_crc1)
  #else
- xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
+ xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_mask);
  xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);
- return ~_mm_extract_epi32(xmm_crc3, 2);
  #endif
+ return _mm_extract_epi32(xmm_crc3, 2);
  }
 
  }
 
- static void do_crc32_clmul(const void* data, size_t length, unsigned char out[4]) {
- uint32_t tmp = crc_fold((const unsigned char*)data, (long)length, 0);
- UNPACK_4(out, tmp);
- }
- static void do_crc32_incremental_clmul(const void* data, size_t length, unsigned char init[4]) {
- uint32_t tmp = crc_fold((const unsigned char*)data, (long)length, PACK_4(init));
- UNPACK_4(init, tmp);
+ static uint32_t do_crc32_incremental_clmul(const void* data, size_t length, uint32_t init) {
+ return crc_fold((const unsigned char*)data, (long)length, init);
  }
 
- void crc_clmul_set_funcs(crc_func* _do_crc32, crc_func* _do_crc32_incremental) {
- *_do_crc32 = &do_crc32_clmul;
+ void crc_clmul_set_funcs(crc_func* _do_crc32_incremental) {
  *_do_crc32_incremental = &do_crc32_incremental_clmul;
  }
  #else
- void crc_clmul_set_funcs(crc_func* _do_crc32, crc_func* _do_crc32_incremental) {}
+ void crc_clmul_set_funcs(crc_func* _do_crc32_incremental) {
+ (void)_do_crc32_incremental;
+ }
  #endif
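
Note on the CRC changes above: 1.1.2 removes the one-shot do_crc32_clmul wrapper and its PACK_4/UNPACK_4 byte-array marshalling, leaving a single incremental entry point that takes and returns a uint32_t state, and crc_clmul_set_funcs now installs only that one pointer. The sketch below shows how a caller might consume the reduced interface; it is illustrative only — the crc_func typedef is assumed to match the new signature (the real typedef lives in crc_common.h, outside this diff), and the package itself presumably installs a portable fallback before calling the setter.

#include <stddef.h>
#include <stdint.h>

/* Assumed stand-in for the typedef in crc_common.h (not shown in this diff) */
typedef uint32_t (*crc_func)(const void* data, size_t length, uint32_t init);

/* Provided by the translation unit diffed above */
void crc_clmul_set_funcs(crc_func* _do_crc32_incremental);

static crc_func do_crc32_incremental; /* zero-initialised; stays NULL if CLMUL support isn't compiled in */

uint32_t crc32_buffer(const void* buf, size_t len) {
    if (do_crc32_incremental == NULL)
        crc_clmul_set_funcs(&do_crc32_incremental); /* a no-op in the non-CLMUL build */
    if (do_crc32_incremental == NULL)
        return 0; /* a real caller would fall back to a scalar CRC32 here */
    return do_crc32_incremental(buf, len, 0); /* 0 starts a fresh CRC, as the removed one-shot wrapper did */
}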
 
package/src/decoder.cc CHANGED
@@ -1,10 +1,13 @@
  #include "common.h"
 
  #include "decoder_common.h"
+ #include "decoder.h"
 
- YencDecoderEnd (*_do_decode)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*) = &do_decode_scalar<false, false>;
- YencDecoderEnd (*_do_decode_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*) = &do_decode_scalar<true, false>;
- YencDecoderEnd (*_do_decode_end_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*) = &do_decode_end_scalar<true>;
+ extern "C" {
+ YencDecoderEnd (*_do_decode)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*) = &do_decode_scalar<false, false>;
+ YencDecoderEnd (*_do_decode_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*) = &do_decode_scalar<true, false>;
+ YencDecoderEnd (*_do_decode_end_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*) = &do_decode_end_scalar<true>;
+ }
 
  void decoder_set_sse2_funcs();
  void decoder_set_ssse3_funcs();
package/src/decoder.h CHANGED
@@ -1,3 +1,11 @@
+ #ifndef __YENC_DECODER_H
+ #define __YENC_DECODER_H
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+
 
  // the last state that the decoder was in (i.e. last few characters processed)
  // the state is needed for incremental decoders as its behavior is affected by what it processed last
@@ -25,8 +33,7 @@ extern YencDecoderEnd (*_do_decode)(const unsigned char*HEDLEY_RESTRICT*, unsign
  extern YencDecoderEnd (*_do_decode_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*);
  extern YencDecoderEnd (*_do_decode_end_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*);
 
- template<bool isRaw>
- static inline size_t do_decode(const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, YencDecoderState* state) {
+ static inline size_t do_decode(int isRaw, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, YencDecoderState* state) {
  unsigned char* ds = dest;
  (*(isRaw ? _do_decode_raw : _do_decode))(&src, &ds, len, state);
  return ds - dest;
@@ -37,3 +44,10 @@ static inline YencDecoderEnd do_decode_end(const unsigned char*HEDLEY_RESTRICT*
  }
 
  void decoder_init();
+
+
+
+ #ifdef __cplusplus
+ }
+ #endif
+ #endif
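
Note on the decoder changes above: decoder.h gains an include guard and an extern "C" wrapper, decoder.cc places the function-pointer definitions inside a matching extern "C" block, and do_decode drops its template<bool isRaw> parameter in favour of a plain int argument, so the header becomes usable from C as well as C++. A rough usage sketch follows, under stated assumptions: the package's common.h is taken to supply the HEDLEY_* macros that decoder.h relies on, and the zero state value is a placeholder for whichever start-of-stream constant the full YencDecoderState enum (not shown in this diff) defines.

#include <stddef.h>
#include "common.h"   /* assumed to provide the HEDLEY_* macros used by decoder.h */
#include "decoder.h"

size_t decode_raw_chunk(const unsigned char* src, unsigned char* dst, size_t len) {
    YencDecoderState state = (YencDecoderState)0; /* placeholder initial state */
    decoder_init(); /* in real use, call once at startup to pick the best SIMD kernel */
    return do_decode(1, src, dst, len, &state); /* 1 = raw mode (dot-stuffed NNTP input) */
}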
@@ -30,13 +30,17 @@ static HEDLEY_ALWAYS_INLINE __m256i force_align_read_256(const void* p) {
  }
 
  // _mm256_castsi128_si256, but upper is defined to be 0
- #if defined(__clang__) && __clang_major__ >= 5
+ #if (defined(__clang__) && __clang_major__ >= 5 && (!defined(__APPLE__) || __clang_major__ >= 7)) || (defined(__GNUC__) && __GNUC__ >= 10)
  // intrinsic unsupported in GCC 9 and MSVC < 2017
  # define zext128_256 _mm256_zextsi128_si256
  #else
  // technically a cast is incorrect, due to upper 128 bits being undefined, but should usually work fine
  // alternative may be `_mm256_set_m128i(_mm_setzero_si128(), v)` but unsupported on GCC < 7, and most compilers generate a VINSERTF128 instruction for it
- # define zext128_256 _mm256_castsi128_si256
+ # ifdef __OPTIMIZE__
+ # define zext128_256 _mm256_castsi128_si256
+ # else
+ # define zext128_256(x) _mm256_inserti128_si256(_mm256_setzero_si256(), x, 0)
+ # endif
  #endif
 
 
@@ -298,7 +302,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
  if(LIKELIHOOD(0.002, matchEnd)) {
  // terminator found
  // there's probably faster ways to do this, but reverting to scalar code should be good enough
- len += i;
+ len += (long)i;
  break;
  }
  }
@@ -390,7 +394,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
  ));
  }
  if(endFound) {
- len += i;
+ len += (long)i;
  break;
  }
  }
@@ -489,14 +493,10 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
  #endif
  {
  // << 1 byte
- cmpEqB = _mm256_cmpeq_epi8(_mm256_set1_epi8('='), _mm256_loadu_si256((__m256i *)(src+i-1) + 1));
- #if defined(__tune_znver1__) || defined(__tune_bdver4__)
  cmpEqA = _mm256_alignr_epi8(cmpEqA, _mm256_inserti128_si256(
- _mm256_setzero_si256(), _mm256_castsi256_si128(cmpEqA), 1
+ _mm256_set1_epi8('='), _mm256_castsi256_si128(cmpEqA), 1
  ), 15);
- #else
- cmpEqA = _mm256_alignr_epi8(cmpEqA, _mm256_permute2x128_si256(cmpEqA, cmpEqA, 0x08), 15);
- #endif
+ cmpEqB = _mm256_cmpeq_epi8(_mm256_set1_epi8('='), _mm256_loadu_si256((__m256i *)(src+i-1) + 1));
  dataA = _mm256_add_epi8(
  oDataA,
  _mm256_blendv_epi8(
@@ -523,7 +523,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
  #endif
  {
  yencOffset = _mm256_xor_si256(_mm256_set1_epi8(-42), zext128_256(
- _mm_slli_epi16(_mm_cvtsi32_si128(escFirst), 6)
+ _mm_slli_epi16(_mm_cvtsi32_si128((int)escFirst), 6)
  ));
  }
 
@@ -565,7 +565,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
  p -= popcnt32(mask & 0xffff0);
 
  _mm_storeu_si128((__m128i*)(p + XMM_SIZE*3), _mm256_extracti128_si256(dataB, 1));
- p -= popcnt32(mask >> 20);
+ p -= popcnt32((unsigned int)(mask >> 20));
  #else
  mask >>= 32;
  shuf = _mm256_inserti128_si256(
@@ -340,7 +340,7 @@ YencDecoderEnd do_decode_simd(const unsigned char* HEDLEY_RESTRICT* src, unsigne
  if((uintptr_t)(*src) & ((width-1))) {
  // find source memory alignment
  unsigned char* aSrc = (unsigned char*)(((uintptr_t)(*src) + (width-1)) & ~(width-1));
- int amount = aSrc - *src;
+ int amount = (int)(aSrc - *src);
  len -= amount;
  YencDecoderEnd ended = do_decode_scalar<isRaw, searchEnd>(src, dest, amount, pState);
  if(ended) return ended;
@@ -427,7 +427,7 @@ YencDecoderEnd do_decode_simd(const unsigned char* HEDLEY_RESTRICT* src, unsigne
  escFirst = (*pState == YDEC_STATE_EQ || *pState == YDEC_STATE_CRLFEQ);
 
  // our algorithm may perform an aligned load on the next part, of which we consider 2 bytes (for \r\n. sequence checking)
- long dLen = len - lenBuffer;
+ long dLen = (long)(len - lenBuffer);
  dLen = (dLen + (width-1)) & ~(width-1);
 
  kernel((const uint8_t*)(*src) + dLen, dLen, p, escFirst, nextMask);
@@ -7,9 +7,9 @@
  #include "decoder_common.h"
 
 
- #ifdef _MSC_VER
- # define vld1_u8_align vld1_u8_ex
- # define vld1q_u8_align vld1q_u8_ex
+ #if defined(_MSC_VER) && !defined(__clang__)
+ # define vld1_u8_align(p, a) vld1_u8_ex(p, a*8)
+ # define vld1q_u8_align(p, a) vld1q_u8_ex(p, a*8)
  #elif defined(__GNUC__)
  # define vld1_u8_align(p, n) vld1_u8((uint8_t*)__builtin_assume_aligned(p, n))
  # define vld1q_u8_align(p, n) vld1q_u8((uint8_t*)__builtin_assume_aligned(p, n))
@@ -23,15 +23,13 @@
  #if defined(__clang__) || (defined(__GNUC__) && (defined(__aarch64__) && __GNUC__ >= 8))
  # define vld1q_u8_x2_align(p, n) vld1q_u8_x2((uint8_t*)__builtin_assume_aligned(p, n))
  #else
- HEDLEY_ALWAYS_INLINE uint8x16x2_t vld1q_u8_x2_align(const uint8_t* p, int n) {
- return (uint8x16x2_t){vld1q_u8_align(p, n), vld1q_u8_align(p+16, n)};
- }
+ # define vld1q_u8_x2_align(p, n) vcreate2_u8(vld1q_u8_align(p, (n)/2), vld1q_u8_align((p)+16, (n)/2))
  #endif
  // Clang wrongly assumes alignment on vld1q_u8_x2, and ARMv7 GCC doesn't support the function, so effectively, it can only be used in ARMv8 compilers
  #if defined(__aarch64__) && (defined(__clang__) || (defined(__GNUC__) && __GNUC__ >= 9))
  # define vst1q_u8_x2_unaligned vst1q_u8_x2
  #else
- HEDLEY_ALWAYS_INLINE void vst1q_u8_x2_unaligned(uint8_t* p, uint8x16x2_t data) {
+ static HEDLEY_ALWAYS_INLINE void vst1q_u8_x2_unaligned(uint8_t* p, uint8x16x2_t data) {
  vst1q_u8(p, data.val[0]);
  vst1q_u8(p+16, data.val[1]);
  }
@@ -64,18 +62,20 @@ template<bool isRaw, bool searchEnd>
  HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, long& len, unsigned char* HEDLEY_RESTRICT & p, unsigned char& escFirst, uint16_t& nextMask) {
  HEDLEY_ASSUME(escFirst == 0 || escFirst == 1);
  HEDLEY_ASSUME(nextMask == 0 || nextMask == 1 || nextMask == 2);
- uint8x16_t yencOffset = escFirst ? (uint8x16_t){42+64,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42} : vdupq_n_u8(42);
+ uint8x16_t yencOffset = escFirst ? vmakeq_u8(42+64,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42) : vdupq_n_u8(42);
  #ifdef __aarch64__
  uint8x16_t nextMaskMix = vdupq_n_u8(0);
- if(nextMask)
- nextMaskMix[nextMask-1] = nextMask;
+ if(nextMask == 1)
+ nextMaskMix = vsetq_lane_u8(1, nextMaskMix, 0);
+ if(nextMask == 2)
+ nextMaskMix = vsetq_lane_u8(2, nextMaskMix, 1);
  #else
  uint8x16_t lfCompare = vdupq_n_u8('\n');
  if(isRaw) {
  if(nextMask == 1)
- lfCompare[0] = '.';
+ lfCompare = vsetq_lane_u8('.', lfCompare, 0);
  if(nextMask == 2)
- lfCompare[1] = '.';
+ lfCompare = vsetq_lane_u8('.', lfCompare, 1);
  }
  #endif
  long i;
@@ -90,13 +90,13 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
  #ifdef __aarch64__
  cmpA = vqtbx1q_u8(
  cmpEqA,
- // \n \r
- (uint8x16_t){0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0},
+ // \n \r
+ vmakeq_u8(0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
  dataA
  ),
  cmpB = vqtbx1q_u8(
  cmpEqB,
- (uint8x16_t){0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0},
+ vmakeq_u8(0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
  dataB
  );
  if(isRaw) cmpA = vorrq_u8(cmpA, nextMaskMix);
@@ -122,12 +122,12 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 
  #ifdef __aarch64__
  if (LIKELIHOOD(0.42 /*guess*/, neon_vect_is_nonzero(vorrq_u8(cmpA, cmpB)))) {
- cmpA = vandq_u8(cmpA, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
- cmpB = vandq_u8(cmpB, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
+ cmpA = vandq_u8(cmpA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+ cmpB = vandq_u8(cmpB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
  uint8x16_t cmpMerge = vpaddq_u8(cmpA, cmpB);
  uint8x16_t cmpEqMerge = vpaddq_u8(
- vandq_u8(cmpEqA, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128}),
- vandq_u8(cmpEqB, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128})
+ vandq_u8(cmpEqA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
+ vandq_u8(cmpEqB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128))
  );
 
  uint8x16_t cmpCombined = vpaddq_u8(cmpMerge, cmpEqMerge);
@@ -136,8 +136,8 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
  uint32_t mask = vgetq_lane_u32(vreinterpretq_u32_u8(cmpCombined), 0);
  uint32_t maskEq = vgetq_lane_u32(vreinterpretq_u32_u8(cmpCombined), 1);
  #else
- cmpA = vandq_u8(cmpA, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
- cmpB = vandq_u8(cmpB, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
+ cmpA = vandq_u8(cmpA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+ cmpB = vandq_u8(cmpB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
  // no vpaddq_u8 in ARMv7, so need extra 64-bit VPADD
  uint8x8_t cmpPacked = vpadd_u8(
  vpadd_u8(
@@ -150,8 +150,8 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
  cmpPacked = vpadd_u8(cmpPacked, cmpPacked);
  uint32_t mask = vget_lane_u32(vreinterpret_u32_u8(cmpPacked), 0);
  if(LIKELIHOOD(0.42, mask != 0)) {
- uint8x16_t cmpEqMaskedA = vandq_u8(cmpEqA, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
- uint8x16_t cmpEqMaskedB = vandq_u8(cmpEqB, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
+ uint8x16_t cmpEqMaskedA = vandq_u8(cmpEqA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+ uint8x16_t cmpEqMaskedB = vandq_u8(cmpEqB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
  uint8x8_t cmpEqPacked = vpadd_u8(
  vpadd_u8(
  vget_low_u8(cmpEqMaskedA), vget_high_u8(cmpEqMaskedA)
@@ -170,7 +170,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
  // vext seems to be a cheap operation on ARM, relative to loads, so only avoid it if there's only one load (isRaw only)
  uint8x16_t tmpData2, nextData;
  if(isRaw && !searchEnd) {
- tmpData2 = vld1q_u8_align(src+i + 2 + sizeof(uint8x16_t), 2);
+ tmpData2 = vld1q_u8(src+i + 2 + sizeof(uint8x16_t));
  } else {
  nextData = vld1q_u8_align(src+i + sizeof(uint8x16_t)*2, 16); // only 32-bits needed, but there doesn't appear a nice way to do this via intrinsics: https://stackoverflow.com/questions/46910799/arm-neon-intrinsics-convert-d-64-bit-register-to-low-half-of-q-128-bit-regis
  tmpData2 = vextq_u8(dataB, nextData, 2);
@@ -255,15 +255,15 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
  }
  }
  #ifdef __aarch64__
- uint8x16_t match2NlDotBMasked = vandq_u8(match2NlDotB, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
+ uint8x16_t match2NlDotBMasked = vandq_u8(match2NlDotB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
  uint8x16_t mergeKillDots = vpaddq_u8(
- vandq_u8(match2NlDotA, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128}),
+ vandq_u8(match2NlDotA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
  match2NlDotBMasked
  );
  uint8x8_t mergeKillDots2 = vget_low_u8(vpaddq_u8(mergeKillDots, mergeKillDots));
  #else
- uint8x16_t match2NlDotMaskedA = vandq_u8(match2NlDotA, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
- uint8x16_t match2NlDotMaskedB = vandq_u8(match2NlDotB, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
+ uint8x16_t match2NlDotMaskedA = vandq_u8(match2NlDotA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+ uint8x16_t match2NlDotMaskedB = vandq_u8(match2NlDotB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
  uint8x8_t mergeKillDots2 = vpadd_u8(
  vpadd_u8(
  vget_low_u8(match2NlDotMaskedA), vget_high_u8(match2NlDotMaskedA)
@@ -342,11 +342,11 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
  #ifdef __aarch64__
  uint8x16_t vMaskEqA = vqtbl1q_u8(
  vcombine_u8(maskEqTemp, vdup_n_u8(0)),
- (uint8x16_t){0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1}
+ vmakeq_u8(0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1)
  );
  uint8x16_t vMaskEqB = vqtbl1q_u8(
  vcombine_u8(maskEqTemp, vdup_n_u8(0)),
- (uint8x16_t){2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3}
+ vmakeq_u8(2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3)
  );
  #else
  uint8x16_t vMaskEqA = vcombine_u8(
@@ -358,8 +358,8 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
  vdup_lane_u8(maskEqTemp, 3)
  );
  #endif
- vMaskEqA = vtstq_u8(vMaskEqA, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
- vMaskEqB = vtstq_u8(vMaskEqB, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
+ vMaskEqA = vtstq_u8(vMaskEqA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+ vMaskEqB = vtstq_u8(vMaskEqB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
 
  dataA = vsubq_u8(
  dataA,
@@ -391,7 +391,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
  )
  );
  }
- yencOffset[0] = (escFirst << 6) | 42;
+ yencOffset = vsetq_lane_u8((escFirst << 6) | 42, yencOffset, 0);
 
  // all that's left is to 'compress' the data (skip over masked chars)
  uint32_t counts = 0x08080808 - vget_lane_u32(vreinterpret_u32_u8(vcnt_u8(cmpPacked)), 0);
@@ -439,7 +439,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
  } else {
  dataA = vsubq_u8(dataA, yencOffset);
  dataB = vsubq_u8(dataB, vdupq_n_u8(42));
- vst1q_u8_x2_unaligned(p, ((uint8x16x2_t){dataA, dataB}));
+ vst1q_u8_x2_unaligned(p, vcreate2_u8(dataA, dataB));
  p += sizeof(uint8x16_t)*2;
  escFirst = 0;
  #ifdef __aarch64__
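
Note on the NEON changes above: the GNU-extension vector literals ((uint8x16_t){...}) and per-lane subscripting (lfCompare[0] = '.') are replaced with vmakeq_u8, vcreate2_u8 and vsetq_lane_u8, which keeps the code buildable on compilers without those extensions (notably MSVC). The helpers themselves are defined outside this diff; the definitions below are only one plausible shape for them, shown for orientation, not the package's actual code.

#include <arm_neon.h>

/* Illustrative stand-ins only, hence the _sketch suffix */
static inline uint8x16_t vmakeq_u8_sketch(
    uint8_t e0, uint8_t e1, uint8_t e2, uint8_t e3, uint8_t e4, uint8_t e5, uint8_t e6, uint8_t e7,
    uint8_t e8, uint8_t e9, uint8_t e10, uint8_t e11, uint8_t e12, uint8_t e13, uint8_t e14, uint8_t e15)
{
    /* build the vector from an array instead of a GNU-style vector literal */
    const uint8_t v[16] = {e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15};
    return vld1q_u8(v);
}

static inline uint8x16x2_t vcreate2_u8_sketch(uint8x16_t lo, uint8x16_t hi) {
    uint8x16x2_t r; /* pair the two halves without a compound literal */
    r.val[0] = lo;
    r.val[1] = hi;
    return r;
}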