yencode 1.1.0 → 1.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,230 @@
+ // 256-bit version of crc_folding
+
+ #include "crc_common.h"
+
+ #if !defined(YENC_DISABLE_AVX256) && ((defined(__VPCLMULQDQ__) && defined(__AVX2__) && defined(__PCLMUL__)) || (defined(_MSC_VER) && _MSC_VER >= 1920 && defined(PLATFORM_X86) && !defined(__clang__)))
+ #include <inttypes.h>
+ #include <immintrin.h>
+
+
+ #if defined(__AVX512VL__) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
+ # define ENABLE_AVX512 1
+ #endif
+
+ static __m256i do_one_fold(__m256i src, __m256i data) {
+ const __m256i fold4 = _mm256_set_epi32(
+ 0x00000001, 0x54442bd4,
+ 0x00000001, 0xc6e41596,
+ 0x00000001, 0x54442bd4,
+ 0x00000001, 0xc6e41596
+ );
+ #ifdef ENABLE_AVX512
+ return _mm256_ternarylogic_epi32(
+ _mm256_clmulepi64_epi128(src, fold4, 0x01),
+ _mm256_clmulepi64_epi128(src, fold4, 0x10),
+ data,
+ 0x96
+ );
+ #else
+ return _mm256_xor_si256(data, _mm256_xor_si256(
+ _mm256_clmulepi64_epi128(src, fold4, 0x01),
+ _mm256_clmulepi64_epi128(src, fold4, 0x10)
+ ));
+ #endif
+ }
+
+ ALIGN_TO(32, static const uint8_t pshufb_rot_table[]) = {
+ 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
+ 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
+ };
+ // _mm256_castsi128_si256, but upper is defined to be 0
+ #if (defined(__clang__) && __clang_major__ >= 5 && (!defined(__APPLE__) || __clang_major__ >= 7)) || (defined(__GNUC__) && __GNUC__ >= 10)
+ // intrinsic unsupported in GCC 9 and MSVC < 2017
+ # define zext128_256 _mm256_zextsi128_si256
+ #else
+ // technically a cast is incorrect, due to upper 128 bits being undefined, but should usually work fine
+ // alternative may be `_mm256_set_m128i(_mm_setzero_si128(), v)` but unsupported on GCC < 7, and most compilers generate a VINSERTF128 instruction for it
+ # ifdef __OPTIMIZE__
+ # define zext128_256 _mm256_castsi128_si256
+ # else
+ # define zext128_256(x) _mm256_inserti128_si256(_mm256_setzero_si256(), x, 0)
+ # endif
+ #endif
+
+ #ifdef ENABLE_AVX512
+ # define MM256_BLENDV(a, b, m) _mm256_ternarylogic_epi32(a, b, m, 0xd8)
+ # define MM_2XOR(a, b, c) _mm_ternarylogic_epi32(a, b, c, 0x96)
+ #else
+ # define MM256_BLENDV _mm256_blendv_epi8
+ # define MM_2XOR(a, b, c) _mm_xor_si128(_mm_xor_si128(a, b), c)
+ #endif
+
+ static void partial_fold(const size_t len, __m256i *crc0, __m256i *crc1, __m256i crc_part) {
+ __m256i shuf = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(pshufb_rot_table + (len&15))));
+ __m256i mask = _mm256_cmpgt_epi8(shuf, _mm256_set1_epi8(15));
+
+ *crc0 = _mm256_shuffle_epi8(*crc0, shuf);
+ *crc1 = _mm256_shuffle_epi8(*crc1, shuf);
+ crc_part = _mm256_shuffle_epi8(crc_part, shuf);
+
+ __m256i crc_out = _mm256_permute2x128_si256(*crc0, *crc0, 0x08); // move bottom->top
+ __m256i crc01, crc1p;
+ if(len >= 16) {
+ crc_out = MM256_BLENDV(crc_out, *crc0, mask);
+ crc01 = *crc1;
+ crc1p = crc_part;
+ *crc0 = _mm256_permute2x128_si256(*crc0, *crc1, 0x21);
+ *crc1 = _mm256_permute2x128_si256(*crc1, crc_part, 0x21);
+ crc_part = zext128_256(_mm256_extracti128_si256(crc_part, 1));
+ } else {
+ crc_out = _mm256_and_si256(crc_out, mask);
+ crc01 = _mm256_permute2x128_si256(*crc0, *crc1, 0x21);
+ crc1p = _mm256_permute2x128_si256(*crc1, crc_part, 0x21);
+ }
+
+ *crc0 = MM256_BLENDV(*crc0, crc01, mask);
+ *crc1 = MM256_BLENDV(*crc1, crc1p, mask);
+
+ *crc1 = do_one_fold(crc_out, *crc1);
+ }
+
+
+ ALIGN_TO(16, static const unsigned crc_k[]) = {
+ 0xccaa009e, 0x00000000, /* rk1 */
+ 0x751997d0, 0x00000001, /* rk2 */
+ 0xccaa009e, 0x00000000, /* rk5 */
+ 0x63cd6124, 0x00000001, /* rk6 */
+ 0xf7011641, 0x00000000, /* rk7 */
+ 0xdb710640, 0x00000001 /* rk8 */
+ };
+
+
+ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
+ // info from https://www.reddit.com/r/ReverseEngineering/comments/2zwhl3/mystery_constant_0x9db42487_in_intels_crc32ieee/
+ // firstly, calculate: xmm_crc0 = (initial * 0x487b9c8a) mod 0x104c11db7, where 0x487b9c8a = inverse(1<<512) mod 0x104c11db7
105
+ __m128i xmm_t0 = _mm_cvtsi32_si128(~initial);
106
+
107
+ xmm_t0 = _mm_clmulepi64_si128(xmm_t0, _mm_set_epi32(0, 0, 0xa273bc24, 0), 0); // reverse(0x487b9c8a)<<1 == 0xa273bc24
108
+ __m128i reduction = _mm_set_epi32( // polynomial reduction factors
109
+ 1, 0xdb710640, // G* = 0x04c11db7
110
+ 0, 0xf7011641 // Q+ = 0x04d101df (+1 to save an additional xor operation)
111
+ );
112
+ __m128i xmm_t1 = _mm_clmulepi64_si128(xmm_t0, reduction, 0);
113
+ xmm_t1 = _mm_clmulepi64_si128(xmm_t1, reduction, 0x10);
114
+
115
+ xmm_t0 = _mm_srli_si128(_mm_xor_si128(xmm_t0, xmm_t1), 8);
116
+ __m256i crc0 = zext128_256(xmm_t0);
117
+ __m256i crc1 = _mm256_setzero_si256();
118
+
119
+ if (len < 32) {
120
+ if (len == 0)
121
+ return initial;
122
+ __m256i crc_part = _mm256_setzero_si256();
123
+ memcpy(&crc_part, src, len);
124
+ partial_fold(len, &crc0, &crc1, crc_part);
125
+ } else {
126
+ uintptr_t algn_diff = (0 - (uintptr_t)src) & 0x1F;
127
+ if (algn_diff) {
128
+ partial_fold(algn_diff, &crc0, &crc1, _mm256_loadu_si256((__m256i *)src));
129
+ src += algn_diff;
130
+ len -= algn_diff;
131
+ }
132
+
133
+ while (len >= 64) {
134
+ crc0 = do_one_fold(crc0, _mm256_load_si256((__m256i*)src));
135
+ crc1 = do_one_fold(crc1, _mm256_load_si256((__m256i*)src + 1));
136
+ src += 64;
137
+ len -= 64;
138
+ }
139
+
140
+ if (len >= 32) {
141
+ __m256i old = crc1;
142
+ crc1 = do_one_fold(crc0, _mm256_load_si256((__m256i*)src));
143
+ crc0 = old;
144
+
145
+ len -= 32;
146
+ src += 32;
147
+ }
148
+
149
+ if(len != 0) {
150
+ partial_fold(len, &crc0, &crc1, _mm256_load_si256((__m256i *)src));
151
+ }
152
+ }
153
+
154
+ const __m128i xmm_mask = _mm_set_epi32(-1,-1,-1,0);
155
+ __m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;
156
+
157
+ __m128i xmm_crc0 = _mm256_castsi256_si128(crc0);
158
+ __m128i xmm_crc1 = _mm256_extracti128_si256(crc0, 1);
159
+ __m128i xmm_crc2 = _mm256_castsi256_si128(crc1);
160
+ __m128i xmm_crc3 = _mm256_extracti128_si256(crc1, 1);
161
+
162
+ /*
163
+ * k1
164
+ */
165
+ crc_fold = _mm_load_si128((__m128i *)crc_k);
166
+
167
+ x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
168
+ xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
169
+ xmm_crc1 = MM_2XOR(xmm_crc1, x_tmp0, xmm_crc0);
170
+
171
+ x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10);
172
+ xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01);
173
+ xmm_crc2 = MM_2XOR(xmm_crc2, x_tmp1, xmm_crc1);
174
+
175
+ x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10);
176
+ xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01);
177
+ xmm_crc3 = MM_2XOR(xmm_crc3, x_tmp2, xmm_crc2);
178
+
179
+ /*
180
+ * k5
181
+ */
182
+ crc_fold = _mm_load_si128((__m128i *)crc_k + 1);
183
+
184
+ xmm_crc0 = xmm_crc3;
185
+ xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
186
+ xmm_crc0 = _mm_srli_si128(xmm_crc0, 8);
187
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
188
+
189
+ xmm_crc0 = xmm_crc3;
190
+ xmm_crc3 = _mm_slli_si128(xmm_crc3, 4);
191
+ xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
192
+ #ifdef ENABLE_AVX512
193
+ //xmm_crc3 = _mm_maskz_xor_epi32(14, xmm_crc3, xmm_crc0);
194
+ xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, xmm_crc0, xmm_mask, 0x28);
195
+ #else
196
+ xmm_crc0 = _mm_and_si128(xmm_crc0, xmm_mask);
197
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
198
+ #endif
199
+
200
+ /*
201
+ * k7
202
+ */
203
+ xmm_crc1 = xmm_crc3;
204
+ crc_fold = _mm_load_si128((__m128i *)crc_k + 2);
205
+
206
+ xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
207
+ xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
208
+ #ifdef ENABLE_AVX512
209
+ xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, xmm_crc1, xmm_crc1, 0xC3); // NOT(xmm_crc3 ^ xmm_crc1)
210
+ #else
211
+ xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_mask);
212
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);
213
+ #endif
214
+ return _mm_extract_epi32(xmm_crc3, 2);
215
+ }
216
+
217
+ static uint32_t do_crc32_incremental_clmul(const void* data, size_t length, uint32_t init) {
218
+ return crc_fold((const unsigned char*)data, (long)length, init);
219
+ }
220
+
221
+ void crc_clmul256_set_funcs(crc_func* _do_crc32_incremental) {
222
+ *_do_crc32_incremental = &do_crc32_incremental_clmul;
223
+ }
224
+ #else
225
+ void crc_clmul_set_funcs(crc_func* _do_crc32_incremental);
226
+ void crc_clmul256_set_funcs(crc_func* _do_crc32_incremental) {
227
+ crc_clmul_set_funcs(_do_crc32_incremental);
228
+ }
229
+ #endif
230
+
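For orientation, this is how the 256-bit path added above is typically wired in: `crc_clmul256_set_funcs` fills a function pointer at runtime, and the fallback branch simply defers to the 128-bit `crc_clmul_set_funcs`. A minimal caller sketch, assuming a `crc_func` typedef matching the signatures in this file and a hypothetical CPU-feature flag (neither is shown in this diff):

// Hypothetical dispatch sketch; only the *_set_funcs entry points come from the file above.
#include <stddef.h>
#include <stdint.h>

typedef uint32_t (*crc_func)(const void* data, size_t length, uint32_t init); // inferred from do_crc32_incremental_clmul

void crc_clmul_set_funcs(crc_func* _do_crc32_incremental);    // 128-bit folding path
void crc_clmul256_set_funcs(crc_func* _do_crc32_incremental); // 256-bit path defined above

static crc_func do_crc32_incremental;

void crc_dispatch_init(int cpu_has_vpclmulqdq) { // hypothetical feature flag
    if (cpu_has_vpclmulqdq)
        crc_clmul256_set_funcs(&do_crc32_incremental);
    else
        crc_clmul_set_funcs(&do_crc32_incremental);
}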
package/src/decoder.cc CHANGED
@@ -1,15 +1,19 @@
  #include "common.h"

  #include "decoder_common.h"
+ #include "decoder.h"

- YencDecoderEnd (*_do_decode)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*) = &do_decode_scalar<false, false>;
- YencDecoderEnd (*_do_decode_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*) = &do_decode_scalar<true, false>;
- YencDecoderEnd (*_do_decode_end_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*) = &do_decode_end_scalar<true>;
+ extern "C" {
+ YencDecoderEnd (*_do_decode)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*) = &do_decode_scalar<false, false>;
+ YencDecoderEnd (*_do_decode_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*) = &do_decode_scalar<true, false>;
+ YencDecoderEnd (*_do_decode_end_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*) = &do_decode_end_scalar<true>;
+ }

  void decoder_set_sse2_funcs();
  void decoder_set_ssse3_funcs();
  void decoder_set_avx_funcs();
  void decoder_set_avx2_funcs();
+ void decoder_set_vbmi2_funcs();
  void decoder_set_neon_funcs();


@@ -41,7 +45,9 @@ void decoder_init() {
  decoder_set_native_funcs();
  # else
  int use_isa = cpu_supports_isa();
- if(use_isa >= ISA_LEVEL_AVX2)
+ if(use_isa >= ISA_LEVEL_VBMI2)
+ decoder_set_vbmi2_funcs();
+ else if(use_isa >= ISA_LEVEL_AVX2)
  decoder_set_avx2_funcs();
  else if(use_isa >= ISA_LEVEL_AVX)
  decoder_set_avx_funcs();
package/src/decoder.h CHANGED
@@ -1,3 +1,11 @@
+ #ifndef __YENC_DECODER_H
+ #define __YENC_DECODER_H
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+

  // the last state that the decoder was in (i.e. last few characters processed)
  // the state is needed for incremental decoders as its behavior is affected by what it processed last
@@ -25,8 +33,7 @@ extern YencDecoderEnd (*_do_decode)(const unsigned char*HEDLEY_RESTRICT*, unsign
  extern YencDecoderEnd (*_do_decode_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*);
  extern YencDecoderEnd (*_do_decode_end_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*);

- template<bool isRaw>
- static inline size_t do_decode(const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, YencDecoderState* state) {
+ static inline size_t do_decode(int isRaw, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, YencDecoderState* state) {
  unsigned char* ds = dest;
  (*(isRaw ? _do_decode_raw : _do_decode))(&src, &ds, len, state);
  return ds - dest;
@@ -37,3 +44,10 @@ static inline YencDecoderEnd do_decode_end(const unsigned char*HEDLEY_RESTRICT*
  }

  void decoder_init();
+
+
+
+ #ifdef __cplusplus
+ }
+ #endif
+ #endif
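With the include guard and extern "C" wrapper added above, decoder.h is now consumable from plain C, and do_decode takes isRaw as a runtime int instead of a template parameter. A minimal C-side usage sketch (the wrapper function and its name are illustrative, not part of the package):

/* Hypothetical C caller; decoder.h, do_decode() and YencDecoderState come from the header above. */
#include "decoder.h"

size_t decode_block(const unsigned char* src, unsigned char* dst, size_t len,
                    YencDecoderState* state) {
    /* isRaw = 1 selects the raw variant via _do_decode_raw */
    return do_decode(1, src, dst, len, state);
}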
@@ -1,8 +1,8 @@

  #ifdef __AVX2__

- // GCC (ver 6-10(dev)) fails to optimize pure C version of mask testing, but has this intrinsic; Clang >= 7 optimizes C version fine
- #if defined(__GNUC__) && __GNUC__ >= 7
+ // GCC (ver 6-10(dev)) fails to optimize pure C version of mask testing, but has this intrinsic; Clang >= 7 optimizes C version fine; functions added in Clang 8
+ #if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
  # define KORTEST32(a, b) !_kortestz_mask32_u8((a), (b))
  # define KAND32(a, b) _kand_mask32((a), (b))
  # define KOR32(a, b) _kor_mask32((a), (b))
@@ -30,13 +30,17 @@ static HEDLEY_ALWAYS_INLINE __m256i force_align_read_256(const void* p) {
  }

  // _mm256_castsi128_si256, but upper is defined to be 0
- #if defined(__clang__) && __clang_major__ >= 5
+ #if (defined(__clang__) && __clang_major__ >= 5 && (!defined(__APPLE__) || __clang_major__ >= 7)) || (defined(__GNUC__) && __GNUC__ >= 10)
  // intrinsic unsupported in GCC 9 and MSVC < 2017
  # define zext128_256 _mm256_zextsi128_si256
  #else
  // technically a cast is incorrect, due to upper 128 bits being undefined, but should usually work fine
  // alternative may be `_mm256_set_m128i(_mm_setzero_si128(), v)` but unsupported on GCC < 7, and most compilers generate a VINSERTF128 instruction for it
- # define zext128_256 _mm256_castsi128_si256
+ # ifdef __OPTIMIZE__
+ # define zext128_256 _mm256_castsi128_si256
+ # else
+ # define zext128_256(x) _mm256_inserti128_si256(_mm256_setzero_si256(), x, 0)
+ # endif
  #endif


@@ -56,6 +60,17 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
  '.','.','.','.','.','.','.','.','.','.','.','.','.','.',_nextMask==2?0:'.',_nextMask==1?0:'.'
  );
  }
+
+ // for some reason, MSVC Win32 seems to crash when trying to compile _mm256_mask_cmpeq_epi8_mask
+ // the crash can be fixed by switching the order of the last two arguments, but it seems to generate wrong code
+ // so just disable the optimisation as it seems to be problematic there
+ #if defined(__AVX512VL__) && defined(__AVX512BW__)
+ # if defined(_MSC_VER) && !defined(PLATFORM_AMD64) && !defined(__clang__)
+ const bool useAVX3MaskCmp = false;
+ # else
+ const bool useAVX3MaskCmp = (use_isa >= ISA_LEVEL_AVX3);
+ # endif
+ #endif
  intptr_t i;
  for(i = -len; i; i += sizeof(__m256i)*2) {
  __m256i oDataA = _mm256_load_si256((__m256i *)(src+i));
@@ -122,7 +137,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
  __mmask32 match2EqMaskA, match2EqMaskB;
  __mmask32 match0CrMaskA, match0CrMaskB;
  __mmask32 match2CrXDtMaskA, match2CrXDtMaskB;
- if(use_isa >= ISA_LEVEL_AVX3 && searchEnd) {
+ if(useAVX3MaskCmp && searchEnd) {
  match2EqMaskA = _mm256_cmpeq_epi8_mask(_mm256_set1_epi8('='), tmpData2A);
  match2EqMaskB = _mm256_cmpeq_epi8_mask(_mm256_set1_epi8('='), tmpData2B);
  } else
@@ -138,7 +153,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
  // find patterns of \r_.

  #if defined(__AVX512VL__) && defined(__AVX512BW__)
- if(use_isa >= ISA_LEVEL_AVX3) {
+ if(useAVX3MaskCmp) {
  match0CrMaskA = _mm256_cmpeq_epi8_mask(oDataA, _mm256_set1_epi8('\r'));
  match0CrMaskB = _mm256_cmpeq_epi8_mask(oDataB, _mm256_set1_epi8('\r'));
  match2CrXDtMaskA = _mm256_mask_cmpeq_epi8_mask(match0CrMaskA, tmpData2A, _mm256_set1_epi8('.'));
@@ -168,7 +183,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
  #if defined(__AVX512VL__) && defined(__AVX512BW__)
  __mmask32 match1NlMaskA, match1NlMaskB;
  __mmask32 match2NlDotMaskA, match2NlDotMaskB;
- if(use_isa >= ISA_LEVEL_AVX3) {
+ if(useAVX3MaskCmp) {
  match1NlMaskA = _mm256_mask_cmpeq_epi8_mask(
  match0CrMaskA,
  _mm256_set1_epi8('\n'),
@@ -224,7 +239,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon

  int matchEnd;
  #if defined(__AVX512VL__) && defined(__AVX512BW__)
- if(use_isa >= ISA_LEVEL_AVX3) {
+ if(useAVX3MaskCmp) {
  __mmask32 match3EqYMaskA = _mm256_mask_cmpeq_epi8_mask(
  match2EqMaskA,
  _mm256_set1_epi8('y'),
@@ -298,12 +313,12 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
  if(LIKELIHOOD(0.002, matchEnd)) {
  // terminator found
  // there's probably faster ways to do this, but reverting to scalar code should be good enough
- len += i;
+ len += (long)i;
  break;
  }
  }
  #if defined(__AVX512VL__) && defined(__AVX512BW__)
- if(use_isa >= ISA_LEVEL_AVX3) {
+ if(useAVX3MaskCmp) {
  mask |= (uint64_t)match2NlDotMaskA << 2;
  mask |= (uint64_t)match2NlDotMaskB << 34;
  minMask = _mm256_maskz_mov_epi8(~(match2NlDotMaskB>>30), _mm256_set1_epi8('.'));
@@ -321,7 +336,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
  __m256i match3EqYA, match3EqYB;
  #if defined(__AVX512VL__) && defined(__AVX512BW__)
  __mmask32 match3EqYMaskA, match3EqYMaskB;
- if(use_isa >= ISA_LEVEL_AVX3) {
+ if(useAVX3MaskCmp) {
  match3EqYMaskA = _mm256_mask_cmpeq_epi8_mask(
  match2EqMaskA,
  _mm256_set1_epi8('y'),
@@ -351,7 +366,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
  if(LIKELIHOOD(0.002, partialEndFound)) {
  bool endFound;
  #if defined(__AVX512VL__) && defined(__AVX512BW__)
- if(use_isa >= ISA_LEVEL_AVX3) {
+ if(useAVX3MaskCmp) {
  __mmask32 match3LfEqYMaskA = _mm256_mask_cmpeq_epi8_mask(
  match3EqYMaskA,
  _mm256_set1_epi8('\n'),
@@ -390,7 +405,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
  ));
  }
  if(endFound) {
- len += i;
+ len += (long)i;
  break;
  }
  }
@@ -489,14 +504,10 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
  #endif
  {
  // << 1 byte
- cmpEqB = _mm256_cmpeq_epi8(_mm256_set1_epi8('='), _mm256_loadu_si256((__m256i *)(src+i-1) + 1));
- #if defined(__tune_znver1__) || defined(__tune_bdver4__)
  cmpEqA = _mm256_alignr_epi8(cmpEqA, _mm256_inserti128_si256(
- _mm256_setzero_si256(), _mm256_castsi256_si128(cmpEqA), 1
+ _mm256_set1_epi8('='), _mm256_castsi256_si128(cmpEqA), 1
  ), 15);
- #else
- cmpEqA = _mm256_alignr_epi8(cmpEqA, _mm256_permute2x128_si256(cmpEqA, cmpEqA, 0x08), 15);
- #endif
+ cmpEqB = _mm256_cmpeq_epi8(_mm256_set1_epi8('='), _mm256_loadu_si256((__m256i *)(src+i-1) + 1));
  dataA = _mm256_add_epi8(
  oDataA,
  _mm256_blendv_epi8(
@@ -523,7 +534,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
  #endif
  {
  yencOffset = _mm256_xor_si256(_mm256_set1_epi8(-42), zext128_256(
- _mm_slli_epi16(_mm_cvtsi32_si128(escFirst), 6)
+ _mm_slli_epi16(_mm_cvtsi32_si128((int)escFirst), 6)
  ));
  }

@@ -565,7 +576,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
  p -= popcnt32(mask & 0xffff0);

  _mm_storeu_si128((__m128i*)(p + XMM_SIZE*3), _mm256_extracti128_si256(dataB, 1));
- p -= popcnt32(mask >> 20);
+ p -= popcnt32((unsigned int)(mask >> 20));
  #else
  mask >>= 32;
  shuf = _mm256_inserti128_si256(
@@ -340,7 +340,7 @@ YencDecoderEnd do_decode_simd(const unsigned char* HEDLEY_RESTRICT* src, unsigne
  if((uintptr_t)(*src) & ((width-1))) {
  // find source memory alignment
  unsigned char* aSrc = (unsigned char*)(((uintptr_t)(*src) + (width-1)) & ~(width-1));
- int amount = aSrc - *src;
+ int amount = (int)(aSrc - *src);
  len -= amount;
  YencDecoderEnd ended = do_decode_scalar<isRaw, searchEnd>(src, dest, amount, pState);
  if(ended) return ended;
@@ -427,7 +427,7 @@ YencDecoderEnd do_decode_simd(const unsigned char* HEDLEY_RESTRICT* src, unsigne
  escFirst = (*pState == YDEC_STATE_EQ || *pState == YDEC_STATE_CRLFEQ);

  // our algorithm may perform an aligned load on the next part, of which we consider 2 bytes (for \r\n. sequence checking)
- long dLen = len - lenBuffer;
+ long dLen = (long)(len - lenBuffer);
  dLen = (dLen + (width-1)) & ~(width-1);

  kernel((const uint8_t*)(*src) + dLen, dLen, p, escFirst, nextMask);
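The context above shows the alignment prologue of do_decode_simd: the source pointer is rounded up to the vector width, the unaligned head is handled by the scalar decoder, and the new (int) cast only narrows a pointer difference that is guaranteed to be smaller than width. The same round-up idiom in isolation (names here are illustrative, not from the package):

#include <stddef.h>
#include <stdint.h>

/* Round p up to the next multiple of width (a power of two) and return the
 * number of head bytes before the aligned address; the result is always < width. */
static size_t align_head_len(const unsigned char* p, size_t width) {
    uintptr_t aligned = ((uintptr_t)p + (width - 1)) & ~(uintptr_t)(width - 1);
    return (size_t)(aligned - (uintptr_t)p);
}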
@@ -7,9 +7,9 @@
  #include "decoder_common.h"


- #ifdef _MSC_VER
- # define vld1_u8_align vld1_u8_ex
- # define vld1q_u8_align vld1q_u8_ex
+ #if defined(_MSC_VER) && !defined(__clang__)
+ # define vld1_u8_align(p, a) vld1_u8_ex(p, a*8)
+ # define vld1q_u8_align(p, a) vld1q_u8_ex(p, a*8)
  #elif defined(__GNUC__)
  # define vld1_u8_align(p, n) vld1_u8((uint8_t*)__builtin_assume_aligned(p, n))
  # define vld1q_u8_align(p, n) vld1q_u8((uint8_t*)__builtin_assume_aligned(p, n))
@@ -19,19 +19,17 @@
  #endif


- // for compilers that lack these functions
- #if defined(__clang__) || (defined(__GNUC__) && (defined(__aarch64__) && __GNUC__ >= 8))
+ // for compilers that lack these functions (Clang armv7 9-12 seems to have issues with multi-vector loads)
+ #if (defined(__clang__) && (defined(__aarch64__) || __clang_major__<9 || __clang_major__>12)) || (defined(__GNUC__) && (defined(__aarch64__) && __GNUC__ >= 8))
  # define vld1q_u8_x2_align(p, n) vld1q_u8_x2((uint8_t*)__builtin_assume_aligned(p, n))
  #else
- HEDLEY_ALWAYS_INLINE uint8x16x2_t vld1q_u8_x2_align(const uint8_t* p, int n) {
- return (uint8x16x2_t){vld1q_u8_align(p, n), vld1q_u8_align(p+16, n)};
- }
+ # define vld1q_u8_x2_align(p, n) vcreate2_u8(vld1q_u8_align(p, (n)/2), vld1q_u8_align((p)+16, (n)/2))
  #endif
  // Clang wrongly assumes alignment on vld1q_u8_x2, and ARMv7 GCC doesn't support the function, so effectively, it can only be used in ARMv8 compilers
- #if defined(__aarch64__) && (defined(__clang__) || (defined(__GNUC__) && __GNUC__ >= 9))
+ #if defined(__aarch64__) && (defined(__clang__) || HEDLEY_GCC_VERSION_CHECK(8,5,0))
  # define vst1q_u8_x2_unaligned vst1q_u8_x2
  #else
- HEDLEY_ALWAYS_INLINE void vst1q_u8_x2_unaligned(uint8_t* p, uint8x16x2_t data) {
+ static HEDLEY_ALWAYS_INLINE void vst1q_u8_x2_unaligned(uint8_t* p, uint8x16x2_t data) {
  vst1q_u8(p, data.val[0]);
  vst1q_u8(p+16, data.val[1]);
  }
@@ -64,18 +62,20 @@ template<bool isRaw, bool searchEnd>
  HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, long& len, unsigned char* HEDLEY_RESTRICT & p, unsigned char& escFirst, uint16_t& nextMask) {
  HEDLEY_ASSUME(escFirst == 0 || escFirst == 1);
  HEDLEY_ASSUME(nextMask == 0 || nextMask == 1 || nextMask == 2);
- uint8x16_t yencOffset = escFirst ? (uint8x16_t){42+64,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42} : vdupq_n_u8(42);
+ uint8x16_t yencOffset = escFirst ? vmakeq_u8(42+64,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42) : vdupq_n_u8(42);
  #ifdef __aarch64__
  uint8x16_t nextMaskMix = vdupq_n_u8(0);
- if(nextMask)
- nextMaskMix[nextMask-1] = nextMask;
+ if(nextMask == 1)
+ nextMaskMix = vsetq_lane_u8(1, nextMaskMix, 0);
+ if(nextMask == 2)
+ nextMaskMix = vsetq_lane_u8(2, nextMaskMix, 1);
  #else
  uint8x16_t lfCompare = vdupq_n_u8('\n');
  if(isRaw) {
  if(nextMask == 1)
- lfCompare[0] = '.';
+ lfCompare = vsetq_lane_u8('.', lfCompare, 0);
  if(nextMask == 2)
- lfCompare[1] = '.';
+ lfCompare = vsetq_lane_u8('.', lfCompare, 1);
  }
  #endif
  long i;
@@ -90,13 +90,13 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
  #ifdef __aarch64__
  cmpA = vqtbx1q_u8(
  cmpEqA,
- // \n \r
- (uint8x16_t){0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0},
+ // \n \r
+ vmakeq_u8(0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
  dataA
  ),
  cmpB = vqtbx1q_u8(
  cmpEqB,
- (uint8x16_t){0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0},
+ vmakeq_u8(0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
  dataB
  );
  if(isRaw) cmpA = vorrq_u8(cmpA, nextMaskMix);
@@ -122,12 +122,12 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon

  #ifdef __aarch64__
  if (LIKELIHOOD(0.42 /*guess*/, neon_vect_is_nonzero(vorrq_u8(cmpA, cmpB)))) {
- cmpA = vandq_u8(cmpA, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
- cmpB = vandq_u8(cmpB, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
+ cmpA = vandq_u8(cmpA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+ cmpB = vandq_u8(cmpB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
  uint8x16_t cmpMerge = vpaddq_u8(cmpA, cmpB);
  uint8x16_t cmpEqMerge = vpaddq_u8(
- vandq_u8(cmpEqA, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128}),
- vandq_u8(cmpEqB, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128})
+ vandq_u8(cmpEqA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
+ vandq_u8(cmpEqB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128))
  );

  uint8x16_t cmpCombined = vpaddq_u8(cmpMerge, cmpEqMerge);
@@ -136,8 +136,8 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
  uint32_t mask = vgetq_lane_u32(vreinterpretq_u32_u8(cmpCombined), 0);
  uint32_t maskEq = vgetq_lane_u32(vreinterpretq_u32_u8(cmpCombined), 1);
  #else
- cmpA = vandq_u8(cmpA, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
- cmpB = vandq_u8(cmpB, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
+ cmpA = vandq_u8(cmpA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+ cmpB = vandq_u8(cmpB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
  // no vpaddq_u8 in ARMv7, so need extra 64-bit VPADD
  uint8x8_t cmpPacked = vpadd_u8(
  vpadd_u8(
@@ -150,8 +150,8 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
  cmpPacked = vpadd_u8(cmpPacked, cmpPacked);
  uint32_t mask = vget_lane_u32(vreinterpret_u32_u8(cmpPacked), 0);
  if(LIKELIHOOD(0.42, mask != 0)) {
- uint8x16_t cmpEqMaskedA = vandq_u8(cmpEqA, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
- uint8x16_t cmpEqMaskedB = vandq_u8(cmpEqB, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
+ uint8x16_t cmpEqMaskedA = vandq_u8(cmpEqA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+ uint8x16_t cmpEqMaskedB = vandq_u8(cmpEqB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
  uint8x8_t cmpEqPacked = vpadd_u8(
  vpadd_u8(
  vget_low_u8(cmpEqMaskedA), vget_high_u8(cmpEqMaskedA)
@@ -170,7 +170,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
  // vext seems to be a cheap operation on ARM, relative to loads, so only avoid it if there's only one load (isRaw only)
  uint8x16_t tmpData2, nextData;
  if(isRaw && !searchEnd) {
- tmpData2 = vld1q_u8_align(src+i + 2 + sizeof(uint8x16_t), 2);
+ tmpData2 = vld1q_u8(src+i + 2 + sizeof(uint8x16_t));
  } else {
  nextData = vld1q_u8_align(src+i + sizeof(uint8x16_t)*2, 16); // only 32-bits needed, but there doesn't appear a nice way to do this via intrinsics: https://stackoverflow.com/questions/46910799/arm-neon-intrinsics-convert-d-64-bit-register-to-low-half-of-q-128-bit-regis
  tmpData2 = vextq_u8(dataB, nextData, 2);
@@ -255,15 +255,15 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
  }
  }
  #ifdef __aarch64__
- uint8x16_t match2NlDotBMasked = vandq_u8(match2NlDotB, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
+ uint8x16_t match2NlDotBMasked = vandq_u8(match2NlDotB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
  uint8x16_t mergeKillDots = vpaddq_u8(
- vandq_u8(match2NlDotA, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128}),
+ vandq_u8(match2NlDotA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
  match2NlDotBMasked
  );
  uint8x8_t mergeKillDots2 = vget_low_u8(vpaddq_u8(mergeKillDots, mergeKillDots));
  #else
- uint8x16_t match2NlDotMaskedA = vandq_u8(match2NlDotA, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
- uint8x16_t match2NlDotMaskedB = vandq_u8(match2NlDotB, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
+ uint8x16_t match2NlDotMaskedA = vandq_u8(match2NlDotA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+ uint8x16_t match2NlDotMaskedB = vandq_u8(match2NlDotB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
  uint8x8_t mergeKillDots2 = vpadd_u8(
  vpadd_u8(
  vget_low_u8(match2NlDotMaskedA), vget_high_u8(match2NlDotMaskedA)
@@ -342,11 +342,11 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
  #ifdef __aarch64__
  uint8x16_t vMaskEqA = vqtbl1q_u8(
  vcombine_u8(maskEqTemp, vdup_n_u8(0)),
- (uint8x16_t){0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1}
+ vmakeq_u8(0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1)
  );
  uint8x16_t vMaskEqB = vqtbl1q_u8(
  vcombine_u8(maskEqTemp, vdup_n_u8(0)),
- (uint8x16_t){2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3}
+ vmakeq_u8(2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3)
  );
  #else
  uint8x16_t vMaskEqA = vcombine_u8(
@@ -358,8 +358,8 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
  vdup_lane_u8(maskEqTemp, 3)
  );
  #endif
- vMaskEqA = vtstq_u8(vMaskEqA, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
- vMaskEqB = vtstq_u8(vMaskEqB, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
+ vMaskEqA = vtstq_u8(vMaskEqA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+ vMaskEqB = vtstq_u8(vMaskEqB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));

  dataA = vsubq_u8(
  dataA,
@@ -391,7 +391,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
  )
  );
  }
- yencOffset[0] = (escFirst << 6) | 42;
+ yencOffset = vsetq_lane_u8((escFirst << 6) | 42, yencOffset, 0);

  // all that's left is to 'compress' the data (skip over masked chars)
  uint32_t counts = 0x08080808 - vget_lane_u32(vreinterpret_u32_u8(vcnt_u8(cmpPacked)), 0);
@@ -439,7 +439,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
  } else {
  dataA = vsubq_u8(dataA, yencOffset);
  dataB = vsubq_u8(dataB, vdupq_n_u8(42));
- vst1q_u8_x2_unaligned(p, ((uint8x16x2_t){dataA, dataB}));
+ vst1q_u8_x2_unaligned(p, vcreate2_u8(dataA, dataB));
  p += sizeof(uint8x16_t)*2;
  escFirst = 0;
  #ifdef __aarch64__
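The NEON changes in this last file replace GNU-style vector initializers such as (uint8x16_t){...} with vmakeq_u8(...) and vcreate2_u8(...), and replace lane subscripting with vsetq_lane_u8, so the same code also builds with compilers that lack those GNU extensions (notably MSVC for ARM). The helpers' definitions are not part of this diff; a rough sketch of the shape such helpers can take (assumed, not the package's actual code):

#include <arm_neon.h>

/* Assumed stand-ins; yencode's real vcreate2_u8 / vmakeq_u8 live elsewhere in the package. */
static inline uint8x16x2_t vcreate2_u8_sketch(uint8x16_t a, uint8x16_t b) {
    uint8x16x2_t r;   /* fill the struct field-by-field instead of brace-initializing it */
    r.val[0] = a;
    r.val[1] = b;
    return r;
}

static inline uint8x16_t vmakeq_u8_sketch(
        uint8_t b0, uint8_t b1, uint8_t b2, uint8_t b3, uint8_t b4, uint8_t b5, uint8_t b6, uint8_t b7,
        uint8_t b8, uint8_t b9, uint8_t b10, uint8_t b11, uint8_t b12, uint8_t b13, uint8_t b14, uint8_t b15) {
    const uint8_t v[16] = {b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,b10,b11,b12,b13,b14,b15};
    return vld1q_u8(v);   /* build the vector through a load rather than a vector literal */
}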