yencode 1.1.0 → 1.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.gyp +79 -7
- package/crcutil-1.0/code/multiword_64_64_intrinsic_i386_mmx.cc +1 -1
- package/package.json +1 -1
- package/src/common.h +88 -24
- package/src/crc.cc +59 -27
- package/src/crc.h +20 -6
- package/src/crc_arm.cc +154 -27
- package/src/crc_common.h +3 -10
- package/src/{crc_folding.c → crc_folding.cc} +53 -122
- package/src/crc_folding_256.cc +230 -0
- package/src/decoder.cc +10 -4
- package/src/decoder.h +16 -2
- package/src/decoder_avx2_base.h +32 -21
- package/src/decoder_common.h +2 -2
- package/src/decoder_neon.cc +37 -37
- package/src/decoder_neon64.cc +41 -36
- package/src/decoder_sse_base.h +21 -14
- package/src/decoder_vbmi2.cc +30 -0
- package/src/encoder.cc +9 -3
- package/src/encoder.h +17 -1
- package/src/encoder_avx_base.h +8 -8
- package/src/encoder_common.h +3 -3
- package/src/encoder_neon.cc +31 -31
- package/src/encoder_sse_base.h +7 -8
- package/src/encoder_vbmi2.cc +23 -0
- package/src/platform.cc +57 -8
- package/src/yencode.cc +33 -44
- package/test/testcrc.js +14 -0
package/src/crc_folding_256.cc
ADDED
@@ -0,0 +1,230 @@
+// 256-bit version of crc_folding
+
+#include "crc_common.h"
+
+#if !defined(YENC_DISABLE_AVX256) && ((defined(__VPCLMULQDQ__) && defined(__AVX2__) && defined(__PCLMUL__)) || (defined(_MSC_VER) && _MSC_VER >= 1920 && defined(PLATFORM_X86) && !defined(__clang__)))
+#include <inttypes.h>
+#include <immintrin.h>
+
+
+#if defined(__AVX512VL__) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
+# define ENABLE_AVX512 1
+#endif
+
+static __m256i do_one_fold(__m256i src, __m256i data) {
+	const __m256i fold4 = _mm256_set_epi32(
+		0x00000001, 0x54442bd4,
+		0x00000001, 0xc6e41596,
+		0x00000001, 0x54442bd4,
+		0x00000001, 0xc6e41596
+	);
+#ifdef ENABLE_AVX512
+	return _mm256_ternarylogic_epi32(
+		_mm256_clmulepi64_epi128(src, fold4, 0x01),
+		_mm256_clmulepi64_epi128(src, fold4, 0x10),
+		data,
+		0x96
+	);
+#else
+	return _mm256_xor_si256(data, _mm256_xor_si256(
+		_mm256_clmulepi64_epi128(src, fold4, 0x01),
+		_mm256_clmulepi64_epi128(src, fold4, 0x10)
+	));
+#endif
+}
+
+ALIGN_TO(32, static const uint8_t pshufb_rot_table[]) = {
+	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
+	16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
+};
+// _mm256_castsi128_si256, but upper is defined to be 0
+#if (defined(__clang__) && __clang_major__ >= 5 && (!defined(__APPLE__) || __clang_major__ >= 7)) || (defined(__GNUC__) && __GNUC__ >= 10)
+// intrinsic unsupported in GCC 9 and MSVC < 2017
+# define zext128_256 _mm256_zextsi128_si256
+#else
+// technically a cast is incorrect, due to upper 128 bits being undefined, but should usually work fine
+// alternative may be `_mm256_set_m128i(_mm_setzero_si128(), v)` but unsupported on GCC < 7, and most compilers generate a VINSERTF128 instruction for it
+# ifdef __OPTIMIZE__
+#  define zext128_256 _mm256_castsi128_si256
+# else
+#  define zext128_256(x) _mm256_inserti128_si256(_mm256_setzero_si256(), x, 0)
+# endif
+#endif
+
+#ifdef ENABLE_AVX512
+# define MM256_BLENDV(a, b, m) _mm256_ternarylogic_epi32(a, b, m, 0xd8)
+# define MM_2XOR(a, b, c) _mm_ternarylogic_epi32(a, b, c, 0x96)
+#else
+# define MM256_BLENDV _mm256_blendv_epi8
+# define MM_2XOR(a, b, c) _mm_xor_si128(_mm_xor_si128(a, b), c)
+#endif
+
+static void partial_fold(const size_t len, __m256i *crc0, __m256i *crc1, __m256i crc_part) {
+	__m256i shuf = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(pshufb_rot_table + (len&15))));
+	__m256i mask = _mm256_cmpgt_epi8(shuf, _mm256_set1_epi8(15));
+
+	*crc0 = _mm256_shuffle_epi8(*crc0, shuf);
+	*crc1 = _mm256_shuffle_epi8(*crc1, shuf);
+	crc_part = _mm256_shuffle_epi8(crc_part, shuf);
+
+	__m256i crc_out = _mm256_permute2x128_si256(*crc0, *crc0, 0x08); // move bottom->top
+	__m256i crc01, crc1p;
+	if(len >= 16) {
+		crc_out = MM256_BLENDV(crc_out, *crc0, mask);
+		crc01 = *crc1;
+		crc1p = crc_part;
+		*crc0 = _mm256_permute2x128_si256(*crc0, *crc1, 0x21);
+		*crc1 = _mm256_permute2x128_si256(*crc1, crc_part, 0x21);
+		crc_part = zext128_256(_mm256_extracti128_si256(crc_part, 1));
+	} else {
+		crc_out = _mm256_and_si256(crc_out, mask);
+		crc01 = _mm256_permute2x128_si256(*crc0, *crc1, 0x21);
+		crc1p = _mm256_permute2x128_si256(*crc1, crc_part, 0x21);
+	}
+
+	*crc0 = MM256_BLENDV(*crc0, crc01, mask);
+	*crc1 = MM256_BLENDV(*crc1, crc1p, mask);
+
+	*crc1 = do_one_fold(crc_out, *crc1);
+}
+
+
+ALIGN_TO(16, static const unsigned crc_k[]) = {
+	0xccaa009e, 0x00000000, /* rk1 */
+	0x751997d0, 0x00000001, /* rk2 */
+	0xccaa009e, 0x00000000, /* rk5 */
+	0x63cd6124, 0x00000001, /* rk6 */
+	0xf7011641, 0x00000000, /* rk7 */
+	0xdb710640, 0x00000001  /* rk8 */
+};
+
+
+static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
+	// info from https://www.reddit.com/r/ReverseEngineering/comments/2zwhl3/mystery_constant_0x9db42487_in_intels_crc32ieee/
+	// firstly, calculate: xmm_crc0 = (intial * 0x487b9c8a) mod 0x104c11db7, where 0x487b9c8a = inverse(1<<512) mod 0x104c11db7
+	__m128i xmm_t0 = _mm_cvtsi32_si128(~initial);
+
+	xmm_t0 = _mm_clmulepi64_si128(xmm_t0, _mm_set_epi32(0, 0, 0xa273bc24, 0), 0); // reverse(0x487b9c8a)<<1 == 0xa273bc24
+	__m128i reduction = _mm_set_epi32( // polynomial reduction factors
+		1, 0xdb710640, // G* = 0x04c11db7
+		0, 0xf7011641 // Q+ = 0x04d101df (+1 to save an additional xor operation)
+	);
+	__m128i xmm_t1 = _mm_clmulepi64_si128(xmm_t0, reduction, 0);
+	xmm_t1 = _mm_clmulepi64_si128(xmm_t1, reduction, 0x10);
+
+	xmm_t0 = _mm_srli_si128(_mm_xor_si128(xmm_t0, xmm_t1), 8);
+	__m256i crc0 = zext128_256(xmm_t0);
+	__m256i crc1 = _mm256_setzero_si256();
+
+	if (len < 32) {
+		if (len == 0)
+			return initial;
+		__m256i crc_part = _mm256_setzero_si256();
+		memcpy(&crc_part, src, len);
+		partial_fold(len, &crc0, &crc1, crc_part);
+	} else {
+		uintptr_t algn_diff = (0 - (uintptr_t)src) & 0x1F;
+		if (algn_diff) {
+			partial_fold(algn_diff, &crc0, &crc1, _mm256_loadu_si256((__m256i *)src));
+			src += algn_diff;
+			len -= algn_diff;
+		}
+
+		while (len >= 64) {
+			crc0 = do_one_fold(crc0, _mm256_load_si256((__m256i*)src));
+			crc1 = do_one_fold(crc1, _mm256_load_si256((__m256i*)src + 1));
+			src += 64;
+			len -= 64;
+		}
+
+		if (len >= 32) {
+			__m256i old = crc1;
+			crc1 = do_one_fold(crc0, _mm256_load_si256((__m256i*)src));
+			crc0 = old;
+
+			len -= 32;
+			src += 32;
+		}
+
+		if(len != 0) {
+			partial_fold(len, &crc0, &crc1, _mm256_load_si256((__m256i *)src));
+		}
+	}
+
+	const __m128i xmm_mask = _mm_set_epi32(-1,-1,-1,0);
+	__m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;
+
+	__m128i xmm_crc0 = _mm256_castsi256_si128(crc0);
+	__m128i xmm_crc1 = _mm256_extracti128_si256(crc0, 1);
+	__m128i xmm_crc2 = _mm256_castsi256_si128(crc1);
+	__m128i xmm_crc3 = _mm256_extracti128_si256(crc1, 1);
+
+	/*
+	 * k1
+	 */
+	crc_fold = _mm_load_si128((__m128i *)crc_k);
+
+	x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
+	xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
+	xmm_crc1 = MM_2XOR(xmm_crc1, x_tmp0, xmm_crc0);
+
+	x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10);
+	xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01);
+	xmm_crc2 = MM_2XOR(xmm_crc2, x_tmp1, xmm_crc1);
+
+	x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10);
+	xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01);
+	xmm_crc3 = MM_2XOR(xmm_crc3, x_tmp2, xmm_crc2);
+
+	/*
+	 * k5
+	 */
+	crc_fold = _mm_load_si128((__m128i *)crc_k + 1);
+
+	xmm_crc0 = xmm_crc3;
+	xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
+	xmm_crc0 = _mm_srli_si128(xmm_crc0, 8);
+	xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
+
+	xmm_crc0 = xmm_crc3;
+	xmm_crc3 = _mm_slli_si128(xmm_crc3, 4);
+	xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
+#ifdef ENABLE_AVX512
+	//xmm_crc3 = _mm_maskz_xor_epi32(14, xmm_crc3, xmm_crc0);
+	xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, xmm_crc0, xmm_mask, 0x28);
+#else
+	xmm_crc0 = _mm_and_si128(xmm_crc0, xmm_mask);
+	xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
+#endif
+
+	/*
+	 * k7
+	 */
+	xmm_crc1 = xmm_crc3;
+	crc_fold = _mm_load_si128((__m128i *)crc_k + 2);
+
+	xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
+	xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
+#ifdef ENABLE_AVX512
+	xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, xmm_crc1, xmm_crc1, 0xC3); // NOT(xmm_crc3 ^ xmm_crc1)
+#else
+	xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_mask);
+	xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);
+#endif
+	return _mm_extract_epi32(xmm_crc3, 2);
+}
+
+static uint32_t do_crc32_incremental_clmul(const void* data, size_t length, uint32_t init) {
+	return crc_fold((const unsigned char*)data, (long)length, init);
+}
+
+void crc_clmul256_set_funcs(crc_func* _do_crc32_incremental) {
+	*_do_crc32_incremental = &do_crc32_incremental_clmul;
+}
+#else
+void crc_clmul_set_funcs(crc_func* _do_crc32_incremental);
+void crc_clmul256_set_funcs(crc_func* _do_crc32_incremental) {
+	crc_clmul_set_funcs(_do_crc32_incremental);
+}
+#endif
+
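Note on the new file above: its only entry point is crc_clmul256_set_funcs(), which installs the 256-bit folding kernel into the package's CRC function-pointer dispatch (or falls back to the existing 128-bit CLMUL setup when VPCLMULQDQ support isn't compiled in). The sketch below is purely illustrative and not part of the package; the crc_func typedef shown is an assumption inferred from do_crc32_incremental_clmul's signature (the real typedef lives in crc_common.h, which is not part of this excerpt), and the zero starting value mirrors common incremental CRC-32 conventions.

    // Hypothetical standalone driver (not package code) showing how the
    // setup hook added in crc_folding_256.cc would typically be consumed.
    #include <stdint.h>
    #include <stddef.h>
    #include <inttypes.h>
    #include <stdio.h>

    // Assumed shape of crc_func; the real typedef is in crc_common.h.
    typedef uint32_t (*crc_func)(const void* data, size_t length, uint32_t init);

    // Exported by crc_folding_256.cc.
    void crc_clmul256_set_funcs(crc_func* _do_crc32_incremental);

    int main(void) {
        crc_func do_crc32_incremental = 0;
        crc_clmul256_set_funcs(&do_crc32_incremental); // install the kernel

        const unsigned char buf[] = "yEnc test data";
        // Incremental use: feed the previous result back in as 'init'
        // (0 assumed as the conventional starting value).
        uint32_t crc = do_crc32_incremental(buf, 4, 0);
        crc = do_crc32_incremental(buf + 4, sizeof(buf) - 1 - 4, crc);
        printf("crc32 = %08" PRIx32 "\n", crc);
        return 0;
    }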
package/src/decoder.cc
CHANGED
@@ -1,15 +1,19 @@
 #include "common.h"
 
 #include "decoder_common.h"
+#include "decoder.h"
 
-
-YencDecoderEnd (*
-YencDecoderEnd (*
+extern "C" {
+	YencDecoderEnd (*_do_decode)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*) = &do_decode_scalar<false, false>;
+	YencDecoderEnd (*_do_decode_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*) = &do_decode_scalar<true, false>;
+	YencDecoderEnd (*_do_decode_end_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*) = &do_decode_end_scalar<true>;
+}
 
 void decoder_set_sse2_funcs();
 void decoder_set_ssse3_funcs();
 void decoder_set_avx_funcs();
 void decoder_set_avx2_funcs();
+void decoder_set_vbmi2_funcs();
 void decoder_set_neon_funcs();
 
 
@@ -41,7 +45,9 @@ void decoder_init() {
 	decoder_set_native_funcs();
 # else
 	int use_isa = cpu_supports_isa();
-	if(use_isa >=
+	if(use_isa >= ISA_LEVEL_VBMI2)
+		decoder_set_vbmi2_funcs();
+	else if(use_isa >= ISA_LEVEL_AVX2)
 		decoder_set_avx2_funcs();
 	else if(use_isa >= ISA_LEVEL_AVX)
 		decoder_set_avx_funcs();
package/src/decoder.h
CHANGED
@@ -1,3 +1,11 @@
+#ifndef __YENC_DECODER_H
+#define __YENC_DECODER_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
 
 // the last state that the decoder was in (i.e. last few characters processed)
 // the state is needed for incremental decoders as its behavior is affected by what it processed last
@@ -25,8 +33,7 @@ extern YencDecoderEnd (*_do_decode)(const unsigned char*HEDLEY_RESTRICT*, unsign
 extern YencDecoderEnd (*_do_decode_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*);
 extern YencDecoderEnd (*_do_decode_end_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*);
 
-
-static inline size_t do_decode(const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, YencDecoderState* state) {
+static inline size_t do_decode(int isRaw, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, YencDecoderState* state) {
 	unsigned char* ds = dest;
 	(*(isRaw ? _do_decode_raw : _do_decode))(&src, &ds, len, state);
 	return ds - dest;
@@ -37,3 +44,10 @@ static inline YencDecoderEnd do_decode_end(const unsigned char*HEDLEY_RESTRICT*
 }
 
 void decoder_init();
+
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif
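The header changes above add an include guard and extern "C" wrapping, and do_decode() now takes isRaw as an explicit runtime parameter. A hedged usage sketch follows; the YDEC_STATE_CRLF initial-state enumerator and the "output never exceeds input" sizing assumption are inferences from the decoder's comments, not definitions shown in this diff.

    // Hypothetical caller (not package code) exercising the updated
    // do_decode(isRaw, ...) signature declared in decoder.h.
    #include <vector>
    #include <cstddef>
    #include "decoder.h"

    size_t decode_block(const unsigned char* raw, size_t rawLen,
                        std::vector<unsigned char>& out) {
        decoder_init();                  // selects the SSE2/AVX2/VBMI2/NEON kernel at runtime
        out.resize(rawLen);              // yEnc decoding never grows the data (assumption)
        YencDecoderState state = YDEC_STATE_CRLF; // assumed "start of line" initial state
        // isRaw = 1: treat input as raw NNTP article data, so dot-unstuffing is handled too
        size_t outLen = do_decode(1, raw, out.data(), rawLen, &state);
        out.resize(outLen);
        return outLen;
    }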
package/src/decoder_avx2_base.h
CHANGED
@@ -1,8 +1,8 @@
 
 #ifdef __AVX2__
 
-// GCC (ver 6-10(dev)) fails to optimize pure C version of mask testing, but has this intrinsic; Clang >= 7 optimizes C version fine
-#if defined(__GNUC__) && __GNUC__ >= 7
+// GCC (ver 6-10(dev)) fails to optimize pure C version of mask testing, but has this intrinsic; Clang >= 7 optimizes C version fine; functions added in Clang 8
+#if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
 # define KORTEST32(a, b) !_kortestz_mask32_u8((a), (b))
 # define KAND32(a, b) _kand_mask32((a), (b))
 # define KOR32(a, b) _kor_mask32((a), (b))
@@ -30,13 +30,17 @@ static HEDLEY_ALWAYS_INLINE __m256i force_align_read_256(const void* p) {
 }
 
 // _mm256_castsi128_si256, but upper is defined to be 0
-#if defined(__clang__) && __clang_major__ >= 5
+#if (defined(__clang__) && __clang_major__ >= 5 && (!defined(__APPLE__) || __clang_major__ >= 7)) || (defined(__GNUC__) && __GNUC__ >= 10)
 // intrinsic unsupported in GCC 9 and MSVC < 2017
 # define zext128_256 _mm256_zextsi128_si256
 #else
 // technically a cast is incorrect, due to upper 128 bits being undefined, but should usually work fine
 // alternative may be `_mm256_set_m128i(_mm_setzero_si128(), v)` but unsupported on GCC < 7, and most compilers generate a VINSERTF128 instruction for it
-#
+# ifdef __OPTIMIZE__
+#  define zext128_256 _mm256_castsi128_si256
+# else
+#  define zext128_256(x) _mm256_inserti128_si256(_mm256_setzero_si256(), x, 0)
+# endif
 #endif
 
 
@@ -56,6 +60,17 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 			'.','.','.','.','.','.','.','.','.','.','.','.','.','.',_nextMask==2?0:'.',_nextMask==1?0:'.'
 		);
 	}
+
+	// for some reason, MSVC Win32 seems to crash when trying to compile _mm256_mask_cmpeq_epi8_mask
+	// the crash can be fixed by switching the order of the last two arguments, but it seems to generate wrong code
+	// so just disable the optimisation as it seems to be problematic there
+#if defined(__AVX512VL__) && defined(__AVX512BW__)
+# if defined(_MSC_VER) && !defined(PLATFORM_AMD64) && !defined(__clang__)
+	const bool useAVX3MaskCmp = false;
+# else
+	const bool useAVX3MaskCmp = (use_isa >= ISA_LEVEL_AVX3);
+# endif
+#endif
 	intptr_t i;
 	for(i = -len; i; i += sizeof(__m256i)*2) {
 		__m256i oDataA = _mm256_load_si256((__m256i *)(src+i));
@@ -122,7 +137,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 			__mmask32 match2EqMaskA, match2EqMaskB;
 			__mmask32 match0CrMaskA, match0CrMaskB;
 			__mmask32 match2CrXDtMaskA, match2CrXDtMaskB;
-			if(
+			if(useAVX3MaskCmp && searchEnd) {
 				match2EqMaskA = _mm256_cmpeq_epi8_mask(_mm256_set1_epi8('='), tmpData2A);
 				match2EqMaskB = _mm256_cmpeq_epi8_mask(_mm256_set1_epi8('='), tmpData2B);
 			} else
@@ -138,7 +153,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 			// find patterns of \r_.
 
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-			if(
+			if(useAVX3MaskCmp) {
 				match0CrMaskA = _mm256_cmpeq_epi8_mask(oDataA, _mm256_set1_epi8('\r'));
 				match0CrMaskB = _mm256_cmpeq_epi8_mask(oDataB, _mm256_set1_epi8('\r'));
 				match2CrXDtMaskA = _mm256_mask_cmpeq_epi8_mask(match0CrMaskA, tmpData2A, _mm256_set1_epi8('.'));
@@ -168,7 +183,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
 			__mmask32 match1NlMaskA, match1NlMaskB;
 			__mmask32 match2NlDotMaskA, match2NlDotMaskB;
-			if(
+			if(useAVX3MaskCmp) {
 				match1NlMaskA = _mm256_mask_cmpeq_epi8_mask(
 					match0CrMaskA,
 					_mm256_set1_epi8('\n'),
@@ -224,7 +239,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 
 				int matchEnd;
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-				if(
+				if(useAVX3MaskCmp) {
 					__mmask32 match3EqYMaskA = _mm256_mask_cmpeq_epi8_mask(
 						match2EqMaskA,
 						_mm256_set1_epi8('y'),
@@ -298,12 +313,12 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 				if(LIKELIHOOD(0.002, matchEnd)) {
 					// terminator found
 					// there's probably faster ways to do this, but reverting to scalar code should be good enough
-					len += i;
+					len += (long)i;
 					break;
 				}
 			}
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-			if(
+			if(useAVX3MaskCmp) {
 				mask |= (uint64_t)match2NlDotMaskA << 2;
 				mask |= (uint64_t)match2NlDotMaskB << 34;
 				minMask = _mm256_maskz_mov_epi8(~(match2NlDotMaskB>>30), _mm256_set1_epi8('.'));
@@ -321,7 +336,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 			__m256i match3EqYA, match3EqYB;
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
 			__mmask32 match3EqYMaskA, match3EqYMaskB;
-			if(
+			if(useAVX3MaskCmp) {
 				match3EqYMaskA = _mm256_mask_cmpeq_epi8_mask(
 					match2EqMaskA,
 					_mm256_set1_epi8('y'),
@@ -351,7 +366,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 			if(LIKELIHOOD(0.002, partialEndFound)) {
 				bool endFound;
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-				if(
+				if(useAVX3MaskCmp) {
 					__mmask32 match3LfEqYMaskA = _mm256_mask_cmpeq_epi8_mask(
 						match3EqYMaskA,
 						_mm256_set1_epi8('\n'),
@@ -390,7 +405,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 					));
 				}
 				if(endFound) {
-					len += i;
+					len += (long)i;
 					break;
 				}
 			}
@@ -489,14 +504,10 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 #endif
 		{
 			// << 1 byte
-			cmpEqB = _mm256_cmpeq_epi8(_mm256_set1_epi8('='), _mm256_loadu_si256((__m256i *)(src+i-1) + 1));
-#if defined(__tune_znver1__) || defined(__tune_bdver4__)
 			cmpEqA = _mm256_alignr_epi8(cmpEqA, _mm256_inserti128_si256(
-
+				_mm256_set1_epi8('='), _mm256_castsi256_si128(cmpEqA), 1
 			), 15);
-
-			cmpEqA = _mm256_alignr_epi8(cmpEqA, _mm256_permute2x128_si256(cmpEqA, cmpEqA, 0x08), 15);
-#endif
+			cmpEqB = _mm256_cmpeq_epi8(_mm256_set1_epi8('='), _mm256_loadu_si256((__m256i *)(src+i-1) + 1));
 			dataA = _mm256_add_epi8(
 				oDataA,
 				_mm256_blendv_epi8(
@@ -523,7 +534,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 #endif
 		{
 			yencOffset = _mm256_xor_si256(_mm256_set1_epi8(-42), zext128_256(
-				_mm_slli_epi16(_mm_cvtsi32_si128(escFirst), 6)
+				_mm_slli_epi16(_mm_cvtsi32_si128((int)escFirst), 6)
 			));
 		}
 
@@ -565,7 +576,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 		p -= popcnt32(mask & 0xffff0);
 
 		_mm_storeu_si128((__m128i*)(p + XMM_SIZE*3), _mm256_extracti128_si256(dataB, 1));
-		p -= popcnt32(mask >> 20);
+		p -= popcnt32((unsigned int)(mask >> 20));
 #else
 		mask >>= 32;
 		shuf = _mm256_inserti128_si256(
package/src/decoder_common.h
CHANGED
@@ -340,7 +340,7 @@ YencDecoderEnd do_decode_simd(const unsigned char* HEDLEY_RESTRICT* src, unsigne
 	if((uintptr_t)(*src) & ((width-1))) {
 		// find source memory alignment
 		unsigned char* aSrc = (unsigned char*)(((uintptr_t)(*src) + (width-1)) & ~(width-1));
-		int amount = aSrc - *src;
+		int amount = (int)(aSrc - *src);
 		len -= amount;
 		YencDecoderEnd ended = do_decode_scalar<isRaw, searchEnd>(src, dest, amount, pState);
 		if(ended) return ended;
@@ -427,7 +427,7 @@ YencDecoderEnd do_decode_simd(const unsigned char* HEDLEY_RESTRICT* src, unsigne
 	escFirst = (*pState == YDEC_STATE_EQ || *pState == YDEC_STATE_CRLFEQ);
 
 	// our algorithm may perform an aligned load on the next part, of which we consider 2 bytes (for \r\n. sequence checking)
-	long dLen = len - lenBuffer;
+	long dLen = (long)(len - lenBuffer);
 	dLen = (dLen + (width-1)) & ~(width-1);
 
 	kernel((const uint8_t*)(*src) + dLen, dLen, p, escFirst, nextMask);
package/src/decoder_neon.cc
CHANGED
@@ -7,9 +7,9 @@
 #include "decoder_common.h"
 
 
-#
-# define vld1_u8_align vld1_u8_ex
-# define vld1q_u8_align vld1q_u8_ex
+#if defined(_MSC_VER) && !defined(__clang__)
+# define vld1_u8_align(p, a) vld1_u8_ex(p, a*8)
+# define vld1q_u8_align(p, a) vld1q_u8_ex(p, a*8)
 #elif defined(__GNUC__)
 # define vld1_u8_align(p, n) vld1_u8((uint8_t*)__builtin_assume_aligned(p, n))
 # define vld1q_u8_align(p, n) vld1q_u8((uint8_t*)__builtin_assume_aligned(p, n))
@@ -19,19 +19,17 @@
 #endif
 
 
-// for compilers that lack these functions
-#if defined(__clang__) || (defined(__GNUC__) && (defined(__aarch64__) && __GNUC__ >= 8))
+// for compilers that lack these functions (Clang armv7 9-12 seems to have issues with multi-vector loads)
+#if (defined(__clang__) && (defined(__aarch64__) || __clang_major__<9 || __clang_major__>12)) || (defined(__GNUC__) && (defined(__aarch64__) && __GNUC__ >= 8))
 # define vld1q_u8_x2_align(p, n) vld1q_u8_x2((uint8_t*)__builtin_assume_aligned(p, n))
 #else
-
-	return (uint8x16x2_t){vld1q_u8_align(p, n), vld1q_u8_align(p+16, n)};
-}
+# define vld1q_u8_x2_align(p, n) vcreate2_u8(vld1q_u8_align(p, (n)/2), vld1q_u8_align((p)+16, (n)/2))
 #endif
 // Clang wrongly assumes alignment on vld1q_u8_x2, and ARMv7 GCC doesn't support the function, so effectively, it can only be used in ARMv8 compilers
-#if defined(__aarch64__) && (defined(__clang__) || (
+#if defined(__aarch64__) && (defined(__clang__) || HEDLEY_GCC_VERSION_CHECK(8,5,0))
 # define vst1q_u8_x2_unaligned vst1q_u8_x2
 #else
-HEDLEY_ALWAYS_INLINE void vst1q_u8_x2_unaligned(uint8_t* p, uint8x16x2_t data) {
+static HEDLEY_ALWAYS_INLINE void vst1q_u8_x2_unaligned(uint8_t* p, uint8x16x2_t data) {
 	vst1q_u8(p, data.val[0]);
 	vst1q_u8(p+16, data.val[1]);
 }
@@ -64,18 +62,20 @@ template<bool isRaw, bool searchEnd>
 HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, long& len, unsigned char* HEDLEY_RESTRICT & p, unsigned char& escFirst, uint16_t& nextMask) {
 	HEDLEY_ASSUME(escFirst == 0 || escFirst == 1);
 	HEDLEY_ASSUME(nextMask == 0 || nextMask == 1 || nextMask == 2);
-	uint8x16_t yencOffset = escFirst ? (
+	uint8x16_t yencOffset = escFirst ? vmakeq_u8(42+64,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42) : vdupq_n_u8(42);
 #ifdef __aarch64__
 	uint8x16_t nextMaskMix = vdupq_n_u8(0);
-	if(nextMask)
-		nextMaskMix
+	if(nextMask == 1)
+		nextMaskMix = vsetq_lane_u8(1, nextMaskMix, 0);
+	if(nextMask == 2)
+		nextMaskMix = vsetq_lane_u8(2, nextMaskMix, 1);
 #else
 	uint8x16_t lfCompare = vdupq_n_u8('\n');
 	if(isRaw) {
 		if(nextMask == 1)
-			lfCompare
+			lfCompare = vsetq_lane_u8('.', lfCompare, 0);
 		if(nextMask == 2)
-			lfCompare
+			lfCompare = vsetq_lane_u8('.', lfCompare, 1);
 	}
 #endif
 	long i;
@@ -90,13 +90,13 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 #ifdef __aarch64__
 		cmpA = vqtbx1q_u8(
 			cmpEqA,
-			//
-			(
+			// \n \r
+			vmakeq_u8(0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
 			dataA
 		),
 		cmpB = vqtbx1q_u8(
 			cmpEqB,
-			(
+			vmakeq_u8(0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
 			dataB
 		);
 		if(isRaw) cmpA = vorrq_u8(cmpA, nextMaskMix);
@@ -122,12 +122,12 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 
 #ifdef __aarch64__
 		if (LIKELIHOOD(0.42 /*guess*/, neon_vect_is_nonzero(vorrq_u8(cmpA, cmpB)))) {
-			cmpA = vandq_u8(cmpA, (
-			cmpB = vandq_u8(cmpB, (
+			cmpA = vandq_u8(cmpA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+			cmpB = vandq_u8(cmpB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
 			uint8x16_t cmpMerge = vpaddq_u8(cmpA, cmpB);
 			uint8x16_t cmpEqMerge = vpaddq_u8(
-				vandq_u8(cmpEqA, (
-				vandq_u8(cmpEqB, (
+				vandq_u8(cmpEqA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
+				vandq_u8(cmpEqB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128))
 			);
 
 			uint8x16_t cmpCombined = vpaddq_u8(cmpMerge, cmpEqMerge);
@@ -136,8 +136,8 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 			uint32_t mask = vgetq_lane_u32(vreinterpretq_u32_u8(cmpCombined), 0);
 			uint32_t maskEq = vgetq_lane_u32(vreinterpretq_u32_u8(cmpCombined), 1);
 #else
-		cmpA = vandq_u8(cmpA, (
-		cmpB = vandq_u8(cmpB, (
+		cmpA = vandq_u8(cmpA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+		cmpB = vandq_u8(cmpB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
 		// no vpaddq_u8 in ARMv7, so need extra 64-bit VPADD
 		uint8x8_t cmpPacked = vpadd_u8(
 			vpadd_u8(
@@ -150,8 +150,8 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 		cmpPacked = vpadd_u8(cmpPacked, cmpPacked);
 		uint32_t mask = vget_lane_u32(vreinterpret_u32_u8(cmpPacked), 0);
 		if(LIKELIHOOD(0.42, mask != 0)) {
-			uint8x16_t cmpEqMaskedA = vandq_u8(cmpEqA, (
-			uint8x16_t cmpEqMaskedB = vandq_u8(cmpEqB, (
+			uint8x16_t cmpEqMaskedA = vandq_u8(cmpEqA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+			uint8x16_t cmpEqMaskedB = vandq_u8(cmpEqB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
 			uint8x8_t cmpEqPacked = vpadd_u8(
 				vpadd_u8(
 					vget_low_u8(cmpEqMaskedA), vget_high_u8(cmpEqMaskedA)
@@ -170,7 +170,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 			// vext seems to be a cheap operation on ARM, relative to loads, so only avoid it if there's only one load (isRaw only)
 			uint8x16_t tmpData2, nextData;
 			if(isRaw && !searchEnd) {
-				tmpData2 =
+				tmpData2 = vld1q_u8(src+i + 2 + sizeof(uint8x16_t));
 			} else {
 				nextData = vld1q_u8_align(src+i + sizeof(uint8x16_t)*2, 16); // only 32-bits needed, but there doesn't appear a nice way to do this via intrinsics: https://stackoverflow.com/questions/46910799/arm-neon-intrinsics-convert-d-64-bit-register-to-low-half-of-q-128-bit-regis
 				tmpData2 = vextq_u8(dataB, nextData, 2);
@@ -255,15 +255,15 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 				}
 			}
 #ifdef __aarch64__
-			uint8x16_t match2NlDotBMasked = vandq_u8(match2NlDotB, (
+			uint8x16_t match2NlDotBMasked = vandq_u8(match2NlDotB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
 			uint8x16_t mergeKillDots = vpaddq_u8(
-				vandq_u8(match2NlDotA, (
+				vandq_u8(match2NlDotA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
 				match2NlDotBMasked
 			);
 			uint8x8_t mergeKillDots2 = vget_low_u8(vpaddq_u8(mergeKillDots, mergeKillDots));
 #else
-			uint8x16_t match2NlDotMaskedA = vandq_u8(match2NlDotA, (
-			uint8x16_t match2NlDotMaskedB = vandq_u8(match2NlDotB, (
+			uint8x16_t match2NlDotMaskedA = vandq_u8(match2NlDotA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+			uint8x16_t match2NlDotMaskedB = vandq_u8(match2NlDotB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
 			uint8x8_t mergeKillDots2 = vpadd_u8(
 				vpadd_u8(
 					vget_low_u8(match2NlDotMaskedA), vget_high_u8(match2NlDotMaskedA)
@@ -342,11 +342,11 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 #ifdef __aarch64__
 			uint8x16_t vMaskEqA = vqtbl1q_u8(
 				vcombine_u8(maskEqTemp, vdup_n_u8(0)),
-				(
+				vmakeq_u8(0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1)
 			);
 			uint8x16_t vMaskEqB = vqtbl1q_u8(
 				vcombine_u8(maskEqTemp, vdup_n_u8(0)),
-				(
+				vmakeq_u8(2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3)
 			);
 #else
 			uint8x16_t vMaskEqA = vcombine_u8(
@@ -358,8 +358,8 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 				vdup_lane_u8(maskEqTemp, 3)
 			);
 #endif
-			vMaskEqA = vtstq_u8(vMaskEqA, (
-			vMaskEqB = vtstq_u8(vMaskEqB, (
+			vMaskEqA = vtstq_u8(vMaskEqA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+			vMaskEqB = vtstq_u8(vMaskEqB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
 
 			dataA = vsubq_u8(
 				dataA,
@@ -391,7 +391,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 				)
 			);
 		}
-		yencOffset
+		yencOffset = vsetq_lane_u8((escFirst << 6) | 42, yencOffset, 0);
 
 		// all that's left is to 'compress' the data (skip over masked chars)
 		uint32_t counts = 0x08080808 - vget_lane_u32(vreinterpret_u32_u8(vcnt_u8(cmpPacked)), 0);
@@ -439,7 +439,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 		} else {
 			dataA = vsubq_u8(dataA, yencOffset);
 			dataB = vsubq_u8(dataB, vdupq_n_u8(42));
-			vst1q_u8_x2_unaligned(p, (
+			vst1q_u8_x2_unaligned(p, vcreate2_u8(dataA, dataB));
 			p += sizeof(uint8x16_t)*2;
 			escFirst = 0;
 #ifdef __aarch64__