yencode 1.0.8 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +339 -231
- package/binding.gyp +292 -39
- package/crcutil-1.0/code/multiword_64_64_gcc_amd64_asm.cc +7 -7
- package/crcutil-1.0/code/multiword_64_64_gcc_i386_mmx.cc +14 -14
- package/crcutil-1.0/code/multiword_64_64_intrinsic_i386_mmx.cc +1 -1
- package/crcutil-1.0/code/uint128_sse2.h +2 -0
- package/index.js +329 -22
- package/package.json +2 -2
- package/src/common.h +299 -0
- package/src/crc.cc +95 -0
- package/src/crc.h +23 -0
- package/src/crc_arm.cc +175 -0
- package/src/crc_common.h +4 -0
- package/{crc_folding.c → src/crc_folding.cc} +175 -185
- package/src/decoder.cc +61 -0
- package/src/decoder.h +53 -0
- package/src/decoder_avx.cc +18 -0
- package/src/decoder_avx2.cc +18 -0
- package/src/decoder_avx2_base.h +615 -0
- package/src/decoder_common.h +512 -0
- package/src/decoder_neon.cc +474 -0
- package/src/decoder_neon64.cc +451 -0
- package/src/decoder_sse2.cc +16 -0
- package/src/decoder_sse_base.h +711 -0
- package/src/decoder_ssse3.cc +18 -0
- package/src/encoder.cc +170 -0
- package/src/encoder.h +21 -0
- package/src/encoder_avx.cc +16 -0
- package/src/encoder_avx2.cc +16 -0
- package/src/encoder_avx_base.h +564 -0
- package/src/encoder_common.h +109 -0
- package/src/encoder_neon.cc +547 -0
- package/src/encoder_sse2.cc +13 -0
- package/src/encoder_sse_base.h +724 -0
- package/src/encoder_ssse3.cc +18 -0
- package/src/hedley.h +1899 -0
- package/src/platform.cc +147 -0
- package/src/yencode.cc +449 -0
- package/test/_maxsize.js +9 -0
- package/test/_speedbase.js +147 -0
- package/test/speedcrc.js +20 -0
- package/test/speeddec.js +92 -0
- package/test/speedenc.js +44 -0
- package/{testcrc.js → test/testcrc.js} +53 -39
- package/test/testdec.js +183 -0
- package/test/testenc.js +163 -0
- package/test/testpostdec.js +126 -0
- package/test.js +0 -91
- package/yencode.cc +0 -1622
package/{crc_folding.c → src/crc_folding.cc}

@@ -17,158 +17,54 @@
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
 
-#
-
+#include "crc_common.h"
+
+#if (defined(__PCLMUL__) && defined(__SSSE3__) && defined(__SSE4_1__)) || (defined(_MSC_VER) && _MSC_VER >= 1600 && defined(PLATFORM_X86))
 #include <inttypes.h>
 #include <immintrin.h>
 #include <wmmintrin.h>
 
-#define local static
 
-#
-#define
-#else
-#define ALIGN(_a, v) v __attribute__((aligned(_a)))
+#if defined(__AVX512VL__) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
+# define ENABLE_AVX512 1
 #endif
 
 
-
-
-
-
-
-
-
-
-    x_tmp3 = *xmm_crc3;
-
-    *xmm_crc3 = *xmm_crc0;
-    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
-    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
-    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
-    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
-    ps_res = _mm_xor_ps(ps_crc0, ps_crc3);
-
-    *xmm_crc0 = *xmm_crc1;
-    *xmm_crc1 = *xmm_crc2;
-    *xmm_crc2 = x_tmp3;
-    *xmm_crc3 = _mm_castps_si128(ps_res);
-}
-
-local void fold_2(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
-    const __m128i xmm_fold4 = _mm_set_epi32(
-            0x00000001, 0x54442bd4,
-            0x00000001, 0xc6e41596);
-
-    __m128i x_tmp3, x_tmp2;
-    __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res31, ps_res20;
-
-    x_tmp3 = *xmm_crc3;
-    x_tmp2 = *xmm_crc2;
-
-    *xmm_crc3 = *xmm_crc1;
-    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
-    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
-    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
-    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
-    ps_res31 = _mm_xor_ps(ps_crc3, ps_crc1);
-
-    *xmm_crc2 = *xmm_crc0;
-    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
-    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
-    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
-    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
-    ps_res20 = _mm_xor_ps(ps_crc0, ps_crc2);
-
-    *xmm_crc0 = x_tmp2;
-    *xmm_crc1 = x_tmp3;
-    *xmm_crc2 = _mm_castps_si128(ps_res20);
-    *xmm_crc3 = _mm_castps_si128(ps_res31);
+// interestingly, MSVC seems to generate better code if using VXORPS over VPXOR
+// original Intel code uses XORPS for many XOR operations, but PXOR is pretty much always better (more port freedom on Intel CPUs). The only advantage of XORPS is that it's 1 byte shorter, an advantage which disappears under AVX as both instructions have the same length
+#if defined(__AVX__) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
+# define fold_xor _mm_xor_si128
+#else
+static __m128i fold_xor(__m128i a, __m128i b) {
+    return _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b)));
 }
+#endif
 
-
+#ifdef ENABLE_AVX512
+static __m128i do_one_fold_merge(__m128i src, __m128i data) {
     const __m128i xmm_fold4 = _mm_set_epi32(
             0x00000001, 0x54442bd4,
             0x00000001, 0xc6e41596);
-
-
-
-
-
-
-    *xmm_crc3 = *xmm_crc2;
-    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
-    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
-    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
-    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
-    ps_res32 = _mm_xor_ps(ps_crc2, ps_crc3);
-
-    *xmm_crc2 = *xmm_crc1;
-    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
-    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
-    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
-    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
-    ps_res21 = _mm_xor_ps(ps_crc1, ps_crc2);
-
-    *xmm_crc1 = *xmm_crc0;
-    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
-    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
-    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
-    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
-    ps_res10 = _mm_xor_ps(ps_crc0, ps_crc1);
-
-    *xmm_crc0 = x_tmp3;
-    *xmm_crc1 = _mm_castps_si128(ps_res10);
-    *xmm_crc2 = _mm_castps_si128(ps_res21);
-    *xmm_crc3 = _mm_castps_si128(ps_res32);
+    return _mm_ternarylogic_epi32(
+        _mm_clmulepi64_si128(src, xmm_fold4, 0x01),
+        _mm_clmulepi64_si128(src, xmm_fold4, 0x10),
+        data,
+        0x96
+    );
 }
-
-
+#else
+static __m128i do_one_fold(__m128i src) {
     const __m128i xmm_fold4 = _mm_set_epi32(
             0x00000001, 0x54442bd4,
            0x00000001, 0xc6e41596);
-
-
-
-
-    __m128 ps_res0, ps_res1, ps_res2, ps_res3;
-
-    x_tmp0 = *xmm_crc0;
-    x_tmp1 = *xmm_crc1;
-    x_tmp2 = *xmm_crc2;
-    x_tmp3 = *xmm_crc3;
-
-    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
-    x_tmp0 = _mm_clmulepi64_si128(x_tmp0, xmm_fold4, 0x10);
-    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
-    ps_t0 = _mm_castsi128_ps(x_tmp0);
-    ps_res0 = _mm_xor_ps(ps_crc0, ps_t0);
-
-    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
-    x_tmp1 = _mm_clmulepi64_si128(x_tmp1, xmm_fold4, 0x10);
-    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
-    ps_t1 = _mm_castsi128_ps(x_tmp1);
-    ps_res1 = _mm_xor_ps(ps_crc1, ps_t1);
-
-    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
-    x_tmp2 = _mm_clmulepi64_si128(x_tmp2, xmm_fold4, 0x10);
-    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
-    ps_t2 = _mm_castsi128_ps(x_tmp2);
-    ps_res2 = _mm_xor_ps(ps_crc2, ps_t2);
-
-    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01);
-    x_tmp3 = _mm_clmulepi64_si128(x_tmp3, xmm_fold4, 0x10);
-    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
-    ps_t3 = _mm_castsi128_ps(x_tmp3);
-    ps_res3 = _mm_xor_ps(ps_crc3, ps_t3);
-
-    *xmm_crc0 = _mm_castps_si128(ps_res0);
-    *xmm_crc1 = _mm_castps_si128(ps_res1);
-    *xmm_crc2 = _mm_castps_si128(ps_res2);
-    *xmm_crc3 = _mm_castps_si128(ps_res3);
+    return fold_xor(
+        _mm_clmulepi64_si128(src, xmm_fold4, 0x01),
+        _mm_clmulepi64_si128(src, xmm_fold4, 0x10)
+    );
 }
+#endif
 
-
+ALIGN_TO(32, static const unsigned pshufb_shf_table[60]) = {
     0x84838281, 0x88878685, 0x8c8b8a89, 0x008f8e8d, /* shl 15 (16 - 1)/shr1 */
     0x85848382, 0x89888786, 0x8d8c8b8a, 0x01008f8e, /* shl 14 (16 - 3)/shr2 */
     0x86858483, 0x8a898887, 0x8e8d8c8b, 0x0201008f, /* shl 13 (16 - 4)/shr3 */
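Note on the AVX-512 path above: `_mm_ternarylogic_epi32` takes an 8-bit immediate that is the truth table of an arbitrary three-input boolean function, and 0x96 is the table for a three-way XOR. That is why `do_one_fold_merge` can combine both carry-less products and the incoming data block in a single instruction where the older code needed separate XORs. A small standalone check of that reading of the immediate (my own sketch, not part of the package):

    #include <stdio.h>

    /* Each result bit of _mm_ternarylogic_epi32(a, b, c, imm) is
     * imm[(a_bit << 2) | (b_bit << 1) | c_bit].  Verify that imm = 0x96
     * reproduces a ^ b ^ c for all eight input combinations. */
    int main(void) {
        const unsigned imm = 0x96;
        for (int a = 0; a < 2; a++)
            for (int b = 0; b < 2; b++)
                for (int c = 0; c < 2; c++) {
                    int lut = (imm >> ((a << 2) | (b << 1) | c)) & 1;
                    printf("%d%d%d -> %d (xor: %d)\n", a, b, c, lut, a ^ b ^ c);
                }
        return 0;
    }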
@@ -186,17 +82,13 @@ ALIGN(32, local const unsigned pshufb_shf_table[60]) = {
     0x0201008f, 0x06050403, 0x0a090807, 0x0e0d0c0b /* shl 1 (16 -15)/shr15*/
 };
 
-
+static void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1,
         __m128i *xmm_crc2, __m128i *xmm_crc3, __m128i *xmm_crc_part) {
 
-    const __m128i xmm_fold4 = _mm_set_epi32(
-            0x00000001, 0x54442bd4,
-            0x00000001, 0xc6e41596);
     const __m128i xmm_mask3 = _mm_set1_epi32(0x80808080);
 
     __m128i xmm_shl, xmm_shr, xmm_tmp1, xmm_tmp2, xmm_tmp3;
-    __m128i xmm_a0_0
-    __m128 ps_crc3, psa0_0, psa0_1, ps_res;
+    __m128i xmm_a0_0;
 
     xmm_shl = _mm_load_si128((__m128i *)pshufb_shf_table + (len - 1));
     xmm_shr = xmm_shl;

@@ -220,41 +112,82 @@ local void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1,
     *xmm_crc_part = _mm_shuffle_epi8(*xmm_crc_part, xmm_shl);
     *xmm_crc3 = _mm_or_si128(*xmm_crc3, *xmm_crc_part);
 
-
-
-
-
-
-
-
-
-    ps_res = _mm_xor_ps(ps_res, psa0_1);
-
-    *xmm_crc3 = _mm_castps_si128(ps_res);
+#ifdef ENABLE_AVX512
+    *xmm_crc3 = do_one_fold_merge(xmm_a0_0, *xmm_crc3);
+#else
+    *xmm_crc3 = fold_xor(
+        do_one_fold(xmm_a0_0),
+        *xmm_crc3
+    );
+#endif
 }
 
-
+ALIGN_TO(16, static const unsigned crc_k[]) = {
     0xccaa009e, 0x00000000, /* rk1 */
     0x751997d0, 0x00000001, /* rk2 */
     0xccaa009e, 0x00000000, /* rk5 */
     0x63cd6124, 0x00000001, /* rk6 */
-
+    0xf7011641, 0x00000000, /* rk7 */
     0xdb710640, 0x00000001 /* rk8 */
 };
 
-
-    0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000
-};
-
-ALIGN(16, local const unsigned crc_mask2[4]) = {
+ALIGN_TO(16, static const unsigned crc_mask[4]) = {
     0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
 };
 
-
+static __m128i reverse_bits_epi8(__m128i src) {
+#if defined(__GFNI__) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
+    return _mm_gf2p8affine_epi64_epi8(src, _mm_set_epi32(
+        0x80402010, 0x08040201,
+        0x80402010, 0x08040201
+    ), 0);
+#else
+    __m128i xmm_t0 = _mm_and_si128(src, _mm_set1_epi8(0x0f));
+    __m128i xmm_t1 = _mm_and_si128(_mm_srli_epi16(src, 4), _mm_set1_epi8(0x0f));
+    xmm_t0 = _mm_shuffle_epi8(_mm_set_epi8(
+        -16, 112, -80, 48, -48, 80, -112, 16, -32, 96, -96, 32, -64, 64, -128, 0
+        //0xf0, 0x70, 0xb0, 0x30, 0xd0, 0x50, 0x90, 0x10, 0xe0, 0x60, 0xa0, 0x20, 0xc0, 0x40, 0x80, 0
+    ), xmm_t0);
+    xmm_t1 = _mm_shuffle_epi8(_mm_set_epi8(
+        15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0
+    ), xmm_t1);
+    return _mm_or_si128(xmm_t0, xmm_t1);
+#endif
+}
+
+#ifdef _MSC_VER
+// because MSVC doesn't use BSWAP unless you specifically tell it to...
+# include <stdlib.h>
+# define BSWAP32 _byteswap_ulong
+#else
+# define BSWAP32(n) ((((n)&0xff)<<24) | (((n)&0xff00)<<8) | (((n)&0xff0000)>>8) | (((n)&0xff000000)>>24))
+#endif
+
+static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
     unsigned long algn_diff;
     __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
 
-
+    // TODO: consider calculating this via a LUT instead (probably faster)
+    // info from https://www.reddit.com/r/ReverseEngineering/comments/2zwhl3/mystery_constant_0x9db42487_in_intels_crc32ieee/
+    // firstly, calculate: xmm_crc0 = (intial * 0x487b9c8a) mod 0x104c11db7, where 0x487b9c8a = inverse(1<<512) mod 0x104c11db7
+
+    // reverse input bits + load into XMM register
+    uint32_t init_t = BSWAP32(initial);
+    xmm_t0 = reverse_bits_epi8(_mm_cvtsi32_si128(~init_t));
+
+    xmm_t0 = _mm_clmulepi64_si128(xmm_t0, _mm_cvtsi32_si128(0x487b9c8a), 0);
+    xmm_t1 = _mm_and_si128(xmm_t0, _mm_set_epi32(-1,-1,-1,0)); // shifted up by 32bits to avoid shifts by using clmul's capability to select top 64bits instead
+    xmm_t2 = _mm_set_epi32( // polynomial reduction factors
+        0, 0x04c11db7, // G*
+        1, 0x04d101df // Q+
+    );
+    xmm_t1 = _mm_clmulepi64_si128(xmm_t1, xmm_t2, 0);
+    xmm_t1 = _mm_clmulepi64_si128(xmm_t1, xmm_t2, 0x11);
+
+    __m128i xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_t1);
+    // reverse bits
+    xmm_crc0 = _mm_shuffle_epi8(reverse_bits_epi8(xmm_crc0), _mm_set_epi32(-1,-1,-1,0x00010203));
+
     __m128i xmm_crc1 = _mm_setzero_si128();
     __m128i xmm_crc2 = _mm_setzero_si128();
     __m128i xmm_crc3 = _mm_setzero_si128();
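The main change in this hunk is that `crc_fold()` now takes an `initial` CRC value: per the comments above, the previous state is bit-reversed, multiplied carry-lessly by the precomputed constant 0x487b9c8a and reduced modulo the CRC polynomial, then reversed back, so the CLMUL kernel can continue an in-progress CRC rather than always starting from scratch. The SSSE3 fallback of `reverse_bits_epi8()` does the per-byte bit reversal with two 16-entry nibble lookup tables; a scalar sketch of the same idea (illustrative only, not code from the package):

    #include <stdint.h>
    #include <stdio.h>

    /* rev4[n] is the 4-bit reversal of n (matches the PSHUFB tables above);
     * a byte is reversed by swapping its nibbles and reversing each one. */
    static const uint8_t rev4[16] = {
        0x0, 0x8, 0x4, 0xC, 0x2, 0xA, 0x6, 0xE,
        0x1, 0x9, 0x5, 0xD, 0x3, 0xB, 0x7, 0xF
    };

    static uint8_t reverse_bits_u8(uint8_t v) {
        return (uint8_t)((rev4[v & 0x0F] << 4) | rev4[v >> 4]);
    }

    int main(void) {
        printf("0x01 -> 0x%02X\n", reverse_bits_u8(0x01)); /* prints 0x80 */
        printf("0x6A -> 0x%02X\n", reverse_bits_u8(0x6A)); /* prints 0x56 */
        return 0;
    }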
@@ -262,7 +195,7 @@ uint32_t crc_fold(const unsigned char *src, long len) {
 
     if (len < 16) {
         if (len == 0)
-            return
+            return initial;
         xmm_crc_part = _mm_loadu_si128((__m128i *)src);
         goto partial;
     }

@@ -284,12 +217,22 @@ uint32_t crc_fold(const unsigned char *src, long len) {
         xmm_t2 = _mm_load_si128((__m128i *)src + 2);
         xmm_t3 = _mm_load_si128((__m128i *)src + 3);
 
-
-
+#ifdef ENABLE_AVX512
+        xmm_crc0 = do_one_fold_merge(xmm_crc0, xmm_t0);
+        xmm_crc1 = do_one_fold_merge(xmm_crc1, xmm_t1);
+        xmm_crc2 = do_one_fold_merge(xmm_crc2, xmm_t2);
+        xmm_crc3 = do_one_fold_merge(xmm_crc3, xmm_t3);
+#else
+        // nesting do_one_fold() in _mm_xor_si128() seems to cause MSVC to generate horrible code, so separate it out
+        xmm_crc0 = do_one_fold(xmm_crc0);
+        xmm_crc1 = do_one_fold(xmm_crc1);
+        xmm_crc2 = do_one_fold(xmm_crc2);
+        xmm_crc3 = do_one_fold(xmm_crc3);
         xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
         xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
         xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
         xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
+#endif
 
         src += 64;
     }

@@ -304,11 +247,20 @@ uint32_t crc_fold(const unsigned char *src, long len) {
     xmm_t1 = _mm_load_si128((__m128i *)src + 1);
     xmm_t2 = _mm_load_si128((__m128i *)src + 2);
 
-
-
-
-    xmm_crc2 =
+    xmm_t3 = xmm_crc3;
+#ifdef ENABLE_AVX512
+    xmm_crc3 = do_one_fold_merge(xmm_crc2, xmm_t2);
+    xmm_crc2 = do_one_fold_merge(xmm_crc1, xmm_t1);
+    xmm_crc1 = do_one_fold_merge(xmm_crc0, xmm_t0);
+#else
+    xmm_crc3 = do_one_fold(xmm_crc2);
+    xmm_crc2 = do_one_fold(xmm_crc1);
+    xmm_crc1 = do_one_fold(xmm_crc0);
     xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
+    xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
+    xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
+#endif
+    xmm_crc0 = xmm_t3;
 
     if (len == 0)
         goto done;

@@ -320,10 +272,19 @@ uint32_t crc_fold(const unsigned char *src, long len) {
     xmm_t0 = _mm_load_si128((__m128i *)src);
     xmm_t1 = _mm_load_si128((__m128i *)src + 1);
 
-
-
-
+    xmm_t2 = xmm_crc2;
+    xmm_t3 = xmm_crc3;
+#ifdef ENABLE_AVX512
+    xmm_crc3 = do_one_fold_merge(xmm_crc1, xmm_t1);
+    xmm_crc2 = do_one_fold_merge(xmm_crc0, xmm_t0);
+#else
+    xmm_crc3 = do_one_fold(xmm_crc1);
+    xmm_crc2 = do_one_fold(xmm_crc0);
     xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
+    xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
+#endif
+    xmm_crc1 = xmm_t3;
+    xmm_crc0 = xmm_t2;
 
     if (len == 0)
         goto done;

@@ -334,9 +295,15 @@ uint32_t crc_fold(const unsigned char *src, long len) {
 
     xmm_t0 = _mm_load_si128((__m128i *)src);
 
-
-
-    xmm_crc3 =
+    xmm_t3 = xmm_crc3;
+#ifdef ENABLE_AVX512
+    xmm_crc3 = do_one_fold_merge(xmm_crc0, xmm_t0);
+#else
+    xmm_crc3 = _mm_xor_si128(do_one_fold(xmm_crc0), xmm_t0);
+#endif
+    xmm_crc0 = xmm_crc1;
+    xmm_crc1 = xmm_crc2;
+    xmm_crc2 = xmm_t3;
 
     if (len == 0)
         goto done;

@@ -354,10 +321,7 @@ partial:
             &xmm_crc_part);
 done:
     {
-        const __m128i xmm_mask
-        const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
-
-        uint32_t crc;
+        const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask);
         __m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;
 
         /*

@@ -367,18 +331,30 @@ done:
 
         x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
         xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
+#ifdef ENABLE_AVX512
+        xmm_crc1 = _mm_ternarylogic_epi32(xmm_crc1, x_tmp0, xmm_crc0, 0x96);
+#else
         xmm_crc1 = _mm_xor_si128(xmm_crc1, x_tmp0);
         xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_crc0);
+#endif
 
         x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10);
        xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01);
+#ifdef ENABLE_AVX512
+        xmm_crc2 = _mm_ternarylogic_epi32(xmm_crc2, x_tmp1, xmm_crc1, 0x96);
+#else
         xmm_crc2 = _mm_xor_si128(xmm_crc2, x_tmp1);
         xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_crc1);
+#endif
 
         x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10);
         xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01);
+#ifdef ENABLE_AVX512
+        xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, x_tmp2, xmm_crc2, 0x96);
+#else
         xmm_crc3 = _mm_xor_si128(xmm_crc3, x_tmp2);
         xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
+#endif
 
         /*
          * k5

@@ -393,29 +369,43 @@ done:
         xmm_crc0 = xmm_crc3;
         xmm_crc3 = _mm_slli_si128(xmm_crc3, 4);
         xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
+#ifdef ENABLE_AVX512
+        //xmm_crc3 = _mm_maskz_xor_epi32(14, xmm_crc3, xmm_crc0);
+        xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, xmm_crc0, xmm_mask, 0x28);
+#else
+        xmm_crc0 = _mm_and_si128(xmm_crc0, xmm_mask);
         xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
-
+#endif
 
         /*
         * k7
         */
         xmm_crc1 = xmm_crc3;
-        xmm_crc2 = xmm_crc3;
         crc_fold = _mm_load_si128((__m128i *)crc_k + 2);
 
         xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
-        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
-        xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask);
-
-        xmm_crc2 = xmm_crc3;
         xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
-
+#ifdef ENABLE_AVX512
+        xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, xmm_crc1, xmm_crc1, 0xC3); // NOT(xmm_crc3 ^ xmm_crc1)
+#else
+        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_mask);
         xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);
+#endif
+        return _mm_extract_epi32(xmm_crc3, 2);
+    }
 
-        crc = _mm_extract_epi32(xmm_crc3, 2);
-        return ~crc;
 }
 
+static uint32_t do_crc32_incremental_clmul(const void* data, size_t length, uint32_t init) {
+    return crc_fold((const unsigned char*)data, (long)length, init);
+}
+
+void crc_clmul_set_funcs(crc_func* _do_crc32_incremental) {
+    *_do_crc32_incremental = &do_crc32_incremental_clmul;
+}
+#else
+void crc_clmul_set_funcs(crc_func* _do_crc32_incremental) {
+    (void)_do_crc32_incremental;
 }
 #endif
 
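The tail of this file replaces the old `return ~crc` interface with an incremental one: `crc_fold()` returns the folded CRC directly, `do_crc32_incremental_clmul()` adapts it to a generic signature, and `crc_clmul_set_funcs()` installs it into a caller-supplied `crc_func` pointer, with a stub when PCLMUL support isn't compiled in. The `crc_func` typedef itself lives in crc_common.h, which is not shown in this diff; the following is a hypothetical caller, assuming that typedef matches the signature above, that the CLMUL path was built, and the usual convention of starting an incremental CRC32 from 0:

    #include <stddef.h>
    #include <stdint.h>

    /* Assumed shape of the dispatch hook; the real typedef is in src/crc_common.h. */
    typedef uint32_t (*crc_func)(const void* data, size_t length, uint32_t init);

    void crc_clmul_set_funcs(crc_func* _do_crc32_incremental);

    static crc_func do_crc32_incremental;

    /* Hypothetical use: CRC a buffer in two pieces, feeding the first result
     * back in as the starting value of the second call. */
    uint32_t crc32_in_two_parts(const uint8_t* buf, size_t split, size_t total) {
        crc_clmul_set_funcs(&do_crc32_incremental);
        uint32_t crc = do_crc32_incremental(buf, split, 0);
        return do_crc32_incremental(buf + split, total - split, crc);
    }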
package/src/decoder.cc
ADDED

@@ -0,0 +1,61 @@
+#include "common.h"
+
+#include "decoder_common.h"
+#include "decoder.h"
+
+extern "C" {
+    YencDecoderEnd (*_do_decode)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*) = &do_decode_scalar<false, false>;
+    YencDecoderEnd (*_do_decode_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*) = &do_decode_scalar<true, false>;
+    YencDecoderEnd (*_do_decode_end_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*) = &do_decode_end_scalar<true>;
+}
+
+void decoder_set_sse2_funcs();
+void decoder_set_ssse3_funcs();
+void decoder_set_avx_funcs();
+void decoder_set_avx2_funcs();
+void decoder_set_neon_funcs();
+
+
+#if defined(PLATFORM_X86) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
+# if defined(__AVX2__) && !defined(YENC_DISABLE_AVX256)
+# include "decoder_avx2_base.h"
+static inline void decoder_set_native_funcs() {
+    ALIGN_ALLOC(lookups, sizeof(*lookups), 16);
+    decoder_init_lut(lookups->eqFix, lookups->compact);
+    _do_decode = &do_decode_simd<false, false, sizeof(__m256i)*2, do_decode_avx2<false, false, ISA_NATIVE> >;
+    _do_decode_raw = &do_decode_simd<true, false, sizeof(__m256i)*2, do_decode_avx2<true, false, ISA_NATIVE> >;
+    _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m256i)*2, do_decode_avx2<true, true, ISA_NATIVE> >;
+}
+# else
+# include "decoder_sse_base.h"
+static inline void decoder_set_native_funcs() {
+    decoder_sse_init();
+    decoder_init_lut(lookups->eqFix, lookups->compact);
+    _do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_NATIVE> >;
+    _do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_NATIVE> >;
+    _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_NATIVE> >;
+}
+# endif
+#endif
+
+void decoder_init() {
+#ifdef PLATFORM_X86
+# if defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
+    decoder_set_native_funcs();
+# else
+    int use_isa = cpu_supports_isa();
+    if(use_isa >= ISA_LEVEL_AVX2)
+        decoder_set_avx2_funcs();
+    else if(use_isa >= ISA_LEVEL_AVX)
+        decoder_set_avx_funcs();
+    else if(use_isa >= ISA_LEVEL_SSSE3)
+        decoder_set_ssse3_funcs();
+    else
+        decoder_set_sse2_funcs();
+# endif
+#endif
+#ifdef PLATFORM_ARM
+    if(cpu_supports_neon())
+        decoder_set_neon_funcs();
+#endif
+}
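Note that the decoder entry points are plain function pointers that start out on the scalar templates, so decoding works even if `decoder_init()` is never called; `decoder_init()` then swaps in the best kernel, either fixed at build time via `YENC_BUILD_NATIVE` or chosen at runtime from `cpu_supports_isa()` / `cpu_supports_neon()`. A stripped-down sketch of that pointer-swap pattern (illustrative only, not the package's code):

    #include <cstdio>

    // Portable default and an optional fast path that produce the same result.
    static int work_scalar(int x) { return x * 2; }
    static int work_simd(int x)   { return x * 2; }

    // Like _do_decode: starts on the safe scalar implementation.
    static int (*do_work)(int) = &work_scalar;

    static bool cpu_has_fast_path() { return false; } // stand-in for cpu_supports_isa()

    static void work_init() {   // plays the role of decoder_init()
        if (cpu_has_fast_path())
            do_work = &work_simd;
    }

    int main() {
        work_init();
        std::printf("%d\n", do_work(21));
        return 0;
    }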
package/src/decoder.h
ADDED

@@ -0,0 +1,53 @@
+#ifndef __YENC_DECODER_H
+#define __YENC_DECODER_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+// the last state that the decoder was in (i.e. last few characters processed)
+// the state is needed for incremental decoders as its behavior is affected by what it processed last
+// acronyms: CR = carriage return (\r), LF = line feed (\n), EQ = equals char, DT = dot char (.)
+typedef enum {
+    YDEC_STATE_CRLF, // default
+    YDEC_STATE_EQ,
+    YDEC_STATE_CR,
+    YDEC_STATE_NONE,
+    YDEC_STATE_CRLFDT,
+    YDEC_STATE_CRLFDTCR,
+    YDEC_STATE_CRLFEQ // may actually be "\r\n.=" in raw decoder
+} YencDecoderState;
+
+// end result for incremental processing (whether the end of the yEnc data was reached)
+typedef enum {
+    YDEC_END_NONE, // end not reached
+    YDEC_END_CONTROL, // \r\n=y sequence found, src points to byte after 'y'
+    YDEC_END_ARTICLE // \r\n.\r\n sequence found, src points to byte after last '\n'
+} YencDecoderEnd;
+
+#include "hedley.h"
+
+extern YencDecoderEnd (*_do_decode)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*);
+extern YencDecoderEnd (*_do_decode_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*);
+extern YencDecoderEnd (*_do_decode_end_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*);
+
+static inline size_t do_decode(int isRaw, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, YencDecoderState* state) {
+    unsigned char* ds = dest;
+    (*(isRaw ? _do_decode_raw : _do_decode))(&src, &ds, len, state);
+    return ds - dest;
+}
+
+static inline YencDecoderEnd do_decode_end(const unsigned char*HEDLEY_RESTRICT* src, unsigned char*HEDLEY_RESTRICT* dest, size_t len, YencDecoderState* state) {
+    return _do_decode_end_raw(src, dest, len, state);
+}
+
+void decoder_init();
+
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif
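For callers, the intended flow with this header appears to be: call `decoder_init()` once, then feed data through `do_decode()` chunk by chunk, carrying the `YencDecoderState` across calls so that escape sequences split over a chunk boundary are still handled. A hypothetical caller (my sketch, not from the package; the buffer sizing relies on yEnc output never being longer than its input):

    #include <stddef.h>
    #include "decoder.h"

    // Decode a yEnc body that arrives in two chunks (raw/NNTP mode),
    // carrying the decoder state across the split.  'out' must hold at
    // least n1 + n2 bytes; yEnc decoding never expands the data.
    size_t decode_two_chunks(const unsigned char* c1, size_t n1,
                             const unsigned char* c2, size_t n2,
                             unsigned char* out) {
        decoder_init();                            // pick the SIMD kernels once
        YencDecoderState state = YDEC_STATE_CRLF;  // documented default state
        size_t written = do_decode(1, c1, out, n1, &state);
        written += do_decode(1, c2, out + written, n2, &state);
        return written;
    }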
package/src/decoder_avx.cc
ADDED

@@ -0,0 +1,18 @@
+#include "common.h"
+
+#if defined(__AVX__) && defined(__POPCNT__)
+#include "decoder_common.h"
+#include "decoder_sse_base.h"
+void decoder_set_avx_funcs() {
+    decoder_sse_init();
+    decoder_init_lut(lookups->eqFix, lookups->compact);
+    _do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_SSE4_POPCNT> >;
+    _do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_SSE4_POPCNT> >;
+    _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_SSE4_POPCNT> >;
+}
+#else
+void decoder_set_ssse3_funcs();
+void decoder_set_avx_funcs() {
+    decoder_set_ssse3_funcs();
+}
+#endif

package/src/decoder_avx2.cc
ADDED

@@ -0,0 +1,18 @@
+#include "common.h"
+
+#if defined(__AVX2__) && !defined(YENC_DISABLE_AVX256)
+#include "decoder_common.h"
+#include "decoder_avx2_base.h"
+void decoder_set_avx2_funcs() {
+    ALIGN_ALLOC(lookups, sizeof(*lookups), 16);
+    decoder_init_lut(lookups->eqFix, lookups->compact);
+    _do_decode = &do_decode_simd<false, false, sizeof(__m256i)*2, do_decode_avx2<false, false, ISA_LEVEL_AVX2> >;
+    _do_decode_raw = &do_decode_simd<true, false, sizeof(__m256i)*2, do_decode_avx2<true, false, ISA_LEVEL_AVX2> >;
+    _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m256i)*2, do_decode_avx2<true, true, ISA_LEVEL_AVX2> >;
+}
+#else
+void decoder_set_avx_funcs();
+void decoder_set_avx2_funcs() {
+    decoder_set_avx_funcs();
+}
+#endif