yencode 1.1.2 → 1.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.gyp +73 -1
- package/package.json +1 -1
- package/src/common.h +13 -16
- package/src/crc.cc +9 -3
- package/src/crc_arm.cc +35 -6
- package/src/crc_folding.cc +18 -53
- package/src/crc_folding_256.cc +230 -0
- package/src/decoder.cc +4 -1
- package/src/decoder_avx2_base.h +20 -9
- package/src/decoder_neon.cc +3 -3
- package/src/decoder_neon64.cc +9 -6
- package/src/decoder_sse_base.h +16 -9
- package/src/decoder_vbmi2.cc +30 -0
- package/src/encoder.cc +4 -1
- package/src/encoder_avx_base.h +2 -2
- package/src/encoder_neon.cc +1 -1
- package/src/encoder_sse_base.h +4 -5
- package/src/encoder_vbmi2.cc +23 -0
- package/src/platform.cc +23 -2
- package/test/testcrc.js +14 -0
package/binding.gyp
CHANGED
@@ -64,7 +64,7 @@
   "targets": [
     {
       "target_name": "yencode",
-      "dependencies": ["crcutil", "yencode_sse2", "yencode_ssse3", "yencode_clmul", "yencode_avx", "yencode_avx2", "yencode_neon", "yencode_armcrc"],
+      "dependencies": ["crcutil", "yencode_sse2", "yencode_ssse3", "yencode_clmul", "yencode_clmul256", "yencode_avx", "yencode_avx2", "yencode_vbmi2", "yencode_neon", "yencode_armcrc"],
       "sources": [
         "src/yencode.cc",
         "src/platform.cc",
@@ -206,6 +206,70 @@
       }]
     ]
   },
+  {
+    "target_name": "yencode_clmul256",
+    "type": "static_library",
+    "sources": [
+      "src/crc_folding_256.cc"
+    ],
+    "cflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
+    "cxxflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
+    "xcode_settings": {
+      "OTHER_CFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
+      "OTHER_CXXFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"]
+    },
+    "msvs_settings": {"VCCLCompilerTool": {"BufferSecurityCheck": "false"}},
+    "conditions": [
+      ['target_arch in "ia32 x64" and OS!="win"', {
+        "variables": {"supports_vpclmul%": "<!(<!(echo ${CC_target:-${CC:-cc}}) -MM -E src/crc_folding_256.cc -mavx2 -mvpclmulqdq 2>/dev/null || true)"},
+        "conditions": [
+          ['supports_vpclmul!=""', {
+            "cflags": ["-mavx2", "-mvpclmulqdq", "-mpclmul"],
+            "cxxflags": ["-mavx2", "-mvpclmulqdq", "-mpclmul"],
+            "xcode_settings": {
+              "OTHER_CFLAGS": ["-mavx2", "-mvpclmulqdq", "-mpclmul"],
+              "OTHER_CXXFLAGS": ["-mavx2", "-mvpclmulqdq", "-mpclmul"],
+            }
+          }]
+        ]
+      }],
+      ['target_arch in "ia32 x64" and OS=="win"', {
+        "msvs_settings": {"VCCLCompilerTool": {"EnableEnhancedInstructionSet": "3"}}
+      }]
+    ]
+  },
+  {
+    "target_name": "yencode_vbmi2",
+    "type": "static_library",
+    "sources": [
+      "src/decoder_vbmi2.cc", "src/encoder_vbmi2.cc"
+    ],
+    "cflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
+    "cxxflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
+    "xcode_settings": {
+      "OTHER_CFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
+      "OTHER_CXXFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"]
+    },
+    "msvs_settings": {"VCCLCompilerTool": {"BufferSecurityCheck": "false"}},
+    "conditions": [
+      ['target_arch in "ia32 x64" and OS!="win"', {
+        "variables": {"supports_vbmi2%": "<!(<!(echo ${CC_target:-${CC:-cc}}) -MM -E src/encoder_vbmi2.cc -mavx512vl -mavx512vbmi2 2>/dev/null || true)"},
+        "conditions": [
+          ['supports_vbmi2!=""', {
+            "cflags": ["-mavx512vbmi2", "-mavx512vl", "-mavx512bw", "-mpopcnt", "-mbmi", "-mbmi2", "-mlzcnt"],
+            "cxxflags": ["-mavx512vbmi2", "-mavx512vl", "-mavx512bw", "-mpopcnt", "-mbmi", "-mbmi2", "-mlzcnt"],
+            "xcode_settings": {
+              "OTHER_CFLAGS": ["-mavx512vbmi2", "-mavx512vl", "-mavx512bw", "-mpopcnt", "-mbmi", "-mbmi2", "-mlzcnt"],
+              "OTHER_CXXFLAGS": ["-mavx512vbmi2", "-mavx512vl", "-mavx512bw", "-mpopcnt", "-mbmi", "-mbmi2", "-mlzcnt"],
+            }
+          }]
+        ]
+      }],
+      ['target_arch in "ia32 x64" and OS=="win"', {
+        "msvs_settings": {"VCCLCompilerTool": {"AdditionalOptions": ["/arch:AVX512"], "EnableEnhancedInstructionSet": "0"}}
+      }]
+    ]
+  },
   {
     "target_name": "yencode_neon",
     "type": "static_library",
@@ -260,6 +324,14 @@
           "OTHER_CFLAGS": ["-march=armv8-a+crc"],
           "OTHER_CXXFLAGS": ["-march=armv8-a+crc"],
         }
+      }],
+      ['OS!="win" and target_arch=="arm"', {
+        "cflags": ["-mfpu=fp-armv8"],
+        "cxxflags": ["-mfpu=fp-armv8"],
+        "xcode_settings": {
+          "OTHER_CFLAGS": ["-mfpu=fp-armv8"],
+          "OTHER_CXXFLAGS": ["-mfpu=fp-armv8"]
+        }
       }]
     ]
   },
package/package.json
CHANGED
package/src/common.h
CHANGED
@@ -35,18 +35,22 @@
 #endif
 
 
+#include <stdlib.h>
 #if defined(_MSC_VER) || defined(__MINGW32__) || defined(__MINGW64__)
-
+// MSVC doesn't support C11 aligned_alloc: https://stackoverflow.com/a/62963007
 #define ALIGN_ALLOC(buf, len, align) *(void**)&(buf) = _aligned_malloc((len), align)
 #define ALIGN_FREE _aligned_free
-#elif defined(
-//
+#elif defined(_ISOC11_SOURCE)
+// C11 method
 // len needs to be a multiple of alignment, although it sometimes works if it isn't...
-#include <cstdlib>
 #define ALIGN_ALLOC(buf, len, align) *(void**)&(buf) = aligned_alloc(align, ((len) + (align)-1) & ~((align)-1))
 #define ALIGN_FREE free
+#elif defined(__cplusplus) && __cplusplus >= 201700
+// C++17 method
+#include <cstdlib>
+#define ALIGN_ALLOC(buf, len, align) *(void**)&(buf) = std::aligned_alloc(align, ((len) + (align)-1) & ~((align)-1))
+#define ALIGN_FREE free
 #else
-#include <stdlib.h>
 #define ALIGN_ALLOC(buf, len, align) if(posix_memalign((void**)&(buf), align, (len))) (buf) = NULL
 #define ALIGN_FREE free
 #endif
@@ -217,9 +221,9 @@ enum YEncDecIsaLevel {
 	ISA_LEVEL_SSE41 = 0x300,
 	ISA_LEVEL_SSE4_POPCNT = 0x301,
 	ISA_LEVEL_AVX = 0x381, // same as above, just used as a differentiator for `cpu_supports_isa`
-	ISA_LEVEL_AVX2 =
-	ISA_LEVEL_AVX3 =
-	ISA_LEVEL_VBMI2 =
+	ISA_LEVEL_AVX2 = 0x403, // also includes BMI1/2 and LZCNT
+	ISA_LEVEL_AVX3 = 0x503, // SKX variant; AVX512VL + AVX512BW
+	ISA_LEVEL_VBMI2 = 0x603 // ICL
 };
 #ifdef _MSC_VER
 // native tuning not supported in MSVC
@@ -249,13 +253,6 @@ enum YEncDecIsaLevel {
 # endif
 #endif
 
-#ifdef _MSC_VER
-# define _cpuid1(ar) __cpuid(ar, 1)
-#else
-# include <cpuid.h>
-# define _cpuid1(ar) __cpuid(1, ar[0], ar[1], ar[2], ar[3])
-#endif
-
 int cpu_supports_isa();
 #endif // PLATFORM_X86
 
@@ -270,7 +267,7 @@ int cpu_supports_isa();
 
 
 // GCC 8/9/10(dev) fails to optimize cases where KNOT should be used, so use intrinsic explicitly; Clang 6+ has no issue, but Clang 6/7 doesn't have the intrinsic; MSVC 2019 also fails and lacks the intrinsic
-#if defined(__GNUC__) && __GNUC__ >= 7
+#if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
 # define KNOT16 _knot_mask16
 # define KNOT32 _knot_mask32
 #else
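Note: the ALIGN_ALLOC/ALIGN_FREE macros shown above now pick between _aligned_malloc (MSVC/MinGW), C11 aligned_alloc, C++17 std::aligned_alloc and posix_memalign. A minimal usage sketch of the intended calling pattern (the 64-byte size and 32-byte alignment below are illustrative values, not taken from the package):

#include "common.h"
#include <cstring>

static unsigned char* make_scratch() {
	unsigned char* buf;
	ALIGN_ALLOC(buf, 64, 32); // 64 bytes, 32-byte aligned (enough for a __m256i load)
	if(buf) memset(buf, 0, 64);
	return buf; // caller later releases it with ALIGN_FREE(buf)
}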
package/src/crc.cc
CHANGED
@@ -25,8 +25,13 @@ uint32_t do_crc32_zeros(uint32_t crc1, size_t len) {
 }
 
 void crc_clmul_set_funcs(crc_func*);
+void crc_clmul256_set_funcs(crc_func*);
 void crc_arm_set_funcs(crc_func*);
 
+#ifdef PLATFORM_X86
+int cpu_supports_crc_isa();
+#endif
+
 #if defined(PLATFORM_ARM) && defined(_WIN32)
 # define WIN32_LEAN_AND_MEAN
 # include <Windows.h>
@@ -58,9 +63,10 @@ void crc_init() {
 	// instance never deleted... oh well...
 	
 #ifdef PLATFORM_X86
-	int
-
-
+	int support = cpu_supports_crc_isa();
+	if(support == 2)
+		crc_clmul256_set_funcs(&_do_crc32_incremental);
+	else if(support == 1)
 		crc_clmul_set_funcs(&_do_crc32_incremental);
 #endif
 #ifdef PLATFORM_ARM
CHANGED
|
@@ -5,6 +5,18 @@
|
|
|
5
5
|
HEDLEY_WARNING("CRC32 acceleration is not been enabled under ARM clang-cl by default; add `-march=armv8-a+crc` to additional compiler arguments to enable");
|
|
6
6
|
#endif
|
|
7
7
|
|
|
8
|
+
// disable CRC on GCC versions with broken arm_acle.h
|
|
9
|
+
#if defined(__ARM_FEATURE_CRC32) && defined(HEDLEY_GCC_VERSION)
|
|
10
|
+
# if !defined(__aarch64__) && HEDLEY_GCC_VERSION_CHECK(7,0,0) && !HEDLEY_GCC_VERSION_CHECK(8,1,1)
|
|
11
|
+
# undef __ARM_FEATURE_CRC32
|
|
12
|
+
HEDLEY_WARNING("CRC32 acceleration has been disabled due to broken arm_acle.h shipped in GCC 7.0 - 8.1 [https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81497]. If you need this feature, please use a different compiler or version of GCC");
|
|
13
|
+
# endif
|
|
14
|
+
# if defined(__aarch64__) && HEDLEY_GCC_VERSION_CHECK(9,4,0) && !HEDLEY_GCC_VERSION_CHECK(9,5,0)
|
|
15
|
+
# undef __ARM_FEATURE_CRC32
|
|
16
|
+
HEDLEY_WARNING("CRC32 acceleration has been disabled due to broken arm_acle.h shipped in GCC 9.4 [https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100985]. If you need this feature, please use a different compiler or version of GCC");
|
|
17
|
+
# endif
|
|
18
|
+
#endif
|
|
19
|
+
|
|
8
20
|
#if defined(__ARM_FEATURE_CRC32) || (defined(_M_ARM64) && !defined(__clang__)) // MSVC doesn't support CRC for ARM32
|
|
9
21
|
|
|
10
22
|
/* ARMv8 accelerated CRC */
|
|
@@ -14,14 +26,30 @@ HEDLEY_WARNING("CRC32 acceleration is not been enabled under ARM clang-cl by def
|
|
|
14
26
|
#include <arm_acle.h>
|
|
15
27
|
#endif
|
|
16
28
|
|
|
29
|
+
|
|
30
|
+
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
|
31
|
+
# ifdef __GNUC__
|
|
32
|
+
# define _LE16 __builtin_bswap16
|
|
33
|
+
# define _LE32 __builtin_bswap32
|
|
34
|
+
# define _LE64 __builtin_bswap64
|
|
35
|
+
# else
|
|
36
|
+
// currently not supported
|
|
37
|
+
# error No endian swap intrinsic defined
|
|
38
|
+
# endif
|
|
39
|
+
#else
|
|
40
|
+
# define _LE16(x) (x)
|
|
41
|
+
# define _LE32(x) (x)
|
|
42
|
+
# define _LE64(x) (x)
|
|
43
|
+
#endif
|
|
44
|
+
|
|
17
45
|
#ifdef __aarch64__
|
|
18
46
|
# define WORD_T uint64_t
|
|
19
47
|
# define WORDSIZE_LOG 3 // sizeof(WORD_T) == 1<<WORDSIZE_LOG
|
|
20
|
-
# define CRC_WORD __crc32d
|
|
48
|
+
# define CRC_WORD(crc, data) __crc32d(crc, _LE64(data))
|
|
21
49
|
#else
|
|
22
50
|
# define WORD_T uint32_t
|
|
23
51
|
# define WORDSIZE_LOG 2 // sizeof(WORD_T) == 1<<WORDSIZE_LOG
|
|
24
|
-
# define CRC_WORD __crc32w
|
|
52
|
+
# define CRC_WORD(crc, data) __crc32w(crc, _LE32(data))
|
|
25
53
|
#endif
|
|
26
54
|
|
|
27
55
|
|
|
@@ -64,6 +92,7 @@ static const uint32_t crc_power[] = { // pre-computed 2^n, with first 3 entries
|
|
|
64
92
|
#endif
|
|
65
93
|
|
|
66
94
|
|
|
95
|
+
|
|
67
96
|
// inspired/stolen off https://github.com/jocover/crc32_armv8/blob/master/crc32_armv8.c
|
|
68
97
|
static uint32_t arm_crc_calc(uint32_t crc, const unsigned char *src, long len) {
|
|
69
98
|
|
|
@@ -75,13 +104,13 @@ static uint32_t arm_crc_calc(uint32_t crc, const unsigned char *src, long len) {
|
|
|
75
104
|
len--;
|
|
76
105
|
}
|
|
77
106
|
if ((uintptr_t)src & sizeof(uint16_t)) {
|
|
78
|
-
crc = __crc32h(crc, *((uint16_t *)src));
|
|
107
|
+
crc = __crc32h(crc, _LE16(*((uint16_t *)src)));
|
|
79
108
|
src += sizeof(uint16_t);
|
|
80
109
|
len -= sizeof(uint16_t);
|
|
81
110
|
}
|
|
82
111
|
#ifdef __aarch64__
|
|
83
112
|
if ((uintptr_t)src & sizeof(uint32_t)) {
|
|
84
|
-
crc = __crc32w(crc, *((uint32_t *)src));
|
|
113
|
+
crc = __crc32w(crc, _LE32(*((uint32_t *)src)));
|
|
85
114
|
src += sizeof(uint32_t);
|
|
86
115
|
len -= sizeof(uint32_t);
|
|
87
116
|
}
|
|
@@ -147,12 +176,12 @@ static uint32_t arm_crc_calc(uint32_t crc, const unsigned char *src, long len) {
|
|
|
147
176
|
|
|
148
177
|
#ifdef __aarch64__
|
|
149
178
|
if (len & sizeof(uint32_t)) {
|
|
150
|
-
crc = __crc32w(crc, *((uint32_t *)src));
|
|
179
|
+
crc = __crc32w(crc, _LE32(*((uint32_t *)src)));
|
|
151
180
|
src += sizeof(uint32_t);
|
|
152
181
|
}
|
|
153
182
|
#endif
|
|
154
183
|
if (len & sizeof(uint16_t)) {
|
|
155
|
-
crc = __crc32h(crc, *((uint16_t *)src));
|
|
184
|
+
crc = __crc32h(crc, _LE16(*((uint16_t *)src)));
|
|
156
185
|
src += sizeof(uint16_t);
|
|
157
186
|
}
|
|
158
187
|
if (len & sizeof(uint8_t))
|
package/src/crc_folding.cc
CHANGED
|
@@ -19,7 +19,7 @@
|
|
|
19
19
|
|
|
20
20
|
#include "crc_common.h"
|
|
21
21
|
|
|
22
|
-
#if (defined(__PCLMUL__) && defined(__SSSE3__) && defined(__SSE4_1__)) || (defined(_MSC_VER) && _MSC_VER >= 1600 && defined(PLATFORM_X86))
|
|
22
|
+
#if (defined(__PCLMUL__) && defined(__SSSE3__) && defined(__SSE4_1__)) || (defined(_MSC_VER) && _MSC_VER >= 1600 && defined(PLATFORM_X86) && !defined(__clang__))
|
|
23
23
|
#include <inttypes.h>
|
|
24
24
|
#include <immintrin.h>
|
|
25
25
|
#include <wmmintrin.h>
|
|
@@ -135,33 +135,6 @@ ALIGN_TO(16, static const unsigned crc_mask[4]) = {
|
|
|
135
135
|
0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
|
|
136
136
|
};
|
|
137
137
|
|
|
138
|
-
static __m128i reverse_bits_epi8(__m128i src) {
|
|
139
|
-
#if defined(__GFNI__) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
|
|
140
|
-
return _mm_gf2p8affine_epi64_epi8(src, _mm_set_epi32(
|
|
141
|
-
0x80402010, 0x08040201,
|
|
142
|
-
0x80402010, 0x08040201
|
|
143
|
-
), 0);
|
|
144
|
-
#else
|
|
145
|
-
__m128i xmm_t0 = _mm_and_si128(src, _mm_set1_epi8(0x0f));
|
|
146
|
-
__m128i xmm_t1 = _mm_and_si128(_mm_srli_epi16(src, 4), _mm_set1_epi8(0x0f));
|
|
147
|
-
xmm_t0 = _mm_shuffle_epi8(_mm_set_epi8(
|
|
148
|
-
-16, 112, -80, 48, -48, 80, -112, 16, -32, 96, -96, 32, -64, 64, -128, 0
|
|
149
|
-
//0xf0, 0x70, 0xb0, 0x30, 0xd0, 0x50, 0x90, 0x10, 0xe0, 0x60, 0xa0, 0x20, 0xc0, 0x40, 0x80, 0
|
|
150
|
-
), xmm_t0);
|
|
151
|
-
xmm_t1 = _mm_shuffle_epi8(_mm_set_epi8(
|
|
152
|
-
15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0
|
|
153
|
-
), xmm_t1);
|
|
154
|
-
return _mm_or_si128(xmm_t0, xmm_t1);
|
|
155
|
-
#endif
|
|
156
|
-
}
|
|
157
|
-
|
|
158
|
-
#ifdef _MSC_VER
|
|
159
|
-
// because MSVC doesn't use BSWAP unless you specifically tell it to...
|
|
160
|
-
# include <stdlib.h>
|
|
161
|
-
# define BSWAP32 _byteswap_ulong
|
|
162
|
-
#else
|
|
163
|
-
# define BSWAP32(n) ((((n)&0xff)<<24) | (((n)&0xff00)<<8) | (((n)&0xff0000)>>8) | (((n)&0xff000000)>>24))
|
|
164
|
-
#endif
|
|
165
138
|
|
|
166
139
|
static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
|
|
167
140
|
unsigned long algn_diff;
|
|
@@ -170,23 +143,17 @@ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
|
|
|
170
143
|
// TODO: consider calculating this via a LUT instead (probably faster)
|
|
171
144
|
// info from https://www.reddit.com/r/ReverseEngineering/comments/2zwhl3/mystery_constant_0x9db42487_in_intels_crc32ieee/
|
|
172
145
|
// firstly, calculate: xmm_crc0 = (intial * 0x487b9c8a) mod 0x104c11db7, where 0x487b9c8a = inverse(1<<512) mod 0x104c11db7
|
|
146
|
+
xmm_t0 = _mm_cvtsi32_si128(~initial);
|
|
173
147
|
|
|
174
|
-
|
|
175
|
-
uint32_t init_t = BSWAP32(initial);
|
|
176
|
-
xmm_t0 = reverse_bits_epi8(_mm_cvtsi32_si128(~init_t));
|
|
177
|
-
|
|
178
|
-
xmm_t0 = _mm_clmulepi64_si128(xmm_t0, _mm_cvtsi32_si128(0x487b9c8a), 0);
|
|
179
|
-
xmm_t1 = _mm_and_si128(xmm_t0, _mm_set_epi32(-1,-1,-1,0)); // shifted up by 32bits to avoid shifts by using clmul's capability to select top 64bits instead
|
|
148
|
+
xmm_t0 = _mm_clmulepi64_si128(xmm_t0, _mm_set_epi32(0, 0, 0xa273bc24, 0), 0); // reverse(0x487b9c8a)<<1 == 0xa273bc24
|
|
180
149
|
xmm_t2 = _mm_set_epi32( // polynomial reduction factors
|
|
181
|
-
|
|
182
|
-
|
|
150
|
+
1, 0xdb710640, // G* = 0x04c11db7
|
|
151
|
+
0, 0xf7011641 // Q+ = 0x04d101df (+1 to save an additional xor operation)
|
|
183
152
|
);
|
|
184
|
-
xmm_t1 = _mm_clmulepi64_si128(
|
|
185
|
-
xmm_t1 = _mm_clmulepi64_si128(xmm_t1, xmm_t2,
|
|
153
|
+
xmm_t1 = _mm_clmulepi64_si128(xmm_t0, xmm_t2, 0);
|
|
154
|
+
xmm_t1 = _mm_clmulepi64_si128(xmm_t1, xmm_t2, 0x10);
|
|
186
155
|
|
|
187
|
-
__m128i xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_t1);
|
|
188
|
-
// reverse bits
|
|
189
|
-
xmm_crc0 = _mm_shuffle_epi8(reverse_bits_epi8(xmm_crc0), _mm_set_epi32(-1,-1,-1,0x00010203));
|
|
156
|
+
__m128i xmm_crc0 = _mm_srli_si128(_mm_xor_si128(xmm_t0, xmm_t1), 8);
|
|
190
157
|
|
|
191
158
|
__m128i xmm_crc1 = _mm_setzero_si128();
|
|
192
159
|
__m128i xmm_crc2 = _mm_setzero_si128();
|
|
@@ -196,7 +163,8 @@ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
|
|
|
196
163
|
if (len < 16) {
|
|
197
164
|
if (len == 0)
|
|
198
165
|
return initial;
|
|
199
|
-
xmm_crc_part =
|
|
166
|
+
xmm_crc_part = _mm_setzero_si128();
|
|
167
|
+
memcpy(&xmm_crc_part, src, len);
|
|
200
168
|
goto partial;
|
|
201
169
|
}
|
|
202
170
|
|
|
@@ -211,7 +179,7 @@ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
|
|
|
211
179
|
&xmm_crc_part);
|
|
212
180
|
}
|
|
213
181
|
|
|
214
|
-
while (
|
|
182
|
+
while (len >= 64) {
|
|
215
183
|
xmm_t0 = _mm_load_si128((__m128i *)src);
|
|
216
184
|
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
|
|
217
185
|
xmm_t2 = _mm_load_si128((__m128i *)src + 2);
|
|
@@ -235,13 +203,11 @@ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
|
|
|
235
203
|
#endif
|
|
236
204
|
|
|
237
205
|
src += 64;
|
|
206
|
+
len -= 64;
|
|
238
207
|
}
|
|
239
208
|
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
*/
|
|
243
|
-
if (len + 16 >= 0) {
|
|
244
|
-
len += 16;
|
|
209
|
+
if (len >= 48) {
|
|
210
|
+
len -= 48;
|
|
245
211
|
|
|
246
212
|
xmm_t0 = _mm_load_si128((__m128i *)src);
|
|
247
213
|
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
|
|
@@ -266,8 +232,8 @@ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
|
|
|
266
232
|
goto done;
|
|
267
233
|
|
|
268
234
|
xmm_crc_part = _mm_load_si128((__m128i *)src + 3);
|
|
269
|
-
} else if (len
|
|
270
|
-
len
|
|
235
|
+
} else if (len >= 32) {
|
|
236
|
+
len -= 32;
|
|
271
237
|
|
|
272
238
|
xmm_t0 = _mm_load_si128((__m128i *)src);
|
|
273
239
|
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
|
|
@@ -290,8 +256,8 @@ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
|
|
|
290
256
|
goto done;
|
|
291
257
|
|
|
292
258
|
xmm_crc_part = _mm_load_si128((__m128i *)src + 2);
|
|
293
|
-
} else if (len
|
|
294
|
-
len
|
|
259
|
+
} else if (len >= 16) {
|
|
260
|
+
len -= 16;
|
|
295
261
|
|
|
296
262
|
xmm_t0 = _mm_load_si128((__m128i *)src);
|
|
297
263
|
|
|
@@ -310,7 +276,6 @@ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
|
|
|
310
276
|
|
|
311
277
|
xmm_crc_part = _mm_load_si128((__m128i *)src + 1);
|
|
312
278
|
} else {
|
|
313
|
-
len += 64;
|
|
314
279
|
if (len == 0)
|
|
315
280
|
goto done;
|
|
316
281
|
xmm_crc_part = _mm_load_si128((__m128i *)src);
|
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
// 256-bit version of crc_folding
|
|
2
|
+
|
|
3
|
+
#include "crc_common.h"
|
|
4
|
+
|
|
5
|
+
#if !defined(YENC_DISABLE_AVX256) && ((defined(__VPCLMULQDQ__) && defined(__AVX2__) && defined(__PCLMUL__)) || (defined(_MSC_VER) && _MSC_VER >= 1920 && defined(PLATFORM_X86) && !defined(__clang__)))
|
|
6
|
+
#include <inttypes.h>
|
|
7
|
+
#include <immintrin.h>
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
#if defined(__AVX512VL__) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
|
|
11
|
+
# define ENABLE_AVX512 1
|
|
12
|
+
#endif
|
|
13
|
+
|
|
14
|
+
static __m256i do_one_fold(__m256i src, __m256i data) {
|
|
15
|
+
const __m256i fold4 = _mm256_set_epi32(
|
|
16
|
+
0x00000001, 0x54442bd4,
|
|
17
|
+
0x00000001, 0xc6e41596,
|
|
18
|
+
0x00000001, 0x54442bd4,
|
|
19
|
+
0x00000001, 0xc6e41596
|
|
20
|
+
);
|
|
21
|
+
#ifdef ENABLE_AVX512
|
|
22
|
+
return _mm256_ternarylogic_epi32(
|
|
23
|
+
_mm256_clmulepi64_epi128(src, fold4, 0x01),
|
|
24
|
+
_mm256_clmulepi64_epi128(src, fold4, 0x10),
|
|
25
|
+
data,
|
|
26
|
+
0x96
|
|
27
|
+
);
|
|
28
|
+
#else
|
|
29
|
+
return _mm256_xor_si256(data, _mm256_xor_si256(
|
|
30
|
+
_mm256_clmulepi64_epi128(src, fold4, 0x01),
|
|
31
|
+
_mm256_clmulepi64_epi128(src, fold4, 0x10)
|
|
32
|
+
));
|
|
33
|
+
#endif
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
ALIGN_TO(32, static const uint8_t pshufb_rot_table[]) = {
|
|
37
|
+
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
|
|
38
|
+
16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
|
|
39
|
+
};
|
|
40
|
+
// _mm256_castsi128_si256, but upper is defined to be 0
|
|
41
|
+
#if (defined(__clang__) && __clang_major__ >= 5 && (!defined(__APPLE__) || __clang_major__ >= 7)) || (defined(__GNUC__) && __GNUC__ >= 10)
|
|
42
|
+
// intrinsic unsupported in GCC 9 and MSVC < 2017
|
|
43
|
+
# define zext128_256 _mm256_zextsi128_si256
|
|
44
|
+
#else
|
|
45
|
+
// technically a cast is incorrect, due to upper 128 bits being undefined, but should usually work fine
|
|
46
|
+
// alternative may be `_mm256_set_m128i(_mm_setzero_si128(), v)` but unsupported on GCC < 7, and most compilers generate a VINSERTF128 instruction for it
|
|
47
|
+
# ifdef __OPTIMIZE__
|
|
48
|
+
# define zext128_256 _mm256_castsi128_si256
|
|
49
|
+
# else
|
|
50
|
+
# define zext128_256(x) _mm256_inserti128_si256(_mm256_setzero_si256(), x, 0)
|
|
51
|
+
# endif
|
|
52
|
+
#endif
|
|
53
|
+
|
|
54
|
+
#ifdef ENABLE_AVX512
|
|
55
|
+
# define MM256_BLENDV(a, b, m) _mm256_ternarylogic_epi32(a, b, m, 0xd8)
|
|
56
|
+
# define MM_2XOR(a, b, c) _mm_ternarylogic_epi32(a, b, c, 0x96)
|
|
57
|
+
#else
|
|
58
|
+
# define MM256_BLENDV _mm256_blendv_epi8
|
|
59
|
+
# define MM_2XOR(a, b, c) _mm_xor_si128(_mm_xor_si128(a, b), c)
|
|
60
|
+
#endif
|
|
61
|
+
|
|
62
|
+
static void partial_fold(const size_t len, __m256i *crc0, __m256i *crc1, __m256i crc_part) {
|
|
63
|
+
__m256i shuf = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(pshufb_rot_table + (len&15))));
|
|
64
|
+
__m256i mask = _mm256_cmpgt_epi8(shuf, _mm256_set1_epi8(15));
|
|
65
|
+
|
|
66
|
+
*crc0 = _mm256_shuffle_epi8(*crc0, shuf);
|
|
67
|
+
*crc1 = _mm256_shuffle_epi8(*crc1, shuf);
|
|
68
|
+
crc_part = _mm256_shuffle_epi8(crc_part, shuf);
|
|
69
|
+
|
|
70
|
+
__m256i crc_out = _mm256_permute2x128_si256(*crc0, *crc0, 0x08); // move bottom->top
|
|
71
|
+
__m256i crc01, crc1p;
|
|
72
|
+
if(len >= 16) {
|
|
73
|
+
crc_out = MM256_BLENDV(crc_out, *crc0, mask);
|
|
74
|
+
crc01 = *crc1;
|
|
75
|
+
crc1p = crc_part;
|
|
76
|
+
*crc0 = _mm256_permute2x128_si256(*crc0, *crc1, 0x21);
|
|
77
|
+
*crc1 = _mm256_permute2x128_si256(*crc1, crc_part, 0x21);
|
|
78
|
+
crc_part = zext128_256(_mm256_extracti128_si256(crc_part, 1));
|
|
79
|
+
} else {
|
|
80
|
+
crc_out = _mm256_and_si256(crc_out, mask);
|
|
81
|
+
crc01 = _mm256_permute2x128_si256(*crc0, *crc1, 0x21);
|
|
82
|
+
crc1p = _mm256_permute2x128_si256(*crc1, crc_part, 0x21);
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
*crc0 = MM256_BLENDV(*crc0, crc01, mask);
|
|
86
|
+
*crc1 = MM256_BLENDV(*crc1, crc1p, mask);
|
|
87
|
+
|
|
88
|
+
*crc1 = do_one_fold(crc_out, *crc1);
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
ALIGN_TO(16, static const unsigned crc_k[]) = {
|
|
93
|
+
0xccaa009e, 0x00000000, /* rk1 */
|
|
94
|
+
0x751997d0, 0x00000001, /* rk2 */
|
|
95
|
+
0xccaa009e, 0x00000000, /* rk5 */
|
|
96
|
+
0x63cd6124, 0x00000001, /* rk6 */
|
|
97
|
+
0xf7011641, 0x00000000, /* rk7 */
|
|
98
|
+
0xdb710640, 0x00000001 /* rk8 */
|
|
99
|
+
};
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
|
|
103
|
+
// info from https://www.reddit.com/r/ReverseEngineering/comments/2zwhl3/mystery_constant_0x9db42487_in_intels_crc32ieee/
|
|
104
|
+
// firstly, calculate: xmm_crc0 = (intial * 0x487b9c8a) mod 0x104c11db7, where 0x487b9c8a = inverse(1<<512) mod 0x104c11db7
|
|
105
|
+
__m128i xmm_t0 = _mm_cvtsi32_si128(~initial);
|
|
106
|
+
|
|
107
|
+
xmm_t0 = _mm_clmulepi64_si128(xmm_t0, _mm_set_epi32(0, 0, 0xa273bc24, 0), 0); // reverse(0x487b9c8a)<<1 == 0xa273bc24
|
|
108
|
+
__m128i reduction = _mm_set_epi32( // polynomial reduction factors
|
|
109
|
+
1, 0xdb710640, // G* = 0x04c11db7
|
|
110
|
+
0, 0xf7011641 // Q+ = 0x04d101df (+1 to save an additional xor operation)
|
|
111
|
+
);
|
|
112
|
+
__m128i xmm_t1 = _mm_clmulepi64_si128(xmm_t0, reduction, 0);
|
|
113
|
+
xmm_t1 = _mm_clmulepi64_si128(xmm_t1, reduction, 0x10);
|
|
114
|
+
|
|
115
|
+
xmm_t0 = _mm_srli_si128(_mm_xor_si128(xmm_t0, xmm_t1), 8);
|
|
116
|
+
__m256i crc0 = zext128_256(xmm_t0);
|
|
117
|
+
__m256i crc1 = _mm256_setzero_si256();
|
|
118
|
+
|
|
119
|
+
if (len < 32) {
|
|
120
|
+
if (len == 0)
|
|
121
|
+
return initial;
|
|
122
|
+
__m256i crc_part = _mm256_setzero_si256();
|
|
123
|
+
memcpy(&crc_part, src, len);
|
|
124
|
+
partial_fold(len, &crc0, &crc1, crc_part);
|
|
125
|
+
} else {
|
|
126
|
+
uintptr_t algn_diff = (0 - (uintptr_t)src) & 0x1F;
|
|
127
|
+
if (algn_diff) {
|
|
128
|
+
partial_fold(algn_diff, &crc0, &crc1, _mm256_loadu_si256((__m256i *)src));
|
|
129
|
+
src += algn_diff;
|
|
130
|
+
len -= algn_diff;
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
while (len >= 64) {
|
|
134
|
+
crc0 = do_one_fold(crc0, _mm256_load_si256((__m256i*)src));
|
|
135
|
+
crc1 = do_one_fold(crc1, _mm256_load_si256((__m256i*)src + 1));
|
|
136
|
+
src += 64;
|
|
137
|
+
len -= 64;
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
if (len >= 32) {
|
|
141
|
+
__m256i old = crc1;
|
|
142
|
+
crc1 = do_one_fold(crc0, _mm256_load_si256((__m256i*)src));
|
|
143
|
+
crc0 = old;
|
|
144
|
+
|
|
145
|
+
len -= 32;
|
|
146
|
+
src += 32;
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
if(len != 0) {
|
|
150
|
+
partial_fold(len, &crc0, &crc1, _mm256_load_si256((__m256i *)src));
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
const __m128i xmm_mask = _mm_set_epi32(-1,-1,-1,0);
|
|
155
|
+
__m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;
|
|
156
|
+
|
|
157
|
+
__m128i xmm_crc0 = _mm256_castsi256_si128(crc0);
|
|
158
|
+
__m128i xmm_crc1 = _mm256_extracti128_si256(crc0, 1);
|
|
159
|
+
__m128i xmm_crc2 = _mm256_castsi256_si128(crc1);
|
|
160
|
+
__m128i xmm_crc3 = _mm256_extracti128_si256(crc1, 1);
|
|
161
|
+
|
|
162
|
+
/*
|
|
163
|
+
* k1
|
|
164
|
+
*/
|
|
165
|
+
crc_fold = _mm_load_si128((__m128i *)crc_k);
|
|
166
|
+
|
|
167
|
+
x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
|
|
168
|
+
xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
|
|
169
|
+
xmm_crc1 = MM_2XOR(xmm_crc1, x_tmp0, xmm_crc0);
|
|
170
|
+
|
|
171
|
+
x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10);
|
|
172
|
+
xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01);
|
|
173
|
+
xmm_crc2 = MM_2XOR(xmm_crc2, x_tmp1, xmm_crc1);
|
|
174
|
+
|
|
175
|
+
x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10);
|
|
176
|
+
xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01);
|
|
177
|
+
xmm_crc3 = MM_2XOR(xmm_crc3, x_tmp2, xmm_crc2);
|
|
178
|
+
|
|
179
|
+
/*
|
|
180
|
+
* k5
|
|
181
|
+
*/
|
|
182
|
+
crc_fold = _mm_load_si128((__m128i *)crc_k + 1);
|
|
183
|
+
|
|
184
|
+
xmm_crc0 = xmm_crc3;
|
|
185
|
+
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
|
|
186
|
+
xmm_crc0 = _mm_srli_si128(xmm_crc0, 8);
|
|
187
|
+
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
|
|
188
|
+
|
|
189
|
+
xmm_crc0 = xmm_crc3;
|
|
190
|
+
xmm_crc3 = _mm_slli_si128(xmm_crc3, 4);
|
|
191
|
+
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
|
|
192
|
+
#ifdef ENABLE_AVX512
|
|
193
|
+
//xmm_crc3 = _mm_maskz_xor_epi32(14, xmm_crc3, xmm_crc0);
|
|
194
|
+
xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, xmm_crc0, xmm_mask, 0x28);
|
|
195
|
+
#else
|
|
196
|
+
xmm_crc0 = _mm_and_si128(xmm_crc0, xmm_mask);
|
|
197
|
+
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
|
|
198
|
+
#endif
|
|
199
|
+
|
|
200
|
+
/*
|
|
201
|
+
* k7
|
|
202
|
+
*/
|
|
203
|
+
xmm_crc1 = xmm_crc3;
|
|
204
|
+
crc_fold = _mm_load_si128((__m128i *)crc_k + 2);
|
|
205
|
+
|
|
206
|
+
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
|
|
207
|
+
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
|
|
208
|
+
#ifdef ENABLE_AVX512
|
|
209
|
+
xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, xmm_crc1, xmm_crc1, 0xC3); // NOT(xmm_crc3 ^ xmm_crc1)
|
|
210
|
+
#else
|
|
211
|
+
xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_mask);
|
|
212
|
+
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);
|
|
213
|
+
#endif
|
|
214
|
+
return _mm_extract_epi32(xmm_crc3, 2);
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
static uint32_t do_crc32_incremental_clmul(const void* data, size_t length, uint32_t init) {
|
|
218
|
+
return crc_fold((const unsigned char*)data, (long)length, init);
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
void crc_clmul256_set_funcs(crc_func* _do_crc32_incremental) {
|
|
222
|
+
*_do_crc32_incremental = &do_crc32_incremental_clmul;
|
|
223
|
+
}
|
|
224
|
+
#else
|
|
225
|
+
void crc_clmul_set_funcs(crc_func* _do_crc32_incremental);
|
|
226
|
+
void crc_clmul256_set_funcs(crc_func* _do_crc32_incremental) {
|
|
227
|
+
crc_clmul_set_funcs(_do_crc32_incremental);
|
|
228
|
+
}
|
|
229
|
+
#endif
|
|
230
|
+
|
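For reference (this is not code from the package): both crc_folding.cc and crc_folding_256.cc compute the standard reflected CRC-32 (generator 0x04C11DB7, i.e. 0xEDB88320 in bit-reversed form) via carry-less-multiply folding; the ~initial at the start and the final complement mirror the usual zlib-style incremental convention. A plain bitwise implementation of the same function, useful as a cross-check against the folded kernels, looks like this:

#include <stddef.h>
#include <stdint.h>

static uint32_t crc32_reference(const unsigned char* data, size_t len, uint32_t crc) {
	crc = ~crc; // incremental convention: pass 0 (or a previous result) in, get the running CRC out
	for(size_t i = 0; i < len; i++) {
		crc ^= data[i];
		for(int b = 0; b < 8; b++)
			crc = (crc >> 1) ^ (crc & 1 ? 0xEDB88320u : 0);
	}
	return ~crc;
}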
package/src/decoder.cc
CHANGED
@@ -13,6 +13,7 @@ void decoder_set_sse2_funcs();
 void decoder_set_ssse3_funcs();
 void decoder_set_avx_funcs();
 void decoder_set_avx2_funcs();
+void decoder_set_vbmi2_funcs();
 void decoder_set_neon_funcs();
 
 
@@ -44,7 +45,9 @@ void decoder_init() {
 	decoder_set_native_funcs();
 # else
 	int use_isa = cpu_supports_isa();
-	if(use_isa >=
+	if(use_isa >= ISA_LEVEL_VBMI2)
+		decoder_set_vbmi2_funcs();
+	else if(use_isa >= ISA_LEVEL_AVX2)
 		decoder_set_avx2_funcs();
 	else if(use_isa >= ISA_LEVEL_AVX)
 		decoder_set_avx_funcs();
package/src/decoder_avx2_base.h
CHANGED
@@ -1,8 +1,8 @@
 
 #ifdef __AVX2__
 
-// GCC (ver 6-10(dev)) fails to optimize pure C version of mask testing, but has this intrinsic; Clang >= 7 optimizes C version fine
-#if defined(__GNUC__) && __GNUC__ >= 7
+// GCC (ver 6-10(dev)) fails to optimize pure C version of mask testing, but has this intrinsic; Clang >= 7 optimizes C version fine; functions added in Clang 8
+#if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
 # define KORTEST32(a, b) !_kortestz_mask32_u8((a), (b))
 # define KAND32(a, b) _kand_mask32((a), (b))
 # define KOR32(a, b) _kor_mask32((a), (b))
@@ -60,6 +60,17 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 		'.','.','.','.','.','.','.','.','.','.','.','.','.','.',_nextMask==2?0:'.',_nextMask==1?0:'.'
 		);
 	}
+	
+	// for some reason, MSVC Win32 seems to crash when trying to compile _mm256_mask_cmpeq_epi8_mask
+	// the crash can be fixed by switching the order of the last two arguments, but it seems to generate wrong code
+	// so just disable the optimisation as it seems to be problematic there
+#if defined(__AVX512VL__) && defined(__AVX512BW__)
+# if defined(_MSC_VER) && !defined(PLATFORM_AMD64) && !defined(__clang__)
+	const bool useAVX3MaskCmp = false;
+# else
+	const bool useAVX3MaskCmp = (use_isa >= ISA_LEVEL_AVX3);
+# endif
+#endif
 	intptr_t i;
 	for(i = -len; i; i += sizeof(__m256i)*2) {
 		__m256i oDataA = _mm256_load_si256((__m256i *)(src+i));
@@ -126,7 +137,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 		__mmask32 match2EqMaskA, match2EqMaskB;
 		__mmask32 match0CrMaskA, match0CrMaskB;
 		__mmask32 match2CrXDtMaskA, match2CrXDtMaskB;
-		if(
+		if(useAVX3MaskCmp && searchEnd) {
 			match2EqMaskA = _mm256_cmpeq_epi8_mask(_mm256_set1_epi8('='), tmpData2A);
 			match2EqMaskB = _mm256_cmpeq_epi8_mask(_mm256_set1_epi8('='), tmpData2B);
 		} else
@@ -142,7 +153,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 		// find patterns of \r_.
 		
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-		if(
+		if(useAVX3MaskCmp) {
 			match0CrMaskA = _mm256_cmpeq_epi8_mask(oDataA, _mm256_set1_epi8('\r'));
 			match0CrMaskB = _mm256_cmpeq_epi8_mask(oDataB, _mm256_set1_epi8('\r'));
 			match2CrXDtMaskA = _mm256_mask_cmpeq_epi8_mask(match0CrMaskA, tmpData2A, _mm256_set1_epi8('.'));
@@ -172,7 +183,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
 			__mmask32 match1NlMaskA, match1NlMaskB;
 			__mmask32 match2NlDotMaskA, match2NlDotMaskB;
-			if(
+			if(useAVX3MaskCmp) {
 				match1NlMaskA = _mm256_mask_cmpeq_epi8_mask(
 					match0CrMaskA,
 					_mm256_set1_epi8('\n'),
@@ -228,7 +239,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 			
 			int matchEnd;
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-			if(
+			if(useAVX3MaskCmp) {
 				__mmask32 match3EqYMaskA = _mm256_mask_cmpeq_epi8_mask(
 					match2EqMaskA,
 					_mm256_set1_epi8('y'),
@@ -307,7 +318,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 			}
 		}
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-		if(
+		if(useAVX3MaskCmp) {
 			mask |= (uint64_t)match2NlDotMaskA << 2;
 			mask |= (uint64_t)match2NlDotMaskB << 34;
 			minMask = _mm256_maskz_mov_epi8(~(match2NlDotMaskB>>30), _mm256_set1_epi8('.'));
@@ -325,7 +336,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 		__m256i match3EqYA, match3EqYB;
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
 		__mmask32 match3EqYMaskA, match3EqYMaskB;
-		if(
+		if(useAVX3MaskCmp) {
 			match3EqYMaskA = _mm256_mask_cmpeq_epi8_mask(
 				match2EqMaskA,
 				_mm256_set1_epi8('y'),
@@ -355,7 +366,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 		if(LIKELIHOOD(0.002, partialEndFound)) {
 			bool endFound;
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-			if(
+			if(useAVX3MaskCmp) {
 				__mmask32 match3LfEqYMaskA = _mm256_mask_cmpeq_epi8_mask(
 					match3EqYMaskA,
 					_mm256_set1_epi8('\n'),
package/src/decoder_neon.cc
CHANGED
@@ -19,14 +19,14 @@
 #endif
 
 
-// for compilers that lack these functions
-#if defined(__clang__) || (defined(__GNUC__) && (defined(__aarch64__) && __GNUC__ >= 8))
+// for compilers that lack these functions (Clang armv7 9-12 seems to have issues with multi-vector loads)
+#if (defined(__clang__) && (defined(__aarch64__) || __clang_major__<9 || __clang_major__>12)) || (defined(__GNUC__) && (defined(__aarch64__) && __GNUC__ >= 8))
 # define vld1q_u8_x2_align(p, n) vld1q_u8_x2((uint8_t*)__builtin_assume_aligned(p, n))
 #else
 # define vld1q_u8_x2_align(p, n) vcreate2_u8(vld1q_u8_align(p, (n)/2), vld1q_u8_align((p)+16, (n)/2))
 #endif
 // Clang wrongly assumes alignment on vld1q_u8_x2, and ARMv7 GCC doesn't support the function, so effectively, it can only be used in ARMv8 compilers
-#if defined(__aarch64__) && (defined(__clang__) || (
+#if defined(__aarch64__) && (defined(__clang__) || HEDLEY_GCC_VERSION_CHECK(8,5,0))
 # define vst1q_u8_x2_unaligned vst1q_u8_x2
 #else
 static HEDLEY_ALWAYS_INLINE void vst1q_u8_x2_unaligned(uint8_t* p, uint8x16x2_t data) {
package/src/decoder_neon64.cc
CHANGED
@@ -10,9 +10,9 @@ static struct { char bytes[16]; } ALIGN_TO(16, compactLUT[32768]);
 static uint8_t eqFixLUT[256];
 
 
-
-#if !defined(__clang__) && !defined(_MSC_VER) && (!defined(__aarch64__) || !HEDLEY_GCC_VERSION_CHECK(
-static HEDLEY_ALWAYS_INLINE uint8x16x4_t
+// AArch64 GCC lacks these functions until 8.5, 9.4 and 10.1 (10.0 unknown)
+#if !defined(__clang__) && !defined(_MSC_VER) && (!defined(__aarch64__) || !(HEDLEY_GCC_VERSION_CHECK(9,4,0) || (!HEDLEY_GCC_VERSION_CHECK(9,0,0) && HEDLEY_GCC_VERSION_CHECK(8,5,0))))
+static HEDLEY_ALWAYS_INLINE uint8x16x4_t _vld1q_u8_x4(const uint8_t* p) {
 	uint8x16x4_t ret;
 	ret.val[0] = vld1q_u8(p);
 	ret.val[1] = vld1q_u8(p+16);
@@ -20,12 +20,15 @@ static HEDLEY_ALWAYS_INLINE uint8x16x4_t vld1q_u8_x4(const uint8_t* p) {
 	ret.val[3] = vld1q_u8(p+48);
 	return ret;
 }
-static HEDLEY_ALWAYS_INLINE void
+static HEDLEY_ALWAYS_INLINE void _vst1q_u8_x4(uint8_t* p, uint8x16x4_t data) {
 	vst1q_u8(p, data.val[0]);
 	vst1q_u8(p+16, data.val[1]);
 	vst1q_u8(p+32, data.val[2]);
 	vst1q_u8(p+48, data.val[3]);
 }
+#else
+# define _vld1q_u8_x4 vld1q_u8_x4
+# define _vst1q_u8_x4 vst1q_u8_x4
 #endif
 
 
@@ -55,7 +58,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 	uint8x16_t yencOffset = escFirst ? vmakeq_u8(42+64,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42) : vdupq_n_u8(42);
 	long i;
 	for(i = -len; i; i += sizeof(uint8x16_t)*4) {
-		uint8x16x4_t data =
+		uint8x16x4_t data = _vld1q_u8_x4(src+i);
 		uint8x16_t dataA = data.val[0];
 		uint8x16_t dataB = data.val[1];
 		uint8x16_t dataC = data.val[2];
@@ -421,7 +424,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 		dataB = vsubq_u8(dataB, vdupq_n_u8(42));
 		dataC = vsubq_u8(dataC, vdupq_n_u8(42));
 		dataD = vsubq_u8(dataD, vdupq_n_u8(42));
-		
+		_vst1q_u8_x4(p, vcreate4_u8(dataA, dataB, dataC, dataD));
 		p += sizeof(uint8x16_t)*4;
 		escFirst = 0;
 		yencOffset = vdupq_n_u8(42);
package/src/decoder_sse_base.h
CHANGED
@@ -8,7 +8,7 @@
 #endif
 
 // GCC (ver 6-10(dev)) fails to optimize pure C version of mask testing, but has this intrinsic; Clang >= 7 optimizes C version fine
-#if defined(__GNUC__) && __GNUC__ >= 7
+#if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
 # define KORTEST16(a, b) !_kortestz_mask16_u8((a), (b))
 # define KAND16(a, b) _kand_mask16((a), (b))
 # define KOR16(a, b) _kor_mask16((a), (b))
@@ -112,7 +112,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 	-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42-64
 	) : _mm_set1_epi8(-42);
 	
-#if defined(__SSSE3__) && !defined(__tune_atom__) && !defined(__tune_slm__) && !defined(__tune_btver1__)
+#if defined(__SSSE3__) && !defined(__tune_atom__) && !defined(__tune_slm__) && !defined(__tune_btver1__) && !defined(__tune_btver2__)
 	const bool _USING_FAST_MATCH = (use_isa >= ISA_LEVEL_SSSE3);
 #else
 	const bool _USING_FAST_MATCH = false;
@@ -121,6 +121,13 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 	const bool _USING_BLEND_ADD = (use_isa >= ISA_LEVEL_SSE41);
 #else
 	const bool _USING_BLEND_ADD = false;
+#endif
+#if defined(__AVX512VL__) && defined(__AVX512BW__)
+# if defined(_MSC_VER) && !defined(PLATFORM_AMD64) && !defined(__clang__)
+	const bool useAVX3MaskCmp = false;
+# else
+	const bool useAVX3MaskCmp = (use_isa >= ISA_LEVEL_AVX3);
+# endif
 #endif
 	
 	__m128i lfCompare = _mm_set1_epi8('\n');
@@ -214,7 +221,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 		__mmask16 match2EqMaskA, match2EqMaskB;
 		__mmask16 match0CrMaskA, match0CrMaskB;
 		__mmask16 match2CrXDtMaskA, match2CrXDtMaskB;
-		if(
+		if(useAVX3MaskCmp && searchEnd) {
 			match2EqMaskA = _mm_cmpeq_epi8_mask(_mm_set1_epi8('='), tmpData2A);
 			match2EqMaskB = _mm_cmpeq_epi8_mask(_mm_set1_epi8('='), tmpData2B);
 		} else
@@ -230,7 +237,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 		__m128i match2CrXDtA, match2CrXDtB;
 		if(isRaw) {
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-			if(
+			if(useAVX3MaskCmp) {
 				match0CrMaskA = _mm_cmpeq_epi8_mask(oDataA, _mm_set1_epi8('\r'));
 				match0CrMaskB = _mm_cmpeq_epi8_mask(oDataB, _mm_set1_epi8('\r'));
 				match2CrXDtMaskA = _mm_mask_cmpeq_epi8_mask(match0CrMaskA, tmpData2A, _mm_set1_epi8('.'));
@@ -256,7 +263,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
 			__mmask16 match1NlMaskA, match1NlMaskB;
 			__mmask16 match2NlDotMaskA, match2NlDotMaskB;
-			if(
+			if(useAVX3MaskCmp) {
 				match1NlMaskA = _mm_mask_cmpeq_epi8_mask(
 					match0CrMaskA,
 					_mm_set1_epi8('\n'),
@@ -299,7 +306,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 			
 			int matchEnd;
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-			if(
+			if(useAVX3MaskCmp) {
 				__mmask16 match3EqYMaskA = _mm_mask_cmpeq_epi8_mask(
 					match2EqMaskA, _mm_set1_epi8('y'), tmpData3A
 				);
@@ -373,7 +380,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 			}
 		}
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-		if(
+		if(useAVX3MaskCmp) {
 			mask |= match2NlDotMaskA << 2;
 			mask |= (match2NlDotMaskB << 18) & 0xffffffff;
 			minMask = _mm_maskz_mov_epi8(~(match2NlDotMaskB>>14), _mm_set1_epi8('.'));
@@ -398,7 +405,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 		__m128i match3EqYA, match3EqYB;
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
 		__mmask16 match3EqYMaskA, match3EqYMaskB;
-		if(
+		if(useAVX3MaskCmp) {
 			match3EqYMaskA = _mm_mask_cmpeq_epi8_mask(
 				match2EqMaskA,
 				_mm_set1_epi8('y'),
@@ -434,7 +441,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 			bool endFound;
 			
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-			if(
+			if(useAVX3MaskCmp) {
 				__mmask16 match3LfEqYMaskA = _mm_mask_cmpeq_epi8_mask(
 					match3EqYMaskA,
 					_mm_set1_epi8('\n'),
package/src/decoder_vbmi2.cc
ADDED
@@ -0,0 +1,30 @@
+#include "common.h"
+
+#if defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
+# include "decoder_common.h"
+# ifndef YENC_DISABLE_AVX256
+# include "decoder_avx2_base.h"
+void decoder_set_vbmi2_funcs() {
+	ALIGN_ALLOC(lookups, sizeof(*lookups), 16);
+	// TODO: consider removing compact LUT
+	decoder_init_lut(lookups->eqFix, lookups->compact);
+	_do_decode = &do_decode_simd<false, false, sizeof(__m256i)*2, do_decode_avx2<false, false, ISA_LEVEL_VBMI2> >;
+	_do_decode_raw = &do_decode_simd<true, false, sizeof(__m256i)*2, do_decode_avx2<true, false, ISA_LEVEL_VBMI2> >;
+	_do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m256i)*2, do_decode_avx2<true, true, ISA_LEVEL_VBMI2> >;
+}
+# else
+# include "decoder_sse_base.h"
+void decoder_set_vbmi2_funcs() {
+	decoder_sse_init();
+	decoder_init_lut(lookups->eqFix, lookups->compact);
+	_do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_VBMI2> >;
+	_do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_VBMI2> >;
+	_do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_VBMI2> >;
+}
+# endif
+#else
+void decoder_set_avx2_funcs();
+void decoder_set_vbmi2_funcs() {
+	decoder_set_avx2_funcs();
+}
+#endif
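Both new translation units follow the same guard-and-fallback pattern: if the compiler cannot target AVX-512VBMI2, the file still defines its *_vbmi2 entry point, but that entry point simply forwards to the AVX2 variant, so the runtime dispatcher can call it unconditionally. A stripped-down sketch of that pattern (the names with a _sketch suffix are illustrative, not from the package):

#if defined(__AVX512VBMI2__) && defined(__AVX512VL__) && defined(__AVX512BW__)
void decoder_set_vbmi2_funcs_sketch() {
	// install the VBMI2-specialised decode kernels here
}
#else
void decoder_set_avx2_funcs();           // provided by the AVX2 translation unit
void decoder_set_vbmi2_funcs_sketch() {
	decoder_set_avx2_funcs();            // compile-time fallback keeps the dispatcher simple
}
#endif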
package/src/encoder.cc
CHANGED
@@ -128,6 +128,7 @@ void encoder_sse2_init();
 void encoder_ssse3_init();
 void encoder_avx_init();
 void encoder_avx2_init();
+void encoder_vbmi2_init();
 void encoder_neon_init();
 
 #if defined(PLATFORM_X86) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
@@ -153,7 +154,9 @@ void encoder_init() {
 	encoder_native_init();
 # else
 	int use_isa = cpu_supports_isa();
-	if(use_isa >=
+	if(use_isa >= ISA_LEVEL_VBMI2)
+		encoder_vbmi2_init();
+	else if(use_isa >= ISA_LEVEL_AVX2)
 		encoder_avx2_init();
 	else if(use_isa >= ISA_LEVEL_AVX)
 		encoder_avx_init();
package/src/encoder_avx_base.h
CHANGED
@@ -6,7 +6,7 @@
 #include "encoder_common.h"
 #define YMM_SIZE 32
 
-#if defined(__GNUC__) && __GNUC__ >= 7
+#if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
 # define KLOAD32(a, offs) _load_mask32((__mmask32*)(a) + (offs))
 #else
 # define KLOAD32(a, offs) (((uint32_t*)(a))[(offs)])
@@ -293,7 +293,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
 	asm(
 		"shrq $1, %[eqMask] \n"
 		"shrq %%cl, %[eqMask] \n"
-		"adcq %[col], %[p] \n"
+		"adcq %q[col], %q[p] \n"
 		: [eqMask]"+r"(eqMask), [p]"+r"(p)
 		: "c"(shiftAmt), [col]"r"(~col)
 	);
package/src/encoder_neon.cc
CHANGED
@@ -5,7 +5,7 @@
 #include "encoder_common.h"
 
 // Clang wrongly assumes alignment on vst1q_u8_x2, and ARMv7 GCC doesn't support the function, so effectively, it can only be used in ARMv8 compilers
-#if defined(__aarch64__) && (defined(__clang__) || (
+#if defined(__aarch64__) && (defined(__clang__) || HEDLEY_GCC_VERSION_CHECK(8,5,0))
 # define vst1q_u8_x2_unaligned vst1q_u8_x2
 #else
 static HEDLEY_ALWAYS_INLINE void vst1q_u8_x2_unaligned(uint8_t* p, uint8x16x2_t data) {
package/src/encoder_sse_base.h
CHANGED
@@ -8,7 +8,7 @@
 # define _mm_mask_expand_epi8 _mm128_mask_expand_epi8
 #endif
 
-#if defined(__GNUC__) && __GNUC__ >= 7
+#if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
 # define KLOAD16(a, offs) _load_mask16((__mmask16*)(a) + (offs))
 #else
 # define KLOAD16(a, offs) (((uint16_t*)(a))[(offs)])
@@ -155,7 +155,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
 	if(len <= INPUT_OFFSET || line_size < XMM_SIZE) return;
 	
 	// slower CPUs prefer to branch as mispredict penalty is probably small relative to general execution
-#if defined(__tune_atom__) || defined(__tune_slm__) || defined(__tune_btver1__)
+#if defined(__tune_atom__) || defined(__tune_slm__) || defined(__tune_btver1__) || defined(__tune_btver2__)
 	const bool _PREFER_BRANCHING = true;
 #else
 	const bool _PREFER_BRANCHING = (use_isa < ISA_LEVEL_SSSE3);
@@ -412,8 +412,8 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
 	asm(
 		"shrl $1, %[eqMask] \n"
 		"shrl %%cl, %[eqMask] \n" // TODO: can use shrq to avoid above shift?
-# if defined(PLATFORM_AMD64)
-		"adcq %[col], %[p] \n"
+# if defined(PLATFORM_AMD64) && !defined(__ILP32__)
+		"adcq %q[col], %q[p] \n"
# else
 		"adcl %[col], %[p] \n"
# endif
@@ -539,7 +539,6 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
 	dataA = _mm_shuffle_epi8(dataA, shufMaskA);
 	
# if defined(__SSE4_1__) && !defined(__tune_slm__) && !defined(__tune_goldmont__) && !defined(__tune_goldmont_plus__) && !defined(__tune_tremont__)
-	// unsure if worth on: Jaguar/Puma (3|2), Core2 (2|2)
 	if(use_isa >= ISA_LEVEL_SSE41) {
 		dataB = _mm_blendv_epi8(dataBShifted, dataB, mergeMaskB);
 	} else
package/src/encoder_vbmi2.cc
ADDED
@@ -0,0 +1,23 @@
+#include "common.h"
+
+#if defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
+# ifndef YENC_DISABLE_AVX256
+# include "encoder_avx_base.h"
+
+void encoder_vbmi2_init() {
+	_do_encode = &do_encode_simd< do_encode_avx2<ISA_LEVEL_VBMI2> >;
+	encoder_avx2_lut<ISA_LEVEL_VBMI2>();
+}
+# else
+# include "encoder_sse_base.h"
+void encoder_vbmi2_init() {
+	_do_encode = &do_encode_simd< do_encode_sse<ISA_LEVEL_VBMI2> >;
+	encoder_sse_lut<ISA_LEVEL_VBMI2>();
+}
+# endif
+#else
+void encoder_avx2_init();
+void encoder_vbmi2_init() {
+	encoder_avx2_init();
+}
+#endif
package/src/platform.cc
CHANGED
@@ -55,6 +55,7 @@ bool cpu_supports_neon() {
 
 #ifdef PLATFORM_X86
 #ifdef _MSC_VER
+# define _cpuid1(ar) __cpuid(ar, 1)
 # define _cpuid1x(ar) __cpuid(ar, 0x80000001)
 # if _MSC_VER >= 1600
 # define _cpuidX __cpuidex
@@ -66,6 +67,8 @@ bool cpu_supports_neon() {
 # define _GET_XCR() 0
 # endif
 #else
+# include <cpuid.h>
+# define _cpuid1(ar) __cpuid(1, ar[0], ar[1], ar[2], ar[3])
 # define _cpuid1x(ar) __cpuid(0x80000001, ar[0], ar[1], ar[2], ar[3])
 # define _cpuidX(ar, eax, ecx) __cpuid_count(eax, ecx, ar[0], ar[1], ar[2], ar[3])
 static inline int _GET_XCR() {
@@ -112,8 +115,6 @@ int cpu_supports_isa() {
 		// AMD Bobcat with slow SSSE3 instructions - pretend it doesn't exist
 		return ret | ISA_LEVEL_SSE2;
 	
-	// Jaguar/Puma performance unkown (slowish PSHUFB/PBLENDVB)
-	
 	if((flags[2] & 0x200) == 0x200) { // SSSE3
 		if(family == 6 && (model == 0x5c || model == 0x5f || model == 0x7a || model == 0x9c))
 			// Intel Goldmont/plus / Tremont with slow PBLENDVB
@@ -144,4 +145,24 @@ int cpu_supports_isa() {
 	return ret | ISA_LEVEL_SSE2;
 }
 
+int cpu_supports_crc_isa() {
+	int flags[4];
+	_cpuid1(flags);
+	
+	if((flags[2] & 0x80202) == 0x80202) { // SSE4.1 + SSSE3 + CLMUL
+		if((flags[2] & 0x18000000) == 0x18000000) { // OSXSAVE + AVX
+			int xcr = _GET_XCR() & 0xff; // ignore unused bits
+			if((xcr & 6) == 6) { // AVX enabled
+				int cpuInfo[4];
+				_cpuidX(cpuInfo, 7, 0);
+				if((cpuInfo[1] & 0x20) == 0x20 && (cpuInfo[2] & 0x400) == 0x400) { // AVX2 + VPCLMULQDQ
+					return 2;
+				}
+			}
+		}
+		return 1;
+	}
+	return 0;
+}
+
 #endif // PLATFORM_X86
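For reference, the feature bits that cpu_supports_crc_isa() tests are: CPUID leaf 1 ECX bit 1 (PCLMULQDQ), bit 9 (SSSE3) and bit 19 (SSE4.1), which together form the 0x80202 mask; bits 27/28 (OSXSAVE/AVX) form 0x18000000; XCR0 bits 1 and 2 confirm the OS saves XMM/YMM state; and leaf 7 EBX bit 5 (AVX2) plus ECX bit 10 (VPCLMULQDQ) gate the 256-bit path. A GCC/Clang-flavoured sketch of the same check (the package uses its own _cpuid1/_cpuidX/_GET_XCR macros instead; _xgetbv may require -mxsave on older compilers):

#include <cpuid.h>
#include <immintrin.h>

static int crc_isa_level_sketch(void) {
	unsigned a, b, c, d;
	if(!__get_cpuid(1, &a, &b, &c, &d)) return 0;
	if((c & 0x80202) != 0x80202) return 0;        // need PCLMUL + SSSE3 + SSE4.1
	if((c & 0x18000000) == 0x18000000             // OSXSAVE + AVX reported
	   && (_xgetbv(0) & 6) == 6                   // XMM+YMM state enabled by the OS
	   && __get_cpuid_count(7, 0, &a, &b, &c, &d)
	   && (b & 0x20) && (c & 0x400))              // AVX2 + VPCLMULQDQ
		return 2;                                 // 256-bit VPCLMULQDQ folding usable
	return 1;                                     // 128-bit PCLMULQDQ folding usable
}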
package/test/testcrc.js
CHANGED
@@ -50,4 +50,18 @@ doTest('Random', 'crc32', 'fj[-oqijnw34-59n26 4345j8yn89032q78t9ab9gabh023quhoiB
 doTest('Random Continue', 'crc32', ['KZSHZ5EDOVAmDdakZZOrGSUGGKSpCJoWH7M0MHy6ohnSzvHY4DjpxXmyfWYJQoJ7tKdNhGcuRVUzrgXM', ycrc32('BdenbmoBgiB10ZkeUBjrsZV3dg2Da2fhHqU9TMdi69AHhLRck3Nk60YuFBXh6lvtefBpjdTxbeEmsaEm')], crc32('BdenbmoBgiB10ZkeUBjrsZV3dg2Da2fhHqU9TMdi69AHhLRck3Nk60YuFBXh6lvtefBpjdTxbeEmsaEmKZSHZ5EDOVAmDdakZZOrGSUGGKSpCJoWH7M0MHy6ohnSzvHY4DjpxXmyfWYJQoJ7tKdNhGcuRVUzrgXM'));
 
 
+// random tests
+for(var i=1; i<128; i++) {
+	var rand = require('crypto').pseudoRandomBytes(i);
+	doTest('Random Short Buffer', 'crc32', rand);
+}
+for(var i=0; i<32; i++) {
+	var rand = require('crypto').pseudoRandomBytes(100000);
+	doTest('Random Buffer', 'crc32', rand);
+	
+	var split = Math.random()*rand.length;
+	doTest('Random Continue Buffer', 'crc32', [rand.slice(split), ycrc32(rand.slice(0, split))], crc32(rand));
+}
+
+
 console.log('All tests passed');