yencode 1.1.0 → 1.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.gyp +79 -7
- package/crcutil-1.0/code/multiword_64_64_intrinsic_i386_mmx.cc +1 -1
- package/package.json +1 -1
- package/src/common.h +88 -24
- package/src/crc.cc +59 -27
- package/src/crc.h +20 -6
- package/src/crc_arm.cc +154 -27
- package/src/crc_common.h +3 -10
- package/src/{crc_folding.c → crc_folding.cc} +53 -122
- package/src/crc_folding_256.cc +230 -0
- package/src/decoder.cc +10 -4
- package/src/decoder.h +16 -2
- package/src/decoder_avx2_base.h +32 -21
- package/src/decoder_common.h +2 -2
- package/src/decoder_neon.cc +37 -37
- package/src/decoder_neon64.cc +41 -36
- package/src/decoder_sse_base.h +21 -14
- package/src/decoder_vbmi2.cc +30 -0
- package/src/encoder.cc +9 -3
- package/src/encoder.h +17 -1
- package/src/encoder_avx_base.h +8 -8
- package/src/encoder_common.h +3 -3
- package/src/encoder_neon.cc +31 -31
- package/src/encoder_sse_base.h +7 -8
- package/src/encoder_vbmi2.cc +23 -0
- package/src/platform.cc +57 -8
- package/src/yencode.cc +33 -44
- package/test/testcrc.js +14 -0
package/src/crc_arm.cc
CHANGED

@@ -1,15 +1,98 @@
-#include "common.h"
 #include "crc_common.h"

-#if defined(
+#if defined(PLATFORM_ARM) && defined(_MSC_VER) && defined(__clang__) && !defined(__ARM_FEATURE_CRC32)
+// I don't think GYP provides a nice way to detect whether MSVC or clang-cl is being used, but it doesn't use clang-cl by default, so a warning here is probably sufficient
+HEDLEY_WARNING("CRC32 acceleration is not been enabled under ARM clang-cl by default; add `-march=armv8-a+crc` to additional compiler arguments to enable");
+#endif
+
+// disable CRC on GCC versions with broken arm_acle.h
+#if defined(__ARM_FEATURE_CRC32) && defined(HEDLEY_GCC_VERSION)
+# if !defined(__aarch64__) && HEDLEY_GCC_VERSION_CHECK(7,0,0) && !HEDLEY_GCC_VERSION_CHECK(8,1,1)
+# undef __ARM_FEATURE_CRC32
+HEDLEY_WARNING("CRC32 acceleration has been disabled due to broken arm_acle.h shipped in GCC 7.0 - 8.1 [https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81497]. If you need this feature, please use a different compiler or version of GCC");
+# endif
+# if defined(__aarch64__) && HEDLEY_GCC_VERSION_CHECK(9,4,0) && !HEDLEY_GCC_VERSION_CHECK(9,5,0)
+# undef __ARM_FEATURE_CRC32
+HEDLEY_WARNING("CRC32 acceleration has been disabled due to broken arm_acle.h shipped in GCC 9.4 [https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100985]. If you need this feature, please use a different compiler or version of GCC");
+# endif
+#endif
+
+#if defined(__ARM_FEATURE_CRC32) || (defined(_M_ARM64) && !defined(__clang__)) // MSVC doesn't support CRC for ARM32

 /* ARMv8 accelerated CRC */
-#
+#if defined(_MSC_VER) && !defined(__clang__)
 #include <intrin.h>
 #else
 #include <arm_acle.h>
 #endif

+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+# ifdef __GNUC__
+# define _LE16 __builtin_bswap16
+# define _LE32 __builtin_bswap32
+# define _LE64 __builtin_bswap64
+# else
+// currently not supported
+# error No endian swap intrinsic defined
+# endif
+#else
+# define _LE16(x) (x)
+# define _LE32(x) (x)
+# define _LE64(x) (x)
+#endif
+
+#ifdef __aarch64__
+# define WORD_T uint64_t
+# define WORDSIZE_LOG 3 // sizeof(WORD_T) == 1<<WORDSIZE_LOG
+# define CRC_WORD(crc, data) __crc32d(crc, _LE64(data))
+#else
+# define WORD_T uint32_t
+# define WORDSIZE_LOG 2 // sizeof(WORD_T) == 1<<WORDSIZE_LOG
+# define CRC_WORD(crc, data) __crc32w(crc, _LE32(data))
+#endif
+
+
+// exploit CPU pipelining during CRC computation; unfortunately I haven't been able to measure any benefit
+// - Neoverse N1: no noticeable difference
+// - Cortex A53: actually runs a bit slower
+//#define ENABLE_PIPELINE_OPT 1
+
+#ifdef ENABLE_PIPELINE_OPT
+// workaround MSVC complaining "unary minus operator applied to unsigned type, result still unsigned"
+#define NEGATE(n) (uint32_t)(-((int32_t)(n)))
+
+static HEDLEY_ALWAYS_INLINE uint32_t crc_multiply(uint32_t a, uint32_t b) {
+	uint32_t res = 0;
+	for(int i=0; i<31; i++) {
+		res ^= NEGATE(b>>31) & a;
+		a = ((a >> 1) ^ (0xEDB88320 & NEGATE(a&1)));
+		b <<= 1;
+	}
+	res ^= NEGATE(b>>31) & a;
+	return res;
+}
+
+static const uint32_t crc_power[] = { // pre-computed 2^n, with first 3 entries removed (saves a shift)
+	0x00800000, 0x00008000, 0xedb88320, 0xb1e6b092, 0xa06a2517, 0xed627dae, 0x88d14467, 0xd7bbfe6a,
+	0xec447f11, 0x8e7ea170, 0x6427800e, 0x4d47bae0, 0x09fe548f, 0x83852d0f, 0x30362f1a, 0x7b5a9cc3,
+	0x31fec169, 0x9fec022a, 0x6c8dedc4, 0x15d6874d, 0x5fde7a4e, 0xbad90e37, 0x2e4e5eef, 0x4eaba214,
+	0xa8a472c0, 0x429a969e, 0x148d302a, 0xc40ba6d0, 0xc4e22c3c, 0x40000000, 0x20000000, 0x08000000
+};
+/* above table can be computed with
+	int main(void) {
+		uint32_t k = 0x80000000 >> 1;
+		for (size_t i = 0; i < 32+3; ++i) {
+			if(i>2) printf("0x%08x, ", k);
+			k = crc_multiply(k, k);
+		}
+		return 0;
+	}
+*/
+#endif
+
+
+
 // inspired/stolen off https://github.com/jocover/crc32_armv8/blob/master/crc32_armv8.c
 static uint32_t arm_crc_calc(uint32_t crc, const unsigned char *src, long len) {

@@ -21,35 +104,84 @@ static uint32_t arm_crc_calc(uint32_t crc, const unsigned char *src, long len) {
 		len--;
 	}
 	if ((uintptr_t)src & sizeof(uint16_t)) {
-		crc = __crc32h(crc, *((uint16_t *)src));
+		crc = __crc32h(crc, _LE16(*((uint16_t *)src)));
 		src += sizeof(uint16_t);
 		len -= sizeof(uint16_t);
 	}
-
 #ifdef __aarch64__
 	if ((uintptr_t)src & sizeof(uint32_t)) {
-		crc = __crc32w(crc, *((uint32_t *)src));
+		crc = __crc32w(crc, _LE32(*((uint32_t *)src)));
 		src += sizeof(uint32_t);
 		len -= sizeof(uint32_t);
 	}
+#endif
 	}
-
-
-
+
+	const WORD_T* srcW = (const WORD_T*)src;
+
+#ifdef ENABLE_PIPELINE_OPT
+	// uses ideas from https://github.com/komrad36/crc#option-13-golden
+	// (this is a slightly less efficient, but much simpler implementation of the idea)
+	const unsigned SPLIT_WORDS_LOG = 10; // make sure it's at least 2
+	const unsigned SPLIT_WORDS = 1<<SPLIT_WORDS_LOG;
+	while(len >= (long)(sizeof(WORD_T)*SPLIT_WORDS*2)) {
+		// compute 2x CRCs concurrently to leverage piplining
+		uint32_t crc2 = 0;
+		for(unsigned i=0; i<SPLIT_WORDS; i+=4) {
+			crc = CRC_WORD(crc, *srcW);
+			crc2 = CRC_WORD(crc2, *(srcW + SPLIT_WORDS));
+			srcW++;
+			crc = CRC_WORD(crc, *srcW);
+			crc2 = CRC_WORD(crc2, *(srcW + SPLIT_WORDS));
+			srcW++;
+			crc = CRC_WORD(crc, *srcW);
+			crc2 = CRC_WORD(crc2, *(srcW + SPLIT_WORDS));
+			srcW++;
+			crc = CRC_WORD(crc, *srcW);
+			crc2 = CRC_WORD(crc2, *(srcW + SPLIT_WORDS));
+			srcW++;
+		}
+		// merge the CRCs
+		// since we're multiplying by a fixed number, it could be sped up with some lookup tables
+		crc = crc_multiply(crc, crc_power[SPLIT_WORDS_LOG + WORDSIZE_LOG]) ^ crc2;
+		srcW += SPLIT_WORDS;
+		len -= sizeof(WORD_T)*SPLIT_WORDS*2;
 	}
-
-
-
+#endif
+
+	while ((len -= sizeof(WORD_T)*8) >= 0) {
+		crc = CRC_WORD(crc, *(srcW++));
+		crc = CRC_WORD(crc, *(srcW++));
+		crc = CRC_WORD(crc, *(srcW++));
+		crc = CRC_WORD(crc, *(srcW++));
+		crc = CRC_WORD(crc, *(srcW++));
+		crc = CRC_WORD(crc, *(srcW++));
+		crc = CRC_WORD(crc, *(srcW++));
+		crc = CRC_WORD(crc, *(srcW++));
 	}
-
+	if (len & sizeof(WORD_T)*4) {
+		crc = CRC_WORD(crc, *(srcW++));
+		crc = CRC_WORD(crc, *(srcW++));
+		crc = CRC_WORD(crc, *(srcW++));
+		crc = CRC_WORD(crc, *(srcW++));
+	}
+	if (len & sizeof(WORD_T)*2) {
+		crc = CRC_WORD(crc, *(srcW++));
+		crc = CRC_WORD(crc, *(srcW++));
+	}
+	if (len & sizeof(WORD_T)) {
+		crc = CRC_WORD(crc, *(srcW++));
 	}
-
-
+	src = (const unsigned char*)srcW;
+
+#ifdef __aarch64__
+	if (len & sizeof(uint32_t)) {
+		crc = __crc32w(crc, _LE32(*((uint32_t *)src)));
 		src += sizeof(uint32_t);
 	}
 #endif
 	if (len & sizeof(uint16_t)) {
-		crc = __crc32h(crc, *((uint16_t *)src));
+		crc = __crc32h(crc, _LE16(*((uint16_t *)src)));
 		src += sizeof(uint16_t);
 	}
 	if (len & sizeof(uint8_t))

@@ -58,20 +190,15 @@ static uint32_t arm_crc_calc(uint32_t crc, const unsigned char *src, long len) {
 	return crc;
 }

-static
-
-	UNPACK_4(out, ~crc);
-}
-static void do_crc32_incremental_arm(const void* data, size_t length, unsigned char init[4]) {
-	uint32_t crc = PACK_4(init);
-	crc = arm_crc_calc(~crc, (const unsigned char*)data, (long)length);
-	UNPACK_4(init, ~crc);
+static uint32_t do_crc32_incremental_arm(const void* data, size_t length, uint32_t init) {
+	return ~arm_crc_calc(~init, (const unsigned char*)data, (long)length);
 }

-void crc_arm_set_funcs(crc_func*
-	*_do_crc32 = &do_crc32_arm;
+void crc_arm_set_funcs(crc_func* _do_crc32_incremental) {
 	*_do_crc32_incremental = &do_crc32_incremental_arm;
 }
 #else
-void crc_arm_set_funcs(crc_func*
+void crc_arm_set_funcs(crc_func* _do_crc32_incremental) {
+	(void)_do_crc32_incremental;
+}
 #endif
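Illustrative note (not part of the package diff): the ENABLE_PIPELINE_OPT merge step above, crc = crc_multiply(crc, crc_power[SPLIT_WORDS_LOG + WORDSIZE_LOG]) ^ crc2, relies on the GF(2) identity crc(A||B) = crc(A)·x^(8·|B|) xor crc(B) for raw (un-inverted) CRC states. The sketch below checks that identity with plain reference code; crc32_raw and crc_mul are helpers written for this note only and are not yencode functions.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* raw (un-inverted) reflected CRC-32, polynomial 0xEDB88320 */
static uint32_t crc32_raw(uint32_t crc, const unsigned char* p, size_t n) {
	while (n--) {
		crc ^= *p++;
		for (int k = 0; k < 8; k++)
			crc = (crc >> 1) ^ ((crc & 1) ? 0xEDB88320u : 0);
	}
	return crc;
}

/* carry-less multiply modulo the CRC polynomial, same algorithm as crc_multiply above */
static uint32_t crc_mul(uint32_t a, uint32_t b) {
	uint32_t res = 0;
	for (int i = 0; i < 32; i++) {
		if (b & 0x80000000u) res ^= a;
		a = (a >> 1) ^ ((a & 1) ? 0xEDB88320u : 0);
		b <<= 1;
	}
	return res;
}

int main(void) {
	unsigned char buf[96], zeros[64] = {0};
	for (size_t i = 0; i < sizeof(buf); i++) buf[i] = (unsigned char)(i * 37 + 1);

	size_t split = 32, tail = sizeof(buf) - split; /* tail == sizeof(zeros) */
	uint32_t whole = crc32_raw(0, buf, sizeof(buf));

	/* the two streams the split loop computes independently */
	uint32_t crcA = crc32_raw(0, buf, split);
	uint32_t crcB = crc32_raw(0, buf + split, tail);

	/* x^(8*tail) mod P; 0x80000000 represents the polynomial "1" here,
	   standing in for the pre-computed crc_power[] entry */
	uint32_t xpow = crc32_raw(0x80000000u, zeros, tail);

	/* merge: crc(A||B) == crc(A)*x^(8*|B|) ^ crc(B) */
	assert((crc_mul(crcA, xpow) ^ crcB) == whole);
	return 0;
}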
package/src/crc_common.h
CHANGED

@@ -1,11 +1,4 @@
-
-#define PACK_4(arr) (((uint_fast32_t)arr[0] << 24) | ((uint_fast32_t)arr[1] << 16) | ((uint_fast32_t)arr[2] << 8) | (uint_fast32_t)arr[3])
-#define UNPACK_4(arr, val) { \
-	arr[0] = (unsigned char)(val >> 24) & 0xFF; \
-	arr[1] = (unsigned char)(val >> 16) & 0xFF; \
-	arr[2] = (unsigned char)(val >> 8) & 0xFF; \
-	arr[3] = (unsigned char)val & 0xFF; \
-}
-
+#include "common.h"
 #include <stddef.h> // for size_t
-
+#include "crc.h"
+
package/src/{crc_folding.c → crc_folding.cc}
CHANGED

@@ -19,44 +19,29 @@

 #include "crc_common.h"

-#if
-# include <stdint.h>
-#else
-/* Workaround for older MSVC not supporting stdint.h - just pull it from V8 */
-# include <v8.h>
-#endif
-
-#if (defined(__PCLMUL__) && defined(__SSSE3__) && defined(__SSE4_1__)) || (defined(_MSC_VER) && _MSC_VER >= 1600)
+#if (defined(__PCLMUL__) && defined(__SSSE3__) && defined(__SSE4_1__)) || (defined(_MSC_VER) && _MSC_VER >= 1600 && defined(PLATFORM_X86) && !defined(__clang__))
 #include <inttypes.h>
 #include <immintrin.h>
 #include <wmmintrin.h>

-#define local static

-#
-# define
-/* Because we don't have dynamic dispatch for AVX, disable it for MSVC builds (only use AVX for -march=native style builds) */
-# undef __AVX__
-# undef __AVX512F__
-# undef __AVX512VL__
-# undef __GFNI__
-#else
-# define ALIGN(_a, v) v __attribute__((aligned(_a)))
+#if defined(__AVX512VL__) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
+# define ENABLE_AVX512 1
 #endif


 // interestingly, MSVC seems to generate better code if using VXORPS over VPXOR
 // original Intel code uses XORPS for many XOR operations, but PXOR is pretty much always better (more port freedom on Intel CPUs). The only advantage of XORPS is that it's 1 byte shorter, an advantage which disappears under AVX as both instructions have the same length
-#
+#if defined(__AVX__) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
 # define fold_xor _mm_xor_si128
 #else
-
+static __m128i fold_xor(__m128i a, __m128i b) {
 	return _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b)));
 }
 #endif

-#ifdef
-
+#ifdef ENABLE_AVX512
+static __m128i do_one_fold_merge(__m128i src, __m128i data) {
 	const __m128i xmm_fold4 = _mm_set_epi32(
 		0x00000001, 0x54442bd4,
 		0x00000001, 0xc6e41596);

@@ -68,7 +53,7 @@ local __m128i do_one_fold_merge(__m128i src, __m128i data) {
 	);
 }
 #else
-
+static __m128i do_one_fold(__m128i src) {
 	const __m128i xmm_fold4 = _mm_set_epi32(
 		0x00000001, 0x54442bd4,
 		0x00000001, 0xc6e41596);

@@ -79,7 +64,7 @@ local __m128i do_one_fold(__m128i src) {
 }
 #endif

-
+ALIGN_TO(32, static const unsigned pshufb_shf_table[60]) = {
 	0x84838281, 0x88878685, 0x8c8b8a89, 0x008f8e8d, /* shl 15 (16 - 1)/shr1 */
 	0x85848382, 0x89888786, 0x8d8c8b8a, 0x01008f8e, /* shl 14 (16 - 3)/shr2 */
 	0x86858483, 0x8a898887, 0x8e8d8c8b, 0x0201008f, /* shl 13 (16 - 4)/shr3 */

@@ -97,7 +82,7 @@ ALIGN(32, local const unsigned pshufb_shf_table[60]) = {
 	0x0201008f, 0x06050403, 0x0a090807, 0x0e0d0c0b /* shl 1 (16 -15)/shr15*/
 };

-
+static void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1,
 	__m128i *xmm_crc2, __m128i *xmm_crc3, __m128i *xmm_crc_part) {

 	const __m128i xmm_mask3 = _mm_set1_epi32(0x80808080);

@@ -127,7 +112,7 @@ local void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1,
 	*xmm_crc_part = _mm_shuffle_epi8(*xmm_crc_part, xmm_shl);
 	*xmm_crc3 = _mm_or_si128(*xmm_crc3, *xmm_crc_part);

-#ifdef
+#ifdef ENABLE_AVX512
 	*xmm_crc3 = do_one_fold_merge(xmm_a0_0, *xmm_crc3);
 #else
 	*xmm_crc3 = fold_xor(

@@ -137,74 +122,38 @@ local void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1,
 #endif
 }

-
+ALIGN_TO(16, static const unsigned crc_k[]) = {
 	0xccaa009e, 0x00000000, /* rk1 */
 	0x751997d0, 0x00000001, /* rk2 */
 	0xccaa009e, 0x00000000, /* rk5 */
 	0x63cd6124, 0x00000001, /* rk6 */
-
+	0xf7011641, 0x00000000, /* rk7 */
 	0xdb710640, 0x00000001 /* rk8 */
 };

-
-	0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000
-};
-
-ALIGN(16, local const unsigned crc_mask2[4]) = {
+ALIGN_TO(16, static const unsigned crc_mask[4]) = {
 	0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
 };

-local __m128i reverse_bits_epi8(__m128i src) {
-#ifdef __GFNI__
-	return _mm_gf2p8affine_epi64_epi8(src, _mm_set_epi32(
-		0x80402010, 0x08040201,
-		0x80402010, 0x08040201
-	), 0);
-#else
-	__m128i xmm_t0 = _mm_and_si128(src, _mm_set1_epi8(0x0f));
-	__m128i xmm_t1 = _mm_and_si128(_mm_srli_epi16(src, 4), _mm_set1_epi8(0x0f));
-	xmm_t0 = _mm_shuffle_epi8(_mm_set_epi8(
-		0xf0, 0x70, 0xb0, 0x30, 0xd0, 0x50, 0x90, 0x10, 0xe0, 0x60, 0xa0, 0x20, 0xc0, 0x40, 0x80, 0
-	), xmm_t0);
-	xmm_t1 = _mm_shuffle_epi8(_mm_set_epi8(
-		15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0
-	), xmm_t1);
-	return _mm_or_si128(xmm_t0, xmm_t1);
-#endif
-}
-
-#ifdef _MSC_VER
-// because MSVC doesn't use BSWAP unless you specifically tell it to...
-# include <stdlib.h>
-# define BSWAP32 _byteswap_ulong
-#else
-# define BSWAP32(n) ((((n)&0xff)<<24) | (((n)&0xff00)<<8) | (((n)&0xff0000)>>8) | (((n)&0xff000000)>>24))
-#endif

-
+static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
 	unsigned long algn_diff;
 	__m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;

 	// TODO: consider calculating this via a LUT instead (probably faster)
 	// info from https://www.reddit.com/r/ReverseEngineering/comments/2zwhl3/mystery_constant_0x9db42487_in_intels_crc32ieee/
 	// firstly, calculate: xmm_crc0 = (intial * 0x487b9c8a) mod 0x104c11db7, where 0x487b9c8a = inverse(1<<512) mod 0x104c11db7
+	xmm_t0 = _mm_cvtsi32_si128(~initial);

-
-	uint32_t init_t = BSWAP32(initial);
-	xmm_t0 = reverse_bits_epi8(_mm_cvtsi32_si128(~init_t));
-
-	xmm_t0 = _mm_clmulepi64_si128(xmm_t0, _mm_cvtsi32_si128(0x487b9c8a), 0);
-	xmm_t1 = _mm_and_si128(xmm_t0, _mm_set_epi32(-1,-1,-1,0)); // shifted up by 32bits to avoid shifts by using clmul's capability to select top 64bits instead
+	xmm_t0 = _mm_clmulepi64_si128(xmm_t0, _mm_set_epi32(0, 0, 0xa273bc24, 0), 0); // reverse(0x487b9c8a)<<1 == 0xa273bc24
 	xmm_t2 = _mm_set_epi32( // polynomial reduction factors
-
-
+		1, 0xdb710640, // G* = 0x04c11db7
+		0, 0xf7011641 // Q+ = 0x04d101df (+1 to save an additional xor operation)
 	);
-	xmm_t1 = _mm_clmulepi64_si128(
-	xmm_t1 = _mm_clmulepi64_si128(xmm_t1, xmm_t2,
+	xmm_t1 = _mm_clmulepi64_si128(xmm_t0, xmm_t2, 0);
+	xmm_t1 = _mm_clmulepi64_si128(xmm_t1, xmm_t2, 0x10);

-	__m128i xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_t1);
-	// reverse bits
-	xmm_crc0 = _mm_shuffle_epi8(reverse_bits_epi8(xmm_crc0), _mm_set_epi32(-1,-1,-1,0x00010203));
+	__m128i xmm_crc0 = _mm_srli_si128(_mm_xor_si128(xmm_t0, xmm_t1), 8);

 	__m128i xmm_crc1 = _mm_setzero_si128();
 	__m128i xmm_crc2 = _mm_setzero_si128();

@@ -214,7 +163,8 @@ local uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
 	if (len < 16) {
 		if (len == 0)
 			return initial;
-		xmm_crc_part =
+		xmm_crc_part = _mm_setzero_si128();
+		memcpy(&xmm_crc_part, src, len);
 		goto partial;
 	}

@@ -229,13 +179,13 @@ local uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
 			&xmm_crc_part);
 	}

-	while (
+	while (len >= 64) {
 		xmm_t0 = _mm_load_si128((__m128i *)src);
 		xmm_t1 = _mm_load_si128((__m128i *)src + 1);
 		xmm_t2 = _mm_load_si128((__m128i *)src + 2);
 		xmm_t3 = _mm_load_si128((__m128i *)src + 3);

-#ifdef
+#ifdef ENABLE_AVX512
 		xmm_crc0 = do_one_fold_merge(xmm_crc0, xmm_t0);
 		xmm_crc1 = do_one_fold_merge(xmm_crc1, xmm_t1);
 		xmm_crc2 = do_one_fold_merge(xmm_crc2, xmm_t2);

@@ -253,20 +203,18 @@ local uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
 #endif

 		src += 64;
+		len -= 64;
 	}

-
-
-	*/
-	if (len + 16 >= 0) {
-		len += 16;
+	if (len >= 48) {
+		len -= 48;

 		xmm_t0 = _mm_load_si128((__m128i *)src);
 		xmm_t1 = _mm_load_si128((__m128i *)src + 1);
 		xmm_t2 = _mm_load_si128((__m128i *)src + 2);

 		xmm_t3 = xmm_crc3;
-#ifdef
+#ifdef ENABLE_AVX512
 		xmm_crc3 = do_one_fold_merge(xmm_crc2, xmm_t2);
 		xmm_crc2 = do_one_fold_merge(xmm_crc1, xmm_t1);
 		xmm_crc1 = do_one_fold_merge(xmm_crc0, xmm_t0);

@@ -284,15 +232,15 @@ local uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
 			goto done;

 		xmm_crc_part = _mm_load_si128((__m128i *)src + 3);
-	} else if (len
-		len
+	} else if (len >= 32) {
+		len -= 32;

 		xmm_t0 = _mm_load_si128((__m128i *)src);
 		xmm_t1 = _mm_load_si128((__m128i *)src + 1);

 		xmm_t2 = xmm_crc2;
 		xmm_t3 = xmm_crc3;
-#ifdef
+#ifdef ENABLE_AVX512
 		xmm_crc3 = do_one_fold_merge(xmm_crc1, xmm_t1);
 		xmm_crc2 = do_one_fold_merge(xmm_crc0, xmm_t0);
 #else

@@ -308,13 +256,13 @@ local uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
 			goto done;

 		xmm_crc_part = _mm_load_si128((__m128i *)src + 2);
-	} else if (len
-		len
+	} else if (len >= 16) {
+		len -= 16;

 		xmm_t0 = _mm_load_si128((__m128i *)src);

 		xmm_t3 = xmm_crc3;
-#ifdef
+#ifdef ENABLE_AVX512
 		xmm_crc3 = do_one_fold_merge(xmm_crc0, xmm_t0);
 #else
 		xmm_crc3 = _mm_xor_si128(do_one_fold(xmm_crc0), xmm_t0);

@@ -328,7 +276,6 @@ local uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {

 		xmm_crc_part = _mm_load_si128((__m128i *)src + 1);
 	} else {
-		len += 64;
 		if (len == 0)
 			goto done;
 		xmm_crc_part = _mm_load_si128((__m128i *)src);

@@ -339,8 +286,7 @@ partial:
 		&xmm_crc_part);
 done:
 	{
-		const __m128i xmm_mask
-		const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
+		const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask);
 		__m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;

 		/*

@@ -350,7 +296,7 @@ done:

 		x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
 		xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
-#ifdef
+#ifdef ENABLE_AVX512
 		xmm_crc1 = _mm_ternarylogic_epi32(xmm_crc1, x_tmp0, xmm_crc0, 0x96);
 #else
 		xmm_crc1 = _mm_xor_si128(xmm_crc1, x_tmp0);

@@ -359,7 +305,7 @@ done:

 		x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10);
 		xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01);
-#ifdef
+#ifdef ENABLE_AVX512
 		xmm_crc2 = _mm_ternarylogic_epi32(xmm_crc2, x_tmp1, xmm_crc1, 0x96);
 #else
 		xmm_crc2 = _mm_xor_si128(xmm_crc2, x_tmp1);

@@ -368,7 +314,7 @@ done:

 		x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10);
 		xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01);
-#ifdef
+#ifdef ENABLE_AVX512
 		xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, x_tmp2, xmm_crc2, 0x96);
 #else
 		xmm_crc3 = _mm_xor_si128(xmm_crc3, x_tmp2);

@@ -388,58 +334,43 @@ done:
 		xmm_crc0 = xmm_crc3;
 		xmm_crc3 = _mm_slli_si128(xmm_crc3, 4);
 		xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
-#ifdef
+#ifdef ENABLE_AVX512
 		//xmm_crc3 = _mm_maskz_xor_epi32(14, xmm_crc3, xmm_crc0);
-		xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, xmm_crc0,
+		xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, xmm_crc0, xmm_mask, 0x28);
 #else
+		xmm_crc0 = _mm_and_si128(xmm_crc0, xmm_mask);
 		xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
-		xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask2);
 #endif

 		/*
 		 * k7
 		 */
 		xmm_crc1 = xmm_crc3;
-		xmm_crc2 = xmm_crc3;
 		crc_fold = _mm_load_si128((__m128i *)crc_k + 2);

 		xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
-#ifdef __AVX512VL__
-		//xmm_crc3 = _mm_maskz_xor_epi32(3, xmm_crc3, xmm_crc2);
-		xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, xmm_crc2, xmm_mask, 0x28);
-#else
-		xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
-		xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask);
-#endif
-
-		xmm_crc2 = xmm_crc3;
 		xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
-#ifdef
-		xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3,
-		return _mm_extract_epi32(xmm_crc3, 2);
+#ifdef ENABLE_AVX512
+		xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, xmm_crc1, xmm_crc1, 0xC3); // NOT(xmm_crc3 ^ xmm_crc1)
 #else
-
+		xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_mask);
 		xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);
-		return ~_mm_extract_epi32(xmm_crc3, 2);
 #endif
+		return _mm_extract_epi32(xmm_crc3, 2);
 	}

 }

-static
-
-	UNPACK_4(out, tmp);
-}
-static void do_crc32_incremental_clmul(const void* data, size_t length, unsigned char init[4]) {
-	uint32_t tmp = crc_fold((const unsigned char*)data, (long)length, PACK_4(init));
-	UNPACK_4(init, tmp);
+static uint32_t do_crc32_incremental_clmul(const void* data, size_t length, uint32_t init) {
+	return crc_fold((const unsigned char*)data, (long)length, init);
 }

-void crc_clmul_set_funcs(crc_func*
-	*_do_crc32 = &do_crc32_clmul;
+void crc_clmul_set_funcs(crc_func* _do_crc32_incremental) {
 	*_do_crc32_incremental = &do_crc32_incremental_clmul;
 }
 #else
-void crc_clmul_set_funcs(crc_func*
+void crc_clmul_set_funcs(crc_func* _do_crc32_incremental) {
+	(void)_do_crc32_incremental;
+}
 #endif
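Illustrative note (not part of the package diff): the incremental CRC functions now take and return a plain uint32_t state instead of the old unsigned char[4] pack/unpack arrays, inverting the state on entry and exit (return ~crc_calc(~init, ...)). The sketch below shows why chaining calls of that shape matches a one-shot CRC-32; crc32_raw is a reference bitwise implementation written for this note only, not a yencode function.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* raw (un-inverted) reflected CRC-32, polynomial 0xEDB88320 */
static uint32_t crc32_raw(uint32_t crc, const unsigned char* p, size_t n) {
	while (n--) {
		crc ^= *p++;
		for (int k = 0; k < 8; k++)
			crc = (crc >> 1) ^ ((crc & 1) ? 0xEDB88320u : 0);
	}
	return crc;
}

/* same shape as the package's incremental functions: invert in, invert out */
static uint32_t crc32_incremental(const void* data, size_t len, uint32_t init) {
	return ~crc32_raw(~init, (const unsigned char*)data, len);
}

int main(void) {
	const char msg[] = "yEnc incremental CRC test";
	size_t n = sizeof(msg) - 1, split = 7;

	uint32_t whole = crc32_incremental(msg, n, 0);          /* one shot, init = 0 */
	uint32_t part  = crc32_incremental(msg, split, 0);      /* first chunk */
	part = crc32_incremental(msg + split, n - split, part); /* chain the remainder */

	assert(part == whole);
	return 0;
}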