yencode 1.0.8 → 1.1.2

This diff shows the changes between publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
Files changed (49)
  1. package/README.md +339 -231
  2. package/binding.gyp +292 -39
  3. package/crcutil-1.0/code/multiword_64_64_gcc_amd64_asm.cc +7 -7
  4. package/crcutil-1.0/code/multiword_64_64_gcc_i386_mmx.cc +14 -14
  5. package/crcutil-1.0/code/multiword_64_64_intrinsic_i386_mmx.cc +1 -1
  6. package/crcutil-1.0/code/uint128_sse2.h +2 -0
  7. package/index.js +329 -22
  8. package/package.json +2 -2
  9. package/src/common.h +299 -0
  10. package/src/crc.cc +95 -0
  11. package/src/crc.h +23 -0
  12. package/src/crc_arm.cc +175 -0
  13. package/src/crc_common.h +4 -0
  14. package/{crc_folding.c → src/crc_folding.cc} +175 -185
  15. package/src/decoder.cc +61 -0
  16. package/src/decoder.h +53 -0
  17. package/src/decoder_avx.cc +18 -0
  18. package/src/decoder_avx2.cc +18 -0
  19. package/src/decoder_avx2_base.h +615 -0
  20. package/src/decoder_common.h +512 -0
  21. package/src/decoder_neon.cc +474 -0
  22. package/src/decoder_neon64.cc +451 -0
  23. package/src/decoder_sse2.cc +16 -0
  24. package/src/decoder_sse_base.h +711 -0
  25. package/src/decoder_ssse3.cc +18 -0
  26. package/src/encoder.cc +170 -0
  27. package/src/encoder.h +21 -0
  28. package/src/encoder_avx.cc +16 -0
  29. package/src/encoder_avx2.cc +16 -0
  30. package/src/encoder_avx_base.h +564 -0
  31. package/src/encoder_common.h +109 -0
  32. package/src/encoder_neon.cc +547 -0
  33. package/src/encoder_sse2.cc +13 -0
  34. package/src/encoder_sse_base.h +724 -0
  35. package/src/encoder_ssse3.cc +18 -0
  36. package/src/hedley.h +1899 -0
  37. package/src/platform.cc +147 -0
  38. package/src/yencode.cc +449 -0
  39. package/test/_maxsize.js +9 -0
  40. package/test/_speedbase.js +147 -0
  41. package/test/speedcrc.js +20 -0
  42. package/test/speeddec.js +92 -0
  43. package/test/speedenc.js +44 -0
  44. package/{testcrc.js → test/testcrc.js} +53 -39
  45. package/test/testdec.js +183 -0
  46. package/test/testenc.js +163 -0
  47. package/test/testpostdec.js +126 -0
  48. package/test.js +0 -91
  49. package/yencode.cc +0 -1622
package/{crc_folding.c → src/crc_folding.cc} RENAMED
@@ -17,158 +17,54 @@
  * For conditions of distribution and use, see copyright notice in zlib.h
  */

- #ifdef X86_PCLMULQDQ_CRC
-
+ #include "crc_common.h"
+
+ #if (defined(__PCLMUL__) && defined(__SSSE3__) && defined(__SSE4_1__)) || (defined(_MSC_VER) && _MSC_VER >= 1600 && defined(PLATFORM_X86))
  #include <inttypes.h>
  #include <immintrin.h>
  #include <wmmintrin.h>

- #define local static

- #ifdef _MSC_VER
- #define ALIGN(_a, v) __declspec(align(_a)) v
- #else
- #define ALIGN(_a, v) v __attribute__((aligned(_a)))
+ #if defined(__AVX512VL__) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
+ # define ENABLE_AVX512 1
  #endif


- local void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
- const __m128i xmm_fold4 = _mm_set_epi32(
- 0x00000001, 0x54442bd4,
- 0x00000001, 0xc6e41596);
-
- __m128i x_tmp3;
- __m128 ps_crc0, ps_crc3, ps_res;
-
- x_tmp3 = *xmm_crc3;
-
- *xmm_crc3 = *xmm_crc0;
- *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
- *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
- ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
- ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
- ps_res = _mm_xor_ps(ps_crc0, ps_crc3);
-
- *xmm_crc0 = *xmm_crc1;
- *xmm_crc1 = *xmm_crc2;
- *xmm_crc2 = x_tmp3;
- *xmm_crc3 = _mm_castps_si128(ps_res);
- }
-
- local void fold_2(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
- const __m128i xmm_fold4 = _mm_set_epi32(
- 0x00000001, 0x54442bd4,
- 0x00000001, 0xc6e41596);
-
- __m128i x_tmp3, x_tmp2;
- __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res31, ps_res20;
-
- x_tmp3 = *xmm_crc3;
- x_tmp2 = *xmm_crc2;
-
- *xmm_crc3 = *xmm_crc1;
- *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
- *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
- ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
- ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
- ps_res31 = _mm_xor_ps(ps_crc3, ps_crc1);
-
- *xmm_crc2 = *xmm_crc0;
- *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
- *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
- ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
- ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
- ps_res20 = _mm_xor_ps(ps_crc0, ps_crc2);
-
- *xmm_crc0 = x_tmp2;
- *xmm_crc1 = x_tmp3;
- *xmm_crc2 = _mm_castps_si128(ps_res20);
- *xmm_crc3 = _mm_castps_si128(ps_res31);
+ // interestingly, MSVC seems to generate better code if using VXORPS over VPXOR
+ // original Intel code uses XORPS for many XOR operations, but PXOR is pretty much always better (more port freedom on Intel CPUs). The only advantage of XORPS is that it's 1 byte shorter, an advantage which disappears under AVX as both instructions have the same length
+ #if defined(__AVX__) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
+ # define fold_xor _mm_xor_si128
+ #else
+ static __m128i fold_xor(__m128i a, __m128i b) {
+ return _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b)));
  }
+ #endif

- local void fold_3(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
+ #ifdef ENABLE_AVX512
+ static __m128i do_one_fold_merge(__m128i src, __m128i data) {
  const __m128i xmm_fold4 = _mm_set_epi32(
  0x00000001, 0x54442bd4,
  0x00000001, 0xc6e41596);
-
- __m128i x_tmp3;
- __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res32, ps_res21, ps_res10;
-
- x_tmp3 = *xmm_crc3;
-
- *xmm_crc3 = *xmm_crc2;
- *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
- *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
- ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
- ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
- ps_res32 = _mm_xor_ps(ps_crc2, ps_crc3);
-
- *xmm_crc2 = *xmm_crc1;
- *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
- *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
- ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
- ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
- ps_res21 = _mm_xor_ps(ps_crc1, ps_crc2);
-
- *xmm_crc1 = *xmm_crc0;
- *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
- *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
- ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
- ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
- ps_res10 = _mm_xor_ps(ps_crc0, ps_crc1);
-
- *xmm_crc0 = x_tmp3;
- *xmm_crc1 = _mm_castps_si128(ps_res10);
- *xmm_crc2 = _mm_castps_si128(ps_res21);
- *xmm_crc3 = _mm_castps_si128(ps_res32);
+ return _mm_ternarylogic_epi32(
+ _mm_clmulepi64_si128(src, xmm_fold4, 0x01),
+ _mm_clmulepi64_si128(src, xmm_fold4, 0x10),
+ data,
+ 0x96
+ );
  }
-
- local void fold_4(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
+ #else
+ static __m128i do_one_fold(__m128i src) {
  const __m128i xmm_fold4 = _mm_set_epi32(
  0x00000001, 0x54442bd4,
  0x00000001, 0xc6e41596);
-
- __m128i x_tmp0, x_tmp1, x_tmp2, x_tmp3;
- __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3;
- __m128 ps_t0, ps_t1, ps_t2, ps_t3;
- __m128 ps_res0, ps_res1, ps_res2, ps_res3;
-
- x_tmp0 = *xmm_crc0;
- x_tmp1 = *xmm_crc1;
- x_tmp2 = *xmm_crc2;
- x_tmp3 = *xmm_crc3;
-
- *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
- x_tmp0 = _mm_clmulepi64_si128(x_tmp0, xmm_fold4, 0x10);
- ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
- ps_t0 = _mm_castsi128_ps(x_tmp0);
- ps_res0 = _mm_xor_ps(ps_crc0, ps_t0);
-
- *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
- x_tmp1 = _mm_clmulepi64_si128(x_tmp1, xmm_fold4, 0x10);
- ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
- ps_t1 = _mm_castsi128_ps(x_tmp1);
- ps_res1 = _mm_xor_ps(ps_crc1, ps_t1);
-
- *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
- x_tmp2 = _mm_clmulepi64_si128(x_tmp2, xmm_fold4, 0x10);
- ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
- ps_t2 = _mm_castsi128_ps(x_tmp2);
- ps_res2 = _mm_xor_ps(ps_crc2, ps_t2);
-
- *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01);
- x_tmp3 = _mm_clmulepi64_si128(x_tmp3, xmm_fold4, 0x10);
- ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
- ps_t3 = _mm_castsi128_ps(x_tmp3);
- ps_res3 = _mm_xor_ps(ps_crc3, ps_t3);
-
- *xmm_crc0 = _mm_castps_si128(ps_res0);
- *xmm_crc1 = _mm_castps_si128(ps_res1);
- *xmm_crc2 = _mm_castps_si128(ps_res2);
- *xmm_crc3 = _mm_castps_si128(ps_res3);
+ return fold_xor(
+ _mm_clmulepi64_si128(src, xmm_fold4, 0x01),
+ _mm_clmulepi64_si128(src, xmm_fold4, 0x10)
+ );
  }
+ #endif

- ALIGN(32, local const unsigned pshufb_shf_table[60]) = {
+ ALIGN_TO(32, static const unsigned pshufb_shf_table[60]) = {
  0x84838281, 0x88878685, 0x8c8b8a89, 0x008f8e8d, /* shl 15 (16 - 1)/shr1 */
  0x85848382, 0x89888786, 0x8d8c8b8a, 0x01008f8e, /* shl 14 (16 - 3)/shr2 */
  0x86858483, 0x8a898887, 0x8e8d8c8b, 0x0201008f, /* shl 13 (16 - 4)/shr3 */
@@ -186,17 +82,13 @@ ALIGN(32, local const unsigned pshufb_shf_table[60]) = {
  0x0201008f, 0x06050403, 0x0a090807, 0x0e0d0c0b /* shl 1 (16 -15)/shr15*/
  };

- local void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1,
+ static void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1,
  __m128i *xmm_crc2, __m128i *xmm_crc3, __m128i *xmm_crc_part) {

- const __m128i xmm_fold4 = _mm_set_epi32(
- 0x00000001, 0x54442bd4,
- 0x00000001, 0xc6e41596);
  const __m128i xmm_mask3 = _mm_set1_epi32(0x80808080);

  __m128i xmm_shl, xmm_shr, xmm_tmp1, xmm_tmp2, xmm_tmp3;
- __m128i xmm_a0_0, xmm_a0_1;
- __m128 ps_crc3, psa0_0, psa0_1, ps_res;
+ __m128i xmm_a0_0;

  xmm_shl = _mm_load_si128((__m128i *)pshufb_shf_table + (len - 1));
  xmm_shr = xmm_shl;
@@ -220,41 +112,82 @@ local void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1,
  *xmm_crc_part = _mm_shuffle_epi8(*xmm_crc_part, xmm_shl);
  *xmm_crc3 = _mm_or_si128(*xmm_crc3, *xmm_crc_part);

- xmm_a0_1 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x10);
- xmm_a0_0 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x01);
-
- ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
- psa0_0 = _mm_castsi128_ps(xmm_a0_0);
- psa0_1 = _mm_castsi128_ps(xmm_a0_1);
-
- ps_res = _mm_xor_ps(ps_crc3, psa0_0);
- ps_res = _mm_xor_ps(ps_res, psa0_1);
-
- *xmm_crc3 = _mm_castps_si128(ps_res);
+ #ifdef ENABLE_AVX512
+ *xmm_crc3 = do_one_fold_merge(xmm_a0_0, *xmm_crc3);
+ #else
+ *xmm_crc3 = fold_xor(
+ do_one_fold(xmm_a0_0),
+ *xmm_crc3
+ );
+ #endif
  }

- ALIGN(16, local const unsigned crc_k[]) = {
+ ALIGN_TO(16, static const unsigned crc_k[]) = {
  0xccaa009e, 0x00000000, /* rk1 */
  0x751997d0, 0x00000001, /* rk2 */
  0xccaa009e, 0x00000000, /* rk5 */
  0x63cd6124, 0x00000001, /* rk6 */
- 0xf7011640, 0x00000001, /* rk7 */
+ 0xf7011641, 0x00000000, /* rk7 */
  0xdb710640, 0x00000001 /* rk8 */
  };

- ALIGN(16, local const unsigned crc_mask[4]) = {
- 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000
- };
-
- ALIGN(16, local const unsigned crc_mask2[4]) = {
+ ALIGN_TO(16, static const unsigned crc_mask[4]) = {
  0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
  };

- uint32_t crc_fold(const unsigned char *src, long len) {
+ static __m128i reverse_bits_epi8(__m128i src) {
+ #if defined(__GFNI__) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
+ return _mm_gf2p8affine_epi64_epi8(src, _mm_set_epi32(
+ 0x80402010, 0x08040201,
+ 0x80402010, 0x08040201
+ ), 0);
+ #else
+ __m128i xmm_t0 = _mm_and_si128(src, _mm_set1_epi8(0x0f));
+ __m128i xmm_t1 = _mm_and_si128(_mm_srli_epi16(src, 4), _mm_set1_epi8(0x0f));
+ xmm_t0 = _mm_shuffle_epi8(_mm_set_epi8(
+ -16, 112, -80, 48, -48, 80, -112, 16, -32, 96, -96, 32, -64, 64, -128, 0
+ //0xf0, 0x70, 0xb0, 0x30, 0xd0, 0x50, 0x90, 0x10, 0xe0, 0x60, 0xa0, 0x20, 0xc0, 0x40, 0x80, 0
+ ), xmm_t0);
+ xmm_t1 = _mm_shuffle_epi8(_mm_set_epi8(
+ 15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0
+ ), xmm_t1);
+ return _mm_or_si128(xmm_t0, xmm_t1);
+ #endif
+ }
+
+ #ifdef _MSC_VER
+ // because MSVC doesn't use BSWAP unless you specifically tell it to...
+ # include <stdlib.h>
+ # define BSWAP32 _byteswap_ulong
+ #else
+ # define BSWAP32(n) ((((n)&0xff)<<24) | (((n)&0xff00)<<8) | (((n)&0xff0000)>>8) | (((n)&0xff000000)>>24))
+ #endif
+
+ static uint32_t crc_fold(const unsigned char *src, long len, uint32_t initial) {
  unsigned long algn_diff;
  __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;

- __m128i xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
+ // TODO: consider calculating this via a LUT instead (probably faster)
+ // info from https://www.reddit.com/r/ReverseEngineering/comments/2zwhl3/mystery_constant_0x9db42487_in_intels_crc32ieee/
+ // firstly, calculate: xmm_crc0 = (intial * 0x487b9c8a) mod 0x104c11db7, where 0x487b9c8a = inverse(1<<512) mod 0x104c11db7
+
+ // reverse input bits + load into XMM register
+ uint32_t init_t = BSWAP32(initial);
+ xmm_t0 = reverse_bits_epi8(_mm_cvtsi32_si128(~init_t));
+
+ xmm_t0 = _mm_clmulepi64_si128(xmm_t0, _mm_cvtsi32_si128(0x487b9c8a), 0);
+ xmm_t1 = _mm_and_si128(xmm_t0, _mm_set_epi32(-1,-1,-1,0)); // shifted up by 32bits to avoid shifts by using clmul's capability to select top 64bits instead
+ xmm_t2 = _mm_set_epi32( // polynomial reduction factors
+ 0, 0x04c11db7, // G*
+ 1, 0x04d101df // Q+
+ );
+ xmm_t1 = _mm_clmulepi64_si128(xmm_t1, xmm_t2, 0);
+ xmm_t1 = _mm_clmulepi64_si128(xmm_t1, xmm_t2, 0x11);
+
+ __m128i xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_t1);
+ // reverse bits
+ xmm_crc0 = _mm_shuffle_epi8(reverse_bits_epi8(xmm_crc0), _mm_set_epi32(-1,-1,-1,0x00010203));
+
  __m128i xmm_crc1 = _mm_setzero_si128();
  __m128i xmm_crc2 = _mm_setzero_si128();
  __m128i xmm_crc3 = _mm_setzero_si128();
@@ -262,7 +195,7 @@ uint32_t crc_fold(const unsigned char *src, long len) {

  if (len < 16) {
  if (len == 0)
- return 0;
+ return initial;
  xmm_crc_part = _mm_loadu_si128((__m128i *)src);
  goto partial;
  }
@@ -284,12 +217,22 @@ uint32_t crc_fold(const unsigned char *src, long len) {
  xmm_t2 = _mm_load_si128((__m128i *)src + 2);
  xmm_t3 = _mm_load_si128((__m128i *)src + 3);

- fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
-
+ #ifdef ENABLE_AVX512
+ xmm_crc0 = do_one_fold_merge(xmm_crc0, xmm_t0);
+ xmm_crc1 = do_one_fold_merge(xmm_crc1, xmm_t1);
+ xmm_crc2 = do_one_fold_merge(xmm_crc2, xmm_t2);
+ xmm_crc3 = do_one_fold_merge(xmm_crc3, xmm_t3);
+ #else
+ // nesting do_one_fold() in _mm_xor_si128() seems to cause MSVC to generate horrible code, so separate it out
+ xmm_crc0 = do_one_fold(xmm_crc0);
+ xmm_crc1 = do_one_fold(xmm_crc1);
+ xmm_crc2 = do_one_fold(xmm_crc2);
+ xmm_crc3 = do_one_fold(xmm_crc3);
  xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
  xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
  xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
  xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
+ #endif

  src += 64;
  }
@@ -304,11 +247,20 @@ uint32_t crc_fold(const unsigned char *src, long len) {
  xmm_t1 = _mm_load_si128((__m128i *)src + 1);
  xmm_t2 = _mm_load_si128((__m128i *)src + 2);

- fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
-
- xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
- xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
+ xmm_t3 = xmm_crc3;
+ #ifdef ENABLE_AVX512
+ xmm_crc3 = do_one_fold_merge(xmm_crc2, xmm_t2);
+ xmm_crc2 = do_one_fold_merge(xmm_crc1, xmm_t1);
+ xmm_crc1 = do_one_fold_merge(xmm_crc0, xmm_t0);
+ #else
+ xmm_crc3 = do_one_fold(xmm_crc2);
+ xmm_crc2 = do_one_fold(xmm_crc1);
+ xmm_crc1 = do_one_fold(xmm_crc0);
  xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
+ xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
+ xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
+ #endif
+ xmm_crc0 = xmm_t3;

  if (len == 0)
  goto done;
@@ -320,10 +272,19 @@ uint32_t crc_fold(const unsigned char *src, long len) {
  xmm_t0 = _mm_load_si128((__m128i *)src);
  xmm_t1 = _mm_load_si128((__m128i *)src + 1);

- fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
-
- xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
+ xmm_t2 = xmm_crc2;
+ xmm_t3 = xmm_crc3;
+ #ifdef ENABLE_AVX512
+ xmm_crc3 = do_one_fold_merge(xmm_crc1, xmm_t1);
+ xmm_crc2 = do_one_fold_merge(xmm_crc0, xmm_t0);
+ #else
+ xmm_crc3 = do_one_fold(xmm_crc1);
+ xmm_crc2 = do_one_fold(xmm_crc0);
  xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
+ xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
+ #endif
+ xmm_crc1 = xmm_t3;
+ xmm_crc0 = xmm_t2;

  if (len == 0)
  goto done;
@@ -334,9 +295,15 @@ uint32_t crc_fold(const unsigned char *src, long len) {

  xmm_t0 = _mm_load_si128((__m128i *)src);

- fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
-
- xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
+ xmm_t3 = xmm_crc3;
+ #ifdef ENABLE_AVX512
+ xmm_crc3 = do_one_fold_merge(xmm_crc0, xmm_t0);
+ #else
+ xmm_crc3 = _mm_xor_si128(do_one_fold(xmm_crc0), xmm_t0);
+ #endif
+ xmm_crc0 = xmm_crc1;
+ xmm_crc1 = xmm_crc2;
+ xmm_crc2 = xmm_t3;

  if (len == 0)
  goto done;
@@ -354,10 +321,7 @@ partial:
  &xmm_crc_part);
  done:
  {
- const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask);
- const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
-
- uint32_t crc;
+ const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask);
  __m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;

  /*
@@ -367,18 +331,30 @@ done:

  x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
  xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
+ #ifdef ENABLE_AVX512
+ xmm_crc1 = _mm_ternarylogic_epi32(xmm_crc1, x_tmp0, xmm_crc0, 0x96);
+ #else
  xmm_crc1 = _mm_xor_si128(xmm_crc1, x_tmp0);
  xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_crc0);
+ #endif

  x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10);
  xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01);
+ #ifdef ENABLE_AVX512
+ xmm_crc2 = _mm_ternarylogic_epi32(xmm_crc2, x_tmp1, xmm_crc1, 0x96);
+ #else
  xmm_crc2 = _mm_xor_si128(xmm_crc2, x_tmp1);
  xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_crc1);
+ #endif

  x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10);
  xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01);
+ #ifdef ENABLE_AVX512
+ xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, x_tmp2, xmm_crc2, 0x96);
+ #else
  xmm_crc3 = _mm_xor_si128(xmm_crc3, x_tmp2);
  xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
+ #endif

  /*
  * k5
@@ -393,29 +369,43 @@ done:
  xmm_crc0 = xmm_crc3;
  xmm_crc3 = _mm_slli_si128(xmm_crc3, 4);
  xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
+ #ifdef ENABLE_AVX512
+ //xmm_crc3 = _mm_maskz_xor_epi32(14, xmm_crc3, xmm_crc0);
+ xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, xmm_crc0, xmm_mask, 0x28);
+ #else
+ xmm_crc0 = _mm_and_si128(xmm_crc0, xmm_mask);
  xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
- xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask2);
+ #endif

  /*
  * k7
  */
  xmm_crc1 = xmm_crc3;
- xmm_crc2 = xmm_crc3;
  crc_fold = _mm_load_si128((__m128i *)crc_k + 2);

  xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
- xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
- xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask);
-
- xmm_crc2 = xmm_crc3;
  xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
- xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
+ #ifdef ENABLE_AVX512
+ xmm_crc3 = _mm_ternarylogic_epi32(xmm_crc3, xmm_crc1, xmm_crc1, 0xC3); // NOT(xmm_crc3 ^ xmm_crc1)
+ #else
+ xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_mask);
  xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);
+ #endif
+ return _mm_extract_epi32(xmm_crc3, 2);
+ }

- crc = _mm_extract_epi32(xmm_crc3, 2);
- return ~crc;
  }

+ static uint32_t do_crc32_incremental_clmul(const void* data, size_t length, uint32_t init) {
+ return crc_fold((const unsigned char*)data, (long)length, init);
+ }
+
+ void crc_clmul_set_funcs(crc_func* _do_crc32_incremental) {
+ *_do_crc32_incremental = &do_crc32_incremental_clmul;
+ }
+ #else
+ void crc_clmul_set_funcs(crc_func* _do_crc32_incremental) {
+ (void)_do_crc32_incremental;
  }
  #endif
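
A note on the reworked crc_fold() above: it now accepts the running CRC as `initial` and folds it into the starting state (the in-code comment describes this as multiplying by the inverse of 2^512 modulo the CRC polynomial), so the CLMUL path can be driven incrementally through crc_clmul_set_funcs(). Below is a minimal caller sketch; the crc_func typedef is assumed to match the signature of do_crc32_incremental_clmul(), since crc_common.h is not part of this diff, and the chaining behaviour is inferred from crc_fold() returning `initial` for empty input.

    #include <stddef.h>
    #include <stdint.h>

    /* Assumed to mirror the crc_func typedef from crc_common.h (header not shown in this diff). */
    typedef uint32_t (*crc_func)(const void* data, size_t length, uint32_t init);
    void crc_clmul_set_funcs(crc_func* _do_crc32_incremental);

    static crc_func do_crc32_incremental; /* stays NULL if the PCLMUL path wasn't compiled in (the stub above doesn't set it) */

    uint32_t crc32_two_chunks(const void* a, size_t alen, const void* b, size_t blen) {
        if (!do_crc32_incremental) crc_clmul_set_funcs(&do_crc32_incremental);
        /* 0 is the CRC32 of empty input; crc_fold() returns `initial` unchanged when len == 0 */
        uint32_t crc = do_crc32_incremental(a, alen, 0);
        /* feeding the previous result back in should match a one-shot CRC32 over a||b */
        return do_crc32_incremental(b, blen, crc);
    }

The package itself presumably dispatches between this and the other CRC implementations listed above (crc.cc, crc_arm.cc); the snippet only illustrates the chained-initial-value contract.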
 
package/src/decoder.cc ADDED
@@ -0,0 +1,61 @@
+ #include "common.h"
+
+ #include "decoder_common.h"
+ #include "decoder.h"
+
+ extern "C" {
+ YencDecoderEnd (*_do_decode)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*) = &do_decode_scalar<false, false>;
+ YencDecoderEnd (*_do_decode_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*) = &do_decode_scalar<true, false>;
+ YencDecoderEnd (*_do_decode_end_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*) = &do_decode_end_scalar<true>;
+ }
+
+ void decoder_set_sse2_funcs();
+ void decoder_set_ssse3_funcs();
+ void decoder_set_avx_funcs();
+ void decoder_set_avx2_funcs();
+ void decoder_set_neon_funcs();
+
+
+ #if defined(PLATFORM_X86) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
+ # if defined(__AVX2__) && !defined(YENC_DISABLE_AVX256)
+ # include "decoder_avx2_base.h"
+ static inline void decoder_set_native_funcs() {
+ ALIGN_ALLOC(lookups, sizeof(*lookups), 16);
+ decoder_init_lut(lookups->eqFix, lookups->compact);
+ _do_decode = &do_decode_simd<false, false, sizeof(__m256i)*2, do_decode_avx2<false, false, ISA_NATIVE> >;
+ _do_decode_raw = &do_decode_simd<true, false, sizeof(__m256i)*2, do_decode_avx2<true, false, ISA_NATIVE> >;
+ _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m256i)*2, do_decode_avx2<true, true, ISA_NATIVE> >;
+ }
+ # else
+ # include "decoder_sse_base.h"
+ static inline void decoder_set_native_funcs() {
+ decoder_sse_init();
+ decoder_init_lut(lookups->eqFix, lookups->compact);
+ _do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_NATIVE> >;
+ _do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_NATIVE> >;
+ _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_NATIVE> >;
+ }
+ # endif
+ #endif
+
+ void decoder_init() {
+ #ifdef PLATFORM_X86
+ # if defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
+ decoder_set_native_funcs();
+ # else
+ int use_isa = cpu_supports_isa();
+ if(use_isa >= ISA_LEVEL_AVX2)
+ decoder_set_avx2_funcs();
+ else if(use_isa >= ISA_LEVEL_AVX)
+ decoder_set_avx_funcs();
+ else if(use_isa >= ISA_LEVEL_SSSE3)
+ decoder_set_ssse3_funcs();
+ else
+ decoder_set_sse2_funcs();
+ # endif
+ #endif
+ #ifdef PLATFORM_ARM
+ if(cpu_supports_neon())
+ decoder_set_neon_funcs();
+ #endif
+ }
package/src/decoder.h ADDED
@@ -0,0 +1,53 @@
+ #ifndef __YENC_DECODER_H
+ #define __YENC_DECODER_H
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+
+
+ // the last state that the decoder was in (i.e. last few characters processed)
+ // the state is needed for incremental decoders as its behavior is affected by what it processed last
+ // acronyms: CR = carriage return (\r), LF = line feed (\n), EQ = equals char, DT = dot char (.)
+ typedef enum {
+ YDEC_STATE_CRLF, // default
+ YDEC_STATE_EQ,
+ YDEC_STATE_CR,
+ YDEC_STATE_NONE,
+ YDEC_STATE_CRLFDT,
+ YDEC_STATE_CRLFDTCR,
+ YDEC_STATE_CRLFEQ // may actually be "\r\n.=" in raw decoder
+ } YencDecoderState;
+
+ // end result for incremental processing (whether the end of the yEnc data was reached)
+ typedef enum {
+ YDEC_END_NONE, // end not reached
+ YDEC_END_CONTROL, // \r\n=y sequence found, src points to byte after 'y'
+ YDEC_END_ARTICLE // \r\n.\r\n sequence found, src points to byte after last '\n'
+ } YencDecoderEnd;
+
+ #include "hedley.h"
+
+ extern YencDecoderEnd (*_do_decode)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*);
+ extern YencDecoderEnd (*_do_decode_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*);
+ extern YencDecoderEnd (*_do_decode_end_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*);
+
+ static inline size_t do_decode(int isRaw, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, YencDecoderState* state) {
+ unsigned char* ds = dest;
+ (*(isRaw ? _do_decode_raw : _do_decode))(&src, &ds, len, state);
+ return ds - dest;
+ }
+
+ static inline YencDecoderEnd do_decode_end(const unsigned char*HEDLEY_RESTRICT* src, unsigned char*HEDLEY_RESTRICT* dest, size_t len, YencDecoderState* state) {
+ return _do_decode_end_raw(src, dest, len, state);
+ }
+
+ void decoder_init();
+
+
+
+ #ifdef __cplusplus
+ }
+ #endif
+ #endif
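
The decoder entry points declared above are function pointers selected at runtime by decoder_init(). A sketch of how a caller might drive them incrementally follows; the buffer sizing is this example's assumption (yEnc decoding only removes bytes, so a `dest` at least as large as `src` is taken to be sufficient), and the helper names are hypothetical.

    #include "decoder.h"

    /* Decode one chunk of a raw (dot-stuffed NNTP) yEnc stream; returns bytes written to dest. */
    static size_t decode_chunk(const unsigned char* src, size_t len, unsigned char* dest,
                               YencDecoderState* state) {
        return do_decode(1 /* isRaw */, src, dest, len, state);
    }

    /* Typical setup: pick the best SIMD kernel once, then feed chunks while carrying the state. */
    void decode_two_parts(const unsigned char* part1, size_t len1,
                          const unsigned char* part2, size_t len2, unsigned char* out) {
        decoder_init();                           /* selects SSE2/SSSE3/AVX/AVX2/NEON at runtime */
        YencDecoderState state = YDEC_STATE_CRLF; /* documented default starting state */
        size_t written = decode_chunk(part1, len1, out, &state);
        written += decode_chunk(part2, len2, out + written, &state);
        (void)written;
    }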
package/src/decoder_avx.cc ADDED
@@ -0,0 +1,18 @@
+ #include "common.h"
+
+ #if defined(__AVX__) && defined(__POPCNT__)
+ #include "decoder_common.h"
+ #include "decoder_sse_base.h"
+ void decoder_set_avx_funcs() {
+ decoder_sse_init();
+ decoder_init_lut(lookups->eqFix, lookups->compact);
+ _do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_SSE4_POPCNT> >;
+ _do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_SSE4_POPCNT> >;
+ _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_SSE4_POPCNT> >;
+ }
+ #else
+ void decoder_set_ssse3_funcs();
+ void decoder_set_avx_funcs() {
+ decoder_set_ssse3_funcs();
+ }
+ #endif
package/src/decoder_avx2.cc ADDED
@@ -0,0 +1,18 @@
+ #include "common.h"
+
+ #if defined(__AVX2__) && !defined(YENC_DISABLE_AVX256)
+ #include "decoder_common.h"
+ #include "decoder_avx2_base.h"
+ void decoder_set_avx2_funcs() {
+ ALIGN_ALLOC(lookups, sizeof(*lookups), 16);
+ decoder_init_lut(lookups->eqFix, lookups->compact);
+ _do_decode = &do_decode_simd<false, false, sizeof(__m256i)*2, do_decode_avx2<false, false, ISA_LEVEL_AVX2> >;
+ _do_decode_raw = &do_decode_simd<true, false, sizeof(__m256i)*2, do_decode_avx2<true, false, ISA_LEVEL_AVX2> >;
+ _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m256i)*2, do_decode_avx2<true, true, ISA_LEVEL_AVX2> >;
+ }
+ #else
+ void decoder_set_avx_funcs();
+ void decoder_set_avx2_funcs() {
+ decoder_set_avx_funcs();
+ }
+ #endif