yencode 1.1.2 → 1.1.4
This diff compares the contents of publicly released package versions as published to their public registry; it is provided for informational purposes only.
- package/README.md +3 -2
- package/binding.gyp +141 -6
- package/index.js +21 -19
- package/package.json +2 -1
- package/src/common.h +34 -19
- package/src/crc.cc +138 -11
- package/src/crc_arm.cc +42 -7
- package/src/crc_folding.cc +18 -53
- package/src/crc_folding_256.cc +229 -0
- package/src/decoder.cc +8 -4
- package/src/decoder.h +5 -5
- package/src/decoder_avx2_base.h +30 -13
- package/src/decoder_common.h +5 -5
- package/src/decoder_neon.cc +4 -4
- package/src/decoder_neon64.cc +10 -7
- package/src/decoder_sse_base.h +26 -12
- package/src/decoder_vbmi2.cc +37 -0
- package/src/encoder.cc +10 -1
- package/src/encoder_avx_base.h +24 -16
- package/src/encoder_neon.cc +40 -41
- package/src/encoder_rvv.cc +219 -0
- package/src/encoder_sse_base.h +7 -8
- package/src/encoder_vbmi2.cc +30 -0
- package/src/hedley.h +278 -135
- package/src/platform.cc +79 -10
- package/src/test_alignalloc.c +6 -0
- package/test/_speedbase.js +12 -11
- package/test/speeddec.js +6 -5
- package/test/testcrc.js +14 -0
- package/test/testdec.js +30 -14
- package/test/testenc.js +10 -7
- package/test/testpostdec.js +6 -5
package/src/encoder_rvv.cc
ADDED

@@ -0,0 +1,219 @@
+#include "common.h"
+
+#ifdef __riscv_vector
+#include "encoder.h"
+#include "encoder_common.h"
+
+# include <riscv_vector.h>
+# if defined(__clang__) && __clang_major__ < 16
+# define RV(f) f
+# else
+# define RV(f) __riscv_##f
+# endif
+
+
+static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RESTRICT _src, long& inpos, uint8_t*& outp, long& col, long lineSizeOffset) {
+	// TODO: vectorize
+	uint8_t c = _src[inpos++];
+	if(HEDLEY_UNLIKELY(escapedLUT[c] && c != '.'-42)) {
+		memcpy(outp, &escapedLUT[c], sizeof(uint16_t));
+		outp += 2;
+	} else {
+		*(outp++) = c + 42;
+	}
+
+	c = _src[inpos++];
+	if(LIKELIHOOD(0.0273, escapedLUT[c]!=0)) {
+		uint32_t w = UINT32_16_PACK(UINT16_PACK('\r', '\n'), (uint32_t)escapedLUT[c]);
+		memcpy(outp, &w, sizeof(w));
+		outp += 4;
+		col = lineSizeOffset + 2;
+	} else {
+		uint32_t w = UINT32_PACK('\r', '\n', (uint32_t)(c+42), 0);
+		memcpy(outp, &w, sizeof(w));
+		outp += 3;
+		col = lineSizeOffset + 1;
+	}
+}
+
+
+HEDLEY_ALWAYS_INLINE void do_encode_rvv(int line_size, int* colOffset, const uint8_t* HEDLEY_RESTRICT srcEnd, uint8_t* HEDLEY_RESTRICT& dest, size_t& len) {
+	size_t vl2 = RV(vsetvlmax_e8m2)(); // TODO: limit to line length
+	// TODO: have a LMUL=1 variant if line_size < vl
+
+	// offset position to enable simpler loop condition checking
+	const int INPUT_OFFSET = vl2*2 -1; // extra chars for EOL handling, -1 to change <= to <
+	if((intptr_t)len <= INPUT_OFFSET || line_size < (int)vl2*2) return;
+
+	uint8_t *outp = dest;
+	long inpos = -(long)len;
+	long lineSizeOffset = -line_size +1;
+	long col = *colOffset - line_size +1;
+
+	inpos += INPUT_OFFSET;
+	const uint8_t* _src = srcEnd - INPUT_OFFSET;
+
+	if (HEDLEY_LIKELY(col == -line_size+1)) {
+		uint8_t c = _src[inpos++];
+		if (LIKELIHOOD(0.0273, escapedLUT[c] != 0)) {
+			memcpy(outp, escapedLUT + c, 2);
+			outp += 2;
+			col += 2;
+		} else {
+			*(outp++) = c + 42;
+			col += 1;
+		}
+	}
+	if(HEDLEY_UNLIKELY(col >= 0)) {
+		if(col == 0)
+			encode_eol_handle_pre(_src, inpos, outp, col, lineSizeOffset);
+		else {
+			uint8_t c = _src[inpos++];
+			if(LIKELIHOOD(0.0273, escapedLUT[c]!=0)) {
+				uint32_t v = UINT32_16_PACK(UINT16_PACK('\r', '\n'), (uint32_t)escapedLUT[c]);
+				memcpy(outp, &v, sizeof(v));
+				outp += 4;
+				col = 2-line_size + 1;
+			} else {
+				uint32_t v = UINT32_PACK('\r', '\n', (uint32_t)(c+42), 0);
+				memcpy(outp, &v, sizeof(v));
+				outp += 3;
+				col = 2-line_size;
+			}
+		}
+	}
+
+	// vector constants
+	const vuint8mf2_t ALT_SHIFT = RV(vreinterpret_v_u16mf2_u8mf2)(RV(vmv_v_x_u16mf2)(4, vl2));
+	const uint8_t _MASK_EXPAND[] = {0xAA, 0xAB, 0xAE, 0xAF, 0xBA, 0xBB, 0xBE, 0xBF, 0xEA, 0xEB, 0xEE, 0xEF, 0xFA, 0xFB, 0xFE, 0xFF};
+	const vuint8m1_t MASK_EXPAND = RV(vle8_v_u8m1)(_MASK_EXPAND, 16);
+
+
+	// TODO: consider exploiting partial vector capability
+	while(inpos < 0) {
+		vuint8m2_t data = RV(vle8_v_u8m2)(_src + inpos, vl2);
+		inpos += vl2;
+
+		// search for special chars
+		// TODO: vrgather strat
+
+		vuint8m2_t tmpData = RV(vsub_vx_u8m2)(data, -42, vl2);
+		vbool4_t cmp = RV(vmor_mm_b4)(
+			RV(vmor_mm_b4)(
+				RV(vmseq_vx_u8m2_b4)(data, -42, vl2),
+				RV(vmseq_vx_u8m2_b4)(tmpData, '=', vl2),
+				vl2
+			),
+			RV(vmor_mm_b4)(
+				RV(vmseq_vx_u8m2_b4)(data, '\r'-42, vl2),
+				RV(vmseq_vx_u8m2_b4)(data, '\n'-42, vl2),
+				vl2
+			),
+			vl2
+		);
+
+#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 13000
+		data = RV(vor_vx_u8m2_mu)(cmp, tmpData, tmpData, 64, vl2);
+#else
+		data = RV(vor_vx_u8m2_m)(cmp, tmpData, tmpData, 64, vl2);
+#endif
+
+		int idx;
+		size_t count = RV(vcpop_m_b4)(cmp, vl2);
+		if(count > 1) {
+			// widen mask: 4b->8b
+#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 13000
+			vuint8mf4_t vcmp = RV(vlmul_trunc_v_u8m1_u8mf4)(RV(vreinterpret_v_b4_u8m1)(cmp));
+#else
+			vuint8mf4_t vcmp = *(vuint8mf4_t*)(&cmp);
+#endif
+			// TODO: use vwsll instead if available
+			// - is clmul useful here?
+			vuint8mf2_t xcmp = RV(vreinterpret_v_u16mf2_u8mf2)(RV(vwmulu_vx_u16mf2)(vcmp, 16, vl2));
+			xcmp = RV(vsrl_vv_u8mf2)(xcmp, ALT_SHIFT, vl2);
+
+			// expand mask by inserting '1' between each bit (0000abcd -> 1a1b1c1d)
+			vuint8m1_t xcmpTmp = RV(vrgather_vv_u8m1)(MASK_EXPAND, RV(vlmul_ext_v_u8mf2_u8m1)(xcmp), vl2);
+#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 13000
+			vbool2_t cmpmask = RV(vreinterpret_b2)(xcmpTmp);
+#else
+			vbool2_t cmpmask = *(vbool2_t*)(&xcmpTmp);
+#endif
+
+			// expand data and insert =
+			// TODO: use vwsll instead if available
+			vuint16m4_t data2 = RV(vzext_vf2_u16m4)(data, vl2);
+			data2 = RV(vsll_vx_u16m4)(data2, 8, vl2);
+			data2 = RV(vor_vx_u16m4)(data2, '=', vl2);
+
+			// prune unneeded =
+			vuint8m4_t dataTmp = RV(vreinterpret_v_u16m4_u8m4)(data2);
+			vuint8m4_t final_data = RV(vcompress_vm_u8m4)(
+#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 13000
+				dataTmp, cmpmask, vl2*2
+#else
+				cmpmask, dataTmp, dataTmp, vl2*2
+#endif
+			);
+
+			RV(vse8_v_u8m4)(outp, final_data, vl2*2);
+			outp += vl2 + count;
+			col += vl2 + count;
+
+			if(col >= 0) {
+				// we overflowed - find correct position to revert back to
+				// TODO: stick with u8 type for vlmax <= 2048 (need to check if ok if vlmax == 2048)
+				// - considering that it's rare for colWidth > 128, maybe just don't support vectors that long
+				vuint16m8_t xidx = RV(viota_m_u16m8)(cmpmask, vl2*2);
+				vbool2_t discardmask = RV(vmsgeu_vx_u16m8_b2)(xidx, vl2 + count - col, vl2*2);
+				long idx_revert = RV(vcpop_m_b2)(discardmask, vl2*2);
+
+				outp -= col + (idx_revert & 1);
+				inpos -= ((idx_revert+1) >> 1);
+
+				goto _encode_eol_handle_pre;
+			}
+		} else {
+			// 0 or 1 special characters
+			{
+				vbool4_t mask = RV(vmsbf_m_b4)(cmp, vl2);
+				// TODO: is it better to shuffle this into two stores, instead of three?
+				RV(vse8_v_u8m2_m)(mask, outp, data, vl2);
+				idx = RV(vcpop_m_b4)(mask, vl2);
+				outp[idx] = '=';
+				RV(vse8_v_u8m2_m)(RV(vmnot_m_b4)(mask, vl2), outp+1, data, vl2);
+
+				outp += vl2 + count;
+				col += vl2 + count;
+			}
+
+			if(col >= 0) {
+				if(count > 0) {
+					idx = vl2 - idx;
+					if(HEDLEY_UNLIKELY(col == idx)) {
+						// this is an escape character, so line will need to overflow
+						outp--;
+					} else {
+						inpos += (col > idx);
+					}
+				}
+				outp -= col;
+				inpos -= col;
+
+				_encode_eol_handle_pre:
+				encode_eol_handle_pre(_src, inpos, outp, col, lineSizeOffset);
+			}
+		}
+	}
+
+	*colOffset = col + line_size -1;
+	dest = outp;
+	len = -(inpos - INPUT_OFFSET);
+}
+
+void encoder_rvv_init() {
+	_do_encode = &do_encode_simd<do_encode_rvv>;
+}
+#else
+void encoder_rvv_init() {}
+#endif /* defined(__riscv_vector) */
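
Note: the RV() macro above papers over an intrinsic naming change — clang before version 16 exposed RVV intrinsics without a prefix, while the ratified intrinsics use the __riscv_ prefix. The encode loop itself vectorises the standard yEnc byte transform: add 42 to each byte, and escape the four critical values (NUL, CR, LF, '=') by emitting '=' followed by the byte plus 64, which is what the masked vor_vx with 64 computes, since none of those four values has bit 6 set. A minimal scalar sketch of that transform (hypothetical; line wrapping and the column-0 '.' rule handled via escapedLUT are omitted):

#include <cstddef>
#include <cstdint>

// Scalar reference for the per-byte yEnc transform (no line wrapping).
static size_t yenc_transform_scalar(const uint8_t* src, size_t len, uint8_t* out) {
	size_t o = 0;
	for(size_t i = 0; i < len; i++) {
		uint8_t c = (uint8_t)(src[i] + 42);  // yEnc offset
		if(c == 0 || c == '\r' || c == '\n' || c == '=') {
			out[o++] = '=';  // escape marker
			c |= 64;         // equivalent to +64 for these four values
		}
		out[o++] = c;
	}
	return o;
}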
package/src/encoder_sse_base.h
CHANGED

@@ -8,7 +8,7 @@
 # define _mm_mask_expand_epi8 _mm128_mask_expand_epi8
 #endif
 
-#if defined(__GNUC__) && __GNUC__ >= 7
+#if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
 # define KLOAD16(a, offs) _load_mask16((__mmask16*)(a) + (offs))
 #else
 # define KLOAD16(a, offs) (((uint16_t*)(a))[(offs)])
@@ -155,7 +155,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
 	if(len <= INPUT_OFFSET || line_size < XMM_SIZE) return;
 	
 	// slower CPUs prefer to branch as mispredict penalty is probably small relative to general execution
-#if defined(__tune_atom__) || defined(__tune_slm__) || defined(__tune_btver1__)
+#if defined(__tune_atom__) || defined(__tune_slm__) || defined(__tune_btver1__) || defined(__tune_btver2__)
 	const bool _PREFER_BRANCHING = true;
 #else
 	const bool _PREFER_BRANCHING = (use_isa < ISA_LEVEL_SSSE3);
@@ -350,7 +350,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
 #if defined(__POPCNT__) && !defined(__tune_btver1__)
 		if(use_isa & ISA_FEATURE_POPCNT) {
 			shuf2Len = popcnt32(maskA) + 16;
-# if defined(__tune_znver3__) || defined(__tune_znver2__) || defined(__tune_znver1__) || defined(__tune_btver2__)
+# if defined(__tune_znver4__) || defined(__tune_znver3__) || defined(__tune_znver2__) || defined(__tune_znver1__) || defined(__tune_btver2__)
 			shuf1Len = popcnt32(m1) + 8;
 			shuf3Len = popcnt32(m3) + shuf2Len + 8;
 # else
@@ -409,11 +409,11 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
 		if(use_isa >= ISA_LEVEL_VBMI2)
 # endif
 		{
-
+			__asm__(
 				"shrl $1, %[eqMask] \n"
 				"shrl %%cl, %[eqMask] \n" // TODO: can use shrq to avoid above shift?
-# if defined(PLATFORM_AMD64)
-				"adcq %[col], %[p] \n"
+# if defined(PLATFORM_AMD64) && !defined(__ILP32__)
+				"adcq %q[col], %q[p] \n"
 # else
 				"adcl %[col], %[p] \n"
 # endif
@@ -484,7 +484,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
 			dataB = _mm_ternarylogic_epi32(dataB, cmpB, _mm_set1_epi8(64), 0xf8);
 			
 			// store last char
-
+			p[XMM_SIZE*2] = _mm_extract_epi8(dataB, 15);
 			
 			uint32_t blendMask = (uint32_t)(-(int32_t)mask);
 			dataB = _mm_mask_alignr_epi8(dataB, blendMask>>16, dataB, dataA, 15);
@@ -539,7 +539,6 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
 			dataA = _mm_shuffle_epi8(dataA, shufMaskA);
 			
 # if defined(__SSE4_1__) && !defined(__tune_slm__) && !defined(__tune_goldmont__) && !defined(__tune_goldmont_plus__) && !defined(__tune_tremont__)
-			// unsure if worth on: Jaguar/Puma (3|2), Core2 (2|2)
 			if(use_isa >= ISA_LEVEL_SSE41) {
 				dataB = _mm_blendv_epi8(dataBShifted, dataB, mergeMaskB);
 			} else
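
Note on the inline-assembly change above: the PLATFORM_AMD64 branch now excludes the x32 ABI (__ILP32__, where pointers are 32-bit even though the target is 64-bit x86), and the %q operand modifier forces 64-bit register names on the adcq operands. The sequence exploits the fact that shr leaves the last shifted-out bit in the carry flag, which adc then folds into the pointer update. A rough, illustrative C restatement — names mirror the asm operands, and the cl shift count is assumed to be in [1,31]:

#include <cstdint>

// Net effect of "shrl $1; shrl %cl; adc": shift eqMask right by cl+1
// and add the last bit shifted out (left in the carry flag) to the pointer.
static uint8_t* shr_adc_sketch(uint8_t* p, intptr_t col, uint32_t& eqMask, unsigned cl) {
	uint32_t carry = (eqMask >> cl) & 1; // bit left in CF by the second shr
	eqMask >>= cl + 1;                   // combined effect of both shifts
	return p + col + carry;              // adc: add with carry
}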
package/src/encoder_vbmi2.cc
ADDED

@@ -0,0 +1,30 @@
+#include "common.h"
+
+extern const bool encoder_has_avx10;
+#if !defined(__EVEX512__) && (defined(__AVX10_1__) || defined(__EVEX256__)) && defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
+const bool encoder_has_avx10 = true;
+#else
+const bool encoder_has_avx10 = false;
+#endif
+
+#if defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
+# ifndef YENC_DISABLE_AVX256
+# include "encoder_avx_base.h"
+
+void encoder_vbmi2_init() {
+	_do_encode = &do_encode_simd< do_encode_avx2<ISA_LEVEL_VBMI2> >;
+	encoder_avx2_lut<ISA_LEVEL_VBMI2>();
+}
+# else
+# include "encoder_sse_base.h"
+void encoder_vbmi2_init() {
+	_do_encode = &do_encode_simd< do_encode_sse<ISA_LEVEL_VBMI2> >;
+	encoder_sse_lut<ISA_LEVEL_VBMI2>();
+}
+# endif
+#else
+void encoder_avx2_init();
+void encoder_vbmi2_init() {
+	encoder_avx2_init();
+}
+#endif
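
Note: encoder_vbmi2.cc follows the project's dispatch pattern — each ISA-specific translation unit is compiled with its own flags and exposes an init function that rebinds the _do_encode pointer, degrading to the next-best kernel (here AVX2) when the compiler cannot target the required AVX-512 subset. A stripped-down, hypothetical sketch of that pattern:

#include <cstddef>
#include <cstdint>

// One function-pointer type shared by every encoder kernel.
typedef size_t (*encode_func)(const uint8_t* src, size_t len, uint8_t* out);

static size_t encode_scalar(const uint8_t*, size_t, uint8_t*) { return 0; } // placeholder kernel
static size_t encode_vbmi2(const uint8_t*, size_t, uint8_t*) { return 0; }  // placeholder kernel

static encode_func do_encode = &encode_scalar; // default binding

static void init_avx2() { /* would bind an AVX2 kernel here */ }

void init_vbmi2() {
#if defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
	do_encode = &encode_vbmi2; // this TU was compiled with VBMI2 enabled
#else
	init_avx2();               // compiler cannot target VBMI2: fall back
#endif
}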