yencode 1.1.5 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +130 -189
- package/binding.gyp +115 -6
- package/index.js +2 -0
- package/package.json +1 -1
- package/src/common.h +37 -7
- package/src/crc.cc +121 -47
- package/src/crc.h +74 -10
- package/src/crc_arm.cc +51 -34
- package/src/crc_arm_pmull.cc +215 -0
- package/src/crc_common.h +22 -0
- package/src/crc_folding.cc +154 -16
- package/src/crc_folding_256.cc +7 -14
- package/src/crc_riscv.cc +251 -0
- package/src/decoder.cc +373 -13
- package/src/decoder.h +10 -14
- package/src/decoder_avx.cc +5 -6
- package/src/decoder_avx2.cc +8 -9
- package/src/decoder_avx2_base.h +7 -11
- package/src/decoder_common.h +56 -373
- package/src/decoder_neon.cc +13 -19
- package/src/decoder_neon64.cc +12 -15
- package/src/decoder_rvv.cc +280 -0
- package/src/decoder_sse2.cc +26 -5
- package/src/decoder_sse_base.h +20 -40
- package/src/decoder_ssse3.cc +5 -6
- package/src/decoder_vbmi2.cc +6 -13
- package/src/encoder.cc +42 -26
- package/src/encoder.h +5 -7
- package/src/encoder_avx.cc +3 -3
- package/src/encoder_avx2.cc +3 -3
- package/src/encoder_avx_base.h +3 -0
- package/src/encoder_common.h +26 -32
- package/src/encoder_neon.cc +6 -3
- package/src/encoder_rvv.cc +13 -26
- package/src/encoder_sse2.cc +3 -2
- package/src/encoder_sse_base.h +2 -0
- package/src/encoder_ssse3.cc +3 -3
- package/src/encoder_vbmi2.cc +6 -7
- package/src/platform.cc +24 -23
- package/src/yencode.cc +54 -11
- package/test/_speedbase.js +4 -2
- package/test/speeddec.js +25 -16
- package/test/speedenc.js +21 -17
- package/test/testcrc.js +17 -1
- package/test/testcrcfuncs.c +53 -0
- package/test/testdec.js +1 -0
package/src/decoder_neon64.cc
CHANGED
@@ -1,14 +1,12 @@
 #include "common.h"
+#include "decoder_common.h"
 #if defined(__ARM_NEON) && defined(__aarch64__)

-#include "decoder_common.h"

 #pragma pack(16)
 static struct { char bytes[16]; } ALIGN_TO(16, compactLUT[32768]);
 #pragma pack()

-static uint8_t eqFixLUT[256];
-

 // AArch64 GCC lacks these functions until 8.5, 9.4 and 10.1 (10.0 unknown)
 #if !defined(__clang__) && !defined(_MSC_VER) && (!defined(__aarch64__) || !(HEDLEY_GCC_VERSION_CHECK(9,4,0) || (!HEDLEY_GCC_VERSION_CHECK(9,0,0) && HEDLEY_GCC_VERSION_CHECK(8,5,0))))
@@ -46,6 +44,8 @@ static HEDLEY_ALWAYS_INLINE uint8x16_t mergeCompares(uint8x16_t a, uint8x16_t b,
 }


+namespace RapidYenc {
+
 template<bool isRaw, bool searchEnd>
 HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned char*& p, unsigned char& escFirst, uint16_t& nextMask) {
 HEDLEY_ASSUME(escFirst == 0 || escFirst == 1);
@@ -292,19 +292,15 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
 // a spec compliant encoder should never generate sequences: ==, =\n and =\r, but we'll handle them to be spec compliant
 // the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that
 // firstly, check for invalid sequences of = (we assume that these are rare, as a spec compliant yEnc encoder should not generate these)
-
-
-
-for(int j=8; j<64; j+=8) {
-tmp = eqFixLUT[((maskEq>>j)&0xff) & ~(tmp>>7)];
-maskEq2 |= ((uint64_t)tmp)<<j;
-}
-maskEq = maskEq2;
+uint64_t maskEqShift1 = (maskEq << 1) | escFirst;
+if(LIKELIHOOD(0.0001, (mask & maskEqShift1) != 0)) {
+maskEq = fix_eqMask<uint64_t>(maskEq, maskEqShift1);

+unsigned char nextEscFirst = maskEq>>63;
 // next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed
 maskEq = (maskEq<<1) | escFirst;
 mask &= ~maskEq;
-escFirst =
+escFirst = nextEscFirst;

 // unescape chars following `=`
 #if defined(__GNUC__) && !defined(__clang__)
@@ -436,14 +432,15 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
 }
 }
 }
+} // namespace

-void decoder_set_neon_funcs() {
-decoder_init_lut(
+void RapidYenc::decoder_set_neon_funcs() {
+decoder_init_lut(compactLUT);
 _do_decode = &do_decode_simd<false, false, sizeof(uint8x16_t)*4, do_decode_neon<false, false> >;
 _do_decode_raw = &do_decode_simd<true, false, sizeof(uint8x16_t)*4, do_decode_neon<true, false> >;
 _do_decode_end_raw = &do_decode_simd<true, true, sizeof(uint8x16_t)*4, do_decode_neon<true, true> >;
 _decode_isa = ISA_LEVEL_NEON;
 }
 #else
-void decoder_set_neon_funcs() {}
+void RapidYenc::decoder_set_neon_funcs() {}
 #endif
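A note on the change above: the old per-byte eqFixLUT loop is replaced by a single fix_eqMask call, whose definition lives elsewhere in the package (decoder_common.h) and is not part of this diff. As a rough scalar sketch of the bit trick that the new decoder_rvv.cc below describes as "replicate fix_eqMask, but in vector form": within a run of consecutive '=' bytes only every other one is a real escape character, and the surviving bits can be computed branch-free. The helper below is an illustration only; its name and signature are assumptions, not the package's code, and it ignores the escFirst carried in from the previous block that the real call handles via its second argument.

// Illustrative sketch only (not the package's fix_eqMask): keep alternate bits
// within each run of set bits, starting from the first bit of the run.
static uint64_t fix_eqMask_sketch(uint64_t eq) {
    const uint64_t even = 0x5555555555555555ULL;   // bits at even positions
    uint64_t runStart  = eq & ~(eq << 1);          // first '=' of each run
    uint64_t evenStart = runStart & even;          // runs starting on an even bit
    uint64_t oddGroups = eq + evenStart;           // carry ripples through those runs
    return (oddGroups ^ even) & eq;                // 1st, 3rd, 5th... bit of each run survives
}

For example, eq = 0b0111 (three '=' in a row) yields 0b0101: the first and third are escapes, the second is escaped data.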
package/src/decoder_rvv.cc
ADDED
@@ -0,0 +1,280 @@
+#include "common.h"
+#include "decoder_common.h"
+#ifdef __riscv_vector
+
+
+#ifdef __riscv_v_intrinsic
+# define RV_vmerge_vxm_u8m2 RV(vmerge_vxm_u8m2)
+# define RV_vmerge_vxm_u16m2 RV(vmerge_vxm_u16m2)
+#else
+# define RV_vmerge_vxm_u8m2(v, x, m, vl) RV(vmerge_vxm_u8m2)(m, v, x, vl)
+# define RV_vmerge_vxm_u16m2(v, x, m, vl) RV(vmerge_vxm_u16m2)(m, v, x, vl)
+#endif
+
+#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 12000
+# define RV_VEC_CAST(masksz, vecsz, vec) RV(vreinterpret_v_b##masksz##_u##vecsz##m1)(vec)
+#else
+# define RV_VEC_CAST(masksz, vecsz, vec) *(vuint##vecsz##m1_t*)(&(vec))
+#endif
+
+
+template<int shift>
+static inline vbool4_t mask_lshift(vbool4_t m, unsigned shiftIn, size_t vl) {
+vuint8m1_t mv = RV_VEC_CAST(4, 8, m);
+vuint8m1_t mvl = RV(vsll_vx_u8m1)(mv, shift, vl/8);
+vuint8m1_t mvr = RV(vsrl_vx_u8m1)(mv, 8-shift, vl/8);
+mvr = RV(vslide1up_vx_u8m1)(mvr, shiftIn, vl/8);
+
+return RV(vmor_mm_b4)(
+RV_MASK_CAST(4, 8, mvl), RV_MASK_CAST(4, 8, mvr), vl
+);
+}
+template<int shift>
+static inline vbool64_t mask_lshift(vbool64_t m, unsigned shiftIn, size_t vl) {
+vuint8m1_t mv = RV_VEC_CAST(64, 8, m);
+vuint8m1_t mvl = RV(vsll_vx_u8m1)(mv, shift, vl/8);
+vuint8m1_t mvr = RV(vsrl_vx_u8m1)(mv, 8-shift, vl/8);
+mvr = RV(vslide1up_vx_u8m1)(mvr, shiftIn, vl/8);
+
+return RV(vmor_mm_b64)(
+RV_MASK_CAST(64, 8, mvl), RV_MASK_CAST(64, 8, mvr), vl
+);
+}
+
+static inline vuint8m2_t set_first_vu8(vuint8m2_t src, uint8_t item, size_t vl) {
+#ifdef __riscv_v_intrinsic
+return RV(vmv_s_x_u8m2_tu)(src, item, vl);
+#else
+vuint8m1_t m = RV(vslide1up_vx_u8m1)(RV(vmv_v_x_u8m1)(0, ~0), 1, ~0);
+return RV_vmerge_vxm_u8m2(src, item, RV_MASK_CAST(4, 8, m), vl);
+#endif
+}
+static inline vuint16m2_t set_first_vu16(vuint16m2_t src, uint16_t item, size_t vl) {
+#ifdef __riscv_v_intrinsic
+return RV(vmv_s_x_u16m2_tu)(src, item, vl);
+#else
+vuint16m1_t m = RV(vslide1up_vx_u16m1)(RV(vmv_v_x_u16m1)(0, ~0), 1, ~0);
+return RV_vmerge_vxm_u16m2(src, item, RV_MASK_CAST(8, 16, m), vl);
+#endif
+}
+
+
+namespace RapidYenc {
+
+template<bool isRaw, bool searchEnd>
+HEDLEY_ALWAYS_INLINE void do_decode_rvv(const uint8_t* src, long& len, unsigned char*& outp, unsigned char& escFirst, uint16_t& nextMask) {
+HEDLEY_ASSUME(escFirst == 0 || escFirst == 1);
+HEDLEY_ASSUME(nextMask == 0 || nextMask == 1 || nextMask == 2);
+
+size_t vl2 = RV(vsetvlmax_e8m2)();
+
+vuint8m2_t yencOffset = RV(vmv_v_x_u8m2)(42, vl2);
+if(escFirst) yencOffset = set_first_vu8(yencOffset, 42+64, vl2);
+vuint8m2_t lfCompare = RV(vmv_v_x_u8m2)('\n', vl2);
+if(nextMask && isRaw) {
+lfCompare = RV(vreinterpret_v_u16m2_u8m2)(
+set_first_vu16(RV(vreinterpret_v_u8m2_u16m2)(lfCompare), nextMask == 1 ? 0x0a2e /*".\n"*/ : 0x2e0a /*"\n."*/, vl2/2)
+);
+}
+
+// mask where only the highest bit is set
+vbool4_t lastBit = RV(vmseq_vx_u8m2_b4)(
+RV(vslide1down_vx_u8m2)(RV(vmv_v_x_u8m2)(0, vl2), 1, vl2),
+1, vl2
+);
+
+decoder_set_nextMask<isRaw>(src, len, nextMask);
+
+// TODO: consider exploiting partial vector capability
+long inpos;
+for(inpos = -len; inpos; inpos += vl2) {
+vuint8m2_t data = RV(vle8_v_u8m2)(src + inpos, vl2);
+
+// search for special chars
+vbool4_t cmpEq = RV(vmseq_vx_u8m2_b4)(data, '=', vl2);
+vbool4_t cmpCr = RV(vmseq_vx_u8m2_b4)(data, '\r', vl2);
+// note: cmp is always negated (unlike cmpEq/Cr)
+vbool4_t cmp = RV(vmnor_mm_b4)(
+RV(vmor_mm_b4)(cmpEq, cmpCr, vl2),
+isRaw ? RV(vmseq_vv_u8m2_b4)(data, lfCompare, vl2) : RV(vmseq_vx_u8m2_b4)(data, '\n', vl2),
+vl2
+);
+
+size_t numOutputChars = RV(vcpop_m_b4)(cmp, vl2);
+
+if(numOutputChars != vl2) {
+// dot-unstuffing + end detection
+if((isRaw || searchEnd) && RV(vcpop_m_b4)(RV(vmxnor_mm_b4)(cmp, cmpEq, vl2), vl2)) {
+uint32_t nextWord;
+if(!searchEnd) {
+memcpy(&nextWord, src + inpos + vl2, 2);
+} else {
+memcpy(&nextWord, src + inpos + vl2, 4);
+}
+vuint8m2_t nextData2 = RV(vreinterpret_v_u16m2_u8m2)(RV(vslide1down_vx_u16m2)(RV(vreinterpret_v_u8m2_u16m2)(data), nextWord, vl2/2));
+
+vbool4_t match2Cr_Dot, match3EqY;
+vuint8m2_t nextData3;
+if(isRaw) {
+match2Cr_Dot = RV(vmand_mm_b4)(cmpCr, RV(vmseq_vx_u8m2_b4)(nextData2, '.', vl2), vl2);
+}
+
+if(searchEnd) {
+nextData3 = RV(vslide1down_vx_u8m2)(nextData2, nextWord>>16, vl2);
+match3EqY = RV(vmand_mm_b4)(
+RV(vmseq_vx_u8m2_b4)(nextData2, '=', vl2),
+RV(vmseq_vx_u8m2_b4)(nextData3, 'y', vl2),
+vl2
+);
+}
+
+// find patterns of \r_.
+if(isRaw && LIKELIHOOD(0.001, RV(vcpop_m_b4)(match2Cr_Dot, vl2) > 0)) {
+// find \r\n.
+vuint8m2_t nextData1 = RV(vslide1down_vx_u8m2)(data, nextWord, vl2);
+vbool4_t match1Lf = RV(vmseq_vx_u8m2_b4)(nextData1, '\n', vl2);
+vbool4_t match2NlDot = RV(vmand_mm_b4)(match2Cr_Dot, match1Lf, vl2);
+
+if(searchEnd) {
+vbool4_t match1Nl = RV(vmand_mm_b4)(cmpCr, match1Lf, vl2);
+
+vuint8m2_t nextData4 = RV(vreinterpret_v_u32m2_u8m2)(RV(vslide1down_vx_u32m2)(RV(vreinterpret_v_u8m2_u32m2)(data), nextWord, vl2/4));
+
+// match instances of \r\n.\r\n and \r\n.=y
+vbool4_t match4Nl = RV(vmand_mm_b4)(
+RV(vmseq_vx_u8m2_b4)(nextData3, '\r', vl2),
+RV(vmseq_vx_u8m2_b4)(nextData4, '\n', vl2),
+vl2
+);
+vbool4_t match4EqY = RV(vmand_mm_b4)(
+RV(vmseq_vx_u8m2_b4)(nextData3, '=', vl2),
+RV(vmseq_vx_u8m2_b4)(nextData4, 'y', vl2),
+vl2
+);
+
+// merge \r\n and =y matches
+vbool4_t match4End = RV(vmor_mm_b4)(match4Nl, match4EqY, vl2);
+// merge with \r\n.
+match4End = RV(vmand_mm_b4)(match4End, match2NlDot, vl2);
+// merge \r\n=y
+vbool4_t match3End = RV(vmand_mm_b4)(match1Nl, match3EqY, vl2);
+
+vbool4_t matchEnd = RV(vmor_mm_b4)(match4End, match3End, vl2);
+
+// combine match sequences
+if(LIKELIHOOD(0.001, RV(vcpop_m_b4)(matchEnd, vl2) > 0)) {
+// terminator found
+len += inpos;
+nextMask = decoder_set_nextMask<isRaw>(src+inpos, ~RV(vmv_x_s_u8m1_u8)(RV_VEC_CAST(4, 8, cmp)));
+break;
+}
+}
+
+// shift match2NlDot by 2
+cmp = RV(vmandn_mm_b4)(cmp, mask_lshift<2>(match2NlDot, 0, vl2), vl2);
+numOutputChars = RV(vcpop_m_b4)(cmp, vl2);
+
+vuint8mf4_t nextNlDot = RV(vslidedown_vx_u8mf4)(
+#ifndef __riscv_v_intrinsic
+RV(vmv_v_x_u8mf4)(0, vl2/8),
+#endif
+RV_VEC_U8MF4_CAST(match2NlDot), vl2/8-1, vl2/8
+);
+nextNlDot = RV(vsrl_vx_u8mf4)(nextNlDot, 6, vl2/8);
+vuint8m1_t nextNlDotVec = RV(vlmul_ext_v_u8mf4_u8m1)(nextNlDot);
+lfCompare = RV_vmerge_vxm_u8m2(RV(vmv_v_x_u8m2)('\n', vl2), '.', RV_MASK_CAST(4, 8, nextNlDotVec), vl2);
+} else if(searchEnd) {
+if(LIKELIHOOD(0.001, RV(vcpop_m_b4)(match3EqY, vl2) != 0)) {
+vuint8m2_t nextData1 = RV(vslide1down_vx_u8m2)(data, nextWord, vl2);
+vbool4_t match1Lf = RV(vmseq_vx_u8m2_b4)(nextData1, '\n', vl2);
+vbool4_t matchEnd = RV(vmand_mm_b4)(RV(vmand_mm_b4)(match3EqY, cmpCr, vl2), match1Lf, vl2);
+if(LIKELIHOOD(0.001, RV(vcpop_m_b4)(matchEnd, vl2) > 0)) {
+len += inpos;
+nextMask = decoder_set_nextMask<isRaw>(src+inpos, ~RV(vmv_x_s_u8m1_u8)(RV_VEC_CAST(4, 8, cmp)));
+break;
+}
+}
+if(isRaw)
+lfCompare = RV(vmv_v_x_u8m2)('\n', vl2);
+} else if(isRaw) // no \r_. found
+lfCompare = RV(vmv_v_x_u8m2)('\n', vl2);
+}
+
+// the second character in an escape sequence
+vbool4_t cmpEqShift1 = mask_lshift<1>(cmpEq, escFirst, vl2);
+
+// a spec compliant encoder should never generate sequences: ==, =\n and =\r, but we'll handle them to be spec compliant
+// the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that
+// firstly, check for invalid sequences of = (we assume that these are rare, as a spec compliant yEnc encoder should not generate these)
+if(LIKELIHOOD(0.0001, RV(vcpop_m_b4)(RV(vmandn_mm_b4)(cmpEqShift1, cmp, vl2), vl2) != 0)) {
+// replicate fix_eqMask, but in vector form
+vbool4_t groupStart = RV(vmandn_mm_b4)(cmpEq, cmpEqShift1, vl2);
+vbool4_t evenBits = RV_MASK_CAST(4, 8, RV(vmv_v_x_u8m1)(0x55, vl2));
+vbool4_t evenStart = RV(vmand_mm_b4)(groupStart, evenBits, vl2);
+
+// compute `cmpEq + evenStart` to obtain oddGroups
+vbool4_t oddGroups;
+vuint64m1_t cmpEq64 = RV_VEC_CAST(4, 64, cmpEq);
+vuint64m1_t evenStart64 = RV_VEC_CAST(4, 64, evenStart);
+vuint64m1_t oddGroups64;
+if(vl2 <= 64) {
+// no loop needed - single 64b add will work
+oddGroups64 = RV(vadd_vv_u64m1)(cmpEq64, evenStart64, 1);
+} else {
+// need to loop whilst the add causes a carry
+unsigned vl64 = vl2/64;
+vbool64_t carry = RV(vmadc_vv_u64m1_b64)(cmpEq64, evenStart64, vl64);
+carry = mask_lshift<1>(carry, 0, vl64);
+oddGroups64 = RV(vadd_vv_u64m1)(cmpEq64, evenStart64, 1);
+while(RV(vcpop_m_b64)(carry, vl64)) {
+vbool64_t nextCarry = RV(vmadc_vx_u64m1_b64)(oddGroups64, 1, vl64);
+oddGroups64 = RV(vadd_vx_u64m1_mu)(carry, oddGroups64, oddGroups64, 1, vl64);
+carry = mask_lshift<1>(nextCarry, 0, vl64);
+}
+}
+oddGroups = RV_MASK_CAST(4, 64, oddGroups64);
+
+cmpEq = RV(vmand_mm_b4)(RV(vmxor_mm_b4)(oddGroups, evenBits, vl2), cmpEq, vl2);
+
+cmpEqShift1 = mask_lshift<1>(cmpEq, escFirst, vl2);
+cmp = RV(vmor_mm_b4)(cmpEqShift1, cmp, vl2); // ~(~cmp & ~cmpEqShift1)
+numOutputChars = RV(vcpop_m_b4)(cmp, vl2);
+}
+escFirst = RV(vcpop_m_b4)(RV(vmand_mm_b4)(cmpEq, lastBit, vl2), vl2);
+
+data = RV(vsub_vv_u8m2)(data, RV_vmerge_vxm_u8m2(yencOffset, 64+42, cmpEqShift1, vl2), vl2);
+yencOffset = set_first_vu8(yencOffset, 42 | (escFirst<<6), vl2);
+
+// all that's left is to remove unwanted chars
+#ifdef __riscv_v_intrinsic
+data = RV(vcompress_vm_u8m2)(data, cmp, vl2);
+#else
+data = RV(vcompress_vm_u8m2)(cmp, data, data, vl2);
+#endif
+RV(vse8_v_u8m2)(outp, data, vl2);
+} else {
+data = RV(vsub_vv_u8m2)(data, yencOffset, vl2);
+RV(vse8_v_u8m2)(outp, data, vl2);
+// TODO: should these be done at LMUL=1? or, it might not be worth this strategy (e.g. do an additional OR instead), considering the cost of LMUL=2
+yencOffset = RV(vmv_v_x_u8m2)(42, vl2);
+if(isRaw) lfCompare = RV(vmv_v_x_u8m2)('\n', vl2);
+escFirst = 0;
+}
+outp += numOutputChars;
+}
+}
+
+size_t decoder_rvv_width() {
+return RV(vsetvlmax_e8m2)();
+}
+} // namespace
+
+void RapidYenc::decoder_set_rvv_funcs() {
+_do_decode = &do_decode_simd<false, false, decoder_rvv_width, do_decode_rvv<false, false> >;
+_do_decode_raw = &do_decode_simd<true, false, decoder_rvv_width, do_decode_rvv<true, false> >;
+_do_decode_end_raw = &do_decode_simd<true, true, decoder_rvv_width, do_decode_rvv<true, true> >;
+_decode_isa = ISA_LEVEL_RVV;
+}
+#else
+void RapidYenc::decoder_set_rvv_funcs() {}
+#endif
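One detail worth noting in the carry handling above: a vector add cannot ripple a carry from one 64-bit lane into the next, so when the vector is wider than 64 bits the code detects per-lane carries with vmadc, shifts the carry mask up one lane, and repeats the masked add until no carries remain. For reference, the plain serial form of that multi-word addition might look like the following (a hypothetical helper for illustration, not part of the package):

#include <stdint.h>
#include <stddef.h>

// Hypothetical scalar equivalent: add two bit vectors stored as little-endian
// arrays of 64-bit words, rippling the carry from each word into the next.
static void add_bitvec(uint64_t* dst, const uint64_t* a, const uint64_t* b, size_t words) {
    unsigned carry = 0;
    for(size_t w = 0; w < words; w++) {
        uint64_t sum = a[w] + b[w];
        unsigned carryOut = sum < a[w];   // did a[w] + b[w] overflow?
        dst[w] = sum + carry;
        carryOut |= dst[w] < sum;         // adding the carry-in can overflow as well
        carry = carryOut;
    }
}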
package/src/decoder_sse2.cc
CHANGED
@@ -1,17 +1,38 @@
 #include "common.h"

-#ifdef __SSE2__
 #include "decoder_common.h"
+#ifdef __SSE2__
 #include "decoder_sse_base.h"

-void
-
-
+void RapidYenc::decoder_sse_init(RapidYenc::SSELookups* HEDLEY_RESTRICT& lookups) {
+ALIGN_ALLOC(lookups, sizeof(SSELookups), 16);
+for(int i=0; i<256; i++) {
+lookups->BitsSetTable256inv[i] = 8 - (
+(i & 1) + ((i>>1) & 1) + ((i>>2) & 1) + ((i>>3) & 1) + ((i>>4) & 1) + ((i>>5) & 1) + ((i>>6) & 1) + ((i>>7) & 1)
+);
+
+#define _X(n, k) ((((n) & (1<<k)) ? 192ULL : 0ULL) << (k*8))
+lookups->eqAdd[i] = _X(i, 0) | _X(i, 1) | _X(i, 2) | _X(i, 3) | _X(i, 4) | _X(i, 5) | _X(i, 6) | _X(i, 7);
+#undef _X
+}
+for(int i=0; i<32; i++) {
+for(int j=0; j<16; j++) {
+if(i >= 16) // only used for LZCNT
+lookups->unshufMask[i*16 + j] = ((31-i)>j ? -1 : 0);
+else // only used for BSR
+lookups->unshufMask[i*16 + j] = (i>j ? -1 : 0);
+}
+}
+}
+
+void RapidYenc::decoder_set_sse2_funcs() {
+decoder_sse_init(lookups);
+decoder_init_lut(lookups->compact);
 _do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_SSE2> >;
 _do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_SSE2> >;
 _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_SSE2> >;
 _decode_isa = ISA_LEVEL_SSE2;
 }
 #else
-void decoder_set_sse2_funcs() {}
+void RapidYenc::decoder_set_sse2_funcs() {}
 #endif
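For orientation, the decoder_sse_init loop above builds two of the new tables directly from bit counts: BitsSetTable256inv[i] holds 8 minus the popcount of i, and eqAdd[i] holds a 64-bit value with the byte 0xC0 (-64 as a signed char) in lane k for every bit k set in i. A small self-contained check of those two identities, written independently of the package's headers as an illustration only:

#include <assert.h>
#include <stdint.h>

int main(void) {
    unsigned i = 0xB3;               // 0b10110011: bits 0, 1, 4, 5, 7 set
    unsigned bits = 0;
    for(unsigned k = 0; k < 8; k++) bits += (i >> k) & 1;
    assert(8 - bits == 3);           // BitsSetTable256inv[0xB3] would be 3

    uint64_t eqAdd = 0;              // byte 0xC0 placed at each set bit position
    for(unsigned k = 0; k < 8; k++)
        if(i & (1u << k)) eqAdd |= 0xC0ULL << (k * 8);
    assert(eqAdd == 0xC000C0C00000C0C0ULL);
    return 0;
}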
package/src/decoder_sse_base.h
CHANGED
@@ -25,15 +25,17 @@
 # define KOR16(a, b) ((a) | (b))
 #endif

-
-
-
-
-
-
-
-}
-#pragma pack()
+namespace RapidYenc {
+#pragma pack(16)
+typedef struct {
+unsigned char BitsSetTable256inv[256];
+/*align16*/ struct { char bytes[16]; } compact[32768];
+/*align8*/ uint64_t eqAdd[256];
+/*align16*/ int8_t unshufMask[32*16];
+} SSELookups;
+#pragma pack()
+}
+static RapidYenc::SSELookups* HEDLEY_RESTRICT lookups;


 static HEDLEY_ALWAYS_INLINE __m128i force_align_read_128(const void* p) {
@@ -45,26 +47,8 @@ static HEDLEY_ALWAYS_INLINE __m128i force_align_read_128(const void* p) {
 #endif
 }

-
-
-ALIGN_ALLOC(lookups, sizeof(*lookups), 16);
-for(int i=0; i<256; i++) {
-lookups->BitsSetTable256inv[i] = 8 - (
-(i & 1) + ((i>>1) & 1) + ((i>>2) & 1) + ((i>>3) & 1) + ((i>>4) & 1) + ((i>>5) & 1) + ((i>>6) & 1) + ((i>>7) & 1)
-);
-
-#define _X(n, k) ((((n) & (1<<k)) ? 192ULL : 0ULL) << (k*8))
-lookups->eqAdd[i] = _X(i, 0) | _X(i, 1) | _X(i, 2) | _X(i, 3) | _X(i, 4) | _X(i, 5) | _X(i, 6) | _X(i, 7);
-#undef _X
-}
-for(int i=0; i<32; i++) {
-for(int j=0; j<16; j++) {
-if(i >= 16) // only used for LZCNT
-lookups->unshufMask[i*16 + j] = ((31-i)>j ? -1 : 0);
-else // only used for BSR
-lookups->unshufMask[i*16 + j] = (i>j ? -1 : 0);
-}
-}
+namespace RapidYenc {
+void decoder_sse_init(SSELookups* HEDLEY_RESTRICT& lookups); // defined in decoder_sse2.cc
 }


@@ -110,6 +94,8 @@ static HEDLEY_ALWAYS_INLINE __m128i sse2_compact_vect(uint32_t mask, __m128i dat
 return data;
 }

+namespace RapidYenc {
+
 template<bool isRaw, bool searchEnd, enum YEncDecIsaLevel use_isa>
 HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* src, long& len, unsigned char*& p, unsigned char& _escFirst, uint16_t& _nextMask) {
 HEDLEY_ASSUME(_escFirst == 0 || _escFirst == 1);
@@ -520,18 +506,11 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* src, long& len, unsigned
 if(!_USING_BLEND_ADD)
 dataB = _mm_add_epi8(oDataB, _mm_set1_epi8(-42));

-
-
-
-uint32_t maskEq2 = tmp;
-for(int j=8; j<32; j+=8) {
-tmp = lookups->eqFix[((maskEq>>j)&0xff) & ~(tmp>>7)];
-maskEq2 |= tmp<<j;
-}
-maskEq = maskEq2;
-
+uint32_t maskEqShift1 = (maskEq << 1) + escFirst;
+if(LIKELIHOOD(0.0001, (mask & maskEqShift1) != 0)) {
+maskEq = fix_eqMask<uint32_t>(maskEq, maskEqShift1);
 mask &= ~escFirst;
-escFirst =
+escFirst = maskEq >> 31;
 // next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed
 maskEq <<= 1;
 mask &= ~maskEq;
@@ -716,4 +695,5 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* src, long& len, unsigned
 }
 _escFirst = (unsigned char)escFirst;
 }
+} // namespace
 #endif
package/src/decoder_ssse3.cc
CHANGED
@@ -1,19 +1,18 @@
 #include "common.h"

-#ifdef __SSSE3__
 #include "decoder_common.h"
+#ifdef __SSSE3__
 #include "decoder_sse_base.h"
-void decoder_set_ssse3_funcs() {
-decoder_sse_init();
-decoder_init_lut(lookups->
+void RapidYenc::decoder_set_ssse3_funcs() {
+decoder_sse_init(lookups);
+decoder_init_lut(lookups->compact);
 _do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_SSSE3> >;
 _do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_SSSE3> >;
 _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_SSSE3> >;
 _decode_isa = ISA_LEVEL_SSSE3;
 }
 #else
-void
-void decoder_set_ssse3_funcs() {
+void RapidYenc::decoder_set_ssse3_funcs() {
 decoder_set_sse2_funcs();
 }
 #endif
package/src/decoder_vbmi2.cc
CHANGED
@@ -1,20 +1,16 @@
 #include "common.h"
+# include "decoder_common.h"

-extern const bool decoder_has_avx10;
 #if !defined(__EVEX512__) && (defined(__AVX10_1__) || defined(__EVEX256__)) && defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
-const bool decoder_has_avx10 = true;
+const bool RapidYenc::decoder_has_avx10 = true;
 #else
-const bool decoder_has_avx10 = false;
+const bool RapidYenc::decoder_has_avx10 = false;
 #endif

 #if defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
-# include "decoder_common.h"
 # ifndef YENC_DISABLE_AVX256
 # include "decoder_avx2_base.h"
-void decoder_set_vbmi2_funcs() {
-ALIGN_ALLOC(lookups, sizeof(*lookups), 16);
-// TODO: consider removing compact LUT
-decoder_init_lut(lookups->eqFix, lookups->compact);
+void RapidYenc::decoder_set_vbmi2_funcs() {
 _do_decode = &do_decode_simd<false, false, sizeof(__m256i)*2, do_decode_avx2<false, false, ISA_LEVEL_VBMI2> >;
 _do_decode_raw = &do_decode_simd<true, false, sizeof(__m256i)*2, do_decode_avx2<true, false, ISA_LEVEL_VBMI2> >;
 _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m256i)*2, do_decode_avx2<true, true, ISA_LEVEL_VBMI2> >;
@@ -22,9 +18,7 @@ void decoder_set_vbmi2_funcs() {
 }
 # else
 # include "decoder_sse_base.h"
-void decoder_set_vbmi2_funcs() {
-decoder_sse_init();
-decoder_init_lut(lookups->eqFix, lookups->compact);
+void RapidYenc::decoder_set_vbmi2_funcs() {
 _do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_VBMI2> >;
 _do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_VBMI2> >;
 _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_VBMI2> >;
@@ -32,8 +26,7 @@ void decoder_set_vbmi2_funcs() {
 }
 # endif
 #else
-void
-void decoder_set_vbmi2_funcs() {
+void RapidYenc::decoder_set_vbmi2_funcs() {
 decoder_set_avx2_funcs();
 }
 #endif