yencode 1.1.5 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,274 @@
+ #include "common.h"
+ #ifdef __riscv_vector
+ #include "decoder_common.h"
+
+
+ #ifdef __riscv_v_intrinsic
+ # define RV_vmerge_vxm_u8m2 RV(vmerge_vxm_u8m2)
+ # define RV_vmerge_vxm_u16m2 RV(vmerge_vxm_u16m2)
+ #else
+ # define RV_vmerge_vxm_u8m2(v, x, m, vl) RV(vmerge_vxm_u8m2)(m, v, x, vl)
+ # define RV_vmerge_vxm_u16m2(v, x, m, vl) RV(vmerge_vxm_u16m2)(m, v, x, vl)
+ #endif
+
+ #if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 12000
+ # define RV_VEC_CAST(masksz, vecsz, vec) RV(vreinterpret_v_b##masksz##_u##vecsz##m1)(vec)
+ #else
+ # define RV_VEC_CAST(masksz, vecsz, vec) *(vuint##vecsz##m1_t*)(&(vec))
+ #endif
+
+
+ template<int shift>
+ static inline vbool4_t mask_lshift(vbool4_t m, unsigned shiftIn, size_t vl) {
+     vuint8m1_t mv = RV_VEC_CAST(4, 8, m);
+     vuint8m1_t mvl = RV(vsll_vx_u8m1)(mv, shift, vl/8);
+     vuint8m1_t mvr = RV(vsrl_vx_u8m1)(mv, 8-shift, vl/8);
+     mvr = RV(vslide1up_vx_u8m1)(mvr, shiftIn, vl/8);
+
+     return RV(vmor_mm_b4)(
+         RV_MASK_CAST(4, 8, mvl), RV_MASK_CAST(4, 8, mvr), vl
+     );
+ }
+
+ static inline vuint8m2_t set_first_vu8(vuint8m2_t src, uint8_t item, size_t vl) {
+ #ifdef __riscv_v_intrinsic
+     return RV(vmv_s_x_u8m2_tu)(src, item, vl);
+ #else
+     vuint8m1_t m = RV(vslide1up_vx_u8m1)(RV(vmv_v_x_u8m1)(0, ~0), 1, ~0);
+     return RV_vmerge_vxm_u8m2(src, item, RV_MASK_CAST(4, 8, m), vl);
+ #endif
+ }
+ static inline vuint16m2_t set_first_vu16(vuint16m2_t src, uint16_t item, size_t vl) {
+ #ifdef __riscv_v_intrinsic
+     return RV(vmv_s_x_u16m2_tu)(src, item, vl);
+ #else
+     vuint16m1_t m = RV(vslide1up_vx_u16m1)(RV(vmv_v_x_u16m1)(0, ~0), 1, ~0);
+     return RV_vmerge_vxm_u16m2(src, item, RV_MASK_CAST(8, 16, m), vl);
+ #endif
+ }
+
+
+
+ template<bool isRaw, bool searchEnd>
+ HEDLEY_ALWAYS_INLINE void do_decode_rvv(const uint8_t* src, long& len, unsigned char*& outp, unsigned char& escFirst, uint16_t& nextMask) {
+     HEDLEY_ASSUME(escFirst == 0 || escFirst == 1);
+     HEDLEY_ASSUME(nextMask == 0 || nextMask == 1 || nextMask == 2);
+
+     size_t vl2 = RV(vsetvlmax_e8m2)();
+
+     vuint8m2_t yencOffset = RV(vmv_v_x_u8m2)(42, vl2);
+     if(escFirst) yencOffset = set_first_vu8(yencOffset, 42+64, vl2);
+     vuint8m2_t lfCompare = RV(vmv_v_x_u8m2)('\n', vl2);
+     if(nextMask && isRaw) {
+         lfCompare = RV(vreinterpret_v_u16m2_u8m2)(
+             set_first_vu16(RV(vreinterpret_v_u8m2_u16m2)(lfCompare), nextMask == 1 ? 0x0a2e /*".\n"*/ : 0x2e0a /*"\n."*/, vl2/2)
+         );
+     }
+
+     // mask where only the highest bit is set
+     vbool4_t lastBit = RV(vmseq_vx_u8m2_b4)(
+         RV(vslide1down_vx_u8m2)(RV(vmv_v_x_u8m2)(0, vl2), 1, vl2),
+         1, vl2
+     );
+
+     decoder_set_nextMask<isRaw>(src, len, nextMask);
+
+     // TODO: consider exploiting partial vector capability
+     long inpos;
+     for(inpos = -len; inpos; inpos += vl2) {
+         vuint8m2_t data = RV(vle8_v_u8m2)(src + inpos, vl2);
+
+         // search for special chars
+         vbool4_t cmpEq = RV(vmseq_vx_u8m2_b4)(data, '=', vl2);
+         vbool4_t cmpCr = RV(vmseq_vx_u8m2_b4)(data, '\r', vl2);
+         // note: cmp is always negated (unlike cmpEq/Cr)
+         vbool4_t cmp = RV(vmnor_mm_b4)(
+             RV(vmor_mm_b4)(cmpEq, cmpCr, vl2),
+             isRaw ? RV(vmseq_vv_u8m2_b4)(data, lfCompare, vl2) : RV(vmseq_vx_u8m2_b4)(data, '\n', vl2),
+             vl2
+         );
+
+         size_t numOutputChars = RV(vcpop_m_b4)(cmp, vl2);
+
+         if(numOutputChars != vl2) {
+             // dot-unstuffing + end detection
+             if((isRaw || searchEnd) && RV(vcpop_m_b4)(RV(vmxnor_mm_b4)(cmp, cmpEq, vl2), vl2)) {
+                 uint32_t nextWord;
+                 if(!searchEnd) {
+                     memcpy(&nextWord, src + inpos + vl2, 2);
+                 } else {
+                     memcpy(&nextWord, src + inpos + vl2, 4);
+                 }
+                 vuint8m2_t nextData2 = RV(vreinterpret_v_u16m2_u8m2)(RV(vslide1down_vx_u16m2)(RV(vreinterpret_v_u8m2_u16m2)(data), nextWord, vl2/2));
+
+                 vbool4_t match2Cr_Dot, match3EqY;
+                 vuint8m2_t nextData3;
+                 if(isRaw) {
+                     match2Cr_Dot = RV(vmand_mm_b4)(cmpCr, RV(vmseq_vx_u8m2_b4)(nextData2, '.', vl2), vl2);
+                 }
+
+                 if(searchEnd) {
+                     nextData3 = RV(vslide1down_vx_u8m2)(nextData2, nextWord>>16, vl2);
+                     match3EqY = RV(vmand_mm_b4)(
+                         RV(vmseq_vx_u8m2_b4)(nextData2, '=', vl2),
+                         RV(vmseq_vx_u8m2_b4)(nextData3, 'y', vl2),
+                         vl2
+                     );
+                 }
+
+                 // find patterns of \r_.
+                 if(isRaw && LIKELIHOOD(0.001, RV(vcpop_m_b4)(match2Cr_Dot, vl2) > 0)) {
+                     // find \r\n.
+                     vuint8m2_t nextData1 = RV(vslide1down_vx_u8m2)(data, nextWord, vl2);
+                     vbool4_t match1Lf = RV(vmseq_vx_u8m2_b4)(nextData1, '\n', vl2);
+                     vbool4_t match2NlDot = RV(vmand_mm_b4)(match2Cr_Dot, match1Lf, vl2);
+
+                     if(searchEnd) {
+                         vbool4_t match1Nl = RV(vmand_mm_b4)(cmpCr, match1Lf, vl2);
+
+                         vuint8m2_t nextData4 = RV(vreinterpret_v_u32m2_u8m2)(RV(vslide1down_vx_u32m2)(RV(vreinterpret_v_u8m2_u32m2)(data), nextWord, vl2/4));
+
+                         // match instances of \r\n.\r\n and \r\n.=y
+                         vbool4_t match4Nl = RV(vmand_mm_b4)(
+                             RV(vmseq_vx_u8m2_b4)(nextData3, '\r', vl2),
+                             RV(vmseq_vx_u8m2_b4)(nextData4, '\n', vl2),
+                             vl2
+                         );
+                         vbool4_t match4EqY = RV(vmand_mm_b4)(
+                             RV(vmseq_vx_u8m2_b4)(nextData3, '=', vl2),
+                             RV(vmseq_vx_u8m2_b4)(nextData4, 'y', vl2),
+                             vl2
+                         );
+
+                         // merge \r\n and =y matches
+                         vbool4_t match4End = RV(vmor_mm_b4)(match4Nl, match4EqY, vl2);
+                         // merge with \r\n.
+                         match4End = RV(vmand_mm_b4)(match4End, match2NlDot, vl2);
+                         // merge \r\n=y
+                         vbool4_t match3End = RV(vmand_mm_b4)(match1Nl, match3EqY, vl2);
+
+                         vbool4_t matchEnd = RV(vmor_mm_b4)(match4End, match3End, vl2);
+
+                         // combine match sequences
+                         if(LIKELIHOOD(0.001, RV(vcpop_m_b4)(matchEnd, vl2) > 0)) {
+                             // terminator found
+                             len += inpos;
+                             nextMask = decoder_set_nextMask<isRaw>(src+inpos, ~RV(vmv_x_s_u8m1_u8)(RV_VEC_CAST(4, 8, cmp)));
+                             break;
+                         }
+                     }
+
+                     // shift match2NlDot by 2
+                     cmp = RV(vmandn_mm_b4)(cmp, mask_lshift<2>(match2NlDot, 0, vl2), vl2);
+                     numOutputChars = RV(vcpop_m_b4)(cmp, vl2);
+
+                     vuint8mf4_t nextNlDot = RV(vslidedown_vx_u8mf4)(
+ #ifndef __riscv_v_intrinsic
+                         RV(vmv_v_x_u8mf4)(0, vl2/8),
+ #endif
+                         RV_VEC_U8MF4_CAST(match2NlDot), vl2/8-1, vl2/8
+                     );
+                     nextNlDot = RV(vsrl_vx_u8mf4)(nextNlDot, 6, vl2/8);
+                     vuint8m1_t nextNlDotVec = RV(vlmul_ext_v_u8mf4_u8m1)(nextNlDot);
+                     lfCompare = RV_vmerge_vxm_u8m2(RV(vmv_v_x_u8m2)('\n', vl2), '.', RV_MASK_CAST(4, 8, nextNlDotVec), vl2);
+                 } else if(searchEnd) {
+                     if(LIKELIHOOD(0.001, RV(vcpop_m_b4)(match3EqY, vl2) != 0)) {
+                         vuint8m2_t nextData1 = RV(vslide1down_vx_u8m2)(data, nextWord, vl2);
+                         vbool4_t match1Lf = RV(vmseq_vx_u8m2_b4)(nextData1, '\n', vl2);
+                         vbool4_t matchEnd = RV(vmand_mm_b4)(RV(vmand_mm_b4)(match3EqY, cmpCr, vl2), match1Lf, vl2);
+                         if(LIKELIHOOD(0.001, RV(vcpop_m_b4)(matchEnd, vl2) > 0)) {
+                             len += inpos;
+                             nextMask = decoder_set_nextMask<isRaw>(src+inpos, ~RV(vmv_x_s_u8m1_u8)(RV_VEC_CAST(4, 8, cmp)));
+                             break;
+                         }
+                     }
+                     if(isRaw)
+                         lfCompare = RV(vmv_v_x_u8m2)('\n', vl2);
+                 } else if(isRaw) // no \r_. found
+                     lfCompare = RV(vmv_v_x_u8m2)('\n', vl2);
+             }
+
+             // the second character in an escape sequence
+             vbool4_t cmpEqShift1 = mask_lshift<1>(cmpEq, escFirst, vl2);
+
+             // a spec compliant encoder should never generate sequences: ==, =\n and =\r, but we'll handle them to be spec compliant
+             // the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that
+             // firstly, check for invalid sequences of = (we assume that these are rare, as a spec compliant yEnc encoder should not generate these)
+             if(LIKELIHOOD(0.0001, RV(vcpop_m_b4)(RV(vmandn_mm_b4)(cmpEqShift1, cmp, vl2), vl2) != 0)) {
+                 // note: we assume that uintptr_t corresponds with __riscv_xlen
+ #if __riscv_xlen == 64
+                 vuint64m1_t cmpEqW = RV_VEC_CAST(4, 64, cmpEq);
+ #else
+                 vuint32m1_t cmpEqW = RV_VEC_CAST(4, 32, cmpEq);
+ #endif
+                 size_t nextShiftDown = (vl2 > sizeof(uintptr_t)*8 ? sizeof(uintptr_t)*8 : vl2) - 1;
+                 size_t wvl = (vl2 + sizeof(uintptr_t)*8 - 1) / (sizeof(uintptr_t)*8);
+                 for(size_t w=0; w<vl2; w+=sizeof(uintptr_t)*8) {
+                     // extract bottom word
+ #if __riscv_xlen == 64
+                     uintptr_t maskW = RV(vmv_x_s_u64m1_u64)(cmpEqW);
+ #else
+                     uintptr_t maskW = RV(vmv_x_s_u32m1_u32)(cmpEqW);
+ #endif
+
+                     // fix it
+                     maskW = fix_eqMask<uintptr_t>(maskW & ~(uintptr_t)escFirst);
+                     uint8_t nextEscFirst = (maskW >> nextShiftDown) & 1;
+
+                     // shift it up (will be used for cmpEqShift1)
+                     maskW = (maskW<<1) | escFirst; // TODO: should this be done using mask_lshift<1> instead?
+                     escFirst = nextEscFirst;
+
+                     // slide the new value in from the top
+ #if __riscv_xlen == 64
+                     cmpEqW = RV(vslide1down_vx_u64m1)(cmpEqW, maskW, wvl);
+ #else
+                     cmpEqW = RV(vslide1down_vx_u32m1)(cmpEqW, maskW, wvl);
+ #endif
+                 }
+ #if __riscv_xlen == 64
+                 cmpEqShift1 = RV_MASK_CAST(4, 64, cmpEqW);
+ #else
+                 cmpEqShift1 = RV_MASK_CAST(4, 32, cmpEqW);
+ #endif
+                 cmp = RV(vmor_mm_b4)(cmpEqShift1, cmp, vl2); // ~(~cmp & ~cmpEqShift1)
+                 numOutputChars = RV(vcpop_m_b4)(cmp, vl2);
+             } else {
+                 // no invalid = sequences found - don't need to fix up cmpEq
+                 escFirst = RV(vcpop_m_b4)(RV(vmand_mm_b4)(cmpEq, lastBit, vl2), vl2);
+             }
+             data = RV(vsub_vv_u8m2)(data, RV_vmerge_vxm_u8m2(yencOffset, 64+42, cmpEqShift1, vl2), vl2);
+             yencOffset = set_first_vu8(yencOffset, 42 | (escFirst<<6), vl2);
+
+             // all that's left is to remove unwanted chars
+ #ifdef __riscv_v_intrinsic
+             data = RV(vcompress_vm_u8m2)(data, cmp, vl2);
+ #else
+             data = RV(vcompress_vm_u8m2)(cmp, data, data, vl2);
+ #endif
+             RV(vse8_v_u8m2)(outp, data, vl2);
+         } else {
+             data = RV(vsub_vv_u8m2)(data, yencOffset, vl2);
+             RV(vse8_v_u8m2)(outp, data, vl2);
+             // TODO: should these be done at LMUL=1? or, it might not be worth this strategy (e.g. do an additional OR instead), considering the cost of LMUL=2
+             yencOffset = RV(vmv_v_x_u8m2)(42, vl2);
+             if(isRaw) lfCompare = RV(vmv_v_x_u8m2)('\n', vl2);
+             escFirst = 0;
+         }
+         outp += numOutputChars;
+     }
+ }
+
+ size_t decoder_rvv_width() {
+     return RV(vsetvlmax_e8m2)();
+ }
+
+ void decoder_set_rvv_funcs() {
+     _do_decode = &do_decode_simd<false, false, decoder_rvv_width, do_decode_rvv<false, false> >;
+     _do_decode_raw = &do_decode_simd<true, false, decoder_rvv_width, do_decode_rvv<true, false> >;
+     _do_decode_end_raw = &do_decode_simd<true, true, decoder_rvv_width, do_decode_rvv<true, true> >;
+     _decode_isa = ISA_LEVEL_RVV;
+ }
+ #else
+ void decoder_set_rvv_funcs() {}
+ #endif
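For orientation, the new RVV kernel above vectorizes the plain yEnc inverse transform: each output byte is input minus 42 (mod 256), a byte following '=' is input minus 42 minus 64, and CR, LF and '=' themselves emit nothing. A minimal scalar sketch of that transform (the helper name is hypothetical; dot-unstuffing and '=y' end detection are omitted):

    #include <stdint.h>
    #include <stddef.h>

    // Scalar reference for the per-byte transform the vector code performs:
    // yencOffset corresponds to the 42/106 subtraction, cmp to the removal
    // of CR, LF and '=' from the output stream.
    static size_t decode_scalar_ref(const uint8_t* src, size_t len, uint8_t* out) {
        size_t outlen = 0;
        int esc = 0; // like escFirst: the previous byte was an unconsumed '='
        for(size_t i = 0; i < len; i++) {
            uint8_t c = src[i];
            if(esc) {
                out[outlen++] = (uint8_t)(c - 42 - 64); // byte after '=' is always data
                esc = 0;
            } else if(c == '=') {
                esc = 1; // the escape character itself is dropped
            } else if(c != '\r' && c != '\n') {
                out[outlen++] = (uint8_t)(c - 42);
            }
        }
        return outlen;
    }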
@@ -4,9 +4,30 @@
  #include "decoder_common.h"
  #include "decoder_sse_base.h"
  
+ void decoder_sse_init(SSELookups* HEDLEY_RESTRICT& lookups) {
+     ALIGN_ALLOC(lookups, sizeof(SSELookups), 16);
+     for(int i=0; i<256; i++) {
+         lookups->BitsSetTable256inv[i] = 8 - (
+             (i & 1) + ((i>>1) & 1) + ((i>>2) & 1) + ((i>>3) & 1) + ((i>>4) & 1) + ((i>>5) & 1) + ((i>>6) & 1) + ((i>>7) & 1)
+         );
+
+ #define _X(n, k) ((((n) & (1<<k)) ? 192ULL : 0ULL) << (k*8))
+         lookups->eqAdd[i] = _X(i, 0) | _X(i, 1) | _X(i, 2) | _X(i, 3) | _X(i, 4) | _X(i, 5) | _X(i, 6) | _X(i, 7);
+ #undef _X
+     }
+     for(int i=0; i<32; i++) {
+         for(int j=0; j<16; j++) {
+             if(i >= 16) // only used for LZCNT
+                 lookups->unshufMask[i*16 + j] = ((31-i)>j ? -1 : 0);
+             else // only used for BSR
+                 lookups->unshufMask[i*16 + j] = (i>j ? -1 : 0);
+         }
+     }
+ }
+
  void decoder_set_sse2_funcs() {
-     decoder_sse_init();
-     decoder_init_lut(lookups->eqFix, lookups->compact);
+     decoder_sse_init(lookups);
+     decoder_init_lut(lookups->compact);
      _do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_SSE2> >;
      _do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_SSE2> >;
      _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_SSE2> >;
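The eqAdd table built in decoder_sse_init maps an 8-bit mask of escaped positions to eight packed byte offsets: each set bit k becomes byte value 192 in byte k, and 192 is -64 mod 256, the extra offset an escaped character needs on top of the usual -42. A small self-check, derived only from the _X construction shown above:

    #include <stdint.h>
    #include <assert.h>

    // Reference for the eqAdd entries: bit k of the index selects byte k
    // of the result; set bytes hold 192 (== -64 mod 256).
    static uint64_t eq_add_ref(uint8_t mask) {
        uint64_t r = 0;
        for(int k = 0; k < 8; k++)
            if(mask & (1 << k)) r |= 192ULL << (k*8);
        return r;
    }

    int main() {
        assert(eq_add_ref(0x05) == 0x0000000000C000C0ULL); // bits 0 and 2 set
        assert(eq_add_ref(0x80) == 0xC000000000000000ULL); // bit 7 set
        return 0;
    }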
@@ -26,13 +26,13 @@
  #endif
  
  #pragma pack(16)
- static struct {
+ typedef struct {
      unsigned char BitsSetTable256inv[256];
      /*align16*/ struct { char bytes[16]; } compact[32768];
-     uint8_t eqFix[256];
      /*align8*/ uint64_t eqAdd[256];
      /*align16*/ int8_t unshufMask[32*16];
- } * HEDLEY_RESTRICT lookups;
+ } SSELookups;
+ static SSELookups* HEDLEY_RESTRICT lookups;
  #pragma pack()
  
  
@@ -45,27 +45,7 @@ static HEDLEY_ALWAYS_INLINE __m128i force_align_read_128(const void* p) {
  #endif
  }
  
- 
- static void decoder_sse_init() {
-     ALIGN_ALLOC(lookups, sizeof(*lookups), 16);
-     for(int i=0; i<256; i++) {
-         lookups->BitsSetTable256inv[i] = 8 - (
-             (i & 1) + ((i>>1) & 1) + ((i>>2) & 1) + ((i>>3) & 1) + ((i>>4) & 1) + ((i>>5) & 1) + ((i>>6) & 1) + ((i>>7) & 1)
-         );
-
- #define _X(n, k) ((((n) & (1<<k)) ? 192ULL : 0ULL) << (k*8))
-         lookups->eqAdd[i] = _X(i, 0) | _X(i, 1) | _X(i, 2) | _X(i, 3) | _X(i, 4) | _X(i, 5) | _X(i, 6) | _X(i, 7);
- #undef _X
-     }
-     for(int i=0; i<32; i++) {
-         for(int j=0; j<16; j++) {
-             if(i >= 16) // only used for LZCNT
-                 lookups->unshufMask[i*16 + j] = ((31-i)>j ? -1 : 0);
-             else // only used for BSR
-                 lookups->unshufMask[i*16 + j] = (i>j ? -1 : 0);
-         }
-     }
- }
+ void decoder_sse_init(SSELookups* HEDLEY_RESTRICT& lookups); // defined in decoder_sse2.cc
  
  
  // for LZCNT/BSR
@@ -521,17 +501,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* src, long& len, unsigned
      dataB = _mm_add_epi8(oDataB, _mm_set1_epi8(-42));
  
      if(LIKELIHOOD(0.0001, (mask & ((maskEq << 1) + escFirst)) != 0)) {
-         // resolve invalid sequences of = to deal with cases like '===='
-         unsigned tmp = lookups->eqFix[(maskEq&0xff) & ~escFirst];
-         uint32_t maskEq2 = tmp;
-         for(int j=8; j<32; j+=8) {
-             tmp = lookups->eqFix[((maskEq>>j)&0xff) & ~(tmp>>7)];
-             maskEq2 |= tmp<<j;
-         }
-         maskEq = maskEq2;
-
+         maskEq = fix_eqMask<uint32_t>(maskEq & ~escFirst);
          mask &= ~escFirst;
-         escFirst = (maskEq >> 31);
+         escFirst = maskEq >> 31;
          // next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed
          maskEq <<= 1;
          mask &= ~maskEq;
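fix_eqMask<uint32_t> replaces the removed eqFix LUT loop, which resolved invalid '=' runs such as '====' byte by byte with carry. Judging from that old code and from how the result is consumed (shifted left by one and ANDed out of the keep-mask), it keeps only the '=' characters that actually start an escape pair. A bit-twiddling sketch of that presumed behaviour, not yencode's actual implementation:

    #include <stdint.h>

    // Presumed semantics of fix_eqMask: given a bitmask of '=' positions,
    // keep only those that start an escape pair. For "====" (0b1111),
    // bits 0 and 2 survive; bits 1 and 3 are escaped data bytes.
    static uint32_t fix_eq_mask_ref(uint32_t maskEq) {
        uint32_t kept = 0;
        while(maskEq) {
            uint32_t lowest = maskEq & -maskEq;  // lowest remaining '='
            kept |= lowest;                      // it starts an escape
            maskEq &= ~(lowest | (lowest << 1)); // the next byte is data, not an escape
        }
        return kept;
    }
    // fix_eq_mask_ref(0b1111) == 0b0101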
@@ -4,8 +4,8 @@
  #include "decoder_common.h"
  #include "decoder_sse_base.h"
  void decoder_set_ssse3_funcs() {
-     decoder_sse_init();
-     decoder_init_lut(lookups->eqFix, lookups->compact);
+     decoder_sse_init(lookups);
+     decoder_init_lut(lookups->compact);
      _do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_SSSE3> >;
      _do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_SSSE3> >;
      _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_SSSE3> >;
@@ -12,9 +12,6 @@ const bool decoder_has_avx10 = false;
  # ifndef YENC_DISABLE_AVX256
  # include "decoder_avx2_base.h"
  void decoder_set_vbmi2_funcs() {
-     ALIGN_ALLOC(lookups, sizeof(*lookups), 16);
-     // TODO: consider removing compact LUT
-     decoder_init_lut(lookups->eqFix, lookups->compact);
      _do_decode = &do_decode_simd<false, false, sizeof(__m256i)*2, do_decode_avx2<false, false, ISA_LEVEL_VBMI2> >;
      _do_decode_raw = &do_decode_simd<true, false, sizeof(__m256i)*2, do_decode_avx2<true, false, ISA_LEVEL_VBMI2> >;
      _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m256i)*2, do_decode_avx2<true, true, ISA_LEVEL_VBMI2> >;
@@ -23,8 +20,6 @@ void decoder_set_vbmi2_funcs() {
  # else
  # include "decoder_sse_base.h"
  void decoder_set_vbmi2_funcs() {
-     decoder_sse_init();
-     decoder_init_lut(lookups->eqFix, lookups->compact);
      _do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_VBMI2> >;
      _do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_VBMI2> >;
      _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_VBMI2> >;
package/src/encoder.cc CHANGED
@@ -2,6 +2,31 @@
  #include "encoder_common.h"
  #include "encoder.h"
  
+
+ // lookup tables for scalar processing
+ #define _B1(n) _B(n), _B(n+1), _B(n+2), _B(n+3)
+ #define _B2(n) _B1(n), _B1(n+4), _B1(n+8), _B1(n+12)
+ #define _B3(n) _B2(n), _B2(n+16), _B2(n+32), _B2(n+48)
+ #define _BX _B3(0), _B3(64), _B3(128), _B3(192)
+
+ const unsigned char escapeLUT[256] = { // whether or not the character is critical
+ #define _B(n) ((n == 214 || n == '\r'+214 || n == '\n'+214 || n == '='-42) ? 0 : (n+42) & 0xff)
+     _BX
+ #undef _B
+ };
+ const uint16_t escapedLUT[256] = { // escaped sequences for characters that need escaping
+ #define _B(n) ((n == 214 || n == 214+'\r' || n == 214+'\n' || n == '='-42 || n == 214+'\t' || n == 214+' ' || n == '.'-42) ? UINT16_PACK('=', ((n+42+64)&0xff)) : 0)
+     _BX
+ #undef _B
+ };
+
+ #undef _B1
+ #undef _B2
+ #undef _B3
+ #undef _BX
+
+
+
  size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, int doEnd) {
      unsigned char* es = (unsigned char*)src + len;
      unsigned char *p = dest; // destination pointer
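The _B macros above simply unroll a 256-entry table at compile time: the index is the raw input byte n, the entry is the encoded byte (n+42 mod 256), and entries are 0 when that output would be NUL, CR, LF or '=' (constants like 214 are 256-42, i.e. the inputs that wrap onto those outputs). An equivalent runtime construction, as a sketch with a hypothetical helper name:

    // Runtime equivalent of the escapeLUT initializer: value is the plain
    // yEnc encoding n+42 (mod 256), or 0 if that byte must be escaped.
    static void build_escape_lut(unsigned char lut[256]) {
        for(int n = 0; n < 256; n++) {
            unsigned char enc = (unsigned char)(n + 42); // yEnc +42 offset
            lut[n] = (enc == 0 || enc == '\r' || enc == '\n' || enc == '=') ? 0 : enc;
        }
    }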
@@ -2,26 +2,8 @@
  #define __YENC_ENCODER_COMMON
  
  // lookup tables for scalar processing
- #define _B1(n) _B(n), _B(n+1), _B(n+2), _B(n+3)
- #define _B2(n) _B1(n), _B1(n+4), _B1(n+8), _B1(n+12)
- #define _B3(n) _B2(n), _B2(n+16), _B2(n+32), _B2(n+48)
- #define _BX _B3(0), _B3(64), _B3(128), _B3(192)
-
- static const unsigned char escapeLUT[256] = { // whether or not the character is critical
- #define _B(n) ((n == 214 || n == '\r'+214 || n == '\n'+214 || n == '='-42) ? 0 : (n+42) & 0xff)
-     _BX
- #undef _B
- };
- static const uint16_t escapedLUT[256] = { // escaped sequences for characters that need escaping
- #define _B(n) ((n == 214 || n == 214+'\r' || n == 214+'\n' || n == '='-42 || n == 214+'\t' || n == 214+' ' || n == '.'-42) ? UINT16_PACK('=', ((n+42+64)&0xff)) : 0)
-     _BX
- #undef _B
- };
-
- #undef _B1
- #undef _B2
- #undef _B3
- #undef _BX
+ extern const unsigned char escapeLUT[256];
+ extern const uint16_t escapedLUT[256];
  
  
  size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, int doEnd);
@@ -4,13 +4,6 @@
  #include "encoder.h"
  #include "encoder_common.h"
  
- # include <riscv_vector.h>
- # if defined(__clang__) && __clang_major__ < 16
- # define RV(f) f
- # else
- # define RV(f) __riscv_##f
- # endif
-
  
  static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RESTRICT _src, long& inpos, uint8_t*& outp, long& col, long lineSizeOffset) {
      // TODO: vectorize
@@ -112,7 +105,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_rvv(int line_size, int* colOffset, const uin
          vl2
      );
  
- #if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 13000
+ #ifdef __riscv_v_intrinsic
      data = RV(vor_vx_u8m2_mu)(cmp, tmpData, tmpData, 64, vl2);
  #else
      data = RV(vor_vx_u8m2_m)(cmp, tmpData, tmpData, 64, vl2);
@@ -122,11 +115,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_rvv(int line_size, int* colOffset, const uin
      size_t count = RV(vcpop_m_b4)(cmp, vl2);
      if(count > 1) {
          // widen mask: 4b->8b
- #if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 13000
-         vuint8mf4_t vcmp = RV(vlmul_trunc_v_u8m1_u8mf4)(RV(vreinterpret_v_b4_u8m1)(cmp));
- #else
-         vuint8mf4_t vcmp = *(vuint8mf4_t*)(&cmp);
- #endif
+         vuint8mf4_t vcmp = RV_VEC_U8MF4_CAST(cmp);
          // TODO: use vwsll instead if available
          // - is clmul useful here?
          vuint8mf2_t xcmp = RV(vreinterpret_v_u16mf2_u8mf2)(RV(vwmulu_vx_u16mf2)(vcmp, 16, vl2));
@@ -134,11 +123,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_rvv(int line_size, int* colOffset, const uin
  
          // expand mask by inserting '1' between each bit (0000abcd -> 1a1b1c1d)
          vuint8m1_t xcmpTmp = RV(vrgather_vv_u8m1)(MASK_EXPAND, RV(vlmul_ext_v_u8mf2_u8m1)(xcmp), vl2);
- #if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 13000
-         vbool2_t cmpmask = RV(vreinterpret_b2)(xcmpTmp);
- #else
-         vbool2_t cmpmask = *(vbool2_t*)(&xcmpTmp);
- #endif
+         vbool2_t cmpmask = RV_MASK_CAST(2, 8, xcmpTmp);
  
          // expand data and insert =
          // TODO: use vwsll instead if available
@@ -149,7 +134,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_rvv(int line_size, int* colOffset, const uin
          // prune unneeded =
          vuint8m4_t dataTmp = RV(vreinterpret_v_u16m4_u8m4)(data2);
          vuint8m4_t final_data = RV(vcompress_vm_u8m4)(
- #if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 13000
+ #ifdef __riscv_v_intrinsic
              dataTmp, cmpmask, vl2*2
  #else
              cmpmask, dataTmp, dataTmp, vl2*2
package/src/platform.cc CHANGED
@@ -138,14 +138,14 @@ int cpu_supports_isa() {
      _cpuidX(cpuInfo2, 7, 1);
      if(cpuInfo2[3] & 0x80000) {
          _cpuidX(cpuInfo2, 0x24, 0);
-         if((cpuInfo2[2] & 0xff) >= 1 && ( // minimum AVX10.1
+         if((cpuInfo2[1] & 0xff) >= 1 && ( // minimum AVX10.1
  #ifdef YENC_DISABLE_AVX256
-             cpuInfo2[2] & 0x10000 // AVX10/128
+             cpuInfo2[1] & 0x10000 // AVX10/128
  #else
-             cpuInfo2[2] & 0x20000 // AVX10/256
+             cpuInfo2[1] & 0x20000 // AVX10/256
  #endif
          )) {
-             if(((xcr & 0xE0) == 0xE0) && (cpuInfo2[2] & 0x40000)) ret |= ISA_FEATURE_EVEX512;
+             if(((xcr & 0xE0) == 0xE0) && (cpuInfo2[1] & 0x40000)) ret |= ISA_FEATURE_EVEX512;
              return ret | ISA_LEVEL_VBMI2;
          }
      }
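The corrected check reads CPUID leaf 0x24 (the AVX10 enumeration leaf) from EBX rather than ECX: the AVX10 converged-vector version number sits in EBX[7:0] and the supported vector widths in EBX bits 16-18, which matches the bit constants in the hunk. A sketch of that layout, assuming _cpuidX fills its array in {EAX, EBX, ECX, EDX} order:

    #include <stdint.h>

    // CPUID.(EAX=24H, ECX=0):EBX layout relied on by the check above.
    struct Avx10Info {
        int version;  // EBX[7:0], >= 1 means AVX10.1
        bool vec128;  // EBX bit 16 (0x10000)
        bool vec256;  // EBX bit 17 (0x20000)
        bool vec512;  // EBX bit 18 (0x40000)
    };

    static Avx10Info parse_avx10(uint32_t ebx) {
        Avx10Info info;
        info.version = (int)(ebx & 0xff);
        info.vec128 = (ebx & 0x10000) != 0;
        info.vec256 = (ebx & 0x20000) != 0;
        info.vec512 = (ebx & 0x40000) != 0;
        return info;
    }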
package/src/yencode.cc CHANGED
@@ -383,7 +383,7 @@ FUNC(CRC32Combine) {
      uint32_t crc1 = read_crc32(args[0]), crc2 = read_crc32(args[1]);
      size_t len = (size_t)ARG_TO_INT(args[2]);
  
-     crc1 = do_crc32_combine(crc1, crc2, len);
+     crc1 = crc32_combine(crc1, crc2, len);
      RETURN_VAL(pack_crc32(ISOLATE crc1));
  }
  
@@ -399,11 +399,51 @@ FUNC(CRC32Zeroes) {
              RETURN_ERROR("Second argument must be a 4 byte buffer");
          crc1 = read_crc32(args[1]);
      }
-     size_t len = (size_t)ARG_TO_INT(args[0]);
-     crc1 = do_crc32_zeros(crc1, len);
+     int len = ARG_TO_INT(args[0]);
+     if(len < 0)
+         crc1 = crc32_unzero(crc1, -len);
+     else
+         crc1 = crc32_zeros(crc1, len);
      RETURN_VAL(pack_crc32(ISOLATE crc1));
  }
  
+ FUNC(CRC32Multiply) {
+     FUNC_START;
+
+     if (args.Length() < 2)
+         RETURN_ERROR("At least 2 arguments required");
+
+     if (!node::Buffer::HasInstance(args[0]) || node::Buffer::Length(args[0]) != 4
+         || !node::Buffer::HasInstance(args[1]) || node::Buffer::Length(args[1]) != 4)
+         RETURN_ERROR("You must supply a 4 byte Buffer for the first two arguments");
+
+     uint32_t crc1 = read_crc32(args[0]);
+     uint32_t crc2 = read_crc32(args[1]);
+     crc1 = crc32_multiply(crc1, crc2);
+     RETURN_VAL(pack_crc32(ISOLATE crc1));
+ }
+
+ FUNC(CRC32Shift) {
+     FUNC_START;
+
+     if (args.Length() < 1)
+         RETURN_ERROR("At least 1 argument required");
+
+     uint32_t crc1 = 0x80000000;
+     if (args.Length() >= 2) {
+         if (!node::Buffer::HasInstance(args[1]) || node::Buffer::Length(args[1]) != 4)
+             RETURN_ERROR("Second argument must be a 4 byte buffer");
+         crc1 = read_crc32(args[1]);
+     }
+     int n = ARG_TO_INT(args[0]);
+     if(n < 0)
+         crc1 = crc32_shift(crc1, ~crc32_powmod(-n));
+     else
+         crc1 = crc32_shift(crc1, crc32_powmod(n));
+     RETURN_VAL(pack_crc32(ISOLATE crc1));
+ }
+
+
  static void init_all() {
      encoder_init();
      decoder_init();
@@ -436,6 +476,8 @@ void yencode_init(
      NODE_SET_METHOD(exports, "crc32", CRC32);
      NODE_SET_METHOD(exports, "crc32_combine", CRC32Combine);
      NODE_SET_METHOD(exports, "crc32_zeroes", CRC32Zeroes);
+     NODE_SET_METHOD(exports, "crc32_multiply", CRC32Multiply);
+     NODE_SET_METHOD(exports, "crc32_shift", CRC32Shift);
  
  #if NODE_VERSION_AT_LEAST(10, 7, 0)
      uv_once(&init_once, init_all);
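The new crc32_multiply and crc32_shift exports treat CRC-32 values as polynomials over GF(2) modulo the CRC polynomial, in the reflected bit order where the constant 1 is 0x80000000 and x is 0x40000000 (hence the '2^0' and '2^1' expectations in the tests that follow). A plain bitwise sketch of such a multiply, consistent with those test vectors but not necessarily yencode's implementation:

    #include <stdint.h>

    // Carry-less multiply modulo the reflected CRC-32 polynomial. In this
    // bit order, multiplying by x is a right shift with a conditional
    // reduction by 0xEDB88320.
    static uint32_t gf2_crc32_mul(uint32_t a, uint32_t b) {
        uint32_t prod = 0;
        while(b) {
            if(b & 0x80000000) prod ^= a; // add a * (current power of x)
            b <<= 1;
            a = (a >> 1) ^ ((a & 1) ? 0xEDB88320 : 0); // a *= x (mod poly)
        }
        return prod;
    }
    // gf2_crc32_mul(v, 0x80000000) == v: multiplying by 1 is the identity,
    // matching the 'Multiply by 1' test below.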
package/test/testcrc.js CHANGED
@@ -14,7 +14,7 @@ var ycrc32 = function(s) {
  };
  var doTest = function(msg, f, test, expected) {
      if(!Array.isArray(test)) test = [test];
-     test[0] = Buffer(test[0]);
+     if(!Buffer.isBuffer(test[0])) test[0] = Buffer(test[0]);
      if(!expected && test.length == 1 && f == 'crc32') expected = crc32(test[0]).toString('hex');
      else if(Buffer.isBuffer(expected)) expected = expected.toString('hex');
      assert.equal(y[f].apply(null, test).toString('hex'), expected, msg);
@@ -45,6 +45,22 @@ assert.equal(y.crc32_zeroes(1, ycrc32('')).toString('hex'), 'd202ef8d', 'Zeroes-
  assert.equal(y.crc32_zeroes(1, ycrc32('')).toString('hex'), 'd202ef8d', 'Zeroes-1');
  assert.equal(y.crc32_zeroes(0, ycrc32('z')).toString('hex'), crc32('z').toString('hex'), 'Zeroes Empty Join');
  assert.equal(y.crc32_zeroes(4, ycrc32('z')).toString('hex'), crc32('z\u0000\u0000\u0000\u0000').toString('hex'), 'Zeroes (4) Join');
+ assert.equal(y.crc32_zeroes(4, y.crc32_zeroes(-4)).toString('hex'), '00000000', 'Zeroes (-4+4)');
+ assert.equal(y.crc32_zeroes(-4, y.crc32_zeroes(4)).toString('hex'), '00000000', 'Zeroes (4-4)');
+ assert.equal(y.crc32_zeroes(6, y.crc32_zeroes(-2, ycrc32('z'))).toString('hex'), crc32('z\u0000\u0000\u0000\u0000').toString('hex'), 'Zeroes (-2+6) Join');
+
+
+ assert.equal(y.crc32_multiply(Buffer([1,2,3,4]), ycrc32('')).toString('hex'), '00000000', 'Multiply by 0');
+ assert.equal(y.crc32_multiply(Buffer([0x80,0,0,0]), Buffer([0x80,0,0,0])).toString('hex'), '80000000', 'Multiply by 1');
+ assert.equal(y.crc32_multiply(Buffer([1,2,3,4]), Buffer([5,6,7,8])).toString('hex'), '81e243a3', 'Multiply random');
+
+ assert.equal(y.crc32_shift(0).toString('hex'), '80000000', '2^0');
+ assert.equal(y.crc32_shift(1).toString('hex'), '40000000', '2^1');
+ assert.equal(y.crc32_shift(2).toString('hex'), '20000000', '2^2');
+ assert.equal(y.crc32_shift(-1).toString('hex'), 'db710641', '2^-1');
+ assert.equal(y.crc32_shift(-1, y.crc32_shift(1)).toString('hex'), '80000000', '2^(1-1)');
+ assert.equal(y.crc32_shift(4, y.crc32_shift(-2)).toString('hex'), '20000000', '2^(-2+4)');
+
  
  doTest('Random', 'crc32', 'fj[-oqijnw34-59n26 4345j8yn89032q78t9ab9gabh023quhoiBO Z GEB780a sdasdq2345673-98hq2-9348h-na9we8zdfgh-n9 8qwhn-098');
  doTest('Random Continue', 'crc32', ['KZSHZ5EDOVAmDdakZZOrGSUGGKSpCJoWH7M0MHy6ohnSzvHY4DjpxXmyfWYJQoJ7tKdNhGcuRVUzrgXM', ycrc32('BdenbmoBgiB10ZkeUBjrsZV3dg2Da2fhHqU9TMdi69AHhLRck3Nk60YuFBXh6lvtefBpjdTxbeEmsaEm')], crc32('BdenbmoBgiB10ZkeUBjrsZV3dg2Da2fhHqU9TMdi69AHhLRck3Nk60YuFBXh6lvtefBpjdTxbeEmsaEmKZSHZ5EDOVAmDdakZZOrGSUGGKSpCJoWH7M0MHy6ohnSzvHY4DjpxXmyfWYJQoJ7tKdNhGcuRVUzrgXM'));