yencode 1.1.4 → 1.2.0

This diff shows the changes between the two publicly released versions of the package, as published to its public registry. It is provided for informational purposes only.
@@ -1,9 +1,6 @@
  #include "common.h"
  #ifdef __ARM_NEON

- #ifndef __aarch64__
- #define YENC_DEC_USE_THINTABLE 1
- #endif
  #include "decoder_common.h"


@@ -43,8 +40,6 @@ static struct { char bytes[16]; } ALIGN_TO(16, compactLUT[32768]);
  # pragma pack()
  #endif

- static uint8_t eqFixLUT[256];
-


  static bool neon_vect_is_nonzero(uint8x16_t v) {
@@ -78,6 +73,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
  lfCompare = vsetq_lane_u8('.', lfCompare, 1);
  }
  #endif
+
+ decoder_set_nextMask<isRaw>(src, len, nextMask);
+
  long i;
  for(i = -len; i; i += sizeof(uint8x16_t)*2) {
  uint8x16x2_t data = vld1q_u8_x2_align(src+i, 32);
@@ -251,6 +249,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
  // terminator found
  // there's probably faster ways to do this, but reverting to scalar code should be good enough
  len += i;
+ nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
  break;
  }
  }
@@ -301,6 +300,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
  );
  if(LIKELIHOOD(0.001, neon_vect_is_nonzero(matchEnd))) {
  len += i;
+ nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
  break;
  }
  }
@@ -323,18 +323,13 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
  // the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that
  // firstly, check for invalid sequences of = (we assume that these are rare, as a spec compliant yEnc encoder should not generate these)
  if(LIKELIHOOD(0.0001, (mask & ((maskEq << 1) | escFirst)) != 0)) {
- uint8_t tmp = eqFixLUT[(maskEq&0xff) & ~escFirst];
- uint32_t maskEq2 = tmp;
- for(int j=8; j<32; j+=8) {
- tmp = eqFixLUT[((maskEq>>j)&0xff) & ~(tmp>>7)];
- maskEq2 |= tmp<<j;
- }
- maskEq = maskEq2;
+ maskEq = fix_eqMask<uint32_t>(maskEq & ~escFirst);

+ unsigned char nextEscFirst = maskEq>>31;
  // next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed
  maskEq = (maskEq<<1) | escFirst;
  mask &= ~maskEq;
- escFirst = tmp>>7;
+ escFirst = nextEscFirst;

  // unescape chars following `=`
  uint8x8_t maskEqTemp = vreinterpret_u8_u32(vmov_n_u32(maskEq));
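
Note: fix_eqMask<T>() is not shown in this diff (it presumably lives in decoder_common.h), but judging from the eqFixLUT byte loop it replaces above, it is assumed to resolve runs of '=' such as "====" the same way: a '=' that directly follows a kept '=' is the escaped data byte, not a new escape, so its bit is dropped. A rough scalar reference, illustration only and not the package's implementation:

    // Illustrative reference only: keep just the '=' bits that start an escape sequence.
    template<typename T>
    static T fix_eqMask_reference(T maskEq) {
        T kept = 0;
        bool prevKept = false;
        for(unsigned i = 0; i < sizeof(T)*8; i++) {
            bool isEq = (maskEq >> i) & 1;
            bool keep = isEq && !prevKept;  // a '=' after a kept '=' is data, not an escape
            kept |= (T)keep << i;
            prevKept = keep;
        }
        return kept;
    }

The top bit of the fixed mask (maskEq>>31 here, maskEq>>63 in the 64-bit variant below) then indicates whether the block ends on an active escape, which is what the new nextEscFirst/escFirst handling carries into the next iteration.
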
@@ -449,25 +444,14 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
  #endif
  }
  }
-
- if(isRaw) {
- if(len != 0) { // have to gone through at least one loop cycle
- if(src[i-2] == '\r' && src[i-1] == '\n' && src[i] == '.')
- nextMask = 1;
- else if(src[i-1] == '\r' && src[i] == '\n' && src[i+1] == '.')
- nextMask = 2;
- else
- nextMask = 0;
- }
- } else
- nextMask = 0;
  }

  void decoder_set_neon_funcs() {
- decoder_init_lut(eqFixLUT, compactLUT);
+ decoder_init_lut(compactLUT);
  _do_decode = &do_decode_simd<false, false, sizeof(uint8x16_t)*2, do_decode_neon<false, false> >;
  _do_decode_raw = &do_decode_simd<true, false, sizeof(uint8x16_t)*2, do_decode_neon<true, false> >;
  _do_decode_end_raw = &do_decode_simd<true, true, sizeof(uint8x16_t)*2, do_decode_neon<true, true> >;
+ _decode_isa = ISA_LEVEL_NEON;
  }
  #else
  void decoder_set_neon_funcs() {}
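
Note: the per-kernel scalar tail removed above (the trailing if(isRaw) block) is now centralised in decoder_set_nextMask<isRaw>(), whose implementation is not part of this diff. Judging from the removed code, the state it has to carry into the next block is roughly the following (illustration only; the real helper presumably also covers the non-raw and zero-length cases, plus the mask-based overload used at the break points):

    // 1 => block ended right after "\r\n", the next byte is a stuffed '.'
    // 2 => block ended between '\r' and '\n', with a stuffed '.' after that
    // 0 => nothing to carry over
    static uint16_t next_block_dot_state(const uint8_t* end) { // end = first unprocessed byte
        if(end[-2] == '\r' && end[-1] == '\n' && end[0] == '.') return 1;
        if(end[-1] == '\r' && end[0] == '\n' && end[1] == '.') return 2;
        return 0;
    }
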
@@ -7,8 +7,6 @@
  static struct { char bytes[16]; } ALIGN_TO(16, compactLUT[32768]);
  #pragma pack()

- static uint8_t eqFixLUT[256];
-

  // AArch64 GCC lacks these functions until 8.5, 9.4 and 10.1 (10.0 unknown)
  #if !defined(__clang__) && !defined(_MSC_VER) && (!defined(__aarch64__) || !(HEDLEY_GCC_VERSION_CHECK(9,4,0) || (!HEDLEY_GCC_VERSION_CHECK(9,0,0) && HEDLEY_GCC_VERSION_CHECK(8,5,0))))
@@ -56,6 +54,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
  if(nextMask == 2)
  nextMaskMix = vsetq_lane_u8(2, nextMaskMix, 1);
  uint8x16_t yencOffset = escFirst ? vmakeq_u8(42+64,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42) : vdupq_n_u8(42);
+
+ decoder_set_nextMask<isRaw>(src, len, nextMask);
+
  long i;
  for(i = -len; i; i += sizeof(uint8x16_t)*4) {
  uint8x16x4_t data = _vld1q_u8_x4(src+i);
@@ -227,6 +228,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
  // terminator found
  // there's probably faster ways to do this, but reverting to scalar code should be good enough
  len += i;
+ nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
  break;
  }
  }
@@ -275,6 +277,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
  );
  if(LIKELIHOOD(0.001, neon_vect_is_nonzero(matchEnd))) {
  len += i;
+ nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
  break;
  }
  }
@@ -288,18 +291,13 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
  // the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that
  // firstly, check for invalid sequences of = (we assume that these are rare, as a spec compliant yEnc encoder should not generate these)
  if(LIKELIHOOD(0.0001, (mask & ((maskEq << 1) | escFirst)) != 0)) {
- uint8_t tmp = eqFixLUT[(maskEq&0xff) & ~escFirst];
- uint64_t maskEq2 = tmp;
- for(int j=8; j<64; j+=8) {
- tmp = eqFixLUT[((maskEq>>j)&0xff) & ~(tmp>>7)];
- maskEq2 |= ((uint64_t)tmp)<<j;
- }
- maskEq = maskEq2;
+ maskEq = fix_eqMask<uint64_t>(maskEq & ~(uint64_t)escFirst);

+ unsigned char nextEscFirst = maskEq>>63;
  // next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed
  maskEq = (maskEq<<1) | escFirst;
  mask &= ~maskEq;
- escFirst = tmp>>7;
+ escFirst = nextEscFirst;

  // unescape chars following `=`
  #if defined(__GNUC__) && !defined(__clang__)
@@ -430,24 +428,14 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
  yencOffset = vdupq_n_u8(42);
  }
  }
- if(isRaw) {
- if(len != 0) { // have to gone through at least one loop cycle
- if(src[i-2] == '\r' && src[i-1] == '\n' && src[i] == '.')
- nextMask = 1;
- else if(src[i-1] == '\r' && src[i] == '\n' && src[i+1] == '.')
- nextMask = 2;
- else
- nextMask = 0;
- }
- } else
- nextMask = 0;
  }

  void decoder_set_neon_funcs() {
- decoder_init_lut(eqFixLUT, compactLUT);
+ decoder_init_lut(compactLUT);
  _do_decode = &do_decode_simd<false, false, sizeof(uint8x16_t)*4, do_decode_neon<false, false> >;
  _do_decode_raw = &do_decode_simd<true, false, sizeof(uint8x16_t)*4, do_decode_neon<true, false> >;
  _do_decode_end_raw = &do_decode_simd<true, true, sizeof(uint8x16_t)*4, do_decode_neon<true, true> >;
+ _decode_isa = ISA_LEVEL_NEON;
  }
  #else
  void decoder_set_neon_funcs() {}
@@ -0,0 +1,274 @@
+ #include "common.h"
+ #ifdef __riscv_vector
+ #include "decoder_common.h"
+
+
+ #ifdef __riscv_v_intrinsic
+ # define RV_vmerge_vxm_u8m2 RV(vmerge_vxm_u8m2)
+ # define RV_vmerge_vxm_u16m2 RV(vmerge_vxm_u16m2)
+ #else
+ # define RV_vmerge_vxm_u8m2(v, x, m, vl) RV(vmerge_vxm_u8m2)(m, v, x, vl)
+ # define RV_vmerge_vxm_u16m2(v, x, m, vl) RV(vmerge_vxm_u16m2)(m, v, x, vl)
+ #endif
+
+ #if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 12000
+ # define RV_VEC_CAST(masksz, vecsz, vec) RV(vreinterpret_v_b##masksz##_u##vecsz##m1)(vec)
+ #else
+ # define RV_VEC_CAST(masksz, vecsz, vec) *(vuint##vecsz##m1_t*)(&(vec))
+ #endif
+
+
+ template<int shift>
+ static inline vbool4_t mask_lshift(vbool4_t m, unsigned shiftIn, size_t vl) {
+ vuint8m1_t mv = RV_VEC_CAST(4, 8, m);
+ vuint8m1_t mvl = RV(vsll_vx_u8m1)(mv, shift, vl/8);
+ vuint8m1_t mvr = RV(vsrl_vx_u8m1)(mv, 8-shift, vl/8);
+ mvr = RV(vslide1up_vx_u8m1)(mvr, shiftIn, vl/8);
+
+ return RV(vmor_mm_b4)(
+ RV_MASK_CAST(4, 8, mvl), RV_MASK_CAST(4, 8, mvr), vl
+ );
+ }
+
+ static inline vuint8m2_t set_first_vu8(vuint8m2_t src, uint8_t item, size_t vl) {
+ #ifdef __riscv_v_intrinsic
+ return RV(vmv_s_x_u8m2_tu)(src, item, vl);
+ #else
+ vuint8m1_t m = RV(vslide1up_vx_u8m1)(RV(vmv_v_x_u8m1)(0, ~0), 1, ~0);
+ return RV_vmerge_vxm_u8m2(src, item, RV_MASK_CAST(4, 8, m), vl);
+ #endif
+ }
+ static inline vuint16m2_t set_first_vu16(vuint16m2_t src, uint16_t item, size_t vl) {
+ #ifdef __riscv_v_intrinsic
+ return RV(vmv_s_x_u16m2_tu)(src, item, vl);
+ #else
+ vuint16m1_t m = RV(vslide1up_vx_u16m1)(RV(vmv_v_x_u16m1)(0, ~0), 1, ~0);
+ return RV_vmerge_vxm_u16m2(src, item, RV_MASK_CAST(8, 16, m), vl);
+ #endif
+ }
+
+
+
+ template<bool isRaw, bool searchEnd>
+ HEDLEY_ALWAYS_INLINE void do_decode_rvv(const uint8_t* src, long& len, unsigned char*& outp, unsigned char& escFirst, uint16_t& nextMask) {
+ HEDLEY_ASSUME(escFirst == 0 || escFirst == 1);
+ HEDLEY_ASSUME(nextMask == 0 || nextMask == 1 || nextMask == 2);
+
+ size_t vl2 = RV(vsetvlmax_e8m2)();
+
+ vuint8m2_t yencOffset = RV(vmv_v_x_u8m2)(42, vl2);
+ if(escFirst) yencOffset = set_first_vu8(yencOffset, 42+64, vl2);
+ vuint8m2_t lfCompare = RV(vmv_v_x_u8m2)('\n', vl2);
+ if(nextMask && isRaw) {
+ lfCompare = RV(vreinterpret_v_u16m2_u8m2)(
+ set_first_vu16(RV(vreinterpret_v_u8m2_u16m2)(lfCompare), nextMask == 1 ? 0x0a2e /*".\n"*/ : 0x2e0a /*"\n."*/, vl2/2)
+ );
+ }
+
+ // mask where only the highest bit is set
+ vbool4_t lastBit = RV(vmseq_vx_u8m2_b4)(
+ RV(vslide1down_vx_u8m2)(RV(vmv_v_x_u8m2)(0, vl2), 1, vl2),
+ 1, vl2
+ );
+
+ decoder_set_nextMask<isRaw>(src, len, nextMask);
+
+ // TODO: consider exploiting partial vector capability
+ long inpos;
+ for(inpos = -len; inpos; inpos += vl2) {
+ vuint8m2_t data = RV(vle8_v_u8m2)(src + inpos, vl2);
+
+ // search for special chars
+ vbool4_t cmpEq = RV(vmseq_vx_u8m2_b4)(data, '=', vl2);
+ vbool4_t cmpCr = RV(vmseq_vx_u8m2_b4)(data, '\r', vl2);
+ // note: cmp is always negated (unlike cmpEq/Cr)
+ vbool4_t cmp = RV(vmnor_mm_b4)(
+ RV(vmor_mm_b4)(cmpEq, cmpCr, vl2),
+ isRaw ? RV(vmseq_vv_u8m2_b4)(data, lfCompare, vl2) : RV(vmseq_vx_u8m2_b4)(data, '\n', vl2),
+ vl2
+ );
+
+ size_t numOutputChars = RV(vcpop_m_b4)(cmp, vl2);
+
+ if(numOutputChars != vl2) {
+ // dot-unstuffing + end detection
+ if((isRaw || searchEnd) && RV(vcpop_m_b4)(RV(vmxnor_mm_b4)(cmp, cmpEq, vl2), vl2)) {
+ uint32_t nextWord;
+ if(!searchEnd) {
+ memcpy(&nextWord, src + inpos + vl2, 2);
+ } else {
+ memcpy(&nextWord, src + inpos + vl2, 4);
+ }
+ vuint8m2_t nextData2 = RV(vreinterpret_v_u16m2_u8m2)(RV(vslide1down_vx_u16m2)(RV(vreinterpret_v_u8m2_u16m2)(data), nextWord, vl2/2));
+
+ vbool4_t match2Cr_Dot, match3EqY;
+ vuint8m2_t nextData3;
+ if(isRaw) {
+ match2Cr_Dot = RV(vmand_mm_b4)(cmpCr, RV(vmseq_vx_u8m2_b4)(nextData2, '.', vl2), vl2);
+ }
+
+ if(searchEnd) {
+ nextData3 = RV(vslide1down_vx_u8m2)(nextData2, nextWord>>16, vl2);
+ match3EqY = RV(vmand_mm_b4)(
+ RV(vmseq_vx_u8m2_b4)(nextData2, '=', vl2),
+ RV(vmseq_vx_u8m2_b4)(nextData3, 'y', vl2),
+ vl2
+ );
+ }
+
+ // find patterns of \r_.
+ if(isRaw && LIKELIHOOD(0.001, RV(vcpop_m_b4)(match2Cr_Dot, vl2) > 0)) {
+ // find \r\n.
+ vuint8m2_t nextData1 = RV(vslide1down_vx_u8m2)(data, nextWord, vl2);
+ vbool4_t match1Lf = RV(vmseq_vx_u8m2_b4)(nextData1, '\n', vl2);
+ vbool4_t match2NlDot = RV(vmand_mm_b4)(match2Cr_Dot, match1Lf, vl2);
+
+ if(searchEnd) {
+ vbool4_t match1Nl = RV(vmand_mm_b4)(cmpCr, match1Lf, vl2);
+
+ vuint8m2_t nextData4 = RV(vreinterpret_v_u32m2_u8m2)(RV(vslide1down_vx_u32m2)(RV(vreinterpret_v_u8m2_u32m2)(data), nextWord, vl2/4));
+
+ // match instances of \r\n.\r\n and \r\n.=y
+ vbool4_t match4Nl = RV(vmand_mm_b4)(
+ RV(vmseq_vx_u8m2_b4)(nextData3, '\r', vl2),
+ RV(vmseq_vx_u8m2_b4)(nextData4, '\n', vl2),
+ vl2
+ );
+ vbool4_t match4EqY = RV(vmand_mm_b4)(
+ RV(vmseq_vx_u8m2_b4)(nextData3, '=', vl2),
+ RV(vmseq_vx_u8m2_b4)(nextData4, 'y', vl2),
+ vl2
+ );
+
+ // merge \r\n and =y matches
+ vbool4_t match4End = RV(vmor_mm_b4)(match4Nl, match4EqY, vl2);
+ // merge with \r\n.
+ match4End = RV(vmand_mm_b4)(match4End, match2NlDot, vl2);
+ // merge \r\n=y
+ vbool4_t match3End = RV(vmand_mm_b4)(match1Nl, match3EqY, vl2);
+
+ vbool4_t matchEnd = RV(vmor_mm_b4)(match4End, match3End, vl2);
+
+ // combine match sequences
+ if(LIKELIHOOD(0.001, RV(vcpop_m_b4)(matchEnd, vl2) > 0)) {
+ // terminator found
+ len += inpos;
+ nextMask = decoder_set_nextMask<isRaw>(src+inpos, ~RV(vmv_x_s_u8m1_u8)(RV_VEC_CAST(4, 8, cmp)));
+ break;
+ }
+ }
+
+ // shift match2NlDot by 2
+ cmp = RV(vmandn_mm_b4)(cmp, mask_lshift<2>(match2NlDot, 0, vl2), vl2);
+ numOutputChars = RV(vcpop_m_b4)(cmp, vl2);
+
+ vuint8mf4_t nextNlDot = RV(vslidedown_vx_u8mf4)(
+ #ifndef __riscv_v_intrinsic
+ RV(vmv_v_x_u8mf4)(0, vl2/8),
+ #endif
+ RV_VEC_U8MF4_CAST(match2NlDot), vl2/8-1, vl2/8
+ );
+ nextNlDot = RV(vsrl_vx_u8mf4)(nextNlDot, 6, vl2/8);
+ vuint8m1_t nextNlDotVec = RV(vlmul_ext_v_u8mf4_u8m1)(nextNlDot);
+ lfCompare = RV_vmerge_vxm_u8m2(RV(vmv_v_x_u8m2)('\n', vl2), '.', RV_MASK_CAST(4, 8, nextNlDotVec), vl2);
+ } else if(searchEnd) {
+ if(LIKELIHOOD(0.001, RV(vcpop_m_b4)(match3EqY, vl2) != 0)) {
+ vuint8m2_t nextData1 = RV(vslide1down_vx_u8m2)(data, nextWord, vl2);
+ vbool4_t match1Lf = RV(vmseq_vx_u8m2_b4)(nextData1, '\n', vl2);
+ vbool4_t matchEnd = RV(vmand_mm_b4)(RV(vmand_mm_b4)(match3EqY, cmpCr, vl2), match1Lf, vl2);
+ if(LIKELIHOOD(0.001, RV(vcpop_m_b4)(matchEnd, vl2) > 0)) {
+ len += inpos;
+ nextMask = decoder_set_nextMask<isRaw>(src+inpos, ~RV(vmv_x_s_u8m1_u8)(RV_VEC_CAST(4, 8, cmp)));
+ break;
+ }
+ }
+ if(isRaw)
+ lfCompare = RV(vmv_v_x_u8m2)('\n', vl2);
+ } else if(isRaw) // no \r_. found
+ lfCompare = RV(vmv_v_x_u8m2)('\n', vl2);
+ }
+
+ // the second character in an escape sequence
+ vbool4_t cmpEqShift1 = mask_lshift<1>(cmpEq, escFirst, vl2);
+
+ // a spec compliant encoder should never generate sequences: ==, =\n and =\r, but we'll handle them to be spec compliant
+ // the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that
+ // firstly, check for invalid sequences of = (we assume that these are rare, as a spec compliant yEnc encoder should not generate these)
+ if(LIKELIHOOD(0.0001, RV(vcpop_m_b4)(RV(vmandn_mm_b4)(cmpEqShift1, cmp, vl2), vl2) != 0)) {
+ // note: we assume that uintptr_t corresponds with __riscv_xlen
+ #if __riscv_xlen == 64
+ vuint64m1_t cmpEqW = RV_VEC_CAST(4, 64, cmpEq);
+ #else
+ vuint32m1_t cmpEqW = RV_VEC_CAST(4, 32, cmpEq);
+ #endif
+ size_t nextShiftDown = (vl2 > sizeof(uintptr_t)*8 ? sizeof(uintptr_t)*8 : vl2) - 1;
+ size_t wvl = (vl2 + sizeof(uintptr_t)*8 -1) / (sizeof(uintptr_t)*8);
+ for(size_t w=0; w<vl2; w+=sizeof(uintptr_t)*8) {
+ // extract bottom word
+ #if __riscv_xlen == 64
+ uintptr_t maskW = RV(vmv_x_s_u64m1_u64)(cmpEqW);
+ #else
+ uintptr_t maskW = RV(vmv_x_s_u32m1_u32)(cmpEqW);
+ #endif
+
+ // fix it
+ maskW = fix_eqMask<uintptr_t>(maskW & ~(uintptr_t)escFirst);
+ uint8_t nextEscFirst = (maskW >> nextShiftDown) & 1;
+
+ // shift it up (will be used for cmpEqShift1)
+ maskW = (maskW<<1) | escFirst; // TODO: should this be done using mask_lshift<1> instead?
+ escFirst = nextEscFirst;
+
+ // slide the new value in from the top
+ #if __riscv_xlen == 64
+ cmpEqW = RV(vslide1down_vx_u64m1)(cmpEqW, maskW, wvl);
+ #else
+ cmpEqW = RV(vslide1down_vx_u32m1)(cmpEqW, maskW, wvl);
+ #endif
+ }
+ #if __riscv_xlen == 64
+ cmpEqShift1 = RV_MASK_CAST(4, 64, cmpEqW);
+ #else
+ cmpEqShift1 = RV_MASK_CAST(4, 32, cmpEqW);
+ #endif
+ cmp = RV(vmor_mm_b4)(cmpEqShift1, cmp, vl2); // ~(~cmp & ~cmpEqShift1)
+ numOutputChars = RV(vcpop_m_b4)(cmp, vl2);
+ } else {
+ // no invalid = sequences found - don't need to fix up cmpEq
+ escFirst = RV(vcpop_m_b4)(RV(vmand_mm_b4)(cmpEq, lastBit, vl2), vl2);
+ }
+ data = RV(vsub_vv_u8m2)(data, RV_vmerge_vxm_u8m2(yencOffset, 64+42, cmpEqShift1, vl2), vl2);
+ yencOffset = set_first_vu8(yencOffset, 42 | (escFirst<<6), vl2);
+
+ // all that's left is to remove unwanted chars
+ #ifdef __riscv_v_intrinsic
+ data = RV(vcompress_vm_u8m2)(data, cmp, vl2);
+ #else
+ data = RV(vcompress_vm_u8m2)(cmp, data, data, vl2);
+ #endif
+ RV(vse8_v_u8m2)(outp, data, vl2);
+ } else {
+ data = RV(vsub_vv_u8m2)(data, yencOffset, vl2);
+ RV(vse8_v_u8m2)(outp, data, vl2);
+ // TODO: should these be done at LMUL=1? or, it might not be worth this strategy (e.g. do an additional OR instead), considering the cost of LMUL=2
+ yencOffset = RV(vmv_v_x_u8m2)(42, vl2);
+ if(isRaw) lfCompare = RV(vmv_v_x_u8m2)('\n', vl2);
+ escFirst = 0;
+ }
+ outp += numOutputChars;
+ }
+ }
+
+ size_t decoder_rvv_width() {
+ return RV(vsetvlmax_e8m2)();
+ }
+
+ void decoder_set_rvv_funcs() {
+ _do_decode = &do_decode_simd<false, false, decoder_rvv_width, do_decode_rvv<false, false> >;
+ _do_decode_raw = &do_decode_simd<true, false, decoder_rvv_width, do_decode_rvv<true, false> >;
+ _do_decode_end_raw = &do_decode_simd<true, true, decoder_rvv_width, do_decode_rvv<true, true> >;
+ _decode_isa = ISA_LEVEL_RVV;
+ }
+ #else
+ void decoder_set_rvv_funcs() {}
+ #endif
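
Note: mask_lshift<shift>() in the new RVV kernel shifts a whole mask register left by shift bits by viewing it as a vector of bytes: each byte is shifted left, the bits that fall off the top of one byte are slid (vslide1up) into the bottom of the next, and shiftIn supplies the bits entering at position 0. A scalar picture of the same operation on a flat word-sized mask, illustration only:

    // Equivalent of mask_lshift<shift>(mask, shiftIn) on a flat 64-bit mask:
    // the vector version produces the same bit pattern, chunked into bytes.
    static uint64_t mask_lshift_reference(uint64_t mask, unsigned shift, uint64_t shiftIn) {
        return (mask << shift) | shiftIn;  // e.g. shift = 1, shiftIn = escFirst
    }

This mirrors the scalar expression maskEq = (maskEq<<1) | escFirst used by the NEON and SSE kernels.
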
@@ -4,12 +4,34 @@
  #include "decoder_common.h"
  #include "decoder_sse_base.h"

+ void decoder_sse_init(SSELookups* HEDLEY_RESTRICT& lookups) {
+ ALIGN_ALLOC(lookups, sizeof(SSELookups), 16);
+ for(int i=0; i<256; i++) {
+ lookups->BitsSetTable256inv[i] = 8 - (
+ (i & 1) + ((i>>1) & 1) + ((i>>2) & 1) + ((i>>3) & 1) + ((i>>4) & 1) + ((i>>5) & 1) + ((i>>6) & 1) + ((i>>7) & 1)
+ );
+
+ #define _X(n, k) ((((n) & (1<<k)) ? 192ULL : 0ULL) << (k*8))
+ lookups->eqAdd[i] = _X(i, 0) | _X(i, 1) | _X(i, 2) | _X(i, 3) | _X(i, 4) | _X(i, 5) | _X(i, 6) | _X(i, 7);
+ #undef _X
+ }
+ for(int i=0; i<32; i++) {
+ for(int j=0; j<16; j++) {
+ if(i >= 16) // only used for LZCNT
+ lookups->unshufMask[i*16 + j] = ((31-i)>j ? -1 : 0);
+ else // only used for BSR
+ lookups->unshufMask[i*16 + j] = (i>j ? -1 : 0);
+ }
+ }
+ }
+
  void decoder_set_sse2_funcs() {
- decoder_sse_init();
- decoder_init_lut(lookups->eqFix, lookups->compact);
+ decoder_sse_init(lookups);
+ decoder_init_lut(lookups->compact);
  _do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_SSE2> >;
  _do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_SSE2> >;
  _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_SSE2> >;
+ _decode_isa = ISA_LEVEL_SSE2;
  }
  #else
  void decoder_set_sse2_funcs() {}
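
Note: a worked example (not part of the package) of the eqAdd table that decoder_sse_init() builds above: each set bit k of the index places 192 (0xC0, i.e. -64 as a signed byte) into byte k of the 64-bit entry, so adding an entry to eight decoded bytes removes the extra +64 from exactly the bytes that followed an '=' escape.

    // Recomputes one entry the same way the _X macro above does:
    static uint64_t eqAdd_reference(unsigned idx) {
        uint64_t v = 0;
        for(int k = 0; k < 8; k++)
            if(idx & (1 << k)) v |= 192ULL << (k*8);  // 192 == -64 as a signed byte
        return v;
    }
    // eqAdd_reference(0x05) == 0x0000000000C000C0  (bytes 0 and 2 hold -64)
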
@@ -26,13 +26,13 @@
  #endif

  #pragma pack(16)
- static struct {
+ typedef struct {
  unsigned char BitsSetTable256inv[256];
  /*align16*/ struct { char bytes[16]; } compact[32768];
- uint8_t eqFix[256];
  /*align8*/ uint64_t eqAdd[256];
  /*align16*/ int8_t unshufMask[32*16];
- } * HEDLEY_RESTRICT lookups;
+ } SSELookups;
+ static SSELookups* HEDLEY_RESTRICT lookups;
  #pragma pack()


@@ -45,27 +45,7 @@ static HEDLEY_ALWAYS_INLINE __m128i force_align_read_128(const void* p) {
  #endif
  }

-
- static void decoder_sse_init() {
- ALIGN_ALLOC(lookups, sizeof(*lookups), 16);
- for(int i=0; i<256; i++) {
- lookups->BitsSetTable256inv[i] = 8 - (
- (i & 1) + ((i>>1) & 1) + ((i>>2) & 1) + ((i>>3) & 1) + ((i>>4) & 1) + ((i>>5) & 1) + ((i>>6) & 1) + ((i>>7) & 1)
- );
-
- #define _X(n, k) ((((n) & (1<<k)) ? 192ULL : 0ULL) << (k*8))
- lookups->eqAdd[i] = _X(i, 0) | _X(i, 1) | _X(i, 2) | _X(i, 3) | _X(i, 4) | _X(i, 5) | _X(i, 6) | _X(i, 7);
- #undef _X
- }
- for(int i=0; i<32; i++) {
- for(int j=0; j<16; j++) {
- if(i >= 16) // only used for LZCNT
- lookups->unshufMask[i*16 + j] = ((31-i)>j ? -1 : 0);
- else // only used for BSR
- lookups->unshufMask[i*16 + j] = (i>j ? -1 : 0);
- }
- }
- }
+ void decoder_sse_init(SSELookups* HEDLEY_RESTRICT& lookups); // defined in decoder_sse2.cc


  // for LZCNT/BSR
@@ -145,6 +125,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* src, long& len, unsigned
  else
  lfCompare = _mm_insert_epi16(lfCompare, _nextMask == 1 ? 0x0a2e /*".\n"*/ : 0x2e0a /*"\n."*/, 0);
  }
+
+ decoder_set_nextMask<isRaw>(src, len, _nextMask); // set this before the loop because we can't check src after it's been overwritten
+
  intptr_t i;
  for(i = -len; i; i += sizeof(__m128i)*2) {
  __m128i oDataA = _mm_load_si128((__m128i *)(src+i));
@@ -383,6 +366,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* src, long& len, unsigned
  // terminator found
  // there's probably faster ways to do this, but reverting to scalar code should be good enough
  len += (long)i;
+ _nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
  break;
  }
  }
@@ -492,6 +476,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* src, long& len, unsigned

  if(endFound) {
  len += (long)i;
+ _nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
  break;
  }
  }
@@ -516,17 +501,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* src, long& len, unsigned
  dataB = _mm_add_epi8(oDataB, _mm_set1_epi8(-42));

  if(LIKELIHOOD(0.0001, (mask & ((maskEq << 1) + escFirst)) != 0)) {
- // resolve invalid sequences of = to deal with cases like '===='
- unsigned tmp = lookups->eqFix[(maskEq&0xff) & ~escFirst];
- uint32_t maskEq2 = tmp;
- for(int j=8; j<32; j+=8) {
- tmp = lookups->eqFix[((maskEq>>j)&0xff) & ~(tmp>>7)];
- maskEq2 |= tmp<<j;
- }
- maskEq = maskEq2;
-
+ maskEq = fix_eqMask<uint32_t>(maskEq & ~escFirst);
  mask &= ~escFirst;
- escFirst = (maskEq >> 31);
+ escFirst = maskEq >> 31;
  // next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed
  maskEq <<= 1;
  mask &= ~maskEq;
@@ -710,16 +687,5 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* src, long& len, unsigned
  }
  }
  _escFirst = (unsigned char)escFirst;
- if(isRaw) {
- if(len != 0) { // have to gone through at least one loop cycle
- if(src[i-2] == '\r' && src[i-1] == '\n' && src[i] == '.')
- _nextMask = 1;
- else if(src[i-1] == '\r' && src[i] == '\n' && src[i+1] == '.')
- _nextMask = 2;
- else
- _nextMask = 0;
- }
- } else
- _nextMask = 0;
  }
  #endif
@@ -4,11 +4,12 @@
  #include "decoder_common.h"
  #include "decoder_sse_base.h"
  void decoder_set_ssse3_funcs() {
- decoder_sse_init();
- decoder_init_lut(lookups->eqFix, lookups->compact);
+ decoder_sse_init(lookups);
+ decoder_init_lut(lookups->compact);
  _do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_SSSE3> >;
  _do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_SSSE3> >;
  _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_SSSE3> >;
+ _decode_isa = ISA_LEVEL_SSSE3;
  }
  #else
  void decoder_set_sse2_funcs();
@@ -12,21 +12,18 @@ const bool decoder_has_avx10 = false;
  # ifndef YENC_DISABLE_AVX256
  # include "decoder_avx2_base.h"
  void decoder_set_vbmi2_funcs() {
- ALIGN_ALLOC(lookups, sizeof(*lookups), 16);
- // TODO: consider removing compact LUT
- decoder_init_lut(lookups->eqFix, lookups->compact);
  _do_decode = &do_decode_simd<false, false, sizeof(__m256i)*2, do_decode_avx2<false, false, ISA_LEVEL_VBMI2> >;
  _do_decode_raw = &do_decode_simd<true, false, sizeof(__m256i)*2, do_decode_avx2<true, false, ISA_LEVEL_VBMI2> >;
  _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m256i)*2, do_decode_avx2<true, true, ISA_LEVEL_VBMI2> >;
+ _decode_isa = ISA_LEVEL_VBMI2;
  }
  # else
  # include "decoder_sse_base.h"
  void decoder_set_vbmi2_funcs() {
- decoder_sse_init();
- decoder_init_lut(lookups->eqFix, lookups->compact);
  _do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_VBMI2> >;
  _do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_VBMI2> >;
  _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_VBMI2> >;
+ _decode_isa = ISA_LEVEL_VBMI2;
  }
  # endif
  #else
package/src/encoder.cc CHANGED
@@ -2,6 +2,31 @@
  #include "encoder_common.h"
  #include "encoder.h"

+
+ // lookup tables for scalar processing
+ #define _B1(n) _B(n), _B(n+1), _B(n+2), _B(n+3)
+ #define _B2(n) _B1(n), _B1(n+4), _B1(n+8), _B1(n+12)
+ #define _B3(n) _B2(n), _B2(n+16), _B2(n+32), _B2(n+48)
+ #define _BX _B3(0), _B3(64), _B3(128), _B3(192)
+
+ const unsigned char escapeLUT[256] = { // whether or not the character is critical
+ #define _B(n) ((n == 214 || n == '\r'+214 || n == '\n'+214 || n == '='-42) ? 0 : (n+42) & 0xff)
+ _BX
+ #undef _B
+ };
+ const uint16_t escapedLUT[256] = { // escaped sequences for characters that need escaping
+ #define _B(n) ((n == 214 || n == 214+'\r' || n == 214+'\n' || n == '='-42 || n == 214+'\t' || n == 214+' ' || n == '.'-42) ? UINT16_PACK('=', ((n+42+64)&0xff)) : 0)
+ _BX
+ #undef _B
+ };
+
+ #undef _B1
+ #undef _B2
+ #undef _B3
+ #undef _BX
+
+
+
  size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, int doEnd) {
  unsigned char* es = (unsigned char*)src + len;
  unsigned char *p = dest; // destination pointer
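
Note: the macro-generated escapeLUT/escapedLUT tables added above are compact but hard to read. A rough runtime equivalent (illustration only; the real tables are compile-time constants, and UINT16_PACK is the project's helper for packing two bytes into a uint16_t):

    // escapeLUT[n]  : the encoded byte (n+42)&0xff, or 0 if that byte is critical
    //                 (NUL, CR, LF, '=') and must always be escaped.
    // escapedLUT[n] : the "=x" escape pair for bytes that may need escaping
    //                 (critical chars plus TAB, SPACE and '.', which only matter
    //                 at line boundaries), or 0 otherwise.
    static void build_luts_reference(unsigned char escapeRef[256], uint16_t escapedRef[256]) {
        for(int n = 0; n < 256; n++) {
            unsigned char enc = (n + 42) & 0xff;
            bool critical = (enc == 0 || enc == '\r' || enc == '\n' || enc == '=');
            escapeRef[n] = critical ? 0 : enc;
            bool escapable = critical || enc == '\t' || enc == ' ' || enc == '.';
            escapedRef[n] = escapable ? UINT16_PACK('=', (enc + 64) & 0xff) : 0;
        }
    }
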
@@ -122,6 +147,7 @@ size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HED

  extern "C" {
  size_t (*_do_encode)(int, int*, const unsigned char* HEDLEY_RESTRICT, unsigned char* HEDLEY_RESTRICT, size_t, int) = &do_encode_generic;
+ int _encode_isa = ISA_GENERIC;
  }

  void encoder_sse2_init();
@@ -139,12 +165,14 @@ void encoder_rvv_init();
  static inline void encoder_native_init() {
  _do_encode = &do_encode_simd< do_encode_avx2<ISA_NATIVE> >;
  encoder_avx2_lut<ISA_NATIVE>();
+ _encode_isa = ISA_NATIVE;
  }
  # else
  # include "encoder_sse_base.h"
  static inline void encoder_native_init() {
  _do_encode = &do_encode_simd< do_encode_sse<ISA_NATIVE> >;
  encoder_sse_lut<ISA_NATIVE>();
+ _encode_isa = ISA_NATIVE;
  }
  # endif
  #endif