yencode 1.1.2 → 1.1.4

This diff shows the changes between publicly released versions of the package, as published to its public registry, and is provided for informational purposes only.
@@ -0,0 +1,219 @@
+ #include "common.h"
+
+ #ifdef __riscv_vector
+ #include "encoder.h"
+ #include "encoder_common.h"
+
+ # include <riscv_vector.h>
+ # if defined(__clang__) && __clang_major__ < 16
+ # define RV(f) f
+ # else
+ # define RV(f) __riscv_##f
+ # endif
+
+
+ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RESTRICT _src, long& inpos, uint8_t*& outp, long& col, long lineSizeOffset) {
+ 	// TODO: vectorize
+ 	uint8_t c = _src[inpos++];
+ 	if(HEDLEY_UNLIKELY(escapedLUT[c] && c != '.'-42)) {
+ 		memcpy(outp, &escapedLUT[c], sizeof(uint16_t));
+ 		outp += 2;
+ 	} else {
+ 		*(outp++) = c + 42;
+ 	}
+
+ 	c = _src[inpos++];
+ 	if(LIKELIHOOD(0.0273, escapedLUT[c]!=0)) {
+ 		uint32_t w = UINT32_16_PACK(UINT16_PACK('\r', '\n'), (uint32_t)escapedLUT[c]);
+ 		memcpy(outp, &w, sizeof(w));
+ 		outp += 4;
+ 		col = lineSizeOffset + 2;
+ 	} else {
+ 		uint32_t w = UINT32_PACK('\r', '\n', (uint32_t)(c+42), 0);
+ 		memcpy(outp, &w, sizeof(w));
+ 		outp += 3;
+ 		col = lineSizeOffset + 1;
+ 	}
+ }
+
+
+ HEDLEY_ALWAYS_INLINE void do_encode_rvv(int line_size, int* colOffset, const uint8_t* HEDLEY_RESTRICT srcEnd, uint8_t* HEDLEY_RESTRICT& dest, size_t& len) {
+ 	size_t vl2 = RV(vsetvlmax_e8m2)(); // TODO: limit to line length
+ 	// TODO: have a LMUL=1 variant if line_size < vl
+
+ 	// offset position to enable simpler loop condition checking
+ 	const int INPUT_OFFSET = vl2*2 -1; // extra chars for EOL handling, -1 to change <= to <
+ 	if((intptr_t)len <= INPUT_OFFSET || line_size < (int)vl2*2) return;
+
+ 	uint8_t *outp = dest;
+ 	long inpos = -(long)len;
+ 	long lineSizeOffset = -line_size +1;
+ 	long col = *colOffset - line_size +1;
+
+ 	inpos += INPUT_OFFSET;
+ 	const uint8_t* _src = srcEnd - INPUT_OFFSET;
+
+ 	if (HEDLEY_LIKELY(col == -line_size+1)) {
+ 		uint8_t c = _src[inpos++];
+ 		if (LIKELIHOOD(0.0273, escapedLUT[c] != 0)) {
+ 			memcpy(outp, escapedLUT + c, 2);
+ 			outp += 2;
+ 			col += 2;
+ 		} else {
+ 			*(outp++) = c + 42;
+ 			col += 1;
+ 		}
+ 	}
+ 	if(HEDLEY_UNLIKELY(col >= 0)) {
+ 		if(col == 0)
+ 			encode_eol_handle_pre(_src, inpos, outp, col, lineSizeOffset);
+ 		else {
+ 			uint8_t c = _src[inpos++];
+ 			if(LIKELIHOOD(0.0273, escapedLUT[c]!=0)) {
+ 				uint32_t v = UINT32_16_PACK(UINT16_PACK('\r', '\n'), (uint32_t)escapedLUT[c]);
+ 				memcpy(outp, &v, sizeof(v));
+ 				outp += 4;
+ 				col = 2-line_size + 1;
+ 			} else {
+ 				uint32_t v = UINT32_PACK('\r', '\n', (uint32_t)(c+42), 0);
+ 				memcpy(outp, &v, sizeof(v));
+ 				outp += 3;
+ 				col = 2-line_size;
+ 			}
+ 		}
+ 	}
+
+ 	// vector constants
+ 	const vuint8mf2_t ALT_SHIFT = RV(vreinterpret_v_u16mf2_u8mf2)(RV(vmv_v_x_u16mf2)(4, vl2));
+ 	const uint8_t _MASK_EXPAND[] = {0xAA, 0xAB, 0xAE, 0xAF, 0xBA, 0xBB, 0xBE, 0xBF, 0xEA, 0xEB, 0xEE, 0xEF, 0xFA, 0xFB, 0xFE, 0xFF};
+ 	const vuint8m1_t MASK_EXPAND = RV(vle8_v_u8m1)(_MASK_EXPAND, 16);
+
+
+ 	// TODO: consider exploiting partial vector capability
+ 	while(inpos < 0) {
+ 		vuint8m2_t data = RV(vle8_v_u8m2)(_src + inpos, vl2);
+ 		inpos += vl2;
+
+ 		// search for special chars
+ 		// TODO: vrgather strat
+
+ 		vuint8m2_t tmpData = RV(vsub_vx_u8m2)(data, -42, vl2);
+ 		vbool4_t cmp = RV(vmor_mm_b4)(
+ 			RV(vmor_mm_b4)(
+ 				RV(vmseq_vx_u8m2_b4)(data, -42, vl2),
+ 				RV(vmseq_vx_u8m2_b4)(tmpData, '=', vl2),
+ 				vl2
+ 			),
+ 			RV(vmor_mm_b4)(
+ 				RV(vmseq_vx_u8m2_b4)(data, '\r'-42, vl2),
+ 				RV(vmseq_vx_u8m2_b4)(data, '\n'-42, vl2),
+ 				vl2
+ 			),
+ 			vl2
+ 		);
+
+ #if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 13000
+ 		data = RV(vor_vx_u8m2_mu)(cmp, tmpData, tmpData, 64, vl2);
+ #else
+ 		data = RV(vor_vx_u8m2_m)(cmp, tmpData, tmpData, 64, vl2);
+ #endif
+
+ 		int idx;
+ 		size_t count = RV(vcpop_m_b4)(cmp, vl2);
+ 		if(count > 1) {
+ 			// widen mask: 4b->8b
+ #if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 13000
+ 			vuint8mf4_t vcmp = RV(vlmul_trunc_v_u8m1_u8mf4)(RV(vreinterpret_v_b4_u8m1)(cmp));
+ #else
+ 			vuint8mf4_t vcmp = *(vuint8mf4_t*)(&cmp);
+ #endif
+ 			// TODO: use vwsll instead if available
+ 			// - is clmul useful here?
+ 			vuint8mf2_t xcmp = RV(vreinterpret_v_u16mf2_u8mf2)(RV(vwmulu_vx_u16mf2)(vcmp, 16, vl2));
+ 			xcmp = RV(vsrl_vv_u8mf2)(xcmp, ALT_SHIFT, vl2);
+
+ 			// expand mask by inserting '1' between each bit (0000abcd -> 1a1b1c1d)
+ 			vuint8m1_t xcmpTmp = RV(vrgather_vv_u8m1)(MASK_EXPAND, RV(vlmul_ext_v_u8mf2_u8m1)(xcmp), vl2);
+ #if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 13000
+ 			vbool2_t cmpmask = RV(vreinterpret_b2)(xcmpTmp);
+ #else
+ 			vbool2_t cmpmask = *(vbool2_t*)(&xcmpTmp);
+ #endif
+
+ 			// expand data and insert =
+ 			// TODO: use vwsll instead if available
+ 			vuint16m4_t data2 = RV(vzext_vf2_u16m4)(data, vl2);
+ 			data2 = RV(vsll_vx_u16m4)(data2, 8, vl2);
+ 			data2 = RV(vor_vx_u16m4)(data2, '=', vl2);
+
+ 			// prune unneeded =
+ 			vuint8m4_t dataTmp = RV(vreinterpret_v_u16m4_u8m4)(data2);
+ 			vuint8m4_t final_data = RV(vcompress_vm_u8m4)(
+ #if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 13000
+ 				dataTmp, cmpmask, vl2*2
+ #else
+ 				cmpmask, dataTmp, dataTmp, vl2*2
+ #endif
+ 			);
+
+ 			RV(vse8_v_u8m4)(outp, final_data, vl2*2);
+ 			outp += vl2 + count;
+ 			col += vl2 + count;
+
+ 			if(col >= 0) {
+ 				// we overflowed - find correct position to revert back to
+ 				// TODO: stick with u8 type for vlmax <= 2048 (need to check if ok if vlmax == 2048)
+ 				// - considering that it's rare for colWidth > 128, maybe just don't support vectors that long
+ 				vuint16m8_t xidx = RV(viota_m_u16m8)(cmpmask, vl2*2);
+ 				vbool2_t discardmask = RV(vmsgeu_vx_u16m8_b2)(xidx, vl2 + count - col, vl2*2);
+ 				long idx_revert = RV(vcpop_m_b2)(discardmask, vl2*2);
+
+ 				outp -= col + (idx_revert & 1);
+ 				inpos -= ((idx_revert+1) >> 1);
+
+ 				goto _encode_eol_handle_pre;
+ 			}
+ 		} else {
+ 			// 0 or 1 special characters
+ 			{
+ 				vbool4_t mask = RV(vmsbf_m_b4)(cmp, vl2);
+ 				// TODO: is it better to shuffle this into two stores, instead of three?
+ 				RV(vse8_v_u8m2_m)(mask, outp, data, vl2);
+ 				idx = RV(vcpop_m_b4)(mask, vl2);
+ 				outp[idx] = '=';
+ 				RV(vse8_v_u8m2_m)(RV(vmnot_m_b4)(mask, vl2), outp+1, data, vl2);
+
+ 				outp += vl2 + count;
+ 				col += vl2 + count;
+ 			}
+
+ 			if(col >= 0) {
+ 				if(count > 0) {
+ 					idx = vl2 - idx;
+ 					if(HEDLEY_UNLIKELY(col == idx)) {
+ 						// this is an escape character, so line will need to overflow
+ 						outp--;
+ 					} else {
+ 						inpos += (col > idx);
+ 					}
+ 				}
+ 				outp -= col;
+ 				inpos -= col;
+
+ 				_encode_eol_handle_pre:
+ 				encode_eol_handle_pre(_src, inpos, outp, col, lineSizeOffset);
+ 			}
+ 		}
+ 	}
+
+ 	*colOffset = col + line_size -1;
+ 	dest = outp;
+ 	len = -(inpos - INPUT_OFFSET);
+ }
+
+ void encoder_rvv_init() {
+ 	_do_encode = &do_encode_simd<do_encode_rvv>;
+ }
+ #else
+ void encoder_rvv_init() {}
+ #endif /* defined(__riscv_vector) */
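
For orientation, the transform this new RVV kernel vectorises is the core yEnc rule: add 42 to each byte modulo 256, and escape the results NUL, CR, LF and '=' (plus the line-edge cases handled via escapedLUT) by emitting '=' followed by the value plus a further 64. A minimal scalar sketch of that rule, with line wrapping omitted; yenc_encode_scalar_sketch is an illustrative name, not a function in the package:

// Editor's sketch (not part of the package): the scalar yEnc rule that
// do_encode_rvv vectorises. Escaped values get '=' plus the value+64; adding
// 64 matches the kernel's OR-with-64 because bit 6 is clear in 0, CR, LF and '='.
// Line wrapping and the line-edge escapes from escapedLUT are omitted here.
static size_t yenc_encode_scalar_sketch(const uint8_t* src, size_t len, uint8_t* out) {
	size_t o = 0;
	for (size_t i = 0; i < len; i++) {
		uint8_t c = (uint8_t)(src[i] + 42);
		if (c == 0 || c == '\r' || c == '\n' || c == '=') {
			out[o++] = '=';
			c = (uint8_t)(c + 64);
		}
		out[o++] = c;
	}
	return o;
}

In the vector path above, vsub_vx_u8m2(data, -42, vl2) performs the +42, the four vmseq compares build the escape mask, every lane is then pre-expanded to an ('=', byte) pair, and vcompress_vm_u8m4 drops the '=' halves that turn out not to be needed.
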
@@ -8,7 +8,7 @@
  # define _mm_mask_expand_epi8 _mm128_mask_expand_epi8
  #endif

- #if defined(__GNUC__) && __GNUC__ >= 7
+ #if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
  # define KLOAD16(a, offs) _load_mask16((__mmask16*)(a) + (offs))
  #else
  # define KLOAD16(a, offs) (((uint16_t*)(a))[(offs)])
@@ -155,7 +155,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
  if(len <= INPUT_OFFSET || line_size < XMM_SIZE) return;

  // slower CPUs prefer to branch as mispredict penalty is probably small relative to general execution
- #if defined(__tune_atom__) || defined(__tune_slm__) || defined(__tune_btver1__)
+ #if defined(__tune_atom__) || defined(__tune_slm__) || defined(__tune_btver1__) || defined(__tune_btver2__)
  const bool _PREFER_BRANCHING = true;
  #else
  const bool _PREFER_BRANCHING = (use_isa < ISA_LEVEL_SSSE3);
@@ -350,7 +350,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
  #if defined(__POPCNT__) && !defined(__tune_btver1__)
  if(use_isa & ISA_FEATURE_POPCNT) {
  shuf2Len = popcnt32(maskA) + 16;
- # if defined(__tune_znver3__) || defined(__tune_znver2__) || defined(__tune_znver1__) || defined(__tune_btver2__)
+ # if defined(__tune_znver4__) || defined(__tune_znver3__) || defined(__tune_znver2__) || defined(__tune_znver1__) || defined(__tune_btver2__)
  shuf1Len = popcnt32(m1) + 8;
  shuf3Len = popcnt32(m3) + shuf2Len + 8;
  # else
@@ -409,11 +409,11 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
  if(use_isa >= ISA_LEVEL_VBMI2)
  # endif
  {
- asm(
+ __asm__(
  "shrl $1, %[eqMask] \n"
  "shrl %%cl, %[eqMask] \n" // TODO: can use shrq to avoid above shift?
- # if defined(PLATFORM_AMD64)
- "adcq %[col], %[p] \n"
+ # if defined(PLATFORM_AMD64) && !defined(__ILP32__)
+ "adcq %q[col], %q[p] \n"
  # else
  "adcl %[col], %[p] \n"
  # endif
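
Two things change in this inline-assembly block: the asm keyword becomes __asm__, which stays valid when compiling in strict ISO mode, and the 64-bit adcq branch is skipped on the x32 ABI (__ILP32__), where pointers are 32-bit, with %q modifiers forcing 64-bit register names when it is used. The visible instructions fold a conditional pointer bump into an add-with-carry; a rough scalar reading (editor's sketch, assuming `shift` is whatever the surrounding code loads into %cl):

// Editor's sketch of the shrl/shrl/adc sequence above, not code from the package.
// After shifting eqMask right by 1 and then by `shift`, the carry flag holds bit
// `shift` of the original eqMask; adc then adds col plus that carry into p.
static uint8_t* adc_step_sketch(uint8_t* p, long col, uint32_t* eqMask, unsigned shift) {
	uint32_t m = *eqMask;
	p += col + ((m >> shift) & 1);
	*eqMask = (m >> 1) >> shift;
	return p;
}
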
@@ -484,7 +484,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
  dataB = _mm_ternarylogic_epi32(dataB, cmpB, _mm_set1_epi8(64), 0xf8);

  // store last char
- _mm_mask_storeu_epi8(p+XMM_SIZE+1, 1<<15, dataB);
+ p[XMM_SIZE*2] = _mm_extract_epi8(dataB, 15);

  uint32_t blendMask = (uint32_t)(-(int32_t)mask);
  dataB = _mm_mask_alignr_epi8(dataB, blendMask>>16, dataB, dataA, 15);
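
The replaced "store last char" line is behaviourally the same: with XMM_SIZE being 16, the old masked store wrote only lane 15 of dataB, which lands at p+17+15 = p+32, exactly where the new plain byte store puts the extracted lane. The rewrite just trades an AVX-512BW masked store for an SSE4.1 extract (editor's comparison, assuming XMM_SIZE == 16):

#include <immintrin.h>
#include <stdint.h>

// Editor's sketch: both forms write lane 15 of dataB to p+32.
static void store_last_char_old(uint8_t* p, __m128i dataB) {
	_mm_mask_storeu_epi8(p + 16 + 1, 1 << 15, dataB); // masked store: only element 15 enabled -> p+17+15
}
static void store_last_char_new(uint8_t* p, __m128i dataB) {
	p[32] = (uint8_t)_mm_extract_epi8(dataB, 15);     // plain extract + byte store
}
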
@@ -539,7 +539,6 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
  dataA = _mm_shuffle_epi8(dataA, shufMaskA);

  # if defined(__SSE4_1__) && !defined(__tune_slm__) && !defined(__tune_goldmont__) && !defined(__tune_goldmont_plus__) && !defined(__tune_tremont__)
- // unsure if worth on: Jaguar/Puma (3|2), Core2 (2|2)
  if(use_isa >= ISA_LEVEL_SSE41) {
  dataB = _mm_blendv_epi8(dataBShifted, dataB, mergeMaskB);
  } else
@@ -0,0 +1,30 @@
+ #include "common.h"
+
+ extern const bool encoder_has_avx10;
+ #if !defined(__EVEX512__) && (defined(__AVX10_1__) || defined(__EVEX256__)) && defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
+ const bool encoder_has_avx10 = true;
+ #else
+ const bool encoder_has_avx10 = false;
+ #endif
+
+ #if defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
+ # ifndef YENC_DISABLE_AVX256
+ # include "encoder_avx_base.h"
+
+ void encoder_vbmi2_init() {
+ 	_do_encode = &do_encode_simd< do_encode_avx2<ISA_LEVEL_VBMI2> >;
+ 	encoder_avx2_lut<ISA_LEVEL_VBMI2>();
+ }
+ # else
+ # include "encoder_sse_base.h"
+ void encoder_vbmi2_init() {
+ 	_do_encode = &do_encode_simd< do_encode_sse<ISA_LEVEL_VBMI2> >;
+ 	encoder_sse_lut<ISA_LEVEL_VBMI2>();
+ }
+ # endif
+ #else
+ void encoder_avx2_init();
+ void encoder_vbmi2_init() {
+ 	encoder_avx2_init();
+ }
+ #endif
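
This new translation unit follows the same pattern as encoder_rvv_init() in the first file: each *_init() function points the shared _do_encode hook at the best kernel that was compiled in, falling back to encoder_avx2_init() when the VBMI2 path isn't available, while encoder_has_avx10 appears to flag builds targeting 256-bit-only AVX10/EVEX so the runtime dispatcher can account for that. A hypothetical caller-side sketch (the detection helper and baseline initialiser below are placeholders, not symbols from the package):

// Editor's sketch of the init-function dispatch pattern; cpu_supports_avx512vbmi2()
// and encoder_scalar_init() are illustrative placeholders, not yencode symbols.
bool cpu_supports_avx512vbmi2(); // assumed CPUID-based feature check
void encoder_vbmi2_init();       // from the file above
void encoder_avx2_init();
void encoder_scalar_init();      // assumed baseline fallback

void select_encoder() {
	if (cpu_supports_avx512vbmi2())
		encoder_vbmi2_init();   // VBMI2 kernel (256-bit, or 128-bit if YENC_DISABLE_AVX256 was set at build time)
	else
		encoder_avx2_init();    // further runtime fallbacks would end at encoder_scalar_init()
}
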