yencode 1.1.3 → 1.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -215,7 +215,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
  // duplicate halves
  data1A = _mm256_inserti128_si256(dataA, _mm256_castsi256_si128(dataA), 1);
  data1B = _mm256_inserti128_si256(dataB, _mm256_castsi256_si128(dataB), 1);
- #if defined(__tune_znver2__) || defined(__tune_znver3__)
+ #if defined(__tune_znver2__) || defined(__tune_znver3__) || defined(__tune_znver4__)
  data2A = _mm256_permute2x128_si256(dataA, dataA, 0x11);
  data2B = _mm256_permute2x128_si256(dataB, dataB, 0x11);
  #else
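
Reader's note on the hunk above (not part of the package): the guarded lines duplicate the upper 128-bit lane of each AVX2 register with vperm2i128, and the change merely adds __tune_znver4__ so Zen 4 also takes that path; the #else alternative is not shown in this hunk, but presumably produces the same broadcast another way. A minimal sketch of two equivalent forms, assuming <immintrin.h> and AVX2 (helper names are illustrative, not from the package):

#include <immintrin.h>

// Duplicate the upper 128-bit lane into both halves via VPERM2I128
// (the form selected by the __tune_znver* macros above).
static inline __m256i dup_hi_permute(__m256i v) {
    return _mm256_permute2x128_si256(v, v, 0x11);
}

// Equivalent result via extract + insert; shown only for comparison.
static inline __m256i dup_hi_extract_insert(__m256i v) {
    __m128i hi = _mm256_extracti128_si256(v, 1);  // grab the upper lane
    return _mm256_inserti128_si256(v, hi, 0);     // overwrite the lower lane with it
}
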
@@ -290,7 +290,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
 
  #if defined(__GNUC__) && defined(PLATFORM_AMD64)
  if(use_isa >= ISA_LEVEL_VBMI2) {
- asm(
+ __asm__(
  "shrq $1, %[eqMask] \n"
  "shrq %%cl, %[eqMask] \n"
  "adcq %q[col], %q[p] \n"
@@ -334,28 +334,32 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
  if(use_isa >= ISA_LEVEL_AVX3) {
  # if defined(__AVX512VBMI2__)
  if(use_isa >= ISA_LEVEL_VBMI2) {
- _mm256_mask_storeu_epi8(p+1, 1UL<<31, dataA);
+ __m128i dataTop = _mm256_extracti128_si256(dataA, 1);
  dataA = _mm256_mask_expand_epi8(_mm256_set1_epi8('='), KNOT32(maskA), dataA);
  _mm256_storeu_si256((__m256i*)p, dataA);
+ p[32] = _mm_extract_epi8(dataTop, 15);
  p += outputBytesA;
 
- _mm256_mask_storeu_epi8(p+1, 1UL<<31, dataB);
+ dataTop = _mm256_extracti128_si256(dataB, 1);
  dataB = _mm256_mask_expand_epi8(_mm256_set1_epi8('='), KNOT32(maskB), dataB);
  _mm256_storeu_si256((__m256i*)p, dataB);
+ p[32] = _mm_extract_epi8(dataTop, 15);
  p += maskBitsB;
  } else
  # endif
  {
- _mm256_mask_storeu_epi8(p+1, 1UL<<31, dataA);
- dataA = _mm256_mask_alignr_epi8(dataA, (uint32_t)(-(int32_t)maskA), dataA, _mm256_permute4x64_epi64(dataA, _MM_SHUFFLE(1,0,3,2)), 15);
+ __m256i dataSwapped = _mm256_permute4x64_epi64(dataA, _MM_SHUFFLE(1,0,3,2));
+ dataA = _mm256_mask_alignr_epi8(dataA, (uint32_t)(-(int32_t)maskA), dataA, dataSwapped, 15);
  dataA = _mm256_ternarylogic_epi32(dataA, cmpA, _mm256_set1_epi8('='), 0xb8); // (data & ~cmp) | (cmp & '=')
  _mm256_storeu_si256((__m256i*)p, dataA);
+ p[32] = _mm_extract_epi8(_mm256_castsi256_si128(dataSwapped), 15);
  p += outputBytesA;
 
- _mm256_mask_storeu_epi8(p+1, 1UL<<31, dataB);
- dataB = _mm256_mask_alignr_epi8(dataB, (uint32_t)(-(int32_t)maskB), dataB, _mm256_permute4x64_epi64(dataB, _MM_SHUFFLE(1,0,3,2)), 15);
+ dataSwapped = _mm256_permute4x64_epi64(dataB, _MM_SHUFFLE(1,0,3,2));
+ dataB = _mm256_mask_alignr_epi8(dataB, (uint32_t)(-(int32_t)maskB), dataB, dataSwapped, 15);
  dataB = _mm256_ternarylogic_epi32(dataB, cmpB, _mm256_set1_epi8('='), 0xb8);
  _mm256_storeu_si256((__m256i*)p, dataB);
+ p[32] = _mm_extract_epi8(_mm256_castsi256_si128(dataSwapped), 15);
  p += maskBitsB;
  }
  } else
@@ -484,28 +488,32 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
  if(use_isa >= ISA_LEVEL_AVX3) {
  # if defined(__AVX512VBMI2__)
  if(use_isa >= ISA_LEVEL_VBMI2) {
- _mm256_mask_storeu_epi8(p+1, 1UL<<31, dataA);
+ __m128i dataTop = _mm256_extracti128_si256(dataA, 1);
  dataA = _mm256_mask_expand_epi8(_mm256_set1_epi8('='), KNOT32(maskA), dataA);
  _mm256_storeu_si256((__m256i*)p, dataA);
+ p[32] = _mm_extract_epi8(dataTop, 15);
  p += outputBytesA;
 
- _mm256_mask_storeu_epi8(p+1, 1UL<<31, dataB);
+ dataTop = _mm256_extracti128_si256(dataB, 1);
  dataB = _mm256_mask_expand_epi8(_mm256_set1_epi8('='), KNOT32(maskB), dataB);
  _mm256_storeu_si256((__m256i*)p, dataB);
+ p[32] = _mm_extract_epi8(dataTop, 15);
  p += maskBitsB;
  } else
  # endif
  {
- _mm256_mask_storeu_epi8(p+1, 1UL<<31, dataA);
- dataA = _mm256_mask_alignr_epi8(dataA, (uint32_t)(-(int32_t)maskA), dataA, _mm256_permute4x64_epi64(dataA, _MM_SHUFFLE(1,0,3,2)), 15);
+ __m256i dataSwapped = _mm256_permute4x64_epi64(dataA, _MM_SHUFFLE(1,0,3,2));
+ dataA = _mm256_mask_alignr_epi8(dataA, (uint32_t)(-(int32_t)maskA), dataA, dataSwapped, 15);
  dataA = _mm256_ternarylogic_epi32(dataA, cmpA, _mm256_set1_epi8('='), 0xb8); // (data & ~cmp) | (cmp & '=')
  _mm256_storeu_si256((__m256i*)p, dataA);
+ p[32] = _mm_extract_epi8(_mm256_castsi256_si128(dataSwapped), 15);
  p += outputBytesA;
 
- _mm256_mask_storeu_epi8(p+1, 1UL<<31, dataB);
- dataB = _mm256_mask_alignr_epi8(dataB, (uint32_t)(-(int32_t)maskB), dataB, _mm256_permute4x64_epi64(dataB, _MM_SHUFFLE(1,0,3,2)), 15);
+ dataSwapped = _mm256_permute4x64_epi64(dataB, _MM_SHUFFLE(1,0,3,2));
+ dataB = _mm256_mask_alignr_epi8(dataB, (uint32_t)(-(int32_t)maskB), dataB, dataSwapped, 15);
  dataB = _mm256_ternarylogic_epi32(dataB, cmpB, _mm256_set1_epi8('='), 0xb8);
  _mm256_storeu_si256((__m256i*)p, dataB);
+ p[32] = _mm_extract_epi8(_mm256_castsi256_si128(dataSwapped), 15);
  p += maskBitsB;
  }
  } else
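
Reader's note on the two hunks above (not part of the package): each removed _mm256_mask_storeu_epi8(p+1, 1UL<<31, data) wrote only byte 31 of the vector, i.e. a single byte at p[32]; the replacement extracts that same byte and stores it as a plain scalar instead of a masked store. A standalone sketch of the equivalence, assuming AVX-512BW/VL and <immintrin.h> (helper names are illustrative, not from the package):

#include <stdint.h>
#include <immintrin.h>

// Old form: masked store of lane 31 only, offset by one, so it writes p[1+31] = p[32].
static inline void write_byte32_masked(uint8_t* p, __m256i v) {
    _mm256_mask_storeu_epi8(p + 1, 1UL << 31, v);
}

// New form: pull byte 31 out of the upper 128-bit half and store it as a scalar.
static inline void write_byte32_extract(uint8_t* p, __m256i v) {
    __m128i hi = _mm256_extracti128_si256(v, 1);
    p[32] = (uint8_t)_mm_extract_epi8(hi, 15);
}

The same substitution appears later in the SSE encoder, where p[XMM_SIZE*2] = _mm_extract_epi8(dataB, 15) replaces a masked store at p+XMM_SIZE+1.
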
@@ -15,6 +15,43 @@ static HEDLEY_ALWAYS_INLINE void vst1q_u8_x2_unaligned(uint8_t* p, uint8x16x2_t
  #endif
 
 
+ // ARM's CLZ instruction at native bit-width
+ #ifdef __aarch64__
+ static HEDLEY_ALWAYS_INLINE int clz_n(uint64_t v) {
+ # ifdef _MSC_VER
+ long r;
+ // does this work?
+ if(_BitScanReverse64((unsigned long*)&r, v))
+ r ^= 63;
+ else
+ r = 64;
+ return r;
+ # else
+ # if defined(__clang__) || HEDLEY_GCC_VERSION_CHECK(11,0,0)
+ // this pattern is only detected on GCC >= 11 (Clang 9 seems to as well, unsure about earlier versions)
+ // - note: return type must be 'int'; GCC fails to optimise this if type is 'long'
+ // GCC <= 10 doesn't optimize around the '0 = undefined behaviour', so not needed there
+ if(v == 0) return 64;
+ # endif
+ return __builtin_clzll(v);
+ # endif
+ }
+ #else
+ static HEDLEY_ALWAYS_INLINE int clz_n(uint32_t v) {
+ # ifdef __GNUC__
+ # if defined(__clang__) || HEDLEY_GCC_VERSION_CHECK(7,0,0)
+ // as with AArch64 version above, only insert this check if compiler can optimise it away
+ if(v == 0) return 32;
+ # endif
+ return __builtin_clz(v);
+ # elif defined(_MSC_VER)
+ return _arm_clz(v);
+ # else
+ return __clz(v); // ARM compiler?
+ # endif
+ }
+ #endif
+
  static uint8x16_t ALIGN_TO(16, shufLUT[256]);
  static uint16_t expandLUT[256];
 
@@ -195,26 +232,7 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
  col = shufTotalLen+1 + lineSizeOffset-32;
  } else {
  // shuffle stuff up
- #ifdef __aarch64__
- # ifdef _MSC_VER
- long bitIndex;
- if(_BitScanReverse64((unsigned long*)&bitIndex, mask))
- bitIndex ^= 63;
- else
- bitIndex = 64;
- # else
- long bitIndex = __builtin_clzll(mask);
- # endif
- #else
- # ifdef __GNUC__
- long bitIndex = __builtin_clz(mask); // TODO: is the 'undefined if 0' case problematic here?
- # elif defined(_MSC_VER)
- long bitIndex = _arm_clz(mask);
- # else
- long bitIndex = __clz(mask); // ARM compiler?
- # endif
- #endif
-
+ long bitIndex = clz_n(mask);
  uint8x16_t vClz = vdupq_n_u8(bitIndex & ~(sizeof(mask)*8));
  #ifdef __aarch64__
  uint8x16_t blendA = vcgtq_u8(vmakeq_u8(63,62,61,60,51,50,49,48,47,46,45,44,35,34,33,32), vClz);
@@ -450,26 +468,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const ui
  }
  } else {
  {
- #ifdef __aarch64__
- # ifdef _MSC_VER
- // does this work?
- if(_BitScanReverse64((unsigned long*)&bitIndex, mask))
- bitIndex ^= 63;
- else
- bitIndex = 64;
- # else
- bitIndex = __builtin_clzll(mask); // TODO: is the 'undefined if 0' case problematic here?
- # endif
- #else
- # ifdef __GNUC__
- bitIndex = __builtin_clz(mask);
- # elif defined(_MSC_VER)
- bitIndex = _arm_clz(mask);
- # else
- bitIndex = __clz(mask); // ARM compiler?
- # endif
- #endif
-
+ bitIndex = clz_n(mask);
  uint8x16_t vClz = vdupq_n_u8(bitIndex & ~(sizeof(mask)*8));
  #ifdef __aarch64__
  uint8x16_t blendA = vcgeq_u8(vmakeq_u8(63,62,61,60,51,50,49,48,47,46,45,44,35,34,33,32), vClz);
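
Reader's note (not part of the package): the two hunks above drop the open-coded leading-zero-count blocks in favour of the new clz_n() helper added earlier in the file, which also defines the previously-undefined zero input. A scalar reference for the 32-bit semantics, for orientation only (the real helper maps to CLZ or _BitScanReverse):

#include <stdint.h>

// Reference semantics for clz_n on a 32-bit mask: count of leading zero bits,
// with a zero input defined to return the full bit width (32).
static inline int clz32_ref(uint32_t v) {
    if (v == 0) return 32;
    int n = 0;
    while (!(v & 0x80000000u)) { n++; v <<= 1; }
    return n;
}
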
@@ -521,6 +520,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const ui
 
  void encoder_neon_init() {
  _do_encode = &do_encode_simd<do_encode_neon>;
+ _encode_isa = ISA_LEVEL_NEON;
  // generate shuf LUT
  for(int i=0; i<256; i++) {
  int k = i;
@@ -0,0 +1,220 @@
+ #include "common.h"
+
+ #ifdef __riscv_vector
+ #include "encoder.h"
+ #include "encoder_common.h"
+
+ # include <riscv_vector.h>
+ # if defined(__clang__) && __clang_major__ < 16
+ # define RV(f) f
+ # else
+ # define RV(f) __riscv_##f
+ # endif
+
+
+ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RESTRICT _src, long& inpos, uint8_t*& outp, long& col, long lineSizeOffset) {
+ // TODO: vectorize
+ uint8_t c = _src[inpos++];
+ if(HEDLEY_UNLIKELY(escapedLUT[c] && c != '.'-42)) {
+ memcpy(outp, &escapedLUT[c], sizeof(uint16_t));
+ outp += 2;
+ } else {
+ *(outp++) = c + 42;
+ }
+
+ c = _src[inpos++];
+ if(LIKELIHOOD(0.0273, escapedLUT[c]!=0)) {
+ uint32_t w = UINT32_16_PACK(UINT16_PACK('\r', '\n'), (uint32_t)escapedLUT[c]);
+ memcpy(outp, &w, sizeof(w));
+ outp += 4;
+ col = lineSizeOffset + 2;
+ } else {
+ uint32_t w = UINT32_PACK('\r', '\n', (uint32_t)(c+42), 0);
+ memcpy(outp, &w, sizeof(w));
+ outp += 3;
+ col = lineSizeOffset + 1;
+ }
+ }
+
+
+ HEDLEY_ALWAYS_INLINE void do_encode_rvv(int line_size, int* colOffset, const uint8_t* HEDLEY_RESTRICT srcEnd, uint8_t* HEDLEY_RESTRICT& dest, size_t& len) {
+ size_t vl2 = RV(vsetvlmax_e8m2)(); // TODO: limit to line length
+ // TODO: have a LMUL=1 variant if line_size < vl
+
+ // offset position to enable simpler loop condition checking
+ const int INPUT_OFFSET = vl2*2 -1; // extra chars for EOL handling, -1 to change <= to <
+ if((intptr_t)len <= INPUT_OFFSET || line_size < (int)vl2*2) return;
+
+ uint8_t *outp = dest;
+ long inpos = -(long)len;
+ long lineSizeOffset = -line_size +1;
+ long col = *colOffset - line_size +1;
+
+ inpos += INPUT_OFFSET;
+ const uint8_t* _src = srcEnd - INPUT_OFFSET;
+
+ if (HEDLEY_LIKELY(col == -line_size+1)) {
+ uint8_t c = _src[inpos++];
+ if (LIKELIHOOD(0.0273, escapedLUT[c] != 0)) {
+ memcpy(outp, escapedLUT + c, 2);
+ outp += 2;
+ col += 2;
+ } else {
+ *(outp++) = c + 42;
+ col += 1;
+ }
+ }
+ if(HEDLEY_UNLIKELY(col >= 0)) {
+ if(col == 0)
+ encode_eol_handle_pre(_src, inpos, outp, col, lineSizeOffset);
+ else {
+ uint8_t c = _src[inpos++];
+ if(LIKELIHOOD(0.0273, escapedLUT[c]!=0)) {
+ uint32_t v = UINT32_16_PACK(UINT16_PACK('\r', '\n'), (uint32_t)escapedLUT[c]);
+ memcpy(outp, &v, sizeof(v));
+ outp += 4;
+ col = 2-line_size + 1;
+ } else {
+ uint32_t v = UINT32_PACK('\r', '\n', (uint32_t)(c+42), 0);
+ memcpy(outp, &v, sizeof(v));
+ outp += 3;
+ col = 2-line_size;
+ }
+ }
+ }
+
+ // vector constants
+ const vuint8mf2_t ALT_SHIFT = RV(vreinterpret_v_u16mf2_u8mf2)(RV(vmv_v_x_u16mf2)(4, vl2));
+ const uint8_t _MASK_EXPAND[] = {0xAA, 0xAB, 0xAE, 0xAF, 0xBA, 0xBB, 0xBE, 0xBF, 0xEA, 0xEB, 0xEE, 0xEF, 0xFA, 0xFB, 0xFE, 0xFF};
+ const vuint8m1_t MASK_EXPAND = RV(vle8_v_u8m1)(_MASK_EXPAND, 16);
+
+
+ // TODO: consider exploiting partial vector capability
+ while(inpos < 0) {
+ vuint8m2_t data = RV(vle8_v_u8m2)(_src + inpos, vl2);
+ inpos += vl2;
+
+ // search for special chars
+ // TODO: vrgather strat
+
+ vuint8m2_t tmpData = RV(vsub_vx_u8m2)(data, -42, vl2);
+ vbool4_t cmp = RV(vmor_mm_b4)(
+ RV(vmor_mm_b4)(
+ RV(vmseq_vx_u8m2_b4)(data, -42, vl2),
+ RV(vmseq_vx_u8m2_b4)(tmpData, '=', vl2),
+ vl2
+ ),
+ RV(vmor_mm_b4)(
+ RV(vmseq_vx_u8m2_b4)(data, '\r'-42, vl2),
+ RV(vmseq_vx_u8m2_b4)(data, '\n'-42, vl2),
+ vl2
+ ),
+ vl2
+ );
+
+ #if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 13000
+ data = RV(vor_vx_u8m2_mu)(cmp, tmpData, tmpData, 64, vl2);
+ #else
+ data = RV(vor_vx_u8m2_m)(cmp, tmpData, tmpData, 64, vl2);
+ #endif
+
+ int idx;
+ size_t count = RV(vcpop_m_b4)(cmp, vl2);
+ if(count > 1) {
+ // widen mask: 4b->8b
+ #if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 13000
+ vuint8mf4_t vcmp = RV(vlmul_trunc_v_u8m1_u8mf4)(RV(vreinterpret_v_b4_u8m1)(cmp));
+ #else
+ vuint8mf4_t vcmp = *(vuint8mf4_t*)(&cmp);
+ #endif
+ // TODO: use vwsll instead if available
+ // - is clmul useful here?
+ vuint8mf2_t xcmp = RV(vreinterpret_v_u16mf2_u8mf2)(RV(vwmulu_vx_u16mf2)(vcmp, 16, vl2));
+ xcmp = RV(vsrl_vv_u8mf2)(xcmp, ALT_SHIFT, vl2);
+
+ // expand mask by inserting '1' between each bit (0000abcd -> 1a1b1c1d)
+ vuint8m1_t xcmpTmp = RV(vrgather_vv_u8m1)(MASK_EXPAND, RV(vlmul_ext_v_u8mf2_u8m1)(xcmp), vl2);
+ #if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 13000
+ vbool2_t cmpmask = RV(vreinterpret_b2)(xcmpTmp);
+ #else
+ vbool2_t cmpmask = *(vbool2_t*)(&xcmpTmp);
+ #endif
+
+ // expand data and insert =
+ // TODO: use vwsll instead if available
+ vuint16m4_t data2 = RV(vzext_vf2_u16m4)(data, vl2);
+ data2 = RV(vsll_vx_u16m4)(data2, 8, vl2);
+ data2 = RV(vor_vx_u16m4)(data2, '=', vl2);
+
+ // prune unneeded =
+ vuint8m4_t dataTmp = RV(vreinterpret_v_u16m4_u8m4)(data2);
+ vuint8m4_t final_data = RV(vcompress_vm_u8m4)(
+ #if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 13000
+ dataTmp, cmpmask, vl2*2
+ #else
+ cmpmask, dataTmp, dataTmp, vl2*2
+ #endif
+ );
+
+ RV(vse8_v_u8m4)(outp, final_data, vl2*2);
+ outp += vl2 + count;
+ col += vl2 + count;
+
+ if(col >= 0) {
+ // we overflowed - find correct position to revert back to
+ // TODO: stick with u8 type for vlmax <= 2048 (need to check if ok if vlmax == 2048)
+ // - considering that it's rare for colWidth > 128, maybe just don't support vectors that long
+ vuint16m8_t xidx = RV(viota_m_u16m8)(cmpmask, vl2*2);
+ vbool2_t discardmask = RV(vmsgeu_vx_u16m8_b2)(xidx, vl2 + count - col, vl2*2);
+ long idx_revert = RV(vcpop_m_b2)(discardmask, vl2*2);
+
+ outp -= col + (idx_revert & 1);
+ inpos -= ((idx_revert+1) >> 1);
+
+ goto _encode_eol_handle_pre;
+ }
+ } else {
+ // 0 or 1 special characters
+ {
+ vbool4_t mask = RV(vmsbf_m_b4)(cmp, vl2);
+ // TODO: is it better to shuffle this into two stores, instead of three?
+ RV(vse8_v_u8m2_m)(mask, outp, data, vl2);
+ idx = RV(vcpop_m_b4)(mask, vl2);
+ outp[idx] = '=';
+ RV(vse8_v_u8m2_m)(RV(vmnot_m_b4)(mask, vl2), outp+1, data, vl2);
+
+ outp += vl2 + count;
+ col += vl2 + count;
+ }
+
+ if(col >= 0) {
+ if(count > 0) {
+ idx = vl2 - idx;
+ if(HEDLEY_UNLIKELY(col == idx)) {
+ // this is an escape character, so line will need to overflow
+ outp--;
+ } else {
+ inpos += (col > idx);
+ }
+ }
+ outp -= col;
+ inpos -= col;
+
+ _encode_eol_handle_pre:
+ encode_eol_handle_pre(_src, inpos, outp, col, lineSizeOffset);
+ }
+ }
+ }
+
+ *colOffset = col + line_size -1;
+ dest = outp;
+ len = -(inpos - INPUT_OFFSET);
+ }
+
+ void encoder_rvv_init() {
+ _do_encode = &do_encode_simd<do_encode_rvv>;
+ _encode_isa = ISA_LEVEL_RVV;
+ }
+ #else
+ void encoder_rvv_init() {}
+ #endif /* defined(__riscv_vector) */
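
Reader's note on the new RISC-V Vector encoder above (not part of the package): the least obvious piece is the _MASK_EXPAND gather table. Each 4-bit chunk abcd of the escape mask is expanded to the byte 1a1b1c1d; after the data bytes are widened to 16 bits with '=' in the low byte, that expanded mask feeds vcompress so every data byte survives and an '=' byte survives only where its escape bit is set. A scalar sketch reproducing the table, following the comment in the source (the helper name is made up):

#include <stdint.h>
#include <stdio.h>

// Nibble abcd -> byte 1a1b1c1d: bit 2*i carries escape bit i (keep the '='
// byte only when escaping); bit 2*i+1 is always set (data byte always kept).
static uint8_t expand_nibble(unsigned n) {
    uint8_t out = 0;
    for (int i = 0; i < 4; i++)
        out |= (uint8_t)((((n >> i) & 1u) | 2u) << (2 * i));
    return out;
}

int main(void) {
    for (unsigned n = 0; n < 16; n++)
        printf("0x%02X%s", expand_nibble(n), n == 15 ? "\n" : ", ");
    // prints the 16 _MASK_EXPAND values: 0xAA, 0xAB, 0xAE, 0xAF, ..., 0xFE, 0xFF
    return 0;
}
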
@@ -6,6 +6,7 @@
  void encoder_sse2_init() {
  _do_encode = &do_encode_simd< do_encode_sse<ISA_LEVEL_SSE2> >;
  encoder_sse_lut<ISA_LEVEL_SSE2>();
+ _encode_isa = ISA_LEVEL_SSE2;
  }
  #else
  void encoder_sse2_init() {}
@@ -350,7 +350,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
  #if defined(__POPCNT__) && !defined(__tune_btver1__)
  if(use_isa & ISA_FEATURE_POPCNT) {
  shuf2Len = popcnt32(maskA) + 16;
- # if defined(__tune_znver3__) || defined(__tune_znver2__) || defined(__tune_znver1__) || defined(__tune_btver2__)
+ # if defined(__tune_znver4__) || defined(__tune_znver3__) || defined(__tune_znver2__) || defined(__tune_znver1__) || defined(__tune_btver2__)
  shuf1Len = popcnt32(m1) + 8;
  shuf3Len = popcnt32(m3) + shuf2Len + 8;
  # else
@@ -409,7 +409,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
  if(use_isa >= ISA_LEVEL_VBMI2)
  # endif
  {
- asm(
+ __asm__(
  "shrl $1, %[eqMask] \n"
  "shrl %%cl, %[eqMask] \n" // TODO: can use shrq to avoid above shift?
  # if defined(PLATFORM_AMD64) && !defined(__ILP32__)
@@ -484,7 +484,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
  dataB = _mm_ternarylogic_epi32(dataB, cmpB, _mm_set1_epi8(64), 0xf8);
 
  // store last char
- _mm_mask_storeu_epi8(p+XMM_SIZE+1, 1<<15, dataB);
+ p[XMM_SIZE*2] = _mm_extract_epi8(dataB, 15);
 
  uint32_t blendMask = (uint32_t)(-(int32_t)mask);
  dataB = _mm_mask_alignr_epi8(dataB, blendMask>>16, dataB, dataA, 15);
@@ -8,6 +8,7 @@
  void encoder_ssse3_init() {
  _do_encode = &do_encode_simd< do_encode_sse<ISA_LEVEL_SSSE3> >;
  encoder_sse_lut<ISA_LEVEL_SSSE3>();
+ _encode_isa = ISA_LEVEL_SSSE3;
  }
  #else
  void encoder_sse2_init();
@@ -1,5 +1,12 @@
  #include "common.h"
 
+ extern const bool encoder_has_avx10;
+ #if !defined(__EVEX512__) && (defined(__AVX10_1__) || defined(__EVEX256__)) && defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
+ const bool encoder_has_avx10 = true;
+ #else
+ const bool encoder_has_avx10 = false;
+ #endif
+
  #if defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
  # ifndef YENC_DISABLE_AVX256
  # include "encoder_avx_base.h"
@@ -7,12 +14,14 @@
  void encoder_vbmi2_init() {
  _do_encode = &do_encode_simd< do_encode_avx2<ISA_LEVEL_VBMI2> >;
  encoder_avx2_lut<ISA_LEVEL_VBMI2>();
+ _encode_isa = ISA_LEVEL_VBMI2;
  }
  # else
  # include "encoder_sse_base.h"
  void encoder_vbmi2_init() {
  _do_encode = &do_encode_simd< do_encode_sse<ISA_LEVEL_VBMI2> >;
  encoder_sse_lut<ISA_LEVEL_VBMI2>();
+ _encode_isa = ISA_LEVEL_VBMI2;
  }
  # endif
  #else