yencode 1.1.2 → 1.1.4

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -6,7 +6,7 @@
 
 // state var: refers to the previous state - only used for incremental processing
 template<bool isRaw>
- size_t do_decode_noend_scalar(const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, YencDecoderState* state) {
+ size_t do_decode_noend_scalar(const unsigned char* src, unsigned char* dest, size_t len, YencDecoderState* state) {
 const unsigned char *es = src + len; // end source pointer
 unsigned char *p = dest; // destination pointer
 long i = -(long)len; // input position
@@ -140,7 +140,7 @@ size_t do_decode_noend_scalar(const unsigned char* HEDLEY_RESTRICT src, unsigned
 }
 
 template<bool isRaw>
- YencDecoderEnd do_decode_end_scalar(const unsigned char* HEDLEY_RESTRICT* src, unsigned char* HEDLEY_RESTRICT* dest, size_t len, YencDecoderState* state) {
+ YencDecoderEnd do_decode_end_scalar(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
 const unsigned char *es = (*src) + len; // end source pointer
 unsigned char *p = *dest; // destination pointer
 long i = -(long)len; // input position
@@ -321,7 +321,7 @@ YencDecoderEnd do_decode_end_scalar(const unsigned char* HEDLEY_RESTRICT* src, u
 }
 
 template<bool isRaw, bool searchEnd>
- YencDecoderEnd do_decode_scalar(const unsigned char* HEDLEY_RESTRICT* src, unsigned char* HEDLEY_RESTRICT* dest, size_t len, YencDecoderState* state) {
+ YencDecoderEnd do_decode_scalar(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
 if(searchEnd)
 return do_decode_end_scalar<isRaw>(src, dest, len, state);
 *dest += do_decode_noend_scalar<isRaw>(*src, *dest, len, state);
@@ -331,8 +331,8 @@ YencDecoderEnd do_decode_scalar(const unsigned char* HEDLEY_RESTRICT* src, unsig
 
 
 
- template<bool isRaw, bool searchEnd, int width, void(&kernel)(const uint8_t* HEDLEY_RESTRICT, long&, unsigned char* HEDLEY_RESTRICT &, unsigned char&, uint16_t&)>
- YencDecoderEnd do_decode_simd(const unsigned char* HEDLEY_RESTRICT* src, unsigned char* HEDLEY_RESTRICT* dest, size_t len, YencDecoderState* state) {
+ template<bool isRaw, bool searchEnd, int width, void(&kernel)(const uint8_t*, long&, unsigned char*&, unsigned char&, uint16_t&)>
+ YencDecoderEnd do_decode_simd(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
 if(len <= width*2) return do_decode_scalar<isRaw, searchEnd>(src, dest, len, state);
 
 YencDecoderState tState = YDEC_STATE_CRLF;
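do_decode_simd above takes its per-block kernel as a reference-to-function non-type template parameter, so each ISA-specific kernel gets its own instantiation and the inner call is direct rather than through a function pointer. A minimal sketch of the same shape, with hypothetical names (process_block, run_blocks) and a toy transform standing in for the real yEnc kernels:

    #include <stddef.h>
    #include <stdint.h>

    // Hypothetical kernel with the same flavour of signature as the real NEON/SSE kernels:
    // it consumes one fixed-size block and advances the output pointer by reference.
    inline void process_block(const uint8_t* src, long& pos, unsigned char*& out) {
        for(long j = 0; j < 16; j++) *out++ = (unsigned char)(src[pos + j] - 42); // toy transform only
        pos += 16;
    }

    // The kernel is a function-reference template parameter, mirroring do_decode_simd's
    // void(&kernel)(...) parameter, so the call inside the loop is direct.
    template<int width, void(&kernel)(const uint8_t*, long&, unsigned char*&)>
    size_t run_blocks(const uint8_t* src, unsigned char* dest, size_t len) {
        unsigned char* p = dest;
        long pos = 0;
        while(len - (size_t)pos >= (size_t)width)
            kernel(src, pos, p);
        return (size_t)(p - dest);
    }

    // usage: size_t produced = run_blocks<16, process_block>(input, output, inputLen);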
@@ -19,14 +19,14 @@
 #endif
 
 
- // for compilers that lack these functions
- #if defined(__clang__) || (defined(__GNUC__) && (defined(__aarch64__) && __GNUC__ >= 8))
+ // for compilers that lack these functions (Clang armv7 9-12 seems to have issues with multi-vector loads)
+ #if (defined(__clang__) && (defined(__aarch64__) || __clang_major__<9 || __clang_major__>12)) || (defined(__GNUC__) && (defined(__aarch64__) && __GNUC__ >= 8))
 # define vld1q_u8_x2_align(p, n) vld1q_u8_x2((uint8_t*)__builtin_assume_aligned(p, n))
 #else
 # define vld1q_u8_x2_align(p, n) vcreate2_u8(vld1q_u8_align(p, (n)/2), vld1q_u8_align((p)+16, (n)/2))
 #endif
 // Clang wrongly assumes alignment on vld1q_u8_x2, and ARMv7 GCC doesn't support the function, so effectively, it can only be used in ARMv8 compilers
- #if defined(__aarch64__) && (defined(__clang__) || (defined(__GNUC__) && __GNUC__ >= 9))
+ #if defined(__aarch64__) && (defined(__clang__) || HEDLEY_GCC_VERSION_CHECK(8,5,0))
 # define vst1q_u8_x2_unaligned vst1q_u8_x2
 #else
 static HEDLEY_ALWAYS_INLINE void vst1q_u8_x2_unaligned(uint8_t* p, uint8x16x2_t data) {
@@ -59,7 +59,7 @@ static bool neon_vect_is_nonzero(uint8x16_t v) {
 
 
 template<bool isRaw, bool searchEnd>
- HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, long& len, unsigned char* HEDLEY_RESTRICT & p, unsigned char& escFirst, uint16_t& nextMask) {
+ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned char*& p, unsigned char& escFirst, uint16_t& nextMask) {
 HEDLEY_ASSUME(escFirst == 0 || escFirst == 1);
 HEDLEY_ASSUME(nextMask == 0 || nextMask == 1 || nextMask == 2);
 uint8x16_t yencOffset = escFirst ? vmakeq_u8(42+64,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42) : vdupq_n_u8(42);
@@ -10,9 +10,9 @@ static struct { char bytes[16]; } ALIGN_TO(16, compactLUT[32768]);
 static uint8_t eqFixLUT[256];
 
 
-
- #if !defined(__clang__) && !defined(_MSC_VER) && (!defined(__aarch64__) || !HEDLEY_GCC_VERSION_CHECK(10,0,0))
- static HEDLEY_ALWAYS_INLINE uint8x16x4_t vld1q_u8_x4(const uint8_t* p) {
+ // AArch64 GCC lacks these functions until 8.5, 9.4 and 10.1 (10.0 unknown)
+ #if !defined(__clang__) && !defined(_MSC_VER) && (!defined(__aarch64__) || !(HEDLEY_GCC_VERSION_CHECK(9,4,0) || (!HEDLEY_GCC_VERSION_CHECK(9,0,0) && HEDLEY_GCC_VERSION_CHECK(8,5,0))))
+ static HEDLEY_ALWAYS_INLINE uint8x16x4_t _vld1q_u8_x4(const uint8_t* p) {
 uint8x16x4_t ret;
 ret.val[0] = vld1q_u8(p);
 ret.val[1] = vld1q_u8(p+16);
@@ -20,12 +20,15 @@ static HEDLEY_ALWAYS_INLINE uint8x16x4_t vld1q_u8_x4(const uint8_t* p) {
 ret.val[3] = vld1q_u8(p+48);
 return ret;
 }
- static HEDLEY_ALWAYS_INLINE void vst1q_u8_x4(uint8_t* p, uint8x16x4_t data) {
+ static HEDLEY_ALWAYS_INLINE void _vst1q_u8_x4(uint8_t* p, uint8x16x4_t data) {
 vst1q_u8(p, data.val[0]);
 vst1q_u8(p+16, data.val[1]);
 vst1q_u8(p+32, data.val[2]);
 vst1q_u8(p+48, data.val[3]);
 }
+ #else
+ # define _vld1q_u8_x4 vld1q_u8_x4
+ # define _vst1q_u8_x4 vst1q_u8_x4
 #endif
 
 
@@ -44,7 +47,7 @@ static HEDLEY_ALWAYS_INLINE uint8x16_t mergeCompares(uint8x16_t a, uint8x16_t b,
 
 
 template<bool isRaw, bool searchEnd>
- HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, long& len, unsigned char* HEDLEY_RESTRICT & p, unsigned char& escFirst, uint16_t& nextMask) {
+ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned char*& p, unsigned char& escFirst, uint16_t& nextMask) {
 HEDLEY_ASSUME(escFirst == 0 || escFirst == 1);
 HEDLEY_ASSUME(nextMask == 0 || nextMask == 1 || nextMask == 2);
 uint8x16_t nextMaskMix = vdupq_n_u8(0);
@@ -55,7 +58,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 uint8x16_t yencOffset = escFirst ? vmakeq_u8(42+64,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42) : vdupq_n_u8(42);
 long i;
 for(i = -len; i; i += sizeof(uint8x16_t)*4) {
- uint8x16x4_t data = vld1q_u8_x4(src+i);
+ uint8x16x4_t data = _vld1q_u8_x4(src+i);
 uint8x16_t dataA = data.val[0];
 uint8x16_t dataB = data.val[1];
 uint8x16_t dataC = data.val[2];
@@ -421,7 +424,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 dataB = vsubq_u8(dataB, vdupq_n_u8(42));
 dataC = vsubq_u8(dataC, vdupq_n_u8(42));
 dataD = vsubq_u8(dataD, vdupq_n_u8(42));
- vst1q_u8_x4(p, vcreate4_u8(dataA, dataB, dataC, dataD));
+ _vst1q_u8_x4(p, vcreate4_u8(dataA, dataB, dataC, dataD));
 p += sizeof(uint8x16_t)*4;
 escFirst = 0;
 yencOffset = vdupq_n_u8(42);
@@ -7,8 +7,15 @@
 # define _mm_shrdi_epi16 _mm128_shrdi_epi16
 #endif
 
+ #if defined(__tune_icelake_client__) || defined(__tune_icelake_server__) || defined(__tune_tigerlake__) || defined(__tune_rocketlake__) || defined(__tune_alderlake__) || defined(__tune_sapphirerapids__)
+ # define COMPRESS_STORE _mm_mask_compressstoreu_epi8
+ #else
+ // avoid uCode on Zen4
+ # define COMPRESS_STORE(dst, mask, vec) _mm_storeu_si128((__m128i*)(dst), _mm_maskz_compress_epi8(mask, vec))
+ #endif
+
 // GCC (ver 6-10(dev)) fails to optimize pure C version of mask testing, but has this intrinsic; Clang >= 7 optimizes C version fine
- #if defined(__GNUC__) && __GNUC__ >= 7
+ #if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
 # define KORTEST16(a, b) !_kortestz_mask16_u8((a), (b))
 # define KAND16(a, b) _kand_mask16((a), (b))
 # define KOR16(a, b) _kor_mask16((a), (b))
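The new COMPRESS_STORE macro chooses between a true masked compress-store and a compress-into-register followed by a plain 16-byte store; per the "avoid uCode on Zen4" comment, the second form sidesteps the microcoded memory-destination VPCOMPRESSB on that core, at the cost of always writing a full vector (so the destination needs writable slack past the packed bytes). A hedged sketch of the two forms, assuming an AVX-512VL+VBMI2 build; the helper names are illustrative only:

    #include <immintrin.h>
    #include <stdint.h>

    // Both helpers leave the bytes of 'v' selected by 'keep' packed at the start of 'dst'.
    // The split form always writes 16 bytes (the tail is zero-filled by the maskz compress),
    // so 'dst' needs at least 16 writable bytes.
    inline void compress_store_direct(uint8_t* dst, __mmask16 keep, __m128i v) {
        _mm_mask_compressstoreu_epi8(dst, keep, v); // one instruction; reportedly microcoded on Zen 4
    }
    inline void compress_store_split(uint8_t* dst, __mmask16 keep, __m128i v) {
        __m128i packed = _mm_maskz_compress_epi8(keep, v); // compress within a register
        _mm_storeu_si128((__m128i*)dst, packed);           // then do a plain full-width store
    }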
@@ -104,7 +111,7 @@ static HEDLEY_ALWAYS_INLINE __m128i sse2_compact_vect(uint32_t mask, __m128i dat
 }
 
 template<bool isRaw, bool searchEnd, enum YEncDecIsaLevel use_isa>
- HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long& len, unsigned char* HEDLEY_RESTRICT & p, unsigned char& _escFirst, uint16_t& _nextMask) {
+ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* src, long& len, unsigned char*& p, unsigned char& _escFirst, uint16_t& _nextMask) {
 HEDLEY_ASSUME(_escFirst == 0 || _escFirst == 1);
 HEDLEY_ASSUME(_nextMask == 0 || _nextMask == 1 || _nextMask == 2);
 uintptr_t escFirst = _escFirst;
@@ -112,7 +119,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 -42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42-64
 ) : _mm_set1_epi8(-42);
 
- #if defined(__SSSE3__) && !defined(__tune_atom__) && !defined(__tune_slm__) && !defined(__tune_btver1__)
+ #if defined(__SSSE3__) && !defined(__tune_atom__) && !defined(__tune_slm__) && !defined(__tune_btver1__) && !defined(__tune_btver2__)
 const bool _USING_FAST_MATCH = (use_isa >= ISA_LEVEL_SSSE3);
 #else
 const bool _USING_FAST_MATCH = false;
@@ -121,6 +128,13 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 const bool _USING_BLEND_ADD = (use_isa >= ISA_LEVEL_SSE41);
 #else
 const bool _USING_BLEND_ADD = false;
+ #endif
+ #if defined(__AVX512VL__) && defined(__AVX512BW__)
+ # if defined(_MSC_VER) && !defined(PLATFORM_AMD64) && !defined(__clang__)
+ const bool useAVX3MaskCmp = false;
+ # else
+ const bool useAVX3MaskCmp = (use_isa >= ISA_LEVEL_AVX3);
+ # endif
 #endif
 
 __m128i lfCompare = _mm_set1_epi8('\n');
@@ -214,7 +228,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 __mmask16 match2EqMaskA, match2EqMaskB;
 __mmask16 match0CrMaskA, match0CrMaskB;
 __mmask16 match2CrXDtMaskA, match2CrXDtMaskB;
- if(use_isa >= ISA_LEVEL_AVX3 && searchEnd) {
+ if(useAVX3MaskCmp && searchEnd) {
 match2EqMaskA = _mm_cmpeq_epi8_mask(_mm_set1_epi8('='), tmpData2A);
 match2EqMaskB = _mm_cmpeq_epi8_mask(_mm_set1_epi8('='), tmpData2B);
 } else
@@ -230,7 +244,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 __m128i match2CrXDtA, match2CrXDtB;
 if(isRaw) {
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
- if(use_isa >= ISA_LEVEL_AVX3) {
+ if(useAVX3MaskCmp) {
 match0CrMaskA = _mm_cmpeq_epi8_mask(oDataA, _mm_set1_epi8('\r'));
 match0CrMaskB = _mm_cmpeq_epi8_mask(oDataB, _mm_set1_epi8('\r'));
 match2CrXDtMaskA = _mm_mask_cmpeq_epi8_mask(match0CrMaskA, tmpData2A, _mm_set1_epi8('.'));
@@ -256,7 +270,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
 __mmask16 match1NlMaskA, match1NlMaskB;
 __mmask16 match2NlDotMaskA, match2NlDotMaskB;
- if(use_isa >= ISA_LEVEL_AVX3) {
+ if(useAVX3MaskCmp) {
 match1NlMaskA = _mm_mask_cmpeq_epi8_mask(
 match0CrMaskA,
 _mm_set1_epi8('\n'),
@@ -299,7 +313,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 
 int matchEnd;
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
- if(use_isa >= ISA_LEVEL_AVX3) {
+ if(useAVX3MaskCmp) {
 __mmask16 match3EqYMaskA = _mm_mask_cmpeq_epi8_mask(
 match2EqMaskA, _mm_set1_epi8('y'), tmpData3A
 );
@@ -373,7 +387,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 }
 }
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
- if(use_isa >= ISA_LEVEL_AVX3) {
+ if(useAVX3MaskCmp) {
 mask |= match2NlDotMaskA << 2;
 mask |= (match2NlDotMaskB << 18) & 0xffffffff;
 minMask = _mm_maskz_mov_epi8(~(match2NlDotMaskB>>14), _mm_set1_epi8('.'));
@@ -398,7 +412,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 __m128i match3EqYA, match3EqYB;
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
 __mmask16 match3EqYMaskA, match3EqYMaskB;
- if(use_isa >= ISA_LEVEL_AVX3) {
+ if(useAVX3MaskCmp) {
 match3EqYMaskA = _mm_mask_cmpeq_epi8_mask(
 match2EqMaskA,
 _mm_set1_epi8('y'),
@@ -434,7 +448,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 bool endFound;
 
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
- if(use_isa >= ISA_LEVEL_AVX3) {
+ if(useAVX3MaskCmp) {
 __mmask16 match3LfEqYMaskA = _mm_mask_cmpeq_epi8_mask(
 match3EqYMaskA,
 _mm_set1_epi8('\n'),
@@ -642,9 +656,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 if(use_isa >= ISA_LEVEL_SSSE3) {
 # if defined(__AVX512VBMI2__) && defined(__AVX512VL__) && defined(__POPCNT__)
 if(use_isa >= ISA_LEVEL_VBMI2) {
- _mm_mask_compressstoreu_epi8(p, KNOT16(mask), dataA);
+ COMPRESS_STORE(p, KNOT16(mask), dataA);
 p -= popcnt32(mask & 0xffff);
- _mm_mask_compressstoreu_epi8(p+XMM_SIZE, KNOT16(mask>>16), dataB);
+ COMPRESS_STORE(p+XMM_SIZE, KNOT16(mask>>16), dataB);
 p -= popcnt32(mask>>16);
 p += XMM_SIZE*2;
 } else
@@ -0,0 +1,37 @@
+ #include "common.h"
+
+ extern const bool decoder_has_avx10;
+ #if !defined(__EVEX512__) && (defined(__AVX10_1__) || defined(__EVEX256__)) && defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
+ const bool decoder_has_avx10 = true;
+ #else
+ const bool decoder_has_avx10 = false;
+ #endif
+
+ #if defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
+ # include "decoder_common.h"
+ # ifndef YENC_DISABLE_AVX256
+ # include "decoder_avx2_base.h"
+ void decoder_set_vbmi2_funcs() {
+ ALIGN_ALLOC(lookups, sizeof(*lookups), 16);
+ // TODO: consider removing compact LUT
+ decoder_init_lut(lookups->eqFix, lookups->compact);
+ _do_decode = &do_decode_simd<false, false, sizeof(__m256i)*2, do_decode_avx2<false, false, ISA_LEVEL_VBMI2> >;
+ _do_decode_raw = &do_decode_simd<true, false, sizeof(__m256i)*2, do_decode_avx2<true, false, ISA_LEVEL_VBMI2> >;
+ _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m256i)*2, do_decode_avx2<true, true, ISA_LEVEL_VBMI2> >;
+ }
+ # else
+ # include "decoder_sse_base.h"
+ void decoder_set_vbmi2_funcs() {
+ decoder_sse_init();
+ decoder_init_lut(lookups->eqFix, lookups->compact);
+ _do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_VBMI2> >;
+ _do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_VBMI2> >;
+ _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_VBMI2> >;
+ }
+ # endif
+ #else
+ void decoder_set_avx2_funcs();
+ void decoder_set_vbmi2_funcs() {
+ decoder_set_avx2_funcs();
+ }
+ #endif
package/src/encoder.cc CHANGED
@@ -128,7 +128,10 @@ void encoder_sse2_init();
 void encoder_ssse3_init();
 void encoder_avx_init();
 void encoder_avx2_init();
+ void encoder_vbmi2_init();
+ extern const bool encoder_has_avx10;
 void encoder_neon_init();
+ void encoder_rvv_init();
 
 #if defined(PLATFORM_X86) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
 # if defined(__AVX2__) && !defined(YENC_DISABLE_AVX256)
@@ -153,7 +156,9 @@ void encoder_init() {
 encoder_native_init();
 # else
 int use_isa = cpu_supports_isa();
- if(use_isa >= ISA_LEVEL_AVX2)
+ if(use_isa >= ISA_LEVEL_VBMI2 && (encoder_has_avx10 || (use_isa & ISA_FEATURE_EVEX512)))
+ encoder_vbmi2_init();
+ else if(use_isa >= ISA_LEVEL_AVX2)
 encoder_avx2_init();
 else if(use_isa >= ISA_LEVEL_AVX)
 encoder_avx_init();
@@ -167,4 +172,8 @@ void encoder_init() {
 if(cpu_supports_neon())
 encoder_neon_init();
 #endif
+ #ifdef __riscv
+ if(cpu_supports_rvv())
+ encoder_rvv_init();
+ #endif
 }
@@ -6,7 +6,7 @@
 #include "encoder_common.h"
 #define YMM_SIZE 32
 
- #if defined(__GNUC__) && __GNUC__ >= 7
+ #if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
 # define KLOAD32(a, offs) _load_mask32((__mmask32*)(a) + (offs))
 #else
 # define KLOAD32(a, offs) (((uint32_t*)(a))[(offs)])
@@ -215,7 +215,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
 // duplicate halves
 data1A = _mm256_inserti128_si256(dataA, _mm256_castsi256_si128(dataA), 1);
 data1B = _mm256_inserti128_si256(dataB, _mm256_castsi256_si128(dataB), 1);
- #if defined(__tune_znver2__) || defined(__tune_znver3__)
+ #if defined(__tune_znver2__) || defined(__tune_znver3__) || defined(__tune_znver4__)
 data2A = _mm256_permute2x128_si256(dataA, dataA, 0x11);
 data2B = _mm256_permute2x128_si256(dataB, dataB, 0x11);
 #else
@@ -290,10 +290,10 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
 
 #if defined(__GNUC__) && defined(PLATFORM_AMD64)
 if(use_isa >= ISA_LEVEL_VBMI2) {
- asm(
+ __asm__(
 "shrq $1, %[eqMask] \n"
 "shrq %%cl, %[eqMask] \n"
- "adcq %[col], %[p] \n"
+ "adcq %q[col], %q[p] \n"
 : [eqMask]"+r"(eqMask), [p]"+r"(p)
 : "c"(shiftAmt), [col]"r"(~col)
 );
@@ -334,28 +334,32 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
 if(use_isa >= ISA_LEVEL_AVX3) {
 # if defined(__AVX512VBMI2__)
 if(use_isa >= ISA_LEVEL_VBMI2) {
- _mm256_mask_storeu_epi8(p+1, 1UL<<31, dataA);
+ __m128i dataTop = _mm256_extracti128_si256(dataA, 1);
 dataA = _mm256_mask_expand_epi8(_mm256_set1_epi8('='), KNOT32(maskA), dataA);
 _mm256_storeu_si256((__m256i*)p, dataA);
+ p[32] = _mm_extract_epi8(dataTop, 15);
 p += outputBytesA;
 
- _mm256_mask_storeu_epi8(p+1, 1UL<<31, dataB);
+ dataTop = _mm256_extracti128_si256(dataB, 1);
 dataB = _mm256_mask_expand_epi8(_mm256_set1_epi8('='), KNOT32(maskB), dataB);
 _mm256_storeu_si256((__m256i*)p, dataB);
+ p[32] = _mm_extract_epi8(dataTop, 15);
 p += maskBitsB;
 } else
 # endif
 {
- _mm256_mask_storeu_epi8(p+1, 1UL<<31, dataA);
- dataA = _mm256_mask_alignr_epi8(dataA, (uint32_t)(-(int32_t)maskA), dataA, _mm256_permute4x64_epi64(dataA, _MM_SHUFFLE(1,0,3,2)), 15);
+ __m256i dataSwapped = _mm256_permute4x64_epi64(dataA, _MM_SHUFFLE(1,0,3,2));
+ dataA = _mm256_mask_alignr_epi8(dataA, (uint32_t)(-(int32_t)maskA), dataA, dataSwapped, 15);
 dataA = _mm256_ternarylogic_epi32(dataA, cmpA, _mm256_set1_epi8('='), 0xb8); // (data & ~cmp) | (cmp & '=')
 _mm256_storeu_si256((__m256i*)p, dataA);
+ p[32] = _mm_extract_epi8(_mm256_castsi256_si128(dataSwapped), 15);
 p += outputBytesA;
 
- _mm256_mask_storeu_epi8(p+1, 1UL<<31, dataB);
- dataB = _mm256_mask_alignr_epi8(dataB, (uint32_t)(-(int32_t)maskB), dataB, _mm256_permute4x64_epi64(dataB, _MM_SHUFFLE(1,0,3,2)), 15);
+ dataSwapped = _mm256_permute4x64_epi64(dataB, _MM_SHUFFLE(1,0,3,2));
+ dataB = _mm256_mask_alignr_epi8(dataB, (uint32_t)(-(int32_t)maskB), dataB, dataSwapped, 15);
 dataB = _mm256_ternarylogic_epi32(dataB, cmpB, _mm256_set1_epi8('='), 0xb8);
 _mm256_storeu_si256((__m256i*)p, dataB);
+ p[32] = _mm_extract_epi8(_mm256_castsi256_si128(dataSwapped), 15);
 p += maskBitsB;
 }
 } else
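Both branches of the hunk above drop the single-byte _mm256_mask_storeu_epi8(p+1, 1UL<<31, ...) and instead pull the vector's top byte out with _mm256_extracti128_si256 and write it to p[32] as an ordinary scalar store, before the expand/alignr step clobbers the register. A hedged sketch of the two equivalent ways to place that 33rd byte, assuming AVX-512BW+VL for the masked variant; the helper names are illustrative only:

    #include <immintrin.h>
    #include <stdint.h>

    // Both helpers write byte 31 of 'data' to p[32] - i.e. the byte that would land there
    // if 'data' were stored unaligned at p+1.
    inline void store_top_byte_masked(uint8_t* p, __m256i data) {
        _mm256_mask_storeu_epi8(p + 1, 1UL << 31, data); // old form: one-byte masked store
    }
    inline void store_top_byte_scalar(uint8_t* p, __m256i data) {
        __m128i top = _mm256_extracti128_si256(data, 1); // high lane: bytes 16..31
        p[32] = (uint8_t)_mm_extract_epi8(top, 15);      // new form: plain scalar store of byte 31
    }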
@@ -484,28 +488,32 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
 if(use_isa >= ISA_LEVEL_AVX3) {
 # if defined(__AVX512VBMI2__)
 if(use_isa >= ISA_LEVEL_VBMI2) {
- _mm256_mask_storeu_epi8(p+1, 1UL<<31, dataA);
+ __m128i dataTop = _mm256_extracti128_si256(dataA, 1);
 dataA = _mm256_mask_expand_epi8(_mm256_set1_epi8('='), KNOT32(maskA), dataA);
 _mm256_storeu_si256((__m256i*)p, dataA);
+ p[32] = _mm_extract_epi8(dataTop, 15);
 p += outputBytesA;
 
- _mm256_mask_storeu_epi8(p+1, 1UL<<31, dataB);
+ dataTop = _mm256_extracti128_si256(dataB, 1);
 dataB = _mm256_mask_expand_epi8(_mm256_set1_epi8('='), KNOT32(maskB), dataB);
 _mm256_storeu_si256((__m256i*)p, dataB);
+ p[32] = _mm_extract_epi8(dataTop, 15);
 p += maskBitsB;
 } else
 # endif
 {
- _mm256_mask_storeu_epi8(p+1, 1UL<<31, dataA);
- dataA = _mm256_mask_alignr_epi8(dataA, (uint32_t)(-(int32_t)maskA), dataA, _mm256_permute4x64_epi64(dataA, _MM_SHUFFLE(1,0,3,2)), 15);
+ __m256i dataSwapped = _mm256_permute4x64_epi64(dataA, _MM_SHUFFLE(1,0,3,2));
+ dataA = _mm256_mask_alignr_epi8(dataA, (uint32_t)(-(int32_t)maskA), dataA, dataSwapped, 15);
 dataA = _mm256_ternarylogic_epi32(dataA, cmpA, _mm256_set1_epi8('='), 0xb8); // (data & ~cmp) | (cmp & '=')
 _mm256_storeu_si256((__m256i*)p, dataA);
+ p[32] = _mm_extract_epi8(_mm256_castsi256_si128(dataSwapped), 15);
 p += outputBytesA;
 
- _mm256_mask_storeu_epi8(p+1, 1UL<<31, dataB);
- dataB = _mm256_mask_alignr_epi8(dataB, (uint32_t)(-(int32_t)maskB), dataB, _mm256_permute4x64_epi64(dataB, _MM_SHUFFLE(1,0,3,2)), 15);
+ dataSwapped = _mm256_permute4x64_epi64(dataB, _MM_SHUFFLE(1,0,3,2));
+ dataB = _mm256_mask_alignr_epi8(dataB, (uint32_t)(-(int32_t)maskB), dataB, dataSwapped, 15);
 dataB = _mm256_ternarylogic_epi32(dataB, cmpB, _mm256_set1_epi8('='), 0xb8);
 _mm256_storeu_si256((__m256i*)p, dataB);
+ p[32] = _mm_extract_epi8(_mm256_castsi256_si128(dataSwapped), 15);
 p += maskBitsB;
 }
 } else
@@ -5,7 +5,7 @@
 #include "encoder_common.h"
 
 // Clang wrongly assumes alignment on vst1q_u8_x2, and ARMv7 GCC doesn't support the function, so effectively, it can only be used in ARMv8 compilers
- #if defined(__aarch64__) && (defined(__clang__) || (defined(__GNUC__) && __GNUC__ >= 9))
+ #if defined(__aarch64__) && (defined(__clang__) || HEDLEY_GCC_VERSION_CHECK(8,5,0))
 # define vst1q_u8_x2_unaligned vst1q_u8_x2
 #else
 static HEDLEY_ALWAYS_INLINE void vst1q_u8_x2_unaligned(uint8_t* p, uint8x16x2_t data) {
@@ -15,6 +15,43 @@ static HEDLEY_ALWAYS_INLINE void vst1q_u8_x2_unaligned(uint8_t* p, uint8x16x2_t
 #endif
 
 
+ // ARM's CLZ instruction at native bit-width
+ #ifdef __aarch64__
+ static HEDLEY_ALWAYS_INLINE int clz_n(uint64_t v) {
+ # ifdef _MSC_VER
+ long r;
+ // does this work?
+ if(_BitScanReverse64((unsigned long*)&r, v))
+ r ^= 63;
+ else
+ r = 64;
+ return r;
+ # else
+ # if defined(__clang__) || HEDLEY_GCC_VERSION_CHECK(11,0,0)
+ // this pattern is only detected on GCC >= 11 (Clang 9 seems to as well, unsure about earlier versions)
+ // - note: return type must be 'int'; GCC fails to optimise this if type is 'long'
+ // GCC <= 10 doesn't optimize around the '0 = undefined behaviour', so not needed there
+ if(v == 0) return 64;
+ # endif
+ return __builtin_clzll(v);
+ # endif
+ }
+ #else
+ static HEDLEY_ALWAYS_INLINE int clz_n(uint32_t v) {
+ # ifdef __GNUC__
+ # if defined(__clang__) || HEDLEY_GCC_VERSION_CHECK(7,0,0)
+ // as with AArch64 version above, only insert this check if compiler can optimise it away
+ if(v == 0) return 32;
+ # endif
+ return __builtin_clz(v);
+ # elif defined(_MSC_VER)
+ return _arm_clz(v);
+ # else
+ return __clz(v); // ARM compiler?
+ # endif
+ }
+ #endif
+
 static uint8x16_t ALIGN_TO(16, shufLUT[256]);
 static uint16_t expandLUT[256];
 
@@ -195,26 +232,7 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
 col = shufTotalLen+1 + lineSizeOffset-32;
 } else {
 // shuffle stuff up
- #ifdef __aarch64__
- # ifdef _MSC_VER
- long bitIndex;
- if(_BitScanReverse64((unsigned long*)&bitIndex, mask))
- bitIndex ^= 63;
- else
- bitIndex = 64;
- # else
- long bitIndex = __builtin_clzll(mask);
- # endif
- #else
- # ifdef __GNUC__
- long bitIndex = __builtin_clz(mask); // TODO: is the 'undefined if 0' case problematic here?
- # elif defined(_MSC_VER)
- long bitIndex = _arm_clz(mask);
- # else
- long bitIndex = __clz(mask); // ARM compiler?
- # endif
- #endif
-
+ long bitIndex = clz_n(mask);
 uint8x16_t vClz = vdupq_n_u8(bitIndex & ~(sizeof(mask)*8));
 #ifdef __aarch64__
 uint8x16_t blendA = vcgtq_u8(vmakeq_u8(63,62,61,60,51,50,49,48,47,46,45,44,35,34,33,32), vClz);
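Both CLZ call sites in the NEON encoder now go through the clz_n() helper added earlier, which returns the full bit width (64 or 32) for a zero input instead of relying on __builtin_clz's undefined zero case. A minimal GCC/Clang-only sketch of the same guard, using a hypothetical clz32_safe outside the NEON context:

    #include <stdint.h>

    // Count leading zeros of a 32-bit lane mask; a zero mask ("no lane flagged") is
    // reported as 32 instead of triggering __builtin_clz's undefined-for-zero case.
    static inline int clz32_safe(uint32_t mask) {
        if(mask == 0) return 32;    // must be checked first: __builtin_clz(0) is undefined
        return __builtin_clz(mask); // GCC/Clang builtin; a single CLZ instruction on ARM
    }

    // e.g. clz32_safe(0x00008000) == 16: the first set bit is 16 positions below the top bit.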
@@ -450,26 +468,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const ui
 }
 } else {
 {
- #ifdef __aarch64__
- # ifdef _MSC_VER
- // does this work?
- if(_BitScanReverse64((unsigned long*)&bitIndex, mask))
- bitIndex ^= 63;
- else
- bitIndex = 64;
- # else
- bitIndex = __builtin_clzll(mask); // TODO: is the 'undefined if 0' case problematic here?
- # endif
- #else
- # ifdef __GNUC__
- bitIndex = __builtin_clz(mask);
- # elif defined(_MSC_VER)
- bitIndex = _arm_clz(mask);
- # else
- bitIndex = __clz(mask); // ARM compiler?
- # endif
- #endif
-
+ bitIndex = clz_n(mask);
 uint8x16_t vClz = vdupq_n_u8(bitIndex & ~(sizeof(mask)*8));
 #ifdef __aarch64__
 uint8x16_t blendA = vcgeq_u8(vmakeq_u8(63,62,61,60,51,50,49,48,47,46,45,44,35,34,33,32), vClz);