yencode 1.1.3 → 1.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -365,12 +365,11 @@ static uint32_t do_crc32_incremental_clmul(const void* data, size_t length, uint
365
365
  return crc_fold((const unsigned char*)data, (long)length, init);
366
366
  }
367
367
 
368
- void crc_clmul_set_funcs(crc_func* _do_crc32_incremental) {
369
- *_do_crc32_incremental = &do_crc32_incremental_clmul;
368
+ void crc_clmul_set_funcs() {
369
+ _do_crc32_incremental = &do_crc32_incremental_clmul;
370
+ _crc32_isa = ISA_LEVEL_PCLMUL;
370
371
  }
371
372
  #else
372
- void crc_clmul_set_funcs(crc_func* _do_crc32_incremental) {
373
- (void)_do_crc32_incremental;
374
- }
373
+ void crc_clmul_set_funcs() {}
375
374
  #endif
376
375
 
@@ -26,10 +26,9 @@ static __m256i do_one_fold(__m256i src, __m256i data) {
26
26
  0x96
27
27
  );
28
28
  #else
29
- return _mm256_xor_si256(data, _mm256_xor_si256(
30
- _mm256_clmulepi64_epi128(src, fold4, 0x01),
31
- _mm256_clmulepi64_epi128(src, fold4, 0x10)
32
- ));
29
+ return _mm256_xor_si256(_mm256_xor_si256(
30
+ data, _mm256_clmulepi64_epi128(src, fold4, 0x01)
31
+ ), _mm256_clmulepi64_epi128(src, fold4, 0x10));
33
32
  #endif
34
33
  }
35
34
 
@@ -38,7 +37,7 @@ ALIGN_TO(32, static const uint8_t pshufb_rot_table[]) = {
38
37
  16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
39
38
  };
40
39
  // _mm256_castsi128_si256, but upper is defined to be 0
41
- #if (defined(__clang__) && __clang_major__ >= 5 && (!defined(__APPLE__) || __clang_major__ >= 7)) || (defined(__GNUC__) && __GNUC__ >= 10)
40
+ #if (defined(__clang__) && __clang_major__ >= 5 && (!defined(__APPLE__) || __clang_major__ >= 7)) || (defined(__GNUC__) && __GNUC__ >= 10) || (defined(_MSC_VER) && _MSC_VER >= 1910)
42
41
  // intrinsic unsupported in GCC 9 and MSVC < 2017
43
42
  # define zext128_256 _mm256_zextsi128_si256
44
43
  #else
@@ -218,13 +217,14 @@ static uint32_t do_crc32_incremental_clmul(const void* data, size_t length, uint
218
217
  return crc_fold((const unsigned char*)data, (long)length, init);
219
218
  }
220
219
 
221
- void crc_clmul256_set_funcs(crc_func* _do_crc32_incremental) {
222
- *_do_crc32_incremental = &do_crc32_incremental_clmul;
220
+ void crc_clmul256_set_funcs() {
221
+ _do_crc32_incremental = &do_crc32_incremental_clmul;
222
+ _crc32_isa = ISA_LEVEL_VPCLMUL;
223
223
  }
224
224
  #else
225
- void crc_clmul_set_funcs(crc_func* _do_crc32_incremental);
226
- void crc_clmul256_set_funcs(crc_func* _do_crc32_incremental) {
227
- crc_clmul_set_funcs(_do_crc32_incremental);
225
+ void crc_clmul_set_funcs();
226
+ void crc_clmul256_set_funcs() {
227
+ crc_clmul_set_funcs();
228
228
  }
229
229
  #endif
230
230
 
package/src/decoder.cc CHANGED
@@ -4,9 +4,11 @@
4
4
  #include "decoder.h"
5
5
 
6
6
  extern "C" {
7
- YencDecoderEnd (*_do_decode)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*) = &do_decode_scalar<false, false>;
8
- YencDecoderEnd (*_do_decode_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*) = &do_decode_scalar<true, false>;
9
- YencDecoderEnd (*_do_decode_end_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*) = &do_decode_end_scalar<true>;
7
+ YencDecoderEnd (*_do_decode)(const unsigned char**, unsigned char**, size_t, YencDecoderState*) = &do_decode_scalar<false, false>;
8
+ YencDecoderEnd (*_do_decode_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*) = &do_decode_scalar<true, false>;
9
+ YencDecoderEnd (*_do_decode_end_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*) = &do_decode_end_scalar<true>;
10
+
11
+ int _decode_isa = ISA_GENERIC;
10
12
  }
11
13
 
12
14
  void decoder_set_sse2_funcs();
@@ -14,6 +16,7 @@ void decoder_set_ssse3_funcs();
14
16
  void decoder_set_avx_funcs();
15
17
  void decoder_set_avx2_funcs();
16
18
  void decoder_set_vbmi2_funcs();
19
+ extern const bool decoder_has_avx10;
17
20
  void decoder_set_neon_funcs();
18
21
 
19
22
 
@@ -26,6 +29,7 @@ static inline void decoder_set_native_funcs() {
26
29
  _do_decode = &do_decode_simd<false, false, sizeof(__m256i)*2, do_decode_avx2<false, false, ISA_NATIVE> >;
27
30
  _do_decode_raw = &do_decode_simd<true, false, sizeof(__m256i)*2, do_decode_avx2<true, false, ISA_NATIVE> >;
28
31
  _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m256i)*2, do_decode_avx2<true, true, ISA_NATIVE> >;
32
+ _decode_isa = ISA_NATIVE;
29
33
  }
30
34
  # else
31
35
  # include "decoder_sse_base.h"
@@ -35,6 +39,7 @@ static inline void decoder_set_native_funcs() {
35
39
  _do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_NATIVE> >;
36
40
  _do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_NATIVE> >;
37
41
  _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_NATIVE> >;
42
+ _decode_isa = ISA_NATIVE;
38
43
  }
39
44
  # endif
40
45
  #endif
@@ -45,7 +50,7 @@ void decoder_init() {
45
50
  decoder_set_native_funcs();
46
51
  # else
47
52
  int use_isa = cpu_supports_isa();
48
- if(use_isa >= ISA_LEVEL_VBMI2)
53
+ if(use_isa >= ISA_LEVEL_VBMI2 && (decoder_has_avx10 || (use_isa & ISA_FEATURE_EVEX512)))
49
54
  decoder_set_vbmi2_funcs();
50
55
  else if(use_isa >= ISA_LEVEL_AVX2)
51
56
  decoder_set_avx2_funcs();
package/src/decoder.h CHANGED
@@ -29,22 +29,26 @@ typedef enum {
29
29
 
30
30
  #include "hedley.h"
31
31
 
32
- extern YencDecoderEnd (*_do_decode)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*);
33
- extern YencDecoderEnd (*_do_decode_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*);
34
- extern YencDecoderEnd (*_do_decode_end_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*);
32
+ extern YencDecoderEnd (*_do_decode)(const unsigned char**, unsigned char**, size_t, YencDecoderState*);
33
+ extern YencDecoderEnd (*_do_decode_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*);
34
+ extern YencDecoderEnd (*_do_decode_end_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*);
35
+ extern int _decode_isa;
35
36
 
36
- static inline size_t do_decode(int isRaw, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, YencDecoderState* state) {
37
+ static inline size_t do_decode(int isRaw, const unsigned char* src, unsigned char* dest, size_t len, YencDecoderState* state) {
37
38
  unsigned char* ds = dest;
38
39
  (*(isRaw ? _do_decode_raw : _do_decode))(&src, &ds, len, state);
39
40
  return ds - dest;
40
41
  }
41
42
 
42
- static inline YencDecoderEnd do_decode_end(const unsigned char*HEDLEY_RESTRICT* src, unsigned char*HEDLEY_RESTRICT* dest, size_t len, YencDecoderState* state) {
43
+ static inline YencDecoderEnd do_decode_end(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
43
44
  return _do_decode_end_raw(src, dest, len, state);
44
45
  }
45
46
 
46
47
  void decoder_init();
47
48
 
49
+ static inline int decode_isa_level() {
50
+ return _decode_isa;
51
+ }
48
52
 
49
53
 
50
54
  #ifdef __cplusplus
@@ -9,6 +9,7 @@ void decoder_set_avx_funcs() {
9
9
  _do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_SSE4_POPCNT> >;
10
10
  _do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_SSE4_POPCNT> >;
11
11
  _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_SSE4_POPCNT> >;
12
+ _decode_isa = ISA_LEVEL_AVX;
12
13
  }
13
14
  #else
14
15
  void decoder_set_ssse3_funcs();
@@ -9,6 +9,7 @@ void decoder_set_avx2_funcs() {
9
9
  _do_decode = &do_decode_simd<false, false, sizeof(__m256i)*2, do_decode_avx2<false, false, ISA_LEVEL_AVX2> >;
10
10
  _do_decode_raw = &do_decode_simd<true, false, sizeof(__m256i)*2, do_decode_avx2<true, false, ISA_LEVEL_AVX2> >;
11
11
  _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m256i)*2, do_decode_avx2<true, true, ISA_LEVEL_AVX2> >;
12
+ _decode_isa = ISA_LEVEL_AVX2;
12
13
  }
13
14
  #else
14
15
  void decoder_set_avx_funcs();
@@ -30,7 +30,7 @@ static HEDLEY_ALWAYS_INLINE __m256i force_align_read_256(const void* p) {
30
30
  }
31
31
 
32
32
  // _mm256_castsi128_si256, but upper is defined to be 0
33
- #if (defined(__clang__) && __clang_major__ >= 5 && (!defined(__APPLE__) || __clang_major__ >= 7)) || (defined(__GNUC__) && __GNUC__ >= 10)
33
+ #if (defined(__clang__) && __clang_major__ >= 5 && (!defined(__APPLE__) || __clang_major__ >= 7)) || (defined(__GNUC__) && __GNUC__ >= 10) || (defined(_MSC_VER) && _MSC_VER >= 1910)
34
34
  // intrinsic unsupported in GCC 9 and MSVC < 2017
35
35
  # define zext128_256 _mm256_zextsi128_si256
36
36
  #else
@@ -43,9 +43,15 @@ static HEDLEY_ALWAYS_INLINE __m256i force_align_read_256(const void* p) {
43
43
  # endif
44
44
  #endif
45
45
 
46
+ #if defined(__tune_icelake_client__) || defined(__tune_icelake_server__) || defined(__tune_tigerlake__) || defined(__tune_rocketlake__) || defined(__tune_alderlake__) || defined(__tune_sapphirerapids__)
47
+ # define COMPRESS_STORE _mm256_mask_compressstoreu_epi8
48
+ #else
49
+ // avoid uCode on Zen4
50
+ # define COMPRESS_STORE(dst, mask, vec) _mm256_storeu_si256((__m256i*)(dst), _mm256_maskz_compress_epi8(mask, vec))
51
+ #endif
46
52
 
47
53
  template<bool isRaw, bool searchEnd, enum YEncDecIsaLevel use_isa>
48
- HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, long& len, unsigned char* HEDLEY_RESTRICT & p, unsigned char& _escFirst, uint16_t& _nextMask) {
54
+ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* src, long& len, unsigned char*& p, unsigned char& _escFirst, uint16_t& _nextMask) {
49
55
  HEDLEY_ASSUME(_escFirst == 0 || _escFirst == 1);
50
56
  HEDLEY_ASSUME(_nextMask == 0 || _nextMask == 1 || _nextMask == 2);
51
57
  uintptr_t escFirst = _escFirst;
@@ -61,6 +67,8 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
61
67
  );
62
68
  }
63
69
 
70
+ decoder_set_nextMask<isRaw>(src, len, _nextMask); // set this before the loop because we can't check src after it's been overwritten
71
+
64
72
  // for some reason, MSVC Win32 seems to crash when trying to compile _mm256_mask_cmpeq_epi8_mask
65
73
  // the crash can be fixed by switching the order of the last two arguments, but it seems to generate wrong code
66
74
  // so just disable the optimisation as it seems to be problematic there
@@ -314,6 +322,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
314
322
  // terminator found
315
323
  // there's probably faster ways to do this, but reverting to scalar code should be good enough
316
324
  len += (long)i;
325
+ _nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
317
326
  break;
318
327
  }
319
328
  }
@@ -406,6 +415,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
406
415
  }
407
416
  if(endFound) {
408
417
  len += (long)i;
418
+ _nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
409
419
  break;
410
420
  }
411
421
  }
@@ -541,9 +551,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
541
551
  // all that's left is to 'compress' the data (skip over masked chars)
542
552
  #if defined(__AVX512VBMI2__) && defined(__AVX512VL__)
543
553
  if(use_isa >= ISA_LEVEL_VBMI2) {
544
- _mm256_mask_compressstoreu_epi8(p, KNOT32(mask), dataA);
554
+ COMPRESS_STORE(p, KNOT32(mask), dataA);
545
555
  p -= popcnt32(mask & 0xffffffff);
546
- _mm256_mask_compressstoreu_epi8((p + XMM_SIZE*2), KNOT32(mask>>32), dataB);
556
+ COMPRESS_STORE((p + XMM_SIZE*2), KNOT32(mask>>32), dataB);
547
557
  p += XMM_SIZE*4 - popcnt32(mask >> 32);
548
558
  } else
549
559
  #endif
@@ -607,20 +617,6 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
607
617
  }
608
618
  }
609
619
  _escFirst = (unsigned char)escFirst;
610
- if(isRaw) {
611
- // this would be the trivial solution, but requires the compiler holding onto minMask throughout the loop:
612
- //_nextMask = ~(uint16_t)_mm256_movemask_epi8(_mm256_cmpeq_epi8(minMask, _mm256_set1_epi8('.')));
613
- // instead, just scan the memory to determine what to set nextMask to
614
- if(len != 0) { // have to gone through at least one loop cycle
615
- if(src[i-2] == '\r' && src[i-1] == '\n' && src[i] == '.')
616
- _nextMask = 1;
617
- else if(src[i-1] == '\r' && src[i] == '\n' && src[i+1] == '.')
618
- _nextMask = 2;
619
- else
620
- _nextMask = 0;
621
- }
622
- } else
623
- _nextMask = 0;
624
620
  _mm256_zeroupper();
625
621
  }
626
622
  #endif
@@ -6,7 +6,7 @@
6
6
 
7
7
  // state var: refers to the previous state - only used for incremental processing
8
8
  template<bool isRaw>
9
- size_t do_decode_noend_scalar(const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, YencDecoderState* state) {
9
+ size_t do_decode_noend_scalar(const unsigned char* src, unsigned char* dest, size_t len, YencDecoderState* state) {
10
10
  const unsigned char *es = src + len; // end source pointer
11
11
  unsigned char *p = dest; // destination pointer
12
12
  long i = -(long)len; // input position
@@ -140,7 +140,7 @@ size_t do_decode_noend_scalar(const unsigned char* HEDLEY_RESTRICT src, unsigned
140
140
  }
141
141
 
142
142
  template<bool isRaw>
143
- YencDecoderEnd do_decode_end_scalar(const unsigned char* HEDLEY_RESTRICT* src, unsigned char* HEDLEY_RESTRICT* dest, size_t len, YencDecoderState* state) {
143
+ YencDecoderEnd do_decode_end_scalar(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
144
144
  const unsigned char *es = (*src) + len; // end source pointer
145
145
  unsigned char *p = *dest; // destination pointer
146
146
  long i = -(long)len; // input position
@@ -321,7 +321,7 @@ YencDecoderEnd do_decode_end_scalar(const unsigned char* HEDLEY_RESTRICT* src, u
321
321
  }
322
322
 
323
323
  template<bool isRaw, bool searchEnd>
324
- YencDecoderEnd do_decode_scalar(const unsigned char* HEDLEY_RESTRICT* src, unsigned char* HEDLEY_RESTRICT* dest, size_t len, YencDecoderState* state) {
324
+ YencDecoderEnd do_decode_scalar(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
325
325
  if(searchEnd)
326
326
  return do_decode_end_scalar<isRaw>(src, dest, len, state);
327
327
  *dest += do_decode_noend_scalar<isRaw>(*src, *dest, len, state);
@@ -331,8 +331,8 @@ YencDecoderEnd do_decode_scalar(const unsigned char* HEDLEY_RESTRICT* src, unsig
331
331
 
332
332
 
333
333
 
334
- template<bool isRaw, bool searchEnd, int width, void(&kernel)(const uint8_t* HEDLEY_RESTRICT, long&, unsigned char* HEDLEY_RESTRICT &, unsigned char&, uint16_t&)>
335
- YencDecoderEnd do_decode_simd(const unsigned char* HEDLEY_RESTRICT* src, unsigned char* HEDLEY_RESTRICT* dest, size_t len, YencDecoderState* state) {
334
+ template<bool isRaw, bool searchEnd, int width, void(&kernel)(const uint8_t*, long&, unsigned char*&, unsigned char&, uint16_t&)>
335
+ YencDecoderEnd do_decode_simd(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
336
336
  if(len <= width*2) return do_decode_scalar<isRaw, searchEnd>(src, dest, len, state);
337
337
 
338
338
  YencDecoderState tState = YDEC_STATE_CRLF;
@@ -509,4 +509,29 @@ static inline void decoder_init_lut(uint8_t* eqFixLUT, void* compactLUT) {
509
509
  }
510
510
  #endif
511
511
  }
512
+ template<bool isRaw>
513
+ static inline void decoder_set_nextMask(const uint8_t* src, size_t len, uint16_t& nextMask) {
514
+ if(isRaw) {
515
+ if(len != 0) { // have to gone through at least one loop cycle
516
+ if(src[-2] == '\r' && src[-1] == '\n' && src[0] == '.')
517
+ nextMask = 1;
518
+ else if(src[-1] == '\r' && src[0] == '\n' && src[1] == '.')
519
+ nextMask = 2;
520
+ else
521
+ nextMask = 0;
522
+ }
523
+ } else
524
+ nextMask = 0;
525
+ }
512
526
 
527
+ // without backtracking
528
+ template<bool isRaw>
529
+ static inline uint16_t decoder_set_nextMask(const uint8_t* src, unsigned mask) {
530
+ if(isRaw) {
531
+ if(src[0] == '.')
532
+ return mask & 1;
533
+ if(src[1] == '.')
534
+ return mask & 2;
535
+ }
536
+ return 0;
537
+ }
@@ -59,7 +59,7 @@ static bool neon_vect_is_nonzero(uint8x16_t v) {
59
59
 
60
60
 
61
61
  template<bool isRaw, bool searchEnd>
62
- HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, long& len, unsigned char* HEDLEY_RESTRICT & p, unsigned char& escFirst, uint16_t& nextMask) {
62
+ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned char*& p, unsigned char& escFirst, uint16_t& nextMask) {
63
63
  HEDLEY_ASSUME(escFirst == 0 || escFirst == 1);
64
64
  HEDLEY_ASSUME(nextMask == 0 || nextMask == 1 || nextMask == 2);
65
65
  uint8x16_t yencOffset = escFirst ? vmakeq_u8(42+64,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42) : vdupq_n_u8(42);
@@ -78,6 +78,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
78
78
  lfCompare = vsetq_lane_u8('.', lfCompare, 1);
79
79
  }
80
80
  #endif
81
+
82
+ decoder_set_nextMask<isRaw>(src, len, nextMask);
83
+
81
84
  long i;
82
85
  for(i = -len; i; i += sizeof(uint8x16_t)*2) {
83
86
  uint8x16x2_t data = vld1q_u8_x2_align(src+i, 32);
@@ -251,6 +254,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
251
254
  // terminator found
252
255
  // there's probably faster ways to do this, but reverting to scalar code should be good enough
253
256
  len += i;
257
+ nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
254
258
  break;
255
259
  }
256
260
  }
@@ -301,6 +305,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
301
305
  );
302
306
  if(LIKELIHOOD(0.001, neon_vect_is_nonzero(matchEnd))) {
303
307
  len += i;
308
+ nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
304
309
  break;
305
310
  }
306
311
  }
@@ -449,18 +454,6 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
449
454
  #endif
450
455
  }
451
456
  }
452
-
453
- if(isRaw) {
454
- if(len != 0) { // have to gone through at least one loop cycle
455
- if(src[i-2] == '\r' && src[i-1] == '\n' && src[i] == '.')
456
- nextMask = 1;
457
- else if(src[i-1] == '\r' && src[i] == '\n' && src[i+1] == '.')
458
- nextMask = 2;
459
- else
460
- nextMask = 0;
461
- }
462
- } else
463
- nextMask = 0;
464
457
  }
465
458
 
466
459
  void decoder_set_neon_funcs() {
@@ -468,6 +461,7 @@ void decoder_set_neon_funcs() {
468
461
  _do_decode = &do_decode_simd<false, false, sizeof(uint8x16_t)*2, do_decode_neon<false, false> >;
469
462
  _do_decode_raw = &do_decode_simd<true, false, sizeof(uint8x16_t)*2, do_decode_neon<true, false> >;
470
463
  _do_decode_end_raw = &do_decode_simd<true, true, sizeof(uint8x16_t)*2, do_decode_neon<true, true> >;
464
+ _decode_isa = ISA_LEVEL_NEON;
471
465
  }
472
466
  #else
473
467
  void decoder_set_neon_funcs() {}
@@ -47,7 +47,7 @@ static HEDLEY_ALWAYS_INLINE uint8x16_t mergeCompares(uint8x16_t a, uint8x16_t b,
47
47
 
48
48
 
49
49
  template<bool isRaw, bool searchEnd>
50
- HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, long& len, unsigned char* HEDLEY_RESTRICT & p, unsigned char& escFirst, uint16_t& nextMask) {
50
+ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned char*& p, unsigned char& escFirst, uint16_t& nextMask) {
51
51
  HEDLEY_ASSUME(escFirst == 0 || escFirst == 1);
52
52
  HEDLEY_ASSUME(nextMask == 0 || nextMask == 1 || nextMask == 2);
53
53
  uint8x16_t nextMaskMix = vdupq_n_u8(0);
@@ -56,6 +56,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
56
56
  if(nextMask == 2)
57
57
  nextMaskMix = vsetq_lane_u8(2, nextMaskMix, 1);
58
58
  uint8x16_t yencOffset = escFirst ? vmakeq_u8(42+64,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42) : vdupq_n_u8(42);
59
+
60
+ decoder_set_nextMask<isRaw>(src, len, nextMask);
61
+
59
62
  long i;
60
63
  for(i = -len; i; i += sizeof(uint8x16_t)*4) {
61
64
  uint8x16x4_t data = _vld1q_u8_x4(src+i);
@@ -227,6 +230,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
227
230
  // terminator found
228
231
  // there's probably faster ways to do this, but reverting to scalar code should be good enough
229
232
  len += i;
233
+ nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
230
234
  break;
231
235
  }
232
236
  }
@@ -275,6 +279,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
275
279
  );
276
280
  if(LIKELIHOOD(0.001, neon_vect_is_nonzero(matchEnd))) {
277
281
  len += i;
282
+ nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
278
283
  break;
279
284
  }
280
285
  }
@@ -430,17 +435,6 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
430
435
  yencOffset = vdupq_n_u8(42);
431
436
  }
432
437
  }
433
- if(isRaw) {
434
- if(len != 0) { // have to gone through at least one loop cycle
435
- if(src[i-2] == '\r' && src[i-1] == '\n' && src[i] == '.')
436
- nextMask = 1;
437
- else if(src[i-1] == '\r' && src[i] == '\n' && src[i+1] == '.')
438
- nextMask = 2;
439
- else
440
- nextMask = 0;
441
- }
442
- } else
443
- nextMask = 0;
444
438
  }
445
439
 
446
440
  void decoder_set_neon_funcs() {
@@ -448,6 +442,7 @@ void decoder_set_neon_funcs() {
448
442
  _do_decode = &do_decode_simd<false, false, sizeof(uint8x16_t)*4, do_decode_neon<false, false> >;
449
443
  _do_decode_raw = &do_decode_simd<true, false, sizeof(uint8x16_t)*4, do_decode_neon<true, false> >;
450
444
  _do_decode_end_raw = &do_decode_simd<true, true, sizeof(uint8x16_t)*4, do_decode_neon<true, true> >;
445
+ _decode_isa = ISA_LEVEL_NEON;
451
446
  }
452
447
  #else
453
448
  void decoder_set_neon_funcs() {}
@@ -10,6 +10,7 @@ void decoder_set_sse2_funcs() {
10
10
  _do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_SSE2> >;
11
11
  _do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_SSE2> >;
12
12
  _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_SSE2> >;
13
+ _decode_isa = ISA_LEVEL_SSE2;
13
14
  }
14
15
  #else
15
16
  void decoder_set_sse2_funcs() {}
@@ -7,6 +7,13 @@
7
7
  # define _mm_shrdi_epi16 _mm128_shrdi_epi16
8
8
  #endif
9
9
 
10
+ #if defined(__tune_icelake_client__) || defined(__tune_icelake_server__) || defined(__tune_tigerlake__) || defined(__tune_rocketlake__) || defined(__tune_alderlake__) || defined(__tune_sapphirerapids__)
11
+ # define COMPRESS_STORE _mm_mask_compressstoreu_epi8
12
+ #else
13
+ // avoid uCode on Zen4
14
+ # define COMPRESS_STORE(dst, mask, vec) _mm_storeu_si128((__m128i*)(dst), _mm_maskz_compress_epi8(mask, vec))
15
+ #endif
16
+
10
17
  // GCC (ver 6-10(dev)) fails to optimize pure C version of mask testing, but has this intrinsic; Clang >= 7 optimizes C version fine
11
18
  #if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
12
19
  # define KORTEST16(a, b) !_kortestz_mask16_u8((a), (b))
@@ -104,7 +111,7 @@ static HEDLEY_ALWAYS_INLINE __m128i sse2_compact_vect(uint32_t mask, __m128i dat
104
111
  }
105
112
 
106
113
  template<bool isRaw, bool searchEnd, enum YEncDecIsaLevel use_isa>
107
- HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long& len, unsigned char* HEDLEY_RESTRICT & p, unsigned char& _escFirst, uint16_t& _nextMask) {
114
+ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* src, long& len, unsigned char*& p, unsigned char& _escFirst, uint16_t& _nextMask) {
108
115
  HEDLEY_ASSUME(_escFirst == 0 || _escFirst == 1);
109
116
  HEDLEY_ASSUME(_nextMask == 0 || _nextMask == 1 || _nextMask == 2);
110
117
  uintptr_t escFirst = _escFirst;
@@ -138,6 +145,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
138
145
  else
139
146
  lfCompare = _mm_insert_epi16(lfCompare, _nextMask == 1 ? 0x0a2e /*".\n"*/ : 0x2e0a /*"\n."*/, 0);
140
147
  }
148
+
149
+ decoder_set_nextMask<isRaw>(src, len, _nextMask); // set this before the loop because we can't check src after it's been overwritten
150
+
141
151
  intptr_t i;
142
152
  for(i = -len; i; i += sizeof(__m128i)*2) {
143
153
  __m128i oDataA = _mm_load_si128((__m128i *)(src+i));
@@ -376,6 +386,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
376
386
  // terminator found
377
387
  // there's probably faster ways to do this, but reverting to scalar code should be good enough
378
388
  len += (long)i;
389
+ _nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
379
390
  break;
380
391
  }
381
392
  }
@@ -485,6 +496,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
485
496
 
486
497
  if(endFound) {
487
498
  len += (long)i;
499
+ _nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
488
500
  break;
489
501
  }
490
502
  }
@@ -649,9 +661,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
649
661
  if(use_isa >= ISA_LEVEL_SSSE3) {
650
662
  # if defined(__AVX512VBMI2__) && defined(__AVX512VL__) && defined(__POPCNT__)
651
663
  if(use_isa >= ISA_LEVEL_VBMI2) {
652
- _mm_mask_compressstoreu_epi8(p, KNOT16(mask), dataA);
664
+ COMPRESS_STORE(p, KNOT16(mask), dataA);
653
665
  p -= popcnt32(mask & 0xffff);
654
- _mm_mask_compressstoreu_epi8(p+XMM_SIZE, KNOT16(mask>>16), dataB);
666
+ COMPRESS_STORE(p+XMM_SIZE, KNOT16(mask>>16), dataB);
655
667
  p -= popcnt32(mask>>16);
656
668
  p += XMM_SIZE*2;
657
669
  } else
@@ -703,16 +715,5 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
703
715
  }
704
716
  }
705
717
  _escFirst = (unsigned char)escFirst;
706
- if(isRaw) {
707
- if(len != 0) { // have to gone through at least one loop cycle
708
- if(src[i-2] == '\r' && src[i-1] == '\n' && src[i] == '.')
709
- _nextMask = 1;
710
- else if(src[i-1] == '\r' && src[i] == '\n' && src[i+1] == '.')
711
- _nextMask = 2;
712
- else
713
- _nextMask = 0;
714
- }
715
- } else
716
- _nextMask = 0;
717
718
  }
718
719
  #endif
@@ -9,6 +9,7 @@ void decoder_set_ssse3_funcs() {
9
9
  _do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_SSSE3> >;
10
10
  _do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_SSSE3> >;
11
11
  _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_SSSE3> >;
12
+ _decode_isa = ISA_LEVEL_SSSE3;
12
13
  }
13
14
  #else
14
15
  void decoder_set_sse2_funcs();
@@ -1,5 +1,12 @@
1
1
  #include "common.h"
2
2
 
3
+ extern const bool decoder_has_avx10;
4
+ #if !defined(__EVEX512__) && (defined(__AVX10_1__) || defined(__EVEX256__)) && defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
5
+ const bool decoder_has_avx10 = true;
6
+ #else
7
+ const bool decoder_has_avx10 = false;
8
+ #endif
9
+
3
10
  #if defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
4
11
  # include "decoder_common.h"
5
12
  # ifndef YENC_DISABLE_AVX256
@@ -11,6 +18,7 @@ void decoder_set_vbmi2_funcs() {
11
18
  _do_decode = &do_decode_simd<false, false, sizeof(__m256i)*2, do_decode_avx2<false, false, ISA_LEVEL_VBMI2> >;
12
19
  _do_decode_raw = &do_decode_simd<true, false, sizeof(__m256i)*2, do_decode_avx2<true, false, ISA_LEVEL_VBMI2> >;
13
20
  _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m256i)*2, do_decode_avx2<true, true, ISA_LEVEL_VBMI2> >;
21
+ _decode_isa = ISA_LEVEL_VBMI2;
14
22
  }
15
23
  # else
16
24
  # include "decoder_sse_base.h"
@@ -20,6 +28,7 @@ void decoder_set_vbmi2_funcs() {
20
28
  _do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_VBMI2> >;
21
29
  _do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_VBMI2> >;
22
30
  _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_VBMI2> >;
31
+ _decode_isa = ISA_LEVEL_VBMI2;
23
32
  }
24
33
  # endif
25
34
  #else
package/src/encoder.cc CHANGED
@@ -122,6 +122,7 @@ size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HED
122
122
 
123
123
  extern "C" {
124
124
  size_t (*_do_encode)(int, int*, const unsigned char* HEDLEY_RESTRICT, unsigned char* HEDLEY_RESTRICT, size_t, int) = &do_encode_generic;
125
+ int _encode_isa = ISA_GENERIC;
125
126
  }
126
127
 
127
128
  void encoder_sse2_init();
@@ -129,7 +130,9 @@ void encoder_ssse3_init();
129
130
  void encoder_avx_init();
130
131
  void encoder_avx2_init();
131
132
  void encoder_vbmi2_init();
133
+ extern const bool encoder_has_avx10;
132
134
  void encoder_neon_init();
135
+ void encoder_rvv_init();
133
136
 
134
137
  #if defined(PLATFORM_X86) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
135
138
  # if defined(__AVX2__) && !defined(YENC_DISABLE_AVX256)
@@ -137,12 +140,14 @@ void encoder_neon_init();
137
140
  static inline void encoder_native_init() {
138
141
  _do_encode = &do_encode_simd< do_encode_avx2<ISA_NATIVE> >;
139
142
  encoder_avx2_lut<ISA_NATIVE>();
143
+ _encode_isa = ISA_NATIVE;
140
144
  }
141
145
  # else
142
146
  # include "encoder_sse_base.h"
143
147
  static inline void encoder_native_init() {
144
148
  _do_encode = &do_encode_simd< do_encode_sse<ISA_NATIVE> >;
145
149
  encoder_sse_lut<ISA_NATIVE>();
150
+ _encode_isa = ISA_NATIVE;
146
151
  }
147
152
  # endif
148
153
  #endif
@@ -154,7 +159,7 @@ void encoder_init() {
154
159
  encoder_native_init();
155
160
  # else
156
161
  int use_isa = cpu_supports_isa();
157
- if(use_isa >= ISA_LEVEL_VBMI2)
162
+ if(use_isa >= ISA_LEVEL_VBMI2 && (encoder_has_avx10 || (use_isa & ISA_FEATURE_EVEX512)))
158
163
  encoder_vbmi2_init();
159
164
  else if(use_isa >= ISA_LEVEL_AVX2)
160
165
  encoder_avx2_init();
@@ -170,4 +175,8 @@ void encoder_init() {
170
175
  if(cpu_supports_neon())
171
176
  encoder_neon_init();
172
177
  #endif
178
+ #ifdef __riscv
179
+ if(cpu_supports_rvv())
180
+ encoder_rvv_init();
181
+ #endif
173
182
  }
package/src/encoder.h CHANGED
@@ -10,8 +10,12 @@ extern "C" {
10
10
  #include "hedley.h"
11
11
 
12
12
  extern size_t (*_do_encode)(int, int*, const unsigned char* HEDLEY_RESTRICT, unsigned char* HEDLEY_RESTRICT, size_t, int);
13
+ extern int _encode_isa;
13
14
  #define do_encode (*_do_encode)
14
15
  void encoder_init();
16
+ static inline int encode_isa_level() {
17
+ return _encode_isa;
18
+ }
15
19
 
16
20
 
17
21
 
@@ -6,6 +6,7 @@
6
6
  void encoder_avx_init() {
7
7
  _do_encode = &do_encode_simd< do_encode_sse<ISA_LEVEL_SSE4_POPCNT> >;
8
8
  encoder_sse_lut<ISA_LEVEL_SSE4_POPCNT>();
9
+ _encode_isa = ISA_LEVEL_AVX;
9
10
  }
10
11
  #else
11
12
  void encoder_ssse3_init();
@@ -6,6 +6,7 @@
6
6
  void encoder_avx2_init() {
7
7
  _do_encode = &do_encode_simd< do_encode_avx2<ISA_LEVEL_AVX2> >;
8
8
  encoder_avx2_lut<ISA_LEVEL_AVX2>();
9
+ _encode_isa = ISA_LEVEL_AVX2;
9
10
  }
10
11
  #else
11
12
  void encoder_avx_init();