yencode 1.1.5 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. package/README.md +130 -189
  2. package/binding.gyp +115 -6
  3. package/index.js +2 -0
  4. package/package.json +1 -1
  5. package/src/common.h +37 -7
  6. package/src/crc.cc +121 -47
  7. package/src/crc.h +74 -10
  8. package/src/crc_arm.cc +51 -34
  9. package/src/crc_arm_pmull.cc +215 -0
  10. package/src/crc_common.h +22 -0
  11. package/src/crc_folding.cc +154 -16
  12. package/src/crc_folding_256.cc +7 -14
  13. package/src/crc_riscv.cc +251 -0
  14. package/src/decoder.cc +373 -13
  15. package/src/decoder.h +10 -14
  16. package/src/decoder_avx.cc +5 -6
  17. package/src/decoder_avx2.cc +8 -9
  18. package/src/decoder_avx2_base.h +7 -11
  19. package/src/decoder_common.h +56 -373
  20. package/src/decoder_neon.cc +13 -19
  21. package/src/decoder_neon64.cc +12 -15
  22. package/src/decoder_rvv.cc +280 -0
  23. package/src/decoder_sse2.cc +26 -5
  24. package/src/decoder_sse_base.h +20 -40
  25. package/src/decoder_ssse3.cc +5 -6
  26. package/src/decoder_vbmi2.cc +6 -13
  27. package/src/encoder.cc +42 -26
  28. package/src/encoder.h +5 -7
  29. package/src/encoder_avx.cc +3 -3
  30. package/src/encoder_avx2.cc +3 -3
  31. package/src/encoder_avx_base.h +3 -0
  32. package/src/encoder_common.h +26 -32
  33. package/src/encoder_neon.cc +6 -3
  34. package/src/encoder_rvv.cc +13 -26
  35. package/src/encoder_sse2.cc +3 -2
  36. package/src/encoder_sse_base.h +2 -0
  37. package/src/encoder_ssse3.cc +3 -3
  38. package/src/encoder_vbmi2.cc +6 -7
  39. package/src/platform.cc +24 -23
  40. package/src/yencode.cc +54 -11
  41. package/test/_speedbase.js +4 -2
  42. package/test/speeddec.js +25 -16
  43. package/test/speedenc.js +21 -17
  44. package/test/testcrc.js +17 -1
  45. package/test/testcrcfuncs.c +53 -0
  46. package/test/testdec.js +1 -0
package/src/encoder.cc CHANGED
@@ -2,7 +2,32 @@
2
2
  #include "encoder_common.h"
3
3
  #include "encoder.h"
4
4
 
5
- size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, int doEnd) {
5
+
6
+ // lookup tables for scalar processing
7
+ #define _B1(n) _B(n), _B(n+1), _B(n+2), _B(n+3)
8
+ #define _B2(n) _B1(n), _B1(n+4), _B1(n+8), _B1(n+12)
9
+ #define _B3(n) _B2(n), _B2(n+16), _B2(n+32), _B2(n+48)
10
+ #define _BX _B3(0), _B3(64), _B3(128), _B3(192)
11
+
12
+ const unsigned char RapidYenc::escapeLUT[256] = { // whether or not the character is critical
13
+ #define _B(n) ((n == 214 || n == '\r'+214 || n == '\n'+214 || n == '='-42) ? 0 : (n+42) & 0xff)
14
+ _BX
15
+ #undef _B
16
+ };
17
+ const uint16_t RapidYenc::escapedLUT[256] = { // escaped sequences for characters that need escaping
18
+ #define _B(n) ((n == 214 || n == 214+'\r' || n == 214+'\n' || n == '='-42 || n == 214+'\t' || n == 214+' ' || n == '.'-42) ? UINT16_PACK('=', ((n+42+64)&0xff)) : 0)
19
+ _BX
20
+ #undef _B
21
+ };
22
+
23
+ #undef _B1
24
+ #undef _B2
25
+ #undef _B3
26
+ #undef _BX
27
+
28
+
29
+
30
+ size_t RapidYenc::do_encode_generic(int line_size, int* colOffset, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, int doEnd) {
6
31
  unsigned char* es = (unsigned char*)src + len;
7
32
  unsigned char *p = dest; // destination pointer
8
33
  long i = -(long)len; // input position
@@ -11,8 +36,8 @@ size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HED
11
36
 
12
37
  if (col == 0) {
13
38
  c = es[i++];
14
- if (escapedLUT[c]) {
15
- memcpy(p, &escapedLUT[c], sizeof(uint16_t));
39
+ if (RapidYenc::escapedLUT[c]) {
40
+ memcpy(p, &RapidYenc::escapedLUT[c], sizeof(uint16_t));
16
41
  p += 2;
17
42
  col = 2;
18
43
  } else {
@@ -27,11 +52,11 @@ size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HED
27
52
  // 8 cycle unrolled version
28
53
  sp = p;
29
54
  #define DO_THING(n) \
30
- c = es[i+n], escaped = escapeLUT[c]; \
55
+ c = es[i+n], escaped = RapidYenc::escapeLUT[c]; \
31
56
  if (escaped) \
32
57
  *(p++) = escaped; \
33
58
  else { \
34
- memcpy(p, &escapedLUT[c], sizeof(uint16_t)); \
59
+ memcpy(p, &RapidYenc::escapedLUT[c], sizeof(uint16_t)); \
35
60
  p += 2; \
36
61
  }
37
62
  DO_THING(0);
@@ -55,13 +80,13 @@ size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HED
55
80
  }
56
81
  // handle remaining chars
57
82
  while(col < line_size-1) {
58
- c = es[i++], escaped = escapeLUT[c];
83
+ c = es[i++], escaped = RapidYenc::escapeLUT[c];
59
84
  if (escaped) {
60
85
  *(p++) = escaped;
61
86
  col++;
62
87
  }
63
88
  else {
64
- memcpy(p, &escapedLUT[c], sizeof(uint16_t));
89
+ memcpy(p, &RapidYenc::escapedLUT[c], sizeof(uint16_t));
65
90
  p += 2;
66
91
  col += 2;
67
92
  }
@@ -79,8 +104,8 @@ size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HED
79
104
  // last line char
80
105
  if(col < line_size) { // this can only be false if the last character was an escape sequence (or line_size is horribly small), in which case, we don't need to handle space/tab cases
81
106
  c = es[i++];
82
- if (escapedLUT[c] && c != '.'-42) {
83
- memcpy(p, &escapedLUT[c], sizeof(uint16_t));
107
+ if (RapidYenc::escapedLUT[c] && c != '.'-42) {
108
+ memcpy(p, &RapidYenc::escapedLUT[c], sizeof(uint16_t));
84
109
  p += 2;
85
110
  } else {
86
111
  *(p++) = c + 42;
@@ -90,8 +115,8 @@ size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HED
90
115
  if (i >= 0) break;
91
116
 
92
117
  c = es[i++];
93
- if (escapedLUT[c]) {
94
- uint32_t w = UINT32_16_PACK(UINT16_PACK('\r', '\n'), (uint32_t)escapedLUT[c]);
118
+ if (RapidYenc::escapedLUT[c]) {
119
+ uint32_t w = UINT32_16_PACK(UINT16_PACK('\r', '\n'), (uint32_t)RapidYenc::escapedLUT[c]);
95
120
  memcpy(p, &w, sizeof(w));
96
121
  p += 4;
97
122
  col = 2;
@@ -120,40 +145,31 @@ size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HED
120
145
  }
121
146
 
122
147
 
123
- extern "C" {
148
+ namespace RapidYenc {
124
149
  size_t (*_do_encode)(int, int*, const unsigned char* HEDLEY_RESTRICT, unsigned char* HEDLEY_RESTRICT, size_t, int) = &do_encode_generic;
125
150
  int _encode_isa = ISA_GENERIC;
126
151
  }
127
152
 
128
- void encoder_sse2_init();
129
- void encoder_ssse3_init();
130
- void encoder_avx_init();
131
- void encoder_avx2_init();
132
- void encoder_vbmi2_init();
133
- extern const bool encoder_has_avx10;
134
- void encoder_neon_init();
135
- void encoder_rvv_init();
136
-
137
153
  #if defined(PLATFORM_X86) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
138
154
  # if defined(__AVX2__) && !defined(YENC_DISABLE_AVX256)
139
155
  # include "encoder_avx_base.h"
140
156
  static inline void encoder_native_init() {
141
- _do_encode = &do_encode_simd< do_encode_avx2<ISA_NATIVE> >;
157
+ RapidYenc::_do_encode = &do_encode_simd< RapidYenc::do_encode_avx2<ISA_NATIVE> >;
142
158
  encoder_avx2_lut<ISA_NATIVE>();
143
- _encode_isa = ISA_NATIVE;
159
+ RapidYenc::_encode_isa = ISA_NATIVE;
144
160
  }
145
161
  # else
146
162
  # include "encoder_sse_base.h"
147
163
  static inline void encoder_native_init() {
148
- _do_encode = &do_encode_simd< do_encode_sse<ISA_NATIVE> >;
164
+ RapidYenc::_do_encode = &do_encode_simd< RapidYenc::do_encode_sse<ISA_NATIVE> >;
149
165
  encoder_sse_lut<ISA_NATIVE>();
150
- _encode_isa = ISA_NATIVE;
166
+ RapidYenc::_encode_isa = ISA_NATIVE;
151
167
  }
152
168
  # endif
153
169
  #endif
154
170
 
155
171
 
156
- void encoder_init() {
172
+ void RapidYenc::encoder_init() {
157
173
  #ifdef PLATFORM_X86
158
174
  # if defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
159
175
  encoder_native_init();
package/src/encoder.h CHANGED
@@ -1,17 +1,17 @@
1
1
  #ifndef __YENC_ENCODER_H
2
2
  #define __YENC_ENCODER_H
3
3
 
4
- #ifdef __cplusplus
5
- extern "C" {
6
- #endif
4
+ #include "hedley.h"
7
5
 
6
+ namespace RapidYenc {
8
7
 
9
8
 
10
- #include "hedley.h"
11
9
 
12
10
  extern size_t (*_do_encode)(int, int*, const unsigned char* HEDLEY_RESTRICT, unsigned char* HEDLEY_RESTRICT, size_t, int);
13
11
  extern int _encode_isa;
14
- #define do_encode (*_do_encode)
12
+ static inline size_t encode(int line_size, int* colOffset, const void* HEDLEY_RESTRICT src, void* HEDLEY_RESTRICT dest, size_t len, int doEnd) {
13
+ return (*_do_encode)(line_size, colOffset, (const unsigned char* HEDLEY_RESTRICT)src, (unsigned char*)dest, len, doEnd);
14
+ }
15
15
  void encoder_init();
16
16
  static inline int encode_isa_level() {
17
17
  return _encode_isa;
@@ -19,7 +19,5 @@ static inline int encode_isa_level() {
19
19
 
20
20
 
21
21
 
22
- #ifdef __cplusplus
23
22
  }
24
23
  #endif
25
- #endif
package/src/encoder_avx.cc CHANGED
@@ -1,16 +1,16 @@
1
1
  #include "common.h"
2
+ #include "encoder_common.h"
2
3
 
3
4
  #if defined(__AVX__) && defined(__POPCNT__)
4
5
  #include "encoder_sse_base.h"
5
6
 
6
- void encoder_avx_init() {
7
+ void RapidYenc::encoder_avx_init() {
7
8
  _do_encode = &do_encode_simd< do_encode_sse<ISA_LEVEL_SSE4_POPCNT> >;
8
9
  encoder_sse_lut<ISA_LEVEL_SSE4_POPCNT>();
9
10
  _encode_isa = ISA_LEVEL_AVX;
10
11
  }
11
12
  #else
12
- void encoder_ssse3_init();
13
- void encoder_avx_init() {
13
+ void RapidYenc::encoder_avx_init() {
14
14
  encoder_ssse3_init();
15
15
  }
16
16
  #endif
package/src/encoder_avx2.cc CHANGED
@@ -1,16 +1,16 @@
1
1
  #include "common.h"
2
+ #include "encoder_common.h"
2
3
 
3
4
  #if defined(__AVX2__) && !defined(YENC_DISABLE_AVX256)
4
5
  #include "encoder_avx_base.h"
5
6
 
6
- void encoder_avx2_init() {
7
+ void RapidYenc::encoder_avx2_init() {
7
8
  _do_encode = &do_encode_simd< do_encode_avx2<ISA_LEVEL_AVX2> >;
8
9
  encoder_avx2_lut<ISA_LEVEL_AVX2>();
9
10
  _encode_isa = ISA_LEVEL_AVX2;
10
11
  }
11
12
  #else
12
- void encoder_avx_init();
13
- void encoder_avx2_init() {
13
+ void RapidYenc::encoder_avx2_init() {
14
14
  encoder_avx_init();
15
15
  }
16
16
  #endif
package/src/encoder_avx_base.h CHANGED
@@ -76,6 +76,8 @@ static void encoder_avx2_lut() {
76
76
  }
77
77
  }
78
78
 
79
+ namespace RapidYenc {
80
+
79
81
  template<enum YEncDecIsaLevel use_isa>
80
82
  HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const uint8_t* HEDLEY_RESTRICT srcEnd, uint8_t* HEDLEY_RESTRICT& dest, size_t& len) {
81
83
  // offset position to enable simpler loop condition checking
@@ -568,5 +570,6 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
568
570
  dest = p;
569
571
  len = -(i - INPUT_OFFSET);
570
572
  }
573
+ } // namespace
571
574
 
572
575
  #endif
package/src/encoder_common.h CHANGED
@@ -1,37 +1,31 @@
1
1
  #ifndef __YENC_ENCODER_COMMON
2
2
  #define __YENC_ENCODER_COMMON
3
3
 
4
- // lookup tables for scalar processing
5
- #define _B1(n) _B(n), _B(n+1), _B(n+2), _B(n+3)
6
- #define _B2(n) _B1(n), _B1(n+4), _B1(n+8), _B1(n+12)
7
- #define _B3(n) _B2(n), _B2(n+16), _B2(n+32), _B2(n+48)
8
- #define _BX _B3(0), _B3(64), _B3(128), _B3(192)
9
-
10
- static const unsigned char escapeLUT[256] = { // whether or not the character is critical
11
- #define _B(n) ((n == 214 || n == '\r'+214 || n == '\n'+214 || n == '='-42) ? 0 : (n+42) & 0xff)
12
- _BX
13
- #undef _B
14
- };
15
- static const uint16_t escapedLUT[256] = { // escaped sequences for characters that need escaping
16
- #define _B(n) ((n == 214 || n == 214+'\r' || n == 214+'\n' || n == '='-42 || n == 214+'\t' || n == 214+' ' || n == '.'-42) ? UINT16_PACK('=', ((n+42+64)&0xff)) : 0)
17
- _BX
18
- #undef _B
19
- };
20
-
21
- #undef _B1
22
- #undef _B2
23
- #undef _B3
24
- #undef _BX
4
+ namespace RapidYenc {
5
+ void encoder_sse2_init();
6
+ void encoder_ssse3_init();
7
+ void encoder_avx_init();
8
+ void encoder_avx2_init();
9
+ void encoder_vbmi2_init();
10
+ extern const bool encoder_has_avx10;
11
+ void encoder_neon_init();
12
+ void encoder_rvv_init();
13
+
14
+ // lookup tables for scalar processing
15
+ extern const unsigned char escapeLUT[256];
16
+ extern const uint16_t escapedLUT[256];
17
+
18
+ size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, int doEnd);
19
+ }
25
20
 
26
21
 
27
- size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, int doEnd);
28
22
 
29
23
  template<void(&kernel)(int, int*, const uint8_t* HEDLEY_RESTRICT, uint8_t* HEDLEY_RESTRICT&, size_t&)>
30
- static size_t do_encode_simd(int line_size, int* colOffset, const uint8_t* HEDLEY_RESTRICT src, uint8_t* HEDLEY_RESTRICT dest, size_t len, int doEnd) {
24
+ static size_t do_encode_simd(int line_size, int* colOffset, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, int doEnd) {
31
25
  if(len < 1) return 0;
32
26
  if(line_size < 12) { // short lines probably not worth processing in a SIMD way
33
27
  // we assume at least the first and last char exist in the line, and since the first char could be escaped, and SIMD encoder assumes at least one non-first/last char, assumption means that line size has to be >= 4
34
- return do_encode_generic(line_size, colOffset, src, dest, len, doEnd);
28
+ return RapidYenc::do_encode_generic(line_size, colOffset, src, dest, len, doEnd);
35
29
  }
36
30
 
37
31
  const uint8_t* es = src + len;
@@ -45,8 +39,8 @@ static size_t do_encode_simd(int line_size, int* colOffset, const uint8_t* HEDLE
45
39
  long i = -(long)len;
46
40
  if(*colOffset == 0 && i < 0) {
47
41
  uint8_t c = es[i++];
48
- if (LIKELIHOOD(0.0273, escapedLUT[c] != 0)) {
49
- memcpy(p, escapedLUT + c, 2);
42
+ if (LIKELIHOOD(0.0273, RapidYenc::escapedLUT[c] != 0)) {
43
+ memcpy(p, RapidYenc::escapedLUT + c, 2);
50
44
  p += 2;
51
45
  *colOffset = 2;
52
46
  } else {
@@ -57,19 +51,19 @@ static size_t do_encode_simd(int line_size, int* colOffset, const uint8_t* HEDLE
57
51
  while(i < 0) {
58
52
  uint8_t c = es[i++];
59
53
  if(*colOffset < line_size-1) {
60
- if(!escapeLUT[c]) {
54
+ if(!RapidYenc::escapeLUT[c]) {
61
55
  p[0] = '=';
62
56
  p[1] = c+42+64;
63
57
  p += 2;
64
58
  (*colOffset) += 2;
65
59
  } else {
66
- *(p++) = escapeLUT[c];
60
+ *(p++) = RapidYenc::escapeLUT[c];
67
61
  (*colOffset) += 1;
68
62
  }
69
63
  } else {
70
64
  if(*colOffset < line_size) {
71
- if (escapedLUT[c] && c != '.'-42) {
72
- memcpy(p, escapedLUT + c, 2);
65
+ if (RapidYenc::escapedLUT[c] && c != '.'-42) {
66
+ memcpy(p, RapidYenc::escapedLUT + c, 2);
73
67
  p += 2;
74
68
  } else {
75
69
  *(p++) = c + 42;
@@ -79,8 +73,8 @@ static size_t do_encode_simd(int line_size, int* colOffset, const uint8_t* HEDLE
79
73
  }
80
74
 
81
75
  // handle EOL
82
- if (escapedLUT[c]) {
83
- uint32_t w = UINT32_16_PACK(UINT16_PACK('\r', '\n'), (uint32_t)escapedLUT[c]);
76
+ if (RapidYenc::escapedLUT[c]) {
77
+ uint32_t w = UINT32_16_PACK(UINT16_PACK('\r', '\n'), (uint32_t)RapidYenc::escapedLUT[c]);
84
78
  memcpy(p, &w, sizeof(w));
85
79
  p += 4;
86
80
  *colOffset = 2;
package/src/encoder_neon.cc CHANGED
@@ -1,8 +1,8 @@
1
1
  #include "common.h"
2
+ #include "encoder_common.h"
2
3
 
3
4
  #ifdef __ARM_NEON
4
5
  #include "encoder.h"
5
- #include "encoder_common.h"
6
6
 
7
7
  // Clang wrongly assumes alignment on vst1q_u8_x2, and ARMv7 GCC doesn't support the function, so effectively, it can only be used in ARMv8 compilers
8
8
  #if defined(__aarch64__) && (defined(__clang__) || HEDLEY_GCC_VERSION_CHECK(8,5,0))
@@ -259,6 +259,8 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
259
259
  }
260
260
 
261
261
 
262
+ namespace RapidYenc {
263
+
262
264
  HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const uint8_t* HEDLEY_RESTRICT srcEnd, uint8_t* HEDLEY_RESTRICT& dest, size_t& len) {
263
265
  // offset position to enable simpler loop condition checking
264
266
  const int INPUT_OFFSET = sizeof(uint8x16_t)*4 -1; // extra chars for EOL handling, -1 to change <= to <
@@ -517,8 +519,9 @@ HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const ui
517
519
  dest = p;
518
520
  len = -(i - INPUT_OFFSET);
519
521
  }
522
+ } // namespace
520
523
 
521
- void encoder_neon_init() {
524
+ void RapidYenc::encoder_neon_init() {
522
525
  _do_encode = &do_encode_simd<do_encode_neon>;
523
526
  _encode_isa = ISA_LEVEL_NEON;
524
527
  // generate shuf LUT
@@ -543,5 +546,5 @@ void encoder_neon_init() {
543
546
  }
544
547
  }
545
548
  #else
546
- void encoder_neon_init() {}
549
+ void RapidYenc::encoder_neon_init() {}
547
550
  #endif /* defined(__ARM_NEON) */
package/src/encoder_rvv.cc CHANGED
@@ -1,30 +1,23 @@
1
1
  #include "common.h"
2
+ #include "encoder_common.h"
2
3
 
3
4
  #ifdef __riscv_vector
4
5
  #include "encoder.h"
5
- #include "encoder_common.h"
6
-
7
- # include <riscv_vector.h>
8
- # if defined(__clang__) && __clang_major__ < 16
9
- # define RV(f) f
10
- # else
11
- # define RV(f) __riscv_##f
12
- # endif
13
6
 
14
7
 
15
8
  static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RESTRICT _src, long& inpos, uint8_t*& outp, long& col, long lineSizeOffset) {
16
9
  // TODO: vectorize
17
10
  uint8_t c = _src[inpos++];
18
- if(HEDLEY_UNLIKELY(escapedLUT[c] && c != '.'-42)) {
19
- memcpy(outp, &escapedLUT[c], sizeof(uint16_t));
11
+ if(HEDLEY_UNLIKELY(RapidYenc::escapedLUT[c] && c != '.'-42)) {
12
+ memcpy(outp, &RapidYenc::escapedLUT[c], sizeof(uint16_t));
20
13
  outp += 2;
21
14
  } else {
22
15
  *(outp++) = c + 42;
23
16
  }
24
17
 
25
18
  c = _src[inpos++];
26
- if(LIKELIHOOD(0.0273, escapedLUT[c]!=0)) {
27
- uint32_t w = UINT32_16_PACK(UINT16_PACK('\r', '\n'), (uint32_t)escapedLUT[c]);
19
+ if(LIKELIHOOD(0.0273, RapidYenc::escapedLUT[c]!=0)) {
20
+ uint32_t w = UINT32_16_PACK(UINT16_PACK('\r', '\n'), (uint32_t)RapidYenc::escapedLUT[c]);
28
21
  memcpy(outp, &w, sizeof(w));
29
22
  outp += 4;
30
23
  col = lineSizeOffset + 2;
@@ -36,6 +29,7 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
36
29
  }
37
30
  }
38
31
 
32
+ namespace RapidYenc {
39
33
 
40
34
  HEDLEY_ALWAYS_INLINE void do_encode_rvv(int line_size, int* colOffset, const uint8_t* HEDLEY_RESTRICT srcEnd, uint8_t* HEDLEY_RESTRICT& dest, size_t& len) {
41
35
  size_t vl2 = RV(vsetvlmax_e8m2)(); // TODO: limit to line length
@@ -112,7 +106,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_rvv(int line_size, int* colOffset, const uin
112
106
  vl2
113
107
  );
114
108
 
115
- #if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 13000
109
+ #ifdef __riscv_v_intrinsic
116
110
  data = RV(vor_vx_u8m2_mu)(cmp, tmpData, tmpData, 64, vl2);
117
111
  #else
118
112
  data = RV(vor_vx_u8m2_m)(cmp, tmpData, tmpData, 64, vl2);
@@ -122,11 +116,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_rvv(int line_size, int* colOffset, const uin
122
116
  size_t count = RV(vcpop_m_b4)(cmp, vl2);
123
117
  if(count > 1) {
124
118
  // widen mask: 4b->8b
125
- #if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 13000
126
- vuint8mf4_t vcmp = RV(vlmul_trunc_v_u8m1_u8mf4)(RV(vreinterpret_v_b4_u8m1)(cmp));
127
- #else
128
- vuint8mf4_t vcmp = *(vuint8mf4_t*)(&cmp);
129
- #endif
119
+ vuint8mf4_t vcmp = RV_VEC_U8MF4_CAST(cmp);
130
120
  // TODO: use vwsll instead if available
131
121
  // - is clmul useful here?
132
122
  vuint8mf2_t xcmp = RV(vreinterpret_v_u16mf2_u8mf2)(RV(vwmulu_vx_u16mf2)(vcmp, 16, vl2));
@@ -134,11 +124,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_rvv(int line_size, int* colOffset, const uin
134
124
 
135
125
  // expand mask by inserting '1' between each bit (0000abcd -> 1a1b1c1d)
136
126
  vuint8m1_t xcmpTmp = RV(vrgather_vv_u8m1)(MASK_EXPAND, RV(vlmul_ext_v_u8mf2_u8m1)(xcmp), vl2);
137
- #if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 13000
138
- vbool2_t cmpmask = RV(vreinterpret_b2)(xcmpTmp);
139
- #else
140
- vbool2_t cmpmask = *(vbool2_t*)(&xcmpTmp);
141
- #endif
127
+ vbool2_t cmpmask = RV_MASK_CAST(2, 8, xcmpTmp);
142
128
 
143
129
  // expand data and insert =
144
130
  // TODO: use vwsll instead if available
@@ -149,7 +135,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_rvv(int line_size, int* colOffset, const uin
149
135
  // prune unneeded =
150
136
  vuint8m4_t dataTmp = RV(vreinterpret_v_u16m4_u8m4)(data2);
151
137
  vuint8m4_t final_data = RV(vcompress_vm_u8m4)(
152
- #if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 13000
138
+ #ifdef __riscv_v_intrinsic
153
139
  dataTmp, cmpmask, vl2*2
154
140
  #else
155
141
  cmpmask, dataTmp, dataTmp, vl2*2
@@ -210,11 +196,12 @@ HEDLEY_ALWAYS_INLINE void do_encode_rvv(int line_size, int* colOffset, const uin
210
196
  dest = outp;
211
197
  len = -(inpos - INPUT_OFFSET);
212
198
  }
199
+ } // namespace
213
200
 
214
- void encoder_rvv_init() {
201
+ void RapidYenc::encoder_rvv_init() {
215
202
  _do_encode = &do_encode_simd<do_encode_rvv>;
216
203
  _encode_isa = ISA_LEVEL_RVV;
217
204
  }
218
205
  #else
219
- void encoder_rvv_init() {}
206
+ void RapidYenc::encoder_rvv_init() {}
220
207
  #endif /* defined(__riscv_vector) */
package/src/encoder_sse2.cc CHANGED
@@ -1,14 +1,15 @@
1
1
  #include "common.h"
2
+ #include "encoder_common.h"
2
3
 
3
4
  #ifdef __SSE2__
4
5
  #include "encoder_sse_base.h"
5
6
 
6
- void encoder_sse2_init() {
7
+ void RapidYenc::encoder_sse2_init() {
7
8
  _do_encode = &do_encode_simd< do_encode_sse<ISA_LEVEL_SSE2> >;
8
9
  encoder_sse_lut<ISA_LEVEL_SSE2>();
9
10
  _encode_isa = ISA_LEVEL_SSE2;
10
11
  }
11
12
  #else
12
- void encoder_sse2_init() {}
13
+ void RapidYenc::encoder_sse2_init() {}
13
14
  #endif
14
15
 
package/src/encoder_sse_base.h CHANGED
@@ -147,6 +147,7 @@ static HEDLEY_ALWAYS_INLINE uintptr_t sse2_expand_store_vector(__m128i data, uns
147
147
  }
148
148
  }
149
149
 
150
+ namespace RapidYenc {
150
151
 
151
152
  template<enum YEncDecIsaLevel use_isa>
152
153
  HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uint8_t* HEDLEY_RESTRICT srcEnd, uint8_t* HEDLEY_RESTRICT& dest, size_t& len) {
@@ -720,4 +721,5 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
720
721
  dest = p;
721
722
  len = -(i - INPUT_OFFSET);
722
723
  }
724
+ } // namespace
723
725
 
package/src/encoder_ssse3.cc CHANGED
@@ -1,18 +1,18 @@
1
1
  #include "common.h"
2
+ #include "encoder_common.h"
2
3
 
3
4
  // slightly faster version which improves the worst case scenario significantly; since worst case doesn't happen often, overall speedup is relatively minor
4
5
  // requires PSHUFB (SSSE3) instruction, but will use POPCNT (SSE4.2 (or AMD's ABM, but Phenom doesn't support SSSE3 so doesn't matter)) if available (these only seem to give minor speedups, so considered optional)
5
6
  #ifdef __SSSE3__
6
7
  #include "encoder_sse_base.h"
7
8
 
8
- void encoder_ssse3_init() {
9
+ void RapidYenc::encoder_ssse3_init() {
9
10
  _do_encode = &do_encode_simd< do_encode_sse<ISA_LEVEL_SSSE3> >;
10
11
  encoder_sse_lut<ISA_LEVEL_SSSE3>();
11
12
  _encode_isa = ISA_LEVEL_SSSE3;
12
13
  }
13
14
  #else
14
- void encoder_sse2_init();
15
- void encoder_ssse3_init() {
15
+ void RapidYenc::encoder_ssse3_init() {
16
16
  encoder_sse2_init();
17
17
  }
18
18
  #endif
package/src/encoder_vbmi2.cc CHANGED
@@ -1,32 +1,31 @@
1
1
  #include "common.h"
2
+ #include "encoder_common.h"
2
3
 
3
- extern const bool encoder_has_avx10;
4
4
  #if !defined(__EVEX512__) && (defined(__AVX10_1__) || defined(__EVEX256__)) && defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
5
- const bool encoder_has_avx10 = true;
5
+ const bool RapidYenc::encoder_has_avx10 = true;
6
6
  #else
7
- const bool encoder_has_avx10 = false;
7
+ const bool RapidYenc::encoder_has_avx10 = false;
8
8
  #endif
9
9
 
10
10
  #if defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
11
11
  # ifndef YENC_DISABLE_AVX256
12
12
  # include "encoder_avx_base.h"
13
13
 
14
- void encoder_vbmi2_init() {
14
+ void RapidYenc::encoder_vbmi2_init() {
15
15
  _do_encode = &do_encode_simd< do_encode_avx2<ISA_LEVEL_VBMI2> >;
16
16
  encoder_avx2_lut<ISA_LEVEL_VBMI2>();
17
17
  _encode_isa = ISA_LEVEL_VBMI2;
18
18
  }
19
19
  # else
20
20
  # include "encoder_sse_base.h"
21
- void encoder_vbmi2_init() {
21
+ void RapidYenc::encoder_vbmi2_init() {
22
22
  _do_encode = &do_encode_simd< do_encode_sse<ISA_LEVEL_VBMI2> >;
23
23
  encoder_sse_lut<ISA_LEVEL_VBMI2>();
24
24
  _encode_isa = ISA_LEVEL_VBMI2;
25
25
  }
26
26
  # endif
27
27
  #else
28
- void encoder_avx2_init();
29
- void encoder_vbmi2_init() {
28
+ void RapidYenc::encoder_vbmi2_init() {
30
29
  encoder_avx2_init();
31
30
  }
32
31
  #endif
package/src/platform.cc CHANGED
@@ -17,7 +17,7 @@
17
17
  # endif
18
18
  # endif
19
19
  # endif
20
- bool cpu_supports_neon() {
20
+ bool RapidYenc::cpu_supports_neon() {
21
21
  # if defined(AT_HWCAP)
22
22
  # ifdef __FreeBSD__
23
23
  unsigned long supported;
@@ -95,7 +95,7 @@ static inline int _GET_XCR() {
95
95
  // }
96
96
 
97
97
 
98
- int cpu_supports_isa() {
98
+ int RapidYenc::cpu_supports_isa() {
99
99
  int flags[4];
100
100
  _cpuid1(flags);
101
101
  int ret = 0;
@@ -132,29 +132,30 @@ int cpu_supports_isa() {
132
132
  int cpuInfo[4];
133
133
  _cpuidX(cpuInfo, 7, 0);
134
134
  if((cpuInfo[1] & 0x128) == 0x128 && (ret & ISA_FEATURE_LZCNT)) { // BMI2 + AVX2 + BMI1
135
-
136
- // check AVX10
137
- int cpuInfo2[4];
138
- _cpuidX(cpuInfo2, 7, 1);
139
- if(cpuInfo2[3] & 0x80000) {
140
- _cpuidX(cpuInfo2, 0x24, 0);
141
- if((cpuInfo2[2] & 0xff) >= 1 && ( // minimum AVX10.1
135
+ if((xcr & 0xE0) == 0xE0) { // AVX512 XSTATE (also applies to AVX10)
136
+ // check AVX10
137
+ int cpuInfo2[4];
138
+ _cpuidX(cpuInfo2, 7, 1);
139
+ if(cpuInfo2[3] & 0x80000) {
140
+ _cpuidX(cpuInfo2, 0x24, 0);
141
+ if((cpuInfo2[1] & 0xff) >= 1 && ( // minimum AVX10.1
142
142
  #ifdef YENC_DISABLE_AVX256
143
- cpuInfo2[2] & 0x10000 // AVX10/128
143
+ cpuInfo2[1] & 0x10000 // AVX10/128
144
144
  #else
145
- cpuInfo2[2] & 0x20000 // AVX10/256
145
+ cpuInfo2[1] & 0x20000 // AVX10/256
146
146
  #endif
147
- )) {
148
- if(((xcr & 0xE0) == 0xE0) && (cpuInfo2[2] & 0x40000)) ret |= ISA_FEATURE_EVEX512;
149
- return ret | ISA_LEVEL_VBMI2;
147
+ )) {
148
+ if(cpuInfo2[1] & 0x40000) ret |= ISA_FEATURE_EVEX512;
149
+ return ret | ISA_LEVEL_VBMI2;
150
+ }
151
+ }
152
+
153
+ if((cpuInfo[1] & 0xC0010000) == 0xC0010000) { // AVX512BW + AVX512VL + AVX512F
154
+ ret |= ISA_FEATURE_EVEX512;
155
+ if(cpuInfo[2] & 0x40)
156
+ return ret | ISA_LEVEL_VBMI2;
157
+ return ret | ISA_LEVEL_AVX3;
150
158
  }
151
- }
152
-
153
- if(((xcr & 0xE0) == 0xE0) && (cpuInfo[1] & 0xC0010000) == 0xC0010000) { // AVX512BW + AVX512VL + AVX512F
154
- ret |= ISA_FEATURE_EVEX512;
155
- if(cpuInfo[2] & 0x40)
156
- return ret | ISA_LEVEL_VBMI2;
157
- return ret | ISA_LEVEL_AVX3;
158
159
  }
159
160
  // AVX2 is beneficial even on Zen1
160
161
  return ret | ISA_LEVEL_AVX2;
@@ -169,7 +170,7 @@ int cpu_supports_isa() {
169
170
  return ret | ISA_LEVEL_SSE2;
170
171
  }
171
172
 
172
- int cpu_supports_crc_isa() {
173
+ int RapidYenc::cpu_supports_crc_isa() {
173
174
  int flags[4];
174
175
  _cpuid1(flags);
175
176
 
@@ -200,7 +201,7 @@ int cpu_supports_crc_isa() {
200
201
  # endif
201
202
  # endif
202
203
  # endif
203
- bool cpu_supports_rvv() {
204
+ bool RapidYenc::cpu_supports_rvv() {
204
205
  # if defined(AT_HWCAP)
205
206
  unsigned long ret;
206
207
  # ifdef __FreeBSD__