yencode 1.1.5 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +130 -189
- package/binding.gyp +115 -6
- package/index.js +2 -0
- package/package.json +1 -1
- package/src/common.h +37 -7
- package/src/crc.cc +121 -47
- package/src/crc.h +74 -10
- package/src/crc_arm.cc +51 -34
- package/src/crc_arm_pmull.cc +215 -0
- package/src/crc_common.h +22 -0
- package/src/crc_folding.cc +154 -16
- package/src/crc_folding_256.cc +7 -14
- package/src/crc_riscv.cc +251 -0
- package/src/decoder.cc +373 -13
- package/src/decoder.h +10 -14
- package/src/decoder_avx.cc +5 -6
- package/src/decoder_avx2.cc +8 -9
- package/src/decoder_avx2_base.h +7 -11
- package/src/decoder_common.h +56 -373
- package/src/decoder_neon.cc +13 -19
- package/src/decoder_neon64.cc +12 -15
- package/src/decoder_rvv.cc +280 -0
- package/src/decoder_sse2.cc +26 -5
- package/src/decoder_sse_base.h +20 -40
- package/src/decoder_ssse3.cc +5 -6
- package/src/decoder_vbmi2.cc +6 -13
- package/src/encoder.cc +42 -26
- package/src/encoder.h +5 -7
- package/src/encoder_avx.cc +3 -3
- package/src/encoder_avx2.cc +3 -3
- package/src/encoder_avx_base.h +3 -0
- package/src/encoder_common.h +26 -32
- package/src/encoder_neon.cc +6 -3
- package/src/encoder_rvv.cc +13 -26
- package/src/encoder_sse2.cc +3 -2
- package/src/encoder_sse_base.h +2 -0
- package/src/encoder_ssse3.cc +3 -3
- package/src/encoder_vbmi2.cc +6 -7
- package/src/platform.cc +24 -23
- package/src/yencode.cc +54 -11
- package/test/_speedbase.js +4 -2
- package/test/speeddec.js +25 -16
- package/test/speedenc.js +21 -17
- package/test/testcrc.js +17 -1
- package/test/testcrcfuncs.c +53 -0
- package/test/testdec.js +1 -0
package/src/encoder.cc
CHANGED
|
@@ -2,7 +2,32 @@
|
|
|
2
2
|
#include "encoder_common.h"
|
|
3
3
|
#include "encoder.h"
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
|
|
6
|
+
// lookup tables for scalar processing
|
|
7
|
+
#define _B1(n) _B(n), _B(n+1), _B(n+2), _B(n+3)
|
|
8
|
+
#define _B2(n) _B1(n), _B1(n+4), _B1(n+8), _B1(n+12)
|
|
9
|
+
#define _B3(n) _B2(n), _B2(n+16), _B2(n+32), _B2(n+48)
|
|
10
|
+
#define _BX _B3(0), _B3(64), _B3(128), _B3(192)
|
|
11
|
+
|
|
12
|
+
const unsigned char RapidYenc::escapeLUT[256] = { // whether or not the character is critical
|
|
13
|
+
#define _B(n) ((n == 214 || n == '\r'+214 || n == '\n'+214 || n == '='-42) ? 0 : (n+42) & 0xff)
|
|
14
|
+
_BX
|
|
15
|
+
#undef _B
|
|
16
|
+
};
|
|
17
|
+
const uint16_t RapidYenc::escapedLUT[256] = { // escaped sequences for characters that need escaping
|
|
18
|
+
#define _B(n) ((n == 214 || n == 214+'\r' || n == 214+'\n' || n == '='-42 || n == 214+'\t' || n == 214+' ' || n == '.'-42) ? UINT16_PACK('=', ((n+42+64)&0xff)) : 0)
|
|
19
|
+
_BX
|
|
20
|
+
#undef _B
|
|
21
|
+
};
|
|
22
|
+
|
|
23
|
+
#undef _B1
|
|
24
|
+
#undef _B2
|
|
25
|
+
#undef _B3
|
|
26
|
+
#undef _BX
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
size_t RapidYenc::do_encode_generic(int line_size, int* colOffset, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, int doEnd) {
|
|
6
31
|
unsigned char* es = (unsigned char*)src + len;
|
|
7
32
|
unsigned char *p = dest; // destination pointer
|
|
8
33
|
long i = -(long)len; // input position
|
|
@@ -11,8 +36,8 @@ size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HED
|
|
|
11
36
|
|
|
12
37
|
if (col == 0) {
|
|
13
38
|
c = es[i++];
|
|
14
|
-
if (escapedLUT[c]) {
|
|
15
|
-
memcpy(p, &escapedLUT[c], sizeof(uint16_t));
|
|
39
|
+
if (RapidYenc::escapedLUT[c]) {
|
|
40
|
+
memcpy(p, &RapidYenc::escapedLUT[c], sizeof(uint16_t));
|
|
16
41
|
p += 2;
|
|
17
42
|
col = 2;
|
|
18
43
|
} else {
|
|
@@ -27,11 +52,11 @@ size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HED
|
|
|
27
52
|
// 8 cycle unrolled version
|
|
28
53
|
sp = p;
|
|
29
54
|
#define DO_THING(n) \
|
|
30
|
-
c = es[i+n], escaped = escapeLUT[c]; \
|
|
55
|
+
c = es[i+n], escaped = RapidYenc::escapeLUT[c]; \
|
|
31
56
|
if (escaped) \
|
|
32
57
|
*(p++) = escaped; \
|
|
33
58
|
else { \
|
|
34
|
-
memcpy(p, &escapedLUT[c], sizeof(uint16_t)); \
|
|
59
|
+
memcpy(p, &RapidYenc::escapedLUT[c], sizeof(uint16_t)); \
|
|
35
60
|
p += 2; \
|
|
36
61
|
}
|
|
37
62
|
DO_THING(0);
|
|
@@ -55,13 +80,13 @@ size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HED
|
|
|
55
80
|
}
|
|
56
81
|
// handle remaining chars
|
|
57
82
|
while(col < line_size-1) {
|
|
58
|
-
c = es[i++], escaped = escapeLUT[c];
|
|
83
|
+
c = es[i++], escaped = RapidYenc::escapeLUT[c];
|
|
59
84
|
if (escaped) {
|
|
60
85
|
*(p++) = escaped;
|
|
61
86
|
col++;
|
|
62
87
|
}
|
|
63
88
|
else {
|
|
64
|
-
memcpy(p, &escapedLUT[c], sizeof(uint16_t));
|
|
89
|
+
memcpy(p, &RapidYenc::escapedLUT[c], sizeof(uint16_t));
|
|
65
90
|
p += 2;
|
|
66
91
|
col += 2;
|
|
67
92
|
}
|
|
@@ -79,8 +104,8 @@ size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HED
|
|
|
79
104
|
// last line char
|
|
80
105
|
if(col < line_size) { // this can only be false if the last character was an escape sequence (or line_size is horribly small), in which case, we don't need to handle space/tab cases
|
|
81
106
|
c = es[i++];
|
|
82
|
-
if (escapedLUT[c] && c != '.'-42) {
|
|
83
|
-
memcpy(p, &escapedLUT[c], sizeof(uint16_t));
|
|
107
|
+
if (RapidYenc::escapedLUT[c] && c != '.'-42) {
|
|
108
|
+
memcpy(p, &RapidYenc::escapedLUT[c], sizeof(uint16_t));
|
|
84
109
|
p += 2;
|
|
85
110
|
} else {
|
|
86
111
|
*(p++) = c + 42;
|
|
@@ -90,8 +115,8 @@ size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HED
|
|
|
90
115
|
if (i >= 0) break;
|
|
91
116
|
|
|
92
117
|
c = es[i++];
|
|
93
|
-
if (escapedLUT[c]) {
|
|
94
|
-
uint32_t w = UINT32_16_PACK(UINT16_PACK('\r', '\n'), (uint32_t)escapedLUT[c]);
|
|
118
|
+
if (RapidYenc::escapedLUT[c]) {
|
|
119
|
+
uint32_t w = UINT32_16_PACK(UINT16_PACK('\r', '\n'), (uint32_t)RapidYenc::escapedLUT[c]);
|
|
95
120
|
memcpy(p, &w, sizeof(w));
|
|
96
121
|
p += 4;
|
|
97
122
|
col = 2;
|
|
@@ -120,40 +145,31 @@ size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HED
|
|
|
120
145
|
}
|
|
121
146
|
|
|
122
147
|
|
|
123
|
-
|
|
148
|
+
namespace RapidYenc {
|
|
124
149
|
size_t (*_do_encode)(int, int*, const unsigned char* HEDLEY_RESTRICT, unsigned char* HEDLEY_RESTRICT, size_t, int) = &do_encode_generic;
|
|
125
150
|
int _encode_isa = ISA_GENERIC;
|
|
126
151
|
}
|
|
127
152
|
|
|
128
|
-
void encoder_sse2_init();
|
|
129
|
-
void encoder_ssse3_init();
|
|
130
|
-
void encoder_avx_init();
|
|
131
|
-
void encoder_avx2_init();
|
|
132
|
-
void encoder_vbmi2_init();
|
|
133
|
-
extern const bool encoder_has_avx10;
|
|
134
|
-
void encoder_neon_init();
|
|
135
|
-
void encoder_rvv_init();
|
|
136
|
-
|
|
137
153
|
#if defined(PLATFORM_X86) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
|
|
138
154
|
# if defined(__AVX2__) && !defined(YENC_DISABLE_AVX256)
|
|
139
155
|
# include "encoder_avx_base.h"
|
|
140
156
|
static inline void encoder_native_init() {
|
|
141
|
-
_do_encode = &do_encode_simd< do_encode_avx2<ISA_NATIVE> >;
|
|
157
|
+
RapidYenc::_do_encode = &do_encode_simd< RapidYenc::do_encode_avx2<ISA_NATIVE> >;
|
|
142
158
|
encoder_avx2_lut<ISA_NATIVE>();
|
|
143
|
-
_encode_isa = ISA_NATIVE;
|
|
159
|
+
RapidYenc::_encode_isa = ISA_NATIVE;
|
|
144
160
|
}
|
|
145
161
|
# else
|
|
146
162
|
# include "encoder_sse_base.h"
|
|
147
163
|
static inline void encoder_native_init() {
|
|
148
|
-
_do_encode = &do_encode_simd< do_encode_sse<ISA_NATIVE> >;
|
|
164
|
+
RapidYenc::_do_encode = &do_encode_simd< RapidYenc::do_encode_sse<ISA_NATIVE> >;
|
|
149
165
|
encoder_sse_lut<ISA_NATIVE>();
|
|
150
|
-
_encode_isa = ISA_NATIVE;
|
|
166
|
+
RapidYenc::_encode_isa = ISA_NATIVE;
|
|
151
167
|
}
|
|
152
168
|
# endif
|
|
153
169
|
#endif
|
|
154
170
|
|
|
155
171
|
|
|
156
|
-
void encoder_init() {
|
|
172
|
+
void RapidYenc::encoder_init() {
|
|
157
173
|
#ifdef PLATFORM_X86
|
|
158
174
|
# if defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
|
|
159
175
|
encoder_native_init();
|
package/src/encoder.h
CHANGED
|
@@ -1,17 +1,17 @@
|
|
|
1
1
|
#ifndef __YENC_ENCODER_H
|
|
2
2
|
#define __YENC_ENCODER_H
|
|
3
3
|
|
|
4
|
-
#
|
|
5
|
-
extern "C" {
|
|
6
|
-
#endif
|
|
4
|
+
#include "hedley.h"
|
|
7
5
|
|
|
6
|
+
namespace RapidYenc {
|
|
8
7
|
|
|
9
8
|
|
|
10
|
-
#include "hedley.h"
|
|
11
9
|
|
|
12
10
|
extern size_t (*_do_encode)(int, int*, const unsigned char* HEDLEY_RESTRICT, unsigned char* HEDLEY_RESTRICT, size_t, int);
|
|
13
11
|
extern int _encode_isa;
|
|
14
|
-
|
|
12
|
+
static inline size_t encode(int line_size, int* colOffset, const void* HEDLEY_RESTRICT src, void* HEDLEY_RESTRICT dest, size_t len, int doEnd) {
|
|
13
|
+
return (*_do_encode)(line_size, colOffset, (const unsigned char* HEDLEY_RESTRICT)src, (unsigned char*)dest, len, doEnd);
|
|
14
|
+
}
|
|
15
15
|
void encoder_init();
|
|
16
16
|
static inline int encode_isa_level() {
|
|
17
17
|
return _encode_isa;
|
|
@@ -19,7 +19,5 @@ static inline int encode_isa_level() {
|
|
|
19
19
|
|
|
20
20
|
|
|
21
21
|
|
|
22
|
-
#ifdef __cplusplus
|
|
23
22
|
}
|
|
24
23
|
#endif
|
|
25
|
-
#endif
|
package/src/encoder_avx.cc
CHANGED
|
@@ -1,16 +1,16 @@
|
|
|
1
1
|
#include "common.h"
|
|
2
|
+
#include "encoder_common.h"
|
|
2
3
|
|
|
3
4
|
#if defined(__AVX__) && defined(__POPCNT__)
|
|
4
5
|
#include "encoder_sse_base.h"
|
|
5
6
|
|
|
6
|
-
void encoder_avx_init() {
|
|
7
|
+
void RapidYenc::encoder_avx_init() {
|
|
7
8
|
_do_encode = &do_encode_simd< do_encode_sse<ISA_LEVEL_SSE4_POPCNT> >;
|
|
8
9
|
encoder_sse_lut<ISA_LEVEL_SSE4_POPCNT>();
|
|
9
10
|
_encode_isa = ISA_LEVEL_AVX;
|
|
10
11
|
}
|
|
11
12
|
#else
|
|
12
|
-
void
|
|
13
|
-
void encoder_avx_init() {
|
|
13
|
+
void RapidYenc::encoder_avx_init() {
|
|
14
14
|
encoder_ssse3_init();
|
|
15
15
|
}
|
|
16
16
|
#endif
|
package/src/encoder_avx2.cc
CHANGED
|
@@ -1,16 +1,16 @@
|
|
|
1
1
|
#include "common.h"
|
|
2
|
+
#include "encoder_common.h"
|
|
2
3
|
|
|
3
4
|
#if defined(__AVX2__) && !defined(YENC_DISABLE_AVX256)
|
|
4
5
|
#include "encoder_avx_base.h"
|
|
5
6
|
|
|
6
|
-
void encoder_avx2_init() {
|
|
7
|
+
void RapidYenc::encoder_avx2_init() {
|
|
7
8
|
_do_encode = &do_encode_simd< do_encode_avx2<ISA_LEVEL_AVX2> >;
|
|
8
9
|
encoder_avx2_lut<ISA_LEVEL_AVX2>();
|
|
9
10
|
_encode_isa = ISA_LEVEL_AVX2;
|
|
10
11
|
}
|
|
11
12
|
#else
|
|
12
|
-
void
|
|
13
|
-
void encoder_avx2_init() {
|
|
13
|
+
void RapidYenc::encoder_avx2_init() {
|
|
14
14
|
encoder_avx_init();
|
|
15
15
|
}
|
|
16
16
|
#endif
|
package/src/encoder_avx_base.h
CHANGED
|
@@ -76,6 +76,8 @@ static void encoder_avx2_lut() {
|
|
|
76
76
|
}
|
|
77
77
|
}
|
|
78
78
|
|
|
79
|
+
namespace RapidYenc {
|
|
80
|
+
|
|
79
81
|
template<enum YEncDecIsaLevel use_isa>
|
|
80
82
|
HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const uint8_t* HEDLEY_RESTRICT srcEnd, uint8_t* HEDLEY_RESTRICT& dest, size_t& len) {
|
|
81
83
|
// offset position to enable simpler loop condition checking
|
|
@@ -568,5 +570,6 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
|
|
|
568
570
|
dest = p;
|
|
569
571
|
len = -(i - INPUT_OFFSET);
|
|
570
572
|
}
|
|
573
|
+
} // namespace
|
|
571
574
|
|
|
572
575
|
#endif
|
package/src/encoder_common.h
CHANGED
|
@@ -1,37 +1,31 @@
|
|
|
1
1
|
#ifndef __YENC_ENCODER_COMMON
|
|
2
2
|
#define __YENC_ENCODER_COMMON
|
|
3
3
|
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
}
|
|
20
|
-
|
|
21
|
-
#undef _B1
|
|
22
|
-
#undef _B2
|
|
23
|
-
#undef _B3
|
|
24
|
-
#undef _BX
|
|
4
|
+
namespace RapidYenc {
|
|
5
|
+
void encoder_sse2_init();
|
|
6
|
+
void encoder_ssse3_init();
|
|
7
|
+
void encoder_avx_init();
|
|
8
|
+
void encoder_avx2_init();
|
|
9
|
+
void encoder_vbmi2_init();
|
|
10
|
+
extern const bool encoder_has_avx10;
|
|
11
|
+
void encoder_neon_init();
|
|
12
|
+
void encoder_rvv_init();
|
|
13
|
+
|
|
14
|
+
// lookup tables for scalar processing
|
|
15
|
+
extern const unsigned char escapeLUT[256];
|
|
16
|
+
extern const uint16_t escapedLUT[256];
|
|
17
|
+
|
|
18
|
+
size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, int doEnd);
|
|
19
|
+
}
|
|
25
20
|
|
|
26
21
|
|
|
27
|
-
size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, int doEnd);
|
|
28
22
|
|
|
29
23
|
template<void(&kernel)(int, int*, const uint8_t* HEDLEY_RESTRICT, uint8_t* HEDLEY_RESTRICT&, size_t&)>
|
|
30
|
-
static size_t do_encode_simd(int line_size, int* colOffset, const
|
|
24
|
+
static size_t do_encode_simd(int line_size, int* colOffset, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, int doEnd) {
|
|
31
25
|
if(len < 1) return 0;
|
|
32
26
|
if(line_size < 12) { // short lines probably not worth processing in a SIMD way
|
|
33
27
|
// we assume at least the first and last char exist in the line, and since the first char could be escaped, and SIMD encoder assumes at least one non-first/last char, assumption means that line size has to be >= 4
|
|
34
|
-
return do_encode_generic(line_size, colOffset, src, dest, len, doEnd);
|
|
28
|
+
return RapidYenc::do_encode_generic(line_size, colOffset, src, dest, len, doEnd);
|
|
35
29
|
}
|
|
36
30
|
|
|
37
31
|
const uint8_t* es = src + len;
|
|
@@ -45,8 +39,8 @@ static size_t do_encode_simd(int line_size, int* colOffset, const uint8_t* HEDLE
|
|
|
45
39
|
long i = -(long)len;
|
|
46
40
|
if(*colOffset == 0 && i < 0) {
|
|
47
41
|
uint8_t c = es[i++];
|
|
48
|
-
if (LIKELIHOOD(0.0273, escapedLUT[c] != 0)) {
|
|
49
|
-
memcpy(p, escapedLUT + c, 2);
|
|
42
|
+
if (LIKELIHOOD(0.0273, RapidYenc::escapedLUT[c] != 0)) {
|
|
43
|
+
memcpy(p, RapidYenc::escapedLUT + c, 2);
|
|
50
44
|
p += 2;
|
|
51
45
|
*colOffset = 2;
|
|
52
46
|
} else {
|
|
@@ -57,19 +51,19 @@ static size_t do_encode_simd(int line_size, int* colOffset, const uint8_t* HEDLE
|
|
|
57
51
|
while(i < 0) {
|
|
58
52
|
uint8_t c = es[i++];
|
|
59
53
|
if(*colOffset < line_size-1) {
|
|
60
|
-
if(!escapeLUT[c]) {
|
|
54
|
+
if(!RapidYenc::escapeLUT[c]) {
|
|
61
55
|
p[0] = '=';
|
|
62
56
|
p[1] = c+42+64;
|
|
63
57
|
p += 2;
|
|
64
58
|
(*colOffset) += 2;
|
|
65
59
|
} else {
|
|
66
|
-
*(p++) = escapeLUT[c];
|
|
60
|
+
*(p++) = RapidYenc::escapeLUT[c];
|
|
67
61
|
(*colOffset) += 1;
|
|
68
62
|
}
|
|
69
63
|
} else {
|
|
70
64
|
if(*colOffset < line_size) {
|
|
71
|
-
if (escapedLUT[c] && c != '.'-42) {
|
|
72
|
-
memcpy(p, escapedLUT + c, 2);
|
|
65
|
+
if (RapidYenc::escapedLUT[c] && c != '.'-42) {
|
|
66
|
+
memcpy(p, RapidYenc::escapedLUT + c, 2);
|
|
73
67
|
p += 2;
|
|
74
68
|
} else {
|
|
75
69
|
*(p++) = c + 42;
|
|
@@ -79,8 +73,8 @@ static size_t do_encode_simd(int line_size, int* colOffset, const uint8_t* HEDLE
|
|
|
79
73
|
}
|
|
80
74
|
|
|
81
75
|
// handle EOL
|
|
82
|
-
if (escapedLUT[c]) {
|
|
83
|
-
uint32_t w = UINT32_16_PACK(UINT16_PACK('\r', '\n'), (uint32_t)escapedLUT[c]);
|
|
76
|
+
if (RapidYenc::escapedLUT[c]) {
|
|
77
|
+
uint32_t w = UINT32_16_PACK(UINT16_PACK('\r', '\n'), (uint32_t)RapidYenc::escapedLUT[c]);
|
|
84
78
|
memcpy(p, &w, sizeof(w));
|
|
85
79
|
p += 4;
|
|
86
80
|
*colOffset = 2;
|
package/src/encoder_neon.cc
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
#include "common.h"
|
|
2
|
+
#include "encoder_common.h"
|
|
2
3
|
|
|
3
4
|
#ifdef __ARM_NEON
|
|
4
5
|
#include "encoder.h"
|
|
5
|
-
#include "encoder_common.h"
|
|
6
6
|
|
|
7
7
|
// Clang wrongly assumes alignment on vst1q_u8_x2, and ARMv7 GCC doesn't support the function, so effectively, it can only be used in ARMv8 compilers
|
|
8
8
|
#if defined(__aarch64__) && (defined(__clang__) || HEDLEY_GCC_VERSION_CHECK(8,5,0))
|
|
@@ -259,6 +259,8 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
|
|
|
259
259
|
}
|
|
260
260
|
|
|
261
261
|
|
|
262
|
+
namespace RapidYenc {
|
|
263
|
+
|
|
262
264
|
HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const uint8_t* HEDLEY_RESTRICT srcEnd, uint8_t* HEDLEY_RESTRICT& dest, size_t& len) {
|
|
263
265
|
// offset position to enable simpler loop condition checking
|
|
264
266
|
const int INPUT_OFFSET = sizeof(uint8x16_t)*4 -1; // extra chars for EOL handling, -1 to change <= to <
|
|
@@ -517,8 +519,9 @@ HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const ui
|
|
|
517
519
|
dest = p;
|
|
518
520
|
len = -(i - INPUT_OFFSET);
|
|
519
521
|
}
|
|
522
|
+
} // namespace
|
|
520
523
|
|
|
521
|
-
void encoder_neon_init() {
|
|
524
|
+
void RapidYenc::encoder_neon_init() {
|
|
522
525
|
_do_encode = &do_encode_simd<do_encode_neon>;
|
|
523
526
|
_encode_isa = ISA_LEVEL_NEON;
|
|
524
527
|
// generate shuf LUT
|
|
@@ -543,5 +546,5 @@ void encoder_neon_init() {
|
|
|
543
546
|
}
|
|
544
547
|
}
|
|
545
548
|
#else
|
|
546
|
-
void encoder_neon_init() {}
|
|
549
|
+
void RapidYenc::encoder_neon_init() {}
|
|
547
550
|
#endif /* defined(__ARM_NEON) */
|
package/src/encoder_rvv.cc
CHANGED
|
@@ -1,30 +1,23 @@
|
|
|
1
1
|
#include "common.h"
|
|
2
|
+
#include "encoder_common.h"
|
|
2
3
|
|
|
3
4
|
#ifdef __riscv_vector
|
|
4
5
|
#include "encoder.h"
|
|
5
|
-
#include "encoder_common.h"
|
|
6
|
-
|
|
7
|
-
# include <riscv_vector.h>
|
|
8
|
-
# if defined(__clang__) && __clang_major__ < 16
|
|
9
|
-
# define RV(f) f
|
|
10
|
-
# else
|
|
11
|
-
# define RV(f) __riscv_##f
|
|
12
|
-
# endif
|
|
13
6
|
|
|
14
7
|
|
|
15
8
|
static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RESTRICT _src, long& inpos, uint8_t*& outp, long& col, long lineSizeOffset) {
|
|
16
9
|
// TODO: vectorize
|
|
17
10
|
uint8_t c = _src[inpos++];
|
|
18
|
-
if(HEDLEY_UNLIKELY(escapedLUT[c] && c != '.'-42)) {
|
|
19
|
-
memcpy(outp, &escapedLUT[c], sizeof(uint16_t));
|
|
11
|
+
if(HEDLEY_UNLIKELY(RapidYenc::escapedLUT[c] && c != '.'-42)) {
|
|
12
|
+
memcpy(outp, &RapidYenc::escapedLUT[c], sizeof(uint16_t));
|
|
20
13
|
outp += 2;
|
|
21
14
|
} else {
|
|
22
15
|
*(outp++) = c + 42;
|
|
23
16
|
}
|
|
24
17
|
|
|
25
18
|
c = _src[inpos++];
|
|
26
|
-
if(LIKELIHOOD(0.0273, escapedLUT[c]!=0)) {
|
|
27
|
-
uint32_t w = UINT32_16_PACK(UINT16_PACK('\r', '\n'), (uint32_t)escapedLUT[c]);
|
|
19
|
+
if(LIKELIHOOD(0.0273, RapidYenc::escapedLUT[c]!=0)) {
|
|
20
|
+
uint32_t w = UINT32_16_PACK(UINT16_PACK('\r', '\n'), (uint32_t)RapidYenc::escapedLUT[c]);
|
|
28
21
|
memcpy(outp, &w, sizeof(w));
|
|
29
22
|
outp += 4;
|
|
30
23
|
col = lineSizeOffset + 2;
|
|
@@ -36,6 +29,7 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
|
|
|
36
29
|
}
|
|
37
30
|
}
|
|
38
31
|
|
|
32
|
+
namespace RapidYenc {
|
|
39
33
|
|
|
40
34
|
HEDLEY_ALWAYS_INLINE void do_encode_rvv(int line_size, int* colOffset, const uint8_t* HEDLEY_RESTRICT srcEnd, uint8_t* HEDLEY_RESTRICT& dest, size_t& len) {
|
|
41
35
|
size_t vl2 = RV(vsetvlmax_e8m2)(); // TODO: limit to line length
|
|
@@ -112,7 +106,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_rvv(int line_size, int* colOffset, const uin
|
|
|
112
106
|
vl2
|
|
113
107
|
);
|
|
114
108
|
|
|
115
|
-
#
|
|
109
|
+
#ifdef __riscv_v_intrinsic
|
|
116
110
|
data = RV(vor_vx_u8m2_mu)(cmp, tmpData, tmpData, 64, vl2);
|
|
117
111
|
#else
|
|
118
112
|
data = RV(vor_vx_u8m2_m)(cmp, tmpData, tmpData, 64, vl2);
|
|
@@ -122,11 +116,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_rvv(int line_size, int* colOffset, const uin
|
|
|
122
116
|
size_t count = RV(vcpop_m_b4)(cmp, vl2);
|
|
123
117
|
if(count > 1) {
|
|
124
118
|
// widen mask: 4b->8b
|
|
125
|
-
|
|
126
|
-
vuint8mf4_t vcmp = RV(vlmul_trunc_v_u8m1_u8mf4)(RV(vreinterpret_v_b4_u8m1)(cmp));
|
|
127
|
-
#else
|
|
128
|
-
vuint8mf4_t vcmp = *(vuint8mf4_t*)(&cmp);
|
|
129
|
-
#endif
|
|
119
|
+
vuint8mf4_t vcmp = RV_VEC_U8MF4_CAST(cmp);
|
|
130
120
|
// TODO: use vwsll instead if available
|
|
131
121
|
// - is clmul useful here?
|
|
132
122
|
vuint8mf2_t xcmp = RV(vreinterpret_v_u16mf2_u8mf2)(RV(vwmulu_vx_u16mf2)(vcmp, 16, vl2));
|
|
@@ -134,11 +124,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_rvv(int line_size, int* colOffset, const uin
|
|
|
134
124
|
|
|
135
125
|
// expand mask by inserting '1' between each bit (0000abcd -> 1a1b1c1d)
|
|
136
126
|
vuint8m1_t xcmpTmp = RV(vrgather_vv_u8m1)(MASK_EXPAND, RV(vlmul_ext_v_u8mf2_u8m1)(xcmp), vl2);
|
|
137
|
-
|
|
138
|
-
vbool2_t cmpmask = RV(vreinterpret_b2)(xcmpTmp);
|
|
139
|
-
#else
|
|
140
|
-
vbool2_t cmpmask = *(vbool2_t*)(&xcmpTmp);
|
|
141
|
-
#endif
|
|
127
|
+
vbool2_t cmpmask = RV_MASK_CAST(2, 8, xcmpTmp);
|
|
142
128
|
|
|
143
129
|
// expand data and insert =
|
|
144
130
|
// TODO: use vwsll instead if available
|
|
@@ -149,7 +135,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_rvv(int line_size, int* colOffset, const uin
|
|
|
149
135
|
// prune unneeded =
|
|
150
136
|
vuint8m4_t dataTmp = RV(vreinterpret_v_u16m4_u8m4)(data2);
|
|
151
137
|
vuint8m4_t final_data = RV(vcompress_vm_u8m4)(
|
|
152
|
-
#
|
|
138
|
+
#ifdef __riscv_v_intrinsic
|
|
153
139
|
dataTmp, cmpmask, vl2*2
|
|
154
140
|
#else
|
|
155
141
|
cmpmask, dataTmp, dataTmp, vl2*2
|
|
@@ -210,11 +196,12 @@ HEDLEY_ALWAYS_INLINE void do_encode_rvv(int line_size, int* colOffset, const uin
|
|
|
210
196
|
dest = outp;
|
|
211
197
|
len = -(inpos - INPUT_OFFSET);
|
|
212
198
|
}
|
|
199
|
+
} // namespace
|
|
213
200
|
|
|
214
|
-
void encoder_rvv_init() {
|
|
201
|
+
void RapidYenc::encoder_rvv_init() {
|
|
215
202
|
_do_encode = &do_encode_simd<do_encode_rvv>;
|
|
216
203
|
_encode_isa = ISA_LEVEL_RVV;
|
|
217
204
|
}
|
|
218
205
|
#else
|
|
219
|
-
void encoder_rvv_init() {}
|
|
206
|
+
void RapidYenc::encoder_rvv_init() {}
|
|
220
207
|
#endif /* defined(__riscv_vector) */
|
package/src/encoder_sse2.cc
CHANGED
|
@@ -1,14 +1,15 @@
|
|
|
1
1
|
#include "common.h"
|
|
2
|
+
#include "encoder_common.h"
|
|
2
3
|
|
|
3
4
|
#ifdef __SSE2__
|
|
4
5
|
#include "encoder_sse_base.h"
|
|
5
6
|
|
|
6
|
-
void encoder_sse2_init() {
|
|
7
|
+
void RapidYenc::encoder_sse2_init() {
|
|
7
8
|
_do_encode = &do_encode_simd< do_encode_sse<ISA_LEVEL_SSE2> >;
|
|
8
9
|
encoder_sse_lut<ISA_LEVEL_SSE2>();
|
|
9
10
|
_encode_isa = ISA_LEVEL_SSE2;
|
|
10
11
|
}
|
|
11
12
|
#else
|
|
12
|
-
void encoder_sse2_init() {}
|
|
13
|
+
void RapidYenc::encoder_sse2_init() {}
|
|
13
14
|
#endif
|
|
14
15
|
|
package/src/encoder_sse_base.h
CHANGED
|
@@ -147,6 +147,7 @@ static HEDLEY_ALWAYS_INLINE uintptr_t sse2_expand_store_vector(__m128i data, uns
|
|
|
147
147
|
}
|
|
148
148
|
}
|
|
149
149
|
|
|
150
|
+
namespace RapidYenc {
|
|
150
151
|
|
|
151
152
|
template<enum YEncDecIsaLevel use_isa>
|
|
152
153
|
HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uint8_t* HEDLEY_RESTRICT srcEnd, uint8_t* HEDLEY_RESTRICT& dest, size_t& len) {
|
|
@@ -720,4 +721,5 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
|
|
|
720
721
|
dest = p;
|
|
721
722
|
len = -(i - INPUT_OFFSET);
|
|
722
723
|
}
|
|
724
|
+
} // namespace
|
|
723
725
|
|
package/src/encoder_ssse3.cc
CHANGED
|
@@ -1,18 +1,18 @@
|
|
|
1
1
|
#include "common.h"
|
|
2
|
+
#include "encoder_common.h"
|
|
2
3
|
|
|
3
4
|
// slightly faster version which improves the worst case scenario significantly; since worst case doesn't happen often, overall speedup is relatively minor
|
|
4
5
|
// requires PSHUFB (SSSE3) instruction, but will use POPCNT (SSE4.2 (or AMD's ABM, but Phenom doesn't support SSSE3 so doesn't matter)) if available (these only seem to give minor speedups, so considered optional)
|
|
5
6
|
#ifdef __SSSE3__
|
|
6
7
|
#include "encoder_sse_base.h"
|
|
7
8
|
|
|
8
|
-
void encoder_ssse3_init() {
|
|
9
|
+
void RapidYenc::encoder_ssse3_init() {
|
|
9
10
|
_do_encode = &do_encode_simd< do_encode_sse<ISA_LEVEL_SSSE3> >;
|
|
10
11
|
encoder_sse_lut<ISA_LEVEL_SSSE3>();
|
|
11
12
|
_encode_isa = ISA_LEVEL_SSSE3;
|
|
12
13
|
}
|
|
13
14
|
#else
|
|
14
|
-
void
|
|
15
|
-
void encoder_ssse3_init() {
|
|
15
|
+
void RapidYenc::encoder_ssse3_init() {
|
|
16
16
|
encoder_sse2_init();
|
|
17
17
|
}
|
|
18
18
|
#endif
|
package/src/encoder_vbmi2.cc
CHANGED
|
@@ -1,32 +1,31 @@
|
|
|
1
1
|
#include "common.h"
|
|
2
|
+
#include "encoder_common.h"
|
|
2
3
|
|
|
3
|
-
extern const bool encoder_has_avx10;
|
|
4
4
|
#if !defined(__EVEX512__) && (defined(__AVX10_1__) || defined(__EVEX256__)) && defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
|
|
5
|
-
const bool encoder_has_avx10 = true;
|
|
5
|
+
const bool RapidYenc::encoder_has_avx10 = true;
|
|
6
6
|
#else
|
|
7
|
-
const bool encoder_has_avx10 = false;
|
|
7
|
+
const bool RapidYenc::encoder_has_avx10 = false;
|
|
8
8
|
#endif
|
|
9
9
|
|
|
10
10
|
#if defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
|
|
11
11
|
# ifndef YENC_DISABLE_AVX256
|
|
12
12
|
# include "encoder_avx_base.h"
|
|
13
13
|
|
|
14
|
-
void encoder_vbmi2_init() {
|
|
14
|
+
void RapidYenc::encoder_vbmi2_init() {
|
|
15
15
|
_do_encode = &do_encode_simd< do_encode_avx2<ISA_LEVEL_VBMI2> >;
|
|
16
16
|
encoder_avx2_lut<ISA_LEVEL_VBMI2>();
|
|
17
17
|
_encode_isa = ISA_LEVEL_VBMI2;
|
|
18
18
|
}
|
|
19
19
|
# else
|
|
20
20
|
# include "encoder_sse_base.h"
|
|
21
|
-
void encoder_vbmi2_init() {
|
|
21
|
+
void RapidYenc::encoder_vbmi2_init() {
|
|
22
22
|
_do_encode = &do_encode_simd< do_encode_sse<ISA_LEVEL_VBMI2> >;
|
|
23
23
|
encoder_sse_lut<ISA_LEVEL_VBMI2>();
|
|
24
24
|
_encode_isa = ISA_LEVEL_VBMI2;
|
|
25
25
|
}
|
|
26
26
|
# endif
|
|
27
27
|
#else
|
|
28
|
-
void
|
|
29
|
-
void encoder_vbmi2_init() {
|
|
28
|
+
void RapidYenc::encoder_vbmi2_init() {
|
|
30
29
|
encoder_avx2_init();
|
|
31
30
|
}
|
|
32
31
|
#endif
|
package/src/platform.cc
CHANGED
|
@@ -17,7 +17,7 @@
|
|
|
17
17
|
# endif
|
|
18
18
|
# endif
|
|
19
19
|
# endif
|
|
20
|
-
bool cpu_supports_neon() {
|
|
20
|
+
bool RapidYenc::cpu_supports_neon() {
|
|
21
21
|
# if defined(AT_HWCAP)
|
|
22
22
|
# ifdef __FreeBSD__
|
|
23
23
|
unsigned long supported;
|
|
@@ -95,7 +95,7 @@ static inline int _GET_XCR() {
|
|
|
95
95
|
// }
|
|
96
96
|
|
|
97
97
|
|
|
98
|
-
int cpu_supports_isa() {
|
|
98
|
+
int RapidYenc::cpu_supports_isa() {
|
|
99
99
|
int flags[4];
|
|
100
100
|
_cpuid1(flags);
|
|
101
101
|
int ret = 0;
|
|
@@ -132,29 +132,30 @@ int cpu_supports_isa() {
|
|
|
132
132
|
int cpuInfo[4];
|
|
133
133
|
_cpuidX(cpuInfo, 7, 0);
|
|
134
134
|
if((cpuInfo[1] & 0x128) == 0x128 && (ret & ISA_FEATURE_LZCNT)) { // BMI2 + AVX2 + BMI1
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
135
|
+
if((xcr & 0xE0) == 0xE0) { // AVX512 XSTATE (also applies to AVX10)
|
|
136
|
+
// check AVX10
|
|
137
|
+
int cpuInfo2[4];
|
|
138
|
+
_cpuidX(cpuInfo2, 7, 1);
|
|
139
|
+
if(cpuInfo2[3] & 0x80000) {
|
|
140
|
+
_cpuidX(cpuInfo2, 0x24, 0);
|
|
141
|
+
if((cpuInfo2[1] & 0xff) >= 1 && ( // minimum AVX10.1
|
|
142
142
|
#ifdef YENC_DISABLE_AVX256
|
|
143
|
-
|
|
143
|
+
cpuInfo2[1] & 0x10000 // AVX10/128
|
|
144
144
|
#else
|
|
145
|
-
|
|
145
|
+
cpuInfo2[1] & 0x20000 // AVX10/256
|
|
146
146
|
#endif
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
147
|
+
)) {
|
|
148
|
+
if(cpuInfo2[1] & 0x40000) ret |= ISA_FEATURE_EVEX512;
|
|
149
|
+
return ret | ISA_LEVEL_VBMI2;
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
if((cpuInfo[1] & 0xC0010000) == 0xC0010000) { // AVX512BW + AVX512VL + AVX512F
|
|
154
|
+
ret |= ISA_FEATURE_EVEX512;
|
|
155
|
+
if(cpuInfo[2] & 0x40)
|
|
156
|
+
return ret | ISA_LEVEL_VBMI2;
|
|
157
|
+
return ret | ISA_LEVEL_AVX3;
|
|
150
158
|
}
|
|
151
|
-
}
|
|
152
|
-
|
|
153
|
-
if(((xcr & 0xE0) == 0xE0) && (cpuInfo[1] & 0xC0010000) == 0xC0010000) { // AVX512BW + AVX512VL + AVX512F
|
|
154
|
-
ret |= ISA_FEATURE_EVEX512;
|
|
155
|
-
if(cpuInfo[2] & 0x40)
|
|
156
|
-
return ret | ISA_LEVEL_VBMI2;
|
|
157
|
-
return ret | ISA_LEVEL_AVX3;
|
|
158
159
|
}
|
|
159
160
|
// AVX2 is beneficial even on Zen1
|
|
160
161
|
return ret | ISA_LEVEL_AVX2;
|
|
@@ -169,7 +170,7 @@ int cpu_supports_isa() {
|
|
|
169
170
|
return ret | ISA_LEVEL_SSE2;
|
|
170
171
|
}
|
|
171
172
|
|
|
172
|
-
int cpu_supports_crc_isa() {
|
|
173
|
+
int RapidYenc::cpu_supports_crc_isa() {
|
|
173
174
|
int flags[4];
|
|
174
175
|
_cpuid1(flags);
|
|
175
176
|
|
|
@@ -200,7 +201,7 @@ int cpu_supports_crc_isa() {
|
|
|
200
201
|
# endif
|
|
201
202
|
# endif
|
|
202
203
|
# endif
|
|
203
|
-
bool cpu_supports_rvv() {
|
|
204
|
+
bool RapidYenc::cpu_supports_rvv() {
|
|
204
205
|
# if defined(AT_HWCAP)
|
|
205
206
|
unsigned long ret;
|
|
206
207
|
# ifdef __FreeBSD__
|