yencode 1.1.3 → 1.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -2
- package/binding.gyp +75 -12
- package/index.js +21 -19
- package/package.json +2 -1
- package/src/common.h +23 -5
- package/src/crc.cc +129 -8
- package/src/crc_arm.cc +7 -1
- package/src/crc_folding_256.cc +4 -5
- package/src/decoder.cc +5 -4
- package/src/decoder.h +5 -5
- package/src/decoder_avx2_base.h +10 -4
- package/src/decoder_common.h +5 -5
- package/src/decoder_neon.cc +1 -1
- package/src/decoder_neon64.cc +1 -1
- package/src/decoder_sse_base.h +10 -3
- package/src/decoder_vbmi2.cc +7 -0
- package/src/encoder.cc +7 -1
- package/src/encoder_avx_base.h +22 -14
- package/src/encoder_neon.cc +39 -40
- package/src/encoder_rvv.cc +219 -0
- package/src/encoder_sse_base.h +3 -3
- package/src/encoder_vbmi2.cc +7 -0
- package/src/hedley.h +278 -135
- package/src/platform.cc +57 -9
- package/src/test_alignalloc.c +6 -0
- package/test/_speedbase.js +12 -11
- package/test/speeddec.js +6 -5
- package/test/testdec.js +30 -14
- package/test/testenc.js +10 -7
- package/test/testpostdec.js +6 -5
package/README.md
CHANGED
|
@@ -129,7 +129,8 @@ int decodeTo(Buffer data, Buffer output, bool stripDots=false)
|
|
|
129
129
|
Same as above, but instead of returning a Buffer, writes it to the supplied
|
|
130
130
|
*output* Buffer. Returns the length of the decoded data.
|
|
131
131
|
Note that the *output* Buffer must be at least large enough to hold the largest
|
|
132
|
-
possible output size (i.e. length of the input), otherwise an error is thrown.
|
|
132
|
+
possible output size (i.e. length of the input), otherwise an error is thrown.
|
|
133
|
+
The *data* and *output* Buffers can be the same, for in-situ decoding.
|
|
133
134
|
|
|
134
135
|
Object decodeChunk\(Buffer data \[, string state=null\]\[, Buffer output\]\)
|
|
135
136
|
-----------------------------------------------------------------------------
|
|
@@ -142,7 +143,7 @@ designed to incrementally process a stream from the network, and will perform NN
|
|
|
142
143
|
*state* is the current state of the incremental decode. Set to *null* if this is starting the decode of a new article, otherwise this should be set to the value of *state* given from the previous invocation of *decodeChunk*
|
|
143
144
|
If *output* is supplied, the output will be written here \(see *decodeTo* for notes
|
|
144
145
|
on required size\), otherwise a new buffer will be created where the output will be
|
|
145
|
-
written to.
|
|
146
|
+
written to. The *data* and *output* Buffers can be the same, for in-situ decoding.
|
|
146
147
|
|
|
147
148
|
Returns an object with the following keys:
|
|
148
149
|
|
package/binding.gyp
CHANGED
|
@@ -43,10 +43,20 @@
|
|
|
43
43
|
}],
|
|
44
44
|
['OS!="win" and enable_native_tuning!=0', {
|
|
45
45
|
"defines": ["YENC_BUILD_NATIVE=1"]
|
|
46
|
+
}],
|
|
47
|
+
['OS!="win"', {
|
|
48
|
+
"variables": {
|
|
49
|
+
"missing_memalign%": "<!(<!(echo ${CC_target:-${CC:-cc}}) -c src/test_alignalloc.c -o /dev/null -Werror 2>/dev/null || echo failed)",
|
|
50
|
+
},
|
|
51
|
+
"conditions": [
|
|
52
|
+
['missing_memalign!=""', {
|
|
53
|
+
"defines": ["_POSIX_C_SOURCE=200112L"],
|
|
54
|
+
}]
|
|
55
|
+
]
|
|
46
56
|
}]
|
|
47
57
|
],
|
|
48
58
|
"cflags": ["-Wno-unused-function"],
|
|
49
|
-
"cxxflags": ["-Wno-unused-function"],
|
|
59
|
+
"cxxflags": ["-Wno-unused-function", "-std=c++03", "-D_POSIX_C_SOURCE=200112L"],
|
|
50
60
|
"xcode_settings": {
|
|
51
61
|
"OTHER_CFLAGS": ["-Wno-unused-function"],
|
|
52
62
|
"OTHER_CXXFLAGS": ["-Wno-unused-function"]
|
|
@@ -64,7 +74,7 @@
|
|
|
64
74
|
"targets": [
|
|
65
75
|
{
|
|
66
76
|
"target_name": "yencode",
|
|
67
|
-
"dependencies": ["crcutil", "yencode_sse2", "yencode_ssse3", "yencode_clmul", "yencode_clmul256", "yencode_avx", "yencode_avx2", "yencode_vbmi2", "yencode_neon", "yencode_armcrc"],
|
|
77
|
+
"dependencies": ["crcutil", "yencode_sse2", "yencode_ssse3", "yencode_clmul", "yencode_clmul256", "yencode_avx", "yencode_avx2", "yencode_vbmi2", "yencode_neon", "yencode_armcrc", "yencode_rvv"],
|
|
68
78
|
"sources": [
|
|
69
79
|
"src/yencode.cc",
|
|
70
80
|
"src/platform.cc",
|
|
@@ -221,7 +231,7 @@
|
|
|
221
231
|
"msvs_settings": {"VCCLCompilerTool": {"BufferSecurityCheck": "false"}},
|
|
222
232
|
"conditions": [
|
|
223
233
|
['target_arch in "ia32 x64" and OS!="win"', {
|
|
224
|
-
"variables": {"supports_vpclmul%": "<!(<!(echo ${
|
|
234
|
+
"variables": {"supports_vpclmul%": "<!(<!(echo ${CXX_target:-${CXX:-c++}}) -MM -E src/crc_folding_256.cc -mavx2 -mvpclmulqdq 2>/dev/null || true)"},
|
|
225
235
|
"conditions": [
|
|
226
236
|
['supports_vpclmul!=""', {
|
|
227
237
|
"cflags": ["-mavx2", "-mvpclmulqdq", "-mpclmul"],
|
|
@@ -253,7 +263,10 @@
|
|
|
253
263
|
"msvs_settings": {"VCCLCompilerTool": {"BufferSecurityCheck": "false"}},
|
|
254
264
|
"conditions": [
|
|
255
265
|
['target_arch in "ia32 x64" and OS!="win"', {
|
|
256
|
-
"variables": {
|
|
266
|
+
"variables": {
|
|
267
|
+
"supports_vbmi2%": "<!(<!(echo ${CXX_target:-${CXX:-c++}}) -MM -E src/encoder_vbmi2.cc -mavx512vl -mavx512vbmi2 2>/dev/null || true)",
|
|
268
|
+
"supports_avx10%": "<!(<!(echo ${CXX_target:-${CXX:-c++}}) -MM -E src/encoder_vbmi2.cc -mavx512vl -mno-evex512 2>/dev/null || true)"
|
|
269
|
+
},
|
|
257
270
|
"conditions": [
|
|
258
271
|
['supports_vbmi2!=""', {
|
|
259
272
|
"cflags": ["-mavx512vbmi2", "-mavx512vl", "-mavx512bw", "-mpopcnt", "-mbmi", "-mbmi2", "-mlzcnt"],
|
|
@@ -262,6 +275,14 @@
|
|
|
262
275
|
"OTHER_CFLAGS": ["-mavx512vbmi2", "-mavx512vl", "-mavx512bw", "-mpopcnt", "-mbmi", "-mbmi2", "-mlzcnt"],
|
|
263
276
|
"OTHER_CXXFLAGS": ["-mavx512vbmi2", "-mavx512vl", "-mavx512bw", "-mpopcnt", "-mbmi", "-mbmi2", "-mlzcnt"],
|
|
264
277
|
}
|
|
278
|
+
}],
|
|
279
|
+
['supports_avx10!=""', {
|
|
280
|
+
"cflags": ["-mno-evex512"],
|
|
281
|
+
"cxxflags": ["-mno-evex512"],
|
|
282
|
+
"xcode_settings": {
|
|
283
|
+
"OTHER_CFLAGS": ["-mno-evex512"],
|
|
284
|
+
"OTHER_CXXFLAGS": ["-mno-evex512"],
|
|
285
|
+
}
|
|
265
286
|
}]
|
|
266
287
|
]
|
|
267
288
|
}],
|
|
@@ -285,11 +306,11 @@
|
|
|
285
306
|
"msvs_settings": {"VCCLCompilerTool": {"BufferSecurityCheck": "false"}},
|
|
286
307
|
"conditions": [
|
|
287
308
|
['target_arch=="arm"', {
|
|
288
|
-
"cflags": ["-mfpu=neon"],
|
|
289
|
-
"cxxflags": ["-mfpu=neon"],
|
|
309
|
+
"cflags": ["-mfpu=neon","-fno-lto"],
|
|
310
|
+
"cxxflags": ["-mfpu=neon","-fno-lto"],
|
|
290
311
|
"xcode_settings": {
|
|
291
|
-
"OTHER_CFLAGS": ["-mfpu=neon"],
|
|
292
|
-
"OTHER_CXXFLAGS": ["-mfpu=neon"],
|
|
312
|
+
"OTHER_CFLAGS": ["-mfpu=neon","-fno-lto"],
|
|
313
|
+
"OTHER_CXXFLAGS": ["-mfpu=neon","-fno-lto"],
|
|
293
314
|
}
|
|
294
315
|
}],
|
|
295
316
|
['target_arch=="arm64"', {
|
|
@@ -299,6 +320,48 @@
|
|
|
299
320
|
}]
|
|
300
321
|
]
|
|
301
322
|
},
|
|
323
|
+
{
|
|
324
|
+
"target_name": "yencode_rvv",
|
|
325
|
+
"type": "static_library",
|
|
326
|
+
"sources": [
|
|
327
|
+
"src/encoder_rvv.cc"
|
|
328
|
+
],
|
|
329
|
+
"cflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
|
|
330
|
+
"cxxflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
|
|
331
|
+
"xcode_settings": {
|
|
332
|
+
"OTHER_CFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
|
|
333
|
+
"OTHER_CXXFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"]
|
|
334
|
+
},
|
|
335
|
+
"msvs_settings": {"VCCLCompilerTool": {"BufferSecurityCheck": "false"}},
|
|
336
|
+
"conditions": [
|
|
337
|
+
['target_arch=="riscv64" and OS!="win"', {
|
|
338
|
+
"variables": {"supports_rvv%": "<!(<!(echo ${CXX_target:-${CXX:-c++}}) -MM -E src/encoder_rvv.cc -march=rv64gcv 2>/dev/null || true)"},
|
|
339
|
+
"conditions": [
|
|
340
|
+
['supports_rvv!=""', {
|
|
341
|
+
"cflags": ["-march=rv64gcv"],
|
|
342
|
+
"cxxflags": ["-march=rv64gcv"],
|
|
343
|
+
"xcode_settings": {
|
|
344
|
+
"OTHER_CFLAGS": ["-march=rv64gcv"],
|
|
345
|
+
"OTHER_CXXFLAGS": ["-march=rv64gcv"],
|
|
346
|
+
}
|
|
347
|
+
}]
|
|
348
|
+
]
|
|
349
|
+
}],
|
|
350
|
+
['target_arch=="riscv32" and OS!="win"', {
|
|
351
|
+
"variables": {"supports_rvv%": "<!(<!(echo ${CXX_target:-${CXX:-c++}}) -MM -E src/encoder_rvv.cc -march=rv32gcv 2>/dev/null || true)"},
|
|
352
|
+
"conditions": [
|
|
353
|
+
['supports_rvv!=""', {
|
|
354
|
+
"cflags": ["-march=rv32gcv"],
|
|
355
|
+
"cxxflags": ["-march=rv32gcv"],
|
|
356
|
+
"xcode_settings": {
|
|
357
|
+
"OTHER_CFLAGS": ["-march=rv32gcv"],
|
|
358
|
+
"OTHER_CXXFLAGS": ["-march=rv32gcv"],
|
|
359
|
+
}
|
|
360
|
+
}]
|
|
361
|
+
]
|
|
362
|
+
}]
|
|
363
|
+
]
|
|
364
|
+
},
|
|
302
365
|
{
|
|
303
366
|
"target_name": "yencode_armcrc",
|
|
304
367
|
"type": "static_library",
|
|
@@ -326,11 +389,11 @@
|
|
|
326
389
|
}
|
|
327
390
|
}],
|
|
328
391
|
['OS!="win" and target_arch=="arm"', {
|
|
329
|
-
"cflags": ["-mfpu=fp-armv8"],
|
|
330
|
-
"cxxflags": ["-mfpu=fp-armv8"],
|
|
392
|
+
"cflags": ["-mfpu=fp-armv8","-fno-lto"],
|
|
393
|
+
"cxxflags": ["-mfpu=fp-armv8","-fno-lto"],
|
|
331
394
|
"xcode_settings": {
|
|
332
|
-
"OTHER_CFLAGS": ["-mfpu=fp-armv8"],
|
|
333
|
-
"OTHER_CXXFLAGS": ["-mfpu=fp-armv8"]
|
|
395
|
+
"OTHER_CFLAGS": ["-mfpu=fp-armv8","-fno-lto"],
|
|
396
|
+
"OTHER_CXXFLAGS": ["-mfpu=fp-armv8","-fno-lto"]
|
|
334
397
|
}
|
|
335
398
|
}]
|
|
336
399
|
]
|
package/index.js
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
var y = require('./build/Release/yencode.node');
|
|
4
4
|
|
|
5
5
|
var toBuffer = Buffer.alloc ? Buffer.from : Buffer;
|
|
6
|
+
var bufferSlice = Buffer.prototype.readBigInt64BE ? Buffer.prototype.subarray : Buffer.prototype.slice;
|
|
6
7
|
|
|
7
8
|
var nl = toBuffer([13, 10]);
|
|
8
9
|
var RE_BADCHAR = /\r\n\0/g;
|
|
@@ -76,28 +77,28 @@ var decoderParseLines = function(lines, ydata) {
|
|
|
76
77
|
for(var i=0; i<lines.length; i++) {
|
|
77
78
|
var yprops = {};
|
|
78
79
|
|
|
79
|
-
var line = lines[i].
|
|
80
|
+
var line = lines[i].substring(2); // cut off '=y'
|
|
80
81
|
// parse tag
|
|
81
82
|
var p = line.indexOf(' ');
|
|
82
|
-
var tag = (p<0 ? line : line.
|
|
83
|
-
line = line.
|
|
83
|
+
var tag = (p<0 ? line : line.substring(0, p));
|
|
84
|
+
line = line.substring(tag.length+1).trim();
|
|
84
85
|
|
|
85
86
|
// parse props
|
|
86
87
|
var m = line.match(RE_YPROP);
|
|
87
88
|
while(m) {
|
|
88
89
|
if(m.index != 0) {
|
|
89
|
-
warnings.push(DecoderWarning('ignored_line_data', 'Unknown additional data ignored: "' + line.
|
|
90
|
+
warnings.push(DecoderWarning('ignored_line_data', 'Unknown additional data ignored: "' + line.substring(0, m.index) + '"'));
|
|
90
91
|
}
|
|
91
92
|
var prop = m[1], val;
|
|
92
93
|
var valPos = m.index + m[0].length;
|
|
93
94
|
if(tag == 'begin' && prop == 'name') {
|
|
94
95
|
// special treatment of filename - the value is the rest of the line (can include spaces)
|
|
95
|
-
val = line.
|
|
96
|
+
val = line.substring(valPos);
|
|
96
97
|
line = '';
|
|
97
98
|
} else {
|
|
98
99
|
p = line.indexOf(' ', valPos);
|
|
99
|
-
val = (p<0 ? line.
|
|
100
|
-
line = line.
|
|
100
|
+
val = (p<0 ? line.substring(valPos) : line.substring(valPos, p));
|
|
101
|
+
line = line.substring(valPos + val.length +1);
|
|
101
102
|
}
|
|
102
103
|
if(prop in yprops) {
|
|
103
104
|
warnings.push(DecoderWarning('duplicate_property', 'Duplicate property encountered: `' + prop + '`'));
|
|
@@ -139,7 +140,7 @@ module.exports = {
|
|
|
139
140
|
prev = '\r\n';
|
|
140
141
|
|
|
141
142
|
if(Buffer.isBuffer(prev)) prev = prev.toString();
|
|
142
|
-
prev = prev.
|
|
143
|
+
prev = prev.slice(-4); // only care about the last 4 chars of previous state
|
|
143
144
|
if(prev == '\r\n.=') prev = '\r\n='; // aliased after dot stripped
|
|
144
145
|
if(data.length == 0) return {
|
|
145
146
|
read: 0,
|
|
@@ -151,7 +152,7 @@ module.exports = {
|
|
|
151
152
|
var state = decodePrev.indexOf(prev);
|
|
152
153
|
if(state < 0) {
|
|
153
154
|
for(var l=-3; l<0; i++) {
|
|
154
|
-
state = decodePrev.indexOf(prev.
|
|
155
|
+
state = decodePrev.indexOf(prev.slice(l));
|
|
155
156
|
if(state >= 0) break;
|
|
156
157
|
}
|
|
157
158
|
if(state < 0) state = decodePrev.indexOf('');
|
|
@@ -195,12 +196,13 @@ module.exports = {
|
|
|
195
196
|
|
|
196
197
|
if(!Buffer.isBuffer(data)) data = toBuffer(data);
|
|
197
198
|
|
|
198
|
-
filename = toBuffer(filename.replace(RE_BADCHAR, '').
|
|
199
|
+
filename = toBuffer(filename.replace(RE_BADCHAR, '').substring(0, 256), exports.encoding);
|
|
200
|
+
var e = encodeCrc(data, line_size);
|
|
199
201
|
return Buffer.concat([
|
|
200
202
|
toBuffer('=ybegin line='+line_size+' size='+data.length+' name='),
|
|
201
203
|
filename, nl,
|
|
202
|
-
|
|
203
|
-
toBuffer('\r\n=yend size='+data.length+' crc32=' +
|
|
204
|
+
e.output,
|
|
205
|
+
toBuffer('\r\n=yend size='+data.length+' crc32=' + e.crc32.toString('hex'))
|
|
204
206
|
]);
|
|
205
207
|
},
|
|
206
208
|
multi_post: function(filename, size, parts, line_size) {
|
|
@@ -214,7 +216,7 @@ module.exports = {
|
|
|
214
216
|
|
|
215
217
|
// find '=ybegin' to know where the yEnc data starts
|
|
216
218
|
var yencStart;
|
|
217
|
-
if(
|
|
219
|
+
if(bufferSlice.call(data, 0, 8).toString('hex') == '3d79626567696e20' /*=ybegin */) {
|
|
218
220
|
// common case: starts right at the beginning
|
|
219
221
|
yencStart = 0;
|
|
220
222
|
} else {
|
|
@@ -231,10 +233,10 @@ module.exports = {
|
|
|
231
233
|
var sp = yencStart;
|
|
232
234
|
var p = bufferFind(data, '\r\n', yencStart+8);
|
|
233
235
|
while(p > 0) {
|
|
234
|
-
var line =
|
|
236
|
+
var line = bufferSlice.call(data, sp, p).toString(this.encoding).trim();
|
|
235
237
|
lines.push(line);
|
|
236
238
|
sp = p+2;
|
|
237
|
-
if(line.
|
|
239
|
+
if(line.substring(0, 6) == '=yend ') { // no data in post
|
|
238
240
|
ret.yencEnd = sp;
|
|
239
241
|
break;
|
|
240
242
|
}
|
|
@@ -252,7 +254,7 @@ module.exports = {
|
|
|
252
254
|
var warnings = decoderParseLines(lines, ydata);
|
|
253
255
|
|
|
254
256
|
if(!ret.yencEnd) {
|
|
255
|
-
var yencEnd = bufferFindRev(
|
|
257
|
+
var yencEnd = bufferFindRev(bufferSlice.call(data, ret.dataStart), '\r\n=yend ');
|
|
256
258
|
if(yencEnd < 0)
|
|
257
259
|
return DecoderError('no_end_found', 'yEnd end marker not found');
|
|
258
260
|
|
|
@@ -265,7 +267,7 @@ module.exports = {
|
|
|
265
267
|
ret.yencEnd = p;
|
|
266
268
|
} else
|
|
267
269
|
ret.yencEnd = p+2;
|
|
268
|
-
var endLine =
|
|
270
|
+
var endLine = bufferSlice.call(data, yencEnd+2, p).toString(this.encoding).trim();
|
|
269
271
|
|
|
270
272
|
warnings = warnings.concat(decoderParseLines([endLine], ydata));
|
|
271
273
|
}
|
|
@@ -321,7 +323,7 @@ module.exports = {
|
|
|
321
323
|
warnings.push(DecoderWarning('size_mismatch', 'Size specified for part exceeds size specified for whole file'));
|
|
322
324
|
|
|
323
325
|
if(ret.dataStart) {
|
|
324
|
-
ret.data = y.decode(
|
|
326
|
+
ret.data = y.decode(bufferSlice.call(data, ret.dataStart, ret.dataEnd), !!isRaw);
|
|
325
327
|
ret.crc32 = y.crc32(ret.data);
|
|
326
328
|
var hexCrc = ret.crc32.toString('hex');
|
|
327
329
|
|
|
@@ -360,7 +362,7 @@ function YEncoder(filename, size, parts, line_size) {
|
|
|
360
362
|
this.pos = 0;
|
|
361
363
|
this.crc = toBuffer([0,0,0,0]);
|
|
362
364
|
|
|
363
|
-
filename = toBuffer(filename.replace(RE_BADCHAR, '').
|
|
365
|
+
filename = toBuffer(filename.replace(RE_BADCHAR, '').substring(0, 256), exports.encoding);
|
|
364
366
|
if(parts > 1) {
|
|
365
367
|
this.yInfo = Buffer.concat([
|
|
366
368
|
toBuffer(' total='+parts+' line='+line_size+' size='+size+' name='),
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "yencode",
|
|
3
|
-
"version": "1.1.
|
|
3
|
+
"version": "1.1.4",
|
|
4
4
|
"description": "SIMD accelerated yEnc encoder/decoder and CRC32 calculator",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"yenc",
|
|
@@ -21,6 +21,7 @@
|
|
|
21
21
|
"install": "node-gyp rebuild"
|
|
22
22
|
},
|
|
23
23
|
"gypfile": true,
|
|
24
|
+
"type": "commonjs",
|
|
24
25
|
"bugs": {
|
|
25
26
|
"url": "https://github.com/animetosho/node-yencode/issues"
|
|
26
27
|
},
|
package/src/common.h
CHANGED
|
@@ -57,18 +57,18 @@
|
|
|
57
57
|
|
|
58
58
|
|
|
59
59
|
// MSVC compatibility
|
|
60
|
-
#if ((defined(_M_IX86_FP) && _M_IX86_FP == 2) || defined(_M_X64)) && !defined(__clang__)
|
|
60
|
+
#if ((defined(_M_IX86_FP) && _M_IX86_FP == 2) || defined(_M_X64)) && defined(_MSC_VER) && !defined(__clang__)
|
|
61
61
|
#define __SSE2__ 1
|
|
62
62
|
#define __SSSE3__ 1
|
|
63
63
|
#define __SSE4_1__ 1
|
|
64
|
-
#if
|
|
64
|
+
#if _MSC_VER >= 1600 && defined(__SSE2__)
|
|
65
65
|
#define __POPCNT__ 1
|
|
66
66
|
#define __LZCNT__ 1
|
|
67
67
|
#endif
|
|
68
68
|
#if !defined(__AVX__) && (_MSC_VER >= 1700 && defined(__SSE2__))
|
|
69
69
|
#define __AVX__ 1
|
|
70
70
|
#endif
|
|
71
|
-
#if !defined(__AVX2__) && (_MSC_VER >= 1800 && defined(
|
|
71
|
+
#if !defined(__AVX2__) && (_MSC_VER >= 1800 && defined(__AVX__))
|
|
72
72
|
#define __AVX2__ 1
|
|
73
73
|
#define __BMI2__ 1
|
|
74
74
|
#endif
|
|
@@ -145,6 +145,13 @@
|
|
|
145
145
|
|
|
146
146
|
#endif
|
|
147
147
|
|
|
148
|
+
#if defined(__ARM_NEON) && defined(__has_include)
|
|
149
|
+
# if !__has_include(<arm_neon.h>)
|
|
150
|
+
# undef __ARM_NEON
|
|
151
|
+
HEDLEY_WARNING("NEON has been disabled due to missing arm_neon.h");
|
|
152
|
+
# endif
|
|
153
|
+
#endif
|
|
154
|
+
|
|
148
155
|
#ifdef __ARM_NEON
|
|
149
156
|
# include <arm_neon.h>
|
|
150
157
|
|
|
@@ -216,14 +223,15 @@ bool cpu_supports_neon();
|
|
|
216
223
|
enum YEncDecIsaLevel {
|
|
217
224
|
ISA_FEATURE_POPCNT = 0x1,
|
|
218
225
|
ISA_FEATURE_LZCNT = 0x2,
|
|
226
|
+
ISA_FEATURE_EVEX512 = 0x4, // AVX512 support
|
|
219
227
|
ISA_LEVEL_SSE2 = 0x100,
|
|
220
228
|
ISA_LEVEL_SSSE3 = 0x200,
|
|
221
229
|
ISA_LEVEL_SSE41 = 0x300,
|
|
222
230
|
ISA_LEVEL_SSE4_POPCNT = 0x301,
|
|
223
231
|
ISA_LEVEL_AVX = 0x381, // same as above, just used as a differentiator for `cpu_supports_isa`
|
|
224
232
|
ISA_LEVEL_AVX2 = 0x403, // also includes BMI1/2 and LZCNT
|
|
225
|
-
ISA_LEVEL_AVX3 =
|
|
226
|
-
ISA_LEVEL_VBMI2 = 0x603 // ICL
|
|
233
|
+
ISA_LEVEL_AVX3 = 0x507, // SKX variant; AVX512VL + AVX512BW
|
|
234
|
+
ISA_LEVEL_VBMI2 = 0x603 // ICL, AVX10
|
|
227
235
|
};
|
|
228
236
|
#ifdef _MSC_VER
|
|
229
237
|
// native tuning not supported in MSVC
|
|
@@ -256,6 +264,16 @@ enum YEncDecIsaLevel {
|
|
|
256
264
|
int cpu_supports_isa();
|
|
257
265
|
#endif // PLATFORM_X86
|
|
258
266
|
|
|
267
|
+
|
|
268
|
+
#ifdef __riscv
|
|
269
|
+
bool cpu_supports_rvv();
|
|
270
|
+
#endif
|
|
271
|
+
#if defined(__riscv_vector) && defined(HEDLEY_GCC_VERSION) && !HEDLEY_GCC_VERSION_CHECK(13,0,0)
|
|
272
|
+
// GCC added RVV intrinsics in GCC13
|
|
273
|
+
# undef __riscv_vector
|
|
274
|
+
#endif
|
|
275
|
+
|
|
276
|
+
|
|
259
277
|
#include <string.h>
|
|
260
278
|
#if !defined(_MSC_VER) || defined(_STDINT) || _MSC_VER >= 1900
|
|
261
279
|
# include <stdint.h>
|
package/src/crc.cc
CHANGED
|
@@ -3,11 +3,127 @@
|
|
|
3
3
|
#include "interface.h"
|
|
4
4
|
crcutil_interface::CRC* crc = NULL;
|
|
5
5
|
|
|
6
|
+
#if defined(PLATFORM_X86) && !defined(__ILP32__)
|
|
6
7
|
static uint32_t do_crc32_incremental_generic(const void* data, size_t length, uint32_t init) {
|
|
8
|
+
// use optimised ASM on x86 platforms
|
|
7
9
|
crcutil_interface::UINT64 tmp = init;
|
|
8
10
|
crc->Compute(data, length, &tmp);
|
|
9
11
|
return (uint32_t)tmp;
|
|
10
12
|
}
|
|
13
|
+
#else
|
|
14
|
+
static uint32_t* HEDLEY_RESTRICT crc_slice_table;
|
|
15
|
+
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
|
16
|
+
# if defined(__GNUC__) || defined(__clang__)
|
|
17
|
+
# define bswap32 __builtin_bswap32
|
|
18
|
+
# else
|
|
19
|
+
static inline uint32_t bswap32(uint32_t x) {
|
|
20
|
+
return (x >> 24) | ((x >> 8) & 0x0000FF00) | ((x << 8) & 0x00FF0000) | (x << 24);
|
|
21
|
+
}
|
|
22
|
+
# endif
|
|
23
|
+
#endif
|
|
24
|
+
|
|
25
|
+
#define CRC32_GENERIC_CHAINS 4 // newer processors may prefer 8
|
|
26
|
+
static uint32_t do_crc32_incremental_generic(const void* data, size_t length, uint32_t init) {
|
|
27
|
+
const uint32_t* crc_base_table = crc_slice_table + 4*256; // this also seems to help MSVC's optimiser, which otherwise keeps trying to add to crc_slice_table every time it's referenced
|
|
28
|
+
uint32_t crc[CRC32_GENERIC_CHAINS]; // Clang seems to be more spill happy with an array over individual variables :(
|
|
29
|
+
crc[0] = ~init;
|
|
30
|
+
uint8_t* current8 = (uint8_t*)data;
|
|
31
|
+
|
|
32
|
+
// align to multiple of 4
|
|
33
|
+
if(((uintptr_t)current8 & 1) && length >= 1) {
|
|
34
|
+
crc[0] = (crc[0] >> 8) ^ crc_base_table[(crc[0] & 0xFF) ^ *current8++];
|
|
35
|
+
length--;
|
|
36
|
+
}
|
|
37
|
+
if(((uintptr_t)current8 & 2) && length >= 2) {
|
|
38
|
+
crc[0] = (crc[0] >> 8) ^ crc_base_table[(crc[0] & 0xFF) ^ *current8++];
|
|
39
|
+
crc[0] = (crc[0] >> 8) ^ crc_base_table[(crc[0] & 0xFF) ^ *current8++];
|
|
40
|
+
length -= 2;
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
uint8_t* end8 = current8 + length;
|
|
44
|
+
uint32_t* current = (uint32_t*)current8;
|
|
45
|
+
if(length >= 8*CRC32_GENERIC_CHAINS-4) {
|
|
46
|
+
size_t lenMain = ((length-(CRC32_GENERIC_CHAINS-1)*4) / 4);
|
|
47
|
+
uint32_t* end = current + (lenMain / CRC32_GENERIC_CHAINS) * CRC32_GENERIC_CHAINS;
|
|
48
|
+
for(int c=1; c<CRC32_GENERIC_CHAINS; c++)
|
|
49
|
+
crc[c] = 0;
|
|
50
|
+
while(current != end) {
|
|
51
|
+
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
|
52
|
+
#define CRC_PROC4(v, in) \
|
|
53
|
+
v ^= bswap32(in); \
|
|
54
|
+
v = crc_slice_table[v >> 24] ^ crc_slice_table[0x100L + ((v >> 16) & 0xff)] ^ crc_slice_table[0x200L + ((v >> 8) & 0xff)] ^ crc_slice_table[0x300L + (v & 0xff)]
|
|
55
|
+
#else
|
|
56
|
+
#define CRC_PROC4(v, in) \
|
|
57
|
+
v ^= (in); \
|
|
58
|
+
v = crc_slice_table[v >> 24] ^ crc_slice_table[0x100L + ((v >> 16) & 0xff)] ^ crc_slice_table[0x200L + ((v >> 8) & 0xff)] ^ crc_slice_table[0x300L + (v & 0xff)]
|
|
59
|
+
#endif
|
|
60
|
+
for(int c=0; c<CRC32_GENERIC_CHAINS; c++) {
|
|
61
|
+
CRC_PROC4(crc[c], *current);
|
|
62
|
+
current++;
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
// aggregate accumulators
|
|
66
|
+
current8 = (uint8_t*)current;
|
|
67
|
+
#if (CRC32_GENERIC_CHAINS & (CRC32_GENERIC_CHAINS-1)) == 0
|
|
68
|
+
// assume that lengths which are a multiple of 4/8/16/32 are common
|
|
69
|
+
if((end8 - current8) & (CRC32_GENERIC_CHAINS*4)) {
|
|
70
|
+
CRC_PROC4(crc[0], *current);
|
|
71
|
+
current8 += 4;
|
|
72
|
+
|
|
73
|
+
for(int c=1; c<CRC32_GENERIC_CHAINS; c++) {
|
|
74
|
+
for(int i=0; i<4; i++)
|
|
75
|
+
crc[c] = (crc[c] >> 8) ^ crc_base_table[(crc[c] & 0xff) ^ *current8++];
|
|
76
|
+
crc[(c+1) & ~CRC32_GENERIC_CHAINS] ^= crc[c];
|
|
77
|
+
}
|
|
78
|
+
} else
|
|
79
|
+
#endif
|
|
80
|
+
#undef CRC_PROC4
|
|
81
|
+
for(int c=1; c<CRC32_GENERIC_CHAINS; c++) {
|
|
82
|
+
for(int i=0; i<4; i++)
|
|
83
|
+
crc[0] = (crc[0] >> 8) ^ crc_base_table[(crc[0] & 0xff) ^ *current8++];
|
|
84
|
+
crc[0] ^= crc[c];
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
// tail loop
|
|
89
|
+
while(current8 != end8) {
|
|
90
|
+
crc[0] = (crc[0] >> 8) ^ crc_base_table[(crc[0] & 0xFF) ^ *current8++];
|
|
91
|
+
}
|
|
92
|
+
return ~crc[0];
|
|
93
|
+
}
|
|
94
|
+
static void generate_crc32_slice_table() {
|
|
95
|
+
crc_slice_table = (uint32_t*)malloc(5*256*sizeof(uint32_t));
|
|
96
|
+
// generate standard byte-by-byte table
|
|
97
|
+
uint32_t* crc_base_table = crc_slice_table + 4*256;
|
|
98
|
+
for(int v=0; v<256; v++) {
|
|
99
|
+
uint32_t crc = v;
|
|
100
|
+
for(int j = 0; j < 8; j++) {
|
|
101
|
+
crc = (crc >> 1) ^ (-(int32_t)(crc & 1) & 0xEDB88320);
|
|
102
|
+
}
|
|
103
|
+
crc_base_table[v] = crc;
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
// generate slice-by-4 shifted across for X independent chains
|
|
107
|
+
for(int v=0; v<256; v++) {
|
|
108
|
+
uint32_t crc = crc_base_table[v];
|
|
109
|
+
#if CRC32_GENERIC_CHAINS > 1
|
|
110
|
+
for(int i=0; i<4*CRC32_GENERIC_CHAINS-5; i++)
|
|
111
|
+
crc = (crc >> 8) ^ crc_base_table[crc & 0xff];
|
|
112
|
+
for(int i=0; i<4; i++) {
|
|
113
|
+
crc = (crc >> 8) ^ crc_base_table[crc & 0xff];
|
|
114
|
+
crc_slice_table[i*256 + v] = crc;
|
|
115
|
+
}
|
|
116
|
+
#else
|
|
117
|
+
for(int i=0; i<4; i++) {
|
|
118
|
+
crc_slice_table[i*256 + v] = crc;
|
|
119
|
+
crc = (crc >> 8) ^ crc_base_table[crc & 0xff];
|
|
120
|
+
}
|
|
121
|
+
#endif
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
#endif
|
|
125
|
+
|
|
126
|
+
|
|
11
127
|
crc_func _do_crc32_incremental = &do_crc32_incremental_generic;
|
|
12
128
|
|
|
13
129
|
|
|
@@ -39,22 +155,23 @@ int cpu_supports_crc_isa();
|
|
|
39
155
|
#ifdef PLATFORM_ARM
|
|
40
156
|
# ifdef __ANDROID__
|
|
41
157
|
# include <cpu-features.h>
|
|
42
|
-
# elif defined(__linux__) || (defined(__FreeBSD__) && __FreeBSD__ >= 12)
|
|
43
|
-
# include <sys/auxv.h>
|
|
44
|
-
# include <asm/hwcap.h>
|
|
45
|
-
# elif (defined(__FreeBSD__) && __FreeBSD__ < 12)
|
|
46
|
-
# include <sys/sysctl.h>
|
|
47
|
-
# include <asm/hwcap.h>
|
|
48
158
|
# elif defined(__APPLE__)
|
|
49
159
|
# include <sys/types.h>
|
|
50
160
|
# include <sys/sysctl.h>
|
|
51
|
-
#
|
|
52
|
-
#
|
|
161
|
+
# elif defined(__has_include)
|
|
162
|
+
# if __has_include(<sys/auxv.h>)
|
|
163
|
+
# include <sys/auxv.h>
|
|
164
|
+
# ifdef __FreeBSD__
|
|
53
165
|
static unsigned long getauxval(unsigned long cap) {
|
|
54
166
|
unsigned long ret;
|
|
55
167
|
elf_aux_info(cap, &ret, sizeof(ret));
|
|
56
168
|
return ret;
|
|
57
169
|
}
|
|
170
|
+
# endif
|
|
171
|
+
# if __has_include(<asm/hwcap.h>)
|
|
172
|
+
# include <asm/hwcap.h>
|
|
173
|
+
# endif
|
|
174
|
+
# endif
|
|
58
175
|
# endif
|
|
59
176
|
#endif
|
|
60
177
|
void crc_init() {
|
|
@@ -62,6 +179,10 @@ void crc_init() {
|
|
|
62
179
|
0xEDB88320, 0, 32, true, 0, 0, 0, 0, NULL);
|
|
63
180
|
// instance never deleted... oh well...
|
|
64
181
|
|
|
182
|
+
#if !defined(PLATFORM_X86) || defined(__ILP32__)
|
|
183
|
+
generate_crc32_slice_table();
|
|
184
|
+
#endif
|
|
185
|
+
|
|
65
186
|
#ifdef PLATFORM_X86
|
|
66
187
|
int support = cpu_supports_crc_isa();
|
|
67
188
|
if(support == 2)
|
package/src/crc_arm.cc
CHANGED
|
@@ -16,6 +16,12 @@ HEDLEY_WARNING("CRC32 acceleration has been disabled due to broken arm_acle.h sh
|
|
|
16
16
|
HEDLEY_WARNING("CRC32 acceleration has been disabled due to broken arm_acle.h shipped in GCC 9.4 [https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100985]. If you need this feature, please use a different compiler or version of GCC");
|
|
17
17
|
# endif
|
|
18
18
|
#endif
|
|
19
|
+
#if defined(__ARM_FEATURE_CRC32) && defined(__has_include)
|
|
20
|
+
# if !__has_include(<arm_acle.h>)
|
|
21
|
+
# undef __ARM_FEATURE_CRC32
|
|
22
|
+
HEDLEY_WARNING("CRC32 acceleration has been disabled due to missing arm_acle.h");
|
|
23
|
+
# endif
|
|
24
|
+
#endif
|
|
19
25
|
|
|
20
26
|
#if defined(__ARM_FEATURE_CRC32) || (defined(_M_ARM64) && !defined(__clang__)) // MSVC doesn't support CRC for ARM32
|
|
21
27
|
|
|
@@ -73,7 +79,7 @@ static HEDLEY_ALWAYS_INLINE uint32_t crc_multiply(uint32_t a, uint32_t b) {
|
|
|
73
79
|
return res;
|
|
74
80
|
}
|
|
75
81
|
|
|
76
|
-
static const uint32_t crc_power[] = { // pre-computed 2^n, with first 3 entries removed (saves a shift)
|
|
82
|
+
static const uint32_t crc_power[] = { // pre-computed 2^(2^n), with first 3 entries removed (saves a shift)
|
|
77
83
|
0x00800000, 0x00008000, 0xedb88320, 0xb1e6b092, 0xa06a2517, 0xed627dae, 0x88d14467, 0xd7bbfe6a,
|
|
78
84
|
0xec447f11, 0x8e7ea170, 0x6427800e, 0x4d47bae0, 0x09fe548f, 0x83852d0f, 0x30362f1a, 0x7b5a9cc3,
|
|
79
85
|
0x31fec169, 0x9fec022a, 0x6c8dedc4, 0x15d6874d, 0x5fde7a4e, 0xbad90e37, 0x2e4e5eef, 0x4eaba214,
|
package/src/crc_folding_256.cc
CHANGED
|
@@ -26,10 +26,9 @@ static __m256i do_one_fold(__m256i src, __m256i data) {
|
|
|
26
26
|
0x96
|
|
27
27
|
);
|
|
28
28
|
#else
|
|
29
|
-
return _mm256_xor_si256(
|
|
30
|
-
_mm256_clmulepi64_epi128(src, fold4, 0x01)
|
|
31
|
-
|
|
32
|
-
));
|
|
29
|
+
return _mm256_xor_si256(_mm256_xor_si256(
|
|
30
|
+
data, _mm256_clmulepi64_epi128(src, fold4, 0x01)
|
|
31
|
+
), _mm256_clmulepi64_epi128(src, fold4, 0x10));
|
|
33
32
|
#endif
|
|
34
33
|
}
|
|
35
34
|
|
|
@@ -38,7 +37,7 @@ ALIGN_TO(32, static const uint8_t pshufb_rot_table[]) = {
|
|
|
38
37
|
16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
|
|
39
38
|
};
|
|
40
39
|
// _mm256_castsi128_si256, but upper is defined to be 0
|
|
41
|
-
#if (defined(__clang__) && __clang_major__ >= 5 && (!defined(__APPLE__) || __clang_major__ >= 7)) || (defined(__GNUC__) && __GNUC__ >= 10)
|
|
40
|
+
#if (defined(__clang__) && __clang_major__ >= 5 && (!defined(__APPLE__) || __clang_major__ >= 7)) || (defined(__GNUC__) && __GNUC__ >= 10) || (defined(_MSC_VER) && _MSC_VER >= 1910)
|
|
42
41
|
// intrinsic unsupported in GCC 9 and MSVC < 2017
|
|
43
42
|
# define zext128_256 _mm256_zextsi128_si256
|
|
44
43
|
#else
|
package/src/decoder.cc
CHANGED
|
@@ -4,9 +4,9 @@
|
|
|
4
4
|
#include "decoder.h"
|
|
5
5
|
|
|
6
6
|
extern "C" {
|
|
7
|
-
YencDecoderEnd (*_do_decode)(const unsigned char
|
|
8
|
-
YencDecoderEnd (*_do_decode_raw)(const unsigned char
|
|
9
|
-
YencDecoderEnd (*_do_decode_end_raw)(const unsigned char
|
|
7
|
+
YencDecoderEnd (*_do_decode)(const unsigned char**, unsigned char**, size_t, YencDecoderState*) = &do_decode_scalar<false, false>;
|
|
8
|
+
YencDecoderEnd (*_do_decode_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*) = &do_decode_scalar<true, false>;
|
|
9
|
+
YencDecoderEnd (*_do_decode_end_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*) = &do_decode_end_scalar<true>;
|
|
10
10
|
}
|
|
11
11
|
|
|
12
12
|
void decoder_set_sse2_funcs();
|
|
@@ -14,6 +14,7 @@ void decoder_set_ssse3_funcs();
|
|
|
14
14
|
void decoder_set_avx_funcs();
|
|
15
15
|
void decoder_set_avx2_funcs();
|
|
16
16
|
void decoder_set_vbmi2_funcs();
|
|
17
|
+
extern const bool decoder_has_avx10;
|
|
17
18
|
void decoder_set_neon_funcs();
|
|
18
19
|
|
|
19
20
|
|
|
@@ -45,7 +46,7 @@ void decoder_init() {
|
|
|
45
46
|
decoder_set_native_funcs();
|
|
46
47
|
# else
|
|
47
48
|
int use_isa = cpu_supports_isa();
|
|
48
|
-
if(use_isa >= ISA_LEVEL_VBMI2)
|
|
49
|
+
if(use_isa >= ISA_LEVEL_VBMI2 && (decoder_has_avx10 || (use_isa & ISA_FEATURE_EVEX512)))
|
|
49
50
|
decoder_set_vbmi2_funcs();
|
|
50
51
|
else if(use_isa >= ISA_LEVEL_AVX2)
|
|
51
52
|
decoder_set_avx2_funcs();
|
package/src/decoder.h
CHANGED
|
@@ -29,17 +29,17 @@ typedef enum {
|
|
|
29
29
|
|
|
30
30
|
#include "hedley.h"
|
|
31
31
|
|
|
32
|
-
extern YencDecoderEnd (*_do_decode)(const unsigned char
|
|
33
|
-
extern YencDecoderEnd (*_do_decode_raw)(const unsigned char
|
|
34
|
-
extern YencDecoderEnd (*_do_decode_end_raw)(const unsigned char
|
|
32
|
+
extern YencDecoderEnd (*_do_decode)(const unsigned char**, unsigned char**, size_t, YencDecoderState*);
|
|
33
|
+
extern YencDecoderEnd (*_do_decode_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*);
|
|
34
|
+
extern YencDecoderEnd (*_do_decode_end_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*);
|
|
35
35
|
|
|
36
|
-
static inline size_t do_decode(int isRaw, const unsigned char*
|
|
36
|
+
static inline size_t do_decode(int isRaw, const unsigned char* src, unsigned char* dest, size_t len, YencDecoderState* state) {
|
|
37
37
|
unsigned char* ds = dest;
|
|
38
38
|
(*(isRaw ? _do_decode_raw : _do_decode))(&src, &ds, len, state);
|
|
39
39
|
return ds - dest;
|
|
40
40
|
}
|
|
41
41
|
|
|
42
|
-
static inline YencDecoderEnd do_decode_end(const unsigned char
|
|
42
|
+
static inline YencDecoderEnd do_decode_end(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
|
|
43
43
|
return _do_decode_end_raw(src, dest, len, state);
|
|
44
44
|
}
|
|
45
45
|
|