yencode 1.1.3 → 1.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -2
- package/binding.gyp +75 -12
- package/index.js +21 -19
- package/package.json +2 -1
- package/src/common.h +43 -5
- package/src/crc.cc +137 -15
- package/src/crc.h +4 -0
- package/src/crc_arm.cc +11 -6
- package/src/crc_folding.cc +4 -5
- package/src/crc_folding_256.cc +10 -10
- package/src/decoder.cc +9 -4
- package/src/decoder.h +9 -5
- package/src/decoder_avx.cc +1 -0
- package/src/decoder_avx2.cc +1 -0
- package/src/decoder_avx2_base.h +14 -18
- package/src/decoder_common.h +30 -5
- package/src/decoder_neon.cc +7 -13
- package/src/decoder_neon64.cc +7 -12
- package/src/decoder_sse2.cc +1 -0
- package/src/decoder_sse_base.h +15 -14
- package/src/decoder_ssse3.cc +1 -0
- package/src/decoder_vbmi2.cc +9 -0
- package/src/encoder.cc +10 -1
- package/src/encoder.h +4 -0
- package/src/encoder_avx.cc +1 -0
- package/src/encoder_avx2.cc +1 -0
- package/src/encoder_avx_base.h +22 -14
- package/src/encoder_neon.cc +40 -40
- package/src/encoder_rvv.cc +220 -0
- package/src/encoder_sse2.cc +1 -0
- package/src/encoder_sse_base.h +3 -3
- package/src/encoder_ssse3.cc +1 -0
- package/src/encoder_vbmi2.cc +9 -0
- package/src/hedley.h +278 -135
- package/src/platform.cc +57 -9
- package/src/test_alignalloc.c +6 -0
- package/test/_speedbase.js +12 -11
- package/test/speeddec.js +6 -5
- package/test/testcrc.js +2 -2
- package/test/testdec.js +31 -15
- package/test/testenc.js +11 -8
- package/test/testpostdec.js +6 -5
package/README.md
CHANGED
|
@@ -129,7 +129,8 @@ int decodeTo(Buffer data, Buffer output, bool stripDots=false)
|
|
|
129
129
|
Same as above, but instead of returning a Buffer, writes it to the supplied
|
|
130
130
|
*output* Buffer. Returns the length of the decoded data.
|
|
131
131
|
Note that the *output* Buffer must be at least large enough to hold the largest
|
|
132
|
-
possible output size (i.e. length of the input), otherwise an error is thrown.
|
|
132
|
+
possible output size (i.e. length of the input), otherwise an error is thrown.
|
|
133
|
+
The *data* and *output* Buffers can be the same, for in-situ decoding.
|
|
133
134
|
|
|
134
135
|
Object decodeChunk\(Buffer data \[, string state=null\]\[, Buffer output\]\)
|
|
135
136
|
-----------------------------------------------------------------------------
|
|
@@ -142,7 +143,7 @@ designed to incrementally process a stream from the network, and will perform NN
|
|
|
142
143
|
*state* is the current state of the incremental decode. Set to *null* if this is starting the decode of a new article, otherwise this should be set to the value of *state* given from the previous invocation of *decodeChunk*
|
|
143
144
|
If *output* is supplied, the output will be written here \(see *decodeTo* for notes
|
|
144
145
|
on required size\), otherwise a new buffer will be created where the output will be
|
|
145
|
-
written to.
|
|
146
|
+
written to. The *data* and *output* Buffers can be the same, for in-situ decoding.
|
|
146
147
|
|
|
147
148
|
Returns an object with the following keys:
|
|
148
149
|
|
package/binding.gyp
CHANGED
|
@@ -43,10 +43,20 @@
|
|
|
43
43
|
}],
|
|
44
44
|
['OS!="win" and enable_native_tuning!=0', {
|
|
45
45
|
"defines": ["YENC_BUILD_NATIVE=1"]
|
|
46
|
+
}],
|
|
47
|
+
['OS!="win"', {
|
|
48
|
+
"variables": {
|
|
49
|
+
"missing_memalign%": "<!(<!(echo ${CC_target:-${CC:-cc}}) -c src/test_alignalloc.c -o /dev/null -Werror 2>/dev/null || echo failed)",
|
|
50
|
+
},
|
|
51
|
+
"conditions": [
|
|
52
|
+
['missing_memalign!=""', {
|
|
53
|
+
"defines": ["_POSIX_C_SOURCE=200112L"],
|
|
54
|
+
}]
|
|
55
|
+
]
|
|
46
56
|
}]
|
|
47
57
|
],
|
|
48
58
|
"cflags": ["-Wno-unused-function"],
|
|
49
|
-
"cxxflags": ["-Wno-unused-function"],
|
|
59
|
+
"cxxflags": ["-Wno-unused-function", "-std=c++03", "-D_POSIX_C_SOURCE=200112L"],
|
|
50
60
|
"xcode_settings": {
|
|
51
61
|
"OTHER_CFLAGS": ["-Wno-unused-function"],
|
|
52
62
|
"OTHER_CXXFLAGS": ["-Wno-unused-function"]
|
|
@@ -64,7 +74,7 @@
|
|
|
64
74
|
"targets": [
|
|
65
75
|
{
|
|
66
76
|
"target_name": "yencode",
|
|
67
|
-
"dependencies": ["crcutil", "yencode_sse2", "yencode_ssse3", "yencode_clmul", "yencode_clmul256", "yencode_avx", "yencode_avx2", "yencode_vbmi2", "yencode_neon", "yencode_armcrc"],
|
|
77
|
+
"dependencies": ["crcutil", "yencode_sse2", "yencode_ssse3", "yencode_clmul", "yencode_clmul256", "yencode_avx", "yencode_avx2", "yencode_vbmi2", "yencode_neon", "yencode_armcrc", "yencode_rvv"],
|
|
68
78
|
"sources": [
|
|
69
79
|
"src/yencode.cc",
|
|
70
80
|
"src/platform.cc",
|
|
@@ -221,7 +231,7 @@
|
|
|
221
231
|
"msvs_settings": {"VCCLCompilerTool": {"BufferSecurityCheck": "false"}},
|
|
222
232
|
"conditions": [
|
|
223
233
|
['target_arch in "ia32 x64" and OS!="win"', {
|
|
224
|
-
"variables": {"supports_vpclmul%": "<!(<!(echo ${
|
|
234
|
+
"variables": {"supports_vpclmul%": "<!(<!(echo ${CXX_target:-${CXX:-c++}}) -MM -E src/crc_folding_256.cc -mavx2 -mvpclmulqdq 2>/dev/null || true)"},
|
|
225
235
|
"conditions": [
|
|
226
236
|
['supports_vpclmul!=""', {
|
|
227
237
|
"cflags": ["-mavx2", "-mvpclmulqdq", "-mpclmul"],
|
|
@@ -253,7 +263,10 @@
|
|
|
253
263
|
"msvs_settings": {"VCCLCompilerTool": {"BufferSecurityCheck": "false"}},
|
|
254
264
|
"conditions": [
|
|
255
265
|
['target_arch in "ia32 x64" and OS!="win"', {
|
|
256
|
-
"variables": {
|
|
266
|
+
"variables": {
|
|
267
|
+
"supports_vbmi2%": "<!(<!(echo ${CXX_target:-${CXX:-c++}}) -MM -E src/encoder_vbmi2.cc -mavx512vl -mavx512vbmi2 2>/dev/null || true)",
|
|
268
|
+
"supports_avx10%": "<!(<!(echo ${CXX_target:-${CXX:-c++}}) -MM -E src/encoder_vbmi2.cc -mavx512vl -mno-evex512 2>/dev/null || true)"
|
|
269
|
+
},
|
|
257
270
|
"conditions": [
|
|
258
271
|
['supports_vbmi2!=""', {
|
|
259
272
|
"cflags": ["-mavx512vbmi2", "-mavx512vl", "-mavx512bw", "-mpopcnt", "-mbmi", "-mbmi2", "-mlzcnt"],
|
|
@@ -262,6 +275,14 @@
|
|
|
262
275
|
"OTHER_CFLAGS": ["-mavx512vbmi2", "-mavx512vl", "-mavx512bw", "-mpopcnt", "-mbmi", "-mbmi2", "-mlzcnt"],
|
|
263
276
|
"OTHER_CXXFLAGS": ["-mavx512vbmi2", "-mavx512vl", "-mavx512bw", "-mpopcnt", "-mbmi", "-mbmi2", "-mlzcnt"],
|
|
264
277
|
}
|
|
278
|
+
}],
|
|
279
|
+
['supports_avx10!=""', {
|
|
280
|
+
"cflags": ["-mno-evex512"],
|
|
281
|
+
"cxxflags": ["-mno-evex512"],
|
|
282
|
+
"xcode_settings": {
|
|
283
|
+
"OTHER_CFLAGS": ["-mno-evex512"],
|
|
284
|
+
"OTHER_CXXFLAGS": ["-mno-evex512"],
|
|
285
|
+
}
|
|
265
286
|
}]
|
|
266
287
|
]
|
|
267
288
|
}],
|
|
@@ -285,11 +306,11 @@
|
|
|
285
306
|
"msvs_settings": {"VCCLCompilerTool": {"BufferSecurityCheck": "false"}},
|
|
286
307
|
"conditions": [
|
|
287
308
|
['target_arch=="arm"', {
|
|
288
|
-
"cflags": ["-mfpu=neon"],
|
|
289
|
-
"cxxflags": ["-mfpu=neon"],
|
|
309
|
+
"cflags": ["-mfpu=neon","-fno-lto"],
|
|
310
|
+
"cxxflags": ["-mfpu=neon","-fno-lto"],
|
|
290
311
|
"xcode_settings": {
|
|
291
|
-
"OTHER_CFLAGS": ["-mfpu=neon"],
|
|
292
|
-
"OTHER_CXXFLAGS": ["-mfpu=neon"],
|
|
312
|
+
"OTHER_CFLAGS": ["-mfpu=neon","-fno-lto"],
|
|
313
|
+
"OTHER_CXXFLAGS": ["-mfpu=neon","-fno-lto"],
|
|
293
314
|
}
|
|
294
315
|
}],
|
|
295
316
|
['target_arch=="arm64"', {
|
|
@@ -299,6 +320,48 @@
|
|
|
299
320
|
}]
|
|
300
321
|
]
|
|
301
322
|
},
|
|
323
|
+
{
|
|
324
|
+
"target_name": "yencode_rvv",
|
|
325
|
+
"type": "static_library",
|
|
326
|
+
"sources": [
|
|
327
|
+
"src/encoder_rvv.cc"
|
|
328
|
+
],
|
|
329
|
+
"cflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
|
|
330
|
+
"cxxflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
|
|
331
|
+
"xcode_settings": {
|
|
332
|
+
"OTHER_CFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
|
|
333
|
+
"OTHER_CXXFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"]
|
|
334
|
+
},
|
|
335
|
+
"msvs_settings": {"VCCLCompilerTool": {"BufferSecurityCheck": "false"}},
|
|
336
|
+
"conditions": [
|
|
337
|
+
['target_arch=="riscv64" and OS!="win"', {
|
|
338
|
+
"variables": {"supports_rvv%": "<!(<!(echo ${CXX_target:-${CXX:-c++}}) -MM -E src/encoder_rvv.cc -march=rv64gcv 2>/dev/null || true)"},
|
|
339
|
+
"conditions": [
|
|
340
|
+
['supports_rvv!=""', {
|
|
341
|
+
"cflags": ["-march=rv64gcv"],
|
|
342
|
+
"cxxflags": ["-march=rv64gcv"],
|
|
343
|
+
"xcode_settings": {
|
|
344
|
+
"OTHER_CFLAGS": ["-march=rv64gcv"],
|
|
345
|
+
"OTHER_CXXFLAGS": ["-march=rv64gcv"],
|
|
346
|
+
}
|
|
347
|
+
}]
|
|
348
|
+
]
|
|
349
|
+
}],
|
|
350
|
+
['target_arch=="riscv32" and OS!="win"', {
|
|
351
|
+
"variables": {"supports_rvv%": "<!(<!(echo ${CXX_target:-${CXX:-c++}}) -MM -E src/encoder_rvv.cc -march=rv32gcv 2>/dev/null || true)"},
|
|
352
|
+
"conditions": [
|
|
353
|
+
['supports_rvv!=""', {
|
|
354
|
+
"cflags": ["-march=rv32gcv"],
|
|
355
|
+
"cxxflags": ["-march=rv32gcv"],
|
|
356
|
+
"xcode_settings": {
|
|
357
|
+
"OTHER_CFLAGS": ["-march=rv32gcv"],
|
|
358
|
+
"OTHER_CXXFLAGS": ["-march=rv32gcv"],
|
|
359
|
+
}
|
|
360
|
+
}]
|
|
361
|
+
]
|
|
362
|
+
}]
|
|
363
|
+
]
|
|
364
|
+
},
|
|
302
365
|
{
|
|
303
366
|
"target_name": "yencode_armcrc",
|
|
304
367
|
"type": "static_library",
|
|
@@ -326,11 +389,11 @@
|
|
|
326
389
|
}
|
|
327
390
|
}],
|
|
328
391
|
['OS!="win" and target_arch=="arm"', {
|
|
329
|
-
"cflags": ["-mfpu=fp-armv8"],
|
|
330
|
-
"cxxflags": ["-mfpu=fp-armv8"],
|
|
392
|
+
"cflags": ["-mfpu=fp-armv8","-fno-lto"],
|
|
393
|
+
"cxxflags": ["-mfpu=fp-armv8","-fno-lto"],
|
|
331
394
|
"xcode_settings": {
|
|
332
|
-
"OTHER_CFLAGS": ["-mfpu=fp-armv8"],
|
|
333
|
-
"OTHER_CXXFLAGS": ["-mfpu=fp-armv8"]
|
|
395
|
+
"OTHER_CFLAGS": ["-mfpu=fp-armv8","-fno-lto"],
|
|
396
|
+
"OTHER_CXXFLAGS": ["-mfpu=fp-armv8","-fno-lto"]
|
|
334
397
|
}
|
|
335
398
|
}]
|
|
336
399
|
]
|
package/index.js
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
var y = require('./build/Release/yencode.node');
|
|
4
4
|
|
|
5
5
|
var toBuffer = Buffer.alloc ? Buffer.from : Buffer;
|
|
6
|
+
var bufferSlice = Buffer.prototype.readBigInt64BE ? Buffer.prototype.subarray : Buffer.prototype.slice;
|
|
6
7
|
|
|
7
8
|
var nl = toBuffer([13, 10]);
|
|
8
9
|
var RE_BADCHAR = /\r\n\0/g;
|
|
@@ -76,28 +77,28 @@ var decoderParseLines = function(lines, ydata) {
|
|
|
76
77
|
for(var i=0; i<lines.length; i++) {
|
|
77
78
|
var yprops = {};
|
|
78
79
|
|
|
79
|
-
var line = lines[i].
|
|
80
|
+
var line = lines[i].substring(2); // cut off '=y'
|
|
80
81
|
// parse tag
|
|
81
82
|
var p = line.indexOf(' ');
|
|
82
|
-
var tag = (p<0 ? line : line.
|
|
83
|
-
line = line.
|
|
83
|
+
var tag = (p<0 ? line : line.substring(0, p));
|
|
84
|
+
line = line.substring(tag.length+1).trim();
|
|
84
85
|
|
|
85
86
|
// parse props
|
|
86
87
|
var m = line.match(RE_YPROP);
|
|
87
88
|
while(m) {
|
|
88
89
|
if(m.index != 0) {
|
|
89
|
-
warnings.push(DecoderWarning('ignored_line_data', 'Unknown additional data ignored: "' + line.
|
|
90
|
+
warnings.push(DecoderWarning('ignored_line_data', 'Unknown additional data ignored: "' + line.substring(0, m.index) + '"'));
|
|
90
91
|
}
|
|
91
92
|
var prop = m[1], val;
|
|
92
93
|
var valPos = m.index + m[0].length;
|
|
93
94
|
if(tag == 'begin' && prop == 'name') {
|
|
94
95
|
// special treatment of filename - the value is the rest of the line (can include spaces)
|
|
95
|
-
val = line.
|
|
96
|
+
val = line.substring(valPos);
|
|
96
97
|
line = '';
|
|
97
98
|
} else {
|
|
98
99
|
p = line.indexOf(' ', valPos);
|
|
99
|
-
val = (p<0 ? line.
|
|
100
|
-
line = line.
|
|
100
|
+
val = (p<0 ? line.substring(valPos) : line.substring(valPos, p));
|
|
101
|
+
line = line.substring(valPos + val.length +1);
|
|
101
102
|
}
|
|
102
103
|
if(prop in yprops) {
|
|
103
104
|
warnings.push(DecoderWarning('duplicate_property', 'Duplicate property encountered: `' + prop + '`'));
|
|
@@ -139,7 +140,7 @@ module.exports = {
|
|
|
139
140
|
prev = '\r\n';
|
|
140
141
|
|
|
141
142
|
if(Buffer.isBuffer(prev)) prev = prev.toString();
|
|
142
|
-
prev = prev.
|
|
143
|
+
prev = prev.slice(-4); // only care about the last 4 chars of previous state
|
|
143
144
|
if(prev == '\r\n.=') prev = '\r\n='; // aliased after dot stripped
|
|
144
145
|
if(data.length == 0) return {
|
|
145
146
|
read: 0,
|
|
@@ -151,7 +152,7 @@ module.exports = {
|
|
|
151
152
|
var state = decodePrev.indexOf(prev);
|
|
152
153
|
if(state < 0) {
|
|
153
154
|
for(var l=-3; l<0; i++) {
|
|
154
|
-
state = decodePrev.indexOf(prev.
|
|
155
|
+
state = decodePrev.indexOf(prev.slice(l));
|
|
155
156
|
if(state >= 0) break;
|
|
156
157
|
}
|
|
157
158
|
if(state < 0) state = decodePrev.indexOf('');
|
|
@@ -195,12 +196,13 @@ module.exports = {
|
|
|
195
196
|
|
|
196
197
|
if(!Buffer.isBuffer(data)) data = toBuffer(data);
|
|
197
198
|
|
|
198
|
-
filename = toBuffer(filename.replace(RE_BADCHAR, '').
|
|
199
|
+
filename = toBuffer(filename.replace(RE_BADCHAR, '').substring(0, 256), exports.encoding);
|
|
200
|
+
var e = encodeCrc(data, line_size);
|
|
199
201
|
return Buffer.concat([
|
|
200
202
|
toBuffer('=ybegin line='+line_size+' size='+data.length+' name='),
|
|
201
203
|
filename, nl,
|
|
202
|
-
|
|
203
|
-
toBuffer('\r\n=yend size='+data.length+' crc32=' +
|
|
204
|
+
e.output,
|
|
205
|
+
toBuffer('\r\n=yend size='+data.length+' crc32=' + e.crc32.toString('hex'))
|
|
204
206
|
]);
|
|
205
207
|
},
|
|
206
208
|
multi_post: function(filename, size, parts, line_size) {
|
|
@@ -214,7 +216,7 @@ module.exports = {
|
|
|
214
216
|
|
|
215
217
|
// find '=ybegin' to know where the yEnc data starts
|
|
216
218
|
var yencStart;
|
|
217
|
-
if(
|
|
219
|
+
if(bufferSlice.call(data, 0, 8).toString('hex') == '3d79626567696e20' /*=ybegin */) {
|
|
218
220
|
// common case: starts right at the beginning
|
|
219
221
|
yencStart = 0;
|
|
220
222
|
} else {
|
|
@@ -231,10 +233,10 @@ module.exports = {
|
|
|
231
233
|
var sp = yencStart;
|
|
232
234
|
var p = bufferFind(data, '\r\n', yencStart+8);
|
|
233
235
|
while(p > 0) {
|
|
234
|
-
var line =
|
|
236
|
+
var line = bufferSlice.call(data, sp, p).toString(this.encoding).trim();
|
|
235
237
|
lines.push(line);
|
|
236
238
|
sp = p+2;
|
|
237
|
-
if(line.
|
|
239
|
+
if(line.substring(0, 6) == '=yend ') { // no data in post
|
|
238
240
|
ret.yencEnd = sp;
|
|
239
241
|
break;
|
|
240
242
|
}
|
|
@@ -252,7 +254,7 @@ module.exports = {
|
|
|
252
254
|
var warnings = decoderParseLines(lines, ydata);
|
|
253
255
|
|
|
254
256
|
if(!ret.yencEnd) {
|
|
255
|
-
var yencEnd = bufferFindRev(
|
|
257
|
+
var yencEnd = bufferFindRev(bufferSlice.call(data, ret.dataStart), '\r\n=yend ');
|
|
256
258
|
if(yencEnd < 0)
|
|
257
259
|
return DecoderError('no_end_found', 'yEnd end marker not found');
|
|
258
260
|
|
|
@@ -265,7 +267,7 @@ module.exports = {
|
|
|
265
267
|
ret.yencEnd = p;
|
|
266
268
|
} else
|
|
267
269
|
ret.yencEnd = p+2;
|
|
268
|
-
var endLine =
|
|
270
|
+
var endLine = bufferSlice.call(data, yencEnd+2, p).toString(this.encoding).trim();
|
|
269
271
|
|
|
270
272
|
warnings = warnings.concat(decoderParseLines([endLine], ydata));
|
|
271
273
|
}
|
|
@@ -321,7 +323,7 @@ module.exports = {
|
|
|
321
323
|
warnings.push(DecoderWarning('size_mismatch', 'Size specified for part exceeds size specified for whole file'));
|
|
322
324
|
|
|
323
325
|
if(ret.dataStart) {
|
|
324
|
-
ret.data = y.decode(
|
|
326
|
+
ret.data = y.decode(bufferSlice.call(data, ret.dataStart, ret.dataEnd), !!isRaw);
|
|
325
327
|
ret.crc32 = y.crc32(ret.data);
|
|
326
328
|
var hexCrc = ret.crc32.toString('hex');
|
|
327
329
|
|
|
@@ -360,7 +362,7 @@ function YEncoder(filename, size, parts, line_size) {
|
|
|
360
362
|
this.pos = 0;
|
|
361
363
|
this.crc = toBuffer([0,0,0,0]);
|
|
362
364
|
|
|
363
|
-
filename = toBuffer(filename.replace(RE_BADCHAR, '').
|
|
365
|
+
filename = toBuffer(filename.replace(RE_BADCHAR, '').substring(0, 256), exports.encoding);
|
|
364
366
|
if(parts > 1) {
|
|
365
367
|
this.yInfo = Buffer.concat([
|
|
366
368
|
toBuffer(' total='+parts+' line='+line_size+' size='+size+' name='),
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "yencode",
|
|
3
|
-
"version": "1.1.3",
|
|
3
|
+
"version": "1.1.5",
|
|
4
4
|
"description": "SIMD accelerated yEnc encoder/decoder and CRC32 calculator",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"yenc",
|
|
@@ -21,6 +21,7 @@
|
|
|
21
21
|
"install": "node-gyp rebuild"
|
|
22
22
|
},
|
|
23
23
|
"gypfile": true,
|
|
24
|
+
"type": "commonjs",
|
|
24
25
|
"bugs": {
|
|
25
26
|
"url": "https://github.com/animetosho/node-yencode/issues"
|
|
26
27
|
},
|
package/src/common.h
CHANGED
|
@@ -57,18 +57,18 @@
|
|
|
57
57
|
|
|
58
58
|
|
|
59
59
|
// MSVC compatibility
|
|
60
|
-
#if ((defined(_M_IX86_FP) && _M_IX86_FP == 2) || defined(_M_X64)) && !defined(__clang__)
|
|
60
|
+
#if ((defined(_M_IX86_FP) && _M_IX86_FP == 2) || defined(_M_X64)) && defined(_MSC_VER) && !defined(__clang__)
|
|
61
61
|
#define __SSE2__ 1
|
|
62
62
|
#define __SSSE3__ 1
|
|
63
63
|
#define __SSE4_1__ 1
|
|
64
|
-
#if
|
|
64
|
+
#if _MSC_VER >= 1600 && defined(__SSE2__)
|
|
65
65
|
#define __POPCNT__ 1
|
|
66
66
|
#define __LZCNT__ 1
|
|
67
67
|
#endif
|
|
68
68
|
#if !defined(__AVX__) && (_MSC_VER >= 1700 && defined(__SSE2__))
|
|
69
69
|
#define __AVX__ 1
|
|
70
70
|
#endif
|
|
71
|
-
#if !defined(__AVX2__) && (_MSC_VER >= 1800 && defined(
|
|
71
|
+
#if !defined(__AVX2__) && (_MSC_VER >= 1800 && defined(__AVX__))
|
|
72
72
|
#define __AVX2__ 1
|
|
73
73
|
#define __BMI2__ 1
|
|
74
74
|
#endif
|
|
@@ -145,6 +145,13 @@
|
|
|
145
145
|
|
|
146
146
|
#endif
|
|
147
147
|
|
|
148
|
+
#if defined(__ARM_NEON) && defined(__has_include)
|
|
149
|
+
# if !__has_include(<arm_neon.h>)
|
|
150
|
+
# undef __ARM_NEON
|
|
151
|
+
HEDLEY_WARNING("NEON has been disabled due to missing arm_neon.h");
|
|
152
|
+
# endif
|
|
153
|
+
#endif
|
|
154
|
+
|
|
148
155
|
#ifdef __ARM_NEON
|
|
149
156
|
# include <arm_neon.h>
|
|
150
157
|
|
|
@@ -214,17 +221,38 @@ bool cpu_supports_neon();
|
|
|
214
221
|
|
|
215
222
|
#ifdef PLATFORM_X86
|
|
216
223
|
enum YEncDecIsaLevel {
|
|
224
|
+
ISA_GENERIC = 0,
|
|
217
225
|
ISA_FEATURE_POPCNT = 0x1,
|
|
218
226
|
ISA_FEATURE_LZCNT = 0x2,
|
|
227
|
+
ISA_FEATURE_EVEX512 = 0x4, // AVX512 support
|
|
219
228
|
ISA_LEVEL_SSE2 = 0x100,
|
|
220
229
|
ISA_LEVEL_SSSE3 = 0x200,
|
|
221
230
|
ISA_LEVEL_SSE41 = 0x300,
|
|
222
231
|
ISA_LEVEL_SSE4_POPCNT = 0x301,
|
|
232
|
+
ISA_LEVEL_PCLMUL = 0x340,
|
|
223
233
|
ISA_LEVEL_AVX = 0x381, // same as above, just used as a differentiator for `cpu_supports_isa`
|
|
224
234
|
ISA_LEVEL_AVX2 = 0x403, // also includes BMI1/2 and LZCNT
|
|
225
|
-
|
|
226
|
-
|
|
235
|
+
ISA_LEVEL_VPCLMUL = 0x440,
|
|
236
|
+
ISA_LEVEL_AVX3 = 0x507, // SKX variant; AVX512VL + AVX512BW
|
|
237
|
+
ISA_LEVEL_VBMI2 = 0x603 // ICL, AVX10
|
|
238
|
+
};
|
|
239
|
+
#elif defined(PLATFORM_ARM)
|
|
240
|
+
enum YEncDecIsaLevel {
|
|
241
|
+
ISA_GENERIC = 0,
|
|
242
|
+
ISA_FEATURE_CRC = 8,
|
|
243
|
+
ISA_LEVEL_NEON = 0x1000
|
|
227
244
|
};
|
|
245
|
+
#elif defined(__riscv)
|
|
246
|
+
enum YEncDecIsaLevel {
|
|
247
|
+
ISA_GENERIC = 0,
|
|
248
|
+
ISA_LEVEL_RVV = 0x10000
|
|
249
|
+
};
|
|
250
|
+
#else
|
|
251
|
+
enum YEncDecIsaLevel {
|
|
252
|
+
ISA_GENERIC = 0
|
|
253
|
+
};
|
|
254
|
+
#endif
|
|
255
|
+
#ifdef PLATFORM_X86
|
|
228
256
|
#ifdef _MSC_VER
|
|
229
257
|
// native tuning not supported in MSVC
|
|
230
258
|
# define ISA_NATIVE ISA_LEVEL_SSE2
|
|
@@ -256,6 +284,16 @@ enum YEncDecIsaLevel {
|
|
|
256
284
|
int cpu_supports_isa();
|
|
257
285
|
#endif // PLATFORM_X86
|
|
258
286
|
|
|
287
|
+
|
|
288
|
+
#ifdef __riscv
|
|
289
|
+
bool cpu_supports_rvv();
|
|
290
|
+
#endif
|
|
291
|
+
#if defined(__riscv_vector) && defined(HEDLEY_GCC_VERSION) && !HEDLEY_GCC_VERSION_CHECK(13,0,0)
|
|
292
|
+
// GCC added RVV intrinsics in GCC13
|
|
293
|
+
# undef __riscv_vector
|
|
294
|
+
#endif
|
|
295
|
+
|
|
296
|
+
|
|
259
297
|
#include <string.h>
|
|
260
298
|
#if !defined(_MSC_VER) || defined(_STDINT) || _MSC_VER >= 1900
|
|
261
299
|
# include <stdint.h>
|
package/src/crc.cc
CHANGED
|
@@ -3,13 +3,130 @@
|
|
|
3
3
|
#include "interface.h"
|
|
4
4
|
crcutil_interface::CRC* crc = NULL;
|
|
5
5
|
|
|
6
|
+
#if defined(PLATFORM_X86) && !defined(__ILP32__)
|
|
6
7
|
static uint32_t do_crc32_incremental_generic(const void* data, size_t length, uint32_t init) {
|
|
8
|
+
// use optimised ASM on x86 platforms
|
|
7
9
|
crcutil_interface::UINT64 tmp = init;
|
|
8
10
|
crc->Compute(data, length, &tmp);
|
|
9
11
|
return (uint32_t)tmp;
|
|
10
12
|
}
|
|
11
|
-
|
|
13
|
+
#else
|
|
14
|
+
static uint32_t* HEDLEY_RESTRICT crc_slice_table;
|
|
15
|
+
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
|
16
|
+
# if defined(__GNUC__) || defined(__clang__)
|
|
17
|
+
# define bswap32 __builtin_bswap32
|
|
18
|
+
# else
|
|
19
|
+
static inline uint32_t bswap32(uint32_t x) {
|
|
20
|
+
return (x >> 24) | ((x >> 8) & 0x0000FF00) | ((x << 8) & 0x00FF0000) | (x << 24);
|
|
21
|
+
}
|
|
22
|
+
# endif
|
|
23
|
+
#endif
|
|
12
24
|
|
|
25
|
+
#define CRC32_GENERIC_CHAINS 4 // newer processors may prefer 8
|
|
26
|
+
static uint32_t do_crc32_incremental_generic(const void* data, size_t length, uint32_t init) {
|
|
27
|
+
const uint32_t* crc_base_table = crc_slice_table + 4*256; // this also seems to help MSVC's optimiser, which otherwise keeps trying to add to crc_slice_table every time it's referenced
|
|
28
|
+
uint32_t crc[CRC32_GENERIC_CHAINS]; // Clang seems to be more spill happy with an array over individual variables :(
|
|
29
|
+
crc[0] = ~init;
|
|
30
|
+
uint8_t* current8 = (uint8_t*)data;
|
|
31
|
+
|
|
32
|
+
// align to multiple of 4
|
|
33
|
+
if(((uintptr_t)current8 & 1) && length >= 1) {
|
|
34
|
+
crc[0] = (crc[0] >> 8) ^ crc_base_table[(crc[0] & 0xFF) ^ *current8++];
|
|
35
|
+
length--;
|
|
36
|
+
}
|
|
37
|
+
if(((uintptr_t)current8 & 2) && length >= 2) {
|
|
38
|
+
crc[0] = (crc[0] >> 8) ^ crc_base_table[(crc[0] & 0xFF) ^ *current8++];
|
|
39
|
+
crc[0] = (crc[0] >> 8) ^ crc_base_table[(crc[0] & 0xFF) ^ *current8++];
|
|
40
|
+
length -= 2;
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
uint8_t* end8 = current8 + length;
|
|
44
|
+
uint32_t* current = (uint32_t*)current8;
|
|
45
|
+
if(length >= 8*CRC32_GENERIC_CHAINS-4) {
|
|
46
|
+
size_t lenMain = ((length-(CRC32_GENERIC_CHAINS-1)*4) / 4);
|
|
47
|
+
uint32_t* end = current + (lenMain / CRC32_GENERIC_CHAINS) * CRC32_GENERIC_CHAINS;
|
|
48
|
+
for(int c=1; c<CRC32_GENERIC_CHAINS; c++)
|
|
49
|
+
crc[c] = 0;
|
|
50
|
+
while(current != end) {
|
|
51
|
+
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
|
52
|
+
#define CRC_PROC4(v, in) \
|
|
53
|
+
v ^= bswap32(in); \
|
|
54
|
+
v = crc_slice_table[v >> 24] ^ crc_slice_table[0x100L + ((v >> 16) & 0xff)] ^ crc_slice_table[0x200L + ((v >> 8) & 0xff)] ^ crc_slice_table[0x300L + (v & 0xff)]
|
|
55
|
+
#else
|
|
56
|
+
#define CRC_PROC4(v, in) \
|
|
57
|
+
v ^= (in); \
|
|
58
|
+
v = crc_slice_table[v >> 24] ^ crc_slice_table[0x100L + ((v >> 16) & 0xff)] ^ crc_slice_table[0x200L + ((v >> 8) & 0xff)] ^ crc_slice_table[0x300L + (v & 0xff)]
|
|
59
|
+
#endif
|
|
60
|
+
for(int c=0; c<CRC32_GENERIC_CHAINS; c++) {
|
|
61
|
+
CRC_PROC4(crc[c], *current);
|
|
62
|
+
current++;
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
// aggregate accumulators
|
|
66
|
+
current8 = (uint8_t*)current;
|
|
67
|
+
#if (CRC32_GENERIC_CHAINS & (CRC32_GENERIC_CHAINS-1)) == 0
|
|
68
|
+
// assume that lengths which are a multiple of 4/8/16/32 are common
|
|
69
|
+
if((end8 - current8) & (CRC32_GENERIC_CHAINS*4)) {
|
|
70
|
+
CRC_PROC4(crc[0], *current);
|
|
71
|
+
current8 += 4;
|
|
72
|
+
|
|
73
|
+
for(int c=1; c<CRC32_GENERIC_CHAINS; c++) {
|
|
74
|
+
for(int i=0; i<4; i++)
|
|
75
|
+
crc[c] = (crc[c] >> 8) ^ crc_base_table[(crc[c] & 0xff) ^ *current8++];
|
|
76
|
+
crc[(c+1) & ~CRC32_GENERIC_CHAINS] ^= crc[c];
|
|
77
|
+
}
|
|
78
|
+
} else
|
|
79
|
+
#endif
|
|
80
|
+
#undef CRC_PROC4
|
|
81
|
+
for(int c=1; c<CRC32_GENERIC_CHAINS; c++) {
|
|
82
|
+
for(int i=0; i<4; i++)
|
|
83
|
+
crc[0] = (crc[0] >> 8) ^ crc_base_table[(crc[0] & 0xff) ^ *current8++];
|
|
84
|
+
crc[0] ^= crc[c];
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
// tail loop
|
|
89
|
+
while(current8 != end8) {
|
|
90
|
+
crc[0] = (crc[0] >> 8) ^ crc_base_table[(crc[0] & 0xFF) ^ *current8++];
|
|
91
|
+
}
|
|
92
|
+
return ~crc[0];
|
|
93
|
+
}
|
|
94
|
+
static void generate_crc32_slice_table() {
|
|
95
|
+
crc_slice_table = (uint32_t*)malloc(5*256*sizeof(uint32_t));
|
|
96
|
+
// generate standard byte-by-byte table
|
|
97
|
+
uint32_t* crc_base_table = crc_slice_table + 4*256;
|
|
98
|
+
for(int v=0; v<256; v++) {
|
|
99
|
+
uint32_t crc = v;
|
|
100
|
+
for(int j = 0; j < 8; j++) {
|
|
101
|
+
crc = (crc >> 1) ^ (-(int32_t)(crc & 1) & 0xEDB88320);
|
|
102
|
+
}
|
|
103
|
+
crc_base_table[v] = crc;
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
// generate slice-by-4 shifted across for X independent chains
|
|
107
|
+
for(int v=0; v<256; v++) {
|
|
108
|
+
uint32_t crc = crc_base_table[v];
|
|
109
|
+
#if CRC32_GENERIC_CHAINS > 1
|
|
110
|
+
for(int i=0; i<4*CRC32_GENERIC_CHAINS-5; i++)
|
|
111
|
+
crc = (crc >> 8) ^ crc_base_table[crc & 0xff];
|
|
112
|
+
for(int i=0; i<4; i++) {
|
|
113
|
+
crc = (crc >> 8) ^ crc_base_table[crc & 0xff];
|
|
114
|
+
crc_slice_table[i*256 + v] = crc;
|
|
115
|
+
}
|
|
116
|
+
#else
|
|
117
|
+
for(int i=0; i<4; i++) {
|
|
118
|
+
crc_slice_table[i*256 + v] = crc;
|
|
119
|
+
crc = (crc >> 8) ^ crc_base_table[crc & 0xff];
|
|
120
|
+
}
|
|
121
|
+
#endif
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
#endif
|
|
125
|
+
|
|
126
|
+
extern "C" {
|
|
127
|
+
crc_func _do_crc32_incremental = &do_crc32_incremental_generic;
|
|
128
|
+
int _crc32_isa = ISA_GENERIC;
|
|
129
|
+
}
|
|
13
130
|
|
|
14
131
|
|
|
15
132
|
uint32_t do_crc32_combine(uint32_t crc1, uint32_t crc2, size_t len2) {
|
|
@@ -24,9 +141,9 @@ uint32_t do_crc32_zeros(uint32_t crc1, size_t len) {
|
|
|
24
141
|
return (uint32_t)crc_;
|
|
25
142
|
}
|
|
26
143
|
|
|
27
|
-
void crc_clmul_set_funcs(
|
|
28
|
-
void crc_clmul256_set_funcs(
|
|
29
|
-
void crc_arm_set_funcs(
|
|
144
|
+
void crc_clmul_set_funcs();
|
|
145
|
+
void crc_clmul256_set_funcs();
|
|
146
|
+
void crc_arm_set_funcs();
|
|
30
147
|
|
|
31
148
|
#ifdef PLATFORM_X86
|
|
32
149
|
int cpu_supports_crc_isa();
|
|
@@ -39,22 +156,23 @@ int cpu_supports_crc_isa();
|
|
|
39
156
|
#ifdef PLATFORM_ARM
|
|
40
157
|
# ifdef __ANDROID__
|
|
41
158
|
# include <cpu-features.h>
|
|
42
|
-
# elif defined(__linux__) || (defined(__FreeBSD__) && __FreeBSD__ >= 12)
|
|
43
|
-
# include <sys/auxv.h>
|
|
44
|
-
# include <asm/hwcap.h>
|
|
45
|
-
# elif (defined(__FreeBSD__) && __FreeBSD__ < 12)
|
|
46
|
-
# include <sys/sysctl.h>
|
|
47
|
-
# include <asm/hwcap.h>
|
|
48
159
|
# elif defined(__APPLE__)
|
|
49
160
|
# include <sys/types.h>
|
|
50
161
|
# include <sys/sysctl.h>
|
|
51
|
-
#
|
|
52
|
-
#
|
|
162
|
+
# elif defined(__has_include)
|
|
163
|
+
# if __has_include(<sys/auxv.h>)
|
|
164
|
+
# include <sys/auxv.h>
|
|
165
|
+
# ifdef __FreeBSD__
|
|
53
166
|
static unsigned long getauxval(unsigned long cap) {
|
|
54
167
|
unsigned long ret;
|
|
55
168
|
elf_aux_info(cap, &ret, sizeof(ret));
|
|
56
169
|
return ret;
|
|
57
170
|
}
|
|
171
|
+
# endif
|
|
172
|
+
# if __has_include(<asm/hwcap.h>)
|
|
173
|
+
# include <asm/hwcap.h>
|
|
174
|
+
# endif
|
|
175
|
+
# endif
|
|
58
176
|
# endif
|
|
59
177
|
#endif
|
|
60
178
|
void crc_init() {
|
|
@@ -62,12 +180,16 @@ void crc_init() {
|
|
|
62
180
|
0xEDB88320, 0, 32, true, 0, 0, 0, 0, NULL);
|
|
63
181
|
// instance never deleted... oh well...
|
|
64
182
|
|
|
183
|
+
#if !defined(PLATFORM_X86) || defined(__ILP32__)
|
|
184
|
+
generate_crc32_slice_table();
|
|
185
|
+
#endif
|
|
186
|
+
|
|
65
187
|
#ifdef PLATFORM_X86
|
|
66
188
|
int support = cpu_supports_crc_isa();
|
|
67
189
|
if(support == 2)
|
|
68
|
-
crc_clmul256_set_funcs(
|
|
190
|
+
crc_clmul256_set_funcs();
|
|
69
191
|
else if(support == 1)
|
|
70
|
-
crc_clmul_set_funcs(
|
|
192
|
+
crc_clmul_set_funcs();
|
|
71
193
|
#endif
|
|
72
194
|
#ifdef PLATFORM_ARM
|
|
73
195
|
# ifdef __APPLE__
|
|
@@ -95,7 +217,7 @@ void crc_init() {
|
|
|
95
217
|
false
|
|
96
218
|
# endif
|
|
97
219
|
) {
|
|
98
|
-
crc_arm_set_funcs(
|
|
220
|
+
crc_arm_set_funcs();
|
|
99
221
|
}
|
|
100
222
|
#endif
|
|
101
223
|
}
|
package/src/crc.h
CHANGED
|
@@ -9,11 +9,15 @@ extern "C" {
|
|
|
9
9
|
|
|
10
10
|
typedef uint32_t (*crc_func)(const void*, size_t, uint32_t);
|
|
11
11
|
extern crc_func _do_crc32_incremental;
|
|
12
|
+
extern int _crc32_isa;
|
|
12
13
|
#define do_crc32 (*_do_crc32_incremental)
|
|
13
14
|
|
|
14
15
|
uint32_t do_crc32_combine(uint32_t crc1, const uint32_t crc2, size_t len2);
|
|
15
16
|
uint32_t do_crc32_zeros(uint32_t crc1, size_t len);
|
|
16
17
|
void crc_init();
|
|
18
|
+
static inline int crc32_isa_level() {
|
|
19
|
+
return _crc32_isa;
|
|
20
|
+
}
|
|
17
21
|
|
|
18
22
|
|
|
19
23
|
|
package/src/crc_arm.cc
CHANGED
|
@@ -16,6 +16,12 @@ HEDLEY_WARNING("CRC32 acceleration has been disabled due to broken arm_acle.h sh
|
|
|
16
16
|
HEDLEY_WARNING("CRC32 acceleration has been disabled due to broken arm_acle.h shipped in GCC 9.4 [https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100985]. If you need this feature, please use a different compiler or version of GCC");
|
|
17
17
|
# endif
|
|
18
18
|
#endif
|
|
19
|
+
#if defined(__ARM_FEATURE_CRC32) && defined(__has_include)
|
|
20
|
+
# if !__has_include(<arm_acle.h>)
|
|
21
|
+
# undef __ARM_FEATURE_CRC32
|
|
22
|
+
HEDLEY_WARNING("CRC32 acceleration has been disabled due to missing arm_acle.h");
|
|
23
|
+
# endif
|
|
24
|
+
#endif
|
|
19
25
|
|
|
20
26
|
#if defined(__ARM_FEATURE_CRC32) || (defined(_M_ARM64) && !defined(__clang__)) // MSVC doesn't support CRC for ARM32
|
|
21
27
|
|
|
@@ -73,7 +79,7 @@ static HEDLEY_ALWAYS_INLINE uint32_t crc_multiply(uint32_t a, uint32_t b) {
|
|
|
73
79
|
return res;
|
|
74
80
|
}
|
|
75
81
|
|
|
76
|
-
static const uint32_t crc_power[] = { // pre-computed 2^n, with first 3 entries removed (saves a shift)
|
|
82
|
+
static const uint32_t crc_power[] = { // pre-computed 2^(2^n), with first 3 entries removed (saves a shift)
|
|
77
83
|
0x00800000, 0x00008000, 0xedb88320, 0xb1e6b092, 0xa06a2517, 0xed627dae, 0x88d14467, 0xd7bbfe6a,
|
|
78
84
|
0xec447f11, 0x8e7ea170, 0x6427800e, 0x4d47bae0, 0x09fe548f, 0x83852d0f, 0x30362f1a, 0x7b5a9cc3,
|
|
79
85
|
0x31fec169, 0x9fec022a, 0x6c8dedc4, 0x15d6874d, 0x5fde7a4e, 0xbad90e37, 0x2e4e5eef, 0x4eaba214,
|
|
@@ -194,11 +200,10 @@ static uint32_t do_crc32_incremental_arm(const void* data, size_t length, uint32
|
|
|
194
200
|
return ~arm_crc_calc(~init, (const unsigned char*)data, (long)length);
|
|
195
201
|
}
|
|
196
202
|
|
|
197
|
-
void crc_arm_set_funcs(
|
|
198
|
-
|
|
203
|
+
void crc_arm_set_funcs() {
|
|
204
|
+
_do_crc32_incremental = &do_crc32_incremental_arm;
|
|
205
|
+
_crc32_isa = ISA_FEATURE_CRC;
|
|
199
206
|
}
|
|
200
207
|
#else
|
|
201
|
-
void crc_arm_set_funcs(
|
|
202
|
-
(void)_do_crc32_incremental;
|
|
203
|
-
}
|
|
208
|
+
void crc_arm_set_funcs() {}
|
|
204
209
|
#endif
|