yencode 1.1.2 → 1.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -129,7 +129,8 @@ int decodeTo(Buffer data, Buffer output, bool stripDots=false)
129
129
  Same as above, but instead of returning a Buffer, writes it to the supplied
130
130
  *output* Buffer. Returns the length of the decoded data.
131
131
  Note that the *output* Buffer must be at least large enough to hold the largest
132
- possible output size (i.e. length of the input), otherwise an error is thrown.
132
+ possible output size (i.e. length of the input), otherwise an error is thrown.
133
+ The *data* and *output* Buffers can be the same, for in-situ decoding.
133
134
 
134
135
  Object decodeChunk\(Buffer data \[, string state=null\]\[, Buffer output\]\)
135
136
  -----------------------------------------------------------------------------
@@ -142,7 +143,7 @@ designed to incrementally process a stream from the network, and will perform NN
142
143
  *state* is the current state of the incremental decode. Set to *null* if this is starting the decode of a new article, otherwise this should be set to the value of *state* given from the previous invocation of *decodeChunk*.
143
144
  If *output* is supplied, the output will be written here \(see *decodeTo* for notes
144
145
  on required size\), otherwise a new buffer will be created where the output will be
145
- written to.
146
+ written to. The *data* and *output* Buffers can be the same, for in-situ decoding.
146
147
 
147
148
  Returns an object with the following keys:
148
149
 
package/binding.gyp CHANGED
@@ -43,10 +43,20 @@
43
43
  }],
44
44
  ['OS!="win" and enable_native_tuning!=0', {
45
45
  "defines": ["YENC_BUILD_NATIVE=1"]
46
+ }],
47
+ ['OS!="win"', {
48
+ "variables": {
49
+ "missing_memalign%": "<!(<!(echo ${CC_target:-${CC:-cc}}) -c src/test_alignalloc.c -o /dev/null -Werror 2>/dev/null || echo failed)",
50
+ },
51
+ "conditions": [
52
+ ['missing_memalign!=""', {
53
+ "defines": ["_POSIX_C_SOURCE=200112L"],
54
+ }]
55
+ ]
46
56
  }]
47
57
  ],
48
58
  "cflags": ["-Wno-unused-function"],
49
- "cxxflags": ["-Wno-unused-function"],
59
+ "cxxflags": ["-Wno-unused-function", "-std=c++03", "-D_POSIX_C_SOURCE=200112L"],
50
60
  "xcode_settings": {
51
61
  "OTHER_CFLAGS": ["-Wno-unused-function"],
52
62
  "OTHER_CXXFLAGS": ["-Wno-unused-function"]
@@ -64,7 +74,7 @@
64
74
  "targets": [
65
75
  {
66
76
  "target_name": "yencode",
67
- "dependencies": ["crcutil", "yencode_sse2", "yencode_ssse3", "yencode_clmul", "yencode_avx", "yencode_avx2", "yencode_neon", "yencode_armcrc"],
77
+ "dependencies": ["crcutil", "yencode_sse2", "yencode_ssse3", "yencode_clmul", "yencode_clmul256", "yencode_avx", "yencode_avx2", "yencode_vbmi2", "yencode_neon", "yencode_armcrc", "yencode_rvv"],
68
78
  "sources": [
69
79
  "src/yencode.cc",
70
80
  "src/platform.cc",
@@ -206,6 +216,81 @@
206
216
  }]
207
217
  ]
208
218
  },
219
+ {
220
+ "target_name": "yencode_clmul256",
221
+ "type": "static_library",
222
+ "sources": [
223
+ "src/crc_folding_256.cc"
224
+ ],
225
+ "cflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
226
+ "cxxflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
227
+ "xcode_settings": {
228
+ "OTHER_CFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
229
+ "OTHER_CXXFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"]
230
+ },
231
+ "msvs_settings": {"VCCLCompilerTool": {"BufferSecurityCheck": "false"}},
232
+ "conditions": [
233
+ ['target_arch in "ia32 x64" and OS!="win"', {
234
+ "variables": {"supports_vpclmul%": "<!(<!(echo ${CXX_target:-${CXX:-c++}}) -MM -E src/crc_folding_256.cc -mavx2 -mvpclmulqdq 2>/dev/null || true)"},
235
+ "conditions": [
236
+ ['supports_vpclmul!=""', {
237
+ "cflags": ["-mavx2", "-mvpclmulqdq", "-mpclmul"],
238
+ "cxxflags": ["-mavx2", "-mvpclmulqdq", "-mpclmul"],
239
+ "xcode_settings": {
240
+ "OTHER_CFLAGS": ["-mavx2", "-mvpclmulqdq", "-mpclmul"],
241
+ "OTHER_CXXFLAGS": ["-mavx2", "-mvpclmulqdq", "-mpclmul"],
242
+ }
243
+ }]
244
+ ]
245
+ }],
246
+ ['target_arch in "ia32 x64" and OS=="win"', {
247
+ "msvs_settings": {"VCCLCompilerTool": {"EnableEnhancedInstructionSet": "3"}}
248
+ }]
249
+ ]
250
+ },
251
+ {
252
+ "target_name": "yencode_vbmi2",
253
+ "type": "static_library",
254
+ "sources": [
255
+ "src/decoder_vbmi2.cc", "src/encoder_vbmi2.cc"
256
+ ],
257
+ "cflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
258
+ "cxxflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
259
+ "xcode_settings": {
260
+ "OTHER_CFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
261
+ "OTHER_CXXFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"]
262
+ },
263
+ "msvs_settings": {"VCCLCompilerTool": {"BufferSecurityCheck": "false"}},
264
+ "conditions": [
265
+ ['target_arch in "ia32 x64" and OS!="win"', {
266
+ "variables": {
267
+ "supports_vbmi2%": "<!(<!(echo ${CXX_target:-${CXX:-c++}}) -MM -E src/encoder_vbmi2.cc -mavx512vl -mavx512vbmi2 2>/dev/null || true)",
268
+ "supports_avx10%": "<!(<!(echo ${CXX_target:-${CXX:-c++}}) -MM -E src/encoder_vbmi2.cc -mavx512vl -mno-evex512 2>/dev/null || true)"
269
+ },
270
+ "conditions": [
271
+ ['supports_vbmi2!=""', {
272
+ "cflags": ["-mavx512vbmi2", "-mavx512vl", "-mavx512bw", "-mpopcnt", "-mbmi", "-mbmi2", "-mlzcnt"],
273
+ "cxxflags": ["-mavx512vbmi2", "-mavx512vl", "-mavx512bw", "-mpopcnt", "-mbmi", "-mbmi2", "-mlzcnt"],
274
+ "xcode_settings": {
275
+ "OTHER_CFLAGS": ["-mavx512vbmi2", "-mavx512vl", "-mavx512bw", "-mpopcnt", "-mbmi", "-mbmi2", "-mlzcnt"],
276
+ "OTHER_CXXFLAGS": ["-mavx512vbmi2", "-mavx512vl", "-mavx512bw", "-mpopcnt", "-mbmi", "-mbmi2", "-mlzcnt"],
277
+ }
278
+ }],
279
+ ['supports_avx10!=""', {
280
+ "cflags": ["-mno-evex512"],
281
+ "cxxflags": ["-mno-evex512"],
282
+ "xcode_settings": {
283
+ "OTHER_CFLAGS": ["-mno-evex512"],
284
+ "OTHER_CXXFLAGS": ["-mno-evex512"],
285
+ }
286
+ }]
287
+ ]
288
+ }],
289
+ ['target_arch in "ia32 x64" and OS=="win"', {
290
+ "msvs_settings": {"VCCLCompilerTool": {"AdditionalOptions": ["/arch:AVX512"], "EnableEnhancedInstructionSet": "0"}}
291
+ }]
292
+ ]
293
+ },
209
294
  {
210
295
  "target_name": "yencode_neon",
211
296
  "type": "static_library",
@@ -221,11 +306,11 @@
221
306
  "msvs_settings": {"VCCLCompilerTool": {"BufferSecurityCheck": "false"}},
222
307
  "conditions": [
223
308
  ['target_arch=="arm"', {
224
- "cflags": ["-mfpu=neon"],
225
- "cxxflags": ["-mfpu=neon"],
309
+ "cflags": ["-mfpu=neon","-fno-lto"],
310
+ "cxxflags": ["-mfpu=neon","-fno-lto"],
226
311
  "xcode_settings": {
227
- "OTHER_CFLAGS": ["-mfpu=neon"],
228
- "OTHER_CXXFLAGS": ["-mfpu=neon"],
312
+ "OTHER_CFLAGS": ["-mfpu=neon","-fno-lto"],
313
+ "OTHER_CXXFLAGS": ["-mfpu=neon","-fno-lto"],
229
314
  }
230
315
  }],
231
316
  ['target_arch=="arm64"', {
@@ -235,6 +320,48 @@
235
320
  }]
236
321
  ]
237
322
  },
323
+ {
324
+ "target_name": "yencode_rvv",
325
+ "type": "static_library",
326
+ "sources": [
327
+ "src/encoder_rvv.cc"
328
+ ],
329
+ "cflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
330
+ "cxxflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
331
+ "xcode_settings": {
332
+ "OTHER_CFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
333
+ "OTHER_CXXFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"]
334
+ },
335
+ "msvs_settings": {"VCCLCompilerTool": {"BufferSecurityCheck": "false"}},
336
+ "conditions": [
337
+ ['target_arch=="riscv64" and OS!="win"', {
338
+ "variables": {"supports_rvv%": "<!(<!(echo ${CXX_target:-${CXX:-c++}}) -MM -E src/encoder_rvv.cc -march=rv64gcv 2>/dev/null || true)"},
339
+ "conditions": [
340
+ ['supports_rvv!=""', {
341
+ "cflags": ["-march=rv64gcv"],
342
+ "cxxflags": ["-march=rv64gcv"],
343
+ "xcode_settings": {
344
+ "OTHER_CFLAGS": ["-march=rv64gcv"],
345
+ "OTHER_CXXFLAGS": ["-march=rv64gcv"],
346
+ }
347
+ }]
348
+ ]
349
+ }],
350
+ ['target_arch=="riscv32" and OS!="win"', {
351
+ "variables": {"supports_rvv%": "<!(<!(echo ${CXX_target:-${CXX:-c++}}) -MM -E src/encoder_rvv.cc -march=rv32gcv 2>/dev/null || true)"},
352
+ "conditions": [
353
+ ['supports_rvv!=""', {
354
+ "cflags": ["-march=rv32gcv"],
355
+ "cxxflags": ["-march=rv32gcv"],
356
+ "xcode_settings": {
357
+ "OTHER_CFLAGS": ["-march=rv32gcv"],
358
+ "OTHER_CXXFLAGS": ["-march=rv32gcv"],
359
+ }
360
+ }]
361
+ ]
362
+ }]
363
+ ]
364
+ },
238
365
  {
239
366
  "target_name": "yencode_armcrc",
240
367
  "type": "static_library",
@@ -260,6 +387,14 @@
260
387
  "OTHER_CFLAGS": ["-march=armv8-a+crc"],
261
388
  "OTHER_CXXFLAGS": ["-march=armv8-a+crc"],
262
389
  }
390
+ }],
391
+ ['OS!="win" and target_arch=="arm"', {
392
+ "cflags": ["-mfpu=fp-armv8","-fno-lto"],
393
+ "cxxflags": ["-mfpu=fp-armv8","-fno-lto"],
394
+ "xcode_settings": {
395
+ "OTHER_CFLAGS": ["-mfpu=fp-armv8","-fno-lto"],
396
+ "OTHER_CXXFLAGS": ["-mfpu=fp-armv8","-fno-lto"]
397
+ }
263
398
  }]
264
399
  ]
265
400
  },
package/index.js CHANGED
@@ -3,6 +3,7 @@
3
3
  var y = require('./build/Release/yencode.node');
4
4
 
5
5
  var toBuffer = Buffer.alloc ? Buffer.from : Buffer;
6
+ var bufferSlice = Buffer.prototype.readBigInt64BE ? Buffer.prototype.subarray : Buffer.prototype.slice;
6
7
 
7
8
  var nl = toBuffer([13, 10]);
8
9
  var RE_BADCHAR = /\r\n\0/g;
@@ -76,28 +77,28 @@ var decoderParseLines = function(lines, ydata) {
76
77
  for(var i=0; i<lines.length; i++) {
77
78
  var yprops = {};
78
79
 
79
- var line = lines[i].substr(2); // cut off '=y'
80
+ var line = lines[i].substring(2); // cut off '=y'
80
81
  // parse tag
81
82
  var p = line.indexOf(' ');
82
- var tag = (p<0 ? line : line.substr(0, p));
83
- line = line.substr(tag.length+1).trim();
83
+ var tag = (p<0 ? line : line.substring(0, p));
84
+ line = line.substring(tag.length+1).trim();
84
85
 
85
86
  // parse props
86
87
  var m = line.match(RE_YPROP);
87
88
  while(m) {
88
89
  if(m.index != 0) {
89
- warnings.push(DecoderWarning('ignored_line_data', 'Unknown additional data ignored: "' + line.substr(0, m.index) + '"'));
90
+ warnings.push(DecoderWarning('ignored_line_data', 'Unknown additional data ignored: "' + line.substring(0, m.index) + '"'));
90
91
  }
91
92
  var prop = m[1], val;
92
93
  var valPos = m.index + m[0].length;
93
94
  if(tag == 'begin' && prop == 'name') {
94
95
  // special treatment of filename - the value is the rest of the line (can include spaces)
95
- val = line.substr(valPos);
96
+ val = line.substring(valPos);
96
97
  line = '';
97
98
  } else {
98
99
  p = line.indexOf(' ', valPos);
99
- val = (p<0 ? line.substr(valPos) : line.substr(valPos, p-valPos));
100
- line = line.substr(valPos + val.length +1);
100
+ val = (p<0 ? line.substring(valPos) : line.substring(valPos, p));
101
+ line = line.substring(valPos + val.length +1);
101
102
  }
102
103
  if(prop in yprops) {
103
104
  warnings.push(DecoderWarning('duplicate_property', 'Duplicate property encountered: `' + prop + '`'));
@@ -139,7 +140,7 @@ module.exports = {
139
140
  prev = '\r\n';
140
141
 
141
142
  if(Buffer.isBuffer(prev)) prev = prev.toString();
142
- prev = prev.substr(-4); // only care about the last 4 chars of previous state
143
+ prev = prev.slice(-4); // only care about the last 4 chars of previous state
143
144
  if(prev == '\r\n.=') prev = '\r\n='; // aliased after dot stripped
144
145
  if(data.length == 0) return {
145
146
  read: 0,
@@ -151,7 +152,7 @@ module.exports = {
151
152
  var state = decodePrev.indexOf(prev);
152
153
  if(state < 0) {
153
154
  for(var l=-3; l<0; i++) {
154
- state = decodePrev.indexOf(prev.substr(l));
155
+ state = decodePrev.indexOf(prev.slice(l));
155
156
  if(state >= 0) break;
156
157
  }
157
158
  if(state < 0) state = decodePrev.indexOf('');
@@ -195,12 +196,13 @@ module.exports = {
195
196
 
196
197
  if(!Buffer.isBuffer(data)) data = toBuffer(data);
197
198
 
198
- filename = toBuffer(filename.replace(RE_BADCHAR, '').substr(0, 256), exports.encoding);
199
+ filename = toBuffer(filename.replace(RE_BADCHAR, '').substring(0, 256), exports.encoding);
200
+ var e = encodeCrc(data, line_size);
199
201
  return Buffer.concat([
200
202
  toBuffer('=ybegin line='+line_size+' size='+data.length+' name='),
201
203
  filename, nl,
202
- y.encode(data, line_size),
203
- toBuffer('\r\n=yend size='+data.length+' crc32=' + y.crc32(data).toString('hex'))
204
+ e.output,
205
+ toBuffer('\r\n=yend size='+data.length+' crc32=' + e.crc32.toString('hex'))
204
206
  ]);
205
207
  },
206
208
  multi_post: function(filename, size, parts, line_size) {
@@ -214,7 +216,7 @@ module.exports = {
214
216
 
215
217
  // find '=ybegin' to know where the yEnc data starts
216
218
  var yencStart;
217
- if(data.slice(0, 8).toString('hex') == '3d79626567696e20' /*=ybegin */) {
219
+ if(bufferSlice.call(data, 0, 8).toString('hex') == '3d79626567696e20' /*=ybegin */) {
218
220
  // common case: starts right at the beginning
219
221
  yencStart = 0;
220
222
  } else {
@@ -231,10 +233,10 @@ module.exports = {
231
233
  var sp = yencStart;
232
234
  var p = bufferFind(data, '\r\n', yencStart+8);
233
235
  while(p > 0) {
234
- var line = data.slice(sp, p).toString(this.encoding).trim();
236
+ var line = bufferSlice.call(data, sp, p).toString(this.encoding).trim();
235
237
  lines.push(line);
236
238
  sp = p+2;
237
- if(line.substr(0, 6) == '=yend ') { // no data in post
239
+ if(line.substring(0, 6) == '=yend ') { // no data in post
238
240
  ret.yencEnd = sp;
239
241
  break;
240
242
  }
@@ -252,7 +254,7 @@ module.exports = {
252
254
  var warnings = decoderParseLines(lines, ydata);
253
255
 
254
256
  if(!ret.yencEnd) {
255
- var yencEnd = bufferFindRev(data.slice(ret.dataStart), '\r\n=yend ');
257
+ var yencEnd = bufferFindRev(bufferSlice.call(data, ret.dataStart), '\r\n=yend ');
256
258
  if(yencEnd < 0)
257
259
  return DecoderError('no_end_found', 'yEnd end marker not found');
258
260
 
@@ -265,7 +267,7 @@ module.exports = {
265
267
  ret.yencEnd = p;
266
268
  } else
267
269
  ret.yencEnd = p+2;
268
- var endLine = data.slice(yencEnd+2, p).toString(this.encoding).trim();
270
+ var endLine = bufferSlice.call(data, yencEnd+2, p).toString(this.encoding).trim();
269
271
 
270
272
  warnings = warnings.concat(decoderParseLines([endLine], ydata));
271
273
  }
@@ -321,7 +323,7 @@ module.exports = {
321
323
  warnings.push(DecoderWarning('size_mismatch', 'Size specified for part exceeds size specified for whole file'));
322
324
 
323
325
  if(ret.dataStart) {
324
- ret.data = y.decode(data.slice(ret.dataStart, ret.dataEnd), isRaw);
326
+ ret.data = y.decode(bufferSlice.call(data, ret.dataStart, ret.dataEnd), !!isRaw);
325
327
  ret.crc32 = y.crc32(ret.data);
326
328
  var hexCrc = ret.crc32.toString('hex');
327
329
 
@@ -360,7 +362,7 @@ function YEncoder(filename, size, parts, line_size) {
360
362
  this.pos = 0;
361
363
  this.crc = toBuffer([0,0,0,0]);
362
364
 
363
- filename = toBuffer(filename.replace(RE_BADCHAR, '').substr(0, 256), exports.encoding);
365
+ filename = toBuffer(filename.replace(RE_BADCHAR, '').substring(0, 256), exports.encoding);
364
366
  if(parts > 1) {
365
367
  this.yInfo = Buffer.concat([
366
368
  toBuffer(' total='+parts+' line='+line_size+' size='+size+' name='),
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "yencode",
3
- "version": "1.1.2",
3
+ "version": "1.1.4",
4
4
  "description": "SIMD accelerated yEnc encoder/decoder and CRC32 calculator",
5
5
  "keywords": [
6
6
  "yenc",
@@ -21,6 +21,7 @@
21
21
  "install": "node-gyp rebuild"
22
22
  },
23
23
  "gypfile": true,
24
+ "type": "commonjs",
24
25
  "bugs": {
25
26
  "url": "https://github.com/animetosho/node-yencode/issues"
26
27
  },
package/src/common.h CHANGED
@@ -35,36 +35,40 @@
35
35
  #endif
36
36
 
37
37
 
38
+ #include <stdlib.h>
38
39
  #if defined(_MSC_VER) || defined(__MINGW32__) || defined(__MINGW64__)
39
- #include <stdlib.h> // MSVC ARM64 seems to need this
40
+ // MSVC doesn't support C11 aligned_alloc: https://stackoverflow.com/a/62963007
40
41
  #define ALIGN_ALLOC(buf, len, align) *(void**)&(buf) = _aligned_malloc((len), align)
41
42
  #define ALIGN_FREE _aligned_free
42
- #elif defined(__cplusplus) && __cplusplus >= 201100 && !(defined(_MSC_VER) && (defined(__clang__) || defined(_M_ARM64) || defined(_M_ARM))) && !defined(__APPLE__)
43
- // C++11 method
43
+ #elif defined(_ISOC11_SOURCE)
44
+ // C11 method
44
45
  // len needs to be a multiple of alignment, although it sometimes works if it isn't...
45
- #include <cstdlib>
46
46
  #define ALIGN_ALLOC(buf, len, align) *(void**)&(buf) = aligned_alloc(align, ((len) + (align)-1) & ~((align)-1))
47
47
  #define ALIGN_FREE free
48
+ #elif defined(__cplusplus) && __cplusplus >= 201700
49
+ // C++17 method
50
+ #include <cstdlib>
51
+ #define ALIGN_ALLOC(buf, len, align) *(void**)&(buf) = std::aligned_alloc(align, ((len) + (align)-1) & ~((align)-1))
52
+ #define ALIGN_FREE free
48
53
  #else
49
- #include <stdlib.h>
50
54
  #define ALIGN_ALLOC(buf, len, align) if(posix_memalign((void**)&(buf), align, (len))) (buf) = NULL
51
55
  #define ALIGN_FREE free
52
56
  #endif
53
57
 
54
58
 
55
59
  // MSVC compatibility
56
- #if ((defined(_M_IX86_FP) && _M_IX86_FP == 2) || defined(_M_X64)) && !defined(__clang__)
60
+ #if ((defined(_M_IX86_FP) && _M_IX86_FP == 2) || defined(_M_X64)) && defined(_MSC_VER) && !defined(__clang__)
57
61
  #define __SSE2__ 1
58
62
  #define __SSSE3__ 1
59
63
  #define __SSE4_1__ 1
60
- #if defined(_MSC_VER) && _MSC_VER >= 1600
64
+ #if _MSC_VER >= 1600 && defined(__SSE2__)
61
65
  #define __POPCNT__ 1
62
66
  #define __LZCNT__ 1
63
67
  #endif
64
68
  #if !defined(__AVX__) && (_MSC_VER >= 1700 && defined(__SSE2__))
65
69
  #define __AVX__ 1
66
70
  #endif
67
- #if !defined(__AVX2__) && (_MSC_VER >= 1800 && defined(__SSE2__))
71
+ #if !defined(__AVX2__) && (_MSC_VER >= 1800 && defined(__AVX__))
68
72
  #define __AVX2__ 1
69
73
  #define __BMI2__ 1
70
74
  #endif
@@ -141,6 +145,13 @@
141
145
 
142
146
  #endif
143
147
 
148
+ #if defined(__ARM_NEON) && defined(__has_include)
149
+ # if !__has_include(<arm_neon.h>)
150
+ # undef __ARM_NEON
151
+ HEDLEY_WARNING("NEON has been disabled due to missing arm_neon.h");
152
+ # endif
153
+ #endif
154
+
144
155
  #ifdef __ARM_NEON
145
156
  # include <arm_neon.h>
146
157
 
@@ -212,14 +223,15 @@ bool cpu_supports_neon();
212
223
  enum YEncDecIsaLevel {
213
224
  ISA_FEATURE_POPCNT = 0x1,
214
225
  ISA_FEATURE_LZCNT = 0x2,
226
+ ISA_FEATURE_EVEX512 = 0x4, // AVX512 support
215
227
  ISA_LEVEL_SSE2 = 0x100,
216
228
  ISA_LEVEL_SSSE3 = 0x200,
217
229
  ISA_LEVEL_SSE41 = 0x300,
218
230
  ISA_LEVEL_SSE4_POPCNT = 0x301,
219
231
  ISA_LEVEL_AVX = 0x381, // same as above, just used as a differentiator for `cpu_supports_isa`
220
- ISA_LEVEL_AVX2 = 0x383, // also includes BMI1/2 and LZCNT
221
- ISA_LEVEL_AVX3 = 0x403, // SKX variant; AVX512VL + AVX512BW
222
- ISA_LEVEL_VBMI2 = 0x503 // ICL
232
+ ISA_LEVEL_AVX2 = 0x403, // also includes BMI1/2 and LZCNT
233
+ ISA_LEVEL_AVX3 = 0x507, // SKX variant; AVX512VL + AVX512BW
234
+ ISA_LEVEL_VBMI2 = 0x603 // ICL, AVX10
223
235
  };
224
236
  #ifdef _MSC_VER
225
237
  // native tuning not supported in MSVC
@@ -249,16 +261,19 @@ enum YEncDecIsaLevel {
249
261
  # endif
250
262
  #endif
251
263
 
252
- #ifdef _MSC_VER
253
- # define _cpuid1(ar) __cpuid(ar, 1)
254
- #else
255
- # include <cpuid.h>
256
- # define _cpuid1(ar) __cpuid(1, ar[0], ar[1], ar[2], ar[3])
257
- #endif
258
-
259
264
  int cpu_supports_isa();
260
265
  #endif // PLATFORM_X86
261
266
 
267
+
268
+ #ifdef __riscv
269
+ bool cpu_supports_rvv();
270
+ #endif
271
+ #if defined(__riscv_vector) && defined(HEDLEY_GCC_VERSION) && !HEDLEY_GCC_VERSION_CHECK(13,0,0)
272
+ // GCC added RVV intrinsics in GCC13
273
+ # undef __riscv_vector
274
+ #endif
275
+
276
+
262
277
  #include <string.h>
263
278
  #if !defined(_MSC_VER) || defined(_STDINT) || _MSC_VER >= 1900
264
279
  # include <stdint.h>
@@ -270,7 +285,7 @@ int cpu_supports_isa();
270
285
 
271
286
 
272
287
  // GCC 8/9/10(dev) fails to optimize cases where KNOT should be used, so use intrinsic explicitly; Clang 6+ has no issue, but Clang 6/7 doesn't have the intrinsic; MSVC 2019 also fails to optimize this, and the intrinsic is only used there when _MSC_VER >= 1924
273
- #if defined(__GNUC__) && __GNUC__ >= 7
288
+ #if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
274
289
  # define KNOT16 _knot_mask16
275
290
  # define KNOT32 _knot_mask32
276
291
  #else
package/src/crc.cc CHANGED
@@ -3,11 +3,127 @@
3
3
  #include "interface.h"
4
4
  crcutil_interface::CRC* crc = NULL;
5
5
 
6
+ #if defined(PLATFORM_X86) && !defined(__ILP32__)
6
7
  static uint32_t do_crc32_incremental_generic(const void* data, size_t length, uint32_t init) {
8
+ // use optimised ASM on x86 platforms
7
9
  crcutil_interface::UINT64 tmp = init;
8
10
  crc->Compute(data, length, &tmp);
9
11
  return (uint32_t)tmp;
10
12
  }
13
+ #else
14
+ static uint32_t* HEDLEY_RESTRICT crc_slice_table;
15
+ #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
16
+ # if defined(__GNUC__) || defined(__clang__)
17
+ # define bswap32 __builtin_bswap32
18
+ # else
19
+ static inline uint32_t bswap32(uint32_t x) {
20
+ return (x >> 24) | ((x >> 8) & 0x0000FF00) | ((x << 8) & 0x00FF0000) | (x << 24);
21
+ }
22
+ # endif
23
+ #endif
24
+
25
+ #define CRC32_GENERIC_CHAINS 4 // newer processors may prefer 8
26
+ static uint32_t do_crc32_incremental_generic(const void* data, size_t length, uint32_t init) {
27
+ const uint32_t* crc_base_table = crc_slice_table + 4*256; // this also seems to help MSVC's optimiser, which otherwise keeps trying to add to crc_slice_table every time it's referenced
28
+ uint32_t crc[CRC32_GENERIC_CHAINS]; // Clang seems to be more spill happy with an array over individual variables :(
29
+ crc[0] = ~init;
30
+ uint8_t* current8 = (uint8_t*)data;
31
+
32
+ // align to multiple of 4
33
+ if(((uintptr_t)current8 & 1) && length >= 1) {
34
+ crc[0] = (crc[0] >> 8) ^ crc_base_table[(crc[0] & 0xFF) ^ *current8++];
35
+ length--;
36
+ }
37
+ if(((uintptr_t)current8 & 2) && length >= 2) {
38
+ crc[0] = (crc[0] >> 8) ^ crc_base_table[(crc[0] & 0xFF) ^ *current8++];
39
+ crc[0] = (crc[0] >> 8) ^ crc_base_table[(crc[0] & 0xFF) ^ *current8++];
40
+ length -= 2;
41
+ }
42
+
43
+ uint8_t* end8 = current8 + length;
44
+ uint32_t* current = (uint32_t*)current8;
45
+ if(length >= 8*CRC32_GENERIC_CHAINS-4) {
46
+ size_t lenMain = ((length-(CRC32_GENERIC_CHAINS-1)*4) / 4);
47
+ uint32_t* end = current + (lenMain / CRC32_GENERIC_CHAINS) * CRC32_GENERIC_CHAINS;
48
+ for(int c=1; c<CRC32_GENERIC_CHAINS; c++)
49
+ crc[c] = 0;
50
+ while(current != end) {
51
+ #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
52
+ #define CRC_PROC4(v, in) \
53
+ v ^= bswap32(in); \
54
+ v = crc_slice_table[v >> 24] ^ crc_slice_table[0x100L + ((v >> 16) & 0xff)] ^ crc_slice_table[0x200L + ((v >> 8) & 0xff)] ^ crc_slice_table[0x300L + (v & 0xff)]
55
+ #else
56
+ #define CRC_PROC4(v, in) \
57
+ v ^= (in); \
58
+ v = crc_slice_table[v >> 24] ^ crc_slice_table[0x100L + ((v >> 16) & 0xff)] ^ crc_slice_table[0x200L + ((v >> 8) & 0xff)] ^ crc_slice_table[0x300L + (v & 0xff)]
59
+ #endif
60
+ for(int c=0; c<CRC32_GENERIC_CHAINS; c++) {
61
+ CRC_PROC4(crc[c], *current);
62
+ current++;
63
+ }
64
+ }
65
+ // aggregate accumulators
66
+ current8 = (uint8_t*)current;
67
+ #if (CRC32_GENERIC_CHAINS & (CRC32_GENERIC_CHAINS-1)) == 0
68
+ // assume that lengths which are a multiple of 4/8/16/32 are common
69
+ if((end8 - current8) & (CRC32_GENERIC_CHAINS*4)) {
70
+ CRC_PROC4(crc[0], *current);
71
+ current8 += 4;
72
+
73
+ for(int c=1; c<CRC32_GENERIC_CHAINS; c++) {
74
+ for(int i=0; i<4; i++)
75
+ crc[c] = (crc[c] >> 8) ^ crc_base_table[(crc[c] & 0xff) ^ *current8++];
76
+ crc[(c+1) & ~CRC32_GENERIC_CHAINS] ^= crc[c];
77
+ }
78
+ } else
79
+ #endif
80
+ #undef CRC_PROC4
81
+ for(int c=1; c<CRC32_GENERIC_CHAINS; c++) {
82
+ for(int i=0; i<4; i++)
83
+ crc[0] = (crc[0] >> 8) ^ crc_base_table[(crc[0] & 0xff) ^ *current8++];
84
+ crc[0] ^= crc[c];
85
+ }
86
+ }
87
+
88
+ // tail loop
89
+ while(current8 != end8) {
90
+ crc[0] = (crc[0] >> 8) ^ crc_base_table[(crc[0] & 0xFF) ^ *current8++];
91
+ }
92
+ return ~crc[0];
93
+ }
94
+ static void generate_crc32_slice_table() {
95
+ crc_slice_table = (uint32_t*)malloc(5*256*sizeof(uint32_t));
96
+ // generate standard byte-by-byte table
97
+ uint32_t* crc_base_table = crc_slice_table + 4*256;
98
+ for(int v=0; v<256; v++) {
99
+ uint32_t crc = v;
100
+ for(int j = 0; j < 8; j++) {
101
+ crc = (crc >> 1) ^ (-(int32_t)(crc & 1) & 0xEDB88320);
102
+ }
103
+ crc_base_table[v] = crc;
104
+ }
105
+
106
+ // generate slice-by-4 shifted across for X independent chains
107
+ for(int v=0; v<256; v++) {
108
+ uint32_t crc = crc_base_table[v];
109
+ #if CRC32_GENERIC_CHAINS > 1
110
+ for(int i=0; i<4*CRC32_GENERIC_CHAINS-5; i++)
111
+ crc = (crc >> 8) ^ crc_base_table[crc & 0xff];
112
+ for(int i=0; i<4; i++) {
113
+ crc = (crc >> 8) ^ crc_base_table[crc & 0xff];
114
+ crc_slice_table[i*256 + v] = crc;
115
+ }
116
+ #else
117
+ for(int i=0; i<4; i++) {
118
+ crc_slice_table[i*256 + v] = crc;
119
+ crc = (crc >> 8) ^ crc_base_table[crc & 0xff];
120
+ }
121
+ #endif
122
+ }
123
+ }
124
+ #endif
125
+
126
+
11
127
  crc_func _do_crc32_incremental = &do_crc32_incremental_generic;
12
128
 
13
129
 
@@ -25,8 +141,13 @@ uint32_t do_crc32_zeros(uint32_t crc1, size_t len) {
25
141
  }
26
142
 
27
143
  void crc_clmul_set_funcs(crc_func*);
144
+ void crc_clmul256_set_funcs(crc_func*);
28
145
  void crc_arm_set_funcs(crc_func*);
29
146
 
147
+ #ifdef PLATFORM_X86
148
+ int cpu_supports_crc_isa();
149
+ #endif
150
+
30
151
  #if defined(PLATFORM_ARM) && defined(_WIN32)
31
152
  # define WIN32_LEAN_AND_MEAN
32
153
  # include <Windows.h>
@@ -34,22 +155,23 @@ void crc_arm_set_funcs(crc_func*);
34
155
  #ifdef PLATFORM_ARM
35
156
  # ifdef __ANDROID__
36
157
  # include <cpu-features.h>
37
- # elif defined(__linux__) || (defined(__FreeBSD__) && __FreeBSD__ >= 12)
38
- # include <sys/auxv.h>
39
- # include <asm/hwcap.h>
40
- # elif (defined(__FreeBSD__) && __FreeBSD__ < 12)
41
- # include <sys/sysctl.h>
42
- # include <asm/hwcap.h>
43
158
  # elif defined(__APPLE__)
44
159
  # include <sys/types.h>
45
160
  # include <sys/sysctl.h>
46
- # endif
47
- # ifdef __FreeBSD__
161
+ # elif defined(__has_include)
162
+ # if __has_include(<sys/auxv.h>)
163
+ # include <sys/auxv.h>
164
+ # ifdef __FreeBSD__
48
165
  static unsigned long getauxval(unsigned long cap) {
49
166
  unsigned long ret;
50
167
  elf_aux_info(cap, &ret, sizeof(ret));
51
168
  return ret;
52
169
  }
170
+ # endif
171
+ # if __has_include(<asm/hwcap.h>)
172
+ # include <asm/hwcap.h>
173
+ # endif
174
+ # endif
53
175
  # endif
54
176
  #endif
55
177
  void crc_init() {
@@ -57,10 +179,15 @@ void crc_init() {
57
179
  0xEDB88320, 0, 32, true, 0, 0, 0, 0, NULL);
58
180
  // instance never deleted... oh well...
59
181
 
182
+ #if !defined(PLATFORM_X86) || defined(__ILP32__)
183
+ generate_crc32_slice_table();
184
+ #endif
185
+
60
186
  #ifdef PLATFORM_X86
61
- int flags[4];
62
- _cpuid1(flags);
63
- if((flags[2] & 0x80202) == 0x80202) // SSE4.1 + SSSE3 + CLMUL
187
+ int support = cpu_supports_crc_isa();
188
+ if(support == 2)
189
+ crc_clmul256_set_funcs(&_do_crc32_incremental);
190
+ else if(support == 1)
64
191
  crc_clmul_set_funcs(&_do_crc32_incremental);
65
192
  #endif
66
193
  #ifdef PLATFORM_ARM