yencode 1.1.3 → 1.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -129,7 +129,8 @@ int decodeTo(Buffer data, Buffer output, bool stripDots=false)
129
129
  Same as above, but instead of returning a Buffer, writes it to the supplied
130
130
  *output* Buffer. Returns the length of the decoded data.
131
131
  Note that the *output* Buffer must be at least large enough to hold the largest
132
- possible output size (i.e. length of the input), otherwise an error is thrown.
132
+ possible output size (i.e. length of the input), otherwise an error is thrown.
133
+ The *data* and *output* Buffers can be the same, for in-situ decoding.
133
134
 
134
135
  Object decodeChunk\(Buffer data \[, string state=null\]\[, Buffer output\]\)
135
136
  -----------------------------------------------------------------------------
@@ -142,7 +143,7 @@ designed to incrementally process a stream from the network, and will perform NN
142
143
  *state* is the current state of the incremental decode. Set it to *null* when starting the decode of a new article; otherwise, set it to the value of *state* returned by the previous invocation of *decodeChunk*.
143
144
  If *output* is supplied, the output will be written here \(see *decodeTo* for notes
144
145
  on required size\), otherwise a new buffer will be created where the output will be
145
- written to.
146
+ written to. The *data* and *output* Buffers can be the same, for in-situ decoding.
146
147
 
147
148
  Returns an object with the following keys:
148
149
 
package/binding.gyp CHANGED
@@ -43,10 +43,20 @@
43
43
  }],
44
44
  ['OS!="win" and enable_native_tuning!=0', {
45
45
  "defines": ["YENC_BUILD_NATIVE=1"]
46
+ }],
47
+ ['OS!="win"', {
48
+ "variables": {
49
+ "missing_memalign%": "<!(<!(echo ${CC_target:-${CC:-cc}}) -c src/test_alignalloc.c -o /dev/null -Werror 2>/dev/null || echo failed)",
50
+ },
51
+ "conditions": [
52
+ ['missing_memalign!=""', {
53
+ "defines": ["_POSIX_C_SOURCE=200112L"],
54
+ }]
55
+ ]
46
56
  }]
47
57
  ],
48
58
  "cflags": ["-Wno-unused-function"],
49
- "cxxflags": ["-Wno-unused-function"],
59
+ "cxxflags": ["-Wno-unused-function", "-std=c++03", "-D_POSIX_C_SOURCE=200112L"],
50
60
  "xcode_settings": {
51
61
  "OTHER_CFLAGS": ["-Wno-unused-function"],
52
62
  "OTHER_CXXFLAGS": ["-Wno-unused-function"]
@@ -64,7 +74,7 @@
64
74
  "targets": [
65
75
  {
66
76
  "target_name": "yencode",
67
- "dependencies": ["crcutil", "yencode_sse2", "yencode_ssse3", "yencode_clmul", "yencode_clmul256", "yencode_avx", "yencode_avx2", "yencode_vbmi2", "yencode_neon", "yencode_armcrc"],
77
+ "dependencies": ["crcutil", "yencode_sse2", "yencode_ssse3", "yencode_clmul", "yencode_clmul256", "yencode_avx", "yencode_avx2", "yencode_vbmi2", "yencode_neon", "yencode_armcrc", "yencode_rvv"],
68
78
  "sources": [
69
79
  "src/yencode.cc",
70
80
  "src/platform.cc",
@@ -221,7 +231,7 @@
221
231
  "msvs_settings": {"VCCLCompilerTool": {"BufferSecurityCheck": "false"}},
222
232
  "conditions": [
223
233
  ['target_arch in "ia32 x64" and OS!="win"', {
224
- "variables": {"supports_vpclmul%": "<!(<!(echo ${CC_target:-${CC:-cc}}) -MM -E src/crc_folding_256.cc -mavx2 -mvpclmulqdq 2>/dev/null || true)"},
234
+ "variables": {"supports_vpclmul%": "<!(<!(echo ${CXX_target:-${CXX:-c++}}) -MM -E src/crc_folding_256.cc -mavx2 -mvpclmulqdq 2>/dev/null || true)"},
225
235
  "conditions": [
226
236
  ['supports_vpclmul!=""', {
227
237
  "cflags": ["-mavx2", "-mvpclmulqdq", "-mpclmul"],
@@ -253,7 +263,10 @@
253
263
  "msvs_settings": {"VCCLCompilerTool": {"BufferSecurityCheck": "false"}},
254
264
  "conditions": [
255
265
  ['target_arch in "ia32 x64" and OS!="win"', {
256
- "variables": {"supports_vbmi2%": "<!(<!(echo ${CC_target:-${CC:-cc}}) -MM -E src/encoder_vbmi2.cc -mavx512vl -mavx512vbmi2 2>/dev/null || true)"},
266
+ "variables": {
267
+ "supports_vbmi2%": "<!(<!(echo ${CXX_target:-${CXX:-c++}}) -MM -E src/encoder_vbmi2.cc -mavx512vl -mavx512vbmi2 2>/dev/null || true)",
268
+ "supports_avx10%": "<!(<!(echo ${CXX_target:-${CXX:-c++}}) -MM -E src/encoder_vbmi2.cc -mavx512vl -mno-evex512 2>/dev/null || true)"
269
+ },
257
270
  "conditions": [
258
271
  ['supports_vbmi2!=""', {
259
272
  "cflags": ["-mavx512vbmi2", "-mavx512vl", "-mavx512bw", "-mpopcnt", "-mbmi", "-mbmi2", "-mlzcnt"],
@@ -262,6 +275,14 @@
262
275
  "OTHER_CFLAGS": ["-mavx512vbmi2", "-mavx512vl", "-mavx512bw", "-mpopcnt", "-mbmi", "-mbmi2", "-mlzcnt"],
263
276
  "OTHER_CXXFLAGS": ["-mavx512vbmi2", "-mavx512vl", "-mavx512bw", "-mpopcnt", "-mbmi", "-mbmi2", "-mlzcnt"],
264
277
  }
278
+ }],
279
+ ['supports_avx10!=""', {
280
+ "cflags": ["-mno-evex512"],
281
+ "cxxflags": ["-mno-evex512"],
282
+ "xcode_settings": {
283
+ "OTHER_CFLAGS": ["-mno-evex512"],
284
+ "OTHER_CXXFLAGS": ["-mno-evex512"],
285
+ }
265
286
  }]
266
287
  ]
267
288
  }],
@@ -285,11 +306,11 @@
285
306
  "msvs_settings": {"VCCLCompilerTool": {"BufferSecurityCheck": "false"}},
286
307
  "conditions": [
287
308
  ['target_arch=="arm"', {
288
- "cflags": ["-mfpu=neon"],
289
- "cxxflags": ["-mfpu=neon"],
309
+ "cflags": ["-mfpu=neon","-fno-lto"],
310
+ "cxxflags": ["-mfpu=neon","-fno-lto"],
290
311
  "xcode_settings": {
291
- "OTHER_CFLAGS": ["-mfpu=neon"],
292
- "OTHER_CXXFLAGS": ["-mfpu=neon"],
312
+ "OTHER_CFLAGS": ["-mfpu=neon","-fno-lto"],
313
+ "OTHER_CXXFLAGS": ["-mfpu=neon","-fno-lto"],
293
314
  }
294
315
  }],
295
316
  ['target_arch=="arm64"', {
@@ -299,6 +320,48 @@
299
320
  }]
300
321
  ]
301
322
  },
323
+ {
324
+ "target_name": "yencode_rvv",
325
+ "type": "static_library",
326
+ "sources": [
327
+ "src/encoder_rvv.cc"
328
+ ],
329
+ "cflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
330
+ "cxxflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
331
+ "xcode_settings": {
332
+ "OTHER_CFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
333
+ "OTHER_CXXFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"]
334
+ },
335
+ "msvs_settings": {"VCCLCompilerTool": {"BufferSecurityCheck": "false"}},
336
+ "conditions": [
337
+ ['target_arch=="riscv64" and OS!="win"', {
338
+ "variables": {"supports_rvv%": "<!(<!(echo ${CXX_target:-${CXX:-c++}}) -MM -E src/encoder_rvv.cc -march=rv64gcv 2>/dev/null || true)"},
339
+ "conditions": [
340
+ ['supports_rvv!=""', {
341
+ "cflags": ["-march=rv64gcv"],
342
+ "cxxflags": ["-march=rv64gcv"],
343
+ "xcode_settings": {
344
+ "OTHER_CFLAGS": ["-march=rv64gcv"],
345
+ "OTHER_CXXFLAGS": ["-march=rv64gcv"],
346
+ }
347
+ }]
348
+ ]
349
+ }],
350
+ ['target_arch=="riscv32" and OS!="win"', {
351
+ "variables": {"supports_rvv%": "<!(<!(echo ${CXX_target:-${CXX:-c++}}) -MM -E src/encoder_rvv.cc -march=rv32gcv 2>/dev/null || true)"},
352
+ "conditions": [
353
+ ['supports_rvv!=""', {
354
+ "cflags": ["-march=rv32gcv"],
355
+ "cxxflags": ["-march=rv32gcv"],
356
+ "xcode_settings": {
357
+ "OTHER_CFLAGS": ["-march=rv32gcv"],
358
+ "OTHER_CXXFLAGS": ["-march=rv32gcv"],
359
+ }
360
+ }]
361
+ ]
362
+ }]
363
+ ]
364
+ },
302
365
  {
303
366
  "target_name": "yencode_armcrc",
304
367
  "type": "static_library",
@@ -326,11 +389,11 @@
326
389
  }
327
390
  }],
328
391
  ['OS!="win" and target_arch=="arm"', {
329
- "cflags": ["-mfpu=fp-armv8"],
330
- "cxxflags": ["-mfpu=fp-armv8"],
392
+ "cflags": ["-mfpu=fp-armv8","-fno-lto"],
393
+ "cxxflags": ["-mfpu=fp-armv8","-fno-lto"],
331
394
  "xcode_settings": {
332
- "OTHER_CFLAGS": ["-mfpu=fp-armv8"],
333
- "OTHER_CXXFLAGS": ["-mfpu=fp-armv8"]
395
+ "OTHER_CFLAGS": ["-mfpu=fp-armv8","-fno-lto"],
396
+ "OTHER_CXXFLAGS": ["-mfpu=fp-armv8","-fno-lto"]
334
397
  }
335
398
  }]
336
399
  ]
package/index.js CHANGED
@@ -3,6 +3,7 @@
3
3
  var y = require('./build/Release/yencode.node');
4
4
 
5
5
  var toBuffer = Buffer.alloc ? Buffer.from : Buffer;
6
+ var bufferSlice = Buffer.prototype.readBigInt64BE ? Buffer.prototype.subarray : Buffer.prototype.slice;
6
7
 
7
8
  var nl = toBuffer([13, 10]);
8
9
  var RE_BADCHAR = /\r\n\0/g;
@@ -76,28 +77,28 @@ var decoderParseLines = function(lines, ydata) {
76
77
  for(var i=0; i<lines.length; i++) {
77
78
  var yprops = {};
78
79
 
79
- var line = lines[i].substr(2); // cut off '=y'
80
+ var line = lines[i].substring(2); // cut off '=y'
80
81
  // parse tag
81
82
  var p = line.indexOf(' ');
82
- var tag = (p<0 ? line : line.substr(0, p));
83
- line = line.substr(tag.length+1).trim();
83
+ var tag = (p<0 ? line : line.substring(0, p));
84
+ line = line.substring(tag.length+1).trim();
84
85
 
85
86
  // parse props
86
87
  var m = line.match(RE_YPROP);
87
88
  while(m) {
88
89
  if(m.index != 0) {
89
- warnings.push(DecoderWarning('ignored_line_data', 'Unknown additional data ignored: "' + line.substr(0, m.index) + '"'));
90
+ warnings.push(DecoderWarning('ignored_line_data', 'Unknown additional data ignored: "' + line.substring(0, m.index) + '"'));
90
91
  }
91
92
  var prop = m[1], val;
92
93
  var valPos = m.index + m[0].length;
93
94
  if(tag == 'begin' && prop == 'name') {
94
95
  // special treatment of filename - the value is the rest of the line (can include spaces)
95
- val = line.substr(valPos);
96
+ val = line.substring(valPos);
96
97
  line = '';
97
98
  } else {
98
99
  p = line.indexOf(' ', valPos);
99
- val = (p<0 ? line.substr(valPos) : line.substr(valPos, p-valPos));
100
- line = line.substr(valPos + val.length +1);
100
+ val = (p<0 ? line.substring(valPos) : line.substring(valPos, p));
101
+ line = line.substring(valPos + val.length +1);
101
102
  }
102
103
  if(prop in yprops) {
103
104
  warnings.push(DecoderWarning('duplicate_property', 'Duplicate property encountered: `' + prop + '`'));
@@ -139,7 +140,7 @@ module.exports = {
139
140
  prev = '\r\n';
140
141
 
141
142
  if(Buffer.isBuffer(prev)) prev = prev.toString();
142
- prev = prev.substr(-4); // only care about the last 4 chars of previous state
143
+ prev = prev.slice(-4); // only care about the last 4 chars of previous state
143
144
  if(prev == '\r\n.=') prev = '\r\n='; // aliased after dot stripped
144
145
  if(data.length == 0) return {
145
146
  read: 0,
@@ -151,7 +152,7 @@ module.exports = {
151
152
  var state = decodePrev.indexOf(prev);
152
153
  if(state < 0) {
153
154
  for(var l=-3; l<0; i++) {
154
- state = decodePrev.indexOf(prev.substr(l));
155
+ state = decodePrev.indexOf(prev.slice(l));
155
156
  if(state >= 0) break;
156
157
  }
157
158
  if(state < 0) state = decodePrev.indexOf('');
@@ -195,12 +196,13 @@ module.exports = {
195
196
 
196
197
  if(!Buffer.isBuffer(data)) data = toBuffer(data);
197
198
 
198
- filename = toBuffer(filename.replace(RE_BADCHAR, '').substr(0, 256), exports.encoding);
199
+ filename = toBuffer(filename.replace(RE_BADCHAR, '').substring(0, 256), exports.encoding);
200
+ var e = encodeCrc(data, line_size);
199
201
  return Buffer.concat([
200
202
  toBuffer('=ybegin line='+line_size+' size='+data.length+' name='),
201
203
  filename, nl,
202
- y.encode(data, line_size),
203
- toBuffer('\r\n=yend size='+data.length+' crc32=' + y.crc32(data).toString('hex'))
204
+ e.output,
205
+ toBuffer('\r\n=yend size='+data.length+' crc32=' + e.crc32.toString('hex'))
204
206
  ]);
205
207
  },
206
208
  multi_post: function(filename, size, parts, line_size) {
@@ -214,7 +216,7 @@ module.exports = {
214
216
 
215
217
  // find '=ybegin' to know where the yEnc data starts
216
218
  var yencStart;
217
- if(data.slice(0, 8).toString('hex') == '3d79626567696e20' /*=ybegin */) {
219
+ if(bufferSlice.call(data, 0, 8).toString('hex') == '3d79626567696e20' /*=ybegin */) {
218
220
  // common case: starts right at the beginning
219
221
  yencStart = 0;
220
222
  } else {
@@ -231,10 +233,10 @@ module.exports = {
231
233
  var sp = yencStart;
232
234
  var p = bufferFind(data, '\r\n', yencStart+8);
233
235
  while(p > 0) {
234
- var line = data.slice(sp, p).toString(this.encoding).trim();
236
+ var line = bufferSlice.call(data, sp, p).toString(this.encoding).trim();
235
237
  lines.push(line);
236
238
  sp = p+2;
237
- if(line.substr(0, 6) == '=yend ') { // no data in post
239
+ if(line.substring(0, 6) == '=yend ') { // no data in post
238
240
  ret.yencEnd = sp;
239
241
  break;
240
242
  }
@@ -252,7 +254,7 @@ module.exports = {
252
254
  var warnings = decoderParseLines(lines, ydata);
253
255
 
254
256
  if(!ret.yencEnd) {
255
- var yencEnd = bufferFindRev(data.slice(ret.dataStart), '\r\n=yend ');
257
+ var yencEnd = bufferFindRev(bufferSlice.call(data, ret.dataStart), '\r\n=yend ');
256
258
  if(yencEnd < 0)
257
259
  return DecoderError('no_end_found', 'yEnd end marker not found');
258
260
 
@@ -265,7 +267,7 @@ module.exports = {
265
267
  ret.yencEnd = p;
266
268
  } else
267
269
  ret.yencEnd = p+2;
268
- var endLine = data.slice(yencEnd+2, p).toString(this.encoding).trim();
270
+ var endLine = bufferSlice.call(data, yencEnd+2, p).toString(this.encoding).trim();
269
271
 
270
272
  warnings = warnings.concat(decoderParseLines([endLine], ydata));
271
273
  }
@@ -321,7 +323,7 @@ module.exports = {
321
323
  warnings.push(DecoderWarning('size_mismatch', 'Size specified for part exceeds size specified for whole file'));
322
324
 
323
325
  if(ret.dataStart) {
324
- ret.data = y.decode(data.slice(ret.dataStart, ret.dataEnd), isRaw);
326
+ ret.data = y.decode(bufferSlice.call(data, ret.dataStart, ret.dataEnd), !!isRaw);
325
327
  ret.crc32 = y.crc32(ret.data);
326
328
  var hexCrc = ret.crc32.toString('hex');
327
329
 
@@ -360,7 +362,7 @@ function YEncoder(filename, size, parts, line_size) {
360
362
  this.pos = 0;
361
363
  this.crc = toBuffer([0,0,0,0]);
362
364
 
363
- filename = toBuffer(filename.replace(RE_BADCHAR, '').substr(0, 256), exports.encoding);
365
+ filename = toBuffer(filename.replace(RE_BADCHAR, '').substring(0, 256), exports.encoding);
364
366
  if(parts > 1) {
365
367
  this.yInfo = Buffer.concat([
366
368
  toBuffer(' total='+parts+' line='+line_size+' size='+size+' name='),
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "yencode",
3
- "version": "1.1.3",
3
+ "version": "1.1.4",
4
4
  "description": "SIMD accelerated yEnc encoder/decoder and CRC32 calculator",
5
5
  "keywords": [
6
6
  "yenc",
@@ -21,6 +21,7 @@
21
21
  "install": "node-gyp rebuild"
22
22
  },
23
23
  "gypfile": true,
24
+ "type": "commonjs",
24
25
  "bugs": {
25
26
  "url": "https://github.com/animetosho/node-yencode/issues"
26
27
  },
package/src/common.h CHANGED
@@ -57,18 +57,18 @@
57
57
 
58
58
 
59
59
  // MSVC compatibility
60
- #if ((defined(_M_IX86_FP) && _M_IX86_FP == 2) || defined(_M_X64)) && !defined(__clang__)
60
+ #if ((defined(_M_IX86_FP) && _M_IX86_FP == 2) || defined(_M_X64)) && defined(_MSC_VER) && !defined(__clang__)
61
61
  #define __SSE2__ 1
62
62
  #define __SSSE3__ 1
63
63
  #define __SSE4_1__ 1
64
- #if defined(_MSC_VER) && _MSC_VER >= 1600
64
+ #if _MSC_VER >= 1600 && defined(__SSE2__)
65
65
  #define __POPCNT__ 1
66
66
  #define __LZCNT__ 1
67
67
  #endif
68
68
  #if !defined(__AVX__) && (_MSC_VER >= 1700 && defined(__SSE2__))
69
69
  #define __AVX__ 1
70
70
  #endif
71
- #if !defined(__AVX2__) && (_MSC_VER >= 1800 && defined(__SSE2__))
71
+ #if !defined(__AVX2__) && (_MSC_VER >= 1800 && defined(__AVX__))
72
72
  #define __AVX2__ 1
73
73
  #define __BMI2__ 1
74
74
  #endif
@@ -145,6 +145,13 @@
145
145
 
146
146
  #endif
147
147
 
148
+ #if defined(__ARM_NEON) && defined(__has_include)
149
+ # if !__has_include(<arm_neon.h>)
150
+ # undef __ARM_NEON
151
+ HEDLEY_WARNING("NEON has been disabled due to missing arm_neon.h");
152
+ # endif
153
+ #endif
154
+
148
155
  #ifdef __ARM_NEON
149
156
  # include <arm_neon.h>
150
157
 
@@ -216,14 +223,15 @@ bool cpu_supports_neon();
216
223
  enum YEncDecIsaLevel {
217
224
  ISA_FEATURE_POPCNT = 0x1,
218
225
  ISA_FEATURE_LZCNT = 0x2,
226
+ ISA_FEATURE_EVEX512 = 0x4, // AVX512 support
219
227
  ISA_LEVEL_SSE2 = 0x100,
220
228
  ISA_LEVEL_SSSE3 = 0x200,
221
229
  ISA_LEVEL_SSE41 = 0x300,
222
230
  ISA_LEVEL_SSE4_POPCNT = 0x301,
223
231
  ISA_LEVEL_AVX = 0x381, // same as above, just used as a differentiator for `cpu_supports_isa`
224
232
  ISA_LEVEL_AVX2 = 0x403, // also includes BMI1/2 and LZCNT
225
- ISA_LEVEL_AVX3 = 0x503, // SKX variant; AVX512VL + AVX512BW
226
- ISA_LEVEL_VBMI2 = 0x603 // ICL
233
+ ISA_LEVEL_AVX3 = 0x507, // SKX variant; AVX512VL + AVX512BW
234
+ ISA_LEVEL_VBMI2 = 0x603 // ICL, AVX10
227
235
  };
228
236
  #ifdef _MSC_VER
229
237
  // native tuning not supported in MSVC
@@ -256,6 +264,16 @@ enum YEncDecIsaLevel {
256
264
  int cpu_supports_isa();
257
265
  #endif // PLATFORM_X86
258
266
 
267
+
268
+ #ifdef __riscv
269
+ bool cpu_supports_rvv();
270
+ #endif
271
+ #if defined(__riscv_vector) && defined(HEDLEY_GCC_VERSION) && !HEDLEY_GCC_VERSION_CHECK(13,0,0)
272
+ // GCC added RVV intrinsics in GCC13
273
+ # undef __riscv_vector
274
+ #endif
275
+
276
+
259
277
  #include <string.h>
260
278
  #if !defined(_MSC_VER) || defined(_STDINT) || _MSC_VER >= 1900
261
279
  # include <stdint.h>
package/src/crc.cc CHANGED
@@ -3,11 +3,127 @@
3
3
  #include "interface.h"
4
4
  crcutil_interface::CRC* crc = NULL;
5
5
 
6
+ #if defined(PLATFORM_X86) && !defined(__ILP32__)
6
7
  static uint32_t do_crc32_incremental_generic(const void* data, size_t length, uint32_t init) {
8
+ // use optimised ASM on x86 platforms
7
9
  crcutil_interface::UINT64 tmp = init;
8
10
  crc->Compute(data, length, &tmp);
9
11
  return (uint32_t)tmp;
10
12
  }
13
+ #else
14
+ static uint32_t* HEDLEY_RESTRICT crc_slice_table;
15
+ #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
16
+ # if defined(__GNUC__) || defined(__clang__)
17
+ # define bswap32 __builtin_bswap32
18
+ # else
19
+ static inline uint32_t bswap32(uint32_t x) {
20
+ return (x >> 24) | ((x >> 8) & 0x0000FF00) | ((x << 8) & 0x00FF0000) | (x << 24);
21
+ }
22
+ # endif
23
+ #endif
24
+
25
+ #define CRC32_GENERIC_CHAINS 4 // newer processors may prefer 8
26
+ static uint32_t do_crc32_incremental_generic(const void* data, size_t length, uint32_t init) {
27
+ const uint32_t* crc_base_table = crc_slice_table + 4*256; // this also seems to help MSVC's optimiser, which otherwise keeps trying to add to crc_slice_table every time it's referenced
28
+ uint32_t crc[CRC32_GENERIC_CHAINS]; // Clang seems to be more spill happy with an array over individual variables :(
29
+ crc[0] = ~init;
30
+ uint8_t* current8 = (uint8_t*)data;
31
+
32
+ // align to multiple of 4
33
+ if(((uintptr_t)current8 & 1) && length >= 1) {
34
+ crc[0] = (crc[0] >> 8) ^ crc_base_table[(crc[0] & 0xFF) ^ *current8++];
35
+ length--;
36
+ }
37
+ if(((uintptr_t)current8 & 2) && length >= 2) {
38
+ crc[0] = (crc[0] >> 8) ^ crc_base_table[(crc[0] & 0xFF) ^ *current8++];
39
+ crc[0] = (crc[0] >> 8) ^ crc_base_table[(crc[0] & 0xFF) ^ *current8++];
40
+ length -= 2;
41
+ }
42
+
43
+ uint8_t* end8 = current8 + length;
44
+ uint32_t* current = (uint32_t*)current8;
45
+ if(length >= 8*CRC32_GENERIC_CHAINS-4) {
46
+ size_t lenMain = ((length-(CRC32_GENERIC_CHAINS-1)*4) / 4);
47
+ uint32_t* end = current + (lenMain / CRC32_GENERIC_CHAINS) * CRC32_GENERIC_CHAINS;
48
+ for(int c=1; c<CRC32_GENERIC_CHAINS; c++)
49
+ crc[c] = 0;
50
+ while(current != end) {
51
+ #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
52
+ #define CRC_PROC4(v, in) \
53
+ v ^= bswap32(in); \
54
+ v = crc_slice_table[v >> 24] ^ crc_slice_table[0x100L + ((v >> 16) & 0xff)] ^ crc_slice_table[0x200L + ((v >> 8) & 0xff)] ^ crc_slice_table[0x300L + (v & 0xff)]
55
+ #else
56
+ #define CRC_PROC4(v, in) \
57
+ v ^= (in); \
58
+ v = crc_slice_table[v >> 24] ^ crc_slice_table[0x100L + ((v >> 16) & 0xff)] ^ crc_slice_table[0x200L + ((v >> 8) & 0xff)] ^ crc_slice_table[0x300L + (v & 0xff)]
59
+ #endif
60
+ for(int c=0; c<CRC32_GENERIC_CHAINS; c++) {
61
+ CRC_PROC4(crc[c], *current);
62
+ current++;
63
+ }
64
+ }
65
+ // aggregate accumulators
66
+ current8 = (uint8_t*)current;
67
+ #if (CRC32_GENERIC_CHAINS & (CRC32_GENERIC_CHAINS-1)) == 0
68
+ // assume that lengths which are a multiple of 4/8/16/32 are common
69
+ if((end8 - current8) & (CRC32_GENERIC_CHAINS*4)) {
70
+ CRC_PROC4(crc[0], *current);
71
+ current8 += 4;
72
+
73
+ for(int c=1; c<CRC32_GENERIC_CHAINS; c++) {
74
+ for(int i=0; i<4; i++)
75
+ crc[c] = (crc[c] >> 8) ^ crc_base_table[(crc[c] & 0xff) ^ *current8++];
76
+ crc[(c+1) & ~CRC32_GENERIC_CHAINS] ^= crc[c];
77
+ }
78
+ } else
79
+ #endif
80
+ #undef CRC_PROC4
81
+ for(int c=1; c<CRC32_GENERIC_CHAINS; c++) {
82
+ for(int i=0; i<4; i++)
83
+ crc[0] = (crc[0] >> 8) ^ crc_base_table[(crc[0] & 0xff) ^ *current8++];
84
+ crc[0] ^= crc[c];
85
+ }
86
+ }
87
+
88
+ // tail loop
89
+ while(current8 != end8) {
90
+ crc[0] = (crc[0] >> 8) ^ crc_base_table[(crc[0] & 0xFF) ^ *current8++];
91
+ }
92
+ return ~crc[0];
93
+ }
94
+ static void generate_crc32_slice_table() {
95
+ crc_slice_table = (uint32_t*)malloc(5*256*sizeof(uint32_t));
96
+ // generate standard byte-by-byte table
97
+ uint32_t* crc_base_table = crc_slice_table + 4*256;
98
+ for(int v=0; v<256; v++) {
99
+ uint32_t crc = v;
100
+ for(int j = 0; j < 8; j++) {
101
+ crc = (crc >> 1) ^ (-(int32_t)(crc & 1) & 0xEDB88320);
102
+ }
103
+ crc_base_table[v] = crc;
104
+ }
105
+
106
+ // generate slice-by-4 shifted across for X independent chains
107
+ for(int v=0; v<256; v++) {
108
+ uint32_t crc = crc_base_table[v];
109
+ #if CRC32_GENERIC_CHAINS > 1
110
+ for(int i=0; i<4*CRC32_GENERIC_CHAINS-5; i++)
111
+ crc = (crc >> 8) ^ crc_base_table[crc & 0xff];
112
+ for(int i=0; i<4; i++) {
113
+ crc = (crc >> 8) ^ crc_base_table[crc & 0xff];
114
+ crc_slice_table[i*256 + v] = crc;
115
+ }
116
+ #else
117
+ for(int i=0; i<4; i++) {
118
+ crc_slice_table[i*256 + v] = crc;
119
+ crc = (crc >> 8) ^ crc_base_table[crc & 0xff];
120
+ }
121
+ #endif
122
+ }
123
+ }
124
+ #endif
125
+
126
+
11
127
  crc_func _do_crc32_incremental = &do_crc32_incremental_generic;
12
128
 
13
129
 
@@ -39,22 +155,23 @@ int cpu_supports_crc_isa();
39
155
  #ifdef PLATFORM_ARM
40
156
  # ifdef __ANDROID__
41
157
  # include <cpu-features.h>
42
- # elif defined(__linux__) || (defined(__FreeBSD__) && __FreeBSD__ >= 12)
43
- # include <sys/auxv.h>
44
- # include <asm/hwcap.h>
45
- # elif (defined(__FreeBSD__) && __FreeBSD__ < 12)
46
- # include <sys/sysctl.h>
47
- # include <asm/hwcap.h>
48
158
  # elif defined(__APPLE__)
49
159
  # include <sys/types.h>
50
160
  # include <sys/sysctl.h>
51
- # endif
52
- # ifdef __FreeBSD__
161
+ # elif defined(__has_include)
162
+ # if __has_include(<sys/auxv.h>)
163
+ # include <sys/auxv.h>
164
+ # ifdef __FreeBSD__
53
165
  static unsigned long getauxval(unsigned long cap) {
54
166
  unsigned long ret;
55
167
  elf_aux_info(cap, &ret, sizeof(ret));
56
168
  return ret;
57
169
  }
170
+ # endif
171
+ # if __has_include(<asm/hwcap.h>)
172
+ # include <asm/hwcap.h>
173
+ # endif
174
+ # endif
58
175
  # endif
59
176
  #endif
60
177
  void crc_init() {
@@ -62,6 +179,10 @@ void crc_init() {
62
179
  0xEDB88320, 0, 32, true, 0, 0, 0, 0, NULL);
63
180
  // instance never deleted... oh well...
64
181
 
182
+ #if !defined(PLATFORM_X86) || defined(__ILP32__)
183
+ generate_crc32_slice_table();
184
+ #endif
185
+
65
186
  #ifdef PLATFORM_X86
66
187
  int support = cpu_supports_crc_isa();
67
188
  if(support == 2)
package/src/crc_arm.cc CHANGED
@@ -16,6 +16,12 @@ HEDLEY_WARNING("CRC32 acceleration has been disabled due to broken arm_acle.h sh
16
16
  HEDLEY_WARNING("CRC32 acceleration has been disabled due to broken arm_acle.h shipped in GCC 9.4 [https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100985]. If you need this feature, please use a different compiler or version of GCC");
17
17
  # endif
18
18
  #endif
19
+ #if defined(__ARM_FEATURE_CRC32) && defined(__has_include)
20
+ # if !__has_include(<arm_acle.h>)
21
+ # undef __ARM_FEATURE_CRC32
22
+ HEDLEY_WARNING("CRC32 acceleration has been disabled due to missing arm_acle.h");
23
+ # endif
24
+ #endif
19
25
 
20
26
  #if defined(__ARM_FEATURE_CRC32) || (defined(_M_ARM64) && !defined(__clang__)) // MSVC doesn't support CRC for ARM32
21
27
 
@@ -73,7 +79,7 @@ static HEDLEY_ALWAYS_INLINE uint32_t crc_multiply(uint32_t a, uint32_t b) {
73
79
  return res;
74
80
  }
75
81
 
76
- static const uint32_t crc_power[] = { // pre-computed 2^n, with first 3 entries removed (saves a shift)
82
+ static const uint32_t crc_power[] = { // pre-computed 2^(2^n), with first 3 entries removed (saves a shift)
77
83
  0x00800000, 0x00008000, 0xedb88320, 0xb1e6b092, 0xa06a2517, 0xed627dae, 0x88d14467, 0xd7bbfe6a,
78
84
  0xec447f11, 0x8e7ea170, 0x6427800e, 0x4d47bae0, 0x09fe548f, 0x83852d0f, 0x30362f1a, 0x7b5a9cc3,
79
85
  0x31fec169, 0x9fec022a, 0x6c8dedc4, 0x15d6874d, 0x5fde7a4e, 0xbad90e37, 0x2e4e5eef, 0x4eaba214,
@@ -26,10 +26,9 @@ static __m256i do_one_fold(__m256i src, __m256i data) {
26
26
  0x96
27
27
  );
28
28
  #else
29
- return _mm256_xor_si256(data, _mm256_xor_si256(
30
- _mm256_clmulepi64_epi128(src, fold4, 0x01),
31
- _mm256_clmulepi64_epi128(src, fold4, 0x10)
32
- ));
29
+ return _mm256_xor_si256(_mm256_xor_si256(
30
+ data, _mm256_clmulepi64_epi128(src, fold4, 0x01)
31
+ ), _mm256_clmulepi64_epi128(src, fold4, 0x10));
33
32
  #endif
34
33
  }
35
34
 
@@ -38,7 +37,7 @@ ALIGN_TO(32, static const uint8_t pshufb_rot_table[]) = {
38
37
  16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
39
38
  };
40
39
  // _mm256_castsi128_si256, but upper is defined to be 0
41
- #if (defined(__clang__) && __clang_major__ >= 5 && (!defined(__APPLE__) || __clang_major__ >= 7)) || (defined(__GNUC__) && __GNUC__ >= 10)
40
+ #if (defined(__clang__) && __clang_major__ >= 5 && (!defined(__APPLE__) || __clang_major__ >= 7)) || (defined(__GNUC__) && __GNUC__ >= 10) || (defined(_MSC_VER) && _MSC_VER >= 1910)
42
41
  // intrinsic unsupported in GCC 9 and MSVC < 2017
43
42
  # define zext128_256 _mm256_zextsi128_si256
44
43
  #else
package/src/decoder.cc CHANGED
@@ -4,9 +4,9 @@
4
4
  #include "decoder.h"
5
5
 
6
6
  extern "C" {
7
- YencDecoderEnd (*_do_decode)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*) = &do_decode_scalar<false, false>;
8
- YencDecoderEnd (*_do_decode_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*) = &do_decode_scalar<true, false>;
9
- YencDecoderEnd (*_do_decode_end_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*) = &do_decode_end_scalar<true>;
7
+ YencDecoderEnd (*_do_decode)(const unsigned char**, unsigned char**, size_t, YencDecoderState*) = &do_decode_scalar<false, false>;
8
+ YencDecoderEnd (*_do_decode_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*) = &do_decode_scalar<true, false>;
9
+ YencDecoderEnd (*_do_decode_end_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*) = &do_decode_end_scalar<true>;
10
10
  }
11
11
 
12
12
  void decoder_set_sse2_funcs();
@@ -14,6 +14,7 @@ void decoder_set_ssse3_funcs();
14
14
  void decoder_set_avx_funcs();
15
15
  void decoder_set_avx2_funcs();
16
16
  void decoder_set_vbmi2_funcs();
17
+ extern const bool decoder_has_avx10;
17
18
  void decoder_set_neon_funcs();
18
19
 
19
20
 
@@ -45,7 +46,7 @@ void decoder_init() {
45
46
  decoder_set_native_funcs();
46
47
  # else
47
48
  int use_isa = cpu_supports_isa();
48
- if(use_isa >= ISA_LEVEL_VBMI2)
49
+ if(use_isa >= ISA_LEVEL_VBMI2 && (decoder_has_avx10 || (use_isa & ISA_FEATURE_EVEX512)))
49
50
  decoder_set_vbmi2_funcs();
50
51
  else if(use_isa >= ISA_LEVEL_AVX2)
51
52
  decoder_set_avx2_funcs();
package/src/decoder.h CHANGED
@@ -29,17 +29,17 @@ typedef enum {
29
29
 
30
30
  #include "hedley.h"
31
31
 
32
- extern YencDecoderEnd (*_do_decode)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*);
33
- extern YencDecoderEnd (*_do_decode_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*);
34
- extern YencDecoderEnd (*_do_decode_end_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*);
32
+ extern YencDecoderEnd (*_do_decode)(const unsigned char**, unsigned char**, size_t, YencDecoderState*);
33
+ extern YencDecoderEnd (*_do_decode_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*);
34
+ extern YencDecoderEnd (*_do_decode_end_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*);
35
35
 
36
- static inline size_t do_decode(int isRaw, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, YencDecoderState* state) {
36
+ static inline size_t do_decode(int isRaw, const unsigned char* src, unsigned char* dest, size_t len, YencDecoderState* state) {
37
37
  unsigned char* ds = dest;
38
38
  (*(isRaw ? _do_decode_raw : _do_decode))(&src, &ds, len, state);
39
39
  return ds - dest;
40
40
  }
41
41
 
42
- static inline YencDecoderEnd do_decode_end(const unsigned char*HEDLEY_RESTRICT* src, unsigned char*HEDLEY_RESTRICT* dest, size_t len, YencDecoderState* state) {
42
+ static inline YencDecoderEnd do_decode_end(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
43
43
  return _do_decode_end_raw(src, dest, len, state);
44
44
  }
45
45