yencode 1.1.0 → 1.1.3

This diff shows the content of publicly released package versions as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
  #include "common.h"
- #ifdef __ARM_NEON
+ #if defined(__ARM_NEON) && defined(__aarch64__)

  #include "decoder_common.h"

@@ -10,9 +10,9 @@ static struct { char bytes[16]; } ALIGN_TO(16, compactLUT[32768]);
  static uint8_t eqFixLUT[256];


-
- #if !defined(__clang__)
- HEDLEY_ALWAYS_INLINE uint8x16x4_t vld1q_u8_x4(const uint8_t* p) {
+ // AArch64 GCC lacks these functions until 8.5, 9.4 and 10.1 (10.0 unknown)
+ #if !defined(__clang__) && !defined(_MSC_VER) && (!defined(__aarch64__) || !(HEDLEY_GCC_VERSION_CHECK(9,4,0) || (!HEDLEY_GCC_VERSION_CHECK(9,0,0) && HEDLEY_GCC_VERSION_CHECK(8,5,0))))
+ static HEDLEY_ALWAYS_INLINE uint8x16x4_t _vld1q_u8_x4(const uint8_t* p) {
  uint8x16x4_t ret;
  ret.val[0] = vld1q_u8(p);
  ret.val[1] = vld1q_u8(p+16);
@@ -20,12 +20,15 @@ HEDLEY_ALWAYS_INLINE uint8x16x4_t vld1q_u8_x4(const uint8_t* p) {
  ret.val[3] = vld1q_u8(p+48);
  return ret;
  }
- HEDLEY_ALWAYS_INLINE void vst1q_u8_x4(uint8_t* p, uint8x16x4_t data) {
+ static HEDLEY_ALWAYS_INLINE void _vst1q_u8_x4(uint8_t* p, uint8x16x4_t data) {
  vst1q_u8(p, data.val[0]);
  vst1q_u8(p+16, data.val[1]);
  vst1q_u8(p+32, data.val[2]);
  vst1q_u8(p+48, data.val[3]);
  }
+ #else
+ # define _vld1q_u8_x4 vld1q_u8_x4
+ # define _vst1q_u8_x4 vst1q_u8_x4
  #endif


@@ -48,12 +51,14 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
  HEDLEY_ASSUME(escFirst == 0 || escFirst == 1);
  HEDLEY_ASSUME(nextMask == 0 || nextMask == 1 || nextMask == 2);
  uint8x16_t nextMaskMix = vdupq_n_u8(0);
- if(nextMask)
- nextMaskMix[nextMask-1] = nextMask;
- uint8x16_t yencOffset = escFirst ? (uint8x16_t){42+64,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42} : vdupq_n_u8(42);
+ if(nextMask == 1)
+ nextMaskMix = vsetq_lane_u8(1, nextMaskMix, 0);
+ if(nextMask == 2)
+ nextMaskMix = vsetq_lane_u8(2, nextMaskMix, 1);
+ uint8x16_t yencOffset = escFirst ? vmakeq_u8(42+64,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42) : vdupq_n_u8(42);
  long i;
  for(i = -len; i; i += sizeof(uint8x16_t)*4) {
- uint8x16x4_t data = vld1q_u8_x4(src+i);
+ uint8x16x4_t data = _vld1q_u8_x4(src+i);
  uint8x16_t dataA = data.val[0];
  uint8x16_t dataB = data.val[1];
  uint8x16_t dataC = data.val[2];
@@ -66,23 +71,23 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
  cmpEqD = vceqq_u8(dataD, vdupq_n_u8('=')),
  cmpA = vqtbx1q_u8(
  cmpEqA,
- // \n \r
- (uint8x16_t){0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0},
+ // \n \r
+ vmakeq_u8(0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
  dataA
  ),
  cmpB = vqtbx1q_u8(
  cmpEqB,
- (uint8x16_t){0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0},
+ vmakeq_u8(0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
  dataB
  ),
  cmpC = vqtbx1q_u8(
  cmpEqC,
- (uint8x16_t){0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0},
+ vmakeq_u8(0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
  dataC
  ),
  cmpD = vqtbx1q_u8(
  cmpEqD,
- (uint8x16_t){0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0},
+ vmakeq_u8(0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
  dataD
  );
  if(isRaw) cmpA = vorrq_u8(cmpA, nextMaskMix);
@@ -93,22 +98,22 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
  )))) {
  uint8x16_t cmpMerge = vpaddq_u8(
  vpaddq_u8(
- vandq_u8(cmpA, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128}),
- vandq_u8(cmpB, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128})
+ vandq_u8(cmpA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
+ vandq_u8(cmpB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128))
  ),
  vpaddq_u8(
- vandq_u8(cmpC, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128}),
- vandq_u8(cmpD, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128})
+ vandq_u8(cmpC, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
+ vandq_u8(cmpD, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128))
  )
  );
  uint8x16_t cmpEqMerge = vpaddq_u8(
  vpaddq_u8(
- vandq_u8(cmpEqA, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128}),
- vandq_u8(cmpEqB, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128})
+ vandq_u8(cmpEqA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
+ vandq_u8(cmpEqB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128))
  ),
  vpaddq_u8(
- vandq_u8(cmpEqC, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128}),
- vandq_u8(cmpEqD, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128})
+ vandq_u8(cmpEqC, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
+ vandq_u8(cmpEqD, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128))
  )
  );

@@ -225,14 +230,14 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
  break;
  }
  }
- uint8x16_t match2NlDotDMasked = vandq_u8(match2NlDotD, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
+ uint8x16_t match2NlDotDMasked = vandq_u8(match2NlDotD, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
  uint8x16_t mergeKillDots = vpaddq_u8(
  vpaddq_u8(
- vandq_u8(match2NlDotA, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128}),
- vandq_u8(match2NlDotB, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128})
+ vandq_u8(match2NlDotA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
+ vandq_u8(match2NlDotB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128))
  ),
  vpaddq_u8(
- vandq_u8(match2NlDotC, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128}),
+ vandq_u8(match2NlDotC, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
  match2NlDotDMasked
  )
  );
@@ -308,27 +313,27 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon

  uint8x16_t vMaskEqA = vqtbl1q_u8(
  maskEqTemp,
- (uint8x16_t){0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1}
+ vmakeq_u8(0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1)
  );
  maskEqTemp = vextq_u8(maskEqTemp, maskEqTemp, 2);
  uint8x16_t vMaskEqB = vqtbl1q_u8(
  maskEqTemp,
- (uint8x16_t){0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1}
+ vmakeq_u8(0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1)
  );
  maskEqTemp = vextq_u8(maskEqTemp, maskEqTemp, 2);
  uint8x16_t vMaskEqC = vqtbl1q_u8(
  maskEqTemp,
- (uint8x16_t){0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1}
+ vmakeq_u8(0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1)
  );
  maskEqTemp = vextq_u8(maskEqTemp, maskEqTemp, 2);
  uint8x16_t vMaskEqD = vqtbl1q_u8(
  maskEqTemp,
- (uint8x16_t){0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1}
+ vmakeq_u8(0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1)
  );
- vMaskEqA = vtstq_u8(vMaskEqA, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
- vMaskEqB = vtstq_u8(vMaskEqB, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
- vMaskEqC = vtstq_u8(vMaskEqC, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
- vMaskEqD = vtstq_u8(vMaskEqD, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
+ vMaskEqA = vtstq_u8(vMaskEqA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+ vMaskEqB = vtstq_u8(vMaskEqB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+ vMaskEqC = vtstq_u8(vMaskEqC, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+ vMaskEqD = vtstq_u8(vMaskEqD, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));

  dataA = vsubq_u8(
  dataA,
@@ -384,7 +389,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
  )
  );
  }
- yencOffset[0] = (escFirst << 6) | 42;
+ yencOffset = vsetq_lane_u8((escFirst << 6) | 42, yencOffset, 0);

  // all that's left is to 'compress' the data (skip over masked chars)
  uint64_t counts = vget_lane_u64(vreinterpret_u64_u8(vcnt_u8(vget_low_u8(cmpCombined))), 0);
@@ -419,7 +424,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
  dataB = vsubq_u8(dataB, vdupq_n_u8(42));
  dataC = vsubq_u8(dataC, vdupq_n_u8(42));
  dataD = vsubq_u8(dataD, vdupq_n_u8(42));
- vst1q_u8_x4(p, ((uint8x16x4_t){dataA, dataB, dataC, dataD}));
+ _vst1q_u8_x4(p, vcreate4_u8(dataA, dataB, dataC, dataD));
  p += sizeof(uint8x16_t)*4;
  escFirst = 0;
  yencOffset = vdupq_n_u8(42);
@@ -8,7 +8,7 @@
  #endif

  // GCC (ver 6-10(dev)) fails to optimize pure C version of mask testing, but has this intrinsic; Clang >= 7 optimizes C version fine
- #if defined(__GNUC__) && __GNUC__ >= 7
+ #if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
  # define KORTEST16(a, b) !_kortestz_mask16_u8((a), (b))
  # define KAND16(a, b) _kand_mask16((a), (b))
  # define KOR16(a, b) _kor_mask16((a), (b))
@@ -112,15 +112,22 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
  -42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42-64
  ) : _mm_set1_epi8(-42);

- #if defined(__SSSE3__) && !defined(__tune_atom__) && !defined(__tune_slm__) && !defined(__tune_btver1__)
+ #if defined(__SSSE3__) && !defined(__tune_atom__) && !defined(__tune_slm__) && !defined(__tune_btver1__) && !defined(__tune_btver2__)
  const bool _USING_FAST_MATCH = (use_isa >= ISA_LEVEL_SSSE3);
  #else
  const bool _USING_FAST_MATCH = false;
  #endif
- #if defined(__SSE4_1__) && !defined(__tune_slm__) && !defined(__tune_goldmont__) && !defined(__tune_goldmont_plus__)
+ #if defined(__SSE4_1__) && !defined(__tune_slm__) && !defined(__tune_goldmont__) && !defined(__tune_goldmont_plus__) && !defined(__tune_tremont__)
  const bool _USING_BLEND_ADD = (use_isa >= ISA_LEVEL_SSE41);
  #else
  const bool _USING_BLEND_ADD = false;
+ #endif
+ #if defined(__AVX512VL__) && defined(__AVX512BW__)
+ # if defined(_MSC_VER) && !defined(PLATFORM_AMD64) && !defined(__clang__)
+ const bool useAVX3MaskCmp = false;
+ # else
+ const bool useAVX3MaskCmp = (use_isa >= ISA_LEVEL_AVX3);
+ # endif
  #endif

  __m128i lfCompare = _mm_set1_epi8('\n');
@@ -214,7 +221,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
  __mmask16 match2EqMaskA, match2EqMaskB;
  __mmask16 match0CrMaskA, match0CrMaskB;
  __mmask16 match2CrXDtMaskA, match2CrXDtMaskB;
- if(use_isa >= ISA_LEVEL_AVX3 && searchEnd) {
+ if(useAVX3MaskCmp && searchEnd) {
  match2EqMaskA = _mm_cmpeq_epi8_mask(_mm_set1_epi8('='), tmpData2A);
  match2EqMaskB = _mm_cmpeq_epi8_mask(_mm_set1_epi8('='), tmpData2B);
  } else
@@ -230,7 +237,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
  __m128i match2CrXDtA, match2CrXDtB;
  if(isRaw) {
  #if defined(__AVX512VL__) && defined(__AVX512BW__)
- if(use_isa >= ISA_LEVEL_AVX3) {
+ if(useAVX3MaskCmp) {
  match0CrMaskA = _mm_cmpeq_epi8_mask(oDataA, _mm_set1_epi8('\r'));
  match0CrMaskB = _mm_cmpeq_epi8_mask(oDataB, _mm_set1_epi8('\r'));
  match2CrXDtMaskA = _mm_mask_cmpeq_epi8_mask(match0CrMaskA, tmpData2A, _mm_set1_epi8('.'));
@@ -256,7 +263,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
  #if defined(__AVX512VL__) && defined(__AVX512BW__)
  __mmask16 match1NlMaskA, match1NlMaskB;
  __mmask16 match2NlDotMaskA, match2NlDotMaskB;
- if(use_isa >= ISA_LEVEL_AVX3) {
+ if(useAVX3MaskCmp) {
  match1NlMaskA = _mm_mask_cmpeq_epi8_mask(
  match0CrMaskA,
  _mm_set1_epi8('\n'),
@@ -299,7 +306,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long

  int matchEnd;
  #if defined(__AVX512VL__) && defined(__AVX512BW__)
- if(use_isa >= ISA_LEVEL_AVX3) {
+ if(useAVX3MaskCmp) {
  __mmask16 match3EqYMaskA = _mm_mask_cmpeq_epi8_mask(
  match2EqMaskA, _mm_set1_epi8('y'), tmpData3A
  );
@@ -368,12 +375,12 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
  if(LIKELIHOOD(0.001, matchEnd)) {
  // terminator found
  // there's probably faster ways to do this, but reverting to scalar code should be good enough
- len += i;
+ len += (long)i;
  break;
  }
  }
  #if defined(__AVX512VL__) && defined(__AVX512BW__)
- if(use_isa >= ISA_LEVEL_AVX3) {
+ if(useAVX3MaskCmp) {
  mask |= match2NlDotMaskA << 2;
  mask |= (match2NlDotMaskB << 18) & 0xffffffff;
  minMask = _mm_maskz_mov_epi8(~(match2NlDotMaskB>>14), _mm_set1_epi8('.'));
@@ -398,7 +405,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
  __m128i match3EqYA, match3EqYB;
  #if defined(__AVX512VL__) && defined(__AVX512BW__)
  __mmask16 match3EqYMaskA, match3EqYMaskB;
- if(use_isa >= ISA_LEVEL_AVX3) {
+ if(useAVX3MaskCmp) {
  match3EqYMaskA = _mm_mask_cmpeq_epi8_mask(
  match2EqMaskA,
  _mm_set1_epi8('y'),
@@ -434,7 +441,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
  bool endFound;

  #if defined(__AVX512VL__) && defined(__AVX512BW__)
- if(use_isa >= ISA_LEVEL_AVX3) {
+ if(useAVX3MaskCmp) {
  __mmask16 match3LfEqYMaskA = _mm_mask_cmpeq_epi8_mask(
  match3EqYMaskA,
  _mm_set1_epi8('\n'),
@@ -477,7 +484,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
  }

  if(endFound) {
- len += i;
+ len += (long)i;
  break;
  }
  }
@@ -558,7 +565,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
  );

  yencOffset = _mm_xor_si128(_mm_set1_epi8(-42),
- _mm_slli_epi16(_mm_cvtsi32_si128(escFirst), 6)
+ _mm_slli_epi16(_mm_cvtsi32_si128((int)escFirst), 6)
  );
  }
  } else {
@@ -608,7 +615,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
  )
  );
  yencOffset = _mm_xor_si128(_mm_set1_epi8(-42),
- _mm_slli_epi16(_mm_cvtsi32_si128(escFirst), 6)
+ _mm_slli_epi16(_mm_cvtsi32_si128((int)escFirst), 6)
  );
  } else
  #endif
@@ -0,0 +1,30 @@
+ #include "common.h"
+
+ #if defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
+ # include "decoder_common.h"
+ # ifndef YENC_DISABLE_AVX256
+ # include "decoder_avx2_base.h"
+ void decoder_set_vbmi2_funcs() {
+ ALIGN_ALLOC(lookups, sizeof(*lookups), 16);
+ // TODO: consider removing compact LUT
+ decoder_init_lut(lookups->eqFix, lookups->compact);
+ _do_decode = &do_decode_simd<false, false, sizeof(__m256i)*2, do_decode_avx2<false, false, ISA_LEVEL_VBMI2> >;
+ _do_decode_raw = &do_decode_simd<true, false, sizeof(__m256i)*2, do_decode_avx2<true, false, ISA_LEVEL_VBMI2> >;
+ _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m256i)*2, do_decode_avx2<true, true, ISA_LEVEL_VBMI2> >;
+ }
+ # else
+ # include "decoder_sse_base.h"
+ void decoder_set_vbmi2_funcs() {
+ decoder_sse_init();
+ decoder_init_lut(lookups->eqFix, lookups->compact);
+ _do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_VBMI2> >;
+ _do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_VBMI2> >;
+ _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_VBMI2> >;
+ }
+ # endif
+ #else
+ void decoder_set_avx2_funcs();
+ void decoder_set_vbmi2_funcs() {
+ decoder_set_avx2_funcs();
+ }
+ #endif
package/src/encoder.cc CHANGED
@@ -1,7 +1,8 @@
  #include "common.h"
  #include "encoder_common.h"
+ #include "encoder.h"

- size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, bool doEnd) {
+ size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, int doEnd) {
  unsigned char* es = (unsigned char*)src + len;
  unsigned char *p = dest; // destination pointer
  long i = -(long)len; // input position
@@ -119,12 +120,15 @@ size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HED
  }


- size_t (*_do_encode)(int, int*, const unsigned char* HEDLEY_RESTRICT, unsigned char* HEDLEY_RESTRICT, size_t, bool) = &do_encode_generic;
+ extern "C" {
+ size_t (*_do_encode)(int, int*, const unsigned char* HEDLEY_RESTRICT, unsigned char* HEDLEY_RESTRICT, size_t, int) = &do_encode_generic;
+ }

  void encoder_sse2_init();
  void encoder_ssse3_init();
  void encoder_avx_init();
  void encoder_avx2_init();
+ void encoder_vbmi2_init();
  void encoder_neon_init();

  #if defined(PLATFORM_X86) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
@@ -150,7 +154,9 @@ void encoder_init() {
  encoder_native_init();
  # else
  int use_isa = cpu_supports_isa();
- if(use_isa >= ISA_LEVEL_AVX2)
+ if(use_isa >= ISA_LEVEL_VBMI2)
+ encoder_vbmi2_init();
+ else if(use_isa >= ISA_LEVEL_AVX2)
  encoder_avx2_init();
  else if(use_isa >= ISA_LEVEL_AVX)
  encoder_avx_init();
package/src/encoder.h CHANGED
@@ -1,5 +1,21 @@
+ #ifndef __YENC_ENCODER_H
+ #define __YENC_ENCODER_H
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+
+
  #include "hedley.h"

- extern size_t (*_do_encode)(int, int*, const unsigned char* HEDLEY_RESTRICT, unsigned char* HEDLEY_RESTRICT, size_t, bool);
+ extern size_t (*_do_encode)(int, int*, const unsigned char* HEDLEY_RESTRICT, unsigned char* HEDLEY_RESTRICT, size_t, int);
  #define do_encode (*_do_encode)
  void encoder_init();
+
+
+
+ #ifdef __cplusplus
+ }
+ #endif
+ #endif
@@ -6,7 +6,7 @@
  #include "encoder_common.h"
  #define YMM_SIZE 32

- #if defined(__GNUC__) && __GNUC__ >= 7
+ #if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
  # define KLOAD32(a, offs) _load_mask32((__mmask32*)(a) + (offs))
  #else
  # define KLOAD32(a, offs) (((uint32_t*)(a))[(offs)])
@@ -112,7 +112,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
  // last char
  uint32_t eolChar = (use_isa >= ISA_LEVEL_VBMI2 ? lookupsVBMI2->eolLastChar[c] : lookupsAVX2->eolLastChar[c]);
  *(uint32_t*)p = eolChar;
- p += 3 + (eolChar>>27);
+ p += 3 + (uintptr_t)(eolChar>>27);
  col = -line_size+1;
  } else {
  // line overflowed, insert a newline
@@ -215,7 +215,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
  // duplicate halves
  data1A = _mm256_inserti128_si256(dataA, _mm256_castsi256_si128(dataA), 1);
  data1B = _mm256_inserti128_si256(dataB, _mm256_castsi256_si128(dataB), 1);
- #ifdef __tune_znver2__
+ #if defined(__tune_znver2__) || defined(__tune_znver3__)
  data2A = _mm256_permute2x128_si256(dataA, dataA, 0x11);
  data2B = _mm256_permute2x128_si256(dataB, dataB, 0x11);
  #else
@@ -254,7 +254,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
  // we overflowed - find correct position to revert back to
  // this is perhaps sub-optimal on 32-bit, but who still uses that with AVX2?
  uint64_t eqMask;
- int shiftAmt = maskBitsB + YMM_SIZE - col -1;
+ int shiftAmt = (int)(maskBitsB + YMM_SIZE -1 - col);
  if(HEDLEY_UNLIKELY(shiftAmt < 0)) {
  uint32_t eqMask1, eqMask2;
  #if defined(__AVX512VBMI2__) && defined(__AVX512VL__) && defined(__AVX512BW__)
@@ -293,7 +293,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
  asm(
  "shrq $1, %[eqMask] \n"
  "shrq %%cl, %[eqMask] \n"
- "adcq %[col], %[p] \n"
+ "adcq %q[col], %q[p] \n"
  : [eqMask]"+r"(eqMask), [p]"+r"(p)
  : "c"(shiftAmt), [col]"r"(~col)
  );
@@ -320,7 +320,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
  #endif
  {
  i += bitCount;
- unsigned int revert = col + (eqMask & 1);
+ unsigned int revert = (unsigned int)(col + (eqMask & 1));
  p -= revert;
  i -= revert;
  }
@@ -429,7 +429,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
  _encode_eol_handle_pre:
  uint32_t eolChar = (use_isa >= ISA_LEVEL_VBMI2 ? lookupsVBMI2->eolLastChar[es[i]] : lookupsAVX2->eolLastChar[es[i]]);
  *(uint32_t*)p = eolChar;
- p += 3 + (eolChar>>27);
+ p += 3 + (uintptr_t)(eolChar>>27);
  col = lineSizeOffset;

  if(HEDLEY_UNLIKELY(i >= 0)) { // this isn't really a proper check - it's only needed to support short lines; basically, if the line is too short, `i` never gets checked, so we need one somewhere
@@ -556,7 +556,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui

  _mm256_zeroupper();

- *colOffset = col + line_size -1;
+ *colOffset = (int)(col + line_size -1);
  dest = p;
  len = -(i - INPUT_OFFSET);
  }
@@ -8,7 +8,7 @@
  #define _BX _B3(0), _B3(64), _B3(128), _B3(192)

  static const unsigned char escapeLUT[256] = { // whether or not the character is critical
- #define _B(n) ((n == 214 || n == 214+'\r' || n == 214+'\n' || n == '='-42) ? 0 : (n+42) & 0xff)
+ #define _B(n) ((n == 214 || n == '\r'+214 || n == '\n'+214 || n == '='-42) ? 0 : (n+42) & 0xff)
  _BX
  #undef _B
  };
@@ -24,10 +24,10 @@ static const uint16_t escapedLUT[256] = { // escaped sequences for characters th
  #undef _BX


- size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, bool doEnd);
+ size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, int doEnd);

  template<void(&kernel)(int, int*, const uint8_t* HEDLEY_RESTRICT, uint8_t* HEDLEY_RESTRICT&, size_t&)>
- static size_t do_encode_simd(int line_size, int* colOffset, const uint8_t* HEDLEY_RESTRICT src, uint8_t* HEDLEY_RESTRICT dest, size_t len, bool doEnd) {
+ static size_t do_encode_simd(int line_size, int* colOffset, const uint8_t* HEDLEY_RESTRICT src, uint8_t* HEDLEY_RESTRICT dest, size_t len, int doEnd) {
  if(len < 1) return 0;
  if(line_size < 12) { // short lines probably not worth processing in a SIMD way
  // we assume at least the first and last char exist in the line, and since the first char could be escaped, and SIMD encoder assumes at least one non-first/last char, assumption means that line size has to be >= 4