yencode 1.1.1 → 1.1.2

This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
  #include "common.h"
- #ifdef __ARM_NEON
+ #if defined(__ARM_NEON) && defined(__aarch64__)

  #include "decoder_common.h"

@@ -11,8 +11,8 @@ static uint8_t eqFixLUT[256];



- #if !defined(__clang__)
- HEDLEY_ALWAYS_INLINE uint8x16x4_t vld1q_u8_x4(const uint8_t* p) {
+ #if !defined(__clang__) && !defined(_MSC_VER) && (!defined(__aarch64__) || !HEDLEY_GCC_VERSION_CHECK(10,0,0))
+ static HEDLEY_ALWAYS_INLINE uint8x16x4_t vld1q_u8_x4(const uint8_t* p) {
  uint8x16x4_t ret;
  ret.val[0] = vld1q_u8(p);
  ret.val[1] = vld1q_u8(p+16);
@@ -20,7 +20,7 @@ HEDLEY_ALWAYS_INLINE uint8x16x4_t vld1q_u8_x4(const uint8_t* p) {
  ret.val[3] = vld1q_u8(p+48);
  return ret;
  }
- HEDLEY_ALWAYS_INLINE void vst1q_u8_x4(uint8_t* p, uint8x16x4_t data) {
+ static HEDLEY_ALWAYS_INLINE void vst1q_u8_x4(uint8_t* p, uint8x16x4_t data) {
  vst1q_u8(p, data.val[0]);
  vst1q_u8(p+16, data.val[1]);
  vst1q_u8(p+32, data.val[2]);
@@ -48,9 +48,11 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
  HEDLEY_ASSUME(escFirst == 0 || escFirst == 1);
  HEDLEY_ASSUME(nextMask == 0 || nextMask == 1 || nextMask == 2);
  uint8x16_t nextMaskMix = vdupq_n_u8(0);
- if(nextMask)
- nextMaskMix[nextMask-1] = nextMask;
- uint8x16_t yencOffset = escFirst ? (uint8x16_t){42+64,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42} : vdupq_n_u8(42);
+ if(nextMask == 1)
+ nextMaskMix = vsetq_lane_u8(1, nextMaskMix, 0);
+ if(nextMask == 2)
+ nextMaskMix = vsetq_lane_u8(2, nextMaskMix, 1);
+ uint8x16_t yencOffset = escFirst ? vmakeq_u8(42+64,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42) : vdupq_n_u8(42);
  long i;
  for(i = -len; i; i += sizeof(uint8x16_t)*4) {
  uint8x16x4_t data = vld1q_u8_x4(src+i);
@@ -66,23 +68,23 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
  cmpEqD = vceqq_u8(dataD, vdupq_n_u8('=')),
  cmpA = vqtbx1q_u8(
  cmpEqA,
- // \n \r
- (uint8x16_t){0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0},
+ // \n \r
+ vmakeq_u8(0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
  dataA
  ),
  cmpB = vqtbx1q_u8(
  cmpEqB,
- (uint8x16_t){0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0},
+ vmakeq_u8(0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
  dataB
  ),
  cmpC = vqtbx1q_u8(
  cmpEqC,
- (uint8x16_t){0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0},
+ vmakeq_u8(0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
  dataC
  ),
  cmpD = vqtbx1q_u8(
  cmpEqD,
- (uint8x16_t){0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0},
+ vmakeq_u8(0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
  dataD
  );
  if(isRaw) cmpA = vorrq_u8(cmpA, nextMaskMix);
@@ -93,22 +95,22 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
  )))) {
  uint8x16_t cmpMerge = vpaddq_u8(
  vpaddq_u8(
- vandq_u8(cmpA, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128}),
- vandq_u8(cmpB, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128})
+ vandq_u8(cmpA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
+ vandq_u8(cmpB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128))
  ),
  vpaddq_u8(
- vandq_u8(cmpC, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128}),
- vandq_u8(cmpD, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128})
+ vandq_u8(cmpC, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
+ vandq_u8(cmpD, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128))
  )
  );
  uint8x16_t cmpEqMerge = vpaddq_u8(
  vpaddq_u8(
- vandq_u8(cmpEqA, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128}),
- vandq_u8(cmpEqB, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128})
+ vandq_u8(cmpEqA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
+ vandq_u8(cmpEqB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128))
  ),
  vpaddq_u8(
- vandq_u8(cmpEqC, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128}),
- vandq_u8(cmpEqD, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128})
+ vandq_u8(cmpEqC, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
+ vandq_u8(cmpEqD, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128))
  )
  );

@@ -225,14 +227,14 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
  break;
  }
  }
- uint8x16_t match2NlDotDMasked = vandq_u8(match2NlDotD, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
+ uint8x16_t match2NlDotDMasked = vandq_u8(match2NlDotD, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
  uint8x16_t mergeKillDots = vpaddq_u8(
  vpaddq_u8(
- vandq_u8(match2NlDotA, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128}),
- vandq_u8(match2NlDotB, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128})
+ vandq_u8(match2NlDotA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
+ vandq_u8(match2NlDotB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128))
  ),
  vpaddq_u8(
- vandq_u8(match2NlDotC, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128}),
+ vandq_u8(match2NlDotC, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
  match2NlDotDMasked
  )
  );
@@ -308,27 +310,27 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon

  uint8x16_t vMaskEqA = vqtbl1q_u8(
  maskEqTemp,
- (uint8x16_t){0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1}
+ vmakeq_u8(0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1)
  );
  maskEqTemp = vextq_u8(maskEqTemp, maskEqTemp, 2);
  uint8x16_t vMaskEqB = vqtbl1q_u8(
  maskEqTemp,
- (uint8x16_t){0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1}
+ vmakeq_u8(0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1)
  );
  maskEqTemp = vextq_u8(maskEqTemp, maskEqTemp, 2);
  uint8x16_t vMaskEqC = vqtbl1q_u8(
  maskEqTemp,
- (uint8x16_t){0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1}
+ vmakeq_u8(0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1)
  );
  maskEqTemp = vextq_u8(maskEqTemp, maskEqTemp, 2);
  uint8x16_t vMaskEqD = vqtbl1q_u8(
  maskEqTemp,
- (uint8x16_t){0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1}
+ vmakeq_u8(0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1)
  );
- vMaskEqA = vtstq_u8(vMaskEqA, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
- vMaskEqB = vtstq_u8(vMaskEqB, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
- vMaskEqC = vtstq_u8(vMaskEqC, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
- vMaskEqD = vtstq_u8(vMaskEqD, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
+ vMaskEqA = vtstq_u8(vMaskEqA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+ vMaskEqB = vtstq_u8(vMaskEqB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+ vMaskEqC = vtstq_u8(vMaskEqC, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+ vMaskEqD = vtstq_u8(vMaskEqD, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));

  dataA = vsubq_u8(
  dataA,
@@ -384,7 +386,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
  )
  );
  }
- yencOffset[0] = (escFirst << 6) | 42;
+ yencOffset = vsetq_lane_u8((escFirst << 6) | 42, yencOffset, 0);

  // all that's left is to 'compress' the data (skip over masked chars)
  uint64_t counts = vget_lane_u64(vreinterpret_u64_u8(vcnt_u8(vget_low_u8(cmpCombined))), 0);
@@ -419,7 +421,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
  dataB = vsubq_u8(dataB, vdupq_n_u8(42));
  dataC = vsubq_u8(dataC, vdupq_n_u8(42));
  dataD = vsubq_u8(dataD, vdupq_n_u8(42));
- vst1q_u8_x4(p, ((uint8x16x4_t){dataA, dataB, dataC, dataD}));
+ vst1q_u8_x4(p, vcreate4_u8(dataA, dataB, dataC, dataD));
  p += sizeof(uint8x16_t)*4;
  escFirst = 0;
  yencOffset = vdupq_n_u8(42);
@@ -117,7 +117,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
  #else
  const bool _USING_FAST_MATCH = false;
  #endif
- #if defined(__SSE4_1__) && !defined(__tune_slm__) && !defined(__tune_goldmont__) && !defined(__tune_goldmont_plus__)
+ #if defined(__SSE4_1__) && !defined(__tune_slm__) && !defined(__tune_goldmont__) && !defined(__tune_goldmont_plus__) && !defined(__tune_tremont__)
  const bool _USING_BLEND_ADD = (use_isa >= ISA_LEVEL_SSE41);
  #else
  const bool _USING_BLEND_ADD = false;
@@ -368,7 +368,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
  if(LIKELIHOOD(0.001, matchEnd)) {
  // terminator found
  // there's probably faster ways to do this, but reverting to scalar code should be good enough
- len += i;
+ len += (long)i;
  break;
  }
  }
@@ -477,7 +477,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
  }

  if(endFound) {
- len += i;
+ len += (long)i;
  break;
  }
  }
@@ -558,7 +558,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
  );

  yencOffset = _mm_xor_si128(_mm_set1_epi8(-42),
- _mm_slli_epi16(_mm_cvtsi32_si128(escFirst), 6)
+ _mm_slli_epi16(_mm_cvtsi32_si128((int)escFirst), 6)
  );
  }
  } else {
@@ -608,7 +608,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
  )
  );
  yencOffset = _mm_xor_si128(_mm_set1_epi8(-42),
- _mm_slli_epi16(_mm_cvtsi32_si128(escFirst), 6)
+ _mm_slli_epi16(_mm_cvtsi32_si128((int)escFirst), 6)
  );
  } else
  #endif
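The NEON hunks above replace GNU-style vector literals such as (uint8x16_t){...} with calls to vmakeq_u8(), vmakeq_s8(), vmake_u8(), vcreate2_u8() and vcreate4_u8(). Those helpers are not defined anywhere in this diff; they presumably live in a shared header such as common.h. A minimal sketch of what such portable constructors can look like, assuming they simply wrap standard NEON intrinsics (illustrative only, not the package's actual definitions):

#include <arm_neon.h>
#include <stdint.h>

// Build a 16-byte vector from individual lanes by loading a temporary array,
// instead of relying on compiler-specific vector-literal syntax.
static inline uint8x16_t vmakeq_u8(
    uint8_t e0,  uint8_t e1,  uint8_t e2,  uint8_t e3,
    uint8_t e4,  uint8_t e5,  uint8_t e6,  uint8_t e7,
    uint8_t e8,  uint8_t e9,  uint8_t e10, uint8_t e11,
    uint8_t e12, uint8_t e13, uint8_t e14, uint8_t e15
) {
    const uint8_t tmp[16] = {e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15};
    return vld1q_u8(tmp);
}

// Build the x2 aggregate used by the paired-store helpers.
static inline uint8x16x2_t vcreate2_u8(uint8x16_t a, uint8x16_t b) {
    uint8x16x2_t r;
    r.val[0] = a;
    r.val[1] = b;
    return r;
}

With constant arguments, compilers generally fold the temporary array into a single constant load, so this form costs nothing over the literal it replaces.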
package/src/encoder.cc CHANGED
@@ -1,7 +1,8 @@
  #include "common.h"
  #include "encoder_common.h"
+ #include "encoder.h"

- size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, bool doEnd) {
+ size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, int doEnd) {
  unsigned char* es = (unsigned char*)src + len;
  unsigned char *p = dest; // destination pointer
  long i = -(long)len; // input position
@@ -119,7 +120,9 @@ size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HED
  }


- size_t (*_do_encode)(int, int*, const unsigned char* HEDLEY_RESTRICT, unsigned char* HEDLEY_RESTRICT, size_t, bool) = &do_encode_generic;
+ extern "C" {
+ size_t (*_do_encode)(int, int*, const unsigned char* HEDLEY_RESTRICT, unsigned char* HEDLEY_RESTRICT, size_t, int) = &do_encode_generic;
+ }

  void encoder_sse2_init();
  void encoder_ssse3_init();
package/src/encoder.h CHANGED
@@ -1,5 +1,21 @@
+ #ifndef __YENC_ENCODER_H
+ #define __YENC_ENCODER_H
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+
+
  #include "hedley.h"

- extern size_t (*_do_encode)(int, int*, const unsigned char* HEDLEY_RESTRICT, unsigned char* HEDLEY_RESTRICT, size_t, bool);
+ extern size_t (*_do_encode)(int, int*, const unsigned char* HEDLEY_RESTRICT, unsigned char* HEDLEY_RESTRICT, size_t, int);
  #define do_encode (*_do_encode)
  void encoder_init();
+
+
+
+ #ifdef __cplusplus
+ }
+ #endif
+ #endif
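encoder.h now wraps its declarations in an include guard and extern "C", and the doEnd flag becomes an int, so the encoder entry point is callable from plain C. A minimal usage sketch of that interface; the parameter meanings and the return value are inferred from do_encode_generic above and are assumptions, not documented guarantees:

#include "encoder.h"

/* Encode one buffer with 128-character yEnc lines. `col` carries the current
 * output column across calls; pass 0 at the start of an article. The final
 * argument is the doEnd flag (assumed: non-zero for the last chunk). The
 * return value is assumed to be the number of bytes written to dest. */
size_t encode_chunk(const unsigned char* src, size_t len, unsigned char* dest) {
    static int initialised = 0;
    if(!initialised) {
        encoder_init();   /* installs the best kernel for the running CPU */
        initialised = 1;
    }
    int col = 0;
    return do_encode(128, &col, src, dest, len, 1);
}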
@@ -112,7 +112,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
  // last char
  uint32_t eolChar = (use_isa >= ISA_LEVEL_VBMI2 ? lookupsVBMI2->eolLastChar[c] : lookupsAVX2->eolLastChar[c]);
  *(uint32_t*)p = eolChar;
- p += 3 + (eolChar>>27);
+ p += 3 + (uintptr_t)(eolChar>>27);
  col = -line_size+1;
  } else {
  // line overflowed, insert a newline
@@ -215,7 +215,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
  // duplicate halves
  data1A = _mm256_inserti128_si256(dataA, _mm256_castsi256_si128(dataA), 1);
  data1B = _mm256_inserti128_si256(dataB, _mm256_castsi256_si128(dataB), 1);
- #ifdef __tune_znver2__
+ #if defined(__tune_znver2__) || defined(__tune_znver3__)
  data2A = _mm256_permute2x128_si256(dataA, dataA, 0x11);
  data2B = _mm256_permute2x128_si256(dataB, dataB, 0x11);
  #else
@@ -254,7 +254,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
  // we overflowed - find correct position to revert back to
  // this is perhaps sub-optimal on 32-bit, but who still uses that with AVX2?
  uint64_t eqMask;
- int shiftAmt = maskBitsB + YMM_SIZE - col -1;
+ int shiftAmt = (int)(maskBitsB + YMM_SIZE -1 - col);
  if(HEDLEY_UNLIKELY(shiftAmt < 0)) {
  uint32_t eqMask1, eqMask2;
  #if defined(__AVX512VBMI2__) && defined(__AVX512VL__) && defined(__AVX512BW__)
@@ -320,7 +320,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
  #endif
  {
  i += bitCount;
- unsigned int revert = col + (eqMask & 1);
+ unsigned int revert = (unsigned int)(col + (eqMask & 1));
  p -= revert;
  i -= revert;
  }
@@ -429,7 +429,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
  _encode_eol_handle_pre:
  uint32_t eolChar = (use_isa >= ISA_LEVEL_VBMI2 ? lookupsVBMI2->eolLastChar[es[i]] : lookupsAVX2->eolLastChar[es[i]]);
  *(uint32_t*)p = eolChar;
- p += 3 + (eolChar>>27);
+ p += 3 + (uintptr_t)(eolChar>>27);
  col = lineSizeOffset;

  if(HEDLEY_UNLIKELY(i >= 0)) { // this isn't really a proper check - it's only needed to support short lines; basically, if the line is too short, `i` never gets checked, so we need one somewhere
@@ -556,7 +556,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui

  _mm256_zeroupper();

- *colOffset = col + line_size -1;
+ *colOffset = (int)(col + line_size -1);
  dest = p;
  len = -(i - INPUT_OFFSET);
  }
@@ -8,7 +8,7 @@
  #define _BX _B3(0), _B3(64), _B3(128), _B3(192)

  static const unsigned char escapeLUT[256] = { // whether or not the character is critical
- #define _B(n) ((n == 214 || n == 214+'\r' || n == 214+'\n' || n == '='-42) ? 0 : (n+42) & 0xff)
+ #define _B(n) ((n == 214 || n == '\r'+214 || n == '\n'+214 || n == '='-42) ? 0 : (n+42) & 0xff)
  _BX
  #undef _B
  };
@@ -24,10 +24,10 @@ static const uint16_t escapedLUT[256] = { // escaped sequences for characters th
  #undef _BX


- size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, bool doEnd);
+ size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, int doEnd);

  template<void(&kernel)(int, int*, const uint8_t* HEDLEY_RESTRICT, uint8_t* HEDLEY_RESTRICT&, size_t&)>
- static size_t do_encode_simd(int line_size, int* colOffset, const uint8_t* HEDLEY_RESTRICT src, uint8_t* HEDLEY_RESTRICT dest, size_t len, bool doEnd) {
+ static size_t do_encode_simd(int line_size, int* colOffset, const uint8_t* HEDLEY_RESTRICT src, uint8_t* HEDLEY_RESTRICT dest, size_t len, int doEnd) {
  if(len < 1) return 0;
  if(line_size < 12) { // short lines probably not worth processing in a SIMD way
  // we assume at least the first and last char exist in the line, and since the first char could be escaped, and SIMD encoder assumes at least one non-first/last char, assumption means that line size has to be >= 4
@@ -8,7 +8,7 @@
  #if defined(__aarch64__) && (defined(__clang__) || (defined(__GNUC__) && __GNUC__ >= 9))
  # define vst1q_u8_x2_unaligned vst1q_u8_x2
  #else
- HEDLEY_ALWAYS_INLINE void vst1q_u8_x2_unaligned(uint8_t* p, uint8x16x2_t data) {
+ static HEDLEY_ALWAYS_INLINE void vst1q_u8_x2_unaligned(uint8_t* p, uint8x16x2_t data) {
  vst1q_u8(p, data.val[0]);
  vst1q_u8(p+16, data.val[1]);
  }
@@ -26,16 +26,16 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
  #ifdef __aarch64__
  uint8x16_t cmpA = vreinterpretq_u8_s8(vqtbx2q_s8(
  vdupq_n_s8('='-42),
- (int8x16x2_t){'\0'-42,-128,-128,'\0'-42,'\t'-42,'\n'-42,'\r'-42,'\t'-42,'\n'-42,'\r'-42,-128,-128,'\0'-42,-128,-128,-128, ' '-42,'\n'-42,'\r'-42,' '-42,-128,-128,-128,-128,-128,-128,'.'-42,-128,-128,-128,'='-42,-128},
- vreinterpretq_u8_s8(vhaddq_s8(vreinterpretq_s8_u8(dataA), (int8x16_t){42,48,66,66, 66,66,66,66, 66,66,66,66, 66,66,66,66}))
+ vcreate2_s8(vmakeq_s8('\0'-42,-128,-128,'\0'-42,'\t'-42,'\n'-42,'\r'-42,'\t'-42,'\n'-42,'\r'-42,-128,-128,'\0'-42,-128,-128,-128), vmakeq_s8(' '-42,'\n'-42,'\r'-42,' '-42,-128,-128,-128,-128,-128,-128,'.'-42,-128,-128,-128,'='-42,-128)),
+ vreinterpretq_u8_s8(vhaddq_s8(vreinterpretq_s8_u8(dataA), vmakeq_s8(42,48,66,66, 66,66,66,66, 66,66,66,66, 66,66,66,66)))
  ));
  cmpA = vceqq_u8(cmpA, dataA);

  dataB = vaddq_u8(oDataB, vdupq_n_u8(42));
  uint8x16_t cmpB = vqtbx1q_u8(
  vceqq_u8(oDataB, vdupq_n_u8('='-42)),
- // \0 \n \r
- (uint8x16_t){255,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0},
+ // \0 \n \r
+ vmakeq_u8(255,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
  dataB
  );
  dataA = vaddq_u8(dataA, vbslq_u8(cmpA, vdupq_n_u8(64+42), vdupq_n_u8(42)));
@@ -64,9 +64,9 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES

  // dup low 2 bytes & compare
  uint8x8_t firstTwoChars = vreinterpret_u8_u16(vdup_lane_u16(vreinterpret_u16_u8(vget_low_u8(oDataA)), 0));
- uint8x8_t cmpNl = vceq_u8(firstTwoChars, vreinterpret_u8_s8((int8x8_t){
- ' '-42,' '-42,'\t'-42,'\t'-42,'\r'-42,'.'-42,'='-42,'='-42
- }));
+ uint8x8_t cmpNl = vceq_u8(firstTwoChars, vmake_u8(
+ ' '+214,' '+214,'\t'+214,'\t'+214,'\r'+214,'.'-42,'='-42,'='-42
+ ));
  // use padd to merge comparisons
  uint16x4_t cmpNl2 = vreinterpret_u16_u8(cmpNl);
  cmpNl2 = vpadd_u16(cmpNl2, vdup_n_u16(0));
@@ -80,8 +80,8 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
  #endif


- uint8x16_t cmpAMasked = vandq_u8(cmpA, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
- uint8x16_t cmpBMasked = vandq_u8(cmpB, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
+ uint8x16_t cmpAMasked = vandq_u8(cmpA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+ uint8x16_t cmpBMasked = vandq_u8(cmpB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
  #ifdef __aarch64__
  uint8x16_t cmpMerge = vpaddq_u8(cmpAMasked, cmpBMasked);
  cmpMerge = vpaddq_u8(cmpMerge, cmpMerge);
@@ -95,7 +95,7 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
  memcpy(p, &firstChar, sizeof(firstChar));
  p += 4;
  mask ^= 1;
- cmpMerge = vbicq_u8(cmpMerge, (uint8x16_t){1,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0});
+ cmpMerge = vbicq_u8(cmpMerge, vmakeq_u8(1,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0));
  } else {
  firstChar |= 0x0a0d00;
  memcpy(p, &firstChar, sizeof(firstChar));
@@ -130,7 +130,7 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
  memcpy(p, &firstChar, sizeof(firstChar));
  p += 4;
  mask ^= 1;
- cmpPacked = vbic_u8(cmpPacked, (uint8x8_t){1,0,0,0, 0,0,0,0});
+ cmpPacked = vbic_u8(cmpPacked, vmake_u8(1,0,0,0, 0,0,0,0));
  } else {
  firstChar |= 0x0a0d00;
  memcpy(p, &firstChar, sizeof(firstChar));
@@ -198,7 +198,7 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
  #ifdef __aarch64__
  # ifdef _MSC_VER
  long bitIndex;
- if(_BitScanReverse64(&bitIndex, mask))
+ if(_BitScanReverse64((unsigned long*)&bitIndex, mask))
  bitIndex ^= 63;
  else
  bitIndex = 64;
@@ -217,11 +217,11 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES

  uint8x16_t vClz = vdupq_n_u8(bitIndex & ~(sizeof(mask)*8));
  #ifdef __aarch64__
- uint8x16_t blendA = vcgtq_u8((uint8x16_t){63,62,61,60,51,50,49,48,47,46,45,44,35,34,33,32}, vClz);
- uint8x16_t blendB = vcgtq_u8((uint8x16_t){31,30,29,28,19,18,17,16,15,14,13,12, 3, 2, 1, 0}, vClz);
+ uint8x16_t blendA = vcgtq_u8(vmakeq_u8(63,62,61,60,51,50,49,48,47,46,45,44,35,34,33,32), vClz);
+ uint8x16_t blendB = vcgtq_u8(vmakeq_u8(31,30,29,28,19,18,17,16,15,14,13,12, 3, 2, 1, 0), vClz);
  #else
- uint8x16_t blendA = vcgtq_u8((uint8x16_t){31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16}, vClz);
- uint8x16_t blendB = vcgtq_u8((uint8x16_t){15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, vClz);
+ uint8x16_t blendA = vcgtq_u8(vmakeq_u8(31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16), vClz);
+ uint8x16_t blendB = vcgtq_u8(vmakeq_u8(15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), vClz);
  #endif
  uint8x16_t dataAShifted = vbslq_u8(cmpA, vdupq_n_u8('='), dataA);
  uint8x16_t dataBShifted = vbslq_u8(cmpB, vdupq_n_u8('='), dataB);
@@ -230,7 +230,7 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
  dataA = vbslq_u8(blendA, dataAShifted, dataA);
  dataB = vbslq_u8(blendB, dataBShifted, dataB);

- vst1q_u8_x2_unaligned(p, ((uint8x16x2_t){dataA, dataB}));
+ vst1q_u8_x2_unaligned(p, vcreate2_u8(dataA, dataB));
  p += sizeof(uint8x16_t)*2 - 1;
  p += (mask != 0);
  col = lineSizeOffset + (mask != 0);
@@ -296,14 +296,14 @@ HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const ui
  dataB = vaddq_u8(dataB, vdupq_n_u8(42));
  uint8x16_t cmpA = vqtbx1q_u8(
  cmpEqA,
- // \0 \n \r
- (uint8x16_t){255,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0},
+ // \0 \n \r
+ vmakeq_u8(255,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
  dataA
  );
  uint8x16_t cmpB = vqtbx1q_u8(
  cmpEqB,
- // \0 \n \r
- (uint8x16_t){255,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0},
+ // \0 \n \r
+ vmakeq_u8(255,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
  dataB
  );

@@ -338,8 +338,8 @@ HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const ui


  long bitIndex; // prevent compiler whining
- uint8x16_t cmpAMasked = vandq_u8(cmpA, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
- uint8x16_t cmpBMasked = vandq_u8(cmpB, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
+ uint8x16_t cmpAMasked = vandq_u8(cmpA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+ uint8x16_t cmpBMasked = vandq_u8(cmpB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
  #ifdef __aarch64__
  uint8x16_t cmpMerge = vpaddq_u8(cmpAMasked, cmpBMasked);
  cmpMerge = vpaddq_u8(cmpMerge, cmpMerge);
@@ -453,7 +453,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const ui
  #ifdef __aarch64__
  # ifdef _MSC_VER
  // does this work?
- if(_BitScanReverse64(&bitIndex, mask))
+ if(_BitScanReverse64((unsigned long*)&bitIndex, mask))
  bitIndex ^= 63;
  else
  bitIndex = 64;
@@ -472,11 +472,11 @@ HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const ui

  uint8x16_t vClz = vdupq_n_u8(bitIndex & ~(sizeof(mask)*8));
  #ifdef __aarch64__
- uint8x16_t blendA = vcgeq_u8((uint8x16_t){63,62,61,60,51,50,49,48,47,46,45,44,35,34,33,32}, vClz);
- uint8x16_t blendB = vcgeq_u8((uint8x16_t){31,30,29,28,19,18,17,16,15,14,13,12, 3, 2, 1, 0}, vClz);
+ uint8x16_t blendA = vcgeq_u8(vmakeq_u8(63,62,61,60,51,50,49,48,47,46,45,44,35,34,33,32), vClz);
+ uint8x16_t blendB = vcgeq_u8(vmakeq_u8(31,30,29,28,19,18,17,16,15,14,13,12, 3, 2, 1, 0), vClz);
  #else
- uint8x16_t blendA = vcgeq_u8((uint8x16_t){31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16}, vClz);
- uint8x16_t blendB = vcgeq_u8((uint8x16_t){15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, vClz);
+ uint8x16_t blendA = vcgeq_u8(vmakeq_u8(31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16), vClz);
+ uint8x16_t blendB = vcgeq_u8(vmakeq_u8(15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), vClz);
  #endif
  uint8x16_t dataAShifted = vextq_u8(dataA, dataA, 15);
  uint8x16_t dataBShifted = vextq_u8(dataA, dataB, 15);
@@ -485,7 +485,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const ui
  dataA = vbslq_u8(blendA, dataA, dataAShifted);
  outDataB = vbslq_u8(blendB, outDataB, dataBShifted);

- vst1q_u8_x2_unaligned(p, ((uint8x16x2_t){dataA, outDataB}));
+ vst1q_u8_x2_unaligned(p, vcreate2_u8(dataA, outDataB));
  p += sizeof(uint8x16_t)*2;
  // write last byte
  *p = vgetq_lane_u8(dataB, 15);
@@ -350,7 +350,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
  #if defined(__POPCNT__) && !defined(__tune_btver1__)
  if(use_isa & ISA_FEATURE_POPCNT) {
  shuf2Len = popcnt32(maskA) + 16;
- # if defined(__tune_znver2__) || defined(__tune_znver1__) || defined(__tune_btver2__)
+ # if defined(__tune_znver3__) || defined(__tune_znver2__) || defined(__tune_znver1__) || defined(__tune_btver2__)
  shuf1Len = popcnt32(m1) + 8;
  shuf3Len = popcnt32(m3) + shuf2Len + 8;
  # else
@@ -538,7 +538,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin

  dataA = _mm_shuffle_epi8(dataA, shufMaskA);

- # if defined(__SSE4_1__) && !defined(__tune_slm__) && !defined(__tune_goldmont__) && !defined(__tune_goldmont_plus__)
+ # if defined(__SSE4_1__) && !defined(__tune_slm__) && !defined(__tune_goldmont__) && !defined(__tune_goldmont_plus__) && !defined(__tune_tremont__)
  // unsure if worth on: Jaguar/Puma (3|2), Core2 (2|2)
  if(use_isa >= ISA_LEVEL_SSE41) {
  dataB = _mm_blendv_epi8(dataBShifted, dataB, mergeMaskB);
@@ -717,7 +717,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
  }
  } while(i < 0);

- *colOffset = col + line_size -1;
+ *colOffset = (int)(col + line_size -1);
  dest = p;
  len = -(i - INPUT_OFFSET);
  }
package/src/platform.cc CHANGED
@@ -2,16 +2,36 @@
  #ifdef PLATFORM_ARM
  # ifdef __ANDROID__
  # include <cpu-features.h>
- # elif defined(__linux__)
+ # elif defined(__linux__) || (defined(__FreeBSD__) && __FreeBSD__ >= 12)
  # include <sys/auxv.h>
  # include <asm/hwcap.h>
+ # elif (defined(__FreeBSD__) && __FreeBSD__ < 12)
+ # include <sys/sysctl.h>
+ # include <asm/hwcap.h>
+ # elif defined(_WIN32)
+ # define WIN32_LEAN_AND_MEAN
+ # define NOMINMAX
+ # include <Windows.h>
+ # elif defined(__APPLE__)
+ # include <sys/types.h>
+ # include <sys/sysctl.h>
  # endif
  bool cpu_supports_neon() {
  # if defined(AT_HWCAP)
- # ifdef __aarch64__
- return getauxval(AT_HWCAP) & HWCAP_ASIMD;
+ # ifdef __FreeBSD__
+ unsigned long supported;
+ elf_aux_info(AT_HWCAP, &supported, sizeof(supported));
+ # ifdef __aarch64__
+ return supported & HWCAP_ASIMD;
+ # else
+ return supported & HWCAP_NEON;
+ # endif
  # else
+ # ifdef __aarch64__
+ return getauxval(AT_HWCAP) & HWCAP_ASIMD;
+ # else
  return getauxval(AT_HWCAP) & HWCAP_NEON;
+ # endif
  # endif
  # elif defined(ANDROID_CPU_FAMILY_ARM)
  # ifdef __aarch64__
@@ -19,8 +39,16 @@ bool cpu_supports_neon() {
  # else
  return android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON;
  # endif
+ # elif defined(_WIN32)
+ return IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE);
+ # elif defined(__APPLE__)
+ int supported = 0;
+ size_t len = sizeof(supported);
+ if(sysctlbyname("hw.optional.neon", &supported, &len, NULL, 0))
+ return false;
+ return (bool)supported;
  # endif
- return true; // assume NEON support, if compiled as such, otherwise
+ return true; // assume NEON support, if compiled as such, otherwise (I think Windows and iOS require it)
  }
  #endif

@@ -87,8 +115,8 @@ int cpu_supports_isa() {
  // Jaguar/Puma performance unkown (slowish PSHUFB/PBLENDVB)

  if((flags[2] & 0x200) == 0x200) { // SSSE3
- if(family == 6 && (model == 0x5c || model == 0x5f || model == 0x7a))
- // Intel Goldmont/plus with slow PBLENDVB
+ if(family == 6 && (model == 0x5c || model == 0x5f || model == 0x7a || model == 0x9c))
+ // Intel Goldmont/plus / Tremont with slow PBLENDVB
  return ret | ISA_LEVEL_SSSE3;

  if(flags[2] & 0x80000) { // SSE4.1
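The cpu_supports_neon() additions above (FreeBSD elf_aux_info, Windows IsProcessorFeaturePresent, Apple sysctlbyname) and cpu_supports_isa() on x86 feed runtime kernel selection, alongside init routines such as encoder_sse2_init() / encoder_ssse3_init() declared in encoder.cc. A rough sketch of how such dispatch typically looks; this is not the package's actual implementation, the NEON init name is hypothetical, and the ISA comparison only mirrors the use_isa >= ISA_LEVEL_* checks seen elsewhere in this diff:

// Illustrative only - not the package's actual encoder_init().
void encoder_init() {
#ifdef PLATFORM_ARM
	if(cpu_supports_neon())
		encoder_neon_init();   // hypothetical name, by analogy with encoder_sse2_init()
#else
	int isa = cpu_supports_isa();
	if(isa >= ISA_LEVEL_SSSE3)
		encoder_ssse3_init();  // SSSE3 kernel
	else
		encoder_sse2_init();   // baseline SSE2 kernel
#endif
	// if nothing above fires, the generic scalar do_encode_generic stays installed
}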