yencode 1.1.1 → 1.1.2
This diff shows the changes between publicly released versions of the package, as they appear in the public registry. It is provided for informational purposes only.
- package/binding.gyp +6 -6
- package/crcutil-1.0/code/multiword_64_64_intrinsic_i386_mmx.cc +1 -1
- package/package.json +1 -1
- package/src/common.h +74 -12
- package/src/crc.cc +50 -24
- package/src/crc.h +20 -6
- package/src/crc_arm.cc +121 -23
- package/src/crc_common.h +3 -10
- package/src/{crc_folding.c → crc_folding.cc} +40 -74
- package/src/decoder.cc +6 -3
- package/src/decoder.h +16 -2
- package/src/decoder_avx2_base.h +12 -12
- package/src/decoder_common.h +2 -2
- package/src/decoder_neon.cc +34 -34
- package/src/decoder_neon64.cc +36 -34
- package/src/decoder_sse_base.h +5 -5
- package/src/encoder.cc +5 -2
- package/src/encoder.h +17 -1
- package/src/encoder_avx_base.h +6 -6
- package/src/encoder_common.h +3 -3
- package/src/encoder_neon.cc +30 -30
- package/src/encoder_sse_base.h +3 -3
- package/src/platform.cc +34 -6
- package/src/yencode.cc +33 -44
package/src/decoder_neon64.cc
CHANGED
@@ -1,5 +1,5 @@
 #include "common.h"
-#
+#if defined(__ARM_NEON) && defined(__aarch64__)
 
 #include "decoder_common.h"
 
@@ -11,8 +11,8 @@ static uint8_t eqFixLUT[256];
 
 
 
-#if !defined(__clang__)
-HEDLEY_ALWAYS_INLINE uint8x16x4_t vld1q_u8_x4(const uint8_t* p) {
+#if !defined(__clang__) && !defined(_MSC_VER) && (!defined(__aarch64__) || !HEDLEY_GCC_VERSION_CHECK(10,0,0))
+static HEDLEY_ALWAYS_INLINE uint8x16x4_t vld1q_u8_x4(const uint8_t* p) {
 	uint8x16x4_t ret;
 	ret.val[0] = vld1q_u8(p);
 	ret.val[1] = vld1q_u8(p+16);
@@ -20,7 +20,7 @@ HEDLEY_ALWAYS_INLINE uint8x16x4_t vld1q_u8_x4(const uint8_t* p) {
 	ret.val[3] = vld1q_u8(p+48);
 	return ret;
 }
-HEDLEY_ALWAYS_INLINE void vst1q_u8_x4(uint8_t* p, uint8x16x4_t data) {
+static HEDLEY_ALWAYS_INLINE void vst1q_u8_x4(uint8_t* p, uint8x16x4_t data) {
 	vst1q_u8(p, data.val[0]);
 	vst1q_u8(p+16, data.val[1]);
 	vst1q_u8(p+32, data.val[2]);
@@ -48,9 +48,11 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 	HEDLEY_ASSUME(escFirst == 0 || escFirst == 1);
 	HEDLEY_ASSUME(nextMask == 0 || nextMask == 1 || nextMask == 2);
 	uint8x16_t nextMaskMix = vdupq_n_u8(0);
-	if(nextMask)
-		nextMaskMix
-
+	if(nextMask == 1)
+		nextMaskMix = vsetq_lane_u8(1, nextMaskMix, 0);
+	if(nextMask == 2)
+		nextMaskMix = vsetq_lane_u8(2, nextMaskMix, 1);
+	uint8x16_t yencOffset = escFirst ? vmakeq_u8(42+64,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42) : vdupq_n_u8(42);
 	long i;
 	for(i = -len; i; i += sizeof(uint8x16_t)*4) {
 		uint8x16x4_t data = vld1q_u8_x4(src+i);
@@ -66,23 +68,23 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 		cmpEqD = vceqq_u8(dataD, vdupq_n_u8('=')),
 		cmpA = vqtbx1q_u8(
 			cmpEqA,
-			//
-			(
+			// \n \r
+			vmakeq_u8(0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
 			dataA
 		),
 		cmpB = vqtbx1q_u8(
 			cmpEqB,
-			(
+			vmakeq_u8(0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
 			dataB
 		),
 		cmpC = vqtbx1q_u8(
 			cmpEqC,
-			(
+			vmakeq_u8(0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
 			dataC
 		),
 		cmpD = vqtbx1q_u8(
 			cmpEqD,
-			(
+			vmakeq_u8(0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
 			dataD
 		);
 		if(isRaw) cmpA = vorrq_u8(cmpA, nextMaskMix);
@@ -93,22 +95,22 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 		)))) {
 			uint8x16_t cmpMerge = vpaddq_u8(
 				vpaddq_u8(
-					vandq_u8(cmpA, (
-					vandq_u8(cmpB, (
+					vandq_u8(cmpA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
+					vandq_u8(cmpB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128))
 				),
 				vpaddq_u8(
-					vandq_u8(cmpC, (
-					vandq_u8(cmpD, (
+					vandq_u8(cmpC, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
+					vandq_u8(cmpD, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128))
 				)
 			);
 			uint8x16_t cmpEqMerge = vpaddq_u8(
 				vpaddq_u8(
-					vandq_u8(cmpEqA, (
-					vandq_u8(cmpEqB, (
+					vandq_u8(cmpEqA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
+					vandq_u8(cmpEqB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128))
 				),
 				vpaddq_u8(
-					vandq_u8(cmpEqC, (
-					vandq_u8(cmpEqD, (
+					vandq_u8(cmpEqC, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
+					vandq_u8(cmpEqD, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128))
 				)
 			);
 
@@ -225,14 +227,14 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 					break;
 				}
 			}
-			uint8x16_t match2NlDotDMasked = vandq_u8(match2NlDotD, (
+			uint8x16_t match2NlDotDMasked = vandq_u8(match2NlDotD, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
 			uint8x16_t mergeKillDots = vpaddq_u8(
 				vpaddq_u8(
-					vandq_u8(match2NlDotA, (
-					vandq_u8(match2NlDotB, (
+					vandq_u8(match2NlDotA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
+					vandq_u8(match2NlDotB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128))
 				),
 				vpaddq_u8(
-					vandq_u8(match2NlDotC, (
+					vandq_u8(match2NlDotC, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
 					match2NlDotDMasked
 				)
 			);
@@ -308,27 +310,27 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 
 			uint8x16_t vMaskEqA = vqtbl1q_u8(
 				maskEqTemp,
-				(
+				vmakeq_u8(0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1)
 			);
 			maskEqTemp = vextq_u8(maskEqTemp, maskEqTemp, 2);
 			uint8x16_t vMaskEqB = vqtbl1q_u8(
 				maskEqTemp,
-				(
+				vmakeq_u8(0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1)
 			);
 			maskEqTemp = vextq_u8(maskEqTemp, maskEqTemp, 2);
 			uint8x16_t vMaskEqC = vqtbl1q_u8(
 				maskEqTemp,
-				(
+				vmakeq_u8(0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1)
 			);
 			maskEqTemp = vextq_u8(maskEqTemp, maskEqTemp, 2);
 			uint8x16_t vMaskEqD = vqtbl1q_u8(
 				maskEqTemp,
-				(
+				vmakeq_u8(0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1)
 			);
-			vMaskEqA = vtstq_u8(vMaskEqA, (
-			vMaskEqB = vtstq_u8(vMaskEqB, (
-			vMaskEqC = vtstq_u8(vMaskEqC, (
-			vMaskEqD = vtstq_u8(vMaskEqD, (
+			vMaskEqA = vtstq_u8(vMaskEqA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+			vMaskEqB = vtstq_u8(vMaskEqB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+			vMaskEqC = vtstq_u8(vMaskEqC, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+			vMaskEqD = vtstq_u8(vMaskEqD, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
 
 			dataA = vsubq_u8(
 				dataA,
@@ -384,7 +386,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 				)
 			);
 		}
-		yencOffset
+		yencOffset = vsetq_lane_u8((escFirst << 6) | 42, yencOffset, 0);
 
 		// all that's left is to 'compress' the data (skip over masked chars)
 		uint64_t counts = vget_lane_u64(vreinterpret_u64_u8(vcnt_u8(vget_low_u8(cmpCombined))), 0);
@@ -419,7 +421,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 		dataB = vsubq_u8(dataB, vdupq_n_u8(42));
 		dataC = vsubq_u8(dataC, vdupq_n_u8(42));
 		dataD = vsubq_u8(dataD, vdupq_n_u8(42));
-		vst1q_u8_x4(p, (
+		vst1q_u8_x4(p, vcreate4_u8(dataA, dataB, dataC, dataD));
 		p += sizeof(uint8x16_t)*4;
 		escFirst = 0;
 		yencOffset = vdupq_n_u8(42);
package/src/decoder_sse_base.h
CHANGED
@@ -117,7 +117,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 #else
 	const bool _USING_FAST_MATCH = false;
 #endif
-#if defined(__SSE4_1__) && !defined(__tune_slm__) && !defined(__tune_goldmont__) && !defined(__tune_goldmont_plus__)
+#if defined(__SSE4_1__) && !defined(__tune_slm__) && !defined(__tune_goldmont__) && !defined(__tune_goldmont_plus__) && !defined(__tune_tremont__)
 	const bool _USING_BLEND_ADD = (use_isa >= ISA_LEVEL_SSE41);
 #else
 	const bool _USING_BLEND_ADD = false;
@@ -368,7 +368,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 			if(LIKELIHOOD(0.001, matchEnd)) {
 				// terminator found
 				// there's probably faster ways to do this, but reverting to scalar code should be good enough
-				len += i;
+				len += (long)i;
 				break;
 			}
 		}
@@ -477,7 +477,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 			}
 
 			if(endFound) {
-				len += i;
+				len += (long)i;
 				break;
 			}
 		}
@@ -558,7 +558,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 				);
 
 				yencOffset = _mm_xor_si128(_mm_set1_epi8(-42),
-					_mm_slli_epi16(_mm_cvtsi32_si128(escFirst), 6)
+					_mm_slli_epi16(_mm_cvtsi32_si128((int)escFirst), 6)
 				);
 			}
 		} else {
@@ -608,7 +608,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 					)
 				);
 				yencOffset = _mm_xor_si128(_mm_set1_epi8(-42),
-					_mm_slli_epi16(_mm_cvtsi32_si128(escFirst), 6)
+					_mm_slli_epi16(_mm_cvtsi32_si128((int)escFirst), 6)
 				);
 			} else
 #endif
package/src/encoder.cc
CHANGED
@@ -1,7 +1,8 @@
 #include "common.h"
 #include "encoder_common.h"
+#include "encoder.h"
 
-size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len,
+size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, int doEnd) {
 	unsigned char* es = (unsigned char*)src + len;
 	unsigned char *p = dest; // destination pointer
 	long i = -(long)len; // input position
@@ -119,7 +120,9 @@ size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HED
 	}
 
 
-
+extern "C" {
+	size_t (*_do_encode)(int, int*, const unsigned char* HEDLEY_RESTRICT, unsigned char* HEDLEY_RESTRICT, size_t, int) = &do_encode_generic;
+}
 
 void encoder_sse2_init();
 void encoder_ssse3_init();
package/src/encoder.h
CHANGED
@@ -1,5 +1,21 @@
+#ifndef __YENC_ENCODER_H
+#define __YENC_ENCODER_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
 #include "hedley.h"
 
-extern size_t (*_do_encode)(int, int*, const unsigned char* HEDLEY_RESTRICT, unsigned char* HEDLEY_RESTRICT, size_t,
+extern size_t (*_do_encode)(int, int*, const unsigned char* HEDLEY_RESTRICT, unsigned char* HEDLEY_RESTRICT, size_t, int);
 #define do_encode (*_do_encode)
 void encoder_init();
+
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif
package/src/encoder_avx_base.h
CHANGED
@@ -112,7 +112,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
 			// last char
 			uint32_t eolChar = (use_isa >= ISA_LEVEL_VBMI2 ? lookupsVBMI2->eolLastChar[c] : lookupsAVX2->eolLastChar[c]);
 			*(uint32_t*)p = eolChar;
-			p += 3 + (eolChar>>27);
+			p += 3 + (uintptr_t)(eolChar>>27);
 			col = -line_size+1;
 		} else {
 			// line overflowed, insert a newline
@@ -215,7 +215,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
 		// duplicate halves
 		data1A = _mm256_inserti128_si256(dataA, _mm256_castsi256_si128(dataA), 1);
 		data1B = _mm256_inserti128_si256(dataB, _mm256_castsi256_si128(dataB), 1);
-#
+#if defined(__tune_znver2__) || defined(__tune_znver3__)
 		data2A = _mm256_permute2x128_si256(dataA, dataA, 0x11);
 		data2B = _mm256_permute2x128_si256(dataB, dataB, 0x11);
 #else
@@ -254,7 +254,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
 			// we overflowed - find correct position to revert back to
 			// this is perhaps sub-optimal on 32-bit, but who still uses that with AVX2?
 			uint64_t eqMask;
-			int shiftAmt = maskBitsB + YMM_SIZE - col
+			int shiftAmt = (int)(maskBitsB + YMM_SIZE -1 - col);
 			if(HEDLEY_UNLIKELY(shiftAmt < 0)) {
 				uint32_t eqMask1, eqMask2;
 #if defined(__AVX512VBMI2__) && defined(__AVX512VL__) && defined(__AVX512BW__)
@@ -320,7 +320,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
 #endif
 			{
 				i += bitCount;
-				unsigned int revert = col + (eqMask & 1);
+				unsigned int revert = (unsigned int)(col + (eqMask & 1));
 				p -= revert;
 				i -= revert;
 			}
@@ -429,7 +429,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
 		_encode_eol_handle_pre:
 		uint32_t eolChar = (use_isa >= ISA_LEVEL_VBMI2 ? lookupsVBMI2->eolLastChar[es[i]] : lookupsAVX2->eolLastChar[es[i]]);
 		*(uint32_t*)p = eolChar;
-		p += 3 + (eolChar>>27);
+		p += 3 + (uintptr_t)(eolChar>>27);
 		col = lineSizeOffset;
 
 		if(HEDLEY_UNLIKELY(i >= 0)) { // this isn't really a proper check - it's only needed to support short lines; basically, if the line is too short, `i` never gets checked, so we need one somewhere
@@ -556,7 +556,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
 
 	_mm256_zeroupper();
 
-	*colOffset = col + line_size -1;
+	*colOffset = (int)(col + line_size -1);
 	dest = p;
 	len = -(i - INPUT_OFFSET);
 }
package/src/encoder_common.h
CHANGED
@@ -8,7 +8,7 @@
 #define _BX _B3(0), _B3(64), _B3(128), _B3(192)
 
 static const unsigned char escapeLUT[256] = { // whether or not the character is critical
-#define _B(n) ((n == 214 || n ==
+#define _B(n) ((n == 214 || n == '\r'+214 || n == '\n'+214 || n == '='-42) ? 0 : (n+42) & 0xff)
 	_BX
 #undef _B
 };
@@ -24,10 +24,10 @@ static const uint16_t escapedLUT[256] = { // escaped sequences for characters th
 #undef _BX
 
 
-size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len,
+size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, int doEnd);
 
 template<void(&kernel)(int, int*, const uint8_t* HEDLEY_RESTRICT, uint8_t* HEDLEY_RESTRICT&, size_t&)>
-static size_t do_encode_simd(int line_size, int* colOffset, const uint8_t* HEDLEY_RESTRICT src, uint8_t* HEDLEY_RESTRICT dest, size_t len,
+static size_t do_encode_simd(int line_size, int* colOffset, const uint8_t* HEDLEY_RESTRICT src, uint8_t* HEDLEY_RESTRICT dest, size_t len, int doEnd) {
 	if(len < 1) return 0;
 	if(line_size < 12) { // short lines probably not worth processing in a SIMD way
 		// we assume at least the first and last char exist in the line, and since the first char could be escaped, and SIMD encoder assumes at least one non-first/last char, assumption means that line size has to be >= 4
package/src/encoder_neon.cc
CHANGED
@@ -8,7 +8,7 @@
 #if defined(__aarch64__) && (defined(__clang__) || (defined(__GNUC__) && __GNUC__ >= 9))
 # define vst1q_u8_x2_unaligned vst1q_u8_x2
 #else
-HEDLEY_ALWAYS_INLINE void vst1q_u8_x2_unaligned(uint8_t* p, uint8x16x2_t data) {
+static HEDLEY_ALWAYS_INLINE void vst1q_u8_x2_unaligned(uint8_t* p, uint8x16x2_t data) {
 	vst1q_u8(p, data.val[0]);
 	vst1q_u8(p+16, data.val[1]);
 }
@@ -26,16 +26,16 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
 #ifdef __aarch64__
 	uint8x16_t cmpA = vreinterpretq_u8_s8(vqtbx2q_s8(
 		vdupq_n_s8('='-42),
-		(
-		vreinterpretq_u8_s8(vhaddq_s8(vreinterpretq_s8_u8(dataA), (
+		vcreate2_s8(vmakeq_s8('\0'-42,-128,-128,'\0'-42,'\t'-42,'\n'-42,'\r'-42,'\t'-42,'\n'-42,'\r'-42,-128,-128,'\0'-42,-128,-128,-128), vmakeq_s8(' '-42,'\n'-42,'\r'-42,' '-42,-128,-128,-128,-128,-128,-128,'.'-42,-128,-128,-128,'='-42,-128)),
+		vreinterpretq_u8_s8(vhaddq_s8(vreinterpretq_s8_u8(dataA), vmakeq_s8(42,48,66,66, 66,66,66,66, 66,66,66,66, 66,66,66,66)))
 	));
 	cmpA = vceqq_u8(cmpA, dataA);
 
 	dataB = vaddq_u8(oDataB, vdupq_n_u8(42));
 	uint8x16_t cmpB = vqtbx1q_u8(
 		vceqq_u8(oDataB, vdupq_n_u8('='-42)),
-		//
-		(
+		// \0 \n \r
+		vmakeq_u8(255,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
 		dataB
 	);
 	dataA = vaddq_u8(dataA, vbslq_u8(cmpA, vdupq_n_u8(64+42), vdupq_n_u8(42)));
@@ -64,9 +64,9 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
 
 	// dup low 2 bytes & compare
 	uint8x8_t firstTwoChars = vreinterpret_u8_u16(vdup_lane_u16(vreinterpret_u16_u8(vget_low_u8(oDataA)), 0));
-	uint8x8_t cmpNl = vceq_u8(firstTwoChars,
-		' '
-
+	uint8x8_t cmpNl = vceq_u8(firstTwoChars, vmake_u8(
+		' '+214,' '+214,'\t'+214,'\t'+214,'\r'+214,'.'-42,'='-42,'='-42
+	));
 	// use padd to merge comparisons
 	uint16x4_t cmpNl2 = vreinterpret_u16_u8(cmpNl);
 	cmpNl2 = vpadd_u16(cmpNl2, vdup_n_u16(0));
@@ -80,8 +80,8 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
 #endif
 
 
-	uint8x16_t cmpAMasked = vandq_u8(cmpA, (
-	uint8x16_t cmpBMasked = vandq_u8(cmpB, (
+	uint8x16_t cmpAMasked = vandq_u8(cmpA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+	uint8x16_t cmpBMasked = vandq_u8(cmpB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
 #ifdef __aarch64__
 	uint8x16_t cmpMerge = vpaddq_u8(cmpAMasked, cmpBMasked);
 	cmpMerge = vpaddq_u8(cmpMerge, cmpMerge);
@@ -95,7 +95,7 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
 		memcpy(p, &firstChar, sizeof(firstChar));
 		p += 4;
 		mask ^= 1;
-		cmpMerge = vbicq_u8(cmpMerge, (
+		cmpMerge = vbicq_u8(cmpMerge, vmakeq_u8(1,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0));
 	} else {
 		firstChar |= 0x0a0d00;
 		memcpy(p, &firstChar, sizeof(firstChar));
@@ -130,7 +130,7 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
 		memcpy(p, &firstChar, sizeof(firstChar));
 		p += 4;
 		mask ^= 1;
-		cmpPacked = vbic_u8(cmpPacked, (
+		cmpPacked = vbic_u8(cmpPacked, vmake_u8(1,0,0,0, 0,0,0,0));
 	} else {
 		firstChar |= 0x0a0d00;
 		memcpy(p, &firstChar, sizeof(firstChar));
@@ -198,7 +198,7 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
 #ifdef __aarch64__
 # ifdef _MSC_VER
 	long bitIndex;
-	if(_BitScanReverse64(&bitIndex, mask))
+	if(_BitScanReverse64((unsigned long*)&bitIndex, mask))
 		bitIndex ^= 63;
 	else
 		bitIndex = 64;
@@ -217,11 +217,11 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
 
 	uint8x16_t vClz = vdupq_n_u8(bitIndex & ~(sizeof(mask)*8));
 #ifdef __aarch64__
-	uint8x16_t blendA = vcgtq_u8((
-	uint8x16_t blendB = vcgtq_u8((
+	uint8x16_t blendA = vcgtq_u8(vmakeq_u8(63,62,61,60,51,50,49,48,47,46,45,44,35,34,33,32), vClz);
+	uint8x16_t blendB = vcgtq_u8(vmakeq_u8(31,30,29,28,19,18,17,16,15,14,13,12, 3, 2, 1, 0), vClz);
 #else
-	uint8x16_t blendA = vcgtq_u8((
-	uint8x16_t blendB = vcgtq_u8((
+	uint8x16_t blendA = vcgtq_u8(vmakeq_u8(31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16), vClz);
+	uint8x16_t blendB = vcgtq_u8(vmakeq_u8(15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), vClz);
 #endif
 	uint8x16_t dataAShifted = vbslq_u8(cmpA, vdupq_n_u8('='), dataA);
 	uint8x16_t dataBShifted = vbslq_u8(cmpB, vdupq_n_u8('='), dataB);
@@ -230,7 +230,7 @@ static HEDLEY_ALWAYS_INLINE void encode_eol_handle_pre(const uint8_t* HEDLEY_RES
 	dataA = vbslq_u8(blendA, dataAShifted, dataA);
 	dataB = vbslq_u8(blendB, dataBShifted, dataB);
 
-	vst1q_u8_x2_unaligned(p, (
+	vst1q_u8_x2_unaligned(p, vcreate2_u8(dataA, dataB));
 	p += sizeof(uint8x16_t)*2 - 1;
 	p += (mask != 0);
 	col = lineSizeOffset + (mask != 0);
@@ -296,14 +296,14 @@ HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const ui
 	dataB = vaddq_u8(dataB, vdupq_n_u8(42));
 	uint8x16_t cmpA = vqtbx1q_u8(
 		cmpEqA,
-		//
-		(
+		// \0 \n \r
+		vmakeq_u8(255,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
 		dataA
 	);
 	uint8x16_t cmpB = vqtbx1q_u8(
 		cmpEqB,
-		//
-		(
+		// \0 \n \r
+		vmakeq_u8(255,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
 		dataB
 	);
 
@@ -338,8 +338,8 @@ HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const ui
 
 
 	long bitIndex; // prevent compiler whining
-	uint8x16_t cmpAMasked = vandq_u8(cmpA, (
-	uint8x16_t cmpBMasked = vandq_u8(cmpB, (
+	uint8x16_t cmpAMasked = vandq_u8(cmpA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+	uint8x16_t cmpBMasked = vandq_u8(cmpB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
 #ifdef __aarch64__
 	uint8x16_t cmpMerge = vpaddq_u8(cmpAMasked, cmpBMasked);
 	cmpMerge = vpaddq_u8(cmpMerge, cmpMerge);
@@ -453,7 +453,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const ui
 #ifdef __aarch64__
 # ifdef _MSC_VER
 	// does this work?
-	if(_BitScanReverse64(&bitIndex, mask))
+	if(_BitScanReverse64((unsigned long*)&bitIndex, mask))
 		bitIndex ^= 63;
 	else
 		bitIndex = 64;
@@ -472,11 +472,11 @@ HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const ui
 
 	uint8x16_t vClz = vdupq_n_u8(bitIndex & ~(sizeof(mask)*8));
 #ifdef __aarch64__
-	uint8x16_t blendA = vcgeq_u8((
-	uint8x16_t blendB = vcgeq_u8((
+	uint8x16_t blendA = vcgeq_u8(vmakeq_u8(63,62,61,60,51,50,49,48,47,46,45,44,35,34,33,32), vClz);
+	uint8x16_t blendB = vcgeq_u8(vmakeq_u8(31,30,29,28,19,18,17,16,15,14,13,12, 3, 2, 1, 0), vClz);
 #else
-	uint8x16_t blendA = vcgeq_u8((
-	uint8x16_t blendB = vcgeq_u8((
+	uint8x16_t blendA = vcgeq_u8(vmakeq_u8(31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16), vClz);
+	uint8x16_t blendB = vcgeq_u8(vmakeq_u8(15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), vClz);
 #endif
 	uint8x16_t dataAShifted = vextq_u8(dataA, dataA, 15);
 	uint8x16_t dataBShifted = vextq_u8(dataA, dataB, 15);
@@ -485,7 +485,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const ui
 	dataA = vbslq_u8(blendA, dataA, dataAShifted);
 	outDataB = vbslq_u8(blendB, outDataB, dataBShifted);
 
-	vst1q_u8_x2_unaligned(p, (
+	vst1q_u8_x2_unaligned(p, vcreate2_u8(dataA, outDataB));
 	p += sizeof(uint8x16_t)*2;
 	// write last byte
 	*p = vgetq_lane_u8(dataB, 15);
package/src/encoder_sse_base.h
CHANGED
@@ -350,7 +350,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
 #if defined(__POPCNT__) && !defined(__tune_btver1__)
 			if(use_isa & ISA_FEATURE_POPCNT) {
 				shuf2Len = popcnt32(maskA) + 16;
-# if defined(__tune_znver2__) || defined(__tune_znver1__) || defined(__tune_btver2__)
+# if defined(__tune_znver3__) || defined(__tune_znver2__) || defined(__tune_znver1__) || defined(__tune_btver2__)
 				shuf1Len = popcnt32(m1) + 8;
 				shuf3Len = popcnt32(m3) + shuf2Len + 8;
 # else
@@ -538,7 +538,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
 
 			dataA = _mm_shuffle_epi8(dataA, shufMaskA);
 
-# if defined(__SSE4_1__) && !defined(__tune_slm__) && !defined(__tune_goldmont__) && !defined(__tune_goldmont_plus__)
+# if defined(__SSE4_1__) && !defined(__tune_slm__) && !defined(__tune_goldmont__) && !defined(__tune_goldmont_plus__) && !defined(__tune_tremont__)
 			// unsure if worth on: Jaguar/Puma (3|2), Core2 (2|2)
 			if(use_isa >= ISA_LEVEL_SSE41) {
 				dataB = _mm_blendv_epi8(dataBShifted, dataB, mergeMaskB);
@@ -717,7 +717,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_sse(int line_size, int* colOffset, const uin
 		}
 	} while(i < 0);
 
-	*colOffset = col + line_size -1;
+	*colOffset = (int)(col + line_size -1);
 	dest = p;
 	len = -(i - INPUT_OFFSET);
 }
package/src/platform.cc
CHANGED
@@ -2,16 +2,36 @@
 #ifdef PLATFORM_ARM
 # ifdef __ANDROID__
 #  include <cpu-features.h>
-# elif defined(__linux__)
+# elif defined(__linux__) || (defined(__FreeBSD__) && __FreeBSD__ >= 12)
 #  include <sys/auxv.h>
 #  include <asm/hwcap.h>
+# elif (defined(__FreeBSD__) && __FreeBSD__ < 12)
+#  include <sys/sysctl.h>
+#  include <asm/hwcap.h>
+# elif defined(_WIN32)
+#  define WIN32_LEAN_AND_MEAN
+#  define NOMINMAX
+#  include <Windows.h>
+# elif defined(__APPLE__)
+#  include <sys/types.h>
+#  include <sys/sysctl.h>
 # endif
 bool cpu_supports_neon() {
 # if defined(AT_HWCAP)
-#  ifdef
-
+#  ifdef __FreeBSD__
+	unsigned long supported;
+	elf_aux_info(AT_HWCAP, &supported, sizeof(supported));
+#   ifdef __aarch64__
+	return supported & HWCAP_ASIMD;
+#   else
+	return supported & HWCAP_NEON;
+#   endif
 #  else
+#   ifdef __aarch64__
+	return getauxval(AT_HWCAP) & HWCAP_ASIMD;
+#   else
 	return getauxval(AT_HWCAP) & HWCAP_NEON;
+#   endif
 #  endif
 # elif defined(ANDROID_CPU_FAMILY_ARM)
 #  ifdef __aarch64__
@@ -19,8 +39,16 @@ bool cpu_supports_neon() {
 #  else
 	return android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON;
 #  endif
+# elif defined(_WIN32)
+	return IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE);
+# elif defined(__APPLE__)
+	int supported = 0;
+	size_t len = sizeof(supported);
+	if(sysctlbyname("hw.optional.neon", &supported, &len, NULL, 0))
+		return false;
+	return (bool)supported;
 # endif
-	return true; // assume NEON support, if compiled as such, otherwise
+	return true; // assume NEON support, if compiled as such, otherwise (I think Windows and iOS require it)
 }
 #endif
 
@@ -87,8 +115,8 @@ int cpu_supports_isa() {
 	// Jaguar/Puma performance unkown (slowish PSHUFB/PBLENDVB)
 
 	if((flags[2] & 0x200) == 0x200) { // SSSE3
-		if(family == 6 && (model == 0x5c || model == 0x5f || model == 0x7a))
-			// Intel Goldmont/plus with slow PBLENDVB
+		if(family == 6 && (model == 0x5c || model == 0x5f || model == 0x7a || model == 0x9c))
+			// Intel Goldmont/plus / Tremont with slow PBLENDVB
 			return ret | ISA_LEVEL_SSSE3;
 
 		if(flags[2] & 0x80000) { // SSE4.1