yencode 1.1.0 → 1.1.3

This diff shows the content of publicly released package versions as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
  #include "common.h"
- #ifdef __ARM_NEON
+ #if defined(__ARM_NEON) && defined(__aarch64__)

  #include "decoder_common.h"

@@ -10,9 +10,9 @@ static struct { char bytes[16]; } ALIGN_TO(16, compactLUT[32768]);
  static uint8_t eqFixLUT[256];


-
- #if !defined(__clang__)
- HEDLEY_ALWAYS_INLINE uint8x16x4_t vld1q_u8_x4(const uint8_t* p) {
+ // AArch64 GCC lacks these functions until 8.5, 9.4 and 10.1 (10.0 unknown)
+ #if !defined(__clang__) && !defined(_MSC_VER) && (!defined(__aarch64__) || !(HEDLEY_GCC_VERSION_CHECK(9,4,0) || (!HEDLEY_GCC_VERSION_CHECK(9,0,0) && HEDLEY_GCC_VERSION_CHECK(8,5,0))))
+ static HEDLEY_ALWAYS_INLINE uint8x16x4_t _vld1q_u8_x4(const uint8_t* p) {
  uint8x16x4_t ret;
  ret.val[0] = vld1q_u8(p);
  ret.val[1] = vld1q_u8(p+16);
@@ -20,12 +20,15 @@ HEDLEY_ALWAYS_INLINE uint8x16x4_t vld1q_u8_x4(const uint8_t* p) {
  ret.val[3] = vld1q_u8(p+48);
  return ret;
  }
- HEDLEY_ALWAYS_INLINE void vst1q_u8_x4(uint8_t* p, uint8x16x4_t data) {
+ static HEDLEY_ALWAYS_INLINE void _vst1q_u8_x4(uint8_t* p, uint8x16x4_t data) {
  vst1q_u8(p, data.val[0]);
  vst1q_u8(p+16, data.val[1]);
  vst1q_u8(p+32, data.val[2]);
  vst1q_u8(p+48, data.val[3]);
  }
+ #else
+ # define _vld1q_u8_x4 vld1q_u8_x4
+ # define _vst1q_u8_x4 vst1q_u8_x4
  #endif


@@ -48,12 +51,14 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
  HEDLEY_ASSUME(escFirst == 0 || escFirst == 1);
  HEDLEY_ASSUME(nextMask == 0 || nextMask == 1 || nextMask == 2);
  uint8x16_t nextMaskMix = vdupq_n_u8(0);
- if(nextMask)
- nextMaskMix[nextMask-1] = nextMask;
- uint8x16_t yencOffset = escFirst ? (uint8x16_t){42+64,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42} : vdupq_n_u8(42);
+ if(nextMask == 1)
+ nextMaskMix = vsetq_lane_u8(1, nextMaskMix, 0);
+ if(nextMask == 2)
+ nextMaskMix = vsetq_lane_u8(2, nextMaskMix, 1);
+ uint8x16_t yencOffset = escFirst ? vmakeq_u8(42+64,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42) : vdupq_n_u8(42);
  long i;
  for(i = -len; i; i += sizeof(uint8x16_t)*4) {
- uint8x16x4_t data = vld1q_u8_x4(src+i);
+ uint8x16x4_t data = _vld1q_u8_x4(src+i);
  uint8x16_t dataA = data.val[0];
  uint8x16_t dataB = data.val[1];
  uint8x16_t dataC = data.val[2];
@@ -66,23 +71,23 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
  cmpEqD = vceqq_u8(dataD, vdupq_n_u8('=')),
  cmpA = vqtbx1q_u8(
  cmpEqA,
- // \n \r
- (uint8x16_t){0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0},
+ // \n \r
+ vmakeq_u8(0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
  dataA
  ),
  cmpB = vqtbx1q_u8(
  cmpEqB,
- (uint8x16_t){0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0},
+ vmakeq_u8(0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
  dataB
  ),
  cmpC = vqtbx1q_u8(
  cmpEqC,
- (uint8x16_t){0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0},
+ vmakeq_u8(0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
  dataC
  ),
  cmpD = vqtbx1q_u8(
  cmpEqD,
- (uint8x16_t){0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0},
+ vmakeq_u8(0,0,0,0,0,0,0,0,0,0,255,0,0,255,0,0),
  dataD
  );
  if(isRaw) cmpA = vorrq_u8(cmpA, nextMaskMix);
@@ -93,22 +98,22 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
  )))) {
  uint8x16_t cmpMerge = vpaddq_u8(
  vpaddq_u8(
- vandq_u8(cmpA, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128}),
- vandq_u8(cmpB, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128})
+ vandq_u8(cmpA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
+ vandq_u8(cmpB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128))
  ),
  vpaddq_u8(
- vandq_u8(cmpC, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128}),
- vandq_u8(cmpD, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128})
+ vandq_u8(cmpC, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
+ vandq_u8(cmpD, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128))
  )
  );
  uint8x16_t cmpEqMerge = vpaddq_u8(
  vpaddq_u8(
- vandq_u8(cmpEqA, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128}),
- vandq_u8(cmpEqB, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128})
+ vandq_u8(cmpEqA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
+ vandq_u8(cmpEqB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128))
  ),
  vpaddq_u8(
- vandq_u8(cmpEqC, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128}),
- vandq_u8(cmpEqD, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128})
+ vandq_u8(cmpEqC, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
+ vandq_u8(cmpEqD, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128))
  )
  );

@@ -225,14 +230,14 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
  break;
  }
  }
- uint8x16_t match2NlDotDMasked = vandq_u8(match2NlDotD, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
+ uint8x16_t match2NlDotDMasked = vandq_u8(match2NlDotD, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
  uint8x16_t mergeKillDots = vpaddq_u8(
  vpaddq_u8(
- vandq_u8(match2NlDotA, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128}),
- vandq_u8(match2NlDotB, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128})
+ vandq_u8(match2NlDotA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
+ vandq_u8(match2NlDotB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128))
  ),
  vpaddq_u8(
- vandq_u8(match2NlDotC, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128}),
+ vandq_u8(match2NlDotC, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128)),
  match2NlDotDMasked
  )
  );
@@ -308,27 +313,27 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon

  uint8x16_t vMaskEqA = vqtbl1q_u8(
  maskEqTemp,
- (uint8x16_t){0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1}
+ vmakeq_u8(0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1)
  );
  maskEqTemp = vextq_u8(maskEqTemp, maskEqTemp, 2);
  uint8x16_t vMaskEqB = vqtbl1q_u8(
  maskEqTemp,
- (uint8x16_t){0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1}
+ vmakeq_u8(0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1)
  );
  maskEqTemp = vextq_u8(maskEqTemp, maskEqTemp, 2);
  uint8x16_t vMaskEqC = vqtbl1q_u8(
  maskEqTemp,
- (uint8x16_t){0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1}
+ vmakeq_u8(0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1)
  );
  maskEqTemp = vextq_u8(maskEqTemp, maskEqTemp, 2);
  uint8x16_t vMaskEqD = vqtbl1q_u8(
  maskEqTemp,
- (uint8x16_t){0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1}
+ vmakeq_u8(0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1)
  );
- vMaskEqA = vtstq_u8(vMaskEqA, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
- vMaskEqB = vtstq_u8(vMaskEqB, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
- vMaskEqC = vtstq_u8(vMaskEqC, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
- vMaskEqD = vtstq_u8(vMaskEqD, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
+ vMaskEqA = vtstq_u8(vMaskEqA, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+ vMaskEqB = vtstq_u8(vMaskEqB, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+ vMaskEqC = vtstq_u8(vMaskEqC, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));
+ vMaskEqD = vtstq_u8(vMaskEqD, vmakeq_u8(1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128));

  dataA = vsubq_u8(
  dataA,
@@ -384,7 +389,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
  )
  );
  }
- yencOffset[0] = (escFirst << 6) | 42;
+ yencOffset = vsetq_lane_u8((escFirst << 6) | 42, yencOffset, 0);

  // all that's left is to 'compress' the data (skip over masked chars)
  uint64_t counts = vget_lane_u64(vreinterpret_u64_u8(vcnt_u8(vget_low_u8(cmpCombined))), 0);
@@ -419,7 +424,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
  dataB = vsubq_u8(dataB, vdupq_n_u8(42));
  dataC = vsubq_u8(dataC, vdupq_n_u8(42));
  dataD = vsubq_u8(dataD, vdupq_n_u8(42));
- vst1q_u8_x4(p, ((uint8x16x4_t){dataA, dataB, dataC, dataD}));
+ _vst1q_u8_x4(p, vcreate4_u8(dataA, dataB, dataC, dataD));
  p += sizeof(uint8x16_t)*4;
  escFirst = 0;
  yencOffset = vdupq_n_u8(42);
@@ -8,7 +8,7 @@
  #endif

  // GCC (ver 6-10(dev)) fails to optimize pure C version of mask testing, but has this intrinsic; Clang >= 7 optimizes C version fine
- #if defined(__GNUC__) && __GNUC__ >= 7
+ #if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
  # define KORTEST16(a, b) !_kortestz_mask16_u8((a), (b))
  # define KAND16(a, b) _kand_mask16((a), (b))
  # define KOR16(a, b) _kor_mask16((a), (b))
@@ -112,15 +112,22 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
  -42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42-64
  ) : _mm_set1_epi8(-42);

- #if defined(__SSSE3__) && !defined(__tune_atom__) && !defined(__tune_slm__) && !defined(__tune_btver1__)
+ #if defined(__SSSE3__) && !defined(__tune_atom__) && !defined(__tune_slm__) && !defined(__tune_btver1__) && !defined(__tune_btver2__)
  const bool _USING_FAST_MATCH = (use_isa >= ISA_LEVEL_SSSE3);
  #else
  const bool _USING_FAST_MATCH = false;
  #endif
- #if defined(__SSE4_1__) && !defined(__tune_slm__) && !defined(__tune_goldmont__) && !defined(__tune_goldmont_plus__)
+ #if defined(__SSE4_1__) && !defined(__tune_slm__) && !defined(__tune_goldmont__) && !defined(__tune_goldmont_plus__) && !defined(__tune_tremont__)
  const bool _USING_BLEND_ADD = (use_isa >= ISA_LEVEL_SSE41);
  #else
  const bool _USING_BLEND_ADD = false;
+ #endif
+ #if defined(__AVX512VL__) && defined(__AVX512BW__)
+ # if defined(_MSC_VER) && !defined(PLATFORM_AMD64) && !defined(__clang__)
+ const bool useAVX3MaskCmp = false;
+ # else
+ const bool useAVX3MaskCmp = (use_isa >= ISA_LEVEL_AVX3);
+ # endif
  #endif

  __m128i lfCompare = _mm_set1_epi8('\n');
@@ -214,7 +221,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
  __mmask16 match2EqMaskA, match2EqMaskB;
  __mmask16 match0CrMaskA, match0CrMaskB;
  __mmask16 match2CrXDtMaskA, match2CrXDtMaskB;
- if(use_isa >= ISA_LEVEL_AVX3 && searchEnd) {
+ if(useAVX3MaskCmp && searchEnd) {
  match2EqMaskA = _mm_cmpeq_epi8_mask(_mm_set1_epi8('='), tmpData2A);
  match2EqMaskB = _mm_cmpeq_epi8_mask(_mm_set1_epi8('='), tmpData2B);
  } else
@@ -230,7 +237,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
  __m128i match2CrXDtA, match2CrXDtB;
  if(isRaw) {
  #if defined(__AVX512VL__) && defined(__AVX512BW__)
- if(use_isa >= ISA_LEVEL_AVX3) {
+ if(useAVX3MaskCmp) {
  match0CrMaskA = _mm_cmpeq_epi8_mask(oDataA, _mm_set1_epi8('\r'));
  match0CrMaskB = _mm_cmpeq_epi8_mask(oDataB, _mm_set1_epi8('\r'));
  match2CrXDtMaskA = _mm_mask_cmpeq_epi8_mask(match0CrMaskA, tmpData2A, _mm_set1_epi8('.'));
@@ -256,7 +263,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
  #if defined(__AVX512VL__) && defined(__AVX512BW__)
  __mmask16 match1NlMaskA, match1NlMaskB;
  __mmask16 match2NlDotMaskA, match2NlDotMaskB;
- if(use_isa >= ISA_LEVEL_AVX3) {
+ if(useAVX3MaskCmp) {
  match1NlMaskA = _mm_mask_cmpeq_epi8_mask(
  match0CrMaskA,
  _mm_set1_epi8('\n'),
@@ -299,7 +306,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long

  int matchEnd;
  #if defined(__AVX512VL__) && defined(__AVX512BW__)
- if(use_isa >= ISA_LEVEL_AVX3) {
+ if(useAVX3MaskCmp) {
  __mmask16 match3EqYMaskA = _mm_mask_cmpeq_epi8_mask(
  match2EqMaskA, _mm_set1_epi8('y'), tmpData3A
  );
@@ -368,12 +375,12 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
  if(LIKELIHOOD(0.001, matchEnd)) {
  // terminator found
  // there's probably faster ways to do this, but reverting to scalar code should be good enough
- len += i;
+ len += (long)i;
  break;
  }
  }
  #if defined(__AVX512VL__) && defined(__AVX512BW__)
- if(use_isa >= ISA_LEVEL_AVX3) {
+ if(useAVX3MaskCmp) {
  mask |= match2NlDotMaskA << 2;
  mask |= (match2NlDotMaskB << 18) & 0xffffffff;
  minMask = _mm_maskz_mov_epi8(~(match2NlDotMaskB>>14), _mm_set1_epi8('.'));
@@ -398,7 +405,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
  __m128i match3EqYA, match3EqYB;
  #if defined(__AVX512VL__) && defined(__AVX512BW__)
  __mmask16 match3EqYMaskA, match3EqYMaskB;
- if(use_isa >= ISA_LEVEL_AVX3) {
+ if(useAVX3MaskCmp) {
  match3EqYMaskA = _mm_mask_cmpeq_epi8_mask(
  match2EqMaskA,
  _mm_set1_epi8('y'),
@@ -434,7 +441,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
  bool endFound;

  #if defined(__AVX512VL__) && defined(__AVX512BW__)
- if(use_isa >= ISA_LEVEL_AVX3) {
+ if(useAVX3MaskCmp) {
  __mmask16 match3LfEqYMaskA = _mm_mask_cmpeq_epi8_mask(
  match3EqYMaskA,
  _mm_set1_epi8('\n'),
@@ -477,7 +484,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
  }

  if(endFound) {
- len += i;
+ len += (long)i;
  break;
  }
  }
@@ -558,7 +565,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
  );

  yencOffset = _mm_xor_si128(_mm_set1_epi8(-42),
- _mm_slli_epi16(_mm_cvtsi32_si128(escFirst), 6)
+ _mm_slli_epi16(_mm_cvtsi32_si128((int)escFirst), 6)
  );
  }
  } else {
@@ -608,7 +615,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
  )
  );
  yencOffset = _mm_xor_si128(_mm_set1_epi8(-42),
- _mm_slli_epi16(_mm_cvtsi32_si128(escFirst), 6)
+ _mm_slli_epi16(_mm_cvtsi32_si128((int)escFirst), 6)
  );
  } else
  #endif
@@ -0,0 +1,30 @@
+ #include "common.h"
+
+ #if defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
+ # include "decoder_common.h"
+ # ifndef YENC_DISABLE_AVX256
+ # include "decoder_avx2_base.h"
+ void decoder_set_vbmi2_funcs() {
+ ALIGN_ALLOC(lookups, sizeof(*lookups), 16);
+ // TODO: consider removing compact LUT
+ decoder_init_lut(lookups->eqFix, lookups->compact);
+ _do_decode = &do_decode_simd<false, false, sizeof(__m256i)*2, do_decode_avx2<false, false, ISA_LEVEL_VBMI2> >;
+ _do_decode_raw = &do_decode_simd<true, false, sizeof(__m256i)*2, do_decode_avx2<true, false, ISA_LEVEL_VBMI2> >;
+ _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m256i)*2, do_decode_avx2<true, true, ISA_LEVEL_VBMI2> >;
+ }
+ # else
+ # include "decoder_sse_base.h"
+ void decoder_set_vbmi2_funcs() {
+ decoder_sse_init();
+ decoder_init_lut(lookups->eqFix, lookups->compact);
+ _do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_VBMI2> >;
+ _do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_VBMI2> >;
+ _do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_VBMI2> >;
+ }
+ # endif
+ #else
+ void decoder_set_avx2_funcs();
+ void decoder_set_vbmi2_funcs() {
+ decoder_set_avx2_funcs();
+ }
+ #endif
package/src/encoder.cc CHANGED
@@ -1,7 +1,8 @@
  #include "common.h"
  #include "encoder_common.h"
+ #include "encoder.h"

- size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, bool doEnd) {
+ size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, int doEnd) {
  unsigned char* es = (unsigned char*)src + len;
  unsigned char *p = dest; // destination pointer
  long i = -(long)len; // input position
@@ -119,12 +120,15 @@ size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HED
  }


- size_t (*_do_encode)(int, int*, const unsigned char* HEDLEY_RESTRICT, unsigned char* HEDLEY_RESTRICT, size_t, bool) = &do_encode_generic;
+ extern "C" {
+ size_t (*_do_encode)(int, int*, const unsigned char* HEDLEY_RESTRICT, unsigned char* HEDLEY_RESTRICT, size_t, int) = &do_encode_generic;
+ }

  void encoder_sse2_init();
  void encoder_ssse3_init();
  void encoder_avx_init();
  void encoder_avx2_init();
+ void encoder_vbmi2_init();
  void encoder_neon_init();

  #if defined(PLATFORM_X86) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
@@ -150,7 +154,9 @@ void encoder_init() {
  encoder_native_init();
  # else
  int use_isa = cpu_supports_isa();
- if(use_isa >= ISA_LEVEL_AVX2)
+ if(use_isa >= ISA_LEVEL_VBMI2)
+ encoder_vbmi2_init();
+ else if(use_isa >= ISA_LEVEL_AVX2)
  encoder_avx2_init();
  else if(use_isa >= ISA_LEVEL_AVX)
  encoder_avx_init();
package/src/encoder.h CHANGED
@@ -1,5 +1,21 @@
+ #ifndef __YENC_ENCODER_H
+ #define __YENC_ENCODER_H
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+
+
  #include "hedley.h"

- extern size_t (*_do_encode)(int, int*, const unsigned char* HEDLEY_RESTRICT, unsigned char* HEDLEY_RESTRICT, size_t, bool);
+ extern size_t (*_do_encode)(int, int*, const unsigned char* HEDLEY_RESTRICT, unsigned char* HEDLEY_RESTRICT, size_t, int);
  #define do_encode (*_do_encode)
  void encoder_init();
+
+
+
+ #ifdef __cplusplus
+ }
+ #endif
+ #endif
@@ -6,7 +6,7 @@
  #include "encoder_common.h"
  #define YMM_SIZE 32

- #if defined(__GNUC__) && __GNUC__ >= 7
+ #if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
  # define KLOAD32(a, offs) _load_mask32((__mmask32*)(a) + (offs))
  #else
  # define KLOAD32(a, offs) (((uint32_t*)(a))[(offs)])
@@ -112,7 +112,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
  // last char
  uint32_t eolChar = (use_isa >= ISA_LEVEL_VBMI2 ? lookupsVBMI2->eolLastChar[c] : lookupsAVX2->eolLastChar[c]);
  *(uint32_t*)p = eolChar;
- p += 3 + (eolChar>>27);
+ p += 3 + (uintptr_t)(eolChar>>27);
  col = -line_size+1;
  } else {
  // line overflowed, insert a newline
@@ -215,7 +215,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
  // duplicate halves
  data1A = _mm256_inserti128_si256(dataA, _mm256_castsi256_si128(dataA), 1);
  data1B = _mm256_inserti128_si256(dataB, _mm256_castsi256_si128(dataB), 1);
- #ifdef __tune_znver2__
+ #if defined(__tune_znver2__) || defined(__tune_znver3__)
  data2A = _mm256_permute2x128_si256(dataA, dataA, 0x11);
  data2B = _mm256_permute2x128_si256(dataB, dataB, 0x11);
  #else
@@ -254,7 +254,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
  // we overflowed - find correct position to revert back to
  // this is perhaps sub-optimal on 32-bit, but who still uses that with AVX2?
  uint64_t eqMask;
- int shiftAmt = maskBitsB + YMM_SIZE - col -1;
+ int shiftAmt = (int)(maskBitsB + YMM_SIZE -1 - col);
  if(HEDLEY_UNLIKELY(shiftAmt < 0)) {
  uint32_t eqMask1, eqMask2;
  #if defined(__AVX512VBMI2__) && defined(__AVX512VL__) && defined(__AVX512BW__)
@@ -293,7 +293,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
  asm(
  "shrq $1, %[eqMask] \n"
  "shrq %%cl, %[eqMask] \n"
- "adcq %[col], %[p] \n"
+ "adcq %q[col], %q[p] \n"
  : [eqMask]"+r"(eqMask), [p]"+r"(p)
  : "c"(shiftAmt), [col]"r"(~col)
  );
@@ -320,7 +320,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
  #endif
  {
  i += bitCount;
- unsigned int revert = col + (eqMask & 1);
+ unsigned int revert = (unsigned int)(col + (eqMask & 1));
  p -= revert;
  i -= revert;
  }
@@ -429,7 +429,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui
  _encode_eol_handle_pre:
  uint32_t eolChar = (use_isa >= ISA_LEVEL_VBMI2 ? lookupsVBMI2->eolLastChar[es[i]] : lookupsAVX2->eolLastChar[es[i]]);
  *(uint32_t*)p = eolChar;
- p += 3 + (eolChar>>27);
+ p += 3 + (uintptr_t)(eolChar>>27);
  col = lineSizeOffset;

  if(HEDLEY_UNLIKELY(i >= 0)) { // this isn't really a proper check - it's only needed to support short lines; basically, if the line is too short, `i` never gets checked, so we need one somewhere
@@ -556,7 +556,7 @@ HEDLEY_ALWAYS_INLINE void do_encode_avx2(int line_size, int* colOffset, const ui

  _mm256_zeroupper();

- *colOffset = col + line_size -1;
+ *colOffset = (int)(col + line_size -1);
  dest = p;
  len = -(i - INPUT_OFFSET);
  }
@@ -8,7 +8,7 @@
  #define _BX _B3(0), _B3(64), _B3(128), _B3(192)

  static const unsigned char escapeLUT[256] = { // whether or not the character is critical
- #define _B(n) ((n == 214 || n == 214+'\r' || n == 214+'\n' || n == '='-42) ? 0 : (n+42) & 0xff)
+ #define _B(n) ((n == 214 || n == '\r'+214 || n == '\n'+214 || n == '='-42) ? 0 : (n+42) & 0xff)
  _BX
  #undef _B
  };
@@ -24,10 +24,10 @@ static const uint16_t escapedLUT[256] = { // escaped sequences for characters th
  #undef _BX


- size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, bool doEnd);
+ size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, int doEnd);

  template<void(&kernel)(int, int*, const uint8_t* HEDLEY_RESTRICT, uint8_t* HEDLEY_RESTRICT&, size_t&)>
- static size_t do_encode_simd(int line_size, int* colOffset, const uint8_t* HEDLEY_RESTRICT src, uint8_t* HEDLEY_RESTRICT dest, size_t len, bool doEnd) {
+ static size_t do_encode_simd(int line_size, int* colOffset, const uint8_t* HEDLEY_RESTRICT src, uint8_t* HEDLEY_RESTRICT dest, size_t len, int doEnd) {
  if(len < 1) return 0;
  if(line_size < 12) { // short lines probably not worth processing in a SIMD way
  // we assume at least the first and last char exist in the line, and since the first char could be escaped, and SIMD encoder assumes at least one non-first/last char, assumption means that line size has to be >= 4