npm - yencode - Versions diffs - 1.1.3 → 1.1.5 - Mend

yencode 1.1.3 → 1.1.5

This diff compares the contents of two publicly available package versions, each released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (42)

package/src/crc_folding.cc CHANGED Viewed

@@ -365,12 +365,11 @@ static uint32_t do_crc32_incremental_clmul(const void* data, size_t length, uint
 	return crc_fold((const unsigned char*)data, (long)length, init);
 }
-void crc_clmul_set_funcs(crc_func* _do_crc32_incremental) {
-	*_do_crc32_incremental = &do_crc32_incremental_clmul;
+void crc_clmul_set_funcs() {
+	_do_crc32_incremental = &do_crc32_incremental_clmul;
+	_crc32_isa = ISA_LEVEL_PCLMUL;
 }
 #else
-void crc_clmul_set_funcs(crc_func* _do_crc32_incremental) {
-    (void)_do_crc32_incremental;
-}
+void crc_clmul_set_funcs() {}
 #endif

package/src/crc_folding_256.cc CHANGED Viewed

@@ -26,10 +26,9 @@ static __m256i do_one_fold(__m256i src, __m256i data) {
 	  0x96
 	);
 #else
-	return _mm256_xor_si256(data, _mm256_xor_si256(
-	  _mm256_clmulepi64_epi128(src, fold4, 0x01),
-	  _mm256_clmulepi64_epi128(src, fold4, 0x10)
-	));
+	return _mm256_xor_si256(_mm256_xor_si256(
+	  data, _mm256_clmulepi64_epi128(src, fold4, 0x01)
+	), _mm256_clmulepi64_epi128(src, fold4, 0x10));
 #endif
 }
@@ -38,7 +37,7 @@ ALIGN_TO(32, static const uint8_t  pshufb_rot_table[]) = {
 	16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
 };
 // _mm256_castsi128_si256, but upper is defined to be 0
-#if (defined(__clang__) && __clang_major__ >= 5 && (!defined(__APPLE__) || __clang_major__ >= 7)) || (defined(__GNUC__) && __GNUC__ >= 10)
+#if (defined(__clang__) && __clang_major__ >= 5 && (!defined(__APPLE__) || __clang_major__ >= 7)) || (defined(__GNUC__) && __GNUC__ >= 10) || (defined(_MSC_VER) && _MSC_VER >= 1910)
 // intrinsic unsupported in GCC 9 and MSVC < 2017
 # define zext128_256 _mm256_zextsi128_si256
 #else
@@ -218,13 +217,14 @@ static uint32_t do_crc32_incremental_clmul(const void* data, size_t length, uint
 	return crc_fold((const unsigned char*)data, (long)length, init);
 }
-void crc_clmul256_set_funcs(crc_func* _do_crc32_incremental) {
-	*_do_crc32_incremental = &do_crc32_incremental_clmul;
+void crc_clmul256_set_funcs() {
+	_do_crc32_incremental = &do_crc32_incremental_clmul;
+	_crc32_isa = ISA_LEVEL_VPCLMUL;
 }
 #else
-void crc_clmul_set_funcs(crc_func* _do_crc32_incremental);
-void crc_clmul256_set_funcs(crc_func* _do_crc32_incremental) {
-	crc_clmul_set_funcs(_do_crc32_incremental);
+void crc_clmul_set_funcs();
+void crc_clmul256_set_funcs() {
+	crc_clmul_set_funcs();
 }
 #endif

package/src/decoder.cc CHANGED Viewed

@@ -4,9 +4,11 @@
 #include "decoder.h"
 extern "C" {
-	YencDecoderEnd (*_do_decode)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*) = &do_decode_scalar<false, false>;
-	YencDecoderEnd (*_do_decode_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*) = &do_decode_scalar<true, false>;
-	YencDecoderEnd (*_do_decode_end_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*) = &do_decode_end_scalar<true>;
+	YencDecoderEnd (*_do_decode)(const unsigned char**, unsigned char**, size_t, YencDecoderState*) = &do_decode_scalar<false, false>;
+	YencDecoderEnd (*_do_decode_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*) = &do_decode_scalar<true, false>;
+	YencDecoderEnd (*_do_decode_end_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*) = &do_decode_end_scalar<true>;
+	int _decode_isa = ISA_GENERIC;
 }
 void decoder_set_sse2_funcs();
@@ -14,6 +16,7 @@ void decoder_set_ssse3_funcs();
 void decoder_set_avx_funcs();
 void decoder_set_avx2_funcs();
 void decoder_set_vbmi2_funcs();
+extern const bool decoder_has_avx10;
 void decoder_set_neon_funcs();
@@ -26,6 +29,7 @@ static inline void decoder_set_native_funcs() {
 	_do_decode = &do_decode_simd<false, false, sizeof(__m256i)*2, do_decode_avx2<false, false, ISA_NATIVE> >;
 	_do_decode_raw = &do_decode_simd<true, false, sizeof(__m256i)*2, do_decode_avx2<true, false, ISA_NATIVE> >;
 	_do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m256i)*2, do_decode_avx2<true, true, ISA_NATIVE> >;
+	_decode_isa = ISA_NATIVE;
 }
 # else
 #  include "decoder_sse_base.h"
@@ -35,6 +39,7 @@ static inline void decoder_set_native_funcs() {
 	_do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_NATIVE> >;
 	_do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_NATIVE> >;
 	_do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_NATIVE> >;
+	_decode_isa = ISA_NATIVE;
 }
 # endif
 #endif
@@ -45,7 +50,7 @@ void decoder_init() {
 	decoder_set_native_funcs();
 # else
 	int use_isa = cpu_supports_isa();
-	if(use_isa >= ISA_LEVEL_VBMI2)
+	if(use_isa >= ISA_LEVEL_VBMI2 && (decoder_has_avx10 || (use_isa & ISA_FEATURE_EVEX512)))
 		decoder_set_vbmi2_funcs();
 	else if(use_isa >= ISA_LEVEL_AVX2)
 		decoder_set_avx2_funcs();

package/src/decoder.h CHANGED Viewed

@@ -29,22 +29,26 @@ typedef enum {
 #include "hedley.h"
-extern YencDecoderEnd (*_do_decode)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*);
-extern YencDecoderEnd (*_do_decode_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*);
-extern YencDecoderEnd (*_do_decode_end_raw)(const unsigned char*HEDLEY_RESTRICT*, unsigned char*HEDLEY_RESTRICT*, size_t, YencDecoderState*);
+extern YencDecoderEnd (*_do_decode)(const unsigned char**, unsigned char**, size_t, YencDecoderState*);
+extern YencDecoderEnd (*_do_decode_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*);
+extern YencDecoderEnd (*_do_decode_end_raw)(const unsigned char**, unsigned char**, size_t, YencDecoderState*);
+extern int _decode_isa;
-static inline size_t do_decode(int isRaw, const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, YencDecoderState* state) {
+static inline size_t do_decode(int isRaw, const unsigned char* src, unsigned char* dest, size_t len, YencDecoderState* state) {
 	unsigned char* ds = dest;
 	(*(isRaw ? _do_decode_raw : _do_decode))(&src, &ds, len, state);
 	return ds - dest;
 }
-static inline YencDecoderEnd do_decode_end(const unsigned char*HEDLEY_RESTRICT* src, unsigned char*HEDLEY_RESTRICT* dest, size_t len, YencDecoderState* state) {
+static inline YencDecoderEnd do_decode_end(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
 	return _do_decode_end_raw(src, dest, len, state);
 }
 void decoder_init();
+static inline int decode_isa_level() {
+	return _decode_isa;
+}
 #ifdef __cplusplus

package/src/decoder_avx.cc CHANGED Viewed

@@ -9,6 +9,7 @@ void decoder_set_avx_funcs() {
 	_do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_SSE4_POPCNT> >;
 	_do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_SSE4_POPCNT> >;
 	_do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_SSE4_POPCNT> >;
+	_decode_isa = ISA_LEVEL_AVX;
 }
 #else
 void decoder_set_ssse3_funcs();

package/src/decoder_avx2.cc CHANGED Viewed

@@ -9,6 +9,7 @@ void decoder_set_avx2_funcs() {
 	_do_decode = &do_decode_simd<false, false, sizeof(__m256i)*2, do_decode_avx2<false, false, ISA_LEVEL_AVX2> >;
 	_do_decode_raw = &do_decode_simd<true, false, sizeof(__m256i)*2, do_decode_avx2<true, false, ISA_LEVEL_AVX2> >;
 	_do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m256i)*2, do_decode_avx2<true, true, ISA_LEVEL_AVX2> >;
+	_decode_isa = ISA_LEVEL_AVX2;
 }
 #else
 void decoder_set_avx_funcs();

package/src/decoder_avx2_base.h CHANGED Viewed

@@ -30,7 +30,7 @@ static HEDLEY_ALWAYS_INLINE __m256i force_align_read_256(const void* p) {
 }
 // _mm256_castsi128_si256, but upper is defined to be 0
-#if (defined(__clang__) && __clang_major__ >= 5 && (!defined(__APPLE__) || __clang_major__ >= 7)) || (defined(__GNUC__) && __GNUC__ >= 10)
+#if (defined(__clang__) && __clang_major__ >= 5 && (!defined(__APPLE__) || __clang_major__ >= 7)) || (defined(__GNUC__) && __GNUC__ >= 10) || (defined(_MSC_VER) && _MSC_VER >= 1910)
 // intrinsic unsupported in GCC 9 and MSVC < 2017
 # define zext128_256 _mm256_zextsi128_si256
 #else
@@ -43,9 +43,15 @@ static HEDLEY_ALWAYS_INLINE __m256i force_align_read_256(const void* p) {
 # endif
 #endif
+#if defined(__tune_icelake_client__) || defined(__tune_icelake_server__) || defined(__tune_tigerlake__) || defined(__tune_rocketlake__) || defined(__tune_alderlake__) || defined(__tune_sapphirerapids__)
+# define COMPRESS_STORE _mm256_mask_compressstoreu_epi8
+#else
+// avoid uCode on Zen4
+# define COMPRESS_STORE(dst, mask, vec) _mm256_storeu_si256((__m256i*)(dst), _mm256_maskz_compress_epi8(mask, vec))
+#endif
 template<bool isRaw, bool searchEnd, enum YEncDecIsaLevel use_isa>
-HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, long& len, unsigned char* HEDLEY_RESTRICT & p, unsigned char& _escFirst, uint16_t& _nextMask) {
+HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* src, long& len, unsigned char*& p, unsigned char& _escFirst, uint16_t& _nextMask) {
 	HEDLEY_ASSUME(_escFirst == 0 || _escFirst == 1);
 	HEDLEY_ASSUME(_nextMask == 0 || _nextMask == 1 || _nextMask == 2);
 	uintptr_t escFirst = _escFirst;
@@ -61,6 +67,8 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 		);
 	}
+	decoder_set_nextMask<isRaw>(src, len, _nextMask); // set this before the loop because we can't check src after it's been overwritten
 	// for some reason, MSVC Win32 seems to crash when trying to compile _mm256_mask_cmpeq_epi8_mask
 	// the crash can be fixed by switching the order of the last two arguments, but it seems to generate wrong code
 	// so just disable the optimisation as it seems to be problematic there
@@ -314,6 +322,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 							// terminator found
 							// there's probably faster ways to do this, but reverting to scalar code should be good enough
 							len += (long)i;
+							_nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
 							break;
 						}
 					}
@@ -406,6 +415,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 						}
 						if(endFound) {
 							len += (long)i;
+							_nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
 							break;
 						}
 					}
@@ -541,9 +551,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 			// all that's left is to 'compress' the data (skip over masked chars)
 #if defined(__AVX512VBMI2__) && defined(__AVX512VL__)
 			if(use_isa >= ISA_LEVEL_VBMI2) {
-				_mm256_mask_compressstoreu_epi8(p, KNOT32(mask), dataA);
+				COMPRESS_STORE(p, KNOT32(mask), dataA);
 				p -= popcnt32(mask & 0xffffffff);
-				_mm256_mask_compressstoreu_epi8((p + XMM_SIZE*2), KNOT32(mask>>32), dataB);
+				COMPRESS_STORE((p + XMM_SIZE*2), KNOT32(mask>>32), dataB);
 				p += XMM_SIZE*4 - popcnt32(mask >> 32);
 			} else
 #endif
@@ -607,20 +617,6 @@ HEDLEY_ALWAYS_INLINE void do_decode_avx2(const uint8_t* HEDLEY_RESTRICT src, lon
 		}
 	}
 	_escFirst = (unsigned char)escFirst;
-	if(isRaw) {
-		// this would be the trivial solution, but requires the compiler holding onto minMask throughout the loop:
-		//_nextMask = ~(uint16_t)_mm256_movemask_epi8(_mm256_cmpeq_epi8(minMask, _mm256_set1_epi8('.')));
-		// instead, just scan the memory to determine what to set nextMask to
-		if(len != 0) { // have to gone through at least one loop cycle
-			if(src[i-2] == '\r' && src[i-1] == '\n' && src[i] == '.')
-				_nextMask = 1;
-			else if(src[i-1] == '\r' && src[i] == '\n' && src[i+1] == '.')
-				_nextMask = 2;
-			else
-				_nextMask = 0;
-		}
-	} else
-		_nextMask = 0;
 	_mm256_zeroupper();
 }
 #endif

package/src/decoder_common.h CHANGED Viewed

@@ -6,7 +6,7 @@
 // state var: refers to the previous state - only used for incremental processing
 template<bool isRaw>
-size_t do_decode_noend_scalar(const unsigned char* HEDLEY_RESTRICT src, unsigned char* HEDLEY_RESTRICT dest, size_t len, YencDecoderState* state) {
+size_t do_decode_noend_scalar(const unsigned char* src, unsigned char* dest, size_t len, YencDecoderState* state) {
 	const unsigned char *es = src + len; // end source pointer
 	unsigned char *p = dest; // destination pointer
 	long i = -(long)len; // input position
@@ -140,7 +140,7 @@ size_t do_decode_noend_scalar(const unsigned char* HEDLEY_RESTRICT src, unsigned
 }
 template<bool isRaw>
-YencDecoderEnd do_decode_end_scalar(const unsigned char* HEDLEY_RESTRICT* src, unsigned char* HEDLEY_RESTRICT* dest, size_t len, YencDecoderState* state) {
+YencDecoderEnd do_decode_end_scalar(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
 	const unsigned char *es = (*src) + len; // end source pointer
 	unsigned char *p = *dest; // destination pointer
 	long i = -(long)len; // input position
@@ -321,7 +321,7 @@ YencDecoderEnd do_decode_end_scalar(const unsigned char* HEDLEY_RESTRICT* src, u
 }
 template<bool isRaw, bool searchEnd>
-YencDecoderEnd do_decode_scalar(const unsigned char* HEDLEY_RESTRICT* src, unsigned char* HEDLEY_RESTRICT* dest, size_t len, YencDecoderState* state) {
+YencDecoderEnd do_decode_scalar(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
 	if(searchEnd)
 		return do_decode_end_scalar<isRaw>(src, dest, len, state);
 	*dest += do_decode_noend_scalar<isRaw>(*src, *dest, len, state);
@@ -331,8 +331,8 @@ YencDecoderEnd do_decode_scalar(const unsigned char* HEDLEY_RESTRICT* src, unsig
-template<bool isRaw, bool searchEnd, int width, void(&kernel)(const uint8_t* HEDLEY_RESTRICT, long&, unsigned char* HEDLEY_RESTRICT &, unsigned char&, uint16_t&)>
-YencDecoderEnd do_decode_simd(const unsigned char* HEDLEY_RESTRICT* src, unsigned char* HEDLEY_RESTRICT* dest, size_t len, YencDecoderState* state) {
+template<bool isRaw, bool searchEnd, int width, void(&kernel)(const uint8_t*, long&, unsigned char*&, unsigned char&, uint16_t&)>
+YencDecoderEnd do_decode_simd(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
 	if(len <= width*2) return do_decode_scalar<isRaw, searchEnd>(src, dest, len, state);
 	YencDecoderState tState = YDEC_STATE_CRLF;
@@ -509,4 +509,29 @@ static inline void decoder_init_lut(uint8_t* eqFixLUT, void* compactLUT) {
 	}
 	#endif
 }
+template<bool isRaw>
+static inline void decoder_set_nextMask(const uint8_t* src, size_t len, uint16_t& nextMask) {
+	if(isRaw) {
+		if(len != 0) { // have to gone through at least one loop cycle
+			if(src[-2] == '\r' && src[-1] == '\n' && src[0] == '.')
+				nextMask = 1;
+			else if(src[-1] == '\r' && src[0] == '\n' && src[1] == '.')
+				nextMask = 2;
+			else
+				nextMask = 0;
+		}
+	} else
+		nextMask = 0;
+}
+// without backtracking
+template<bool isRaw>
+static inline uint16_t decoder_set_nextMask(const uint8_t* src, unsigned mask) {
+	if(isRaw) {
+		if(src[0] == '.')
+			return mask & 1;
+		if(src[1] == '.')
+			return mask & 2;
+	}
+	return 0;
+}

package/src/decoder_neon.cc CHANGED Viewed

@@ -59,7 +59,7 @@ static bool neon_vect_is_nonzero(uint8x16_t v) {
 template<bool isRaw, bool searchEnd>
-HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, long& len, unsigned char* HEDLEY_RESTRICT & p, unsigned char& escFirst, uint16_t& nextMask) {
+HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned char*& p, unsigned char& escFirst, uint16_t& nextMask) {
 	HEDLEY_ASSUME(escFirst == 0 || escFirst == 1);
 	HEDLEY_ASSUME(nextMask == 0 || nextMask == 1 || nextMask == 2);
 	uint8x16_t yencOffset = escFirst ? vmakeq_u8(42+64,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42) : vdupq_n_u8(42);
@@ -78,6 +78,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 			lfCompare = vsetq_lane_u8('.', lfCompare, 1);
 	}
 #endif
+	decoder_set_nextMask<isRaw>(src, len, nextMask);
 	long i;
 	for(i = -len; i; i += sizeof(uint8x16_t)*2) {
 		uint8x16x2_t data = vld1q_u8_x2_align(src+i, 32);
@@ -251,6 +254,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 							// terminator found
 							// there's probably faster ways to do this, but reverting to scalar code should be good enough
 							len += i;
+							nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
 							break;
 						}
 					}
@@ -301,6 +305,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 						);
 						if(LIKELIHOOD(0.001, neon_vect_is_nonzero(matchEnd))) {
 							len += i;
+							nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
 							break;
 						}
 					}
@@ -449,18 +454,6 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 #endif
 		}
 	}
-	if(isRaw) {
-		if(len != 0) { // have to gone through at least one loop cycle
-			if(src[i-2] == '\r' && src[i-1] == '\n' && src[i] == '.')
-				nextMask = 1;
-			else if(src[i-1] == '\r' && src[i] == '\n' && src[i+1] == '.')
-				nextMask = 2;
-			else
-				nextMask = 0;
-		}
-	} else
-		nextMask = 0;
 }
 void decoder_set_neon_funcs() {
@@ -468,6 +461,7 @@ void decoder_set_neon_funcs() {
 	_do_decode = &do_decode_simd<false, false, sizeof(uint8x16_t)*2, do_decode_neon<false, false> >;
 	_do_decode_raw = &do_decode_simd<true, false, sizeof(uint8x16_t)*2, do_decode_neon<true, false> >;
 	_do_decode_end_raw = &do_decode_simd<true, true, sizeof(uint8x16_t)*2, do_decode_neon<true, true> >;
+	_decode_isa = ISA_LEVEL_NEON;
 }
 #else
 void decoder_set_neon_funcs() {}

package/src/decoder_neon64.cc CHANGED Viewed

@@ -47,7 +47,7 @@ static HEDLEY_ALWAYS_INLINE uint8x16_t mergeCompares(uint8x16_t a, uint8x16_t b,
 template<bool isRaw, bool searchEnd>
-HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, long& len, unsigned char* HEDLEY_RESTRICT & p, unsigned char& escFirst, uint16_t& nextMask) {
+HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned char*& p, unsigned char& escFirst, uint16_t& nextMask) {
 	HEDLEY_ASSUME(escFirst == 0 || escFirst == 1);
 	HEDLEY_ASSUME(nextMask == 0 || nextMask == 1 || nextMask == 2);
 	uint8x16_t nextMaskMix = vdupq_n_u8(0);
@@ -56,6 +56,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 	if(nextMask == 2)
 		nextMaskMix = vsetq_lane_u8(2, nextMaskMix, 1);
 	uint8x16_t yencOffset = escFirst ? vmakeq_u8(42+64,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42) : vdupq_n_u8(42);
+	decoder_set_nextMask<isRaw>(src, len, nextMask);
 	long i;
 	for(i = -len; i; i += sizeof(uint8x16_t)*4) {
 		uint8x16x4_t data = _vld1q_u8_x4(src+i);
@@ -227,6 +230,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 							// terminator found
 							// there's probably faster ways to do this, but reverting to scalar code should be good enough
 							len += i;
+							nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
 							break;
 						}
 					}
@@ -275,6 +279,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 						);
 						if(LIKELIHOOD(0.001, neon_vect_is_nonzero(matchEnd))) {
 							len += i;
+							nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
 							break;
 						}
 					}
@@ -430,17 +435,6 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* HEDLEY_RESTRICT src, lon
 			yencOffset = vdupq_n_u8(42);
 		}
 	}
-	if(isRaw) {
-		if(len != 0) { // have to gone through at least one loop cycle
-			if(src[i-2] == '\r' && src[i-1] == '\n' && src[i] == '.')
-				nextMask = 1;
-			else if(src[i-1] == '\r' && src[i] == '\n' && src[i+1] == '.')
-				nextMask = 2;
-			else
-				nextMask = 0;
-		}
-	} else
-		nextMask = 0;
 }
 void decoder_set_neon_funcs() {
@@ -448,6 +442,7 @@ void decoder_set_neon_funcs() {
 	_do_decode = &do_decode_simd<false, false, sizeof(uint8x16_t)*4, do_decode_neon<false, false> >;
 	_do_decode_raw = &do_decode_simd<true, false, sizeof(uint8x16_t)*4, do_decode_neon<true, false> >;
 	_do_decode_end_raw = &do_decode_simd<true, true, sizeof(uint8x16_t)*4, do_decode_neon<true, true> >;
+	_decode_isa = ISA_LEVEL_NEON;
 }
 #else
 void decoder_set_neon_funcs() {}

package/src/decoder_sse2.cc CHANGED Viewed

@@ -10,6 +10,7 @@ void decoder_set_sse2_funcs() {
 	_do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_SSE2> >;
 	_do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_SSE2> >;
 	_do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_SSE2> >;
+	_decode_isa = ISA_LEVEL_SSE2;
 }
 #else
 void decoder_set_sse2_funcs() {}

package/src/decoder_sse_base.h CHANGED Viewed

@@ -7,6 +7,13 @@
 # define _mm_shrdi_epi16 _mm128_shrdi_epi16
 #endif
+#if defined(__tune_icelake_client__) || defined(__tune_icelake_server__) || defined(__tune_tigerlake__) || defined(__tune_rocketlake__) || defined(__tune_alderlake__) || defined(__tune_sapphirerapids__)
+# define COMPRESS_STORE _mm_mask_compressstoreu_epi8
+#else
+// avoid uCode on Zen4
+# define COMPRESS_STORE(dst, mask, vec) _mm_storeu_si128((__m128i*)(dst), _mm_maskz_compress_epi8(mask, vec))
+#endif
 // GCC (ver 6-10(dev)) fails to optimize pure C version of mask testing, but has this intrinsic; Clang >= 7 optimizes C version fine
 #if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
 # define KORTEST16(a, b) !_kortestz_mask16_u8((a), (b))
@@ -104,7 +111,7 @@ static HEDLEY_ALWAYS_INLINE __m128i sse2_compact_vect(uint32_t mask, __m128i dat
 }
 template<bool isRaw, bool searchEnd, enum YEncDecIsaLevel use_isa>
-HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long& len, unsigned char* HEDLEY_RESTRICT & p, unsigned char& _escFirst, uint16_t& _nextMask) {
+HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* src, long& len, unsigned char*& p, unsigned char& _escFirst, uint16_t& _nextMask) {
 	HEDLEY_ASSUME(_escFirst == 0 || _escFirst == 1);
 	HEDLEY_ASSUME(_nextMask == 0 || _nextMask == 1 || _nextMask == 2);
 	uintptr_t escFirst = _escFirst;
@@ -138,6 +145,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 		else
 			lfCompare = _mm_insert_epi16(lfCompare, _nextMask == 1 ? 0x0a2e /*".\n"*/ : 0x2e0a /*"\n."*/, 0);
 	}
+	decoder_set_nextMask<isRaw>(src, len, _nextMask); // set this before the loop because we can't check src after it's been overwritten
 	intptr_t i;
 	for(i = -len; i; i += sizeof(__m128i)*2) {
 		__m128i oDataA = _mm_load_si128((__m128i *)(src+i));
@@ -376,6 +386,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 							// terminator found
 							// there's probably faster ways to do this, but reverting to scalar code should be good enough
 							len += (long)i;
+							_nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
 							break;
 						}
 					}
@@ -485,6 +496,7 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 						if(endFound) {
 							len += (long)i;
+							_nextMask = decoder_set_nextMask<isRaw>(src+i, mask);
 							break;
 						}
 					}
@@ -649,9 +661,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 			if(use_isa >= ISA_LEVEL_SSSE3) {
 # if defined(__AVX512VBMI2__) && defined(__AVX512VL__) && defined(__POPCNT__)
 				if(use_isa >= ISA_LEVEL_VBMI2) {
-					_mm_mask_compressstoreu_epi8(p, KNOT16(mask), dataA);
+					COMPRESS_STORE(p, KNOT16(mask), dataA);
 					p -= popcnt32(mask & 0xffff);
-					_mm_mask_compressstoreu_epi8(p+XMM_SIZE, KNOT16(mask>>16), dataB);
+					COMPRESS_STORE(p+XMM_SIZE, KNOT16(mask>>16), dataB);
 					p -= popcnt32(mask>>16);
 					p += XMM_SIZE*2;
 				} else
@@ -703,16 +715,5 @@ HEDLEY_ALWAYS_INLINE void do_decode_sse(const uint8_t* HEDLEY_RESTRICT src, long
 		}
 	}
 	_escFirst = (unsigned char)escFirst;
-	if(isRaw) {
-		if(len != 0) { // have to gone through at least one loop cycle
-			if(src[i-2] == '\r' && src[i-1] == '\n' && src[i] == '.')
-				_nextMask = 1;
-			else if(src[i-1] == '\r' && src[i] == '\n' && src[i+1] == '.')
-				_nextMask = 2;
-			else
-				_nextMask = 0;
-		}
-	} else
-		_nextMask = 0;
 }
 #endif

package/src/decoder_ssse3.cc CHANGED Viewed

@@ -9,6 +9,7 @@ void decoder_set_ssse3_funcs() {
 	_do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_SSSE3> >;
 	_do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_SSSE3> >;
 	_do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_SSSE3> >;
+	_decode_isa = ISA_LEVEL_SSSE3;
 }
 #else
 void decoder_set_sse2_funcs();

package/src/decoder_vbmi2.cc CHANGED Viewed

@@ -1,5 +1,12 @@
 #include "common.h"
+extern const bool decoder_has_avx10;
+#if !defined(__EVEX512__) && (defined(__AVX10_1__) || defined(__EVEX256__)) && defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
+const bool decoder_has_avx10 = true;
+#else
+const bool decoder_has_avx10 = false;
+#endif
 #if defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
 # include "decoder_common.h"
 # ifndef YENC_DISABLE_AVX256
@@ -11,6 +18,7 @@ void decoder_set_vbmi2_funcs() {
 	_do_decode = &do_decode_simd<false, false, sizeof(__m256i)*2, do_decode_avx2<false, false, ISA_LEVEL_VBMI2> >;
 	_do_decode_raw = &do_decode_simd<true, false, sizeof(__m256i)*2, do_decode_avx2<true, false, ISA_LEVEL_VBMI2> >;
 	_do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m256i)*2, do_decode_avx2<true, true, ISA_LEVEL_VBMI2> >;
+	_decode_isa = ISA_LEVEL_VBMI2;
 }
 # else
 #  include "decoder_sse_base.h"
@@ -20,6 +28,7 @@ void decoder_set_vbmi2_funcs() {
 	_do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_VBMI2> >;
 	_do_decode_raw = &do_decode_simd<true, false, sizeof(__m128i)*2, do_decode_sse<true, false, ISA_LEVEL_VBMI2> >;
 	_do_decode_end_raw = &do_decode_simd<true, true, sizeof(__m128i)*2, do_decode_sse<true, true, ISA_LEVEL_VBMI2> >;
+	_decode_isa = ISA_LEVEL_VBMI2;
 }
 # endif
 #else

package/src/encoder.cc CHANGED Viewed

@@ -122,6 +122,7 @@ size_t do_encode_generic(int line_size, int* colOffset, const unsigned char* HED
 extern "C" {
 	size_t (*_do_encode)(int, int*, const unsigned char* HEDLEY_RESTRICT, unsigned char* HEDLEY_RESTRICT, size_t, int) = &do_encode_generic;
+	int _encode_isa = ISA_GENERIC;
 }
 void encoder_sse2_init();
@@ -129,7 +130,9 @@ void encoder_ssse3_init();
 void encoder_avx_init();
 void encoder_avx2_init();
 void encoder_vbmi2_init();
+extern const bool encoder_has_avx10;
 void encoder_neon_init();
+void encoder_rvv_init();
 #if defined(PLATFORM_X86) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE!=0
 # if defined(__AVX2__) && !defined(YENC_DISABLE_AVX256)
@@ -137,12 +140,14 @@ void encoder_neon_init();
 static inline void encoder_native_init() {
 	_do_encode = &do_encode_simd< do_encode_avx2<ISA_NATIVE> >;
 	encoder_avx2_lut<ISA_NATIVE>();
+	_encode_isa = ISA_NATIVE;
 }
 # else
 #  include "encoder_sse_base.h"
 static inline void encoder_native_init() {
 	_do_encode = &do_encode_simd< do_encode_sse<ISA_NATIVE> >;
 	encoder_sse_lut<ISA_NATIVE>();
+	_encode_isa = ISA_NATIVE;
 }
 # endif
 #endif
@@ -154,7 +159,7 @@ void encoder_init() {
 	encoder_native_init();
 # else
 	int use_isa = cpu_supports_isa();
-	if(use_isa >= ISA_LEVEL_VBMI2)
+	if(use_isa >= ISA_LEVEL_VBMI2 && (encoder_has_avx10 || (use_isa & ISA_FEATURE_EVEX512)))
 		encoder_vbmi2_init();
 	else if(use_isa >= ISA_LEVEL_AVX2)
 		encoder_avx2_init();
@@ -170,4 +175,8 @@ void encoder_init() {
 	if(cpu_supports_neon())
 		encoder_neon_init();
 #endif
+#ifdef __riscv
+	if(cpu_supports_rvv())
+		encoder_rvv_init();
+#endif
 }

package/src/encoder.h CHANGED Viewed

@@ -10,8 +10,12 @@ extern "C" {
 #include "hedley.h"
 extern size_t (*_do_encode)(int, int*, const unsigned char* HEDLEY_RESTRICT, unsigned char* HEDLEY_RESTRICT, size_t, int);
+extern int _encode_isa;
 #define do_encode (*_do_encode)
 void encoder_init();
+static inline int encode_isa_level() {
+	return _encode_isa;
+}

package/src/encoder_avx.cc CHANGED Viewed

@@ -6,6 +6,7 @@
 void encoder_avx_init() {
 	_do_encode = &do_encode_simd< do_encode_sse<ISA_LEVEL_SSE4_POPCNT> >;
 	encoder_sse_lut<ISA_LEVEL_SSE4_POPCNT>();
+	_encode_isa = ISA_LEVEL_AVX;
 }
 #else
 void encoder_ssse3_init();

package/src/encoder_avx2.cc CHANGED Viewed

@@ -6,6 +6,7 @@
 void encoder_avx2_init() {
 	_do_encode = &do_encode_simd< do_encode_avx2<ISA_LEVEL_AVX2> >;
 	encoder_avx2_lut<ISA_LEVEL_AVX2>();
+	_encode_isa = ISA_LEVEL_AVX2;
 }
 #else
 void encoder_avx_init();