npm - yencode - Versions diffs - 1.2.0 → 1.2.1 - Mend

yencode 1.2.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

package/src/decoder_common.h CHANGED Viewed

@@ -1,342 +1,32 @@
 #include "decoder.h"
+namespace RapidYenc { // forward declarations shared by the per-ISA decoder translation units
+	void decoder_set_sse2_funcs(); // defined in decoder_sse2.cc; an empty stub when __SSE2__ is not defined
+	void decoder_set_ssse3_funcs(); // NOTE(review): definition not visible in this diff view
+	void decoder_set_avx_funcs(); // NOTE(review): definition not visible in this diff view
+	void decoder_set_avx2_funcs(); // NOTE(review): definition not visible in this diff view
+	void decoder_set_vbmi2_funcs(); // NOTE(review): definition not visible in this diff view
+	extern const bool decoder_has_avx10; // defined elsewhere; NOTE(review): meaning not shown in this view
+	void decoder_set_neon_funcs(); // defined in decoder_neon.cc / decoder_neon64.cc; an empty stub when the NEON guard is not met
+	void decoder_set_rvv_funcs(); // defined in decoder_rvv.cc; an empty stub when __riscv_vector is not defined
+	template<bool isRaw, bool searchEnd>
+	YencDecoderEnd do_decode_scalar(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state); // scalar decoder; _do_decode_simd falls back to it when len <= width*2
+}
 #if defined(PLATFORM_ARM) && !defined(__aarch64__)
 #define YENC_DEC_USE_THINTABLE 1
 #endif
 // TODO: need to support max output length somehow
-// TODO: add branch probabilities
-// state var: refers to the previous state - only used for incremental processing
-template<bool isRaw>
-size_t do_decode_noend_scalar(const unsigned char* src, unsigned char* dest, size_t len, YencDecoderState* state) {
-	const unsigned char *es = src + len; // end source pointer
-	unsigned char *p = dest; // destination pointer
-	long i = -(long)len; // input position
-	unsigned char c; // input character
-	if(len < 1) return 0;
-	if(isRaw) {
-		if(state) switch(*state) {
-			case YDEC_STATE_EQ:
-				c = es[i];
-				*p++ = c - 42 - 64;
-				i++;
-				if(c == '\r') {
-					*state = YDEC_STATE_CR;
-					if(i >= 0) return 0;
-				} else {
-					*state = YDEC_STATE_NONE;
-					break;
-				}
-				// fall-thru
-			case YDEC_STATE_CR:
-				if(es[i] != '\n') break;
-				i++;
-				*state = YDEC_STATE_CRLF;
-				if(i >= 0) return 0;
-				// Else fall-thru
-			case YDEC_STATE_CRLF:
-				// skip past first dot
-				if(es[i] == '.') i++;
-				// fall-thru
-			default: break; // silence compiler warnings
-		} else // treat as YDEC_STATE_CRLF
-			if(es[i] == '.') i++;
-		for(; i < -2; i++) {
-			c = es[i];
-			switch(c) {
-				case '\r':
-					// skip past \r\n. sequences
-					//i += (es[i+1] == '\n' && es[i+2] == '.') << 1;
-					if(es[i+1] == '\n' && es[i+2] == '.')
-						i += 2;
-					// fall-thru
-				case '\n':
-					continue;
-				case '=':
-					c = es[i+1];
-					*p++ = c - 42 - 64;
-					i += (c != '\r'); // if we have a \r, reprocess character to deal with \r\n. case
-					continue;
-				default:
-					*p++ = c - 42;
-			}
-		}
-		if(state) *state = YDEC_STATE_NONE;
-		if(i == -2) { // 2nd last char
-			c = es[i];
-			switch(c) {
-				case '\r':
-					if(state && es[i+1] == '\n') {
-						*state = YDEC_STATE_CRLF;
-						return p - dest;
-					}
-					// Else fall-thru
-				case '\n':
-					break;
-				case '=':
-					c = es[i+1];
-					*p++ = c - 42 - 64;
-					i += (c != '\r');
-					break;
-				default:
-					*p++ = c - 42;
-			}
-			i++;
-		}
-		// do final char; we process this separately to prevent an overflow if the final char is '='
-		if(i == -1) {
-			c = es[i];
-			if(c != '\n' && c != '\r' && c != '=') {
-				*p++ = c - 42;
-			} else if(state) {
-				if(c == '=') *state = YDEC_STATE_EQ;
-				else if(c == '\r') *state = YDEC_STATE_CR;
-				else *state = YDEC_STATE_NONE;
-			}
-		}
-	} else {
-		if(state && *state == YDEC_STATE_EQ) {
-			*p++ = es[i] - 42 - 64;
-			i++;
-			*state = YDEC_STATE_NONE;
-		}
-		/*for(i = 0; i < len - 1; i++) {
-			c = src[i];
-			if(c == '\n' || c == '\r') continue;
-			unsigned char isEquals = (c == '=');
-			i += isEquals;
-			*p++ = src[i] - (42 + (isEquals << 6));
-		}*/
-		for(; i < -1; i++) {
-			c = es[i];
-			switch(c) {
-				case '\n': case '\r': continue;
-				case '=':
-					i++;
-					c = es[i] - 64;
-			}
-			*p++ = c - 42;
-		}
-		if(state) *state = YDEC_STATE_NONE;
-		// do final char; we process this separately to prevent an overflow if the final char is '='
-		if(i == -1) {
-			c = es[i];
-			if(c != '\n' && c != '\r' && c != '=') {
-				*p++ = c - 42;
-			} else
-				if(state) *state = (c == '=' ? YDEC_STATE_EQ : YDEC_STATE_NONE);
-		}
-	}
-	return p - dest;
-}
-template<bool isRaw>
-YencDecoderEnd do_decode_end_scalar(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
-	const unsigned char *es = (*src) + len; // end source pointer
-	unsigned char *p = *dest; // destination pointer
-	long i = -(long)len; // input position
-	unsigned char c; // input character
-	if(len < 1) return YDEC_END_NONE;
-#define YDEC_CHECK_END(s) if(i == 0) { \
-	*state = s; \
-	*src = es; \
-	*dest = p; \
-	return YDEC_END_NONE; \
-}
-	if(state) switch(*state) {
-		case YDEC_STATE_CRLFEQ: do_decode_endable_scalar_ceq:
-			if(es[i] == 'y') {
-				*state = YDEC_STATE_NONE;
-				*src = es+i+1;
-				*dest = p;
-				return YDEC_END_CONTROL;
-			} // Else fall-thru
-		case YDEC_STATE_EQ:
-			c = es[i];
-			*p++ = c - 42 - 64;
-			i++;
-			if(c != '\r') break;
-			YDEC_CHECK_END(YDEC_STATE_CR)
-			// fall-through
-		case YDEC_STATE_CR:
-			if(es[i] != '\n') break;
-			i++;
-			YDEC_CHECK_END(YDEC_STATE_CRLF)
-			// fall-through
-		case YDEC_STATE_CRLF: do_decode_endable_scalar_c0:
-			if(es[i] == '.' && isRaw) {
-				i++;
-				YDEC_CHECK_END(YDEC_STATE_CRLFDT)
-			} else if(es[i] == '=') {
-				i++;
-				YDEC_CHECK_END(YDEC_STATE_CRLFEQ)
-				goto do_decode_endable_scalar_ceq;
-			} else
-				break;
-			// fall-through
-		case YDEC_STATE_CRLFDT:
-			if(isRaw && es[i] == '\r') {
-				i++;
-				YDEC_CHECK_END(YDEC_STATE_CRLFDTCR)
-			} else if(isRaw && es[i] == '=') { // check for dot-stuffed ending: \r\n.=y
-				i++;
-				YDEC_CHECK_END(YDEC_STATE_CRLFEQ)
-				goto do_decode_endable_scalar_ceq;
-			} else
-				break;
-			// fall-through
-		case YDEC_STATE_CRLFDTCR:
-			if(es[i] == '\n') {
-				if(isRaw) {
-					*state = YDEC_STATE_CRLF;
-					*src = es + i + 1;
-					*dest = p;
-					return YDEC_END_ARTICLE;
-				} else {
-					i++;
-					YDEC_CHECK_END(YDEC_STATE_CRLF)
-					goto do_decode_endable_scalar_c0; // handle as CRLF
-				}
-			} else
-				break;
-		case YDEC_STATE_NONE: break; // silence compiler warning
-	} else // treat as YDEC_STATE_CRLF
-		goto do_decode_endable_scalar_c0;
-	for(; i < -2; i++) {
-		c = es[i];
-		switch(c) {
-			case '\r': if(es[i+1] == '\n') {
-				if(isRaw && es[i+2] == '.') {
-					// skip past \r\n. sequences
-					i += 3;
-					YDEC_CHECK_END(YDEC_STATE_CRLFDT)
-					// check for end
-					if(es[i] == '\r') {
-						i++;
-						YDEC_CHECK_END(YDEC_STATE_CRLFDTCR)
-						if(es[i] == '\n') {
-							*src = es + i + 1;
-							*dest = p;
-							*state = YDEC_STATE_CRLF;
-							return YDEC_END_ARTICLE;
-						} else i--;
-					} else if(es[i] == '=') {
-						i++;
-						YDEC_CHECK_END(YDEC_STATE_CRLFEQ)
-						if(es[i] == 'y') {
-							*src = es + i + 1;
-							*dest = p;
-							*state = YDEC_STATE_NONE;
-							return YDEC_END_CONTROL;
-						} else {
-							// escape char & continue
-							c = es[i];
-							*p++ = c - 42 - 64;
-							i -= (c == '\r');
-						}
-					} else i--;
-				}
-				else if(es[i+2] == '=') {
-					i += 3;
-					YDEC_CHECK_END(YDEC_STATE_CRLFEQ)
-					if(es[i] == 'y') {
-						// ended
-						*src = es + i + 1;
-						*dest = p;
-						*state = YDEC_STATE_NONE;
-						return YDEC_END_CONTROL;
-					} else {
-						// escape char & continue
-						c = es[i];
-						*p++ = c - 42 - 64;
-						i -= (c == '\r');
-					}
-				}
-			} // fall-thru
-			case '\n':
-				continue;
-			case '=':
-				c = es[i+1];
-				*p++ = c - 42 - 64;
-				i += (c != '\r'); // if we have a \r, reprocess character to deal with \r\n. case
-				continue;
-			default:
-				*p++ = c - 42;
-		}
-	}
-	if(state) *state = YDEC_STATE_NONE;
-	if(i == -2) { // 2nd last char
-		c = es[i];
-		switch(c) {
-			case '\r':
-				if(state && es[i+1] == '\n') {
-					*state = YDEC_STATE_CRLF;
-					*src = es;
-					*dest = p;
-					return YDEC_END_NONE;
-				}
-				// Else fall-thru
-			case '\n':
-				break;
-			case '=':
-				c = es[i+1];
-				*p++ = c - 42 - 64;
-				i += (c != '\r');
-				break;
-			default:
-				*p++ = c - 42;
-		}
-		i++;
-	}
-	// do final char; we process this separately to prevent an overflow if the final char is '='
-	if(i == -1) {
-		c = es[i];
-		if(c != '\n' && c != '\r' && c != '=') {
-			*p++ = c - 42;
-		} else if(state) {
-			if(c == '=') *state = YDEC_STATE_EQ;
-			else if(c == '\r') *state = YDEC_STATE_CR;
-			else *state = YDEC_STATE_NONE;
-		}
-	}
-#undef YDEC_CHECK_END
-	*src = es;
-	*dest = p;
-	return YDEC_END_NONE;
-}
-template<bool isRaw, bool searchEnd>
-YencDecoderEnd do_decode_scalar(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
-	if(searchEnd)
-		return do_decode_end_scalar<isRaw>(src, dest, len, state);
-	*dest += do_decode_noend_scalar<isRaw>(*src, *dest, len, state);
-	*src += len;
-	return YDEC_END_NONE;
-}
 template<bool isRaw, bool searchEnd, void(&kernel)(const uint8_t*, long&, unsigned char*&, unsigned char&, uint16_t&)>
-inline YencDecoderEnd _do_decode_simd(size_t width, const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
+static inline RapidYenc::YencDecoderEnd _do_decode_simd(size_t width, const unsigned char** src, unsigned char** dest, size_t len, RapidYenc::YencDecoderState* state) {
+	using namespace RapidYenc;
 	if(len <= width*2) return do_decode_scalar<isRaw, searchEnd>(src, dest, len, state);
 	YencDecoderState tState = YDEC_STATE_CRLF;
@@ -466,17 +156,19 @@ inline YencDecoderEnd _do_decode_simd(size_t width, const unsigned char** src, u
 }
 template<bool isRaw, bool searchEnd, size_t width, void(&kernel)(const uint8_t*, long&, unsigned char*&, unsigned char&, uint16_t&)> // width is a compile-time constant (callers pass e.g. sizeof(__m128i)*2)
-YencDecoderEnd do_decode_simd(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
+static RapidYenc::YencDecoderEnd do_decode_simd(const unsigned char** src, unsigned char** dest, size_t len, RapidYenc::YencDecoderState* state) {
 	return _do_decode_simd<isRaw, searchEnd, kernel>(width, src, dest, len, state); // thin wrapper: forwards the fixed width plus all arguments unchanged
 }
 template<bool isRaw, bool searchEnd, size_t(&getWidth)(), void(&kernel)(const uint8_t*, long&, unsigned char*&, unsigned char&, uint16_t&)> // overload for widths only known at run time (decoder_rvv.cc passes decoder_rvv_width)
-YencDecoderEnd do_decode_simd(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
+static RapidYenc::YencDecoderEnd do_decode_simd(const unsigned char** src, unsigned char** dest, size_t len, RapidYenc::YencDecoderState* state) {
 	return _do_decode_simd<isRaw, searchEnd, kernel>(getWidth(), src, dest, len, state); // getWidth() is evaluated on every call
 }
 #if defined(PLATFORM_X86) || defined(PLATFORM_ARM)
-void decoder_init_lut(void* compactLUT);
+namespace RapidYenc {
+	void decoder_init_lut(void* compactLUT); // called by the SSE2 and NEON setters with their compact lookup table; presumably populates it - definition not visible here
+}
 #endif
 template<bool isRaw>
@@ -509,16 +201,20 @@ static inline uint16_t decoder_set_nextMask(const uint8_t* src, unsigned mask) {
 // resolve invalid sequences of = to deal with cases like '===='
 // bit hack inspired from simdjson: https://youtu.be/wlvKAT7SZIQ?t=33m38s
 template<typename T>
-static inline T fix_eqMask(T mask) {
+static inline T fix_eqMask(T mask, T maskShift1) { // mask: one bit per `=` found; maskShift1: callers pass (mask << 1) | escFirst
 	// isolate the start of each consecutive bit group (e.g. 01011101 -> 01000101)
-	T start = mask & ~(mask << 1);
+	T start = mask & ~maskShift1; // a group starts where a set bit has no set bit directly below it (bit 0 looks at escFirst)
+	// this strategy works by firstly separating groups that start on even/odd bits
+	// generally, it doesn't matter which one (even/odd) we pick, but clearing even groups specifically allows the escFirst bit in maskShift1 to work
+	// (this is because the start of the escFirst group is at index -1, an odd bit, but we can't clear it due to being < 0, so we just retain all odd groups instead)
-	const T odd = (T)0xaaaaaaaaaaaaaaaa; // every odd bit (10101010...)
+	const T even = (T)0x5555555555555555; // every even bit (01010101...)
-	// obtain groups which start on an even bit (clear groups that start on an odd bit, but this leaves an unwanted trailing bit)
-	T evenGroups = mask + (start & odd);
+	// obtain groups which start on an odd bit (clear groups that start on an even bit, but this leaves an unwanted trailing bit)
+	T oddGroups = mask + (start & even); // adding a group's start bit carries through the whole run of ones
-	// clear odd bits in even groups, whilst conversely preserving odd bits in odd groups
+	// clear even bits in odd groups, whilst conversely preserving even bits in even groups
 	// the `& mask` also conveniently gets rid of unwanted trailing bits
-	return (evenGroups ^ odd) & mask;
+	return (oddGroups ^ even) & mask; // result keeps every other bit of each run, i.e. only the `=` that are not themselves escaped (e.g. mask 0110 -> 0010)
 }

package/src/decoder_neon.cc CHANGED Viewed

@@ -1,8 +1,8 @@
 #include "common.h"
-#ifdef __ARM_NEON
 #include "decoder_common.h"
+#ifdef __ARM_NEON
 #if defined(_MSC_VER) && !defined(__clang__)
 # define vld1_u8_align(p, a) vld1_u8_ex(p, a*8)
@@ -53,6 +53,8 @@ static bool neon_vect_is_nonzero(uint8x16_t v) {
 }
+namespace RapidYenc {
 template<bool isRaw, bool searchEnd>
 HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned char*& p, unsigned char& escFirst, uint16_t& nextMask) {
 	HEDLEY_ASSUME(escFirst == 0 || escFirst == 1);
@@ -322,8 +324,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
 			// a spec compliant encoder should never generate sequences: ==, =\n and =\r, but we'll handle them to be spec compliant
 			// the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that
 			// firstly, check for invalid sequences of = (we assume that these are rare, as a spec compliant yEnc encoder should not generate these)
-			if(LIKELIHOOD(0.0001, (mask & ((maskEq << 1) | escFirst)) != 0)) {
-				maskEq = fix_eqMask<uint32_t>(maskEq & ~escFirst);
+			uint32_t maskEqShift1 = (maskEq << 1) | escFirst;
+			if(LIKELIHOOD(0.0001, (mask & maskEqShift1) != 0)) {
+				maskEq = fix_eqMask<uint32_t>(maskEq, maskEqShift1);
 				unsigned char nextEscFirst = maskEq>>31;
 				// next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed
@@ -445,8 +448,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
 		}
 	}
 }
+} // namespace
-void decoder_set_neon_funcs() {
+void RapidYenc::decoder_set_neon_funcs() {
 	decoder_init_lut(compactLUT);
 	_do_decode = &do_decode_simd<false, false, sizeof(uint8x16_t)*2, do_decode_neon<false, false> >;
 	_do_decode_raw = &do_decode_simd<true, false, sizeof(uint8x16_t)*2, do_decode_neon<true, false> >;
@@ -454,5 +458,5 @@ void decoder_set_neon_funcs() {
 	_decode_isa = ISA_LEVEL_NEON;
 }
 #else
-void decoder_set_neon_funcs() {}
+void RapidYenc::decoder_set_neon_funcs() {}
 #endif

package/src/decoder_neon64.cc CHANGED Viewed

@@ -1,7 +1,7 @@
 #include "common.h"
+#include "decoder_common.h"
 #if defined(__ARM_NEON) && defined(__aarch64__)
-#include "decoder_common.h"
 #pragma pack(16)
 static struct { char bytes[16]; } ALIGN_TO(16, compactLUT[32768]);
@@ -44,6 +44,8 @@ static HEDLEY_ALWAYS_INLINE uint8x16_t mergeCompares(uint8x16_t a, uint8x16_t b,
 }
+namespace RapidYenc {
 template<bool isRaw, bool searchEnd>
 HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned char*& p, unsigned char& escFirst, uint16_t& nextMask) {
 	HEDLEY_ASSUME(escFirst == 0 || escFirst == 1);
@@ -290,8 +292,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
 			// a spec compliant encoder should never generate sequences: ==, =\n and =\r, but we'll handle them to be spec compliant
 			// the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that
 			// firstly, check for invalid sequences of = (we assume that these are rare, as a spec compliant yEnc encoder should not generate these)
-			if(LIKELIHOOD(0.0001, (mask & ((maskEq << 1) | escFirst)) != 0)) {
-				maskEq = fix_eqMask<uint64_t>(maskEq & ~(uint64_t)escFirst);
+			uint64_t maskEqShift1 = (maskEq << 1) | escFirst;
+			if(LIKELIHOOD(0.0001, (mask & maskEqShift1) != 0)) {
+				maskEq = fix_eqMask<uint64_t>(maskEq, maskEqShift1);
 				unsigned char nextEscFirst = maskEq>>63;
 				// next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed
@@ -429,8 +432,9 @@ HEDLEY_ALWAYS_INLINE void do_decode_neon(const uint8_t* src, long& len, unsigned
 		}
 	}
 }
+} // namespace
-void decoder_set_neon_funcs() {
+void RapidYenc::decoder_set_neon_funcs() {
 	decoder_init_lut(compactLUT);
 	_do_decode = &do_decode_simd<false, false, sizeof(uint8x16_t)*4, do_decode_neon<false, false> >;
 	_do_decode_raw = &do_decode_simd<true, false, sizeof(uint8x16_t)*4, do_decode_neon<true, false> >;
@@ -438,5 +442,5 @@ void decoder_set_neon_funcs() {
 	_decode_isa = ISA_LEVEL_NEON;
 }
 #else
-void decoder_set_neon_funcs() {}
+void RapidYenc::decoder_set_neon_funcs() {}
 #endif

package/src/decoder_rvv.cc CHANGED Viewed

@@ -1,6 +1,6 @@
 #include "common.h"
-#ifdef __riscv_vector
 #include "decoder_common.h"
+#ifdef __riscv_vector
 #ifdef __riscv_v_intrinsic
@@ -29,6 +29,17 @@ static inline vbool4_t mask_lshift(vbool4_t m, unsigned shiftIn, size_t vl) {
 		RV_MASK_CAST(4, 8, mvl), RV_MASK_CAST(4, 8, mvr), vl
 	);
 }
+template<int shift>
+static inline vbool64_t mask_lshift(vbool64_t m, unsigned shiftIn, size_t vl) { // vbool64_t counterpart of the vbool4_t overload above; shifts mask m up by `shift` bits, vl = number of mask bits
+	vuint8m1_t mv = RV_VEC_CAST(64, 8, m); // reinterpret the mask as a vector of bytes
+	vuint8m1_t mvl = RV(vsll_vx_u8m1)(mv, shift, vl/8); // NOTE(review): vl/8 rounds down to 0 when vl < 8, yet the caller in do_decode_rvv passes vl2/64 - confirm small vl is handled
+	vuint8m1_t mvr = RV(vsrl_vx_u8m1)(mv, 8-shift, vl/8); // the bits that fall off the top of each byte
+	mvr = RV(vslide1up_vx_u8m1)(mvr, shiftIn, vl/8); // move them up one byte; shiftIn becomes the value for byte 0
+	return RV(vmor_mm_b64)( // merge the in-byte shift with the carried-over bits
+		RV_MASK_CAST(64, 8, mvl), RV_MASK_CAST(64, 8, mvr), vl
+	);
+}
 static inline vuint8m2_t set_first_vu8(vuint8m2_t src, uint8_t item, size_t vl) {
 #ifdef __riscv_v_intrinsic
@@ -48,6 +59,7 @@ static inline vuint16m2_t set_first_vu16(vuint16m2_t src, uint16_t item, size_t
 }
+namespace RapidYenc {
 template<bool isRaw, bool searchEnd>
 HEDLEY_ALWAYS_INLINE void do_decode_rvv(const uint8_t* src, long& len, unsigned char*& outp, unsigned char& escFirst, uint16_t& nextMask) {
@@ -195,48 +207,41 @@ HEDLEY_ALWAYS_INLINE void do_decode_rvv(const uint8_t* src, long& len, unsigned
 			// the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that
 			// firstly, check for invalid sequences of = (we assume that these are rare, as a spec compliant yEnc encoder should not generate these)
 			if(LIKELIHOOD(0.0001, RV(vcpop_m_b4)(RV(vmandn_mm_b4)(cmpEqShift1, cmp, vl2), vl2) != 0)) {
-				// note: we assume that uintptr_t corresponds with __riscv_xlen
-				#if __riscv_xlen == 64
-				vuint64m1_t cmpEqW = RV_VEC_CAST(4, 64, cmpEq);
-				#else
-				vuint32m1_t cmpEqW = RV_VEC_CAST(4, 32, cmpEq);
-				#endif
-				size_t nextShiftDown = (vl2 > sizeof(uintptr_t)*8 ? sizeof(uintptr_t)*8 : vl2) - 1;
-				size_t wvl = (vl2 + sizeof(uintptr_t)*8 -1) / (sizeof(uintptr_t)*8);
-				for(size_t w=0; w<vl2; w+=sizeof(uintptr_t)*8) {
-					// extract bottom word
-					#if __riscv_xlen == 64
-					uintptr_t maskW = RV(vmv_x_s_u64m1_u64)(cmpEqW);
-					#else
-					uintptr_t maskW = RV(vmv_x_s_u32m1_u32)(cmpEqW);
-					#endif
-					// fix it
-					maskW = fix_eqMask<uintptr_t>(maskW & ~(uintptr_t)escFirst);
-					uint8_t nextEscFirst = (maskW >> nextShiftDown) & 1;
-					// shift it up (will be used for cmpEqShift1)
-					maskW = (maskW<<1) | escFirst; // TODO: should this be done using mask_lshift<1> instead?
-					escFirst = nextEscFirst;
-					// slide the new value in from the top
-					#if __riscv_xlen == 64
-					cmpEqW = RV(vslide1down_vx_u64m1)(cmpEqW, maskW, wvl);
-					#else
-					cmpEqW = RV(vslide1down_vx_u32m1)(cmpEqW, maskW, wvl);
-					#endif
+				// replicate fix_eqMask, but in vector form
+				vbool4_t groupStart = RV(vmandn_mm_b4)(cmpEq, cmpEqShift1, vl2);
+				vbool4_t evenBits = RV_MASK_CAST(4, 8, RV(vmv_v_x_u8m1)(0x55, vl2));
+				vbool4_t evenStart = RV(vmand_mm_b4)(groupStart, evenBits, vl2);
+				// compute `cmpEq + evenStart` to obtain oddGroups
+				vbool4_t oddGroups;
+				vuint64m1_t cmpEq64 = RV_VEC_CAST(4, 64, cmpEq);
+				vuint64m1_t evenStart64 = RV_VEC_CAST(4, 64, evenStart);
+				vuint64m1_t oddGroups64;
+				if(vl2 <= 64) {
+					// no loop needed - single 64b add will work
+					oddGroups64 = RV(vadd_vv_u64m1)(cmpEq64, evenStart64, 1);
+				} else {
+					// need to loop whilst the add causes a carry
+					unsigned vl64 = vl2/64;
+					vbool64_t carry = RV(vmadc_vv_u64m1_b64)(cmpEq64, evenStart64, vl64);
+					carry = mask_lshift<1>(carry, 0, vl64);
+					oddGroups64 = RV(vadd_vv_u64m1)(cmpEq64, evenStart64, 1);
+					while(RV(vcpop_m_b64)(carry, vl64)) {
+						vbool64_t nextCarry = RV(vmadc_vx_u64m1_b64)(oddGroups64, 1, vl64);
+						oddGroups64 = RV(vadd_vx_u64m1_mu)(carry, oddGroups64, oddGroups64, 1, vl64);
+						carry = mask_lshift<1>(nextCarry, 0, vl64);
+					}
 				}
-				#if __riscv_xlen == 64
-				cmpEqShift1 = RV_MASK_CAST(4, 64, cmpEqW);
-				#else
-				cmpEqShift1 = RV_MASK_CAST(4, 32, cmpEqW);
-				#endif
+				oddGroups = RV_MASK_CAST(4, 64, oddGroups64);
+				cmpEq = RV(vmand_mm_b4)(RV(vmxor_mm_b4)(oddGroups, evenBits, vl2), cmpEq, vl2);
+				cmpEqShift1 = mask_lshift<1>(cmpEq, escFirst, vl2);
 				cmp = RV(vmor_mm_b4)(cmpEqShift1, cmp, vl2); // ~(~cmp & ~cmpEqShift1)
 				numOutputChars = RV(vcpop_m_b4)(cmp, vl2);
-			} else {
-				// no invalid = sequences found - don't need to fix up cmpEq
-				escFirst = RV(vcpop_m_b4)(RV(vmand_mm_b4)(cmpEq, lastBit, vl2), vl2);
 			}
+			escFirst = RV(vcpop_m_b4)(RV(vmand_mm_b4)(cmpEq, lastBit, vl2), vl2);
 			data = RV(vsub_vv_u8m2)(data, RV_vmerge_vxm_u8m2(yencOffset, 64+42, cmpEqShift1, vl2), vl2);
 			yencOffset = set_first_vu8(yencOffset, 42 | (escFirst<<6), vl2);
@@ -262,13 +267,14 @@ HEDLEY_ALWAYS_INLINE void do_decode_rvv(const uint8_t* src, long& len, unsigned
 size_t decoder_rvv_width() { // supplied as the getWidth template argument of do_decode_simd in decoder_set_rvv_funcs
 	return RV(vsetvlmax_e8m2)(); // presumably the maximum vector length for 8-bit elements at LMUL=2 - RV() macro is defined outside this view
 }
+} // namespace
-void decoder_set_rvv_funcs() {
+void RapidYenc::decoder_set_rvv_funcs() { // points the decoder entry points at the RVV kernels and records the ISA level
 	_do_decode = &do_decode_simd<false, false, decoder_rvv_width, do_decode_rvv<false, false> >; // isRaw=false, searchEnd=false
 	_do_decode_raw = &do_decode_simd<true, false, decoder_rvv_width, do_decode_rvv<true, false> >; // isRaw=true, searchEnd=false
 	_do_decode_end_raw = &do_decode_simd<true, true, decoder_rvv_width, do_decode_rvv<true, true> >; // isRaw=true, searchEnd=true
 	_decode_isa = ISA_LEVEL_RVV;
 }
 #else
-void decoder_set_rvv_funcs() {}
+void RapidYenc::decoder_set_rvv_funcs() {}
 #endif

package/src/decoder_sse2.cc CHANGED Viewed

@@ -1,10 +1,10 @@
 #include "common.h"
-#ifdef __SSE2__
 #include "decoder_common.h"
+#ifdef __SSE2__
 #include "decoder_sse_base.h"
-void decoder_sse_init(SSELookups* HEDLEY_RESTRICT& lookups) {
+void RapidYenc::decoder_sse_init(RapidYenc::SSELookups* HEDLEY_RESTRICT& lookups) {
 	ALIGN_ALLOC(lookups, sizeof(SSELookups), 16);
 	for(int i=0; i<256; i++) {
 		lookups->BitsSetTable256inv[i] = 8 - (
@@ -25,7 +25,7 @@ void decoder_sse_init(SSELookups* HEDLEY_RESTRICT& lookups) {
 	}
 }
-void decoder_set_sse2_funcs() {
+void RapidYenc::decoder_set_sse2_funcs() {
 	decoder_sse_init(lookups);
 	decoder_init_lut(lookups->compact);
 	_do_decode = &do_decode_simd<false, false, sizeof(__m128i)*2, do_decode_sse<false, false, ISA_LEVEL_SSE2> >;
@@ -34,5 +34,5 @@ void decoder_set_sse2_funcs() {
 	_decode_isa = ISA_LEVEL_SSE2;
 }
 #else
-void decoder_set_sse2_funcs() {}
+void RapidYenc::decoder_set_sse2_funcs() {}
 #endif