deflate-ruby 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138)
  1. checksums.yaml +4 -4
  2. data/CLAUDE.md +95 -92
  3. data/LICENSE.txt +6 -6
  4. data/README.md +87 -65
  5. data/Rakefile +23 -0
  6. data/ext/deflate_ruby/{libdeflate/lib/x86/adler32_impl.h → adler32_impl.h} +8 -7
  7. data/ext/deflate_ruby/common_defs.h +748 -0
  8. data/ext/deflate_ruby/{libdeflate/lib/x86/cpu_features.c → cpu_features.c} +46 -16
  9. data/ext/deflate_ruby/{libdeflate/lib/x86/cpu_features.h → cpu_features.h} +2 -1
  10. data/ext/deflate_ruby/{libdeflate/lib/x86/crc32_impl.h → crc32_impl.h} +22 -23
  11. data/ext/deflate_ruby/{libdeflate/lib/crc32_multipliers.h → crc32_multipliers.h} +2 -4
  12. data/ext/deflate_ruby/{libdeflate/lib/x86/crc32_pclmul_template.h → crc32_pclmul_template.h} +23 -94
  13. data/ext/deflate_ruby/{libdeflate/lib/crc32_tables.h → crc32_tables.h} +1 -1
  14. data/ext/deflate_ruby/{libdeflate/lib/deflate_compress.c → deflate_compress.c} +59 -60
  15. data/ext/deflate_ruby/deflate_ruby.c +392 -218
  16. data/ext/deflate_ruby/deflate_ruby.h +6 -0
  17. data/ext/deflate_ruby/extconf.rb +35 -25
  18. data/ext/deflate_ruby/libdeflate/adler32.c +162 -0
  19. data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/adler32_impl.h +14 -7
  20. data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/crc32_impl.h +25 -31
  21. data/ext/deflate_ruby/libdeflate/arm/crc32_pmull_helpers.h +156 -0
  22. data/ext/deflate_ruby/libdeflate/arm/crc32_pmull_wide.h +226 -0
  23. data/ext/deflate_ruby/libdeflate/bt_matchfinder.h +342 -0
  24. data/ext/deflate_ruby/libdeflate/common_defs.h +2 -1
  25. data/ext/deflate_ruby/libdeflate/cpu_features_common.h +93 -0
  26. data/ext/deflate_ruby/libdeflate/crc32.c +262 -0
  27. data/ext/deflate_ruby/libdeflate/crc32_multipliers.h +375 -0
  28. data/ext/deflate_ruby/libdeflate/crc32_tables.h +587 -0
  29. data/ext/deflate_ruby/libdeflate/decompress_template.h +777 -0
  30. data/ext/deflate_ruby/libdeflate/deflate_compress.c +4128 -0
  31. data/ext/deflate_ruby/libdeflate/deflate_compress.h +15 -0
  32. data/ext/deflate_ruby/libdeflate/deflate_constants.h +56 -0
  33. data/ext/deflate_ruby/libdeflate/deflate_decompress.c +1208 -0
  34. data/ext/deflate_ruby/libdeflate/gzip_compress.c +90 -0
  35. data/ext/deflate_ruby/libdeflate/gzip_constants.h +45 -0
  36. data/ext/deflate_ruby/libdeflate/gzip_decompress.c +144 -0
  37. data/ext/deflate_ruby/libdeflate/hc_matchfinder.h +401 -0
  38. data/ext/deflate_ruby/libdeflate/ht_matchfinder.h +234 -0
  39. data/ext/deflate_ruby/libdeflate/lib_common.h +106 -0
  40. data/ext/deflate_ruby/libdeflate/libdeflate.h +2 -2
  41. data/ext/deflate_ruby/libdeflate/{lib/matchfinder_common.h → matchfinder_common.h} +3 -3
  42. data/ext/deflate_ruby/libdeflate/x86/adler32_impl.h +135 -0
  43. data/ext/deflate_ruby/libdeflate/x86/adler32_template.h +518 -0
  44. data/ext/deflate_ruby/libdeflate/x86/cpu_features.c +213 -0
  45. data/ext/deflate_ruby/libdeflate/x86/cpu_features.h +170 -0
  46. data/ext/deflate_ruby/libdeflate/x86/crc32_impl.h +159 -0
  47. data/ext/deflate_ruby/libdeflate/x86/crc32_pclmul_template.h +424 -0
  48. data/ext/deflate_ruby/libdeflate/x86/decompress_impl.h +57 -0
  49. data/ext/deflate_ruby/libdeflate.h +411 -0
  50. data/ext/deflate_ruby/matchfinder_common.h +224 -0
  51. data/ext/deflate_ruby/matchfinder_impl.h +122 -0
  52. data/ext/deflate_ruby/utils.c +141 -0
  53. data/ext/deflate_ruby/zlib_compress.c +82 -0
  54. data/ext/deflate_ruby/zlib_constants.h +21 -0
  55. data/ext/deflate_ruby/zlib_decompress.c +104 -0
  56. data/lib/deflate_ruby/version.rb +1 -1
  57. data/lib/deflate_ruby.rb +1 -63
  58. data/sig/deflate_ruby.rbs +4 -0
  59. data/test/test_deflate_ruby.rb +220 -0
  60. data/test/test_helper.rb +6 -0
  61. metadata +89 -144
  62. data/ext/deflate_ruby/libdeflate/CMakeLists.txt +0 -270
  63. data/ext/deflate_ruby/libdeflate/NEWS.md +0 -494
  64. data/ext/deflate_ruby/libdeflate/README.md +0 -228
  65. data/ext/deflate_ruby/libdeflate/libdeflate-config.cmake.in +0 -3
  66. data/ext/deflate_ruby/libdeflate/libdeflate.pc.in +0 -18
  67. data/ext/deflate_ruby/libdeflate/programs/CMakeLists.txt +0 -105
  68. data/ext/deflate_ruby/libdeflate/programs/benchmark.c +0 -696
  69. data/ext/deflate_ruby/libdeflate/programs/checksum.c +0 -218
  70. data/ext/deflate_ruby/libdeflate/programs/config.h.in +0 -19
  71. data/ext/deflate_ruby/libdeflate/programs/gzip.c +0 -688
  72. data/ext/deflate_ruby/libdeflate/programs/prog_util.c +0 -521
  73. data/ext/deflate_ruby/libdeflate/programs/prog_util.h +0 -225
  74. data/ext/deflate_ruby/libdeflate/programs/test_checksums.c +0 -200
  75. data/ext/deflate_ruby/libdeflate/programs/test_custom_malloc.c +0 -155
  76. data/ext/deflate_ruby/libdeflate/programs/test_incomplete_codes.c +0 -385
  77. data/ext/deflate_ruby/libdeflate/programs/test_invalid_streams.c +0 -130
  78. data/ext/deflate_ruby/libdeflate/programs/test_litrunlen_overflow.c +0 -72
  79. data/ext/deflate_ruby/libdeflate/programs/test_overread.c +0 -95
  80. data/ext/deflate_ruby/libdeflate/programs/test_slow_decompression.c +0 -472
  81. data/ext/deflate_ruby/libdeflate/programs/test_trailing_bytes.c +0 -151
  82. data/ext/deflate_ruby/libdeflate/programs/test_util.c +0 -237
  83. data/ext/deflate_ruby/libdeflate/programs/test_util.h +0 -61
  84. data/ext/deflate_ruby/libdeflate/programs/tgetopt.c +0 -118
  85. data/ext/deflate_ruby/libdeflate/scripts/android_build.sh +0 -118
  86. data/ext/deflate_ruby/libdeflate/scripts/android_tests.sh +0 -69
  87. data/ext/deflate_ruby/libdeflate/scripts/benchmark.sh +0 -10
  88. data/ext/deflate_ruby/libdeflate/scripts/checksum.sh +0 -10
  89. data/ext/deflate_ruby/libdeflate/scripts/checksum_benchmarks.sh +0 -253
  90. data/ext/deflate_ruby/libdeflate/scripts/cmake-helper.sh +0 -17
  91. data/ext/deflate_ruby/libdeflate/scripts/deflate_benchmarks.sh +0 -119
  92. data/ext/deflate_ruby/libdeflate/scripts/exec_tests.sh +0 -38
  93. data/ext/deflate_ruby/libdeflate/scripts/gen-release-archives.sh +0 -37
  94. data/ext/deflate_ruby/libdeflate/scripts/gen_bitreverse_tab.py +0 -19
  95. data/ext/deflate_ruby/libdeflate/scripts/gen_crc32_multipliers.c +0 -199
  96. data/ext/deflate_ruby/libdeflate/scripts/gen_crc32_tables.c +0 -105
  97. data/ext/deflate_ruby/libdeflate/scripts/gen_default_litlen_costs.py +0 -44
  98. data/ext/deflate_ruby/libdeflate/scripts/gen_offset_slot_map.py +0 -29
  99. data/ext/deflate_ruby/libdeflate/scripts/gzip_tests.sh +0 -523
  100. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_compress/corpus/0 +0 -0
  101. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_compress/fuzz.c +0 -95
  102. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_decompress/corpus/0 +0 -3
  103. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_decompress/fuzz.c +0 -62
  104. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/fuzz.sh +0 -108
  105. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/gzip_decompress/corpus/0 +0 -0
  106. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/gzip_decompress/fuzz.c +0 -19
  107. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/zlib_decompress/corpus/0 +0 -3
  108. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/zlib_decompress/fuzz.c +0 -19
  109. data/ext/deflate_ruby/libdeflate/scripts/run_tests.sh +0 -416
  110. data/ext/deflate_ruby/libdeflate/scripts/toolchain-i686-w64-mingw32.cmake +0 -8
  111. data/ext/deflate_ruby/libdeflate/scripts/toolchain-x86_64-w64-mingw32.cmake +0 -8
  112. data/ext/deflate_ruby/{libdeflate/lib/adler32.c → adler32.c} +0 -0
  113. data/ext/deflate_ruby/{libdeflate/lib/x86/adler32_template.h → adler32_template.h} +0 -0
  114. data/ext/deflate_ruby/{libdeflate/lib/bt_matchfinder.h → bt_matchfinder.h} +0 -0
  115. data/ext/deflate_ruby/{libdeflate/lib/cpu_features_common.h → cpu_features_common.h} +0 -0
  116. data/ext/deflate_ruby/{libdeflate/lib/crc32.c → crc32.c} +0 -0
  117. data/ext/deflate_ruby/{libdeflate/lib/arm/crc32_pmull_helpers.h → crc32_pmull_helpers.h} +0 -0
  118. data/ext/deflate_ruby/{libdeflate/lib/arm/crc32_pmull_wide.h → crc32_pmull_wide.h} +0 -0
  119. data/ext/deflate_ruby/{libdeflate/lib/x86/decompress_impl.h → decompress_impl.h} +0 -0
  120. data/ext/deflate_ruby/{libdeflate/lib/decompress_template.h → decompress_template.h} +0 -0
  121. data/ext/deflate_ruby/{libdeflate/lib/deflate_compress.h → deflate_compress.h} +0 -0
  122. data/ext/deflate_ruby/{libdeflate/lib/deflate_constants.h → deflate_constants.h} +0 -0
  123. data/ext/deflate_ruby/{libdeflate/lib/deflate_decompress.c → deflate_decompress.c} +0 -0
  124. data/ext/deflate_ruby/{libdeflate/lib/gzip_compress.c → gzip_compress.c} +0 -0
  125. data/ext/deflate_ruby/{libdeflate/lib/gzip_constants.h → gzip_constants.h} +0 -0
  126. data/ext/deflate_ruby/{libdeflate/lib/gzip_decompress.c → gzip_decompress.c} +0 -0
  127. data/ext/deflate_ruby/{libdeflate/lib/hc_matchfinder.h → hc_matchfinder.h} +0 -0
  128. data/ext/deflate_ruby/{libdeflate/lib/ht_matchfinder.h → ht_matchfinder.h} +0 -0
  129. data/ext/deflate_ruby/{libdeflate/lib/lib_common.h → lib_common.h} +0 -0
  130. data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/cpu_features.c +0 -0
  131. data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/cpu_features.h +0 -0
  132. data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/matchfinder_impl.h +0 -0
  133. data/ext/deflate_ruby/libdeflate/{lib/riscv → riscv}/matchfinder_impl.h +0 -0
  134. data/ext/deflate_ruby/libdeflate/{lib/utils.c → utils.c} +0 -0
  135. data/ext/deflate_ruby/libdeflate/{lib/x86 → x86}/matchfinder_impl.h +0 -0
  136. data/ext/deflate_ruby/libdeflate/{lib/zlib_compress.c → zlib_compress.c} +0 -0
  137. data/ext/deflate_ruby/libdeflate/{lib/zlib_constants.h → zlib_constants.h} +0 -0
  138. data/ext/deflate_ruby/libdeflate/{lib/zlib_decompress.c → zlib_decompress.c} +0 -0
data/ext/deflate_ruby/deflate_ruby.h
@@ -0,0 +1,6 @@
+ #ifndef DEFLATE_RUBY_H
+ #define DEFLATE_RUBY_H 1
+
+ #include "ruby.h"
+
+ #endif /* DEFLATE_RUBY_H */
data/ext/deflate_ruby/extconf.rb
@@ -1,34 +1,44 @@
  # frozen_string_literal: true

  require "mkmf"
+ require "rbconfig"

- # Add libdeflate source directory to the include path
+ # Makes all symbols private by default to avoid unintended conflict
+ # with other gems. To explicitly export symbols you can use RUBY_FUNC_EXPORTED
+ # selectively, or entirely remove this flag.
+ append_cflags("-fvisibility=hidden")
+
+ # Add libdeflate source directory and subdirectories to include path
  $INCFLAGS << " -I$(srcdir)/libdeflate"
- $CFLAGS << " -O2 -std=c99"
-
- # Define source files to compile
- libdeflate_sources = %w[
-   libdeflate/lib/deflate_compress.c
-   libdeflate/lib/deflate_decompress.c
-   libdeflate/lib/zlib_compress.c
-   libdeflate/lib/zlib_decompress.c
-   libdeflate/lib/gzip_compress.c
-   libdeflate/lib/gzip_decompress.c
-   libdeflate/lib/adler32.c
-   libdeflate/lib/crc32.c
-   libdeflate/lib/utils.c
- ]
-
- # Add CPU architecture-specific files
- arch_dirs = Dir.glob("libdeflate/lib/*/").select { |d| File.directory?(d) }
- arch_dirs.each do |dir|
-   Dir.glob("#{dir}*.c").each do |source|
-     libdeflate_sources << source
-   end
+ $INCFLAGS << " -I$(srcdir)/libdeflate/arm"
+ $INCFLAGS << " -I$(srcdir)/libdeflate/x86"
+ $INCFLAGS << " -I$(srcdir)/libdeflate/riscv"
+
+ # Detect CPU architecture
+ arch = RbConfig::CONFIG['host_cpu']
+
+ # Get base libdeflate C files (not in subdirectories)
+ libdeflate_sources = Dir.glob("#{__dir__}/libdeflate/*.c")
+
+ # Add architecture-specific files
+ if arch =~ /arm|aarch64/
+   libdeflate_sources += Dir.glob("#{__dir__}/libdeflate/arm/*.c")
+ elsif arch =~ /x86_64|i686|i386/
+   libdeflate_sources += Dir.glob("#{__dir__}/libdeflate/x86/*.c")
+ elsif arch =~ /riscv/
+   libdeflate_sources += Dir.glob("#{__dir__}/libdeflate/riscv/*.c")
  end

- # Set object files for libdeflate
- $objs = libdeflate_sources.map { |src| src.sub(/\.c$/, ".o") }
- $objs << "deflate_ruby.o"
+ # Build source file list for mkmf
+ $srcs = ["deflate_ruby.c"] + libdeflate_sources.map { |f| File.basename(f) }
+
+ # Optimization flags for better performance
+ append_cflags("-O3")
+
+ # Platform-specific optimizations
+ if arch =~ /x86_64|i686|i386/
+   # Enable SSE2 on x86 (generally available on x86_64)
+   have_func("__builtin_cpu_supports")
+ end

  create_makefile("deflate_ruby/deflate_ruby")
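
A note on the new -fvisibility=hidden flag above: every symbol in the compiled extension now defaults to private, so the one thing the dynamic loader must still see is the entry point implied by create_makefile("deflate_ruby/deflate_ruby"), namely Init_deflate_ruby. A minimal sketch of that contract using only what ruby.h provides (the module layout and the noop method are invented for illustration, not the gem's actual API):

#include "ruby.h"

/* static => internal linkage; invisible outside the .so regardless of flags */
static VALUE
noop(VALUE self)
{
        return Qnil;
}

/* RUBY_FUNC_EXPORTED overrides -fvisibility=hidden so that dlopen() can
 * find the function Ruby calls when the extension is required. */
RUBY_FUNC_EXPORTED void
Init_deflate_ruby(void)
{
        VALUE mod = rb_define_module("DeflateRuby");
        rb_define_singleton_method(mod, "noop", noop, 0);
}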
data/ext/deflate_ruby/libdeflate/adler32.c
@@ -0,0 +1,162 @@
+ /*
+  * adler32.c - Adler-32 checksum algorithm
+  *
+  * Copyright 2016 Eric Biggers
+  *
+  * Permission is hereby granted, free of charge, to any person
+  * obtaining a copy of this software and associated documentation
+  * files (the "Software"), to deal in the Software without
+  * restriction, including without limitation the rights to use,
+  * copy, modify, merge, publish, distribute, sublicense, and/or sell
+  * copies of the Software, and to permit persons to whom the
+  * Software is furnished to do so, subject to the following
+  * conditions:
+  *
+  * The above copyright notice and this permission notice shall be
+  * included in all copies or substantial portions of the Software.
+  *
+  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+  * OTHER DEALINGS IN THE SOFTWARE.
+  */
+
+ #include "lib_common.h"
+
+ /* The Adler-32 divisor, or "base", value */
+ #define DIVISOR 65521
+
+ /*
+  * MAX_CHUNK_LEN is the most bytes that can be processed without the possibility
+  * of s2 overflowing when it is represented as an unsigned 32-bit integer. This
+  * value was computed using the following Python script:
+  *
+  *      divisor = 65521
+  *      count = 0
+  *      s1 = divisor - 1
+  *      s2 = divisor - 1
+  *      while True:
+  *              s1 += 0xFF
+  *              s2 += s1
+  *              if s2 > 0xFFFFFFFF:
+  *                      break
+  *              count += 1
+  *      print(count)
+  *
+  * Note that to get the correct worst-case value, we must assume that every byte
+  * has value 0xFF and that s1 and s2 started with the highest possible values
+  * modulo the divisor.
+  */
+ #define MAX_CHUNK_LEN 5552
+
+ /*
+  * Update the Adler-32 values s1 and s2 using n bytes from p, update p to p + n,
+  * update n to 0, and reduce s1 and s2 mod DIVISOR. It is assumed that neither
+  * s1 nor s2 can overflow before the reduction at the end, i.e. n plus any bytes
+  * already processed after the last reduction must not exceed MAX_CHUNK_LEN.
+  *
+  * This uses only portable C code. This is used as a fallback when a vectorized
+  * implementation of Adler-32 (e.g. AVX2) is unavailable on the platform.
+  *
+  * Some of the vectorized implementations also use this to handle the end of the
+  * data when the data isn't evenly divisible by the length the vectorized code
+  * works on. To avoid compiler errors about target-specific option mismatches
+  * when this is used in that way, this is a macro rather than a function.
+  *
+  * Although this is unvectorized, this does include an optimization where the
+  * main loop processes four bytes at a time using a strategy similar to that
+  * used by vectorized implementations. This provides increased instruction-
+  * level parallelism compared to the traditional 's1 += *p++; s2 += s1;'.
+  */
+ #define ADLER32_CHUNK(s1, s2, p, n)                                     \
+ do {                                                                    \
+         if (n >= 4) {                                                   \
+                 u32 s1_sum = 0;                                         \
+                 u32 byte_0_sum = 0;                                     \
+                 u32 byte_1_sum = 0;                                     \
+                 u32 byte_2_sum = 0;                                     \
+                 u32 byte_3_sum = 0;                                     \
+                                                                         \
+                 do {                                                    \
+                         s1_sum += s1;                                   \
+                         s1 += p[0] + p[1] + p[2] + p[3];                \
+                         byte_0_sum += p[0];                             \
+                         byte_1_sum += p[1];                             \
+                         byte_2_sum += p[2];                             \
+                         byte_3_sum += p[3];                             \
+                         p += 4;                                         \
+                         n -= 4;                                         \
+                 } while (n >= 4);                                       \
+                 s2 += (4 * (s1_sum + byte_0_sum)) + (3 * byte_1_sum) +  \
+                       (2 * byte_2_sum) + byte_3_sum;                    \
+         }                                                               \
+         for (; n; n--, p++) {                                           \
+                 s1 += *p;                                               \
+                 s2 += s1;                                               \
+         }                                                               \
+         s1 %= DIVISOR;                                                  \
+         s2 %= DIVISOR;                                                  \
+ } while (0)
+
+ static u32 MAYBE_UNUSED
+ adler32_generic(u32 adler, const u8 *p, size_t len)
+ {
+         u32 s1 = adler & 0xFFFF;
+         u32 s2 = adler >> 16;
+
+         while (len) {
+                 size_t n = MIN(len, MAX_CHUNK_LEN & ~3);
+
+                 len -= n;
+                 ADLER32_CHUNK(s1, s2, p, n);
+         }
+
+         return (s2 << 16) | s1;
+ }
+
+ /* Include architecture-specific implementation(s) if available. */
+ #undef DEFAULT_IMPL
+ #undef arch_select_adler32_func
+ typedef u32 (*adler32_func_t)(u32 adler, const u8 *p, size_t len);
+ #if defined(ARCH_ARM32) || defined(ARCH_ARM64)
+ #  include "arm/adler32_impl.h"
+ #elif defined(ARCH_X86_32) || defined(ARCH_X86_64)
+ #  include "x86/adler32_impl.h"
+ #endif
+
+ #ifndef DEFAULT_IMPL
+ #  define DEFAULT_IMPL adler32_generic
+ #endif
+
+ #ifdef arch_select_adler32_func
+ static u32 dispatch_adler32(u32 adler, const u8 *p, size_t len);
+
+ static volatile adler32_func_t adler32_impl = dispatch_adler32;
+
+ /* Choose the best implementation at runtime. */
+ static u32 dispatch_adler32(u32 adler, const u8 *p, size_t len)
+ {
+         adler32_func_t f = arch_select_adler32_func();
+
+         if (f == NULL)
+                 f = DEFAULT_IMPL;
+
+         adler32_impl = f;
+         return f(adler, p, len);
+ }
+ #else
+ /* The best implementation is statically known, so call it directly. */
+ #define adler32_impl DEFAULT_IMPL
+ #endif
+
+ LIBDEFLATEAPI u32
+ libdeflate_adler32(u32 adler, const void *buffer, size_t len)
+ {
+         if (buffer == NULL) /* Return initial value. */
+                 return 1;
+         return adler32_impl(adler, buffer, len);
+ }
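
Two notes on the file above. First, the deferred update at the end of ADLER32_CHUNK is just the serial recurrence regrouped: processing one group b0..b3 a byte at a time would add (s1+b0) + (s1+b0+b1) + (s1+b0+b1+b2) + (s1+b0+b1+b2+b3) = 4*s1 + 4*b0 + 3*b1 + 2*b2 + b3 to s2, so summing over all groups and collecting the per-group s1 values into s1_sum yields exactly s2 += 4*(s1_sum + byte_0_sum) + 3*byte_1_sum + 2*byte_2_sum + byte_3_sum.

Second, the tail of the file installs a one-shot runtime dispatcher: adler32_impl initially points at dispatch_adler32, which probes the CPU once via arch_select_adler32_func(), caches the winner, and forwards the first call. A self-contained model of that pattern (every name and the stand-in loop below are invented for illustration, not libdeflate code):

#include <stddef.h>
#include <stdint.h>

typedef uint32_t (*checksum_fn)(uint32_t state, const uint8_t *p, size_t len);

/* Portable fallback; stands in for adler32_generic. */
static uint32_t checksum_portable(uint32_t state, const uint8_t *p, size_t len)
{
        while (len--)
                state += *p++;
        return state;
}

static uint32_t checksum_dispatch(uint32_t state, const uint8_t *p, size_t len);

/* The first call lands in the resolver; later calls hit the cached choice.
 * volatile because the pointer may be rewritten concurrently; every value
 * it can hold is a valid function, so any interleaving is safe. */
static volatile checksum_fn checksum_impl = checksum_dispatch;

static uint32_t checksum_dispatch(uint32_t state, const uint8_t *p, size_t len)
{
        checksum_fn f = checksum_portable;  /* a real CPU probe would go here */

        checksum_impl = f;                  /* cache the choice */
        return f(state, p, len);
}

uint32_t checksum(uint32_t state, const uint8_t *p, size_t len)
{
        return checksum_impl(state, p, len);
}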
data/ext/deflate_ruby/libdeflate/arm/adler32_impl.h
@@ -209,18 +209,25 @@ adler32_arm_neon(u32 adler, const u8 *p, size_t len)
  #endif /* Regular NEON implementation */

  /* NEON+dotprod implementation */
- #if HAVE_DOTPROD_INTRIN && CPU_IS_LITTLE_ENDIAN()
+ #if HAVE_DOTPROD_INTRIN && CPU_IS_LITTLE_ENDIAN() && \
+     !defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_DOTPROD)
  # define adler32_arm_neon_dotprod adler32_arm_neon_dotprod
  # ifdef __clang__
  #  define ATTRIBUTES _target_attribute("dotprod")
  /*
-  * With gcc 13.1 and earlier (before gcc commit 73d3bc348190 or 9aac37ab8a7b,
-  * "aarch64: Remove architecture dependencies from intrinsics"),
-  * arch=armv8.2-a is needed for the dotprod intrinsics, unless the default
-  * target is armv8.3-a or later in which case it must be omitted. armv8.3-a
-  * or later can be detected by checking for __ARM_FEATURE_JCVT.
+  * Both gcc and binutils originally considered dotprod to depend on
+  * arch=armv8.2-a or later. This was fixed in gcc 13.2 by commit
+  * 9aac37ab8a7b ("aarch64: Remove architecture dependencies from intrinsics")
+  * and in binutils 2.41 by commit 205e4380c800 ("aarch64: Remove version
+  * dependencies from features"). Unfortunately, always using arch=armv8.2-a
+  * causes build errors with some compiler options because it may reduce the
+  * arch rather than increase it. Therefore we try to omit the arch whenever
+  * possible. If gcc is 14 or later, then both gcc and binutils are probably
+  * fixed, so we omit the arch. We also omit the arch if a feature that
+  * depends on armv8.2-a or later (in gcc 13.1 and earlier) is present.
   */
- # elif GCC_PREREQ(13, 2) || defined(__ARM_FEATURE_JCVT)
+ # elif GCC_PREREQ(14, 0) || defined(__ARM_FEATURE_JCVT) \
+       || defined(__ARM_FEATURE_DOTPROD)
  #  define ATTRIBUTES _target_attribute("+dotprod")
  # else
  #  define ATTRIBUTES _target_attribute("arch=armv8.2-a+dotprod")
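
The payoff of the ATTRIBUTES selection above: upstream libdeflate's NEON+dotprod Adler-32 path accumulates its byte sums with the udot instruction, reached through the vdotq_u32 intrinsic. A hedged sketch of both halves, simplified from the diff (the real code also checks GCC_PREREQ(14, 0) and uses libdeflate's _target_attribute wrapper; dot_bytes is an invented example function):

#include <arm_neon.h>
#include <stdint.h>

#if defined(__clang__)
#  define ATTRS __attribute__((target("dotprod")))
#elif defined(__ARM_FEATURE_JCVT) || defined(__ARM_FEATURE_DOTPROD)
#  define ATTRS __attribute__((target("+dotprod")))  /* arch omitted */
#else
#  define ATTRS __attribute__((target("arch=armv8.2-a+dotprod")))
#endif

/* udot: for each 32-bit lane, acc += a0*b0 + a1*b1 + a2*b2 + a3*b3 over
 * that lane's four byte pairs. */
static ATTRS uint32x4_t
dot_bytes(uint32x4_t acc, uint8x16_t a, uint8x16_t b)
{
        return vdotq_u32(acc, a, b);
}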
data/ext/deflate_ruby/libdeflate/arm/crc32_impl.h
@@ -434,13 +434,11 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len)
  		{ CRC32_X543_MODG, CRC32_X479_MODG }, /* 4 vecs */
  		{ CRC32_X287_MODG, CRC32_X223_MODG }, /* 2 vecs */
  	};
- 	static const u64 _aligned_attribute(16) final_mults[3][2] = {
- 		{ CRC32_X63_MODG, 0 },
- 		{ CRC32_BARRETT_CONSTANT_1, 0 },
- 		{ CRC32_BARRETT_CONSTANT_2, 0 },
+ 	static const u64 _aligned_attribute(16) barrett_consts[3][2] = {
+ 		{ CRC32_X95_MODG, },
+ 		{ CRC32_BARRETT_CONSTANT_1, },
+ 		{ CRC32_BARRETT_CONSTANT_2, },
  	};
- 	const uint8x16_t zeroes = vdupq_n_u8(0);
- 	const uint8x16_t mask32 = vreinterpretq_u8_u64(vdupq_n_u64(0xFFFFFFFF));
  	const poly64x2_t multipliers_1 = load_multipliers(mults[0]);
  	uint8x16_t v0, v1, v2, v3;

@@ -497,24 +495,13 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len)
  	if (len)
  		v0 = fold_partial_vec(v0, p, len, multipliers_1);

- 	/*
- 	 * Fold 128 => 96 bits. This also implicitly appends 32 zero bits,
- 	 * which is equivalent to multiplying by x^32. This is needed because
- 	 * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
- 	 */
-
- 	v0 = veorq_u8(vextq_u8(v0, zeroes, 8),
- 		      clmul_high(vextq_u8(zeroes, v0, 8), multipliers_1));
-
- 	/* Fold 96 => 64 bits. */
- 	v0 = veorq_u8(vextq_u8(v0, zeroes, 4),
- 		      clmul_low(vandq_u8(v0, mask32),
- 				load_multipliers(final_mults[0])));
-
- 	/* Reduce 64 => 32 bits using Barrett reduction. */
- 	v1 = clmul_low(vandq_u8(v0, mask32), load_multipliers(final_mults[1]));
- 	v1 = clmul_low(vandq_u8(v1, mask32), load_multipliers(final_mults[2]));
- 	return vgetq_lane_u32(vreinterpretq_u32_u8(veorq_u8(v0, v1)), 1);
+ 	/* Reduce to 32 bits, following lib/x86/crc32_pclmul_template.h */
+ 	v0 = veorq_u8(clmul_low(v0, load_multipliers(barrett_consts[0])),
+ 		      vextq_u8(v0, vdupq_n_u8(0), 8));
+ 	v1 = clmul_low(v0, load_multipliers(barrett_consts[1]));
+ 	v1 = clmul_low(v1, load_multipliers(barrett_consts[2]));
+ 	v0 = veorq_u8(v0, v1);
+ 	return vgetq_lane_u32(vreinterpretq_u32_u8(v0), 2);
  }
  #undef SUFFIX
  #undef ATTRIBUTES
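
The comment removed above was the only place this function spelled out the definition CRC(M) = M(x) * x^32 mod G(x). For reference, the Barrett step that both the old and the new code end with reduces a folded 64-bit polynomial v(x) to the 32-bit CRC without a polynomial division. In the standard formulation (bit-reflection details elided), with mu(x) = floor(x^64 / G(x)) precomputed:

	q(x) = floor( floor(v(x) / x^32) * mu(x) / x^32 )
	r(x) = ( v(x) + q(x) * G(x) ) mod x^32

where addition is carryless (XOR). Reading the constant names, CRC32_BARRETT_CONSTANT_1 should correspond to mu(x) and CRC32_BARRETT_CONSTANT_2 to G(x); that mapping is inferred from the names, not stated in the diff.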
@@ -545,19 +532,26 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len)
   * This is like crc32_arm_pmullx12_crc(), but it adds the eor3 instruction (from
   * the sha3 extension) for even better performance.
   */
- #if HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN && HAVE_SHA3_INTRIN
+ #if HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN && HAVE_SHA3_INTRIN && \
+     !defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_SHA3)
  # define crc32_arm_pmullx12_crc_eor3 crc32_arm_pmullx12_crc_eor3
  # define SUFFIX _pmullx12_crc_eor3
  # ifdef __clang__
  #  define ATTRIBUTES _target_attribute("aes,crc,sha3")
  /*
-  * With gcc 13.1 and earlier (before gcc commit 73d3bc348190 or 9aac37ab8a7b,
-  * "aarch64: Remove architecture dependencies from intrinsics"),
-  * arch=armv8.2-a is needed for the sha3 intrinsics, unless the default
-  * target is armv8.3-a or later in which case it must be omitted. armv8.3-a
-  * or later can be detected by checking for __ARM_FEATURE_JCVT.
+  * Both gcc and binutils originally considered sha3 to depend on
+  * arch=armv8.2-a or later. This was fixed in gcc 13.2 by commit
+  * 9aac37ab8a7b ("aarch64: Remove architecture dependencies from intrinsics")
+  * and in binutils 2.41 by commit 205e4380c800 ("aarch64: Remove version
+  * dependencies from features"). Unfortunately, always using arch=armv8.2-a
+  * causes build errors with some compiler options because it may reduce the
+  * arch rather than increase it. Therefore we try to omit the arch whenever
+  * possible. If gcc is 14 or later, then both gcc and binutils are probably
+  * fixed, so we omit the arch. We also omit the arch if a feature that
+  * depends on armv8.2-a or later (in gcc 13.1 and earlier) is present.
   */
- # elif GCC_PREREQ(13, 2) || defined(__ARM_FEATURE_JCVT)
+ # elif GCC_PREREQ(14, 0) || defined(__ARM_FEATURE_JCVT) \
+       || defined(__ARM_FEATURE_DOTPROD)
  #  define ATTRIBUTES _target_attribute("+crypto,+crc,+sha3")
  # else
  #  define ATTRIBUTES _target_attribute("arch=armv8.2-a+crypto+crc+sha3")
data/ext/deflate_ruby/libdeflate/arm/crc32_pmull_helpers.h
@@ -0,0 +1,156 @@
+ /*
+  * arm/crc32_pmull_helpers.h - helper functions for CRC-32 folding with PMULL
+  *
+  * Copyright 2022 Eric Biggers
+  *
+  * Permission is hereby granted, free of charge, to any person
+  * obtaining a copy of this software and associated documentation
+  * files (the "Software"), to deal in the Software without
+  * restriction, including without limitation the rights to use,
+  * copy, modify, merge, publish, distribute, sublicense, and/or sell
+  * copies of the Software, and to permit persons to whom the
+  * Software is furnished to do so, subject to the following
+  * conditions:
+  *
+  * The above copyright notice and this permission notice shall be
+  * included in all copies or substantial portions of the Software.
+  *
+  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+  * OTHER DEALINGS IN THE SOFTWARE.
+  */
+
+ /*
+  * This file is a "template" for instantiating helper functions for CRC folding
+  * with pmull instructions. It accepts the following parameters:
+  *
+  * SUFFIX:
+  *      Name suffix to append to all instantiated functions.
+  * ATTRIBUTES:
+  *      Target function attributes to use.
+  * ENABLE_EOR3:
+  *      Use the eor3 instruction (from the sha3 extension).
+  */
+
+ /* Create a vector with 'a' in the first 4 bytes, and the rest zeroed out. */
+ #undef u32_to_bytevec
+ static forceinline ATTRIBUTES uint8x16_t
+ ADD_SUFFIX(u32_to_bytevec)(u32 a)
+ {
+         return vreinterpretq_u8_u32(vsetq_lane_u32(a, vdupq_n_u32(0), 0));
+ }
+ #define u32_to_bytevec ADD_SUFFIX(u32_to_bytevec)
+
+ /* Load two 64-bit values into a vector. */
+ #undef load_multipliers
+ static forceinline ATTRIBUTES poly64x2_t
+ ADD_SUFFIX(load_multipliers)(const u64 p[2])
+ {
+         return vreinterpretq_p64_u64(vld1q_u64(p));
+ }
+ #define load_multipliers ADD_SUFFIX(load_multipliers)
+
+ /* Do carryless multiplication of the low halves of two vectors. */
+ #undef clmul_low
+ static forceinline ATTRIBUTES uint8x16_t
+ ADD_SUFFIX(clmul_low)(uint8x16_t a, poly64x2_t b)
+ {
+         return vreinterpretq_u8_p128(
+                 compat_vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u8(a), 0),
+                                  vgetq_lane_p64(b, 0)));
+ }
+ #define clmul_low ADD_SUFFIX(clmul_low)
+
+ /* Do carryless multiplication of the high halves of two vectors. */
+ #undef clmul_high
+ static forceinline ATTRIBUTES uint8x16_t
+ ADD_SUFFIX(clmul_high)(uint8x16_t a, poly64x2_t b)
+ {
+ #ifdef __clang__
+         /*
+          * Use inline asm to ensure that pmull2 is really used. This works
+          * around clang bug https://github.com/llvm/llvm-project/issues/52868.
+          */
+         uint8x16_t res;
+
+         __asm__("pmull2 %0.1q, %1.2d, %2.2d" : "=w" (res) : "w" (a), "w" (b));
+         return res;
+ #else
+         return vreinterpretq_u8_p128(vmull_high_p64(vreinterpretq_p64_u8(a), b));
+ #endif
+ }
+ #define clmul_high ADD_SUFFIX(clmul_high)
+
+ #undef eor3
+ static forceinline ATTRIBUTES uint8x16_t
+ ADD_SUFFIX(eor3)(uint8x16_t a, uint8x16_t b, uint8x16_t c)
+ {
+ #if ENABLE_EOR3
+         return veor3q_u8(a, b, c);
+ #else
+         return veorq_u8(veorq_u8(a, b), c);
+ #endif
+ }
+ #define eor3 ADD_SUFFIX(eor3)
+
+ #undef fold_vec
+ static forceinline ATTRIBUTES uint8x16_t
+ ADD_SUFFIX(fold_vec)(uint8x16_t src, uint8x16_t dst, poly64x2_t multipliers)
+ {
+         uint8x16_t a = clmul_low(src, multipliers);
+         uint8x16_t b = clmul_high(src, multipliers);
+
+         return eor3(a, b, dst);
+ }
+ #define fold_vec ADD_SUFFIX(fold_vec)
+
+ /*
+  * Given v containing a 16-byte polynomial, and a pointer 'p' that points to the
+  * next '1 <= len <= 15' data bytes, rearrange the concatenation of v and the
+  * data into vectors x0 and x1 that contain 'len' bytes and 16 bytes,
+  * respectively. Then fold x0 into x1 and return the result. Assumes that
+  * 'p + len - 16' is in-bounds.
+  */
+ #undef fold_partial_vec
+ static forceinline ATTRIBUTES MAYBE_UNUSED uint8x16_t
+ ADD_SUFFIX(fold_partial_vec)(uint8x16_t v, const u8 *p, size_t len,
+                              poly64x2_t multipliers_1)
+ {
+         /*
+          * vqtbl1q_u8(v, shift_tab[len..len+15]) left shifts v by 16-len bytes.
+          * vqtbl1q_u8(v, shift_tab[len+16..len+31]) right shifts v by len bytes.
+          */
+         static const u8 shift_tab[48] = {
+                 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+                 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+                 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+                 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+                 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+                 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+         };
+         const uint8x16_t lshift = vld1q_u8(&shift_tab[len]);
+         const uint8x16_t rshift = vld1q_u8(&shift_tab[len + 16]);
+         uint8x16_t x0, x1, bsl_mask;
+
+         /* x0 = v left-shifted by '16 - len' bytes */
+         x0 = vqtbl1q_u8(v, lshift);
+
+         /* Create a vector of '16 - len' 0x00 bytes, then 'len' 0xff bytes. */
+         bsl_mask = vreinterpretq_u8_s8(
+                 vshrq_n_s8(vreinterpretq_s8_u8(rshift), 7));
+
+         /*
+          * x1 = the last '16 - len' bytes from v (i.e. v right-shifted by 'len'
+          * bytes) followed by the remaining data.
+          */
+         x1 = vbslq_u8(bsl_mask /* 0 bits select from arg3, 1 bits from arg2 */,
+                       vld1q_u8(p + len - 16), vqtbl1q_u8(v, rshift));
+
+         return fold_vec(x0, x1, multipliers_1);
+ }
+ #define fold_partial_vec ADD_SUFFIX(fold_partial_vec)
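
A scalar model of the shift_tab trick in fold_partial_vec above may help: the AArch64 TBL instruction (vqtbl1q_u8) returns 0 for any out-of-range index, so the 0xff entries clear bytes, and sliding a 16-byte window across the 0xff / 0x00..0x0f / 0xff table turns one table lookup into a variable byte shift. The function below is illustration only, not part of the gem:

#include <stdint.h>

/* Mirrors vqtbl1q_u8: out[i] = v[idx[i]] if idx[i] < 16, else 0. */
static void
tbl16(const uint8_t v[16], const uint8_t idx[16], uint8_t out[16])
{
        for (int i = 0; i < 16; i++)
                out[i] = (idx[i] < 16) ? v[idx[i]] : 0;
}

With idx = &shift_tab[len] this yields v left-shifted by 16 - len bytes; with idx = &shift_tab[len + 16] it yields v right-shifted by len bytes, matching the comment inside fold_partial_vec.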