deflate-ruby 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138)
  1. checksums.yaml +4 -4
  2. data/CLAUDE.md +95 -92
  3. data/LICENSE.txt +6 -6
  4. data/README.md +87 -65
  5. data/Rakefile +23 -0
  6. data/ext/deflate_ruby/{libdeflate/lib/x86/adler32_impl.h → adler32_impl.h} +8 -7
  7. data/ext/deflate_ruby/common_defs.h +748 -0
  8. data/ext/deflate_ruby/{libdeflate/lib/x86/cpu_features.c → cpu_features.c} +46 -16
  9. data/ext/deflate_ruby/{libdeflate/lib/x86/cpu_features.h → cpu_features.h} +2 -1
  10. data/ext/deflate_ruby/{libdeflate/lib/x86/crc32_impl.h → crc32_impl.h} +22 -23
  11. data/ext/deflate_ruby/{libdeflate/lib/crc32_multipliers.h → crc32_multipliers.h} +2 -4
  12. data/ext/deflate_ruby/{libdeflate/lib/x86/crc32_pclmul_template.h → crc32_pclmul_template.h} +23 -94
  13. data/ext/deflate_ruby/{libdeflate/lib/crc32_tables.h → crc32_tables.h} +1 -1
  14. data/ext/deflate_ruby/{libdeflate/lib/deflate_compress.c → deflate_compress.c} +59 -60
  15. data/ext/deflate_ruby/deflate_ruby.c +392 -218
  16. data/ext/deflate_ruby/deflate_ruby.h +6 -0
  17. data/ext/deflate_ruby/extconf.rb +35 -25
  18. data/ext/deflate_ruby/libdeflate/adler32.c +162 -0
  19. data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/adler32_impl.h +14 -7
  20. data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/crc32_impl.h +25 -31
  21. data/ext/deflate_ruby/libdeflate/arm/crc32_pmull_helpers.h +156 -0
  22. data/ext/deflate_ruby/libdeflate/arm/crc32_pmull_wide.h +226 -0
  23. data/ext/deflate_ruby/libdeflate/bt_matchfinder.h +342 -0
  24. data/ext/deflate_ruby/libdeflate/common_defs.h +2 -1
  25. data/ext/deflate_ruby/libdeflate/cpu_features_common.h +93 -0
  26. data/ext/deflate_ruby/libdeflate/crc32.c +262 -0
  27. data/ext/deflate_ruby/libdeflate/crc32_multipliers.h +375 -0
  28. data/ext/deflate_ruby/libdeflate/crc32_tables.h +587 -0
  29. data/ext/deflate_ruby/libdeflate/decompress_template.h +777 -0
  30. data/ext/deflate_ruby/libdeflate/deflate_compress.c +4128 -0
  31. data/ext/deflate_ruby/libdeflate/deflate_compress.h +15 -0
  32. data/ext/deflate_ruby/libdeflate/deflate_constants.h +56 -0
  33. data/ext/deflate_ruby/libdeflate/deflate_decompress.c +1208 -0
  34. data/ext/deflate_ruby/libdeflate/gzip_compress.c +90 -0
  35. data/ext/deflate_ruby/libdeflate/gzip_constants.h +45 -0
  36. data/ext/deflate_ruby/libdeflate/gzip_decompress.c +144 -0
  37. data/ext/deflate_ruby/libdeflate/hc_matchfinder.h +401 -0
  38. data/ext/deflate_ruby/libdeflate/ht_matchfinder.h +234 -0
  39. data/ext/deflate_ruby/libdeflate/lib_common.h +106 -0
  40. data/ext/deflate_ruby/libdeflate/libdeflate.h +2 -2
  41. data/ext/deflate_ruby/libdeflate/{lib/matchfinder_common.h → matchfinder_common.h} +3 -3
  42. data/ext/deflate_ruby/libdeflate/x86/adler32_impl.h +135 -0
  43. data/ext/deflate_ruby/libdeflate/x86/adler32_template.h +518 -0
  44. data/ext/deflate_ruby/libdeflate/x86/cpu_features.c +213 -0
  45. data/ext/deflate_ruby/libdeflate/x86/cpu_features.h +170 -0
  46. data/ext/deflate_ruby/libdeflate/x86/crc32_impl.h +159 -0
  47. data/ext/deflate_ruby/libdeflate/x86/crc32_pclmul_template.h +424 -0
  48. data/ext/deflate_ruby/libdeflate/x86/decompress_impl.h +57 -0
  49. data/ext/deflate_ruby/libdeflate.h +411 -0
  50. data/ext/deflate_ruby/matchfinder_common.h +224 -0
  51. data/ext/deflate_ruby/matchfinder_impl.h +122 -0
  52. data/ext/deflate_ruby/utils.c +141 -0
  53. data/ext/deflate_ruby/zlib_compress.c +82 -0
  54. data/ext/deflate_ruby/zlib_constants.h +21 -0
  55. data/ext/deflate_ruby/zlib_decompress.c +104 -0
  56. data/lib/deflate_ruby/version.rb +1 -1
  57. data/lib/deflate_ruby.rb +1 -63
  58. data/sig/deflate_ruby.rbs +4 -0
  59. data/test/test_deflate_ruby.rb +220 -0
  60. data/test/test_helper.rb +6 -0
  61. metadata +89 -144
  62. data/ext/deflate_ruby/libdeflate/CMakeLists.txt +0 -270
  63. data/ext/deflate_ruby/libdeflate/NEWS.md +0 -494
  64. data/ext/deflate_ruby/libdeflate/README.md +0 -228
  65. data/ext/deflate_ruby/libdeflate/libdeflate-config.cmake.in +0 -3
  66. data/ext/deflate_ruby/libdeflate/libdeflate.pc.in +0 -18
  67. data/ext/deflate_ruby/libdeflate/programs/CMakeLists.txt +0 -105
  68. data/ext/deflate_ruby/libdeflate/programs/benchmark.c +0 -696
  69. data/ext/deflate_ruby/libdeflate/programs/checksum.c +0 -218
  70. data/ext/deflate_ruby/libdeflate/programs/config.h.in +0 -19
  71. data/ext/deflate_ruby/libdeflate/programs/gzip.c +0 -688
  72. data/ext/deflate_ruby/libdeflate/programs/prog_util.c +0 -521
  73. data/ext/deflate_ruby/libdeflate/programs/prog_util.h +0 -225
  74. data/ext/deflate_ruby/libdeflate/programs/test_checksums.c +0 -200
  75. data/ext/deflate_ruby/libdeflate/programs/test_custom_malloc.c +0 -155
  76. data/ext/deflate_ruby/libdeflate/programs/test_incomplete_codes.c +0 -385
  77. data/ext/deflate_ruby/libdeflate/programs/test_invalid_streams.c +0 -130
  78. data/ext/deflate_ruby/libdeflate/programs/test_litrunlen_overflow.c +0 -72
  79. data/ext/deflate_ruby/libdeflate/programs/test_overread.c +0 -95
  80. data/ext/deflate_ruby/libdeflate/programs/test_slow_decompression.c +0 -472
  81. data/ext/deflate_ruby/libdeflate/programs/test_trailing_bytes.c +0 -151
  82. data/ext/deflate_ruby/libdeflate/programs/test_util.c +0 -237
  83. data/ext/deflate_ruby/libdeflate/programs/test_util.h +0 -61
  84. data/ext/deflate_ruby/libdeflate/programs/tgetopt.c +0 -118
  85. data/ext/deflate_ruby/libdeflate/scripts/android_build.sh +0 -118
  86. data/ext/deflate_ruby/libdeflate/scripts/android_tests.sh +0 -69
  87. data/ext/deflate_ruby/libdeflate/scripts/benchmark.sh +0 -10
  88. data/ext/deflate_ruby/libdeflate/scripts/checksum.sh +0 -10
  89. data/ext/deflate_ruby/libdeflate/scripts/checksum_benchmarks.sh +0 -253
  90. data/ext/deflate_ruby/libdeflate/scripts/cmake-helper.sh +0 -17
  91. data/ext/deflate_ruby/libdeflate/scripts/deflate_benchmarks.sh +0 -119
  92. data/ext/deflate_ruby/libdeflate/scripts/exec_tests.sh +0 -38
  93. data/ext/deflate_ruby/libdeflate/scripts/gen-release-archives.sh +0 -37
  94. data/ext/deflate_ruby/libdeflate/scripts/gen_bitreverse_tab.py +0 -19
  95. data/ext/deflate_ruby/libdeflate/scripts/gen_crc32_multipliers.c +0 -199
  96. data/ext/deflate_ruby/libdeflate/scripts/gen_crc32_tables.c +0 -105
  97. data/ext/deflate_ruby/libdeflate/scripts/gen_default_litlen_costs.py +0 -44
  98. data/ext/deflate_ruby/libdeflate/scripts/gen_offset_slot_map.py +0 -29
  99. data/ext/deflate_ruby/libdeflate/scripts/gzip_tests.sh +0 -523
  100. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_compress/corpus/0 +0 -0
  101. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_compress/fuzz.c +0 -95
  102. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_decompress/corpus/0 +0 -3
  103. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_decompress/fuzz.c +0 -62
  104. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/fuzz.sh +0 -108
  105. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/gzip_decompress/corpus/0 +0 -0
  106. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/gzip_decompress/fuzz.c +0 -19
  107. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/zlib_decompress/corpus/0 +0 -3
  108. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/zlib_decompress/fuzz.c +0 -19
  109. data/ext/deflate_ruby/libdeflate/scripts/run_tests.sh +0 -416
  110. data/ext/deflate_ruby/libdeflate/scripts/toolchain-i686-w64-mingw32.cmake +0 -8
  111. data/ext/deflate_ruby/libdeflate/scripts/toolchain-x86_64-w64-mingw32.cmake +0 -8
  112. /data/ext/deflate_ruby/{libdeflate/lib/adler32.c → adler32.c} +0 -0
  113. /data/ext/deflate_ruby/{libdeflate/lib/x86/adler32_template.h → adler32_template.h} +0 -0
  114. /data/ext/deflate_ruby/{libdeflate/lib/bt_matchfinder.h → bt_matchfinder.h} +0 -0
  115. /data/ext/deflate_ruby/{libdeflate/lib/cpu_features_common.h → cpu_features_common.h} +0 -0
  116. /data/ext/deflate_ruby/{libdeflate/lib/crc32.c → crc32.c} +0 -0
  117. /data/ext/deflate_ruby/{libdeflate/lib/arm/crc32_pmull_helpers.h → crc32_pmull_helpers.h} +0 -0
  118. /data/ext/deflate_ruby/{libdeflate/lib/arm/crc32_pmull_wide.h → crc32_pmull_wide.h} +0 -0
  119. /data/ext/deflate_ruby/{libdeflate/lib/x86/decompress_impl.h → decompress_impl.h} +0 -0
  120. /data/ext/deflate_ruby/{libdeflate/lib/decompress_template.h → decompress_template.h} +0 -0
  121. /data/ext/deflate_ruby/{libdeflate/lib/deflate_compress.h → deflate_compress.h} +0 -0
  122. /data/ext/deflate_ruby/{libdeflate/lib/deflate_constants.h → deflate_constants.h} +0 -0
  123. /data/ext/deflate_ruby/{libdeflate/lib/deflate_decompress.c → deflate_decompress.c} +0 -0
  124. /data/ext/deflate_ruby/{libdeflate/lib/gzip_compress.c → gzip_compress.c} +0 -0
  125. /data/ext/deflate_ruby/{libdeflate/lib/gzip_constants.h → gzip_constants.h} +0 -0
  126. /data/ext/deflate_ruby/{libdeflate/lib/gzip_decompress.c → gzip_decompress.c} +0 -0
  127. /data/ext/deflate_ruby/{libdeflate/lib/hc_matchfinder.h → hc_matchfinder.h} +0 -0
  128. /data/ext/deflate_ruby/{libdeflate/lib/ht_matchfinder.h → ht_matchfinder.h} +0 -0
  129. /data/ext/deflate_ruby/{libdeflate/lib/lib_common.h → lib_common.h} +0 -0
  130. /data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/cpu_features.c +0 -0
  131. /data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/cpu_features.h +0 -0
  132. /data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/matchfinder_impl.h +0 -0
  133. /data/ext/deflate_ruby/libdeflate/{lib/riscv → riscv}/matchfinder_impl.h +0 -0
  134. /data/ext/deflate_ruby/libdeflate/{lib/utils.c → utils.c} +0 -0
  135. /data/ext/deflate_ruby/libdeflate/{lib/x86 → x86}/matchfinder_impl.h +0 -0
  136. /data/ext/deflate_ruby/libdeflate/{lib/zlib_compress.c → zlib_compress.c} +0 -0
  137. /data/ext/deflate_ruby/libdeflate/{lib/zlib_constants.h → zlib_constants.h} +0 -0
  138. /data/ext/deflate_ruby/libdeflate/{lib/zlib_decompress.c → zlib_decompress.c} +0 -0
data/ext/deflate_ruby/libdeflate/x86/adler32_template.h (new file)
@@ -0,0 +1,518 @@
+ /*
+ * x86/adler32_template.h - template for vectorized Adler-32 implementations
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+ /*
+ * This file is a "template" for instantiating Adler-32 functions for x86.
+ * The "parameters" are:
+ *
+ * SUFFIX:
+ * Name suffix to append to all instantiated functions.
+ * ATTRIBUTES:
+ * Target function attributes to use. Must satisfy the dependencies of the
+ * other parameters as follows:
+ * VL=16 && USE_VNNI=0 && USE_AVX512=0: at least sse2
+ * VL=32 && USE_VNNI=0 && USE_AVX512=0: at least avx2
+ * VL=32 && USE_VNNI=1 && USE_AVX512=0: at least avx2,avxvnni
+ * VL=32 && USE_VNNI=1 && USE_AVX512=1: at least avx512bw,avx512vl,avx512vnni
+ * VL=64 && USE_VNNI=1 && USE_AVX512=1: at least avx512bw,avx512vnni
+ * (Other combinations are not useful and have not been tested.)
+ * VL:
+ * Vector length in bytes. Must be 16, 32, or 64.
+ * USE_VNNI:
+ * If 1, use the VNNI dot product based algorithm.
+ * If 0, use the legacy SSE2 and AVX2 compatible algorithm.
+ * USE_AVX512:
+ * If 1, take advantage of AVX-512 features such as masking. This doesn't
+ * enable the use of 512-bit vectors; the vector length is controlled by
+ * VL. If 0, assume that the CPU might not support AVX-512.
+ */
+
+ #if VL == 16
+ # define vec_t __m128i
+ # define mask_t u16
+ # define LOG2_VL 4
+ # define VADD8(a, b) _mm_add_epi8((a), (b))
+ # define VADD16(a, b) _mm_add_epi16((a), (b))
+ # define VADD32(a, b) _mm_add_epi32((a), (b))
+ # if USE_AVX512
+ # define VDPBUSD(a, b, c) _mm_dpbusd_epi32((a), (b), (c))
+ # else
+ # define VDPBUSD(a, b, c) _mm_dpbusd_avx_epi32((a), (b), (c))
+ # endif
+ # define VLOAD(p) _mm_load_si128((const void *)(p))
+ # define VLOADU(p) _mm_loadu_si128((const void *)(p))
+ # define VMADD16(a, b) _mm_madd_epi16((a), (b))
+ # define VMASKZ_LOADU(mask, p) _mm_maskz_loadu_epi8((mask), (p))
+ # define VMULLO32(a, b) _mm_mullo_epi32((a), (b))
+ # define VSAD8(a, b) _mm_sad_epu8((a), (b))
+ # define VSET1_8(a) _mm_set1_epi8(a)
+ # define VSET1_32(a) _mm_set1_epi32(a)
+ # define VSETZERO() _mm_setzero_si128()
+ # define VSLL32(a, b) _mm_slli_epi32((a), (b))
+ # define VUNPACKLO8(a, b) _mm_unpacklo_epi8((a), (b))
+ # define VUNPACKHI8(a, b) _mm_unpackhi_epi8((a), (b))
+ #elif VL == 32
+ # define vec_t __m256i
+ # define mask_t u32
+ # define LOG2_VL 5
+ # define VADD8(a, b) _mm256_add_epi8((a), (b))
+ # define VADD16(a, b) _mm256_add_epi16((a), (b))
+ # define VADD32(a, b) _mm256_add_epi32((a), (b))
+ # if USE_AVX512
+ # define VDPBUSD(a, b, c) _mm256_dpbusd_epi32((a), (b), (c))
+ # else
+ # define VDPBUSD(a, b, c) _mm256_dpbusd_avx_epi32((a), (b), (c))
+ # endif
+ # define VLOAD(p) _mm256_load_si256((const void *)(p))
+ # define VLOADU(p) _mm256_loadu_si256((const void *)(p))
+ # define VMADD16(a, b) _mm256_madd_epi16((a), (b))
+ # define VMASKZ_LOADU(mask, p) _mm256_maskz_loadu_epi8((mask), (p))
+ # define VMULLO32(a, b) _mm256_mullo_epi32((a), (b))
+ # define VSAD8(a, b) _mm256_sad_epu8((a), (b))
+ # define VSET1_8(a) _mm256_set1_epi8(a)
+ # define VSET1_32(a) _mm256_set1_epi32(a)
+ # define VSETZERO() _mm256_setzero_si256()
+ # define VSLL32(a, b) _mm256_slli_epi32((a), (b))
+ # define VUNPACKLO8(a, b) _mm256_unpacklo_epi8((a), (b))
+ # define VUNPACKHI8(a, b) _mm256_unpackhi_epi8((a), (b))
+ #elif VL == 64
+ # define vec_t __m512i
+ # define mask_t u64
+ # define LOG2_VL 6
+ # define VADD8(a, b) _mm512_add_epi8((a), (b))
+ # define VADD16(a, b) _mm512_add_epi16((a), (b))
+ # define VADD32(a, b) _mm512_add_epi32((a), (b))
+ # define VDPBUSD(a, b, c) _mm512_dpbusd_epi32((a), (b), (c))
+ # define VLOAD(p) _mm512_load_si512((const void *)(p))
+ # define VLOADU(p) _mm512_loadu_si512((const void *)(p))
+ # define VMADD16(a, b) _mm512_madd_epi16((a), (b))
+ # define VMASKZ_LOADU(mask, p) _mm512_maskz_loadu_epi8((mask), (p))
+ # define VMULLO32(a, b) _mm512_mullo_epi32((a), (b))
+ # define VSAD8(a, b) _mm512_sad_epu8((a), (b))
+ # define VSET1_8(a) _mm512_set1_epi8(a)
+ # define VSET1_32(a) _mm512_set1_epi32(a)
+ # define VSETZERO() _mm512_setzero_si512()
+ # define VSLL32(a, b) _mm512_slli_epi32((a), (b))
+ # define VUNPACKLO8(a, b) _mm512_unpacklo_epi8((a), (b))
+ # define VUNPACKHI8(a, b) _mm512_unpackhi_epi8((a), (b))
+ #else
+ # error "unsupported vector length"
+ #endif
+
+ #define VADD32_3X(a, b, c) VADD32(VADD32((a), (b)), (c))
+ #define VADD32_4X(a, b, c, d) VADD32(VADD32((a), (b)), VADD32((c), (d)))
+ #define VADD32_5X(a, b, c, d, e) VADD32((a), VADD32_4X((b), (c), (d), (e)))
+ #define VADD32_7X(a, b, c, d, e, f, g) \
+ VADD32(VADD32_3X((a), (b), (c)), VADD32_4X((d), (e), (f), (g)))
+
+ /* Sum the 32-bit elements of v_s1 and add them to s1, and likewise for s2. */
+ #undef reduce_to_32bits
+ static forceinline ATTRIBUTES void
+ ADD_SUFFIX(reduce_to_32bits)(vec_t v_s1, vec_t v_s2, u32 *s1_p, u32 *s2_p)
+ {
+ __m128i v_s1_128, v_s2_128;
+ #if VL == 16
+ {
+ v_s1_128 = v_s1;
+ v_s2_128 = v_s2;
+ }
+ #else
+ {
+ __m256i v_s1_256, v_s2_256;
+ #if VL == 32
+ v_s1_256 = v_s1;
+ v_s2_256 = v_s2;
+ #else
+ /* Reduce 512 bits to 256 bits. */
+ v_s1_256 = _mm256_add_epi32(_mm512_extracti64x4_epi64(v_s1, 0),
+ _mm512_extracti64x4_epi64(v_s1, 1));
+ v_s2_256 = _mm256_add_epi32(_mm512_extracti64x4_epi64(v_s2, 0),
+ _mm512_extracti64x4_epi64(v_s2, 1));
+ #endif
+ /* Reduce 256 bits to 128 bits. */
+ v_s1_128 = _mm_add_epi32(_mm256_extracti128_si256(v_s1_256, 0),
+ _mm256_extracti128_si256(v_s1_256, 1));
+ v_s2_128 = _mm_add_epi32(_mm256_extracti128_si256(v_s2_256, 0),
+ _mm256_extracti128_si256(v_s2_256, 1));
+ }
+ #endif
+
+ /*
+ * Reduce 128 bits to 32 bits.
+ *
+ * If the bytes were summed into v_s1 using psadbw + paddd, then ignore
+ * the odd-indexed elements of v_s1_128 since they are zero.
+ */
+ #if USE_VNNI
+ v_s1_128 = _mm_add_epi32(v_s1_128, _mm_shuffle_epi32(v_s1_128, 0x31));
+ #endif
+ v_s2_128 = _mm_add_epi32(v_s2_128, _mm_shuffle_epi32(v_s2_128, 0x31));
+ v_s1_128 = _mm_add_epi32(v_s1_128, _mm_shuffle_epi32(v_s1_128, 0x02));
+ v_s2_128 = _mm_add_epi32(v_s2_128, _mm_shuffle_epi32(v_s2_128, 0x02));
+
+ *s1_p += (u32)_mm_cvtsi128_si32(v_s1_128);
+ *s2_p += (u32)_mm_cvtsi128_si32(v_s2_128);
+ }
+ #define reduce_to_32bits ADD_SUFFIX(reduce_to_32bits)
+
+ static ATTRIBUTES u32
+ ADD_SUFFIX(adler32_x86)(u32 adler, const u8 *p, size_t len)
+ {
+ #if USE_VNNI
+ /* This contains the bytes [VL, VL-1, VL-2, ..., 1]. */
+ static const u8 _aligned_attribute(VL) raw_mults[VL] = {
+ #if VL == 64
+ 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49,
+ 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33,
+ #endif
+ #if VL >= 32
+ 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
+ #endif
+ 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
+ };
+ const vec_t ones = VSET1_8(1);
+ #else
+ /*
+ * This contains the 16-bit values [2*VL, 2*VL - 1, 2*VL - 2, ..., 1].
+ * For VL==32 the ordering is weird because it has to match the way that
+ * vpunpcklbw and vpunpckhbw work on 128-bit lanes separately.
+ */
+ static const u16 _aligned_attribute(VL) raw_mults[4][VL / 2] = {
+ #if VL == 16
+ { 32, 31, 30, 29, 28, 27, 26, 25 },
+ { 24, 23, 22, 21, 20, 19, 18, 17 },
+ { 16, 15, 14, 13, 12, 11, 10, 9 },
+ { 8, 7, 6, 5, 4, 3, 2, 1 },
+ #elif VL == 32
+ { 64, 63, 62, 61, 60, 59, 58, 57, 48, 47, 46, 45, 44, 43, 42, 41 },
+ { 56, 55, 54, 53, 52, 51, 50, 49, 40, 39, 38, 37, 36, 35, 34, 33 },
+ { 32, 31, 30, 29, 28, 27, 26, 25, 16, 15, 14, 13, 12, 11, 10, 9 },
+ { 24, 23, 22, 21, 20, 19, 18, 17, 8, 7, 6, 5, 4, 3, 2, 1 },
+ #else
+ # error "unsupported parameters"
+ #endif
+ };
+ const vec_t mults_a = VLOAD(raw_mults[0]);
+ const vec_t mults_b = VLOAD(raw_mults[1]);
+ const vec_t mults_c = VLOAD(raw_mults[2]);
+ const vec_t mults_d = VLOAD(raw_mults[3]);
+ #endif
+ const vec_t zeroes = VSETZERO();
+ u32 s1 = adler & 0xFFFF;
+ u32 s2 = adler >> 16;
+
+ /*
+ * If the length is large and the pointer is misaligned, align it.
+ * For smaller lengths, just take the misaligned load penalty.
+ */
+ if (unlikely(len > 65536 && ((uintptr_t)p & (VL-1)))) {
+ do {
+ s1 += *p++;
+ s2 += s1;
+ len--;
+ } while ((uintptr_t)p & (VL-1));
+ s1 %= DIVISOR;
+ s2 %= DIVISOR;
+ }
+
+ #if USE_VNNI
+ /*
+ * This is Adler-32 using the vpdpbusd instruction from AVX512VNNI or
+ * AVX-VNNI. vpdpbusd multiplies the unsigned bytes of one vector by
+ * the signed bytes of another vector and adds the sums in groups of 4
+ * to the 32-bit elements of a third vector. We use it in two ways:
+ * multiplying the data bytes by a sequence like 64,63,62,...,1 for
+ * calculating part of s2, and multiplying the data bytes by an all-ones
+ * sequence 1,1,1,...,1 for calculating s1 and part of s2. The all-ones
+ * trick seems to be faster than the alternative of vpsadbw + vpaddd.
+ */
+ while (len) {
+ /*
+ * Calculate the length of the next data chunk such that s1 and
+ * s2 are guaranteed to not exceed UINT32_MAX.
+ */
+ size_t n = MIN(len, MAX_CHUNK_LEN & ~(4*VL - 1));
+ vec_t mults = VLOAD(raw_mults);
+ vec_t v_s1 = zeroes;
+ vec_t v_s2 = zeroes;
+
+ s2 += s1 * n;
+ len -= n;
+
+ if (n >= 4*VL) {
+ vec_t v_s1_b = zeroes;
+ vec_t v_s1_c = zeroes;
+ vec_t v_s1_d = zeroes;
+ vec_t v_s2_b = zeroes;
+ vec_t v_s2_c = zeroes;
+ vec_t v_s2_d = zeroes;
+ vec_t v_s1_sums = zeroes;
+ vec_t v_s1_sums_b = zeroes;
+ vec_t v_s1_sums_c = zeroes;
+ vec_t v_s1_sums_d = zeroes;
+ vec_t tmp0, tmp1;
+
+ do {
+ vec_t data_a = VLOADU(p + 0*VL);
+ vec_t data_b = VLOADU(p + 1*VL);
+ vec_t data_c = VLOADU(p + 2*VL);
+ vec_t data_d = VLOADU(p + 3*VL);
+
+ /*
+ * Workaround for gcc bug where it generates
+ * unnecessary move instructions
+ * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107892)
+ */
+ #if GCC_PREREQ(1, 0)
+ __asm__("" : "+v" (data_a), "+v" (data_b),
+ "+v" (data_c), "+v" (data_d));
+ #endif
+
+ v_s2 = VDPBUSD(v_s2, data_a, mults);
+ v_s2_b = VDPBUSD(v_s2_b, data_b, mults);
+ v_s2_c = VDPBUSD(v_s2_c, data_c, mults);
+ v_s2_d = VDPBUSD(v_s2_d, data_d, mults);
+
+ v_s1_sums = VADD32(v_s1_sums, v_s1);
+ v_s1_sums_b = VADD32(v_s1_sums_b, v_s1_b);
+ v_s1_sums_c = VADD32(v_s1_sums_c, v_s1_c);
+ v_s1_sums_d = VADD32(v_s1_sums_d, v_s1_d);
+
+ v_s1 = VDPBUSD(v_s1, data_a, ones);
+ v_s1_b = VDPBUSD(v_s1_b, data_b, ones);
+ v_s1_c = VDPBUSD(v_s1_c, data_c, ones);
+ v_s1_d = VDPBUSD(v_s1_d, data_d, ones);
+
+ /* Same gcc bug workaround. See above */
+ #if GCC_PREREQ(1, 0) && !defined(ARCH_X86_32)
+ __asm__("" : "+v" (v_s2), "+v" (v_s2_b),
+ "+v" (v_s2_c), "+v" (v_s2_d),
+ "+v" (v_s1_sums),
+ "+v" (v_s1_sums_b),
+ "+v" (v_s1_sums_c),
+ "+v" (v_s1_sums_d),
+ "+v" (v_s1), "+v" (v_s1_b),
+ "+v" (v_s1_c), "+v" (v_s1_d));
+ #endif
+ p += 4*VL;
+ n -= 4*VL;
+ } while (n >= 4*VL);
+
+ /*
+ * Reduce into v_s1 and v_s2 as follows:
+ *
+ * v_s2 = v_s2 + v_s2_b + v_s2_c + v_s2_d +
+ * (4*VL)*(v_s1_sums + v_s1_sums_b +
+ * v_s1_sums_c + v_s1_sums_d) +
+ * (3*VL)*v_s1 + (2*VL)*v_s1_b + VL*v_s1_c
+ * v_s1 = v_s1 + v_s1_b + v_s1_c + v_s1_d
+ */
+ tmp0 = VADD32(v_s1, v_s1_b);
+ tmp1 = VADD32(v_s1, v_s1_c);
+ v_s1_sums = VADD32_4X(v_s1_sums, v_s1_sums_b,
+ v_s1_sums_c, v_s1_sums_d);
+ v_s1 = VADD32_3X(tmp0, v_s1_c, v_s1_d);
+ v_s2 = VADD32_7X(VSLL32(v_s1_sums, LOG2_VL + 2),
+ VSLL32(tmp0, LOG2_VL + 1),
+ VSLL32(tmp1, LOG2_VL),
+ v_s2, v_s2_b, v_s2_c, v_s2_d);
+ }
+
+ /* Process the last 0 <= n < 4*VL bytes of the chunk. */
+ if (n >= 2*VL) {
+ const vec_t data_a = VLOADU(p + 0*VL);
+ const vec_t data_b = VLOADU(p + 1*VL);
+
+ v_s2 = VADD32(v_s2, VSLL32(v_s1, LOG2_VL + 1));
+ v_s1 = VDPBUSD(v_s1, data_a, ones);
+ v_s1 = VDPBUSD(v_s1, data_b, ones);
+ v_s2 = VDPBUSD(v_s2, data_a, VSET1_8(VL));
+ v_s2 = VDPBUSD(v_s2, data_a, mults);
+ v_s2 = VDPBUSD(v_s2, data_b, mults);
+ p += 2*VL;
+ n -= 2*VL;
+ }
+ if (n) {
+ /* Process the last 0 < n < 2*VL bytes of the chunk. */
+ vec_t data;
+
+ v_s2 = VADD32(v_s2, VMULLO32(v_s1, VSET1_32(n)));
+
+ mults = VADD8(mults, VSET1_8((int)n - VL));
+ if (n > VL) {
+ data = VLOADU(p);
+ v_s1 = VDPBUSD(v_s1, data, ones);
+ v_s2 = VDPBUSD(v_s2, data, mults);
+ p += VL;
+ n -= VL;
+ mults = VADD8(mults, VSET1_8(-VL));
+ }
+ /*
+ * Process the last 0 < n <= VL bytes of the chunk.
+ * Utilize a masked load if it's available.
+ */
+ #if USE_AVX512
+ data = VMASKZ_LOADU((mask_t)-1 >> (VL - n), p);
+ #else
+ data = zeroes;
+ memcpy(&data, p, n);
+ #endif
+ v_s1 = VDPBUSD(v_s1, data, ones);
+ v_s2 = VDPBUSD(v_s2, data, mults);
+ p += n;
+ }
+
+ reduce_to_32bits(v_s1, v_s2, &s1, &s2);
+ s1 %= DIVISOR;
+ s2 %= DIVISOR;
+ }
+ #else /* USE_VNNI */
+ /*
+ * This is Adler-32 for SSE2 and AVX2.
+ *
+ * To horizontally sum bytes, use psadbw + paddd, where one of the
+ * arguments to psadbw is all-zeroes.
+ *
+ * For the s2 contribution from (2*VL - i)*data[i] for each of the 2*VL
+ * bytes of each iteration of the inner loop, use punpck{l,h}bw + paddw
+ * to sum, for each i across iterations, byte i into a corresponding
+ * 16-bit counter in v_byte_sums_*. After the inner loop, use pmaddwd
+ * to multiply each counter by (2*VL - i), then add the products to s2.
+ *
+ * An alternative implementation would use pmaddubsw and pmaddwd in the
+ * inner loop to do (2*VL - i)*data[i] directly and add the products in
+ * groups of 4 to 32-bit counters. However, on average that approach
+ * seems to be slower than the current approach which delays the
+ * multiplications. Also, pmaddubsw requires SSSE3; the current
+ * approach keeps the implementation aligned between SSE2 and AVX2.
+ *
+ * The inner loop processes 2*VL bytes per iteration. Increasing this
+ * to 4*VL doesn't seem to be helpful here.
+ */
+ while (len) {
+ /*
+ * Calculate the length of the next data chunk such that s1 and
+ * s2 are guaranteed to not exceed UINT32_MAX, and every
+ * v_byte_sums_* counter is guaranteed to not exceed INT16_MAX.
+ * It's INT16_MAX, not UINT16_MAX, because v_byte_sums_* are
+ * used with pmaddwd which does signed multiplication. In the
+ * SSE2 case this limits chunks to 4096 bytes instead of 5536.
+ */
+ size_t n = MIN(len, MIN(2 * VL * (INT16_MAX / UINT8_MAX),
+ MAX_CHUNK_LEN) & ~(2*VL - 1));
+ len -= n;
+
+ if (n >= 2*VL) {
+ vec_t v_s1 = zeroes;
+ vec_t v_s1_sums = zeroes;
+ vec_t v_byte_sums_a = zeroes;
+ vec_t v_byte_sums_b = zeroes;
+ vec_t v_byte_sums_c = zeroes;
+ vec_t v_byte_sums_d = zeroes;
+ vec_t v_s2;
+
+ s2 += s1 * (n & ~(2*VL - 1));
+
+ do {
+ vec_t data_a = VLOADU(p + 0*VL);
+ vec_t data_b = VLOADU(p + 1*VL);
+
+ v_s1_sums = VADD32(v_s1_sums, v_s1);
+ v_byte_sums_a = VADD16(v_byte_sums_a,
+ VUNPACKLO8(data_a, zeroes));
+ v_byte_sums_b = VADD16(v_byte_sums_b,
+ VUNPACKHI8(data_a, zeroes));
+ v_byte_sums_c = VADD16(v_byte_sums_c,
+ VUNPACKLO8(data_b, zeroes));
+ v_byte_sums_d = VADD16(v_byte_sums_d,
+ VUNPACKHI8(data_b, zeroes));
+ v_s1 = VADD32(v_s1,
+ VADD32(VSAD8(data_a, zeroes),
+ VSAD8(data_b, zeroes)));
+ /*
+ * Workaround for gcc bug where it generates
+ * unnecessary move instructions
+ * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107892)
+ */
+ #if GCC_PREREQ(1, 0)
+ __asm__("" : "+x" (v_s1), "+x" (v_s1_sums),
+ "+x" (v_byte_sums_a),
+ "+x" (v_byte_sums_b),
+ "+x" (v_byte_sums_c),
+ "+x" (v_byte_sums_d));
+ #endif
+ p += 2*VL;
+ n -= 2*VL;
+ } while (n >= 2*VL);
+
+ /*
+ * Calculate v_s2 as (2*VL)*v_s1_sums +
+ * [2*VL, 2*VL - 1, 2*VL - 2, ..., 1] * v_byte_sums.
+ * Then update s1 and s2 from v_s1 and v_s2.
+ */
+ v_s2 = VADD32_5X(VSLL32(v_s1_sums, LOG2_VL + 1),
+ VMADD16(v_byte_sums_a, mults_a),
+ VMADD16(v_byte_sums_b, mults_b),
+ VMADD16(v_byte_sums_c, mults_c),
+ VMADD16(v_byte_sums_d, mults_d));
+ reduce_to_32bits(v_s1, v_s2, &s1, &s2);
+ }
+ /*
+ * Process the last 0 <= n < 2*VL bytes of the chunk using
+ * scalar instructions and reduce s1 and s2 mod DIVISOR.
+ */
+ ADLER32_CHUNK(s1, s2, p, n);
+ }
+ #endif /* !USE_VNNI */
+ return (s2 << 16) | s1;
+ }
+
+ #undef vec_t
+ #undef mask_t
+ #undef LOG2_VL
+ #undef VADD8
+ #undef VADD16
+ #undef VADD32
+ #undef VDPBUSD
+ #undef VLOAD
+ #undef VLOADU
+ #undef VMADD16
+ #undef VMASKZ_LOADU
+ #undef VMULLO32
+ #undef VSAD8
+ #undef VSET1_8
+ #undef VSET1_32
+ #undef VSETZERO
+ #undef VSLL32
+ #undef VUNPACKLO8
+ #undef VUNPACKHI8
+
+ #undef SUFFIX
+ #undef ATTRIBUTES
+ #undef VL
+ #undef USE_VNNI
+ #undef USE_AVX512
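
Note on reading this file: the template is never compiled on its own; libdeflate includes it once per target variant, with SUFFIX, ATTRIBUTES, VL, USE_VNNI, and USE_AVX512 defined beforehand (the per-variant glue lives in x86/adler32_impl.h, also added in this release). A minimal sketch of one such instantiation is shown below; it is illustrative only, not the exact contents of the package's adler32_impl.h, and the _target_attribute helper macro is assumed here.

    /* Illustrative sketch: instantiate the baseline SSE2 variant of the template. */
    #define VL          16                         /* 128-bit vectors */
    #define USE_VNNI    0                          /* legacy psadbw/pmaddwd path */
    #define USE_AVX512  0
    #define SUFFIX      _sse2
    #define ATTRIBUTES  _target_attribute("sse2")  /* assumed libdeflate helper */
    #include "adler32_template.h"
    /* ADD_SUFFIX(adler32_x86) in the template now expands to adler32_x86_sse2(). */

Each variant defined this way is then selected at runtime based on the CPU features detected in x86/cpu_features.c.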