deflate-ruby 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138)
  1. checksums.yaml +4 -4
  2. data/CLAUDE.md +95 -92
  3. data/LICENSE.txt +6 -6
  4. data/README.md +87 -65
  5. data/Rakefile +23 -0
  6. data/ext/deflate_ruby/{libdeflate/lib/x86/adler32_impl.h → adler32_impl.h} +8 -7
  7. data/ext/deflate_ruby/common_defs.h +748 -0
  8. data/ext/deflate_ruby/{libdeflate/lib/x86/cpu_features.c → cpu_features.c} +46 -16
  9. data/ext/deflate_ruby/{libdeflate/lib/x86/cpu_features.h → cpu_features.h} +2 -1
  10. data/ext/deflate_ruby/{libdeflate/lib/x86/crc32_impl.h → crc32_impl.h} +22 -23
  11. data/ext/deflate_ruby/{libdeflate/lib/crc32_multipliers.h → crc32_multipliers.h} +2 -4
  12. data/ext/deflate_ruby/{libdeflate/lib/x86/crc32_pclmul_template.h → crc32_pclmul_template.h} +23 -94
  13. data/ext/deflate_ruby/{libdeflate/lib/crc32_tables.h → crc32_tables.h} +1 -1
  14. data/ext/deflate_ruby/{libdeflate/lib/deflate_compress.c → deflate_compress.c} +59 -60
  15. data/ext/deflate_ruby/deflate_ruby.c +392 -218
  16. data/ext/deflate_ruby/deflate_ruby.h +6 -0
  17. data/ext/deflate_ruby/extconf.rb +35 -25
  18. data/ext/deflate_ruby/libdeflate/adler32.c +162 -0
  19. data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/adler32_impl.h +14 -7
  20. data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/crc32_impl.h +25 -31
  21. data/ext/deflate_ruby/libdeflate/arm/crc32_pmull_helpers.h +156 -0
  22. data/ext/deflate_ruby/libdeflate/arm/crc32_pmull_wide.h +226 -0
  23. data/ext/deflate_ruby/libdeflate/bt_matchfinder.h +342 -0
  24. data/ext/deflate_ruby/libdeflate/common_defs.h +2 -1
  25. data/ext/deflate_ruby/libdeflate/cpu_features_common.h +93 -0
  26. data/ext/deflate_ruby/libdeflate/crc32.c +262 -0
  27. data/ext/deflate_ruby/libdeflate/crc32_multipliers.h +375 -0
  28. data/ext/deflate_ruby/libdeflate/crc32_tables.h +587 -0
  29. data/ext/deflate_ruby/libdeflate/decompress_template.h +777 -0
  30. data/ext/deflate_ruby/libdeflate/deflate_compress.c +4128 -0
  31. data/ext/deflate_ruby/libdeflate/deflate_compress.h +15 -0
  32. data/ext/deflate_ruby/libdeflate/deflate_constants.h +56 -0
  33. data/ext/deflate_ruby/libdeflate/deflate_decompress.c +1208 -0
  34. data/ext/deflate_ruby/libdeflate/gzip_compress.c +90 -0
  35. data/ext/deflate_ruby/libdeflate/gzip_constants.h +45 -0
  36. data/ext/deflate_ruby/libdeflate/gzip_decompress.c +144 -0
  37. data/ext/deflate_ruby/libdeflate/hc_matchfinder.h +401 -0
  38. data/ext/deflate_ruby/libdeflate/ht_matchfinder.h +234 -0
  39. data/ext/deflate_ruby/libdeflate/lib_common.h +106 -0
  40. data/ext/deflate_ruby/libdeflate/libdeflate.h +2 -2
  41. data/ext/deflate_ruby/libdeflate/{lib/matchfinder_common.h → matchfinder_common.h} +3 -3
  42. data/ext/deflate_ruby/libdeflate/x86/adler32_impl.h +135 -0
  43. data/ext/deflate_ruby/libdeflate/x86/adler32_template.h +518 -0
  44. data/ext/deflate_ruby/libdeflate/x86/cpu_features.c +213 -0
  45. data/ext/deflate_ruby/libdeflate/x86/cpu_features.h +170 -0
  46. data/ext/deflate_ruby/libdeflate/x86/crc32_impl.h +159 -0
  47. data/ext/deflate_ruby/libdeflate/x86/crc32_pclmul_template.h +424 -0
  48. data/ext/deflate_ruby/libdeflate/x86/decompress_impl.h +57 -0
  49. data/ext/deflate_ruby/libdeflate.h +411 -0
  50. data/ext/deflate_ruby/matchfinder_common.h +224 -0
  51. data/ext/deflate_ruby/matchfinder_impl.h +122 -0
  52. data/ext/deflate_ruby/utils.c +141 -0
  53. data/ext/deflate_ruby/zlib_compress.c +82 -0
  54. data/ext/deflate_ruby/zlib_constants.h +21 -0
  55. data/ext/deflate_ruby/zlib_decompress.c +104 -0
  56. data/lib/deflate_ruby/version.rb +1 -1
  57. data/lib/deflate_ruby.rb +1 -63
  58. data/sig/deflate_ruby.rbs +4 -0
  59. data/test/test_deflate_ruby.rb +220 -0
  60. data/test/test_helper.rb +6 -0
  61. metadata +89 -144
  62. data/ext/deflate_ruby/libdeflate/CMakeLists.txt +0 -270
  63. data/ext/deflate_ruby/libdeflate/NEWS.md +0 -494
  64. data/ext/deflate_ruby/libdeflate/README.md +0 -228
  65. data/ext/deflate_ruby/libdeflate/libdeflate-config.cmake.in +0 -3
  66. data/ext/deflate_ruby/libdeflate/libdeflate.pc.in +0 -18
  67. data/ext/deflate_ruby/libdeflate/programs/CMakeLists.txt +0 -105
  68. data/ext/deflate_ruby/libdeflate/programs/benchmark.c +0 -696
  69. data/ext/deflate_ruby/libdeflate/programs/checksum.c +0 -218
  70. data/ext/deflate_ruby/libdeflate/programs/config.h.in +0 -19
  71. data/ext/deflate_ruby/libdeflate/programs/gzip.c +0 -688
  72. data/ext/deflate_ruby/libdeflate/programs/prog_util.c +0 -521
  73. data/ext/deflate_ruby/libdeflate/programs/prog_util.h +0 -225
  74. data/ext/deflate_ruby/libdeflate/programs/test_checksums.c +0 -200
  75. data/ext/deflate_ruby/libdeflate/programs/test_custom_malloc.c +0 -155
  76. data/ext/deflate_ruby/libdeflate/programs/test_incomplete_codes.c +0 -385
  77. data/ext/deflate_ruby/libdeflate/programs/test_invalid_streams.c +0 -130
  78. data/ext/deflate_ruby/libdeflate/programs/test_litrunlen_overflow.c +0 -72
  79. data/ext/deflate_ruby/libdeflate/programs/test_overread.c +0 -95
  80. data/ext/deflate_ruby/libdeflate/programs/test_slow_decompression.c +0 -472
  81. data/ext/deflate_ruby/libdeflate/programs/test_trailing_bytes.c +0 -151
  82. data/ext/deflate_ruby/libdeflate/programs/test_util.c +0 -237
  83. data/ext/deflate_ruby/libdeflate/programs/test_util.h +0 -61
  84. data/ext/deflate_ruby/libdeflate/programs/tgetopt.c +0 -118
  85. data/ext/deflate_ruby/libdeflate/scripts/android_build.sh +0 -118
  86. data/ext/deflate_ruby/libdeflate/scripts/android_tests.sh +0 -69
  87. data/ext/deflate_ruby/libdeflate/scripts/benchmark.sh +0 -10
  88. data/ext/deflate_ruby/libdeflate/scripts/checksum.sh +0 -10
  89. data/ext/deflate_ruby/libdeflate/scripts/checksum_benchmarks.sh +0 -253
  90. data/ext/deflate_ruby/libdeflate/scripts/cmake-helper.sh +0 -17
  91. data/ext/deflate_ruby/libdeflate/scripts/deflate_benchmarks.sh +0 -119
  92. data/ext/deflate_ruby/libdeflate/scripts/exec_tests.sh +0 -38
  93. data/ext/deflate_ruby/libdeflate/scripts/gen-release-archives.sh +0 -37
  94. data/ext/deflate_ruby/libdeflate/scripts/gen_bitreverse_tab.py +0 -19
  95. data/ext/deflate_ruby/libdeflate/scripts/gen_crc32_multipliers.c +0 -199
  96. data/ext/deflate_ruby/libdeflate/scripts/gen_crc32_tables.c +0 -105
  97. data/ext/deflate_ruby/libdeflate/scripts/gen_default_litlen_costs.py +0 -44
  98. data/ext/deflate_ruby/libdeflate/scripts/gen_offset_slot_map.py +0 -29
  99. data/ext/deflate_ruby/libdeflate/scripts/gzip_tests.sh +0 -523
  100. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_compress/corpus/0 +0 -0
  101. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_compress/fuzz.c +0 -95
  102. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_decompress/corpus/0 +0 -3
  103. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_decompress/fuzz.c +0 -62
  104. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/fuzz.sh +0 -108
  105. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/gzip_decompress/corpus/0 +0 -0
  106. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/gzip_decompress/fuzz.c +0 -19
  107. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/zlib_decompress/corpus/0 +0 -3
  108. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/zlib_decompress/fuzz.c +0 -19
  109. data/ext/deflate_ruby/libdeflate/scripts/run_tests.sh +0 -416
  110. data/ext/deflate_ruby/libdeflate/scripts/toolchain-i686-w64-mingw32.cmake +0 -8
  111. data/ext/deflate_ruby/libdeflate/scripts/toolchain-x86_64-w64-mingw32.cmake +0 -8
  112. data/ext/deflate_ruby/{libdeflate/lib/adler32.c → adler32.c} +0 -0
  113. data/ext/deflate_ruby/{libdeflate/lib/x86/adler32_template.h → adler32_template.h} +0 -0
  114. data/ext/deflate_ruby/{libdeflate/lib/bt_matchfinder.h → bt_matchfinder.h} +0 -0
  115. data/ext/deflate_ruby/{libdeflate/lib/cpu_features_common.h → cpu_features_common.h} +0 -0
  116. data/ext/deflate_ruby/{libdeflate/lib/crc32.c → crc32.c} +0 -0
  117. data/ext/deflate_ruby/{libdeflate/lib/arm/crc32_pmull_helpers.h → crc32_pmull_helpers.h} +0 -0
  118. data/ext/deflate_ruby/{libdeflate/lib/arm/crc32_pmull_wide.h → crc32_pmull_wide.h} +0 -0
  119. data/ext/deflate_ruby/{libdeflate/lib/x86/decompress_impl.h → decompress_impl.h} +0 -0
  120. data/ext/deflate_ruby/{libdeflate/lib/decompress_template.h → decompress_template.h} +0 -0
  121. data/ext/deflate_ruby/{libdeflate/lib/deflate_compress.h → deflate_compress.h} +0 -0
  122. data/ext/deflate_ruby/{libdeflate/lib/deflate_constants.h → deflate_constants.h} +0 -0
  123. data/ext/deflate_ruby/{libdeflate/lib/deflate_decompress.c → deflate_decompress.c} +0 -0
  124. data/ext/deflate_ruby/{libdeflate/lib/gzip_compress.c → gzip_compress.c} +0 -0
  125. data/ext/deflate_ruby/{libdeflate/lib/gzip_constants.h → gzip_constants.h} +0 -0
  126. data/ext/deflate_ruby/{libdeflate/lib/gzip_decompress.c → gzip_decompress.c} +0 -0
  127. data/ext/deflate_ruby/{libdeflate/lib/hc_matchfinder.h → hc_matchfinder.h} +0 -0
  128. data/ext/deflate_ruby/{libdeflate/lib/ht_matchfinder.h → ht_matchfinder.h} +0 -0
  129. data/ext/deflate_ruby/{libdeflate/lib/lib_common.h → lib_common.h} +0 -0
  130. data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/cpu_features.c +0 -0
  131. data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/cpu_features.h +0 -0
  132. data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/matchfinder_impl.h +0 -0
  133. data/ext/deflate_ruby/libdeflate/{lib/riscv → riscv}/matchfinder_impl.h +0 -0
  134. data/ext/deflate_ruby/libdeflate/{lib/utils.c → utils.c} +0 -0
  135. data/ext/deflate_ruby/libdeflate/{lib/x86 → x86}/matchfinder_impl.h +0 -0
  136. data/ext/deflate_ruby/libdeflate/{lib/zlib_compress.c → zlib_compress.c} +0 -0
  137. data/ext/deflate_ruby/libdeflate/{lib/zlib_constants.h → zlib_constants.h} +0 -0
  138. data/ext/deflate_ruby/libdeflate/{lib/zlib_decompress.c → zlib_decompress.c} +0 -0
data/ext/deflate_ruby/libdeflate/x86/crc32_pclmul_template.h (new file)
@@ -0,0 +1,424 @@
+ /*
+  * x86/crc32_pclmul_template.h - gzip CRC-32 with PCLMULQDQ instructions
+  *
+  * Copyright 2016 Eric Biggers
+  *
+  * Permission is hereby granted, free of charge, to any person
+  * obtaining a copy of this software and associated documentation
+  * files (the "Software"), to deal in the Software without
+  * restriction, including without limitation the rights to use,
+  * copy, modify, merge, publish, distribute, sublicense, and/or sell
+  * copies of the Software, and to permit persons to whom the
+  * Software is furnished to do so, subject to the following
+  * conditions:
+  *
+  * The above copyright notice and this permission notice shall be
+  * included in all copies or substantial portions of the Software.
+  *
+  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+  * OTHER DEALINGS IN THE SOFTWARE.
+  */
+
+ /*
+  * This file is a "template" for instantiating PCLMULQDQ-based crc32_x86
+  * functions. The "parameters" are:
+  *
+  * SUFFIX:
+  *    Name suffix to append to all instantiated functions.
+  * ATTRIBUTES:
+  *    Target function attributes to use. Must satisfy the dependencies of the
+  *    other parameters as follows:
+  *        VL=16 && USE_AVX512=0: at least pclmul,sse4.1
+  *        VL=32 && USE_AVX512=0: at least vpclmulqdq,pclmul,avx2
+  *        VL=32 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
+  *        VL=64 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
+  *    (Other combinations are not useful and have not been tested.)
+  * VL:
+  *    Vector length in bytes. Must be 16, 32, or 64.
+  * USE_AVX512:
+  *    If 1, take advantage of AVX-512 features such as masking and the
+  *    vpternlog instruction. This doesn't enable the use of 512-bit vectors;
+  *    the vector length is controlled by VL. If 0, assume that the CPU might
+  *    not support AVX-512.
+  *
+  * The overall algorithm used is CRC folding with carryless multiplication
+  * instructions. Note that the x86 crc32 instruction cannot be used, as it is
+  * for a different polynomial, not the gzip one. For an explanation of CRC
+  * folding with carryless multiplication instructions, see
+  * scripts/gen-crc32-consts.py and the following blog posts and papers:
+  *
+  * "An alternative exposition of crc32_4k_pclmulqdq"
+  * https://www.corsix.org/content/alternative-exposition-crc32_4k_pclmulqdq
+  *
+  * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+  * https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+  *
+  * The original pclmulqdq instruction does one 64x64 to 128-bit carryless
+  * multiplication. The VPCLMULQDQ feature added instructions that do two
+  * parallel 64x64 to 128-bit carryless multiplications in combination with AVX
+  * or AVX512VL, or four in combination with AVX512F.
+  */
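
For orientation: this header is not compiled on its own. An includer such as x86/crc32_impl.h defines the parameters above and then includes the template, once per target. A minimal sketch of one instantiation follows; the suffix, attribute string, and ADD_SUFFIX definition are illustrative assumptions, not quoted from this diff.

/* Editorial sketch of an instantiation site (names assumed). */
#define ADD_SUFFIX2(name, suffix)  name##suffix
#define ADD_SUFFIX(name)           ADD_SUFFIX2(name, SUFFIX)

#define SUFFIX       _pclmulqdq
#define ATTRIBUTES   _target_attribute("pclmul,sse4.1")
#define VL           16
#define USE_AVX512   0
#include "crc32_pclmul_template.h"
/* This defines ADD_SUFFIX(crc32_x86), i.e. crc32_x86_pclmulqdq(). The
 * template's trailing #undefs clear SUFFIX/ATTRIBUTES/VL/USE_AVX512, so the
 * includer can immediately instantiate again, e.g. with VL=32 for AVX2. */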
+
+ #if VL == 16
+ # define vec_t __m128i
+ # define fold_vec fold_vec128
+ # define VLOADU(p) _mm_loadu_si128((const void *)(p))
+ # define VXOR(a, b) _mm_xor_si128((a), (b))
+ # define M128I_TO_VEC(a) a
+ # define MULTS_8V _mm_set_epi64x(CRC32_X991_MODG, CRC32_X1055_MODG)
+ # define MULTS_4V _mm_set_epi64x(CRC32_X479_MODG, CRC32_X543_MODG)
+ # define MULTS_2V _mm_set_epi64x(CRC32_X223_MODG, CRC32_X287_MODG)
+ # define MULTS_1V _mm_set_epi64x(CRC32_X95_MODG, CRC32_X159_MODG)
+ #elif VL == 32
+ # define vec_t __m256i
+ # define fold_vec fold_vec256
+ # define VLOADU(p) _mm256_loadu_si256((const void *)(p))
+ # define VXOR(a, b) _mm256_xor_si256((a), (b))
+ # define M128I_TO_VEC(a) _mm256_zextsi128_si256(a)
+ # define MULTS(a, b) _mm256_set_epi64x(a, b, a, b)
+ # define MULTS_8V MULTS(CRC32_X2015_MODG, CRC32_X2079_MODG)
+ # define MULTS_4V MULTS(CRC32_X991_MODG, CRC32_X1055_MODG)
+ # define MULTS_2V MULTS(CRC32_X479_MODG, CRC32_X543_MODG)
+ # define MULTS_1V MULTS(CRC32_X223_MODG, CRC32_X287_MODG)
+ #elif VL == 64
+ # define vec_t __m512i
+ # define fold_vec fold_vec512
+ # define VLOADU(p) _mm512_loadu_si512((const void *)(p))
+ # define VXOR(a, b) _mm512_xor_si512((a), (b))
+ # define M128I_TO_VEC(a) _mm512_zextsi128_si512(a)
+ # define MULTS(a, b) _mm512_set_epi64(a, b, a, b, a, b, a, b)
+ # define MULTS_8V MULTS(CRC32_X4063_MODG, CRC32_X4127_MODG)
+ # define MULTS_4V MULTS(CRC32_X2015_MODG, CRC32_X2079_MODG)
+ # define MULTS_2V MULTS(CRC32_X991_MODG, CRC32_X1055_MODG)
+ # define MULTS_1V MULTS(CRC32_X479_MODG, CRC32_X543_MODG)
+ #else
+ # error "unsupported vector length"
+ #endif
+
+ #undef fold_vec128
+ static forceinline ATTRIBUTES __m128i
+ ADD_SUFFIX(fold_vec128)(__m128i src, __m128i dst, __m128i /* __v2du */ mults)
+ {
+     dst = _mm_xor_si128(dst, _mm_clmulepi64_si128(src, mults, 0x00));
+     dst = _mm_xor_si128(dst, _mm_clmulepi64_si128(src, mults, 0x11));
+     return dst;
+ }
+ #define fold_vec128 ADD_SUFFIX(fold_vec128)
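
A note on the math (editorial, consistent with this file's constant naming CRC32_Xk_MODG = x^k mod G(x)): writing src = s_hi·x^64 ⊕ s_lo per 64-bit lane, fold_vec128() computes, over GF(2)[x],

$$
\mathrm{fold}(\mathrm{src}, \mathrm{dst})
  = \mathrm{dst} \oplus s_{\mathrm{lo}} \cdot (x^{159} \bmod G)
                 \oplus s_{\mathrm{hi}} \cdot (x^{95} \bmod G)
  \equiv \mathrm{src} \cdot x^{128} \oplus \mathrm{dst} \pmod{G(x)}
$$

so each fold advances src by one 128-bit block. The same pattern holds for every entry in the MULTS_* tables above: folding a distance of d bits pairs x^(d+31) mod G with x^(d-33) mod G (e.g. d=256 gives the 287/223 pair), where the +31/-33 offsets come from the lsb-first, bit-reflected convention; see scripts/gen-crc32-consts.py in upstream libdeflate for the derivation.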
+
+ #if VL >= 32
+ #undef fold_vec256
+ static forceinline ATTRIBUTES __m256i
+ ADD_SUFFIX(fold_vec256)(__m256i src, __m256i dst, __m256i /* __v4du */ mults)
+ {
+ #if USE_AVX512
+     /* vpternlog with immediate 0x96 is a three-argument XOR. */
+     return _mm256_ternarylogic_epi32(
+         _mm256_clmulepi64_epi128(src, mults, 0x00),
+         _mm256_clmulepi64_epi128(src, mults, 0x11),
+         dst,
+         0x96);
+ #else
+     return _mm256_xor_si256(
+         _mm256_xor_si256(dst,
+                          _mm256_clmulepi64_epi128(src, mults, 0x00)),
+         _mm256_clmulepi64_epi128(src, mults, 0x11));
+ #endif
+ }
+ #define fold_vec256 ADD_SUFFIX(fold_vec256)
+ #endif /* VL >= 32 */
+
+ #if VL >= 64
+ #undef fold_vec512
+ static forceinline ATTRIBUTES __m512i
+ ADD_SUFFIX(fold_vec512)(__m512i src, __m512i dst, __m512i /* __v8du */ mults)
+ {
+     /* vpternlog with immediate 0x96 is a three-argument XOR. */
+     return _mm512_ternarylogic_epi32(
+         _mm512_clmulepi64_epi128(src, mults, 0x00),
+         _mm512_clmulepi64_epi128(src, mults, 0x11),
+         dst,
+         0x96);
+ }
+ #define fold_vec512 ADD_SUFFIX(fold_vec512)
+ #endif /* VL >= 64 */
+
+ /*
+  * Given 'x' containing a 16-byte polynomial, and a pointer 'p' that points to
+  * the next '1 <= len <= 15' data bytes, rearrange the concatenation of 'x' and
+  * the data into vectors x0 and x1 that contain 'len' bytes and 16 bytes,
+  * respectively. Then fold x0 into x1 and return the result.
+  * Assumes that 'p + len - 16' is in-bounds.
+  */
+ #undef fold_lessthan16bytes
+ static forceinline ATTRIBUTES __m128i
+ ADD_SUFFIX(fold_lessthan16bytes)(__m128i x, const u8 *p, size_t len,
+                                  __m128i /* __v2du */ mults_128b)
+ {
+     __m128i lshift = _mm_loadu_si128((const void *)&shift_tab[len]);
+     __m128i rshift = _mm_loadu_si128((const void *)&shift_tab[len + 16]);
+     __m128i x0, x1;
+
+     /* x0 = x left-shifted by '16 - len' bytes */
+     x0 = _mm_shuffle_epi8(x, lshift);
+
+     /*
+      * x1 = the last '16 - len' bytes from x (i.e. x right-shifted by 'len'
+      * bytes) followed by the remaining data.
+      */
+     x1 = _mm_blendv_epi8(_mm_shuffle_epi8(x, rshift),
+                          _mm_loadu_si128((const void *)(p + len - 16)),
+                          /* msb 0/1 of each byte selects byte from arg1/2 */
+                          rshift);
+
+     return fold_vec128(x0, x1, mults_128b);
+ }
+ #define fold_lessthan16bytes ADD_SUFFIX(fold_lessthan16bytes)
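
The shift_tab indexing is the subtle part. Below is a small scalar model (editorial; it assumes the conventional 0x80-padded pshufb table layout for shift_tab, which is defined by the includer and not shown in this diff) demonstrating how one 48-byte table provides both shift masks and, via the high bits of its entries, the blend mask:

#include <stdint.h>
#include <stdio.h>

/* Assumed shift_tab layout: sixteen 0x80 "zero" selectors, the identity
 * selectors 0..15, then sixteen more 0x80s. Reading 16 bytes starting at
 * &shift_tab[len] left-shifts a vector by 16 - len bytes;
 * &shift_tab[len + 16] right-shifts it by len bytes. */
static const uint8_t shift_tab[48] = {
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
};

/* Scalar model of pshufb: a selector with its msb set yields 0, otherwise
 * it picks in[selector & 15]. */
static void pshufb(uint8_t out[16], const uint8_t in[16], const uint8_t sel[16])
{
    for (int i = 0; i < 16; i++)
        out[i] = (sel[i] & 0x80) ? 0 : in[sel[i] & 0x0f];
}

static void dump(const char *name, const uint8_t v[16])
{
    printf("%s:", name);
    for (int i = 0; i < 16; i++)
        printf(" %02x", v[i]);
    printf("\n");
}

int main(void)
{
    uint8_t x[16], x0[16], x1[16];
    size_t len = 5; /* pretend 5 trailing data bytes remain */

    for (int i = 0; i < 16; i++)
        x[i] = (uint8_t)(0xa0 + i); /* recognizable test pattern */

    pshufb(x0, x, &shift_tab[len]);      /* x << (16 - len) bytes */
    pshufb(x1, x, &shift_tab[len + 16]); /* x >> len bytes */
    dump("x0", x0); /* a0..a4 land in the top 5 bytes */
    dump("x1", x1); /* a5..af in the low 11 bytes; in the real code, blendv
                       then overwrites the top 5 bytes (where the rshift
                       selectors have their msb set) with the trailing data */
    return 0;
}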
182
+
183
+ static ATTRIBUTES u32
184
+ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
185
+ {
186
+ /*
187
+ * mults_{N}v are the vectors of multipliers for folding across N vec_t
188
+ * vectors, i.e. N*VL*8 bits. mults_128b are the two multipliers for
189
+ * folding across 128 bits. mults_128b differs from mults_1v when
190
+ * VL != 16. All multipliers are 64-bit, to match what pclmulqdq needs,
191
+ * but since this is for CRC-32 only their low 32 bits are nonzero.
192
+ * For more details, see scripts/gen-crc32-consts.py.
193
+ */
194
+ const vec_t mults_8v = MULTS_8V;
195
+ const vec_t mults_4v = MULTS_4V;
196
+ const vec_t mults_2v = MULTS_2V;
197
+ const vec_t mults_1v = MULTS_1V;
198
+ const __m128i mults_128b = _mm_set_epi64x(CRC32_X95_MODG, CRC32_X159_MODG);
199
+ const __m128i barrett_reduction_constants =
200
+ _mm_set_epi64x(CRC32_BARRETT_CONSTANT_2, CRC32_BARRETT_CONSTANT_1);
201
+ vec_t v0, v1, v2, v3, v4, v5, v6, v7;
202
+ __m128i x0 = _mm_cvtsi32_si128(crc);
203
+ __m128i x1;
204
+
205
+ if (len < 8*VL) {
206
+ if (len < VL) {
207
+ STATIC_ASSERT(VL == 16 || VL == 32 || VL == 64);
208
+ if (len < 16) {
209
+ #if USE_AVX512
210
+ if (len < 4)
211
+ return crc32_slice1(crc, p, len);
212
+ /*
213
+ * Handle 4 <= len <= 15 bytes by doing a masked
214
+ * load, XOR'ing the current CRC with the first
215
+ * 4 bytes, left-shifting by '16 - len' bytes to
216
+ * align the result to the end of x0 (so that it
217
+ * becomes the low-order coefficients of a
218
+ * 128-bit polynomial), and then doing the usual
219
+ * reduction from 128 bits to 32 bits.
220
+ */
221
+ x0 = _mm_xor_si128(
222
+ x0, _mm_maskz_loadu_epi8((1 << len) - 1, p));
223
+ x0 = _mm_shuffle_epi8(
224
+ x0, _mm_loadu_si128((const void *)&shift_tab[len]));
225
+ goto reduce_x0;
226
+ #else
227
+ return crc32_slice1(crc, p, len);
228
+ #endif
229
+ }
230
+ /*
231
+ * Handle 16 <= len < VL bytes where VL is 32 or 64.
232
+ * Use 128-bit instructions so that these lengths aren't
233
+ * slower with VL > 16 than with VL=16.
234
+ */
235
+ x0 = _mm_xor_si128(_mm_loadu_si128((const void *)p), x0);
236
+ if (len >= 32) {
237
+ x0 = fold_vec128(x0, _mm_loadu_si128((const void *)(p + 16)),
238
+ mults_128b);
239
+ if (len >= 48)
240
+ x0 = fold_vec128(x0, _mm_loadu_si128((const void *)(p + 32)),
241
+ mults_128b);
242
+ }
243
+ p += len & ~15;
244
+ goto less_than_16_remaining;
245
+ }
246
+ v0 = VXOR(VLOADU(p), M128I_TO_VEC(x0));
247
+ if (len < 2*VL) {
248
+ p += VL;
249
+ goto less_than_vl_remaining;
250
+ }
251
+ v1 = VLOADU(p + 1*VL);
252
+ if (len < 4*VL) {
253
+ p += 2*VL;
254
+ goto less_than_2vl_remaining;
255
+ }
256
+ v2 = VLOADU(p + 2*VL);
257
+ v3 = VLOADU(p + 3*VL);
258
+ p += 4*VL;
259
+ } else {
260
+ /*
261
+ * If the length is large and the pointer is misaligned, align
262
+ * it. For smaller lengths, just take the misaligned load
263
+ * penalty. Note that on recent x86 CPUs, vmovdqu with an
264
+ * aligned address is just as fast as vmovdqa, so there's no
265
+ * need to use vmovdqa in the main loop.
266
+ */
267
+ if (len > 65536 && ((uintptr_t)p & (VL-1))) {
268
+ size_t align = -(uintptr_t)p & (VL-1);
269
+
270
+ len -= align;
271
+ x0 = _mm_xor_si128(_mm_loadu_si128((const void *)p), x0);
272
+ p += 16;
273
+ if (align & 15) {
274
+ x0 = fold_lessthan16bytes(x0, p, align & 15,
275
+ mults_128b);
276
+ p += align & 15;
277
+ align &= ~15;
278
+ }
279
+ while (align) {
280
+ x0 = fold_vec128(x0, *(const __m128i *)p,
281
+ mults_128b);
282
+ p += 16;
283
+ align -= 16;
284
+ }
285
+ v0 = M128I_TO_VEC(x0);
286
+ # if VL == 32
287
+ v0 = _mm256_inserti128_si256(v0, *(const __m128i *)p, 1);
288
+ # elif VL == 64
289
+ v0 = _mm512_inserti32x4(v0, *(const __m128i *)p, 1);
290
+ v0 = _mm512_inserti64x4(v0, *(const __m256i *)(p + 16), 1);
291
+ # endif
292
+ p -= 16;
293
+ } else {
294
+ v0 = VXOR(VLOADU(p), M128I_TO_VEC(x0));
295
+ }
296
+ v1 = VLOADU(p + 1*VL);
297
+ v2 = VLOADU(p + 2*VL);
298
+ v3 = VLOADU(p + 3*VL);
299
+ v4 = VLOADU(p + 4*VL);
300
+ v5 = VLOADU(p + 5*VL);
301
+ v6 = VLOADU(p + 6*VL);
302
+ v7 = VLOADU(p + 7*VL);
303
+ p += 8*VL;
304
+
305
+ /*
306
+ * This is the main loop, processing 8*VL bytes per iteration.
307
+ * 4*VL is usually enough and would result in smaller code, but
308
+ * Skylake and Cascade Lake need 8*VL to get full performance.
309
+ */
310
+ while (len >= 16*VL) {
311
+ v0 = fold_vec(v0, VLOADU(p + 0*VL), mults_8v);
312
+ v1 = fold_vec(v1, VLOADU(p + 1*VL), mults_8v);
313
+ v2 = fold_vec(v2, VLOADU(p + 2*VL), mults_8v);
314
+ v3 = fold_vec(v3, VLOADU(p + 3*VL), mults_8v);
315
+ v4 = fold_vec(v4, VLOADU(p + 4*VL), mults_8v);
316
+ v5 = fold_vec(v5, VLOADU(p + 5*VL), mults_8v);
317
+ v6 = fold_vec(v6, VLOADU(p + 6*VL), mults_8v);
318
+ v7 = fold_vec(v7, VLOADU(p + 7*VL), mults_8v);
319
+ p += 8*VL;
320
+ len -= 8*VL;
321
+ }
322
+
323
+ /* Fewer than 8*VL bytes remain. */
324
+ v0 = fold_vec(v0, v4, mults_4v);
325
+ v1 = fold_vec(v1, v5, mults_4v);
326
+ v2 = fold_vec(v2, v6, mults_4v);
327
+ v3 = fold_vec(v3, v7, mults_4v);
328
+ if (len & (4*VL)) {
329
+ v0 = fold_vec(v0, VLOADU(p + 0*VL), mults_4v);
330
+ v1 = fold_vec(v1, VLOADU(p + 1*VL), mults_4v);
331
+ v2 = fold_vec(v2, VLOADU(p + 2*VL), mults_4v);
332
+ v3 = fold_vec(v3, VLOADU(p + 3*VL), mults_4v);
333
+ p += 4*VL;
334
+ }
335
+ }
336
+ /* Fewer than 4*VL bytes remain. */
337
+ v0 = fold_vec(v0, v2, mults_2v);
338
+ v1 = fold_vec(v1, v3, mults_2v);
339
+ if (len & (2*VL)) {
340
+ v0 = fold_vec(v0, VLOADU(p + 0*VL), mults_2v);
341
+ v1 = fold_vec(v1, VLOADU(p + 1*VL), mults_2v);
342
+ p += 2*VL;
343
+ }
344
+ less_than_2vl_remaining:
345
+ /* Fewer than 2*VL bytes remain. */
346
+ v0 = fold_vec(v0, v1, mults_1v);
347
+ if (len & VL) {
348
+ v0 = fold_vec(v0, VLOADU(p), mults_1v);
349
+ p += VL;
350
+ }
351
+ less_than_vl_remaining:
352
+ /*
353
+ * Fewer than VL bytes remain. Reduce v0 (length VL bytes) to x0
354
+ * (length 16 bytes) and fold in any 16-byte data segments that remain.
355
+ */
356
+ #if VL == 16
357
+ x0 = v0;
358
+ #else
359
+ {
360
+ #if VL == 32
361
+ __m256i y0 = v0;
362
+ #else
363
+ const __m256i mults_256b =
364
+ _mm256_set_epi64x(CRC32_X223_MODG, CRC32_X287_MODG,
365
+ CRC32_X223_MODG, CRC32_X287_MODG);
366
+ __m256i y0 = fold_vec256(_mm512_extracti64x4_epi64(v0, 0),
367
+ _mm512_extracti64x4_epi64(v0, 1),
368
+ mults_256b);
369
+ if (len & 32) {
370
+ y0 = fold_vec256(y0, _mm256_loadu_si256((const void *)p),
371
+ mults_256b);
372
+ p += 32;
373
+ }
374
+ #endif
375
+ x0 = fold_vec128(_mm256_extracti128_si256(y0, 0),
376
+ _mm256_extracti128_si256(y0, 1), mults_128b);
377
+ }
378
+ if (len & 16) {
379
+ x0 = fold_vec128(x0, _mm_loadu_si128((const void *)p),
380
+ mults_128b);
381
+ p += 16;
382
+ }
383
+ #endif
384
+ less_than_16_remaining:
385
+ len &= 15;
386
+
387
+ /* Handle any remainder of 1 to 15 bytes. */
388
+ if (len)
389
+ x0 = fold_lessthan16bytes(x0, p, len, mults_128b);
390
+ #if USE_AVX512
391
+ reduce_x0:
392
+ #endif
393
+ /*
394
+ * Multiply the remaining 128-bit message polynomial 'x0' by x^32, then
395
+ * reduce it modulo the generator polynomial G. This gives the CRC.
396
+ *
397
+ * This implementation matches that used in crc-pclmul-template.S from
398
+ * https://lore.kernel.org/r/20250210174540.161705-4-ebiggers@kernel.org/
399
+ * with the parameters n=32 and LSB_CRC=1 (what the gzip CRC uses). See
400
+ * there for a detailed explanation of the math used here.
401
+ */
402
+ x0 = _mm_xor_si128(_mm_clmulepi64_si128(x0, mults_128b, 0x10),
403
+ _mm_bsrli_si128(x0, 8));
404
+ x1 = _mm_clmulepi64_si128(x0, barrett_reduction_constants, 0x00);
405
+ x1 = _mm_clmulepi64_si128(x1, barrett_reduction_constants, 0x10);
406
+ x0 = _mm_xor_si128(x0, x1);
407
+ return _mm_extract_epi32(x0, 2);
408
+ }
409
+
410
+ #undef vec_t
411
+ #undef fold_vec
412
+ #undef VLOADU
413
+ #undef VXOR
414
+ #undef M128I_TO_VEC
415
+ #undef MULTS
416
+ #undef MULTS_8V
417
+ #undef MULTS_4V
418
+ #undef MULTS_2V
419
+ #undef MULTS_1V
420
+
421
+ #undef SUFFIX
422
+ #undef ATTRIBUTES
423
+ #undef VL
424
+ #undef USE_AVX512
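
None of this machinery is called directly; it sits behind libdeflate's public checksum entry point. A minimal editorial example against the documented libdeflate_crc32() API (an initial value of 0 starts a fresh gzip CRC-32):

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "libdeflate.h"

int main(void)
{
    const char *msg = "hello, world";
    /* Pass a previous return value back in as the first argument to
     * checksum a stream incrementally. */
    uint32_t crc = libdeflate_crc32(0, msg, strlen(msg));

    printf("crc32 = 0x%08x\n", (unsigned)crc);
    return 0;
}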
data/ext/deflate_ruby/libdeflate/x86/decompress_impl.h (new file)
@@ -0,0 +1,57 @@
+ #ifndef LIB_X86_DECOMPRESS_IMPL_H
+ #define LIB_X86_DECOMPRESS_IMPL_H
+
+ #include "cpu_features.h"
+
+ /*
+  * BMI2 optimized decompression function.
+  *
+  * With gcc and clang we just compile the whole function with
+  * __attribute__((target("bmi2"))), and the compiler uses bmi2 automatically.
+  *
+  * With MSVC, there is no target function attribute, but it's still possible to
+  * use bmi2 intrinsics explicitly. Currently we mostly don't, but there's a
+  * case in which we do (see below), so we at least take advantage of that.
+  * However, MSVC from VS2017 (toolset v141) apparently miscompiles the _bzhi_*()
+  * intrinsics. It seems to be fixed in VS2022. Hence, use MSVC_PREREQ(1930).
+  */
+ #if defined(__GNUC__) || defined(__clang__) || MSVC_PREREQ(1930)
+ # define deflate_decompress_bmi2 deflate_decompress_bmi2
+ # define FUNCNAME deflate_decompress_bmi2
+ # define ATTRIBUTES _target_attribute("bmi2")
+ /*
+  * Even with __attribute__((target("bmi2"))), gcc doesn't reliably use the
+  * bzhi instruction for 'word & BITMASK(count)'. So use the bzhi intrinsic
+  * explicitly. EXTRACT_VARBITS() is equivalent to 'word & BITMASK(count)';
+  * EXTRACT_VARBITS8() is equivalent to 'word & BITMASK((u8)count)'.
+  * Nevertheless, their implementation using the bzhi intrinsic is identical,
+  * as the bzhi instruction truncates the count to 8 bits implicitly.
+  */
+ # ifndef __clang__
+ #  ifdef ARCH_X86_64
+ #   define EXTRACT_VARBITS(word, count) _bzhi_u64((word), (count))
+ #   define EXTRACT_VARBITS8(word, count) _bzhi_u64((word), (count))
+ #  else
+ #   define EXTRACT_VARBITS(word, count) _bzhi_u32((word), (count))
+ #   define EXTRACT_VARBITS8(word, count) _bzhi_u32((word), (count))
+ #  endif
+ # endif
+ # include "../decompress_template.h"
+ #endif
+
+ #if defined(deflate_decompress_bmi2) && HAVE_BMI2_NATIVE
+ #define DEFAULT_IMPL deflate_decompress_bmi2
+ #else
+ static inline decompress_func_t
+ arch_select_decompress_func(void)
+ {
+ #ifdef deflate_decompress_bmi2
+     if (HAVE_BMI2(get_x86_cpu_features()))
+         return deflate_decompress_bmi2;
+ #endif
+     return NULL;
+ }
+ #define arch_select_decompress_func arch_select_decompress_func
+ #endif
+
+ #endif /* LIB_X86_DECOMPRESS_IMPL_H */
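
For context: when HAVE_BMI2_NATIVE holds, DEFAULT_IMPL resolves the choice at compile time; otherwise libdeflate's generic code calls arch_select_decompress_func() once at runtime and caches the result. A simplified, self-contained model of that "select once, then cache" pattern (names and the reduced signature are assumptions; the real wiring lives in libdeflate's deflate_decompress.c):

#include <stddef.h>
#include <stdio.h>

typedef int (*decompress_func_t)(const void *in, size_t in_nbytes);

/* Stand-in for the portable implementation. */
static int decompress_default(const void *in, size_t in_nbytes)
{
    (void)in;
    (void)in_nbytes;
    return 0;
}

/* Stand-in for the header above: return a better impl, or NULL if the
 * CPU lacks the needed features. */
static decompress_func_t arch_select_decompress_func(void)
{
    return NULL;
}

static decompress_func_t chosen_impl; /* resolved on first use */

static int decompress(const void *in, size_t in_nbytes)
{
    decompress_func_t f = chosen_impl;

    if (f == NULL) {
        f = arch_select_decompress_func();
        if (f == NULL)
            f = decompress_default;
        chosen_impl = f; /* benign race: every writer stores the same value */
    }
    return f(in, in_nbytes);
}

int main(void)
{
    char buf[4] = {0};

    printf("result = %d\n", decompress(buf, sizeof(buf)));
    return 0;
}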