deflate-ruby 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138)
  1. checksums.yaml +4 -4
  2. data/CLAUDE.md +95 -92
  3. data/LICENSE.txt +6 -6
  4. data/README.md +87 -65
  5. data/Rakefile +23 -0
  6. data/ext/deflate_ruby/{libdeflate/lib/x86/adler32_impl.h → adler32_impl.h} +8 -7
  7. data/ext/deflate_ruby/common_defs.h +748 -0
  8. data/ext/deflate_ruby/{libdeflate/lib/x86/cpu_features.c → cpu_features.c} +46 -16
  9. data/ext/deflate_ruby/{libdeflate/lib/x86/cpu_features.h → cpu_features.h} +2 -1
  10. data/ext/deflate_ruby/{libdeflate/lib/x86/crc32_impl.h → crc32_impl.h} +22 -23
  11. data/ext/deflate_ruby/{libdeflate/lib/crc32_multipliers.h → crc32_multipliers.h} +2 -4
  12. data/ext/deflate_ruby/{libdeflate/lib/x86/crc32_pclmul_template.h → crc32_pclmul_template.h} +23 -94
  13. data/ext/deflate_ruby/{libdeflate/lib/crc32_tables.h → crc32_tables.h} +1 -1
  14. data/ext/deflate_ruby/{libdeflate/lib/deflate_compress.c → deflate_compress.c} +59 -60
  15. data/ext/deflate_ruby/deflate_ruby.c +392 -218
  16. data/ext/deflate_ruby/deflate_ruby.h +6 -0
  17. data/ext/deflate_ruby/extconf.rb +35 -25
  18. data/ext/deflate_ruby/libdeflate/adler32.c +162 -0
  19. data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/adler32_impl.h +14 -7
  20. data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/crc32_impl.h +25 -31
  21. data/ext/deflate_ruby/libdeflate/arm/crc32_pmull_helpers.h +156 -0
  22. data/ext/deflate_ruby/libdeflate/arm/crc32_pmull_wide.h +226 -0
  23. data/ext/deflate_ruby/libdeflate/bt_matchfinder.h +342 -0
  24. data/ext/deflate_ruby/libdeflate/common_defs.h +2 -1
  25. data/ext/deflate_ruby/libdeflate/cpu_features_common.h +93 -0
  26. data/ext/deflate_ruby/libdeflate/crc32.c +262 -0
  27. data/ext/deflate_ruby/libdeflate/crc32_multipliers.h +375 -0
  28. data/ext/deflate_ruby/libdeflate/crc32_tables.h +587 -0
  29. data/ext/deflate_ruby/libdeflate/decompress_template.h +777 -0
  30. data/ext/deflate_ruby/libdeflate/deflate_compress.c +4128 -0
  31. data/ext/deflate_ruby/libdeflate/deflate_compress.h +15 -0
  32. data/ext/deflate_ruby/libdeflate/deflate_constants.h +56 -0
  33. data/ext/deflate_ruby/libdeflate/deflate_decompress.c +1208 -0
  34. data/ext/deflate_ruby/libdeflate/gzip_compress.c +90 -0
  35. data/ext/deflate_ruby/libdeflate/gzip_constants.h +45 -0
  36. data/ext/deflate_ruby/libdeflate/gzip_decompress.c +144 -0
  37. data/ext/deflate_ruby/libdeflate/hc_matchfinder.h +401 -0
  38. data/ext/deflate_ruby/libdeflate/ht_matchfinder.h +234 -0
  39. data/ext/deflate_ruby/libdeflate/lib_common.h +106 -0
  40. data/ext/deflate_ruby/libdeflate/libdeflate.h +2 -2
  41. data/ext/deflate_ruby/libdeflate/{lib/matchfinder_common.h → matchfinder_common.h} +3 -3
  42. data/ext/deflate_ruby/libdeflate/x86/adler32_impl.h +135 -0
  43. data/ext/deflate_ruby/libdeflate/x86/adler32_template.h +518 -0
  44. data/ext/deflate_ruby/libdeflate/x86/cpu_features.c +213 -0
  45. data/ext/deflate_ruby/libdeflate/x86/cpu_features.h +170 -0
  46. data/ext/deflate_ruby/libdeflate/x86/crc32_impl.h +159 -0
  47. data/ext/deflate_ruby/libdeflate/x86/crc32_pclmul_template.h +424 -0
  48. data/ext/deflate_ruby/libdeflate/x86/decompress_impl.h +57 -0
  49. data/ext/deflate_ruby/libdeflate.h +411 -0
  50. data/ext/deflate_ruby/matchfinder_common.h +224 -0
  51. data/ext/deflate_ruby/matchfinder_impl.h +122 -0
  52. data/ext/deflate_ruby/utils.c +141 -0
  53. data/ext/deflate_ruby/zlib_compress.c +82 -0
  54. data/ext/deflate_ruby/zlib_constants.h +21 -0
  55. data/ext/deflate_ruby/zlib_decompress.c +104 -0
  56. data/lib/deflate_ruby/version.rb +1 -1
  57. data/lib/deflate_ruby.rb +1 -63
  58. data/sig/deflate_ruby.rbs +4 -0
  59. data/test/test_deflate_ruby.rb +220 -0
  60. data/test/test_helper.rb +6 -0
  61. metadata +89 -144
  62. data/ext/deflate_ruby/libdeflate/CMakeLists.txt +0 -270
  63. data/ext/deflate_ruby/libdeflate/NEWS.md +0 -494
  64. data/ext/deflate_ruby/libdeflate/README.md +0 -228
  65. data/ext/deflate_ruby/libdeflate/libdeflate-config.cmake.in +0 -3
  66. data/ext/deflate_ruby/libdeflate/libdeflate.pc.in +0 -18
  67. data/ext/deflate_ruby/libdeflate/programs/CMakeLists.txt +0 -105
  68. data/ext/deflate_ruby/libdeflate/programs/benchmark.c +0 -696
  69. data/ext/deflate_ruby/libdeflate/programs/checksum.c +0 -218
  70. data/ext/deflate_ruby/libdeflate/programs/config.h.in +0 -19
  71. data/ext/deflate_ruby/libdeflate/programs/gzip.c +0 -688
  72. data/ext/deflate_ruby/libdeflate/programs/prog_util.c +0 -521
  73. data/ext/deflate_ruby/libdeflate/programs/prog_util.h +0 -225
  74. data/ext/deflate_ruby/libdeflate/programs/test_checksums.c +0 -200
  75. data/ext/deflate_ruby/libdeflate/programs/test_custom_malloc.c +0 -155
  76. data/ext/deflate_ruby/libdeflate/programs/test_incomplete_codes.c +0 -385
  77. data/ext/deflate_ruby/libdeflate/programs/test_invalid_streams.c +0 -130
  78. data/ext/deflate_ruby/libdeflate/programs/test_litrunlen_overflow.c +0 -72
  79. data/ext/deflate_ruby/libdeflate/programs/test_overread.c +0 -95
  80. data/ext/deflate_ruby/libdeflate/programs/test_slow_decompression.c +0 -472
  81. data/ext/deflate_ruby/libdeflate/programs/test_trailing_bytes.c +0 -151
  82. data/ext/deflate_ruby/libdeflate/programs/test_util.c +0 -237
  83. data/ext/deflate_ruby/libdeflate/programs/test_util.h +0 -61
  84. data/ext/deflate_ruby/libdeflate/programs/tgetopt.c +0 -118
  85. data/ext/deflate_ruby/libdeflate/scripts/android_build.sh +0 -118
  86. data/ext/deflate_ruby/libdeflate/scripts/android_tests.sh +0 -69
  87. data/ext/deflate_ruby/libdeflate/scripts/benchmark.sh +0 -10
  88. data/ext/deflate_ruby/libdeflate/scripts/checksum.sh +0 -10
  89. data/ext/deflate_ruby/libdeflate/scripts/checksum_benchmarks.sh +0 -253
  90. data/ext/deflate_ruby/libdeflate/scripts/cmake-helper.sh +0 -17
  91. data/ext/deflate_ruby/libdeflate/scripts/deflate_benchmarks.sh +0 -119
  92. data/ext/deflate_ruby/libdeflate/scripts/exec_tests.sh +0 -38
  93. data/ext/deflate_ruby/libdeflate/scripts/gen-release-archives.sh +0 -37
  94. data/ext/deflate_ruby/libdeflate/scripts/gen_bitreverse_tab.py +0 -19
  95. data/ext/deflate_ruby/libdeflate/scripts/gen_crc32_multipliers.c +0 -199
  96. data/ext/deflate_ruby/libdeflate/scripts/gen_crc32_tables.c +0 -105
  97. data/ext/deflate_ruby/libdeflate/scripts/gen_default_litlen_costs.py +0 -44
  98. data/ext/deflate_ruby/libdeflate/scripts/gen_offset_slot_map.py +0 -29
  99. data/ext/deflate_ruby/libdeflate/scripts/gzip_tests.sh +0 -523
  100. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_compress/corpus/0 +0 -0
  101. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_compress/fuzz.c +0 -95
  102. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_decompress/corpus/0 +0 -3
  103. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_decompress/fuzz.c +0 -62
  104. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/fuzz.sh +0 -108
  105. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/gzip_decompress/corpus/0 +0 -0
  106. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/gzip_decompress/fuzz.c +0 -19
  107. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/zlib_decompress/corpus/0 +0 -3
  108. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/zlib_decompress/fuzz.c +0 -19
  109. data/ext/deflate_ruby/libdeflate/scripts/run_tests.sh +0 -416
  110. data/ext/deflate_ruby/libdeflate/scripts/toolchain-i686-w64-mingw32.cmake +0 -8
  111. data/ext/deflate_ruby/libdeflate/scripts/toolchain-x86_64-w64-mingw32.cmake +0 -8
  112. /data/ext/deflate_ruby/{libdeflate/lib/adler32.c → adler32.c} +0 -0
  113. /data/ext/deflate_ruby/{libdeflate/lib/x86/adler32_template.h → adler32_template.h} +0 -0
  114. /data/ext/deflate_ruby/{libdeflate/lib/bt_matchfinder.h → bt_matchfinder.h} +0 -0
  115. /data/ext/deflate_ruby/{libdeflate/lib/cpu_features_common.h → cpu_features_common.h} +0 -0
  116. /data/ext/deflate_ruby/{libdeflate/lib/crc32.c → crc32.c} +0 -0
  117. /data/ext/deflate_ruby/{libdeflate/lib/arm/crc32_pmull_helpers.h → crc32_pmull_helpers.h} +0 -0
  118. /data/ext/deflate_ruby/{libdeflate/lib/arm/crc32_pmull_wide.h → crc32_pmull_wide.h} +0 -0
  119. /data/ext/deflate_ruby/{libdeflate/lib/x86/decompress_impl.h → decompress_impl.h} +0 -0
  120. /data/ext/deflate_ruby/{libdeflate/lib/decompress_template.h → decompress_template.h} +0 -0
  121. /data/ext/deflate_ruby/{libdeflate/lib/deflate_compress.h → deflate_compress.h} +0 -0
  122. /data/ext/deflate_ruby/{libdeflate/lib/deflate_constants.h → deflate_constants.h} +0 -0
  123. /data/ext/deflate_ruby/{libdeflate/lib/deflate_decompress.c → deflate_decompress.c} +0 -0
  124. /data/ext/deflate_ruby/{libdeflate/lib/gzip_compress.c → gzip_compress.c} +0 -0
  125. /data/ext/deflate_ruby/{libdeflate/lib/gzip_constants.h → gzip_constants.h} +0 -0
  126. /data/ext/deflate_ruby/{libdeflate/lib/gzip_decompress.c → gzip_decompress.c} +0 -0
  127. /data/ext/deflate_ruby/{libdeflate/lib/hc_matchfinder.h → hc_matchfinder.h} +0 -0
  128. /data/ext/deflate_ruby/{libdeflate/lib/ht_matchfinder.h → ht_matchfinder.h} +0 -0
  129. /data/ext/deflate_ruby/{libdeflate/lib/lib_common.h → lib_common.h} +0 -0
  130. /data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/cpu_features.c +0 -0
  131. /data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/cpu_features.h +0 -0
  132. /data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/matchfinder_impl.h +0 -0
  133. /data/ext/deflate_ruby/libdeflate/{lib/riscv → riscv}/matchfinder_impl.h +0 -0
  134. /data/ext/deflate_ruby/libdeflate/{lib/utils.c → utils.c} +0 -0
  135. /data/ext/deflate_ruby/libdeflate/{lib/x86 → x86}/matchfinder_impl.h +0 -0
  136. /data/ext/deflate_ruby/libdeflate/{lib/zlib_compress.c → zlib_compress.c} +0 -0
  137. /data/ext/deflate_ruby/libdeflate/{lib/zlib_constants.h → zlib_constants.h} +0 -0
  138. /data/ext/deflate_ruby/libdeflate/{lib/zlib_decompress.c → zlib_decompress.c} +0 -0
data/ext/deflate_ruby/cpu_features.c:

@@ -88,9 +88,31 @@ static const struct cpu_feature x86_cpu_feature_table[] = {
 
 volatile u32 libdeflate_x86_cpu_features = 0;
 
+static inline bool
+os_supports_avx512(u64 xcr0)
+{
+#ifdef __APPLE__
+	/*
+	 * The Darwin kernel had a bug where it could corrupt the opmask
+	 * registers. See
+	 * https://community.intel.com/t5/Software-Tuning-Performance/MacOS-Darwin-kernel-bug-clobbers-AVX-512-opmask-register-state/m-p/1327259
+	 * Darwin also does not initially set the XCR0 bits for AVX512, but they
+	 * are set if the thread tries to use AVX512 anyway.  Thus, to safely
+	 * and consistently use AVX512 on macOS we'd need to check the kernel
+	 * version as well as detect AVX512 support using a macOS-specific
+	 * method.  We don't bother with this, especially given Apple's
+	 * transition to arm64.
+	 */
+	return false;
+#else
+	return (xcr0 & 0xe6) == 0xe6;
+#endif
+}
+
 /*
- * Don't use 512-bit vectors on Intel CPUs before Rocket Lake and Sapphire
- * Rapids, due to the downclocking penalty.
+ * Don't use 512-bit vectors (ZMM registers) on Intel CPUs before Rocket Lake
+ * and Sapphire Rapids, due to the overly-eager downclocking which can reduce
+ * the performance of workloads that use ZMM registers only occasionally.
  */
 static inline bool
 allow_512bit_vectors(const u32 manufacturer[3], u32 family, u32 model)
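Note on the (xcr0 & 0xe6) == 0xe6 test: XCR0 is the register through which the OS declares which register states it saves and restores across context switches, so CPUID feature bits alone are not sufficient. A minimal standalone sketch of the same check (the helper names here are illustrative, and the inline asm assumes GCC/Clang; the gem's own read_xcr() helper may differ):

#include <stdbool.h>
#include <stdint.h>

/*
 * Read XCR0 via the xgetbv instruction.  Only valid to execute if CPUID
 * leaf 1 reported OSXSAVE (ECX bit 27), which the code below checks
 * before calling read_xcr(0).
 */
static uint64_t
read_xcr0(void)
{
	uint32_t lo, hi;

	__asm__ volatile(".byte 0x0f, 0x01, 0xd0" /* xgetbv, ECX=0 */
			 : "=a" (lo), "=d" (hi) : "c" (0));
	return ((uint64_t)hi << 32) | lo;
}

/*
 * 0xe6 covers XCR0 bits 1 (SSE/XMM state), 2 (AVX/YMM state), 5 (AVX-512
 * opmask), 6 (ZMM_Hi256), and 7 (Hi16_ZMM): every state AVX-512 code uses.
 */
static bool
os_saves_avx512_state(void)
{
	return (read_xcr0() & 0xe6) == 0xe6;
}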
@@ -140,7 +162,12 @@ void libdeflate_init_x86_cpu_features(void)
 	family += (a >> 20) & 0xff;
 	if (d & (1 << 26))
 		features |= X86_CPU_FEATURE_SSE2;
-	if (c & (1 << 1))
+	/*
+	 * No known CPUs have pclmulqdq without sse4.1, so in practice code
+	 * targeting pclmulqdq can use sse4.1 instructions.  But to be safe,
+	 * explicitly check for both the pclmulqdq and sse4.1 bits.
+	 */
+	if ((c & (1 << 1)) && (c & (1 << 19)))
 		features |= X86_CPU_FEATURE_PCLMULQDQ;
 	if (c & (1 << 27))
 		xcr0 = read_xcr(0);
@@ -152,21 +179,24 @@ void libdeflate_init_x86_cpu_features(void)
 
 	/* EAX=7, ECX=0: Extended Features */
 	cpuid(7, 0, &a, &b, &c, &d);
-	if ((b & (1 << 5)) && ((xcr0 & 0x6) == 0x6))
-		features |= X86_CPU_FEATURE_AVX2;
 	if (b & (1 << 8))
 		features |= X86_CPU_FEATURE_BMI2;
-	if (((xcr0 & 0xe6) == 0xe6) &&
-	    allow_512bit_vectors(manufacturer, family, model))
-		features |= X86_CPU_FEATURE_ZMM;
-	if ((b & (1 << 30)) && ((xcr0 & 0xe6) == 0xe6))
-		features |= X86_CPU_FEATURE_AVX512BW;
-	if ((b & (1U << 31)) && ((xcr0 & 0xe6) == 0xe6))
-		features |= X86_CPU_FEATURE_AVX512VL;
-	if ((c & (1 << 10)) && ((xcr0 & 0x6) == 0x6))
-		features |= X86_CPU_FEATURE_VPCLMULQDQ;
-	if ((c & (1 << 11)) && ((xcr0 & 0xe6) == 0xe6))
-		features |= X86_CPU_FEATURE_AVX512VNNI;
+	if ((xcr0 & 0x6) == 0x6) {
+		if (b & (1 << 5))
+			features |= X86_CPU_FEATURE_AVX2;
+		if (c & (1 << 10))
+			features |= X86_CPU_FEATURE_VPCLMULQDQ;
+	}
+	if (os_supports_avx512(xcr0)) {
+		if (allow_512bit_vectors(manufacturer, family, model))
+			features |= X86_CPU_FEATURE_ZMM;
+		if (b & (1 << 30))
+			features |= X86_CPU_FEATURE_AVX512BW;
+		if (b & (1U << 31))
+			features |= X86_CPU_FEATURE_AVX512VL;
+		if (c & (1 << 11))
+			features |= X86_CPU_FEATURE_AVX512VNNI;
+	}
 
 	/* EAX=7, ECX=1: Extended Features */
 	cpuid(7, 1, &a, &b, &c, &d);
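The bit positions tested above come from CPUID leaf 1's ECX register: bit 1 is PCLMULQDQ and bit 19 is SSE4.1. A standalone sketch of the same check using GCC/Clang's <cpuid.h> (the function name is illustrative, not taken from the gem):

#include <cpuid.h>
#include <stdbool.h>

static bool
cpu_has_pclmulqdq_with_sse41(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* Leaf 1: processor info and feature bits. */
	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return false;
	/* ECX bit 1 = pclmulqdq, ECX bit 19 = sse4.1. */
	return (ecx & (1 << 1)) && (ecx & (1 << 19));
}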
data/ext/deflate_ruby/cpu_features.h:

@@ -108,7 +108,8 @@ static inline u32 get_x86_cpu_features(void) { return 0; }
 # define HAVE_SSE2_NATIVE 0
 #endif
 
-#if defined(__PCLMUL__) || (defined(_MSC_VER) && defined(__AVX2__))
+#if (defined(__PCLMUL__) && defined(__SSE4_1__)) || \
+	(defined(_MSC_VER) && defined(__AVX2__))
 # define HAVE_PCLMULQDQ(features) 1
 #else
 # define HAVE_PCLMULQDQ(features) ((features) & X86_CPU_FEATURE_PCLMULQDQ)
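The point of this macro pair is that a build already targeting PCLMULQDQ and SSE4.1 compiles the runtime feature check away entirely. A hedged sketch of how such a macro is typically consumed (the dispatcher below is illustrative, not the gem's actual code; the named functions exist elsewhere in this diff):

#include <stddef.h>
#include <stdint.h>

typedef uint32_t u32;
typedef uint8_t u8;

/* Declarations of functions defined elsewhere in the extension. */
u32 get_x86_cpu_features(void);
u32 crc32_x86_pclmulqdq(u32 crc, const u8 *p, size_t len);
u32 crc32_slice1(u32 crc, const u8 *p, size_t len);

static u32
crc32_dispatch(u32 crc, const u8 *p, size_t len)
{
	/*
	 * When the translation unit is built with -mpclmul -msse4.1,
	 * HAVE_PCLMULQDQ(...) expands to the constant 1 and the compiler
	 * discards both the feature query and the fallback branch.
	 */
	if (HAVE_PCLMULQDQ(get_x86_cpu_features()))
		return crc32_x86_pclmulqdq(crc, p, len);
	return crc32_slice1(crc, p, len);
}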
data/ext/deflate_ruby/crc32_impl.h:

@@ -44,31 +44,26 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
 };
 
 #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
-/* PCLMULQDQ implementation */
+/*
+ * PCLMULQDQ implementation.  This targets PCLMULQDQ+SSE4.1, since in practice
+ * all CPUs that support PCLMULQDQ also support SSE4.1.
+ */
 # define crc32_x86_pclmulqdq crc32_x86_pclmulqdq
 # define SUFFIX _pclmulqdq
-# define ATTRIBUTES _target_attribute("pclmul")
+# define ATTRIBUTES _target_attribute("pclmul,sse4.1")
 # define VL 16
-# define USE_SSE4_1 0
 # define USE_AVX512 0
 # include "crc32_pclmul_template.h"
 
 /*
- * PCLMULQDQ/AVX implementation.  Compared to the regular PCLMULQDQ
- * implementation, this still uses 128-bit vectors, but it has two potential
- * benefits.  First, simply compiling against the AVX target can improve
- * performance significantly (e.g. 10100 MB/s to 16700 MB/s on Skylake) without
- * actually using any AVX intrinsics, probably due to the availability of
- * non-destructive VEX-encoded instructions.  Second, AVX support implies SSSE3
- * and SSE4.1 support, and we can use SSSE3 and SSE4.1 intrinsics for efficient
- * handling of partial blocks.  (We *could* compile a variant with
- * PCLMULQDQ+SSE4.1 without AVX, but for simplicity we currently don't bother.)
+ * PCLMULQDQ/AVX implementation.  Same as above, but this is compiled with AVX
+ * enabled so that the compiler can generate VEX-coded instructions which can be
+ * slightly more efficient.  It still uses 128-bit vectors.
  */
 # define crc32_x86_pclmulqdq_avx crc32_x86_pclmulqdq_avx
 # define SUFFIX _pclmulqdq_avx
 # define ATTRIBUTES _target_attribute("pclmul,avx")
 # define VL 16
-# define USE_SSE4_1 1
 # define USE_AVX512 0
 # include "crc32_pclmul_template.h"
 #endif
@@ -83,43 +78,47 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
  *
  * gcc 8.1 and 8.2 had a similar bug where they assumed that
  * _mm256_clmulepi64_epi128() always needed AVX512.  It's fixed in gcc 8.3.
+ *
+ * _mm256_zextsi128_si256() requires gcc 10.
  */
-#if GCC_PREREQ(8, 3) || CLANG_PREREQ(6, 0, 10000000)
+#if (GCC_PREREQ(10, 1) || CLANG_PREREQ(6, 0, 10000000)) && \
+	!defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_VPCLMULQDQ)
 # define crc32_x86_vpclmulqdq_avx2 crc32_x86_vpclmulqdq_avx2
 # define SUFFIX _vpclmulqdq_avx2
 # define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx2")
 # define VL 32
-# define USE_SSE4_1 1
 # define USE_AVX512 0
 # include "crc32_pclmul_template.h"
 #endif
 
-#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920)
+#if (GCC_PREREQ(10, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920)) && \
+	!defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_VPCLMULQDQ)
 /*
  * VPCLMULQDQ/AVX512 implementation using 256-bit vectors.  This is very similar
  * to the VPCLMULQDQ/AVX2 implementation but takes advantage of the vpternlog
- * instruction and more registers.  This is used on CPUs that support AVX-512
- * but where using 512-bit vectors causes downclocking.  This should also be the
- * optimal implementation on CPUs that support AVX10/256 but not AVX10/512.
+ * instruction and more registers.  This is used on certain older Intel CPUs,
+ * specifically Ice Lake and Tiger Lake, which support VPCLMULQDQ and AVX512 but
+ * downclock a bit too eagerly when ZMM registers are used.
+ *
+ * _mm256_zextsi128_si256() requires gcc 10.
  */
 # define crc32_x86_vpclmulqdq_avx512_vl256 crc32_x86_vpclmulqdq_avx512_vl256
 # define SUFFIX _vpclmulqdq_avx512_vl256
 # define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512bw,avx512vl")
 # define VL 32
-# define USE_SSE4_1 1
 # define USE_AVX512 1
 # include "crc32_pclmul_template.h"
 
 /*
  * VPCLMULQDQ/AVX512 implementation using 512-bit vectors.  This is used on CPUs
- * that have a good AVX-512 implementation including VPCLMULQDQ.  This should
- * also be the optimal implementation on CPUs that support AVX10/512.
+ * that have a good AVX-512 implementation including VPCLMULQDQ.
+ *
+ * _mm512_zextsi128_si512() requires gcc 10.
  */
 # define crc32_x86_vpclmulqdq_avx512_vl512 crc32_x86_vpclmulqdq_avx512_vl512
 # define SUFFIX _vpclmulqdq_avx512_vl512
 # define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512bw,avx512vl")
 # define VL 64
-# define USE_SSE4_1 1
 # define USE_AVX512 1
 # include "crc32_pclmul_template.h"
 #endif
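All of these variants rely on per-function target attributes (the values passed as ATTRIBUTES above): GCC and Clang allow an individual function to be compiled for a higher ISA level than the rest of the translation unit, with the runtime dispatcher choosing among the variants. A self-contained sketch of the underlying mechanism (the function name is illustrative):

#include <stdint.h>
#include <immintrin.h>

/*
 * Compiled for pclmul+sse4.1 even if the file as a whole targets baseline
 * x86-64.  Callers must verify CPU support first, as the cpu_features.c
 * changes above do.
 */
__attribute__((target("pclmul,sse4.1")))
static uint64_t
clmul_low64(uint64_t a, uint64_t b)
{
	__m128i r = _mm_clmulepi64_si128(_mm_cvtsi64_si128((long long)a),
					 _mm_cvtsi64_si128((long long)b),
					 0x00);
	/* The carryless product of two 64-bit polynomials is up to 127 bits;
	 * return the low 64 bits. */
	return (uint64_t)_mm_cvtsi128_si64(r);
}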
data/ext/deflate_ruby/crc32_multipliers.h:

@@ -1,7 +1,7 @@
 /*
  * crc32_multipliers.h - constants for CRC-32 folding
  *
- * THIS FILE WAS GENERATED BY gen_crc32_multipliers.c.  DO NOT EDIT.
+ * THIS FILE WAS GENERATED BY gen-crc32-consts.py.  DO NOT EDIT.
  */
 
 #define CRC32_X159_MODG 0xae689191 /* x^159 mod G(x) */

@@ -100,10 +100,8 @@
 #define CRC32_X4127_MODG 0x1072db28 /* x^4127 mod G(x) */
 #define CRC32_X4063_MODG 0x0c30f51d /* x^4063 mod G(x) */
 
-#define CRC32_X63_MODG 0xb8bc6765 /* x^63 mod G(x) */
-#define CRC32_BARRETT_CONSTANT_1 0x00000001f7011641ULL /* floor(x^64 / G(x)) */
+#define CRC32_BARRETT_CONSTANT_1 0xb4e5b025f7011641ULL /* floor(x^95 / G(x)) */
 #define CRC32_BARRETT_CONSTANT_2 0x00000001db710641ULL /* G(x) */
-#define CRC32_BARRETT_CONSTANTS { CRC32_BARRETT_CONSTANT_1, CRC32_BARRETT_CONSTANT_2 }
 
 #define CRC32_NUM_CHUNKS 4
 #define CRC32_MIN_VARIABLE_CHUNK_LEN 128UL
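Aside: this constant upgrade pairs with the rewritten final reduction in crc32_pclmul_template.h later in this diff. Version 1.0.1 folded 128 => 96 => 64 bits (using x^63 mod G(x)) and then did a 64 => 32 bit Barrett reduction with floor(x^64 / G(x)); version 1.0.2 keeps a wider remainder and uses floor(x^95 / G(x)) directly, so the extra folding steps and the CRC32_X63_MODG constant disappear. The Barrett identity itself, as spelled out by the comment this diff removes from crc32_pclmul_template.h, for a remaining message M(x) = A(x)*x^32 + B(x) over GF(2):

\[
q(x) \;=\; \left\lfloor \frac{A(x)\,x^{32}}{G(x)} \right\rfloor
      \;=\; \left\lfloor \frac{A(x)\cdot\left\lfloor x^{64}/G(x)\right\rfloor}{x^{32}} \right\rfloor,
\qquad
R(x) \;=\; M(x) \bmod G(x) \;=\; B(x) + \bigl(q(x)\,G(x) \bmod x^{32}\bigr)
\]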
data/ext/deflate_ruby/crc32_pclmul_template.h:

@@ -34,17 +34,13 @@
  * ATTRIBUTES:
  *	Target function attributes to use.  Must satisfy the dependencies of the
  *	other parameters as follows:
- *	   VL=16 && USE_SSE4_1=0 && USE_AVX512=0: at least pclmul
- *	   VL=16 && USE_SSE4_1=1 && USE_AVX512=0: at least pclmul,sse4.1
- *	   VL=32 && USE_SSE4_1=1 && USE_AVX512=0: at least vpclmulqdq,pclmul,avx2
- *	   VL=32 && USE_SSE4_1=1 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
- *	   VL=64 && USE_SSE4_1=1 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
+ *	   VL=16 && USE_AVX512=0: at least pclmul,sse4.1
+ *	   VL=32 && USE_AVX512=0: at least vpclmulqdq,pclmul,avx2
+ *	   VL=32 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
+ *	   VL=64 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
  *	(Other combinations are not useful and have not been tested.)
  * VL:
  *	Vector length in bytes.  Must be 16, 32, or 64.
- * USE_SSE4_1:
- *	If 1, take advantage of SSE4.1 instructions such as pblendvb.
- *	If 0, assume that the CPU might not support SSE4.1.
 * USE_AVX512:
  *	If 1, take advantage of AVX-512 features such as masking and the
  *	vpternlog instruction.  This doesn't enable the use of 512-bit vectors;
@@ -55,7 +51,10 @@
  * instructions.  Note that the x86 crc32 instruction cannot be used, as it is
  * for a different polynomial, not the gzip one.  For an explanation of CRC
  * folding with carryless multiplication instructions, see
- * scripts/gen_crc32_multipliers.c and the following paper:
+ * scripts/gen-crc32-consts.py and the following blog posts and papers:
+ *
+ * "An alternative exposition of crc32_4k_pclmulqdq"
+ * https://www.corsix.org/content/alternative-exposition-crc32_4k_pclmulqdq
  *
  * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
  * https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
@@ -81,7 +80,7 @@
 # define fold_vec fold_vec256
 # define VLOADU(p) _mm256_loadu_si256((const void *)(p))
 # define VXOR(a, b) _mm256_xor_si256((a), (b))
-# define M128I_TO_VEC(a) _mm256_castsi128_si256(a)
+# define M128I_TO_VEC(a) _mm256_zextsi128_si256(a)
 # define MULTS(a, b) _mm256_set_epi64x(a, b, a, b)
 # define MULTS_8V MULTS(CRC32_X2015_MODG, CRC32_X2079_MODG)
 # define MULTS_4V MULTS(CRC32_X991_MODG, CRC32_X1055_MODG)

@@ -92,7 +91,7 @@
 # define fold_vec fold_vec512
 # define VLOADU(p) _mm512_loadu_si512((const void *)(p))
 # define VXOR(a, b) _mm512_xor_si512((a), (b))
-# define M128I_TO_VEC(a) _mm512_castsi128_si512(a)
+# define M128I_TO_VEC(a) _mm512_zextsi128_si512(a)
 # define MULTS(a, b) _mm512_set_epi64(a, b, a, b, a, b, a, b)
 # define MULTS_8V MULTS(CRC32_X4063_MODG, CRC32_X4127_MODG)
 # define MULTS_4V MULTS(CRC32_X2015_MODG, CRC32_X2079_MODG)
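The cast-to-zext change matters because _mm256_castsi128_si256() leaves the upper 128 bits of the result undefined, whereas _mm256_zextsi128_si256() guarantees they are zero; the folding math XORs the widened value into freshly loaded data, so it assumes the upper lanes are zero. It also explains the GCC version bump to 10 noted in crc32_impl.h above. A minimal illustration (illustrative helper names; the zext intrinsics need gcc >= 10 or a recent clang):

#include <immintrin.h>

__attribute__((target("avx2")))
static __m256i
widen_cast(__m128i x)
{
	/* Upper 128 bits are undefined: cheap, but unsafe whenever the
	 * surrounding math assumes zeros there. */
	return _mm256_castsi128_si256(x);
}

__attribute__((target("avx2")))
static __m256i
widen_zext(__m128i x)
{
	/* Upper 128 bits are guaranteed zero. */
	return _mm256_zextsi128_si256(x);
}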
@@ -149,7 +148,6 @@ ADD_SUFFIX(fold_vec512)(__m512i src, __m512i dst, __m512i /* __v8du */ mults)
 #define fold_vec512 ADD_SUFFIX(fold_vec512)
 #endif /* VL >= 64 */
 
-#if USE_SSE4_1
 /*
  * Given 'x' containing a 16-byte polynomial, and a pointer 'p' that points to
  * the next '1 <= len <= 15' data bytes, rearrange the concatenation of 'x' and

@@ -181,7 +179,6 @@ ADD_SUFFIX(fold_lessthan16bytes)(__m128i x, const u8 *p, size_t len,
 	return fold_vec128(x0, x1, mults_128b);
 }
 #define fold_lessthan16bytes ADD_SUFFIX(fold_lessthan16bytes)
-#endif /* USE_SSE4_1 */
 
 static ATTRIBUTES u32
 ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
@@ -192,15 +189,13 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
 	 * folding across 128 bits.  mults_128b differs from mults_1v when
 	 * VL != 16.  All multipliers are 64-bit, to match what pclmulqdq needs,
 	 * but since this is for CRC-32 only their low 32 bits are nonzero.
-	 * For more details, see scripts/gen_crc32_multipliers.c.
+	 * For more details, see scripts/gen-crc32-consts.py.
 	 */
 	const vec_t mults_8v = MULTS_8V;
 	const vec_t mults_4v = MULTS_4V;
 	const vec_t mults_2v = MULTS_2V;
 	const vec_t mults_1v = MULTS_1V;
 	const __m128i mults_128b = _mm_set_epi64x(CRC32_X95_MODG, CRC32_X159_MODG);
-	const __m128i final_mult = _mm_set_epi64x(0, CRC32_X63_MODG);
-	const __m128i mask32 = _mm_set_epi32(0, 0, 0, 0xFFFFFFFF);
 	const __m128i barrett_reduction_constants =
 		_mm_set_epi64x(CRC32_BARRETT_CONSTANT_2, CRC32_BARRETT_CONSTANT_1);
 	vec_t v0, v1, v2, v3, v4, v5, v6, v7;
@@ -273,7 +268,6 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
 		size_t align = -(uintptr_t)p & (VL-1);
 
 		len -= align;
-#if USE_SSE4_1
 		x0 = _mm_xor_si128(_mm_loadu_si128((const void *)p), x0);
 		p += 16;
 		if (align & 15) {

@@ -296,11 +290,6 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
 		v0 = _mm512_inserti64x4(v0, *(const __m256i *)(p + 16), 1);
 # endif
 		p -= 16;
-#else
-		crc = crc32_slice1(crc, p, align);
-		p += align;
-		v0 = VXOR(VLOADU(p), M128I_TO_VEC(_mm_cvtsi32_si128(crc)));
-#endif
 	} else {
 		v0 = VXOR(VLOADU(p), M128I_TO_VEC(x0));
 	}
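Aside: in the context lines above, -(uintptr_t)p & (VL - 1) is the standard power-of-two alignment idiom; it yields the number of bytes from p to the next VL-byte boundary (0 if p is already aligned). A quick standalone illustration:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	for (uintptr_t addr = 64; addr < 72; addr++) {
		/* Bytes from 'addr' up to the next 64-byte boundary. */
		size_t align = (size_t)(-addr & (64 - 1));

		printf("offset %2ju -> align %2zu\n",
		       (uintmax_t)(addr % 64), align);
	}
	return 0;  /* prints align 0, 63, 62, ..., 57 */
}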
@@ -395,86 +384,27 @@ less_than_vl_remaining:
 less_than_16_remaining:
 	len &= 15;
 
-	/*
-	 * If fold_lessthan16bytes() is available, handle any remainder
-	 * of 1 to 15 bytes now, before reducing to 32 bits.
-	 */
-#if USE_SSE4_1
+	/* Handle any remainder of 1 to 15 bytes. */
 	if (len)
 		x0 = fold_lessthan16bytes(x0, p, len, mults_128b);
-#endif
 #if USE_AVX512
 reduce_x0:
 #endif
-
-	/*
-	 * Fold 128 => 96 bits.  This also implicitly appends 32 zero bits,
-	 * which is equivalent to multiplying by x^32.  This is needed because
-	 * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
-	 */
-	x0 = _mm_xor_si128(_mm_srli_si128(x0, 8),
-			   _mm_clmulepi64_si128(x0, mults_128b, 0x10));
-
-	/* Fold 96 => 64 bits. */
-	x0 = _mm_xor_si128(_mm_srli_si128(x0, 4),
-			   _mm_clmulepi64_si128(_mm_and_si128(x0, mask32),
-						final_mult, 0x00));
-
 	/*
-	 * Reduce 64 => 32 bits using Barrett reduction.
-	 *
-	 * Let M(x) = A(x)*x^32 + B(x) be the remaining message.  The goal is to
-	 * compute R(x) = M(x) mod G(x).  Since degree(B(x)) < degree(G(x)):
-	 *
-	 *	R(x) = (A(x)*x^32 + B(x)) mod G(x)
-	 *	     = (A(x)*x^32) mod G(x) + B(x)
-	 *
-	 * Then, by the Division Algorithm there exists a unique q(x) such that:
+	 * Multiply the remaining 128-bit message polynomial 'x0' by x^32, then
+	 * reduce it modulo the generator polynomial G.  This gives the CRC.
 	 *
-	 *	A(x)*x^32 mod G(x) = A(x)*x^32 - q(x)*G(x)
-	 *
-	 * Since the left-hand side is of maximum degree 31, the right-hand side
-	 * must be too.  This implies that we can apply 'mod x^32' to the
-	 * right-hand side without changing its value:
-	 *
-	 *	(A(x)*x^32 - q(x)*G(x)) mod x^32 = q(x)*G(x) mod x^32
-	 *
-	 * Note that '+' is equivalent to '-' in polynomials over GF(2).
-	 *
-	 * We also know that:
-	 *
-	 *	              / A(x)*x^32 \
-	 *	q(x) = floor ( ----------- )
-	 *	              \    G(x)   /
-	 *
-	 * To compute this efficiently, we can multiply the top and bottom by
-	 * x^32 and move the division by G(x) to the top:
-	 *
-	 *	              / A(x) * floor(x^64 / G(x)) \
-	 *	q(x) = floor ( --------------------------- )
-	 *	              \           x^32            /
-	 *
-	 * Note that floor(x^64 / G(x)) is a constant.
-	 *
-	 * So finally we have:
-	 *
-	 *	                          / A(x) * floor(x^64 / G(x)) \
-	 *	R(x) = B(x) + G(x)*floor ( --------------------------- )
-	 *	                          \           x^32            /
+	 * This implementation matches that used in crc-pclmul-template.S from
+	 * https://lore.kernel.org/r/20250210174540.161705-4-ebiggers@kernel.org/
+	 * with the parameters n=32 and LSB_CRC=1 (what the gzip CRC uses).  See
+	 * there for a detailed explanation of the math used here.
 	 */
-	x1 = _mm_clmulepi64_si128(_mm_and_si128(x0, mask32),
-				  barrett_reduction_constants, 0x00);
-	x1 = _mm_clmulepi64_si128(_mm_and_si128(x1, mask32),
-				  barrett_reduction_constants, 0x10);
+	x0 = _mm_xor_si128(_mm_clmulepi64_si128(x0, mults_128b, 0x10),
+			   _mm_bsrli_si128(x0, 8));
+	x1 = _mm_clmulepi64_si128(x0, barrett_reduction_constants, 0x00);
+	x1 = _mm_clmulepi64_si128(x1, barrett_reduction_constants, 0x10);
 	x0 = _mm_xor_si128(x0, x1);
-#if USE_SSE4_1
-	crc = _mm_extract_epi32(x0, 1);
-#else
-	crc = _mm_cvtsi128_si32(_mm_shuffle_epi32(x0, 0x01));
-	/* Process up to 15 bytes left over at the end. */
-	crc = crc32_slice1(crc, p, len);
-#endif
-	return crc;
+	return _mm_extract_epi32(x0, 2);
 }
 
 #undef vec_t
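The new tail implements, in a handful of carryless multiplies, exactly what the gzip CRC definition requires: multiply the remaining message polynomial by x^32 and reduce modulo G. A slow bit-at-a-time reference implementation of the same checksum is handy for validating the vectorized paths (a hedged sketch; the gem's actual generic fallback, crc32_slice1(), is table-based):

#include <stddef.h>
#include <stdint.h>

/*
 * Bitwise gzip CRC-32.  0xedb88320 is the usual right-shift form of the
 * bit-reflected generator polynomial, which appears in the constants
 * above as 0x1db710641 (i.e. (0xedb88320 << 1) | 1).
 */
static uint32_t
crc32_bitwise(uint32_t crc, const uint8_t *p, size_t len)
{
	crc = ~crc;
	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ ((crc & 1) ? 0xedb88320 : 0);
	}
	return ~crc;
}

/* Sanity check: crc32_bitwise(0, (const uint8_t *)"123456789", 9)
 * should return 0xcbf43926, the standard CRC-32 check value. */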
@@ -491,5 +421,4 @@ reduce_x0:
 #undef SUFFIX
 #undef ATTRIBUTES
 #undef VL
-#undef USE_SSE4_1
 #undef USE_AVX512
data/ext/deflate_ruby/crc32_tables.h:

@@ -1,7 +1,7 @@
 /*
  * crc32_tables.h - data tables for CRC-32 computation
  *
- * THIS FILE WAS GENERATED BY gen_crc32_tables.c.  DO NOT EDIT.
+ * THIS FILE WAS GENERATED BY gen-crc32-consts.py.  DO NOT EDIT.
  */
 
 static const u32 crc32_slice1_table[] MAYBE_UNUSED = {