RubyGems - libdeflate - Versions diffs - 0.1.1 → 0.2.0 - Mend

libdeflate 0.1.1 → 0.2.0

Files changed (79) hide show

checksums.yaml +5 -5
data/.github/workflows/test.yml +34 -0
data/README.md +1 -6
data/ext/libdeflate/extconf.rb +18 -7
data/ext/libdeflate/libdeflate_ext.c +17 -17
data/lib/libdeflate/version.rb +1 -1
data/libdeflate.gemspec +2 -1
metadata +13 -84
data/.gitmodules +0 -3
data/.travis.yml +0 -5
data/ext/libdeflate/libdeflate/.gitignore +0 -19
data/ext/libdeflate/libdeflate/COPYING +0 -21
data/ext/libdeflate/libdeflate/Makefile +0 -231
data/ext/libdeflate/libdeflate/Makefile.msc +0 -64
data/ext/libdeflate/libdeflate/NEWS +0 -57
data/ext/libdeflate/libdeflate/README.md +0 -170
data/ext/libdeflate/libdeflate/common/common_defs.h +0 -351
data/ext/libdeflate/libdeflate/common/compiler_gcc.h +0 -134
data/ext/libdeflate/libdeflate/common/compiler_msc.h +0 -95
data/ext/libdeflate/libdeflate/lib/adler32.c +0 -213
data/ext/libdeflate/libdeflate/lib/adler32_impl.h +0 -281
data/ext/libdeflate/libdeflate/lib/aligned_malloc.c +0 -57
data/ext/libdeflate/libdeflate/lib/aligned_malloc.h +0 -13
data/ext/libdeflate/libdeflate/lib/bt_matchfinder.h +0 -357
data/ext/libdeflate/libdeflate/lib/crc32.c +0 -368
data/ext/libdeflate/libdeflate/lib/crc32_impl.h +0 -286
data/ext/libdeflate/libdeflate/lib/crc32_table.h +0 -526
data/ext/libdeflate/libdeflate/lib/decompress_impl.h +0 -404
data/ext/libdeflate/libdeflate/lib/deflate_compress.c +0 -2817
data/ext/libdeflate/libdeflate/lib/deflate_compress.h +0 -14
data/ext/libdeflate/libdeflate/lib/deflate_constants.h +0 -66
data/ext/libdeflate/libdeflate/lib/deflate_decompress.c +0 -889
data/ext/libdeflate/libdeflate/lib/gzip_compress.c +0 -95
data/ext/libdeflate/libdeflate/lib/gzip_constants.h +0 -45
data/ext/libdeflate/libdeflate/lib/gzip_decompress.c +0 -130
data/ext/libdeflate/libdeflate/lib/hc_matchfinder.h +0 -405
data/ext/libdeflate/libdeflate/lib/lib_common.h +0 -35
data/ext/libdeflate/libdeflate/lib/matchfinder_avx2.h +0 -53
data/ext/libdeflate/libdeflate/lib/matchfinder_common.h +0 -205
data/ext/libdeflate/libdeflate/lib/matchfinder_neon.h +0 -61
data/ext/libdeflate/libdeflate/lib/matchfinder_sse2.h +0 -53
data/ext/libdeflate/libdeflate/lib/unaligned.h +0 -202
data/ext/libdeflate/libdeflate/lib/x86_cpu_features.c +0 -169
data/ext/libdeflate/libdeflate/lib/x86_cpu_features.h +0 -48
data/ext/libdeflate/libdeflate/lib/zlib_compress.c +0 -87
data/ext/libdeflate/libdeflate/lib/zlib_constants.h +0 -21
data/ext/libdeflate/libdeflate/lib/zlib_decompress.c +0 -91
data/ext/libdeflate/libdeflate/libdeflate.h +0 -274
data/ext/libdeflate/libdeflate/programs/benchmark.c +0 -558
data/ext/libdeflate/libdeflate/programs/checksum.c +0 -197
data/ext/libdeflate/libdeflate/programs/detect.sh +0 -62
data/ext/libdeflate/libdeflate/programs/gzip.c +0 -603
data/ext/libdeflate/libdeflate/programs/prog_util.c +0 -530
data/ext/libdeflate/libdeflate/programs/prog_util.h +0 -162
data/ext/libdeflate/libdeflate/programs/test_checksums.c +0 -135
data/ext/libdeflate/libdeflate/programs/tgetopt.c +0 -118
data/ext/libdeflate/libdeflate/tools/afl-fuzz/Makefile +0 -12
data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_compress/fuzz.c +0 -40
data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_compress/inputs/0 +0 -0
data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_decompress/fuzz.c +0 -28
data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_decompress/inputs/0 +0 -3
data/ext/libdeflate/libdeflate/tools/afl-fuzz/gzip_decompress/fuzz.c +0 -28
data/ext/libdeflate/libdeflate/tools/afl-fuzz/gzip_decompress/inputs/0 +0 -0
data/ext/libdeflate/libdeflate/tools/afl-fuzz/prepare_for_fuzz.sh +0 -14
data/ext/libdeflate/libdeflate/tools/afl-fuzz/zlib_decompress/fuzz.c +0 -28
data/ext/libdeflate/libdeflate/tools/afl-fuzz/zlib_decompress/inputs/0 +0 -3
data/ext/libdeflate/libdeflate/tools/android_build.sh +0 -104
data/ext/libdeflate/libdeflate/tools/checksum_benchmarks.sh +0 -76
data/ext/libdeflate/libdeflate/tools/exec_tests.sh +0 -30
data/ext/libdeflate/libdeflate/tools/gen_crc32_multipliers.c +0 -108
data/ext/libdeflate/libdeflate/tools/gen_crc32_table.c +0 -100
data/ext/libdeflate/libdeflate/tools/gzip_tests.sh +0 -412
data/ext/libdeflate/libdeflate/tools/make-windows-releases +0 -21
data/ext/libdeflate/libdeflate/tools/mips_build.sh +0 -9
data/ext/libdeflate/libdeflate/tools/msc_test.bat +0 -3
data/ext/libdeflate/libdeflate/tools/pgo_build.sh +0 -23
data/ext/libdeflate/libdeflate/tools/produce_gzip_benchmark_table.sh +0 -37
data/ext/libdeflate/libdeflate/tools/run_tests.sh +0 -305
data/ext/libdeflate/libdeflate/tools/windows_build.sh +0 -10

data/ext/libdeflate/libdeflate/lib/crc32.c DELETED Viewed

@@ -1,368 +0,0 @@
-/*
- * crc32.c - CRC-32 checksum algorithm for the gzip format
- *
- * Originally public domain; changes after 2016-09-07 are copyrighted.
- *
- * Copyright 2016 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-/*
- * High-level description of CRC
- * =============================
- *
- * Consider a bit sequence 'bits[1...len]'.  Interpret 'bits' as the "message"
- * polynomial M(x) with coefficients in GF(2) (the field of integers modulo 2),
- * where the coefficient of 'x^i' is 'bits[len - i]'.  Then, compute:
- *
- *			R(x) = M(x)*x^n mod G(x)
- *
- * where G(x) is a selected "generator" polynomial of degree 'n'.  The remainder
- * R(x) is a polynomial of max degree 'n - 1'.  The CRC of 'bits' is R(x)
- * interpreted as a bitstring of length 'n'.
- *
- * CRC used in gzip
- * ================
- *
- * In the gzip format (RFC 1952):
- *
- *	- The bitstring to checksum is formed from the bytes of the uncompressed
- *	  data by concatenating the bits from the bytes in order, proceeding
- *	  from the low-order bit to the high-order bit within each byte.
- *
- *	- The generator polynomial G(x) is: x^32 + x^26 + x^23 + x^22 + x^16 +
- *	  x^12 + x^11 + x^10 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1.
- *	  Consequently, the CRC length is 32 bits ("CRC-32").
- *
- *	- The highest order 32 coefficients of M(x)*x^n are inverted.
- *
- *	- All 32 coefficients of R(x) are inverted.
- *
- * The two inversions cause added leading and trailing zero bits to affect the
- * resulting CRC, whereas with a regular CRC such bits would have no effect on
- * the CRC.
- *
- * Computation and optimizations
- * =============================
- *
- * We can compute R(x) through "long division", maintaining only 32 bits of
- * state at any given time.  Multiplication by 'x' can be implemented as
- * right-shifting by 1 (assuming the polynomial<=>bitstring mapping where the
- * highest order bit represents the coefficient of x^0), and both addition and
- * subtraction can be implemented as bitwise exclusive OR (since we are working
- * in GF(2)).  Here is an unoptimized implementation:
- *
- *	static u32 crc32_gzip(const u8 *buffer, size_t nbytes)
- *	{
- *		u32 remainder = 0;
- *		const u32 divisor = 0xEDB88320;
- *
- *		for (size_t i = 0; i < nbytes * 8 + 32; i++) {
- *			int bit;
- *			u32 multiple;
- *
- *			if (i < nbytes * 8)
- *				bit = (buffer[i / 8] >> (i % 8)) & 1;
- *			else
- *				bit = 0; // one of the 32 appended 0 bits
- *
- *			if (i < 32) // the first 32 bits are inverted
- *				bit ^= 1;
- *
- *			if (remainder & 1)
- *				multiple = divisor;
- *			else
- *				multiple = 0;
- *
- *			remainder >>= 1;
- *			remainder |= (u32)bit << 31;
- *			remainder ^= multiple;
- *		}
- *
- *		return ~remainder;
- *	}
- *
- * In this implementation, the 32-bit integer 'remainder' maintains the
- * remainder of the currently processed portion of the message (with 32 zero
- * bits appended) when divided by the generator polynomial.  'remainder' is the
- * representation of R(x), and 'divisor' is the representation of G(x) excluding
- * the x^32 coefficient.  For each bit to process, we multiply R(x) by 'x^1',
- * then add 'x^0' if the new bit is a 1.  If this causes R(x) to gain a nonzero
- * x^32 term, then we subtract G(x) from R(x).
- *
- * We can speed this up by taking advantage of the fact that XOR is commutative
- * and associative, so the order in which we combine the inputs into 'remainder'
- * is unimportant.  And since each message bit we add doesn't affect the choice
- * of 'multiple' until 32 bits later, we need not actually add each message bit
- * until that point:
- *
- *	static u32 crc32_gzip(const u8 *buffer, size_t nbytes)
- *	{
- *		u32 remainder = ~0;
- *		const u32 divisor = 0xEDB88320;
- *
- *		for (size_t i = 0; i < nbytes * 8; i++) {
- *			int bit;
- *			u32 multiple;
- *
- *			bit = (buffer[i / 8] >> (i % 8)) & 1;
- *			remainder ^= bit;
- *			if (remainder & 1)
- *				multiple = divisor;
- *			else
- *				multiple = 0;
- *			remainder >>= 1;
- *			remainder ^= multiple;
- *		}
- *
- *		return ~remainder;
- *	}
- *
- * With the above implementation we get the effect of 32 appended 0 bits for
- * free; they never affect the choice of a divisor, nor would they change the
- * value of 'remainder' if they were to be actually XOR'ed in.  And by starting
- * with a remainder of all 1 bits, we get the effect of complementing the first
- * 32 message bits.
- *
- * The next optimization is to process the input in multi-bit units.  Suppose
- * that we insert the next 'n' message bits into the remainder.  Then we get an
- * intermediate remainder of length '32 + n' bits, and the CRC of the extra 'n'
- * bits is the amount by which the low 32 bits of the remainder will change as a
- * result of cancelling out those 'n' bits.  Taking n=8 (one byte) and
- * precomputing a table containing the CRC of each possible byte, we get
- * crc32_slice1() defined below.
- *
- * As a further optimization, we could increase the multi-bit unit size to 16.
- * However, that is inefficient because the table size explodes from 256 entries
- * (1024 bytes) to 65536 entries (262144 bytes), which wastes memory and won't
- * fit in L1 cache on typical processors.
- *
- * However, we can actually process 4 bytes at a time using 4 different tables
- * with 256 entries each.  Logically, we form a 64-bit intermediate remainder
- * and cancel out the high 32 bits in 8-bit chunks.  Bits 32-39 are cancelled
- * out by the CRC of those bits, whereas bits 40-47 are cancelled out by the
- * CRC of those bits with 8 zero bits appended, and so on.  This method is
- * implemented in crc32_slice4(), defined below.
- *
- * In crc32_slice8(), this method is extended to 8 bytes at a time.  The
- * intermediate remainder (which we never actually store explicitly) is 96 bits.
- *
- * On CPUs that support fast carryless multiplication, CRCs can be computed even
- * more quickly via "folding".  See crc32_pclmul() for an example.
- */
-#include "x86_cpu_features.h"
-#include "libdeflate.h"
-/* Select the implementations to compile in. */
-#define NEED_GENERIC_IMPL 1 /* include generic impl unless overridden */
-#define DEFAULT_IMPL crc32_slice8
-/* Include the PCLMUL implementation? */
-#define NEED_PCLMUL_IMPL 0
-#if defined(__PCLMUL__) || \
-	(X86_CPU_FEATURES_ENABLED && COMPILER_SUPPORTS_PCLMUL_TARGET &&	\
-	 COMPILER_SUPPORTS_TARGET_INTRINSICS)
-#  include <wmmintrin.h>
-#  undef NEED_PCLMUL_IMPL
-#  define NEED_PCLMUL_IMPL 1
-#  ifdef __PCLMUL__ /* compiling for PCLMUL, i.e. can we assume it's there? */
-#    undef NEED_GENERIC_IMPL
-#    define NEED_GENERIC_IMPL 0 /* generic impl not needed */
-#    undef DEFAULT_IMPL
-#    define DEFAULT_IMPL crc32_pclmul
-#  endif /* otherwise, we can build a PCLMUL version, but we won't know whether
-	    we can use it until runtime */
-#endif
-/*
- * Include the PCLMUL/AVX implementation?  Although our PCLMUL-optimized CRC-32
- * function doesn't use any AVX intrinsics specifically, it can benefit a lot
- * from being compiled for an AVX target: on Skylake, ~16700 MB/s vs. ~10100
- * MB/s.  I expect this is related to the PCLMULQDQ instructions being assembled
- * in the newer three-operand form rather than the older two-operand form.
- *
- * Note: this is only needed if __AVX__ is *not* defined, since otherwise the
- * "regular" PCLMUL implementation would already be AVX enabled.
- */
-#define NEED_PCLMUL_AVX_IMPL 0
-#if NEED_PCLMUL_IMPL && !defined(__AVX__) && \
-	 X86_CPU_FEATURES_ENABLED && COMPILER_SUPPORTS_AVX_TARGET
-#  undef NEED_PCLMUL_AVX_IMPL
-#  define NEED_PCLMUL_AVX_IMPL 1
-#endif
-#define NUM_IMPLS (NEED_GENERIC_IMPL + NEED_PCLMUL_IMPL + NEED_PCLMUL_AVX_IMPL)
-/* Define the CRC-32 table */
-#if NEED_GENERIC_IMPL
-#  define CRC32_SLICE8
-#else
-#  define CRC32_SLICE1 /* only need short table for unaligned ends */
-#endif
-#include "crc32_table.h"
-static forceinline u32
-crc32_update_byte(u32 remainder, u8 next_byte)
-{
-	return (remainder >> 8) ^ crc32_table[(u8)remainder ^ next_byte];
-}
-#if defined(CRC32_SLICE1) || (NUM_IMPLS > NEED_GENERIC_IMPL)
-static u32
-crc32_slice1(u32 remainder, const u8 *buffer, size_t nbytes)
-{
-	size_t i;
-	STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x100);
-	for (i = 0; i < nbytes; i++)
-		remainder = crc32_update_byte(remainder, buffer[i]);
-	return remainder;
-}
-#endif
-#ifdef CRC32_SLICE4
-static u32
-crc32_slice4(u32 remainder, const u8 *buffer, size_t nbytes)
-{
-	const u8 *p = buffer;
-	const u8 *end = buffer + nbytes;
-	const u8 *end32;
-	STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x400);
-	for (; ((uintptr_t)p & 3) && p != end; p++)
-		remainder = crc32_update_byte(remainder, *p);
-	end32 = p + ((end - p) & ~3);
-	for (; p != end32; p += 4) {
-		u32 v = le32_bswap(*(const u32 *)p);
-		remainder =
-		    crc32_table[0x300 + (u8)((remainder ^ v) >>  0)] ^
-		    crc32_table[0x200 + (u8)((remainder ^ v) >>  8)] ^
-		    crc32_table[0x100 + (u8)((remainder ^ v) >> 16)] ^
-		    crc32_table[0x000 + (u8)((remainder ^ v) >> 24)];
-	}
-	for (; p != end; p++)
-		remainder = crc32_update_byte(remainder, *p);
-	return remainder;
-}
-#endif
-#ifdef CRC32_SLICE8
-static u32
-crc32_slice8(u32 remainder, const u8 *buffer, size_t nbytes)
-{
-	const u8 *p = buffer;
-	const u8 *end = buffer + nbytes;
-	const u8 *end64;
-	STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x800);
-	for (; ((uintptr_t)p & 7) && p != end; p++)
-		remainder = crc32_update_byte(remainder, *p);
-	end64 = p + ((end - p) & ~7);
-	for (; p != end64; p += 8) {
-		u32 v1 = le32_bswap(*(const u32 *)(p + 0));
-		u32 v2 = le32_bswap(*(const u32 *)(p + 4));
-		remainder =
-		    crc32_table[0x700 + (u8)((remainder ^ v1) >>  0)] ^
-		    crc32_table[0x600 + (u8)((remainder ^ v1) >>  8)] ^
-		    crc32_table[0x500 + (u8)((remainder ^ v1) >> 16)] ^
-		    crc32_table[0x400 + (u8)((remainder ^ v1) >> 24)] ^
-		    crc32_table[0x300 + (u8)(v2 >>  0)] ^
-		    crc32_table[0x200 + (u8)(v2 >>  8)] ^
-		    crc32_table[0x100 + (u8)(v2 >> 16)] ^
-		    crc32_table[0x000 + (u8)(v2 >> 24)];
-	}
-	for (; p != end; p++)
-		remainder = crc32_update_byte(remainder, *p);
-	return remainder;
-}
-#endif
-/* Define the PCLMUL implementation if needed. */
-#if NEED_PCLMUL_IMPL
-#  define FUNCNAME		crc32_pclmul
-#  define FUNCNAME_ALIGNED	crc32_pclmul_aligned
-#  ifdef __PCLMUL__
-#    define ATTRIBUTES
-#  else
-#    define ATTRIBUTES		__attribute__((target("pclmul")))
-#  endif
-#  include "crc32_impl.h"
-#endif
-/* Define the PCLMUL/AVX implementation if needed. */
-#if NEED_PCLMUL_AVX_IMPL
-#  define FUNCNAME		crc32_pclmul_avx
-#  define FUNCNAME_ALIGNED	crc32_pclmul_avx_aligned
-#  define ATTRIBUTES		__attribute__((target("pclmul,avx")))
-#  include "crc32_impl.h"
-#endif
-typedef u32 (*crc32_func_t)(u32, const u8 *, size_t);
-/*
- * If multiple implementations are available, then dispatch among them based on
- * CPU features at runtime.  Otherwise just call the single one directly.
- */
-#if NUM_IMPLS == 1
-#  define crc32_impl DEFAULT_IMPL
-#else
-static u32 dispatch(u32, const u8 *, size_t);
-static crc32_func_t crc32_impl = dispatch;
-static u32 dispatch(u32 remainder, const u8 *buffer, size_t nbytes)
-{
-	crc32_func_t f = DEFAULT_IMPL;
-#if NEED_PCLMUL_IMPL && !defined(__PCLMUL__)
-	if (x86_have_cpu_features(X86_CPU_FEATURE_PCLMULQDQ))
-		f = crc32_pclmul;
-#endif
-#if NEED_PCLMUL_AVX_IMPL
-	if (x86_have_cpu_features(X86_CPU_FEATURE_PCLMULQDQ |
-				  X86_CPU_FEATURE_AVX))
-		f = crc32_pclmul_avx;
-#endif
-	crc32_impl = f;
-	return crc32_impl(remainder, buffer, nbytes);
-}
-#endif /* NUM_IMPLS != 1 */
-LIBDEFLATEAPI u32
-libdeflate_crc32(u32 remainder, const void *buffer, size_t nbytes)
-{
-	if (buffer == NULL) /* return initial value */
-		return 0;
-	return ~crc32_impl(~remainder, buffer, nbytes);
-}

data/ext/libdeflate/libdeflate/lib/crc32_impl.h DELETED Viewed

@@ -1,286 +0,0 @@
-/*
- * crc32_impl.h
- *
- * Copyright 2016 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-/*
- * CRC-32 folding with PCLMULQDQ.
- *
- * The basic idea is to repeatedly "fold" each 512 bits into the next 512 bits,
- * producing an abbreviated message which is congruent to the original message
- * modulo the generator polynomial G(x).
- *
- * Folding each 512 bits is implemented as eight 64-bit folds, each of which
- * uses one carryless multiplication instruction.  It's expected that CPUs may
- * be able to execute some of these multiplications in parallel.
- *
- * Explanation of "folding": let A(x) be 64 bits from the message, and let B(x)
- * be 95 bits from a constant distance D later in the message.  The relevant
- * portion of the message can be written as:
- *
- *	M(x) = A(x)*x^D + B(x)
- *
- * ... where + and * represent addition and multiplication, respectively, of
- * polynomials over GF(2).  Note that when implemented on a computer, these
- * operations are equivalent to XOR and carryless multiplication, respectively.
- *
- * For the purpose of CRC calculation, only the remainder modulo the generator
- * polynomial G(x) matters:
- *
- *	M(x) mod G(x) = (A(x)*x^D + B(x)) mod G(x)
- *
- * Since the modulo operation can be applied anywhere in a sequence of additions
- * and multiplications without affecting the result, this is equivalent to:
- *
- *	M(x) mod G(x) = (A(x)*(x^D mod G(x)) + B(x)) mod G(x)
- *
- * For any D, 'x^D mod G(x)' will be a polynomial with maximum degree 31, i.e.
- * a 32-bit quantity.  So 'A(x) * (x^D mod G(x))' is equivalent to a carryless
- * multiplication of a 64-bit quantity by a 32-bit quantity, producing a 95-bit
- * product.  Then, adding (XOR-ing) the product to B(x) produces a polynomial
- * with the same length as B(x) but with the same remainder as 'A(x)*x^D +
- * B(x)'.  This is the basic fold operation with 64 bits.
- *
- * Note that the carryless multiplication instruction PCLMULQDQ actually takes
- * two 64-bit inputs and produces a 127-bit product in the low-order bits of a
- * 128-bit XMM register.  This works fine, but care must be taken to account for
- * "bit endianness".  With the CRC version implemented here, bits are always
- * ordered such that the lowest-order bit represents the coefficient of highest
- * power of x and the highest-order bit represents the coefficient of the lowest
- * power of x.  This is backwards from the more intuitive order.  Still,
- * carryless multiplication works essentially the same either way.  It just must
- * be accounted for that when we XOR the 95-bit product in the low-order 95 bits
- * of a 128-bit XMM register into 128-bits of later data held in another XMM
- * register, we'll really be XOR-ing the product into the mathematically higher
- * degree end of those later bits, not the lower degree end as may be expected.
- *
- * So given that caveat and the fact that we process 512 bits per iteration, the
- * 'D' values we need for the two 64-bit halves of each 128 bits of data are:
- *
- *	D = (512 + 95) - 64	 for the higher-degree half of each 128 bits,
- *				 i.e. the lower order bits in the XMM register
- *
- *	D = (512 + 95) - 128	 for the lower-degree half of each 128 bits,
- *				 i.e. the higher order bits in the XMM register
- *
- * The required 'x^D mod G(x)' values were precomputed.
- *
- * When <= 512 bits remain in the message, we finish up by folding across
- * smaller distances.  This works similarly; the distance D is just different,
- * so different constant multipliers must be used.  Finally, once the remaining
- * message is just 64 bits, it is reduced to the CRC-32 using Barrett
- * reduction (explained later).
- *
- * For more information see the original paper from Intel:
- *	"Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
- *	December 2009
- *	http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
- */
-static u32 ATTRIBUTES
-FUNCNAME_ALIGNED(u32 remainder, const __m128i *p, size_t vec_count)
-{
-	/* Constants precomputed by gen_crc32_multipliers.c.  Do not edit! */
-	const __v2di multipliers_4 = (__v2di){ 0x8F352D95, 0x1D9513D7 };
-	const __v2di multipliers_2 = (__v2di){ 0xF1DA05AA, 0x81256527 };
-	const __v2di multipliers_1 = (__v2di){ 0xAE689191, 0xCCAA009E };
-	const __v2di final_multiplier = (__v2di){ 0xB8BC6765 };
-	const __m128i mask32 = (__m128i)(__v4si){ 0xFFFFFFFF };
-	const __v2di barrett_reduction_constants =
-			(__v2di){ 0x00000001F7011641, 0x00000001DB710641 };
-	const __m128i * const end = p + vec_count;
-	const __m128i * const end512 = p + (vec_count & ~3);
-	__m128i x0, x1, x2, x3;
-	/*
-	 * Account for the current 'remainder', i.e. the CRC of the part of the
-	 * message already processed.  Explanation: rewrite the message
-	 * polynomial M(x) in terms of the first part A(x), the second part
-	 * B(x), and the length of the second part in bits |B(x)| >= 32:
-	 *
-	 *	M(x) = A(x)*x^|B(x)| + B(x)
-	 *
-	 * Then the CRC of M(x) is:
-	 *
-	 *	CRC(M(x)) = CRC(A(x)*x^|B(x)| + B(x))
-	 *	          = CRC(A(x)*x^32*x^(|B(x)| - 32) + B(x))
-	 *	          = CRC(CRC(A(x))*x^(|B(x)| - 32) + B(x))
-	 *
-	 * Note: all arithmetic is modulo G(x), the generator polynomial; that's
-	 * why A(x)*x^32 can be replaced with CRC(A(x)) = A(x)*x^32 mod G(x).
-	 *
-	 * So the CRC of the full message is the CRC of the second part of the
-	 * message where the first 32 bits of the second part of the message
-	 * have been XOR'ed with the CRC of the first part of the message.
-	 */
-	x0 = *p++;
-	x0 ^= (__m128i)(__v4si){ remainder };
-	if (p > end512) /* only 128, 256, or 384 bits of input? */
-		goto _128_bits_at_a_time;
-	x1 = *p++;
-	x2 = *p++;
-	x3 = *p++;
-	/* Fold 512 bits at a time */
-	for (; p != end512; p += 4) {
-		__m128i y0, y1, y2, y3;
-		y0 = p[0];
-		y1 = p[1];
-		y2 = p[2];
-		y3 = p[3];
-		/*
-		 * Note: the immediate constant for PCLMULQDQ specifies which
-		 * 64-bit halves of the 128-bit vectors to multiply:
-		 *
-		 * 0x00 means low halves (higher degree polynomial terms for us)
-		 * 0x11 means high halves (lower degree polynomial terms for us)
-		 */
-		y0 ^= _mm_clmulepi64_si128(x0, multipliers_4, 0x00);
-		y1 ^= _mm_clmulepi64_si128(x1, multipliers_4, 0x00);
-		y2 ^= _mm_clmulepi64_si128(x2, multipliers_4, 0x00);
-		y3 ^= _mm_clmulepi64_si128(x3, multipliers_4, 0x00);
-		y0 ^= _mm_clmulepi64_si128(x0, multipliers_4, 0x11);
-		y1 ^= _mm_clmulepi64_si128(x1, multipliers_4, 0x11);
-		y2 ^= _mm_clmulepi64_si128(x2, multipliers_4, 0x11);
-		y3 ^= _mm_clmulepi64_si128(x3, multipliers_4, 0x11);
-		x0 = y0;
-		x1 = y1;
-		x2 = y2;
-		x3 = y3;
-	}
-	/* Fold 512 bits => 128 bits */
-	x2 ^= _mm_clmulepi64_si128(x0, multipliers_2, 0x00);
-	x3 ^= _mm_clmulepi64_si128(x1, multipliers_2, 0x00);
-	x2 ^= _mm_clmulepi64_si128(x0, multipliers_2, 0x11);
-	x3 ^= _mm_clmulepi64_si128(x1, multipliers_2, 0x11);
-	x3 ^= _mm_clmulepi64_si128(x2, multipliers_1, 0x00);
-	x3 ^= _mm_clmulepi64_si128(x2, multipliers_1, 0x11);
-	x0 = x3;
-_128_bits_at_a_time:
-	while (p != end) {
-		/* Fold 128 bits into next 128 bits */
-		x1 = *p++;
-		x1 ^= _mm_clmulepi64_si128(x0, multipliers_1, 0x00);
-		x1 ^= _mm_clmulepi64_si128(x0, multipliers_1, 0x11);
-		x0 = x1;
-	}
-	/* Now there are just 128 bits left, stored in 'x0'. */
-	/*
-	 * Fold 128 => 96 bits.  This also implicitly appends 32 zero bits,
-	 * which is equivalent to multiplying by x^32.  This is needed because
-	 * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
-	 */
-	x0 = _mm_srli_si128(x0, 8) ^
-	     _mm_clmulepi64_si128(x0, multipliers_1, 0x10);
-	/* Fold 96 => 64 bits */
-	x0 = _mm_srli_si128(x0, 4) ^
-	     _mm_clmulepi64_si128(x0 & mask32, final_multiplier, 0x00);
-        /*
-	 * Finally, reduce 64 => 32 bits using Barrett reduction.
-	 *
-	 * Let M(x) = A(x)*x^32 + B(x) be the remaining message.  The goal is to
-	 * compute R(x) = M(x) mod G(x).  Since degree(B(x)) < degree(G(x)):
-	 *
-	 *	R(x) = (A(x)*x^32 + B(x)) mod G(x)
-	 *	     = (A(x)*x^32) mod G(x) + B(x)
-	 *
-	 * Then, by the Division Algorithm there exists a unique q(x) such that:
-	 *
-	 *	A(x)*x^32 mod G(x) = A(x)*x^32 - q(x)*G(x)
-	 *
-	 * Since the left-hand side is of maximum degree 31, the right-hand side
-	 * must be too.  This implies that we can apply 'mod x^32' to the
-	 * right-hand side without changing its value:
-	 *
-	 *	(A(x)*x^32 - q(x)*G(x)) mod x^32 = q(x)*G(x) mod x^32
-	 *
-	 * Note that '+' is equivalent to '-' in polynomials over GF(2).
-	 *
-	 * We also know that:
-	 *
-	 *	              / A(x)*x^32 \
-	 *	q(x) = floor (  ---------  )
-	 *	              \    G(x)   /
-	 *
-	 * To compute this efficiently, we can multiply the top and bottom by
-	 * x^32 and move the division by G(x) to the top:
-	 *
-	 *	              / A(x) * floor(x^64 / G(x)) \
-	 *	q(x) = floor (  -------------------------  )
-	 *	              \           x^32            /
-	 *
-	 * Note that floor(x^64 / G(x)) is a constant.
-	 *
-	 * So finally we have:
-	 *
-	 *	                          / A(x) * floor(x^64 / G(x)) \
-	 *	R(x) = B(x) + G(x)*floor (  -------------------------  )
-	 *	                          \           x^32            /
-	 */
-	x1 = x0;
-	x0 = _mm_clmulepi64_si128(x0 & mask32, barrett_reduction_constants, 0x00);
-	x0 = _mm_clmulepi64_si128(x0 & mask32, barrett_reduction_constants, 0x10);
-	return _mm_cvtsi128_si32(_mm_srli_si128(x0 ^ x1, 4));
-}
-/*
- * Fast CRC-32 implementation for x86_64 processors that have the carryless
- * multiplication extension (PCLMUL).
- *
- * Note: on unaligned ends of the buffer, we fall back to crc32_slice1() instead
- * of crc32_slice8() because only a few bytes need to be processed, so a smaller
- * table is preferable.
- */
-static u32 ATTRIBUTES
-FUNCNAME(u32 remainder, const u8 *buffer, size_t nbytes)
-{
-	if ((uintptr_t)buffer & 15) {
-		size_t n = MIN(nbytes, -(uintptr_t)buffer & 15);
-		remainder = crc32_slice1(remainder, buffer, n);
-		buffer += n;
-		nbytes -= n;
-	}
-	if (nbytes >= 16) {
-		remainder = FUNCNAME_ALIGNED(remainder, (const __m128i *)buffer,
-					     nbytes / 16);
-		buffer += nbytes & ~15;
-		nbytes &= 15;
-	}
-	return crc32_slice1(remainder, buffer, nbytes);
-}
-#undef FUNCNAME
-#undef FUNCNAME_ALIGNED
-#undef ATTRIBUTES