libdeflate 0.1.0

Files changed (89)
  1. checksums.yaml +7 -0
  2. data/.gitignore +17 -0
  3. data/.gitmodules +3 -0
  4. data/.rspec +2 -0
  5. data/.rubocop.yml +1 -0
  6. data/.rubocop_todo.yml +9 -0
  7. data/.travis.yml +5 -0
  8. data/Gemfile +4 -0
  9. data/LICENSE.txt +21 -0
  10. data/README.md +52 -0
  11. data/Rakefile +15 -0
  12. data/bin/console +14 -0
  13. data/bin/setup +8 -0
  14. data/ext/libdeflate/extconf.rb +14 -0
  15. data/ext/libdeflate/libdeflate/.gitignore +19 -0
  16. data/ext/libdeflate/libdeflate/COPYING +21 -0
  17. data/ext/libdeflate/libdeflate/Makefile +231 -0
  18. data/ext/libdeflate/libdeflate/Makefile.msc +64 -0
  19. data/ext/libdeflate/libdeflate/NEWS +57 -0
  20. data/ext/libdeflate/libdeflate/README.md +170 -0
  21. data/ext/libdeflate/libdeflate/common/common_defs.h +351 -0
  22. data/ext/libdeflate/libdeflate/common/compiler_gcc.h +134 -0
  23. data/ext/libdeflate/libdeflate/common/compiler_msc.h +95 -0
  24. data/ext/libdeflate/libdeflate/lib/adler32.c +213 -0
  25. data/ext/libdeflate/libdeflate/lib/adler32_impl.h +281 -0
  26. data/ext/libdeflate/libdeflate/lib/aligned_malloc.c +57 -0
  27. data/ext/libdeflate/libdeflate/lib/aligned_malloc.h +13 -0
  28. data/ext/libdeflate/libdeflate/lib/bt_matchfinder.h +357 -0
  29. data/ext/libdeflate/libdeflate/lib/crc32.c +368 -0
  30. data/ext/libdeflate/libdeflate/lib/crc32_impl.h +286 -0
  31. data/ext/libdeflate/libdeflate/lib/crc32_table.h +526 -0
  32. data/ext/libdeflate/libdeflate/lib/decompress_impl.h +404 -0
  33. data/ext/libdeflate/libdeflate/lib/deflate_compress.c +2817 -0
  34. data/ext/libdeflate/libdeflate/lib/deflate_compress.h +14 -0
  35. data/ext/libdeflate/libdeflate/lib/deflate_constants.h +66 -0
  36. data/ext/libdeflate/libdeflate/lib/deflate_decompress.c +889 -0
  37. data/ext/libdeflate/libdeflate/lib/gzip_compress.c +95 -0
  38. data/ext/libdeflate/libdeflate/lib/gzip_constants.h +45 -0
  39. data/ext/libdeflate/libdeflate/lib/gzip_decompress.c +130 -0
  40. data/ext/libdeflate/libdeflate/lib/hc_matchfinder.h +405 -0
  41. data/ext/libdeflate/libdeflate/lib/lib_common.h +35 -0
  42. data/ext/libdeflate/libdeflate/lib/matchfinder_avx2.h +53 -0
  43. data/ext/libdeflate/libdeflate/lib/matchfinder_common.h +205 -0
  44. data/ext/libdeflate/libdeflate/lib/matchfinder_neon.h +61 -0
  45. data/ext/libdeflate/libdeflate/lib/matchfinder_sse2.h +53 -0
  46. data/ext/libdeflate/libdeflate/lib/unaligned.h +202 -0
  47. data/ext/libdeflate/libdeflate/lib/x86_cpu_features.c +169 -0
  48. data/ext/libdeflate/libdeflate/lib/x86_cpu_features.h +48 -0
  49. data/ext/libdeflate/libdeflate/lib/zlib_compress.c +87 -0
  50. data/ext/libdeflate/libdeflate/lib/zlib_constants.h +21 -0
  51. data/ext/libdeflate/libdeflate/lib/zlib_decompress.c +91 -0
  52. data/ext/libdeflate/libdeflate/libdeflate.h +274 -0
  53. data/ext/libdeflate/libdeflate/programs/benchmark.c +558 -0
  54. data/ext/libdeflate/libdeflate/programs/checksum.c +197 -0
  55. data/ext/libdeflate/libdeflate/programs/detect.sh +62 -0
  56. data/ext/libdeflate/libdeflate/programs/gzip.c +603 -0
  57. data/ext/libdeflate/libdeflate/programs/prog_util.c +530 -0
  58. data/ext/libdeflate/libdeflate/programs/prog_util.h +162 -0
  59. data/ext/libdeflate/libdeflate/programs/test_checksums.c +135 -0
  60. data/ext/libdeflate/libdeflate/programs/tgetopt.c +118 -0
  61. data/ext/libdeflate/libdeflate/tools/afl-fuzz/Makefile +12 -0
  62. data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_compress/fuzz.c +40 -0
  63. data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_compress/inputs/0 +0 -0
  64. data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_decompress/fuzz.c +28 -0
  65. data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_decompress/inputs/0 +3 -0
  66. data/ext/libdeflate/libdeflate/tools/afl-fuzz/gzip_decompress/fuzz.c +28 -0
  67. data/ext/libdeflate/libdeflate/tools/afl-fuzz/gzip_decompress/inputs/0 +0 -0
  68. data/ext/libdeflate/libdeflate/tools/afl-fuzz/prepare_for_fuzz.sh +14 -0
  69. data/ext/libdeflate/libdeflate/tools/afl-fuzz/zlib_decompress/fuzz.c +28 -0
  70. data/ext/libdeflate/libdeflate/tools/afl-fuzz/zlib_decompress/inputs/0 +3 -0
  71. data/ext/libdeflate/libdeflate/tools/android_build.sh +104 -0
  72. data/ext/libdeflate/libdeflate/tools/checksum_benchmarks.sh +76 -0
  73. data/ext/libdeflate/libdeflate/tools/exec_tests.sh +30 -0
  74. data/ext/libdeflate/libdeflate/tools/gen_crc32_multipliers.c +108 -0
  75. data/ext/libdeflate/libdeflate/tools/gen_crc32_table.c +100 -0
  76. data/ext/libdeflate/libdeflate/tools/gzip_tests.sh +412 -0
  77. data/ext/libdeflate/libdeflate/tools/make-windows-releases +21 -0
  78. data/ext/libdeflate/libdeflate/tools/mips_build.sh +9 -0
  79. data/ext/libdeflate/libdeflate/tools/msc_test.bat +3 -0
  80. data/ext/libdeflate/libdeflate/tools/pgo_build.sh +23 -0
  81. data/ext/libdeflate/libdeflate/tools/produce_gzip_benchmark_table.sh +37 -0
  82. data/ext/libdeflate/libdeflate/tools/run_tests.sh +305 -0
  83. data/ext/libdeflate/libdeflate/tools/windows_build.sh +10 -0
  84. data/ext/libdeflate/libdeflate_ext.c +389 -0
  85. data/ext/libdeflate/libdeflate_ext.h +8 -0
  86. data/lib/libdeflate.rb +2 -0
  87. data/lib/libdeflate/version.rb +3 -0
  88. data/libdeflate.gemspec +33 -0
  89. metadata +230 -0
data/ext/libdeflate/libdeflate/lib/aligned_malloc.c
@@ -0,0 +1,57 @@
+ /*
+  * aligned_malloc.c - aligned memory allocation
+  *
+  * Originally public domain; changes after 2016-09-07 are copyrighted.
+  *
+  * Copyright 2016 Eric Biggers
+  *
+  * Permission is hereby granted, free of charge, to any person
+  * obtaining a copy of this software and associated documentation
+  * files (the "Software"), to deal in the Software without
+  * restriction, including without limitation the rights to use,
+  * copy, modify, merge, publish, distribute, sublicense, and/or sell
+  * copies of the Software, and to permit persons to whom the
+  * Software is furnished to do so, subject to the following
+  * conditions:
+  *
+  * The above copyright notice and this permission notice shall be
+  * included in all copies or substantial portions of the Software.
+  *
+  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+  * OTHER DEALINGS IN THE SOFTWARE.
+  */
+
+ /*
+  * This file provides portable aligned memory allocation functions that only
+  * use malloc() and free(). This avoids portability problems with
+  * posix_memalign(), aligned_alloc(), etc.
+  */
+
+ #include <stdlib.h>
+
+ #include "aligned_malloc.h"
+
+ void *
+ aligned_malloc(size_t alignment, size_t size)
+ {
+         void *ptr = malloc(sizeof(void *) + alignment - 1 + size);
+         if (ptr) {
+                 void *orig_ptr = ptr;
+                 ptr = (void *)ALIGN((uintptr_t)ptr + sizeof(void *), alignment);
+                 ((void **)ptr)[-1] = orig_ptr;
+         }
+         return ptr;
+ }
+
+ void
+ aligned_free(void *ptr)
+ {
+         if (ptr)
+                 free(((void **)ptr)[-1]);
+ }
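
The scheme above over-allocates by sizeof(void *) + alignment - 1 bytes, rounds the block pointer up to the requested alignment, and stashes the original malloc() pointer in the word just below the returned block so aligned_free() can recover it. Below is a minimal standalone sketch of the same technique; ALIGN_UP is a stand-in for libdeflate's ALIGN macro from lib_common.h, which is assumed to round up to a power-of-two boundary, and the demo_* names are ours, not the library's:

/* aligned_demo.c - standalone sketch of the over-allocate-and-stash scheme */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Assumed semantics of libdeflate's ALIGN: round 'p' up to the next
 * multiple of the power-of-two 'a'. */
#define ALIGN_UP(p, a) (((p) + (a) - 1) & ~((uintptr_t)(a) - 1))

static void *demo_aligned_malloc(size_t alignment, size_t size)
{
        /* Room for the saved pointer plus worst-case alignment slack. */
        void *ptr = malloc(sizeof(void *) + alignment - 1 + size);
        if (ptr) {
                void *orig_ptr = ptr;
                ptr = (void *)ALIGN_UP((uintptr_t)ptr + sizeof(void *), alignment);
                ((void **)ptr)[-1] = orig_ptr;  /* stash for the free path */
        }
        return ptr;
}

static void demo_aligned_free(void *ptr)
{
        if (ptr)
                free(((void **)ptr)[-1]);       /* recover the original pointer */
}

int main(void)
{
        void *buf = demo_aligned_malloc(64, 1000);
        assert(((uintptr_t)buf & 63) == 0);     /* 64-byte aligned */
        printf("aligned block at %p\n", buf);
        demo_aligned_free(buf);
        return 0;
}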
data/ext/libdeflate/libdeflate/lib/aligned_malloc.h
@@ -0,0 +1,13 @@
+ /*
+  * aligned_malloc.h - aligned memory allocation
+  */
+
+ #ifndef LIB_ALIGNED_MALLOC_H
+ #define LIB_ALIGNED_MALLOC_H
+
+ #include "lib_common.h"
+
+ extern void *aligned_malloc(size_t alignment, size_t size);
+ extern void aligned_free(void *ptr);
+
+ #endif /* LIB_ALIGNED_MALLOC_H */
data/ext/libdeflate/libdeflate/lib/bt_matchfinder.h
@@ -0,0 +1,357 @@
+ /*
+  * bt_matchfinder.h - Lempel-Ziv matchfinding with a hash table of binary trees
+  *
+  * Originally public domain; changes after 2016-09-07 are copyrighted.
+  *
+  * Copyright 2016 Eric Biggers
+  *
+  * Permission is hereby granted, free of charge, to any person
+  * obtaining a copy of this software and associated documentation
+  * files (the "Software"), to deal in the Software without
+  * restriction, including without limitation the rights to use,
+  * copy, modify, merge, publish, distribute, sublicense, and/or sell
+  * copies of the Software, and to permit persons to whom the
+  * Software is furnished to do so, subject to the following
+  * conditions:
+  *
+  * The above copyright notice and this permission notice shall be
+  * included in all copies or substantial portions of the Software.
+  *
+  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+  * OTHER DEALINGS IN THE SOFTWARE.
+  *
+  * ----------------------------------------------------------------------------
+  *
+  * This is a Binary Trees (bt) based matchfinder.
+  *
+  * The main data structure is a hash table where each hash bucket contains a
+  * binary tree of sequences whose first 4 bytes share the same hash code. Each
+  * sequence is identified by its starting position in the input buffer. Each
+  * binary tree is always sorted such that each left child represents a sequence
+  * lexicographically lesser than its parent and each right child represents a
+  * sequence lexicographically greater than its parent.
+  *
+  * The algorithm processes the input buffer sequentially. At each byte
+  * position, the hash code of the first 4 bytes of the sequence beginning at
+  * that position (the sequence being matched against) is computed. This
+  * identifies the hash bucket to use for that position. Then, a new binary tree
+  * node is created to represent the current sequence. Then, in a single tree
+  * traversal, the hash bucket's binary tree is searched for matches and is
+  * re-rooted at the new node.
+  *
+  * Compared to the simpler algorithm that uses linked lists instead of binary
+  * trees (see hc_matchfinder.h), the binary tree version gains more information
+  * at each node visitation. Ideally, the binary tree version will examine only
+  * 'log(n)' nodes to find the same matches that the linked list version will
+  * find by examining 'n' nodes. In addition, the binary tree version can
+  * examine fewer bytes at each node by taking advantage of the common prefixes
+  * that result from the sort order, whereas the linked list version may have to
+  * examine up to the full length of the match at each node.
+  *
+  * However, it is not always best to use the binary tree version. It requires
+  * nearly twice as much memory as the linked list version, and it takes time to
+  * keep the binary trees sorted, even at positions where the compressor does not
+  * need matches. Generally, when doing fast compression on small buffers,
+  * binary trees are the wrong approach. They are best suited for thorough
+  * compression and/or large buffers.
+  *
+  * ----------------------------------------------------------------------------
+  */
+
+
+ #include "matchfinder_common.h"
+
+ #define BT_MATCHFINDER_HASH3_ORDER 16
+ #define BT_MATCHFINDER_HASH3_WAYS 2
+ #define BT_MATCHFINDER_HASH4_ORDER 16
+
+ #define BT_MATCHFINDER_TOTAL_HASH_LENGTH \
+         ((1UL << BT_MATCHFINDER_HASH3_ORDER) * BT_MATCHFINDER_HASH3_WAYS + \
+          (1UL << BT_MATCHFINDER_HASH4_ORDER))
+
+ /* Representation of a match found by the bt_matchfinder */
+ struct lz_match {
+
+         /* The number of bytes matched. */
+         u16 length;
+
+         /* The offset back from the current position that was matched. */
+         u16 offset;
+ };
+
+ struct bt_matchfinder {
+
+         /* The hash table for finding length 3 matches */
+         mf_pos_t hash3_tab[1UL << BT_MATCHFINDER_HASH3_ORDER][BT_MATCHFINDER_HASH3_WAYS];
+
+         /* The hash table which contains the roots of the binary trees for
+          * finding length 4+ matches */
+         mf_pos_t hash4_tab[1UL << BT_MATCHFINDER_HASH4_ORDER];
+
+         /* The child node references for the binary trees. The left and right
+          * children of the node for the sequence with position 'pos' are
+          * 'child_tab[pos * 2]' and 'child_tab[pos * 2 + 1]', respectively. */
+         mf_pos_t child_tab[2UL * MATCHFINDER_WINDOW_SIZE];
+
+ }
+ #ifdef _aligned_attribute
+ _aligned_attribute(MATCHFINDER_ALIGNMENT)
+ #endif
+ ;
+
+ /* Prepare the matchfinder for a new input buffer. */
+ static forceinline void
+ bt_matchfinder_init(struct bt_matchfinder *mf)
+ {
+         matchfinder_init((mf_pos_t *)mf, BT_MATCHFINDER_TOTAL_HASH_LENGTH);
+ }
+
+ static forceinline void
+ bt_matchfinder_slide_window(struct bt_matchfinder *mf)
+ {
+         matchfinder_rebase((mf_pos_t *)mf,
+                            sizeof(struct bt_matchfinder) / sizeof(mf_pos_t));
+ }
+
+ static forceinline mf_pos_t *
+ bt_left_child(struct bt_matchfinder *mf, s32 node)
+ {
+         return &mf->child_tab[2 * (node & (MATCHFINDER_WINDOW_SIZE - 1)) + 0];
+ }
+
+ static forceinline mf_pos_t *
+ bt_right_child(struct bt_matchfinder *mf, s32 node)
+ {
+         return &mf->child_tab[2 * (node & (MATCHFINDER_WINDOW_SIZE - 1)) + 1];
+ }
+
+ /* The minimum permissible value of 'max_len' for bt_matchfinder_get_matches()
+  * and bt_matchfinder_skip_position(). There must be sufficiently many bytes
+  * remaining to load a 32-bit integer from the *next* position. */
+ #define BT_MATCHFINDER_REQUIRED_NBYTES 5
+
+ /* Advance the binary tree matchfinder by one byte, optionally recording
+  * matches. @record_matches should be a compile-time constant. */
+ static forceinline struct lz_match *
+ bt_matchfinder_advance_one_byte(struct bt_matchfinder * const restrict mf,
+                                 const u8 * const restrict in_base,
+                                 const ptrdiff_t cur_pos,
+                                 const u32 max_len,
+                                 const u32 nice_len,
+                                 const u32 max_search_depth,
+                                 u32 * const restrict next_hashes,
+                                 u32 * const restrict best_len_ret,
+                                 struct lz_match * restrict lz_matchptr,
+                                 const bool record_matches)
+ {
+         const u8 *in_next = in_base + cur_pos;
+         u32 depth_remaining = max_search_depth;
+         const s32 cutoff = cur_pos - MATCHFINDER_WINDOW_SIZE;
+         u32 next_seq4;
+         u32 next_seq3;
+         u32 hash3;
+         u32 hash4;
+         s32 cur_node;
+ #if BT_MATCHFINDER_HASH3_WAYS >= 2
+         s32 cur_node_2;
+ #endif
+         const u8 *matchptr;
+         mf_pos_t *pending_lt_ptr, *pending_gt_ptr;
+         u32 best_lt_len, best_gt_len;
+         u32 len;
+         u32 best_len = 3;
+
+         STATIC_ASSERT(BT_MATCHFINDER_HASH3_WAYS >= 1 &&
+                       BT_MATCHFINDER_HASH3_WAYS <= 2);
+
+         next_seq4 = load_u32_unaligned(in_next + 1);
+         next_seq3 = loaded_u32_to_u24(next_seq4);
+
+         hash3 = next_hashes[0];
+         hash4 = next_hashes[1];
+
+         next_hashes[0] = lz_hash(next_seq3, BT_MATCHFINDER_HASH3_ORDER);
+         next_hashes[1] = lz_hash(next_seq4, BT_MATCHFINDER_HASH4_ORDER);
+         prefetchw(&mf->hash3_tab[next_hashes[0]]);
+         prefetchw(&mf->hash4_tab[next_hashes[1]]);
+
+         cur_node = mf->hash3_tab[hash3][0];
+         mf->hash3_tab[hash3][0] = cur_pos;
+ #if BT_MATCHFINDER_HASH3_WAYS >= 2
+         cur_node_2 = mf->hash3_tab[hash3][1];
+         mf->hash3_tab[hash3][1] = cur_node;
+ #endif
+         if (record_matches && cur_node > cutoff) {
+                 u32 seq3 = load_u24_unaligned(in_next);
+                 if (seq3 == load_u24_unaligned(&in_base[cur_node])) {
+                         lz_matchptr->length = 3;
+                         lz_matchptr->offset = in_next - &in_base[cur_node];
+                         lz_matchptr++;
+                 }
+ #if BT_MATCHFINDER_HASH3_WAYS >= 2
+                 else if (cur_node_2 > cutoff &&
+                          seq3 == load_u24_unaligned(&in_base[cur_node_2]))
+                 {
+                         lz_matchptr->length = 3;
+                         lz_matchptr->offset = in_next - &in_base[cur_node_2];
+                         lz_matchptr++;
+                 }
+ #endif
+         }
+
+         cur_node = mf->hash4_tab[hash4];
+         mf->hash4_tab[hash4] = cur_pos;
+
+         pending_lt_ptr = bt_left_child(mf, cur_pos);
+         pending_gt_ptr = bt_right_child(mf, cur_pos);
+
+         if (cur_node <= cutoff) {
+                 *pending_lt_ptr = MATCHFINDER_INITVAL;
+                 *pending_gt_ptr = MATCHFINDER_INITVAL;
+                 *best_len_ret = best_len;
+                 return lz_matchptr;
+         }
+
+         best_lt_len = 0;
+         best_gt_len = 0;
+         len = 0;
+
+         for (;;) {
+                 matchptr = &in_base[cur_node];
+
+                 if (matchptr[len] == in_next[len]) {
+                         len = lz_extend(in_next, matchptr, len + 1, max_len);
+                         if (!record_matches || len > best_len) {
+                                 if (record_matches) {
+                                         best_len = len;
+                                         lz_matchptr->length = len;
+                                         lz_matchptr->offset = in_next - matchptr;
+                                         lz_matchptr++;
+                                 }
+                                 if (len >= nice_len) {
+                                         *pending_lt_ptr = *bt_left_child(mf, cur_node);
+                                         *pending_gt_ptr = *bt_right_child(mf, cur_node);
+                                         *best_len_ret = best_len;
+                                         return lz_matchptr;
+                                 }
+                         }
+                 }
+
+                 if (matchptr[len] < in_next[len]) {
+                         *pending_lt_ptr = cur_node;
+                         pending_lt_ptr = bt_right_child(mf, cur_node);
+                         cur_node = *pending_lt_ptr;
+                         best_lt_len = len;
+                         if (best_gt_len < len)
+                                 len = best_gt_len;
+                 } else {
+                         *pending_gt_ptr = cur_node;
+                         pending_gt_ptr = bt_left_child(mf, cur_node);
+                         cur_node = *pending_gt_ptr;
+                         best_gt_len = len;
+                         if (best_lt_len < len)
+                                 len = best_lt_len;
+                 }
+
+                 if (cur_node <= cutoff || !--depth_remaining) {
+                         *pending_lt_ptr = MATCHFINDER_INITVAL;
+                         *pending_gt_ptr = MATCHFINDER_INITVAL;
+                         *best_len_ret = best_len;
+                         return lz_matchptr;
+                 }
+         }
+ }
+
+ /*
+  * Retrieve a list of matches at the current position.
+  *
+  * @mf
+  *      The matchfinder structure.
+  * @in_base
+  *      Pointer to the next byte in the input buffer to process _at the last
+  *      time bt_matchfinder_init() or bt_matchfinder_slide_window() was called_.
+  * @cur_pos
+  *      The current position in the input buffer relative to @in_base (the
+  *      position of the sequence being matched against).
+  * @max_len
+  *      The maximum permissible match length at this position. Must be >=
+  *      BT_MATCHFINDER_REQUIRED_NBYTES.
+  * @nice_len
+  *      Stop searching if a match of at least this length is found.
+  *      Must be <= @max_len.
+  * @max_search_depth
+  *      Limit on the number of potential matches to consider. Must be >= 1.
+  * @next_hashes
+  *      The precomputed hash codes for the sequence beginning at @in_next.
+  *      These will be used and then updated with the precomputed hash codes for
+  *      the sequence beginning at @in_next + 1.
+  * @best_len_ret
+  *      If a match of length >= 4 was found, then the length of the longest such
+  *      match is written here; otherwise 3 is written here. (Note: this is
+  *      redundant with the 'struct lz_match' array, but this is easier for the
+  *      compiler to optimize when inlined and the caller immediately does a
+  *      check against 'best_len'.)
+  * @lz_matchptr
+  *      An array in which this function will record the matches. The recorded
+  *      matches will be sorted by strictly increasing length and (non-strictly)
+  *      increasing offset. The maximum number of matches that may be found is
+  *      'nice_len - 2'.
+  *
+  * The return value is a pointer to the next available slot in the @lz_matchptr
+  * array. (If no matches were found, this will be the same as @lz_matchptr.)
+  */
+ static forceinline struct lz_match *
+ bt_matchfinder_get_matches(struct bt_matchfinder *mf,
+                            const u8 *in_base,
+                            ptrdiff_t cur_pos,
+                            u32 max_len,
+                            u32 nice_len,
+                            u32 max_search_depth,
+                            u32 next_hashes[2],
+                            u32 *best_len_ret,
+                            struct lz_match *lz_matchptr)
+ {
+         return bt_matchfinder_advance_one_byte(mf,
+                                                in_base,
+                                                cur_pos,
+                                                max_len,
+                                                nice_len,
+                                                max_search_depth,
+                                                next_hashes,
+                                                best_len_ret,
+                                                lz_matchptr,
+                                                true);
+ }
+
+ /*
+  * Advance the matchfinder, but don't record any matches.
+  *
+  * This is very similar to bt_matchfinder_get_matches() because both functions
+  * must do hashing and tree re-rooting.
+  */
+ static forceinline void
+ bt_matchfinder_skip_position(struct bt_matchfinder *mf,
+                              const u8 *in_base,
+                              ptrdiff_t cur_pos,
+                              u32 nice_len,
+                              u32 max_search_depth,
+                              u32 next_hashes[2])
+ {
+         u32 best_len;
+         bt_matchfinder_advance_one_byte(mf,
+                                         in_base,
+                                         cur_pos,
+                                         nice_len,
+                                         nice_len,
+                                         max_search_depth,
+                                         next_hashes,
+                                         &best_len,
+                                         NULL,
+                                         false);
+ }
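
The core of the file above is bt_matchfinder_advance_one_byte(): one downward traversal both collects matches and re-roots the tree at the current position, splitting the old tree into a "less than" subtree and a "greater than" subtree through the two pending child pointers. The following is a toy, single-bucket sketch of just that traversal (standalone C; the names insert_and_search, child, MAXPOS are ours, not libdeflate's). Hashing, the sliding window, the search-depth limit, and the best_lt_len/best_gt_len prefix-skipping are all omitted, so each node is re-compared from byte 0:

/* toy_bt.c - single-tree sketch of the re-rooting traversal used above */
#include <stdio.h>
#include <string.h>

#define MAXPOS 256
static const unsigned char *buf;
static size_t buflen;
static int child[MAXPOS][2];   /* [0] = left (lesser), [1] = right (greater) */
static int root = -1;

/* Insert the suffix starting at 'pos' as the new root. While walking down,
 * record the longest match seen and re-link the visited nodes into the new
 * root's subtrees, exactly as the pending_lt/pending_gt pointers do. */
static int insert_and_search(int pos, int *match_pos)
{
        int *pending_lt = &child[pos][0];
        int *pending_gt = &child[pos][1];
        int node = root, best_len = 0;

        root = pos;
        *match_pos = -1;
        while (node >= 0) {
                size_t len = 0, max = buflen - pos;
                while (len < max && buf[node + len] == buf[pos + len])
                        len++;
                if ((int)len > best_len) {
                        best_len = (int)len;
                        *match_pos = node;
                }
                if (len == max) {
                        /* Whole remaining suffix matched; cannot descend
                         * further, so adopt this node's children (the
                         * 'nice_len' early-out path in the real code). */
                        *pending_lt = child[node][0];
                        *pending_gt = child[node][1];
                        return best_len;
                }
                if (buf[node + len] < buf[pos + len]) {
                        *pending_lt = node;             /* sorts before 'pos' */
                        pending_lt = &child[node][1];
                        node = *pending_lt;
                } else {
                        *pending_gt = node;             /* sorts after 'pos' */
                        pending_gt = &child[node][0];
                        node = *pending_gt;
                }
        }
        *pending_lt = *pending_gt = -1;                 /* reached a leaf */
        return best_len;
}

int main(void)
{
        buf = (const unsigned char *)"abracadabra";
        buflen = strlen((const char *)buf);
        for (int pos = 0; pos < (int)buflen; pos++) {
                int match_pos;
                int len = insert_and_search(pos, &match_pos);
                if (len > 0)
                        printf("pos %2d: match of length %d at offset %d\n",
                               pos, len, pos - match_pos);
        }
        return 0;
}

On "abracadabra" this reports, among others, the length-4 match at offset 7 when position 7 ("abra") is inserted, illustrating how the search and the re-rooting happen in the same pass.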
data/ext/libdeflate/libdeflate/lib/crc32.c
@@ -0,0 +1,368 @@
+ /*
+  * crc32.c - CRC-32 checksum algorithm for the gzip format
+  *
+  * Originally public domain; changes after 2016-09-07 are copyrighted.
+  *
+  * Copyright 2016 Eric Biggers
+  *
+  * Permission is hereby granted, free of charge, to any person
+  * obtaining a copy of this software and associated documentation
+  * files (the "Software"), to deal in the Software without
+  * restriction, including without limitation the rights to use,
+  * copy, modify, merge, publish, distribute, sublicense, and/or sell
+  * copies of the Software, and to permit persons to whom the
+  * Software is furnished to do so, subject to the following
+  * conditions:
+  *
+  * The above copyright notice and this permission notice shall be
+  * included in all copies or substantial portions of the Software.
+  *
+  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+  * OTHER DEALINGS IN THE SOFTWARE.
+  */
+
+ /*
+  * High-level description of CRC
+  * =============================
+  *
+  * Consider a bit sequence 'bits[1...len]'. Interpret 'bits' as the "message"
+  * polynomial M(x) with coefficients in GF(2) (the field of integers modulo 2),
+  * where the coefficient of 'x^i' is 'bits[len - i]'. Then, compute:
+  *
+  *                     R(x) = M(x)*x^n mod G(x)
+  *
+  * where G(x) is a selected "generator" polynomial of degree 'n'. The remainder
+  * R(x) is a polynomial of max degree 'n - 1'. The CRC of 'bits' is R(x)
+  * interpreted as a bitstring of length 'n'.
+  *
+  * CRC used in gzip
+  * ================
+  *
+  * In the gzip format (RFC 1952):
+  *
+  * - The bitstring to checksum is formed from the bytes of the uncompressed
+  *   data by concatenating the bits from the bytes in order, proceeding
+  *   from the low-order bit to the high-order bit within each byte.
+  *
+  * - The generator polynomial G(x) is: x^32 + x^26 + x^23 + x^22 + x^16 +
+  *   x^12 + x^11 + x^10 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1.
+  *   Consequently, the CRC length is 32 bits ("CRC-32").
+  *
+  * - The highest order 32 coefficients of M(x)*x^n are inverted.
+  *
+  * - All 32 coefficients of R(x) are inverted.
+  *
+  * The two inversions cause added leading and trailing zero bits to affect the
+  * resulting CRC, whereas with a regular CRC such bits would have no effect on
+  * the CRC.
+  *
+  * Computation and optimizations
+  * =============================
+  *
+  * We can compute R(x) through "long division", maintaining only 32 bits of
+  * state at any given time. Multiplication by 'x' can be implemented as
+  * right-shifting by 1 (assuming the polynomial<=>bitstring mapping where the
+  * highest order bit represents the coefficient of x^0), and both addition and
+  * subtraction can be implemented as bitwise exclusive OR (since we are working
+  * in GF(2)). Here is an unoptimized implementation:
+  *
+  *     static u32 crc32_gzip(const u8 *buffer, size_t nbytes)
+  *     {
+  *             u32 remainder = 0;
+  *             const u32 divisor = 0xEDB88320;
+  *
+  *             for (size_t i = 0; i < nbytes * 8 + 32; i++) {
+  *                     int bit;
+  *                     u32 multiple;
+  *
+  *                     if (i < nbytes * 8)
+  *                             bit = (buffer[i / 8] >> (i % 8)) & 1;
+  *                     else
+  *                             bit = 0; // one of the 32 appended 0 bits
+  *
+  *                     if (i < 32) // the first 32 bits are inverted
+  *                             bit ^= 1;
+  *
+  *                     if (remainder & 1)
+  *                             multiple = divisor;
+  *                     else
+  *                             multiple = 0;
+  *
+  *                     remainder >>= 1;
+  *                     remainder |= (u32)bit << 31;
+  *                     remainder ^= multiple;
+  *             }
+  *
+  *             return ~remainder;
+  *     }
+  *
+  * In this implementation, the 32-bit integer 'remainder' maintains the
+  * remainder of the currently processed portion of the message (with 32 zero
+  * bits appended) when divided by the generator polynomial. 'remainder' is the
+  * representation of R(x), and 'divisor' is the representation of G(x) excluding
+  * the x^32 coefficient. For each bit to process, we multiply R(x) by 'x^1',
+  * then add 'x^0' if the new bit is a 1. If this causes R(x) to gain a nonzero
+  * x^32 term, then we subtract G(x) from R(x).
+  *
+  * We can speed this up by taking advantage of the fact that XOR is commutative
+  * and associative, so the order in which we combine the inputs into 'remainder'
+  * is unimportant. And since each message bit we add doesn't affect the choice
+  * of 'multiple' until 32 bits later, we need not actually add each message bit
+  * until that point:
+  *
+  *     static u32 crc32_gzip(const u8 *buffer, size_t nbytes)
+  *     {
+  *             u32 remainder = ~0;
+  *             const u32 divisor = 0xEDB88320;
+  *
+  *             for (size_t i = 0; i < nbytes * 8; i++) {
+  *                     int bit;
+  *                     u32 multiple;
+  *
+  *                     bit = (buffer[i / 8] >> (i % 8)) & 1;
+  *                     remainder ^= bit;
+  *                     if (remainder & 1)
+  *                             multiple = divisor;
+  *                     else
+  *                             multiple = 0;
+  *                     remainder >>= 1;
+  *                     remainder ^= multiple;
+  *             }
+  *
+  *             return ~remainder;
+  *     }
+  *
+  * With the above implementation we get the effect of 32 appended 0 bits for
+  * free; they never affect the choice of a divisor, nor would they change the
+  * value of 'remainder' if they were to be actually XOR'ed in. And by starting
+  * with a remainder of all 1 bits, we get the effect of complementing the first
+  * 32 message bits.
+  *
+  * The next optimization is to process the input in multi-bit units. Suppose
+  * that we insert the next 'n' message bits into the remainder. Then we get an
+  * intermediate remainder of length '32 + n' bits, and the CRC of the extra 'n'
+  * bits is the amount by which the low 32 bits of the remainder will change as a
+  * result of cancelling out those 'n' bits. Taking n=8 (one byte) and
+  * precomputing a table containing the CRC of each possible byte, we get
+  * crc32_slice1() defined below.
+  *
+  * As a further optimization, we could increase the multi-bit unit size to 16.
+  * However, that is inefficient because the table size explodes from 256 entries
+  * (1024 bytes) to 65536 entries (262144 bytes), which wastes memory and won't
+  * fit in L1 cache on typical processors.
+  *
+  * However, we can actually process 4 bytes at a time using 4 different tables
+  * with 256 entries each. Logically, we form a 64-bit intermediate remainder
+  * and cancel out the high 32 bits in 8-bit chunks. Bits 32-39 are cancelled
+  * out by the CRC of those bits, whereas bits 40-47 are cancelled out by the
+  * CRC of those bits with 8 zero bits appended, and so on. This method is
+  * implemented in crc32_slice4(), defined below.
+  *
+  * In crc32_slice8(), this method is extended to 8 bytes at a time. The
+  * intermediate remainder (which we never actually store explicitly) is 96 bits.
+  *
+  * On CPUs that support fast carryless multiplication, CRCs can be computed even
+  * more quickly via "folding". See crc32_pclmul() for an example.
+  */
+
+ #include "x86_cpu_features.h"
+
+ #include "libdeflate.h"
+
+ /* Select the implementations to compile in. */
+
+ #define NEED_GENERIC_IMPL 1 /* include generic impl unless overridden */
+ #define DEFAULT_IMPL crc32_slice8
+
+ /* Include the PCLMUL implementation? */
+ #define NEED_PCLMUL_IMPL 0
+ #if defined(__PCLMUL__) || \
+         (X86_CPU_FEATURES_ENABLED && COMPILER_SUPPORTS_PCLMUL_TARGET && \
+          COMPILER_SUPPORTS_TARGET_INTRINSICS)
+ # include <wmmintrin.h>
+ # undef NEED_PCLMUL_IMPL
+ # define NEED_PCLMUL_IMPL 1
+ # ifdef __PCLMUL__ /* compiling for PCLMUL, i.e. can we assume it's there? */
+ # undef NEED_GENERIC_IMPL
+ # define NEED_GENERIC_IMPL 0 /* generic impl not needed */
+ # undef DEFAULT_IMPL
+ # define DEFAULT_IMPL crc32_pclmul
+ # endif /* otherwise, we can build a PCLMUL version, but we won't know whether
+            we can use it until runtime */
+ #endif
+
+ /*
+  * Include the PCLMUL/AVX implementation? Although our PCLMUL-optimized CRC-32
+  * function doesn't use any AVX intrinsics specifically, it can benefit a lot
+  * from being compiled for an AVX target: on Skylake, ~16700 MB/s vs. ~10100
+  * MB/s. I expect this is related to the PCLMULQDQ instructions being assembled
+  * in the newer three-operand form rather than the older two-operand form.
+  *
+  * Note: this is only needed if __AVX__ is *not* defined, since otherwise the
+  * "regular" PCLMUL implementation would already be AVX enabled.
+  */
+ #define NEED_PCLMUL_AVX_IMPL 0
+ #if NEED_PCLMUL_IMPL && !defined(__AVX__) && \
+         X86_CPU_FEATURES_ENABLED && COMPILER_SUPPORTS_AVX_TARGET
+ # undef NEED_PCLMUL_AVX_IMPL
+ # define NEED_PCLMUL_AVX_IMPL 1
+ #endif
+
+ #define NUM_IMPLS (NEED_GENERIC_IMPL + NEED_PCLMUL_IMPL + NEED_PCLMUL_AVX_IMPL)
+
+ /* Define the CRC-32 table */
+ #if NEED_GENERIC_IMPL
+ # define CRC32_SLICE8
+ #else
+ # define CRC32_SLICE1 /* only need short table for unaligned ends */
+ #endif
+ #include "crc32_table.h"
+
+ static forceinline u32
+ crc32_update_byte(u32 remainder, u8 next_byte)
+ {
+         return (remainder >> 8) ^ crc32_table[(u8)remainder ^ next_byte];
+ }
+
+ #if defined(CRC32_SLICE1) || (NUM_IMPLS > NEED_GENERIC_IMPL)
+ static u32
+ crc32_slice1(u32 remainder, const u8 *buffer, size_t nbytes)
+ {
+         size_t i;
+
+         STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x100);
+
+         for (i = 0; i < nbytes; i++)
+                 remainder = crc32_update_byte(remainder, buffer[i]);
+         return remainder;
+ }
+ #endif
+
+ #ifdef CRC32_SLICE4
+ static u32
+ crc32_slice4(u32 remainder, const u8 *buffer, size_t nbytes)
+ {
+         const u8 *p = buffer;
+         const u8 *end = buffer + nbytes;
+         const u8 *end32;
+
+         STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x400);
+
+         for (; ((uintptr_t)p & 3) && p != end; p++)
+                 remainder = crc32_update_byte(remainder, *p);
+
+         end32 = p + ((end - p) & ~3);
+         for (; p != end32; p += 4) {
+                 u32 v = le32_bswap(*(const u32 *)p);
+                 remainder =
+                         crc32_table[0x300 + (u8)((remainder ^ v) >> 0)] ^
+                         crc32_table[0x200 + (u8)((remainder ^ v) >> 8)] ^
+                         crc32_table[0x100 + (u8)((remainder ^ v) >> 16)] ^
+                         crc32_table[0x000 + (u8)((remainder ^ v) >> 24)];
+         }
+
+         for (; p != end; p++)
+                 remainder = crc32_update_byte(remainder, *p);
+
+         return remainder;
+ }
+ #endif
+
+ #ifdef CRC32_SLICE8
+ static u32
+ crc32_slice8(u32 remainder, const u8 *buffer, size_t nbytes)
+ {
+         const u8 *p = buffer;
+         const u8 *end = buffer + nbytes;
+         const u8 *end64;
+
+         STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x800);
+
+         for (; ((uintptr_t)p & 7) && p != end; p++)
+                 remainder = crc32_update_byte(remainder, *p);
+
+         end64 = p + ((end - p) & ~7);
+         for (; p != end64; p += 8) {
+                 u32 v1 = le32_bswap(*(const u32 *)(p + 0));
+                 u32 v2 = le32_bswap(*(const u32 *)(p + 4));
+                 remainder =
+                         crc32_table[0x700 + (u8)((remainder ^ v1) >> 0)] ^
+                         crc32_table[0x600 + (u8)((remainder ^ v1) >> 8)] ^
+                         crc32_table[0x500 + (u8)((remainder ^ v1) >> 16)] ^
+                         crc32_table[0x400 + (u8)((remainder ^ v1) >> 24)] ^
+                         crc32_table[0x300 + (u8)(v2 >> 0)] ^
+                         crc32_table[0x200 + (u8)(v2 >> 8)] ^
+                         crc32_table[0x100 + (u8)(v2 >> 16)] ^
+                         crc32_table[0x000 + (u8)(v2 >> 24)];
+         }
+
+         for (; p != end; p++)
+                 remainder = crc32_update_byte(remainder, *p);
+
+         return remainder;
+ }
+ #endif
+
+ /* Define the PCLMUL implementation if needed. */
+ #if NEED_PCLMUL_IMPL
+ # define FUNCNAME crc32_pclmul
+ # define FUNCNAME_ALIGNED crc32_pclmul_aligned
+ # ifdef __PCLMUL__
+ # define ATTRIBUTES
+ # else
+ # define ATTRIBUTES __attribute__((target("pclmul")))
+ # endif
+ # include "crc32_impl.h"
+ #endif
+
+ /* Define the PCLMUL/AVX implementation if needed. */
+ #if NEED_PCLMUL_AVX_IMPL
+ # define FUNCNAME crc32_pclmul_avx
+ # define FUNCNAME_ALIGNED crc32_pclmul_avx_aligned
+ # define ATTRIBUTES __attribute__((target("pclmul,avx")))
+ # include "crc32_impl.h"
+ #endif
+
+ typedef u32 (*crc32_func_t)(u32, const u8 *, size_t);
+
+ /*
+  * If multiple implementations are available, then dispatch among them based on
+  * CPU features at runtime. Otherwise just call the single one directly.
+  */
+ #if NUM_IMPLS == 1
+ # define crc32_impl DEFAULT_IMPL
+ #else
+ static u32 dispatch(u32, const u8 *, size_t);
+
+ static crc32_func_t crc32_impl = dispatch;
+
+ static u32 dispatch(u32 remainder, const u8 *buffer, size_t nbytes)
+ {
+         crc32_func_t f = DEFAULT_IMPL;
+ #if NEED_PCLMUL_IMPL && !defined(__PCLMUL__)
+         if (x86_have_cpu_features(X86_CPU_FEATURE_PCLMULQDQ))
+                 f = crc32_pclmul;
+ #endif
+ #if NEED_PCLMUL_AVX_IMPL
+         if (x86_have_cpu_features(X86_CPU_FEATURE_PCLMULQDQ |
+                                   X86_CPU_FEATURE_AVX))
+                 f = crc32_pclmul_avx;
+ #endif
+         crc32_impl = f;
+         return crc32_impl(remainder, buffer, nbytes);
+ }
+ #endif /* NUM_IMPLS != 1 */
+
+ LIBDEFLATEAPI u32
+ libdeflate_crc32(u32 remainder, const void *buffer, size_t nbytes)
+ {
+         if (buffer == NULL) /* return initial value */
+                 return 0;
+         return ~crc32_impl(~remainder, buffer, nbytes);
+ }
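
To make the table-driven method concrete, here is a standalone sketch of the slice-by-1 loop together with the table generation it relies on. In libdeflate the equivalent table is pregenerated into crc32_table.h; build_table and the other names here are ours. The nine ASCII bytes "123456789" are the standard CRC check input, and gzip's CRC-32 of them is the well-known check value 0xCBF43926:

/* crc32_demo.c - standalone slice-by-1 CRC-32 (gzip polynomial) */
#include <stdint.h>
#include <stdio.h>

static uint32_t table[256];

/* Each table entry is the CRC of one byte value, i.e. the amount by which
 * that byte changes the low 32 bits of the remainder once divided out. */
static void build_table(void)
{
        for (uint32_t byte = 0; byte < 256; byte++) {
                uint32_t r = byte;
                for (int i = 0; i < 8; i++)     /* divide out 8 bits */
                        r = (r >> 1) ^ ((r & 1) ? 0xEDB88320 : 0);
                table[byte] = r;
        }
}

/* Same structure as crc32_slice1()/crc32_update_byte() above. */
static uint32_t crc32_slice1(uint32_t remainder, const uint8_t *p, size_t n)
{
        while (n--)
                remainder = (remainder >> 8) ^ table[(uint8_t)remainder ^ *p++];
        return remainder;
}

int main(void)
{
        const uint8_t msg[] = "123456789";

        build_table();
        /* Pre- and post-inversion, as in libdeflate_crc32(). */
        uint32_t crc = ~crc32_slice1(~0u, msg, 9);
        printf("%08x\n", crc);  /* prints cbf43926, the standard check value */
        return 0;
}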