RubyGems - libdeflate - Versions diffs - 0.1.1 → 0.2.0 - Mend

libdeflate 0.1.1 → 0.2.0

Files changed (79) hide show

checksums.yaml +5 -5
data/.github/workflows/test.yml +34 -0
data/README.md +1 -6
data/ext/libdeflate/extconf.rb +18 -7
data/ext/libdeflate/libdeflate_ext.c +17 -17
data/lib/libdeflate/version.rb +1 -1
data/libdeflate.gemspec +2 -1
metadata +13 -84
data/.gitmodules +0 -3
data/.travis.yml +0 -5
data/ext/libdeflate/libdeflate/.gitignore +0 -19
data/ext/libdeflate/libdeflate/COPYING +0 -21
data/ext/libdeflate/libdeflate/Makefile +0 -231
data/ext/libdeflate/libdeflate/Makefile.msc +0 -64
data/ext/libdeflate/libdeflate/NEWS +0 -57
data/ext/libdeflate/libdeflate/README.md +0 -170
data/ext/libdeflate/libdeflate/common/common_defs.h +0 -351
data/ext/libdeflate/libdeflate/common/compiler_gcc.h +0 -134
data/ext/libdeflate/libdeflate/common/compiler_msc.h +0 -95
data/ext/libdeflate/libdeflate/lib/adler32.c +0 -213
data/ext/libdeflate/libdeflate/lib/adler32_impl.h +0 -281
data/ext/libdeflate/libdeflate/lib/aligned_malloc.c +0 -57
data/ext/libdeflate/libdeflate/lib/aligned_malloc.h +0 -13
data/ext/libdeflate/libdeflate/lib/bt_matchfinder.h +0 -357
data/ext/libdeflate/libdeflate/lib/crc32.c +0 -368
data/ext/libdeflate/libdeflate/lib/crc32_impl.h +0 -286
data/ext/libdeflate/libdeflate/lib/crc32_table.h +0 -526
data/ext/libdeflate/libdeflate/lib/decompress_impl.h +0 -404
data/ext/libdeflate/libdeflate/lib/deflate_compress.c +0 -2817
data/ext/libdeflate/libdeflate/lib/deflate_compress.h +0 -14
data/ext/libdeflate/libdeflate/lib/deflate_constants.h +0 -66
data/ext/libdeflate/libdeflate/lib/deflate_decompress.c +0 -889
data/ext/libdeflate/libdeflate/lib/gzip_compress.c +0 -95
data/ext/libdeflate/libdeflate/lib/gzip_constants.h +0 -45
data/ext/libdeflate/libdeflate/lib/gzip_decompress.c +0 -130
data/ext/libdeflate/libdeflate/lib/hc_matchfinder.h +0 -405
data/ext/libdeflate/libdeflate/lib/lib_common.h +0 -35
data/ext/libdeflate/libdeflate/lib/matchfinder_avx2.h +0 -53
data/ext/libdeflate/libdeflate/lib/matchfinder_common.h +0 -205
data/ext/libdeflate/libdeflate/lib/matchfinder_neon.h +0 -61
data/ext/libdeflate/libdeflate/lib/matchfinder_sse2.h +0 -53
data/ext/libdeflate/libdeflate/lib/unaligned.h +0 -202
data/ext/libdeflate/libdeflate/lib/x86_cpu_features.c +0 -169
data/ext/libdeflate/libdeflate/lib/x86_cpu_features.h +0 -48
data/ext/libdeflate/libdeflate/lib/zlib_compress.c +0 -87
data/ext/libdeflate/libdeflate/lib/zlib_constants.h +0 -21
data/ext/libdeflate/libdeflate/lib/zlib_decompress.c +0 -91
data/ext/libdeflate/libdeflate/libdeflate.h +0 -274
data/ext/libdeflate/libdeflate/programs/benchmark.c +0 -558
data/ext/libdeflate/libdeflate/programs/checksum.c +0 -197
data/ext/libdeflate/libdeflate/programs/detect.sh +0 -62
data/ext/libdeflate/libdeflate/programs/gzip.c +0 -603
data/ext/libdeflate/libdeflate/programs/prog_util.c +0 -530
data/ext/libdeflate/libdeflate/programs/prog_util.h +0 -162
data/ext/libdeflate/libdeflate/programs/test_checksums.c +0 -135
data/ext/libdeflate/libdeflate/programs/tgetopt.c +0 -118
data/ext/libdeflate/libdeflate/tools/afl-fuzz/Makefile +0 -12
data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_compress/fuzz.c +0 -40
data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_compress/inputs/0 +0 -0
data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_decompress/fuzz.c +0 -28
data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_decompress/inputs/0 +0 -3
data/ext/libdeflate/libdeflate/tools/afl-fuzz/gzip_decompress/fuzz.c +0 -28
data/ext/libdeflate/libdeflate/tools/afl-fuzz/gzip_decompress/inputs/0 +0 -0
data/ext/libdeflate/libdeflate/tools/afl-fuzz/prepare_for_fuzz.sh +0 -14
data/ext/libdeflate/libdeflate/tools/afl-fuzz/zlib_decompress/fuzz.c +0 -28
data/ext/libdeflate/libdeflate/tools/afl-fuzz/zlib_decompress/inputs/0 +0 -3
data/ext/libdeflate/libdeflate/tools/android_build.sh +0 -104
data/ext/libdeflate/libdeflate/tools/checksum_benchmarks.sh +0 -76
data/ext/libdeflate/libdeflate/tools/exec_tests.sh +0 -30
data/ext/libdeflate/libdeflate/tools/gen_crc32_multipliers.c +0 -108
data/ext/libdeflate/libdeflate/tools/gen_crc32_table.c +0 -100
data/ext/libdeflate/libdeflate/tools/gzip_tests.sh +0 -412
data/ext/libdeflate/libdeflate/tools/make-windows-releases +0 -21
data/ext/libdeflate/libdeflate/tools/mips_build.sh +0 -9
data/ext/libdeflate/libdeflate/tools/msc_test.bat +0 -3
data/ext/libdeflate/libdeflate/tools/pgo_build.sh +0 -23
data/ext/libdeflate/libdeflate/tools/produce_gzip_benchmark_table.sh +0 -37
data/ext/libdeflate/libdeflate/tools/run_tests.sh +0 -305
data/ext/libdeflate/libdeflate/tools/windows_build.sh +0 -10

data/ext/libdeflate/libdeflate/lib/adler32_impl.h DELETED Viewed

@@ -1,281 +0,0 @@
-/*
- * adler32_impl.h
- *
- * Originally public domain; changes after 2016-09-07 are copyrighted.
- *
- * Copyright 2016 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-/*
- * This file contains a template for vectorized Adler-32 implementations.
- *
- * The inner loop between reductions modulo 65521 of an unvectorized Adler-32
- * implementation looks something like this:
- *
- *	do {
- * 		s1 += *p;
- * 		s2 += s1;
- *	} while (++p != chunk_end);
- *
- * For vectorized calculation of s1, we only need to sum the input bytes.  They
- * can be accumulated into multiple counters which are eventually summed
- * together.
- *
- * For vectorized calculation of s2, the basic idea is that for each iteration
- * that processes N bytes, we can perform the following vectorizable
- * calculation:
- *
- *	s2 += N*byte_1 + (N-1)*byte_2 + (N-2)*byte_3 + ... + 1*byte_N
- *
- * Or, equivalently, we can sum the byte_1...byte_N for each iteration into N
- * separate counters, then do the multiplications by N...1 just once at the end
- * rather than once per iteration.
- *
- * Also, we must account for how previous bytes will affect s2 by doing the
- * following at the beginning of each iteration:
- *
- *	s2 += s1 * N
- *
- * Furthermore, like s1, "s2" can actually be multiple counters which are
- * eventually summed together.
- */
-static u32 ATTRIBUTES
-FUNCNAME(u32 adler, const void *buffer, size_t size)
-{
-	u32 s1 = adler & 0xFFFF;
-	u32 s2 = adler >> 16;
-	const u8 *p = buffer;
-	const u8 * const end = p + size;
-	const u8 *vend;
-	/* Process a byte at a time until the required alignment is reached. */
-	if (p != end && (uintptr_t)p % ALIGNMENT_REQUIRED) {
-		do {
-			s1 += *p++;
-			s2 += s1;
-		} while (p != end && (uintptr_t)p % ALIGNMENT_REQUIRED);
-		s1 %= DIVISOR;
-		s2 %= DIVISOR;
-	}
-	/*
-	 * Process "chunks" of bytes using vector instructions.  Chunk sizes are
-	 * limited to MAX_BYTES_PER_CHUNK, which guarantees that s1 and s2 never
-	 * overflow before being reduced modulo DIVISOR.  For vector processing,
-	 * chunk sizes are also made evenly divisible by BYTES_PER_ITERATION.
-	 */
-	STATIC_ASSERT(BYTES_PER_ITERATION % ALIGNMENT_REQUIRED == 0);
-	vend = end - ((size_t)(end - p) % BYTES_PER_ITERATION);
-	while (p != vend) {
-		size_t chunk_size;
-		const u8 *chunk_end;
-		chunk_size = MIN((size_t)(vend - p), MAX_BYTES_PER_CHUNK);
-	#if TARGET == TARGET_SSE2
-		/* SSE2: the 16-bit precision byte counters must not undergo
-		 * *signed* overflow, otherwise the signed multiplication at the
-		 * end will not behave as desired. */
-		chunk_size = MIN(chunk_size, BYTES_PER_ITERATION * (0x7FFF / 0xFF));
-	#elif TARGET == TARGET_NEON
-		/* NEON: the 16-bit precision counters must not undergo
-		 * *unsigned* overflow. */
-		chunk_size = MIN(chunk_size, BYTES_PER_ITERATION * (0xFFFF / 0xFF));
-	#endif
-		chunk_size -= chunk_size % BYTES_PER_ITERATION;
-		chunk_end = p + chunk_size;
-		s2 += s1 * chunk_size;
-		{
-	#if TARGET == TARGET_AVX2
-		/* AVX2 implementation */
-		const __m256i zeroes = _mm256_setzero_si256();
-		const __v32qi multipliers = (__v32qi) { 32, 31, 30, 29, 28, 27, 26, 25,
-							24, 23, 22, 21, 20, 19, 18, 17,
-							16, 15, 14, 13, 12, 11, 10, 9,
-							8,  7,  6,  5,  4,  3,  2,  1 };
-		const __v16hi ones = (__v16hi)_mm256_set1_epi16(1);
-		__v8si v_s1 = (__v8si)zeroes;
-		__v8si v_s1_sums = (__v8si)zeroes;
-		__v8si v_s2 = (__v8si)zeroes;
-		STATIC_ASSERT(ALIGNMENT_REQUIRED == 32 && BYTES_PER_ITERATION == 32);
-		do {
-			__m256i bytes = *(const __m256i *)p;
-			__v16hi sums = (__v16hi)_mm256_maddubs_epi16(
-							bytes, (__m256i)multipliers);
-			v_s1_sums += v_s1;
-			v_s1 += (__v8si)_mm256_sad_epu8(bytes, zeroes);
-			v_s2 += (__v8si)_mm256_madd_epi16((__m256i)sums, (__m256i)ones);
-		} while ((p += BYTES_PER_ITERATION) != chunk_end);
-		v_s1 = (__v8si)_mm256_hadd_epi32((__m256i)v_s1, zeroes);
-		v_s1 = (__v8si)_mm256_hadd_epi32((__m256i)v_s1, zeroes);
-		s1 += v_s1[0] + v_s1[4];
-		v_s2 += (__v8si)_mm256_slli_epi32((__m256i)v_s1_sums, 5);
-		v_s2 = (__v8si)_mm256_hadd_epi32((__m256i)v_s2, zeroes);
-		v_s2 = (__v8si)_mm256_hadd_epi32((__m256i)v_s2, zeroes);
-		s2 += v_s2[0] + v_s2[4];
-	#elif TARGET == TARGET_SSE2
-		/* SSE2 implementation */
-		const __m128i zeroes = _mm_setzero_si128();
-		/* s1 counters: 32-bit, sum of bytes */
-		__v4si v_s1 = (__v4si)zeroes;
-		/* s2 counters: 32-bit, sum of s1 values */
-		__v4si v_s2 = (__v4si)zeroes;
-		/*
-		 * Thirty-two 16-bit counters for byte sums.  Each accumulates
-		 * the bytes that eventually need to be multiplied by a number
-		 * 32...1 for addition into s2.
-		 */
-		__v8hi v_byte_sums_a = (__v8hi)zeroes;
-		__v8hi v_byte_sums_b = (__v8hi)zeroes;
-		__v8hi v_byte_sums_c = (__v8hi)zeroes;
-		__v8hi v_byte_sums_d = (__v8hi)zeroes;
-		STATIC_ASSERT(ALIGNMENT_REQUIRED == 16 && BYTES_PER_ITERATION == 32);
-		do {
-			/* Load the next 32 bytes. */
-			const __m128i bytes1 = *(const __m128i *)p;
-			const __m128i bytes2 = *(const __m128i *)(p + 16);
-			/*
-			 * Accumulate the previous s1 counters into the s2
-			 * counters.  Logically, this really should be
-			 * v_s2 += v_s1 * BYTES_PER_ITERATION, but we can do the
-			 * multiplication (or left shift) later.
-			 */
-			v_s2 += v_s1;
-			/*
-			 * s1 update: use "Packed Sum of Absolute Differences"
-			 * to add the bytes horizontally with 8 bytes per sum.
-			 * Then add the sums to the s1 counters.
-			 */
-			v_s1 += (__v4si)_mm_sad_epu8(bytes1, zeroes);
-			v_s1 += (__v4si)_mm_sad_epu8(bytes2, zeroes);
-			/*
-			 * Also accumulate the bytes into 32 separate counters
-			 * that have 16-bit precision.
-			 */
-			v_byte_sums_a += (__v8hi)_mm_unpacklo_epi8(bytes1, zeroes);
-			v_byte_sums_b += (__v8hi)_mm_unpackhi_epi8(bytes1, zeroes);
-			v_byte_sums_c += (__v8hi)_mm_unpacklo_epi8(bytes2, zeroes);
-			v_byte_sums_d += (__v8hi)_mm_unpackhi_epi8(bytes2, zeroes);
-		} while ((p += BYTES_PER_ITERATION) != chunk_end);
-		/* Finish calculating the s2 counters. */
-		v_s2 = (__v4si)_mm_slli_epi32((__m128i)v_s2, 5);
-		v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_a,
-					       (__m128i)(__v8hi){ 32, 31, 30, 29, 28, 27, 26, 25 });
-		v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_b,
-					       (__m128i)(__v8hi){ 24, 23, 22, 21, 20, 19, 18, 17 });
-		v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_c,
-					       (__m128i)(__v8hi){ 16, 15, 14, 13, 12, 11, 10, 9 });
-		v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_d,
-					       (__m128i)(__v8hi){ 8,  7,  6,  5,  4,  3,  2,  1 });
-		/* Now accumulate what we computed into the real s1 and s2. */
-		v_s1 += (__v4si)_mm_shuffle_epi32((__m128i)v_s1, 0x31);
-		v_s1 += (__v4si)_mm_shuffle_epi32((__m128i)v_s1, 0x02);
-		s1 += _mm_cvtsi128_si32((__m128i)v_s1);
-		v_s2 += (__v4si)_mm_shuffle_epi32((__m128i)v_s2, 0x31);
-		v_s2 += (__v4si)_mm_shuffle_epi32((__m128i)v_s2, 0x02);
-		s2 += _mm_cvtsi128_si32((__m128i)v_s2);
-	#elif TARGET == TARGET_NEON
-		/* ARM NEON (Advanced SIMD) implementation */
-		uint32x4_t v_s1 = (uint32x4_t) { 0, 0, 0, 0 };
-		uint32x4_t v_s2 = (uint32x4_t) { 0, 0, 0, 0 };
-		uint16x8_t v_byte_sums_a = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
-		uint16x8_t v_byte_sums_b = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
-		uint16x8_t v_byte_sums_c = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
-		uint16x8_t v_byte_sums_d = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
-		STATIC_ASSERT(ALIGNMENT_REQUIRED == 16 && BYTES_PER_ITERATION == 32);
-		do {
-			const uint8x16_t bytes1 = *(const uint8x16_t *)p;
-			const uint8x16_t bytes2 = *(const uint8x16_t *)(p + 16);
-			uint16x8_t tmp;
-			v_s2 += v_s1;
-			tmp = vpaddlq_u8(bytes1);
-			tmp = vpadalq_u8(tmp, bytes2);
-			v_s1 = vpadalq_u16(v_s1, tmp);
-			v_byte_sums_a = vaddw_u8(v_byte_sums_a, vget_low_u8(bytes1));
-			v_byte_sums_b = vaddw_u8(v_byte_sums_b, vget_high_u8(bytes1));
-			v_byte_sums_c = vaddw_u8(v_byte_sums_c, vget_low_u8(bytes2));
-			v_byte_sums_d = vaddw_u8(v_byte_sums_d, vget_high_u8(bytes2));
-		} while ((p += BYTES_PER_ITERATION) != chunk_end);
-		v_s2 = vqshlq_n_u32(v_s2, 5);
-		v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_a),  (uint16x4_t) { 32, 31, 30, 29 });
-		v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_a), (uint16x4_t) { 28, 27, 26, 25 });
-		v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_b),  (uint16x4_t) { 24, 23, 22, 21 });
-		v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_b), (uint16x4_t) { 20, 19, 18, 17 });
-		v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_c),  (uint16x4_t) { 16, 15, 14, 13 });
-		v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_c), (uint16x4_t) { 12, 11, 10,  9 });
-		v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_byte_sums_d), (uint16x4_t) {  8,  7,  6,  5 });
-		v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_d), (uint16x4_t) {  4,  3,  2,  1 });
-		s1 += v_s1[0] + v_s1[1] + v_s1[2] + v_s1[3];
-		s2 += v_s2[0] + v_s2[1] + v_s2[2] + v_s2[3];
-	#else
-	#  error "BUG: unknown target"
-	#endif
-		}
-		s1 %= DIVISOR;
-		s2 %= DIVISOR;
-	}
-	/* Process any remaining bytes. */
-	if (p != end) {
-		do {
-			s1 += *p++;
-			s2 += s1;
-		} while (p != end);
-		s1 %= DIVISOR;
-		s2 %= DIVISOR;
-	}
-	return (s2 << 16) | s1;
-}
-#undef FUNCNAME
-#undef TARGET
-#undef ALIGNMENT_REQUIRED
-#undef BYTES_PER_ITERATION
-#undef ATTRIBUTES

data/ext/libdeflate/libdeflate/lib/aligned_malloc.c DELETED Viewed

@@ -1,57 +0,0 @@
-/*
- * aligned_malloc.c - aligned memory allocation
- *
- * Originally public domain; changes after 2016-09-07 are copyrighted.
- *
- * Copyright 2016 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-/*
- * This file provides portable aligned memory allocation functions that only
- * use malloc() and free().  This avoids portability problems with
- * posix_memalign(), aligned_alloc(), etc.
- */
-#include <stdlib.h>
-#include "aligned_malloc.h"
-void *
-aligned_malloc(size_t alignment, size_t size)
-{
-	void *ptr = malloc(sizeof(void *) + alignment - 1 + size);
-	if (ptr) {
-		void *orig_ptr = ptr;
-		ptr = (void *)ALIGN((uintptr_t)ptr + sizeof(void *), alignment);
-		((void **)ptr)[-1] = orig_ptr;
-	}
-	return ptr;
-}
-void
-aligned_free(void *ptr)
-{
-	if (ptr)
-		free(((void **)ptr)[-1]);
-}

data/ext/libdeflate/libdeflate/lib/aligned_malloc.h DELETED Viewed

@@ -1,13 +0,0 @@
-/*
- * aligned_malloc.h - aligned memory allocation
- */
-#ifndef LIB_ALIGNED_MALLOC_H
-#define LIB_ALIGNED_MALLOC_H
-#include "lib_common.h"
-extern void *aligned_malloc(size_t alignment, size_t size);
-extern void aligned_free(void *ptr);
-#endif /* LIB_ALIGNED_MALLOC_H */

data/ext/libdeflate/libdeflate/lib/bt_matchfinder.h DELETED Viewed

@@ -1,357 +0,0 @@
-/*
- * bt_matchfinder.h - Lempel-Ziv matchfinding with a hash table of binary trees
- *
- * Originally public domain; changes after 2016-09-07 are copyrighted.
- *
- * Copyright 2016 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- *
- * ----------------------------------------------------------------------------
- *
- * This is a Binary Trees (bt) based matchfinder.
- *
- * The main data structure is a hash table where each hash bucket contains a
- * binary tree of sequences whose first 4 bytes share the same hash code.  Each
- * sequence is identified by its starting position in the input buffer.  Each
- * binary tree is always sorted such that each left child represents a sequence
- * lexicographically less than its parent and each right child represents a
- * sequence lexicographically greater than its parent.
- *
- * The algorithm processes the input buffer sequentially.  At each byte
- * position, the hash code of the first 4 bytes of the sequence beginning at
- * that position (the sequence being matched against) is computed.  This
- * identifies the hash bucket to use for that position.  Then, a new binary tree
- * node is created to represent the current sequence.  Then, in a single tree
- * traversal, the hash bucket's binary tree is searched for matches and is
- * re-rooted at the new node.
- *
- * Compared to the simpler algorithm that uses linked lists instead of binary
- * trees (see hc_matchfinder.h), the binary tree version gains more information
- * at each node visitation.  Ideally, the binary tree version will examine only
- * 'log(n)' nodes to find the same matches that the linked list version will
- * find by examining 'n' nodes.  In addition, the binary tree version can
- * examine fewer bytes at each node by taking advantage of the common prefixes
- * that result from the sort order, whereas the linked list version may have to
- * examine up to the full length of the match at each node.
- *
- * However, it is not always best to use the binary tree version.  It requires
- * nearly twice as much memory as the linked list version, and it takes time to
- * keep the binary trees sorted, even at positions where the compressor does not
- * need matches.  Generally, when doing fast compression on small buffers,
- * binary trees are the wrong approach.  They are best suited for thorough
- * compression and/or large buffers.
- *
- * ----------------------------------------------------------------------------
- */
-#include "matchfinder_common.h"
-#define BT_MATCHFINDER_HASH3_ORDER 16
-#define BT_MATCHFINDER_HASH3_WAYS  2
-#define BT_MATCHFINDER_HASH4_ORDER 16
-#define BT_MATCHFINDER_TOTAL_HASH_LENGTH		\
-	((1UL << BT_MATCHFINDER_HASH3_ORDER) * BT_MATCHFINDER_HASH3_WAYS + \
-	 (1UL << BT_MATCHFINDER_HASH4_ORDER))
-/* Representation of a match found by the bt_matchfinder  */
-struct lz_match {
-	/* The number of bytes matched.  */
-	u16 length;
-	/* The offset back from the current position that was matched.  */
-	u16 offset;
-};
-struct bt_matchfinder {
-	/* The hash table for finding length 3 matches  */
-	mf_pos_t hash3_tab[1UL << BT_MATCHFINDER_HASH3_ORDER][BT_MATCHFINDER_HASH3_WAYS];
-	/* The hash table which contains the roots of the binary trees for
-	 * finding length 4+ matches  */
-	mf_pos_t hash4_tab[1UL << BT_MATCHFINDER_HASH4_ORDER];
-	/* The child node references for the binary trees.  The left and right
-	 * children of the node for the sequence with position 'pos' are
-	 * 'child_tab[pos * 2]' and 'child_tab[pos * 2 + 1]', respectively.  */
-	mf_pos_t child_tab[2UL * MATCHFINDER_WINDOW_SIZE];
-}
-#ifdef _aligned_attribute
-_aligned_attribute(MATCHFINDER_ALIGNMENT)
-#endif
-;
-/* Prepare the matchfinder for a new input buffer.  */
-static forceinline void
-bt_matchfinder_init(struct bt_matchfinder *mf)
-{
-	matchfinder_init((mf_pos_t *)mf, BT_MATCHFINDER_TOTAL_HASH_LENGTH);
-}
-static forceinline void
-bt_matchfinder_slide_window(struct bt_matchfinder *mf)
-{
-	matchfinder_rebase((mf_pos_t *)mf,
-			   sizeof(struct bt_matchfinder) / sizeof(mf_pos_t));
-}
-static forceinline mf_pos_t *
-bt_left_child(struct bt_matchfinder *mf, s32 node)
-{
-	return &mf->child_tab[2 * (node & (MATCHFINDER_WINDOW_SIZE - 1)) + 0];
-}
-static forceinline mf_pos_t *
-bt_right_child(struct bt_matchfinder *mf, s32 node)
-{
-	return &mf->child_tab[2 * (node & (MATCHFINDER_WINDOW_SIZE - 1)) + 1];
-}
-/* The minimum permissible value of 'max_len' for bt_matchfinder_get_matches()
- * and bt_matchfinder_skip_position().  There must be sufficiently many bytes
- * remaining to load a 32-bit integer from the *next* position.  */
-#define BT_MATCHFINDER_REQUIRED_NBYTES	5
-/* Advance the binary tree matchfinder by one byte, optionally recording
- * matches.  @record_matches should be a compile-time constant.  */
-static forceinline struct lz_match *
-bt_matchfinder_advance_one_byte(struct bt_matchfinder * const restrict mf,
-				const u8 * const restrict in_base,
-				const ptrdiff_t cur_pos,
-				const u32 max_len,
-				const u32 nice_len,
-				const u32 max_search_depth,
-				u32 * const restrict next_hashes,
-				u32 * const restrict best_len_ret,
-				struct lz_match * restrict lz_matchptr,
-				const bool record_matches)
-{
-	const u8 *in_next = in_base + cur_pos;
-	u32 depth_remaining = max_search_depth;
-	const s32 cutoff = cur_pos - MATCHFINDER_WINDOW_SIZE;
-	u32 next_seq4;
-	u32 next_seq3;
-	u32 hash3;
-	u32 hash4;
-	s32 cur_node;
-#if BT_MATCHFINDER_HASH3_WAYS >= 2
-	s32 cur_node_2;
-#endif
-	const u8 *matchptr;
-	mf_pos_t *pending_lt_ptr, *pending_gt_ptr;
-	u32 best_lt_len, best_gt_len;
-	u32 len;
-	u32 best_len = 3;
-	STATIC_ASSERT(BT_MATCHFINDER_HASH3_WAYS >= 1 &&
-		      BT_MATCHFINDER_HASH3_WAYS <= 2);
-	next_seq4 = load_u32_unaligned(in_next + 1);
-	next_seq3 = loaded_u32_to_u24(next_seq4);
-	hash3 = next_hashes[0];
-	hash4 = next_hashes[1];
-	next_hashes[0] = lz_hash(next_seq3, BT_MATCHFINDER_HASH3_ORDER);
-	next_hashes[1] = lz_hash(next_seq4, BT_MATCHFINDER_HASH4_ORDER);
-	prefetchw(&mf->hash3_tab[next_hashes[0]]);
-	prefetchw(&mf->hash4_tab[next_hashes[1]]);
-	cur_node = mf->hash3_tab[hash3][0];
-	mf->hash3_tab[hash3][0] = cur_pos;
-#if BT_MATCHFINDER_HASH3_WAYS >= 2
-	cur_node_2 = mf->hash3_tab[hash3][1];
-	mf->hash3_tab[hash3][1] = cur_node;
-#endif
-	if (record_matches && cur_node > cutoff) {
-		u32 seq3 = load_u24_unaligned(in_next);
-		if (seq3 == load_u24_unaligned(&in_base[cur_node])) {
-			lz_matchptr->length = 3;
-			lz_matchptr->offset = in_next - &in_base[cur_node];
-			lz_matchptr++;
-		}
-	#if BT_MATCHFINDER_HASH3_WAYS >= 2
-		else if (cur_node_2 > cutoff &&
-			seq3 == load_u24_unaligned(&in_base[cur_node_2]))
-		{
-			lz_matchptr->length = 3;
-			lz_matchptr->offset = in_next - &in_base[cur_node_2];
-			lz_matchptr++;
-		}
-	#endif
-	}
-	cur_node = mf->hash4_tab[hash4];
-	mf->hash4_tab[hash4] = cur_pos;
-	pending_lt_ptr = bt_left_child(mf, cur_pos);
-	pending_gt_ptr = bt_right_child(mf, cur_pos);
-	if (cur_node <= cutoff) {
-		*pending_lt_ptr = MATCHFINDER_INITVAL;
-		*pending_gt_ptr = MATCHFINDER_INITVAL;
-		*best_len_ret = best_len;
-		return lz_matchptr;
-	}
-	best_lt_len = 0;
-	best_gt_len = 0;
-	len = 0;
-	for (;;) {
-		matchptr = &in_base[cur_node];
-		if (matchptr[len] == in_next[len]) {
-			len = lz_extend(in_next, matchptr, len + 1, max_len);
-			if (!record_matches || len > best_len) {
-				if (record_matches) {
-					best_len = len;
-					lz_matchptr->length = len;
-					lz_matchptr->offset = in_next - matchptr;
-					lz_matchptr++;
-				}
-				if (len >= nice_len) {
-					*pending_lt_ptr = *bt_left_child(mf, cur_node);
-					*pending_gt_ptr = *bt_right_child(mf, cur_node);
-					*best_len_ret = best_len;
-					return lz_matchptr;
-				}
-			}
-		}
-		if (matchptr[len] < in_next[len]) {
-			*pending_lt_ptr = cur_node;
-			pending_lt_ptr = bt_right_child(mf, cur_node);
-			cur_node = *pending_lt_ptr;
-			best_lt_len = len;
-			if (best_gt_len < len)
-				len = best_gt_len;
-		} else {
-			*pending_gt_ptr = cur_node;
-			pending_gt_ptr = bt_left_child(mf, cur_node);
-			cur_node = *pending_gt_ptr;
-			best_gt_len = len;
-			if (best_lt_len < len)
-				len = best_lt_len;
-		}
-		if (cur_node <= cutoff || !--depth_remaining) {
-			*pending_lt_ptr = MATCHFINDER_INITVAL;
-			*pending_gt_ptr = MATCHFINDER_INITVAL;
-			*best_len_ret = best_len;
-			return lz_matchptr;
-		}
-	}
-}
-/*
- * Retrieve a list of matches with the current position.
- *
- * @mf
- *	The matchfinder structure.
- * @in_base
- *	Pointer to the next byte in the input buffer to process _at the last
- *	time bt_matchfinder_init() or bt_matchfinder_slide_window() was called_.
- * @cur_pos
- *	The current position in the input buffer relative to @in_base (the
- *	position of the sequence being matched against).
- * @max_len
- *	The maximum permissible match length at this position.  Must be >=
- *	BT_MATCHFINDER_REQUIRED_NBYTES.
- * @nice_len
- *	Stop searching if a match of at least this length is found.
- *	Must be <= @max_len.
- * @max_search_depth
- *	Limit on the number of potential matches to consider.  Must be >= 1.
- * @next_hashes
- *	The precomputed hash codes for the sequence beginning at @in_next
- *	(i.e. @in_base + @cur_pos).  These will be used and then updated with
- *	the precomputed hash codes for the sequence beginning at @in_next + 1.
- * @best_len_ret
- *	If a match of length >= 4 was found, then the length of the longest such
- *	match is written here; otherwise 3 is written here.  (Note: this is
- *	redundant with the 'struct lz_match' array, but this is easier for the
- *	compiler to optimize when inlined and the caller immediately does a
- *	check against 'best_len'.)
- * @lz_matchptr
- *	An array in which this function will record the matches.  The recorded
- *	matches will be sorted by strictly increasing length and (non-strictly)
- *	increasing offset.  The maximum number of matches that may be found is
- *	'nice_len - 2'.
- *
- * The return value is a pointer to the next available slot in the @lz_matchptr
- * array.  (If no matches were found, this will be the same as @lz_matchptr.)
- */
-static forceinline struct lz_match *
-bt_matchfinder_get_matches(struct bt_matchfinder *mf,
-			   const u8 *in_base,
-			   ptrdiff_t cur_pos,
-			   u32 max_len,
-			   u32 nice_len,
-			   u32 max_search_depth,
-			   u32 next_hashes[2],
-			   u32 *best_len_ret,
-			   struct lz_match *lz_matchptr)
-{
-	return bt_matchfinder_advance_one_byte(mf,
-					       in_base,
-					       cur_pos,
-					       max_len,
-					       nice_len,
-					       max_search_depth,
-					       next_hashes,
-					       best_len_ret,
-					       lz_matchptr,
-					       true);
-}
-/*
- * Advance the matchfinder, but don't record any matches.
- *
- * This is very similar to bt_matchfinder_get_matches() because both functions
- * must do hashing and tree re-rooting.
- */
-static forceinline void
-bt_matchfinder_skip_position(struct bt_matchfinder *mf,
-			     const u8 *in_base,
-			     ptrdiff_t cur_pos,
-			     u32 nice_len,
-			     u32 max_search_depth,
-			     u32 next_hashes[2])
-{
-	u32 best_len;
-	bt_matchfinder_advance_one_byte(mf,
-					in_base,
-					cur_pos,
-					nice_len,
-					nice_len,
-					max_search_depth,
-					next_hashes,
-					&best_len,
-					NULL,
-					false);
-}