libdeflate 0.1.1 → 0.2.0
- checksums.yaml +5 -5
- data/.github/workflows/test.yml +34 -0
- data/README.md +1 -6
- data/ext/libdeflate/extconf.rb +18 -7
- data/ext/libdeflate/libdeflate_ext.c +17 -17
- data/lib/libdeflate/version.rb +1 -1
- data/libdeflate.gemspec +2 -1
- metadata +13 -84
- data/.gitmodules +0 -3
- data/.travis.yml +0 -5
- data/ext/libdeflate/libdeflate/.gitignore +0 -19
- data/ext/libdeflate/libdeflate/COPYING +0 -21
- data/ext/libdeflate/libdeflate/Makefile +0 -231
- data/ext/libdeflate/libdeflate/Makefile.msc +0 -64
- data/ext/libdeflate/libdeflate/NEWS +0 -57
- data/ext/libdeflate/libdeflate/README.md +0 -170
- data/ext/libdeflate/libdeflate/common/common_defs.h +0 -351
- data/ext/libdeflate/libdeflate/common/compiler_gcc.h +0 -134
- data/ext/libdeflate/libdeflate/common/compiler_msc.h +0 -95
- data/ext/libdeflate/libdeflate/lib/adler32.c +0 -213
- data/ext/libdeflate/libdeflate/lib/adler32_impl.h +0 -281
- data/ext/libdeflate/libdeflate/lib/aligned_malloc.c +0 -57
- data/ext/libdeflate/libdeflate/lib/aligned_malloc.h +0 -13
- data/ext/libdeflate/libdeflate/lib/bt_matchfinder.h +0 -357
- data/ext/libdeflate/libdeflate/lib/crc32.c +0 -368
- data/ext/libdeflate/libdeflate/lib/crc32_impl.h +0 -286
- data/ext/libdeflate/libdeflate/lib/crc32_table.h +0 -526
- data/ext/libdeflate/libdeflate/lib/decompress_impl.h +0 -404
- data/ext/libdeflate/libdeflate/lib/deflate_compress.c +0 -2817
- data/ext/libdeflate/libdeflate/lib/deflate_compress.h +0 -14
- data/ext/libdeflate/libdeflate/lib/deflate_constants.h +0 -66
- data/ext/libdeflate/libdeflate/lib/deflate_decompress.c +0 -889
- data/ext/libdeflate/libdeflate/lib/gzip_compress.c +0 -95
- data/ext/libdeflate/libdeflate/lib/gzip_constants.h +0 -45
- data/ext/libdeflate/libdeflate/lib/gzip_decompress.c +0 -130
- data/ext/libdeflate/libdeflate/lib/hc_matchfinder.h +0 -405
- data/ext/libdeflate/libdeflate/lib/lib_common.h +0 -35
- data/ext/libdeflate/libdeflate/lib/matchfinder_avx2.h +0 -53
- data/ext/libdeflate/libdeflate/lib/matchfinder_common.h +0 -205
- data/ext/libdeflate/libdeflate/lib/matchfinder_neon.h +0 -61
- data/ext/libdeflate/libdeflate/lib/matchfinder_sse2.h +0 -53
- data/ext/libdeflate/libdeflate/lib/unaligned.h +0 -202
- data/ext/libdeflate/libdeflate/lib/x86_cpu_features.c +0 -169
- data/ext/libdeflate/libdeflate/lib/x86_cpu_features.h +0 -48
- data/ext/libdeflate/libdeflate/lib/zlib_compress.c +0 -87
- data/ext/libdeflate/libdeflate/lib/zlib_constants.h +0 -21
- data/ext/libdeflate/libdeflate/lib/zlib_decompress.c +0 -91
- data/ext/libdeflate/libdeflate/libdeflate.h +0 -274
- data/ext/libdeflate/libdeflate/programs/benchmark.c +0 -558
- data/ext/libdeflate/libdeflate/programs/checksum.c +0 -197
- data/ext/libdeflate/libdeflate/programs/detect.sh +0 -62
- data/ext/libdeflate/libdeflate/programs/gzip.c +0 -603
- data/ext/libdeflate/libdeflate/programs/prog_util.c +0 -530
- data/ext/libdeflate/libdeflate/programs/prog_util.h +0 -162
- data/ext/libdeflate/libdeflate/programs/test_checksums.c +0 -135
- data/ext/libdeflate/libdeflate/programs/tgetopt.c +0 -118
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/Makefile +0 -12
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_compress/fuzz.c +0 -40
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_compress/inputs/0 +0 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_decompress/fuzz.c +0 -28
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_decompress/inputs/0 +0 -3
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/gzip_decompress/fuzz.c +0 -28
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/gzip_decompress/inputs/0 +0 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/prepare_for_fuzz.sh +0 -14
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/zlib_decompress/fuzz.c +0 -28
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/zlib_decompress/inputs/0 +0 -3
- data/ext/libdeflate/libdeflate/tools/android_build.sh +0 -104
- data/ext/libdeflate/libdeflate/tools/checksum_benchmarks.sh +0 -76
- data/ext/libdeflate/libdeflate/tools/exec_tests.sh +0 -30
- data/ext/libdeflate/libdeflate/tools/gen_crc32_multipliers.c +0 -108
- data/ext/libdeflate/libdeflate/tools/gen_crc32_table.c +0 -100
- data/ext/libdeflate/libdeflate/tools/gzip_tests.sh +0 -412
- data/ext/libdeflate/libdeflate/tools/make-windows-releases +0 -21
- data/ext/libdeflate/libdeflate/tools/mips_build.sh +0 -9
- data/ext/libdeflate/libdeflate/tools/msc_test.bat +0 -3
- data/ext/libdeflate/libdeflate/tools/pgo_build.sh +0 -23
- data/ext/libdeflate/libdeflate/tools/produce_gzip_benchmark_table.sh +0 -37
- data/ext/libdeflate/libdeflate/tools/run_tests.sh +0 -305
- data/ext/libdeflate/libdeflate/tools/windows_build.sh +0 -10
data/ext/libdeflate/libdeflate/lib/adler32_impl.h
@@ -1,281 +0,0 @@
-/*
- * adler32_impl.h
- *
- * Originally public domain; changes after 2016-09-07 are copyrighted.
- *
- * Copyright 2016 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/*
- * This file contains a template for vectorized Adler-32 implementations.
- *
- * The inner loop between reductions modulo 65521 of an unvectorized Adler-32
- * implementation looks something like this:
- *
- *	do {
- *		s1 += *p;
- *		s2 += s1;
- *	} while (++p != chunk_end);
- *
- * For vectorized calculation of s1, we only need to sum the input bytes.  They
- * can be accumulated into multiple counters which are eventually summed
- * together.
- *
- * For vectorized calculation of s2, the basic idea is that for each iteration
- * that processes N bytes, we can perform the following vectorizable
- * calculation:
- *
- *	s2 += N*byte_1 + (N-1)*byte_2 + (N-2)*byte_3 + ... + 1*byte_N
- *
- * Or, equivalently, we can sum the byte_1...byte_N for each iteration into N
- * separate counters, then do the multiplications by N...1 just once at the end
- * rather than once per iteration.
- *
- * Also, we must account for how previous bytes will affect s2 by doing the
- * following at beginning of each iteration:
- *
- *	s2 += s1 * N
- *
- * Furthermore, like s1, "s2" can actually be multiple counters which are
- * eventually summed together.
- */
-
-static u32 ATTRIBUTES
-FUNCNAME(u32 adler, const void *buffer, size_t size)
-{
-	u32 s1 = adler & 0xFFFF;
-	u32 s2 = adler >> 16;
-	const u8 *p = buffer;
-	const u8 * const end = p + size;
-	const u8 *vend;
-
-	/* Process a byte at a time until the required alignment is reached. */
-	if (p != end && (uintptr_t)p % ALIGNMENT_REQUIRED) {
-		do {
-			s1 += *p++;
-			s2 += s1;
-		} while (p != end && (uintptr_t)p % ALIGNMENT_REQUIRED);
-		s1 %= DIVISOR;
-		s2 %= DIVISOR;
-	}
-
-	/*
-	 * Process "chunks" of bytes using vector instructions.  Chunk sizes are
-	 * limited to MAX_BYTES_PER_CHUNK, which guarantees that s1 and s2 never
-	 * overflow before being reduced modulo DIVISOR.  For vector processing,
-	 * chunk sizes are also made evenly divisible by BYTES_PER_ITERATION.
-	 */
-	STATIC_ASSERT(BYTES_PER_ITERATION % ALIGNMENT_REQUIRED == 0);
-	vend = end - ((size_t)(end - p) % BYTES_PER_ITERATION);
-	while (p != vend) {
-		size_t chunk_size;
-		const u8 *chunk_end;
-
-		chunk_size = MIN((size_t)(vend - p), MAX_BYTES_PER_CHUNK);
-#if TARGET == TARGET_SSE2
-		/* SSE2: the 16-bit precision byte counters must not undergo
-		 * *signed* overflow, otherwise the signed multiplication at the
-		 * end will not behave as desired. */
-		chunk_size = MIN(chunk_size, BYTES_PER_ITERATION * (0x7FFF / 0xFF));
-#elif TARGET == TARGET_NEON
-		/* NEON: the 16-bit precision counters must not undergo
-		 * *unsigned* overflow. */
-		chunk_size = MIN(chunk_size, BYTES_PER_ITERATION * (0xFFFF / 0xFF));
-#endif
-		chunk_size -= chunk_size % BYTES_PER_ITERATION;
-
-		chunk_end = p + chunk_size;
-
-		s2 += s1 * chunk_size;
-		{
-#if TARGET == TARGET_AVX2
-			/* AVX2 implementation */
-			const __m256i zeroes = _mm256_setzero_si256();
-			const __v32qi multipliers = (__v32qi) { 32, 31, 30, 29, 28, 27, 26, 25,
-								24, 23, 22, 21, 20, 19, 18, 17,
-								16, 15, 14, 13, 12, 11, 10, 9,
-								8,  7,  6,  5,  4,  3,  2,  1 };
-			const __v16hi ones = (__v16hi)_mm256_set1_epi16(1);
-			__v8si v_s1 = (__v8si)zeroes;
-			__v8si v_s1_sums = (__v8si)zeroes;
-			__v8si v_s2 = (__v8si)zeroes;
-			STATIC_ASSERT(ALIGNMENT_REQUIRED == 32 && BYTES_PER_ITERATION == 32);
-			do {
-				__m256i bytes = *(const __m256i *)p;
-				__v16hi sums = (__v16hi)_mm256_maddubs_epi16(
-						bytes, (__m256i)multipliers);
-				v_s1_sums += v_s1;
-				v_s1 += (__v8si)_mm256_sad_epu8(bytes, zeroes);
-				v_s2 += (__v8si)_mm256_madd_epi16((__m256i)sums, (__m256i)ones);
-			} while ((p += BYTES_PER_ITERATION) != chunk_end);
-
-			v_s1 = (__v8si)_mm256_hadd_epi32((__m256i)v_s1, zeroes);
-			v_s1 = (__v8si)_mm256_hadd_epi32((__m256i)v_s1, zeroes);
-			s1 += v_s1[0] + v_s1[4];
-
-			v_s2 += (__v8si)_mm256_slli_epi32((__m256i)v_s1_sums, 5);
-			v_s2 = (__v8si)_mm256_hadd_epi32((__m256i)v_s2, zeroes);
-			v_s2 = (__v8si)_mm256_hadd_epi32((__m256i)v_s2, zeroes);
-			s2 += v_s2[0] + v_s2[4];
-
-#elif TARGET == TARGET_SSE2
-			/* SSE2 implementation */
-			const __m128i zeroes = _mm_setzero_si128();
-
-			/* s1 counters: 32-bit, sum of bytes */
-			__v4si v_s1 = (__v4si)zeroes;
-
-			/* s2 counters: 32-bit, sum of s1 values */
-			__v4si v_s2 = (__v4si)zeroes;
-
-			/*
-			 * Thirty-two 16-bit counters for byte sums.  Each accumulates
-			 * the bytes that eventually need to be multiplied by a number
-			 * 32...1 for addition into s2.
-			 */
-			__v8hi v_byte_sums_a = (__v8hi)zeroes;
-			__v8hi v_byte_sums_b = (__v8hi)zeroes;
-			__v8hi v_byte_sums_c = (__v8hi)zeroes;
-			__v8hi v_byte_sums_d = (__v8hi)zeroes;
-
-			STATIC_ASSERT(ALIGNMENT_REQUIRED == 16 && BYTES_PER_ITERATION == 32);
-			do {
-				/* Load the next 32 bytes. */
-				const __m128i bytes1 = *(const __m128i *)p;
-				const __m128i bytes2 = *(const __m128i *)(p + 16);
-
-				/*
-				 * Accumulate the previous s1 counters into the s2
-				 * counters.  Logically, this really should be
-				 * v_s2 += v_s1 * BYTES_PER_ITERATION, but we can do the
-				 * multiplication (or left shift) later.
-				 */
-				v_s2 += v_s1;
-
-				/*
-				 * s1 update: use "Packed Sum of Absolute Differences"
-				 * to add the bytes horizontally with 8 bytes per sum.
-				 * Then add the sums to the s1 counters.
-				 */
-				v_s1 += (__v4si)_mm_sad_epu8(bytes1, zeroes);
-				v_s1 += (__v4si)_mm_sad_epu8(bytes2, zeroes);
-
-				/*
-				 * Also accumulate the bytes into 32 separate counters
-				 * that have 16-bit precision.
-				 */
-				v_byte_sums_a += (__v8hi)_mm_unpacklo_epi8(bytes1, zeroes);
-				v_byte_sums_b += (__v8hi)_mm_unpackhi_epi8(bytes1, zeroes);
-				v_byte_sums_c += (__v8hi)_mm_unpacklo_epi8(bytes2, zeroes);
-				v_byte_sums_d += (__v8hi)_mm_unpackhi_epi8(bytes2, zeroes);
-
-			} while ((p += BYTES_PER_ITERATION) != chunk_end);
-
-			/* Finish calculating the s2 counters. */
-			v_s2 = (__v4si)_mm_slli_epi32((__m128i)v_s2, 5);
-			v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_a,
-						       (__m128i)(__v8hi){ 32, 31, 30, 29, 28, 27, 26, 25 });
-			v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_b,
-						       (__m128i)(__v8hi){ 24, 23, 22, 21, 20, 19, 18, 17 });
-			v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_c,
-						       (__m128i)(__v8hi){ 16, 15, 14, 13, 12, 11, 10, 9 });
-			v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_d,
-						       (__m128i)(__v8hi){ 8, 7, 6, 5, 4, 3, 2, 1 });
-
-			/* Now accumulate what we computed into the real s1 and s2. */
-			v_s1 += (__v4si)_mm_shuffle_epi32((__m128i)v_s1, 0x31);
-			v_s1 += (__v4si)_mm_shuffle_epi32((__m128i)v_s1, 0x02);
-			s1 += _mm_cvtsi128_si32((__m128i)v_s1);
-
-			v_s2 += (__v4si)_mm_shuffle_epi32((__m128i)v_s2, 0x31);
-			v_s2 += (__v4si)_mm_shuffle_epi32((__m128i)v_s2, 0x02);
-			s2 += _mm_cvtsi128_si32((__m128i)v_s2);
-
-#elif TARGET == TARGET_NEON
-			/* ARM NEON (Advanced SIMD) implementation */
-			uint32x4_t v_s1 = (uint32x4_t) { 0, 0, 0, 0 };
-			uint32x4_t v_s2 = (uint32x4_t) { 0, 0, 0, 0 };
-			uint16x8_t v_byte_sums_a = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
-			uint16x8_t v_byte_sums_b = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
-			uint16x8_t v_byte_sums_c = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
-			uint16x8_t v_byte_sums_d = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
-
-			STATIC_ASSERT(ALIGNMENT_REQUIRED == 16 && BYTES_PER_ITERATION == 32);
-			do {
-				const uint8x16_t bytes1 = *(const uint8x16_t *)p;
-				const uint8x16_t bytes2 = *(const uint8x16_t *)(p + 16);
-				uint16x8_t tmp;
-
-				v_s2 += v_s1;
-
-				tmp = vpaddlq_u8(bytes1);
-				tmp = vpadalq_u8(tmp, bytes2);
-				v_s1 = vpadalq_u16(v_s1, tmp);
-
-				v_byte_sums_a = vaddw_u8(v_byte_sums_a, vget_low_u8(bytes1));
-				v_byte_sums_b = vaddw_u8(v_byte_sums_b, vget_high_u8(bytes1));
-				v_byte_sums_c = vaddw_u8(v_byte_sums_c, vget_low_u8(bytes2));
-				v_byte_sums_d = vaddw_u8(v_byte_sums_d, vget_high_u8(bytes2));
-
-			} while ((p += BYTES_PER_ITERATION) != chunk_end);
-
-			v_s2 = vqshlq_n_u32(v_s2, 5);
-			v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_a), (uint16x4_t) { 32, 31, 30, 29 });
-			v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_a), (uint16x4_t) { 28, 27, 26, 25 });
-			v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_b), (uint16x4_t) { 24, 23, 22, 21 });
-			v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_b), (uint16x4_t) { 20, 19, 18, 17 });
-			v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_c), (uint16x4_t) { 16, 15, 14, 13 });
-			v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_c), (uint16x4_t) { 12, 11, 10, 9 });
-			v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_d), (uint16x4_t) { 8, 7, 6, 5 });
-			v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_d), (uint16x4_t) { 4, 3, 2, 1 });
-
-			s1 += v_s1[0] + v_s1[1] + v_s1[2] + v_s1[3];
-			s2 += v_s2[0] + v_s2[1] + v_s2[2] + v_s2[3];
-#else
-#  error "BUG: unknown target"
-#endif
-		}
-
-		s1 %= DIVISOR;
-		s2 %= DIVISOR;
-	}
-
-	/* Process any remaining bytes. */
-	if (p != end) {
-		do {
-			s1 += *p++;
-			s2 += s1;
-		} while (p != end);
-		s1 %= DIVISOR;
-		s2 %= DIVISOR;
-	}
-
-	return (s2 << 16) | s1;
-}
-
-#undef FUNCNAME
-#undef TARGET
-#undef ALIGNMENT_REQUIRED
-#undef BYTES_PER_ITERATION
-#undef ATTRIBUTES
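
For reference, the s1/s2 recurrence that the removed template vectorizes is easier to see in plain scalar C. The sketch below is not code from the gem; it is a minimal standalone version of the same checksum, where 5552 is the well-known largest chunk length for which 32-bit s1 and s2 cannot overflow before the reductions modulo 65521:

#include <stddef.h>
#include <stdint.h>

static uint32_t
adler32_scalar(uint32_t adler, const uint8_t *p, size_t size)
{
	uint32_t s1 = adler & 0xFFFF;	/* running sum of bytes */
	uint32_t s2 = adler >> 16;	/* running sum of the s1 values */

	while (size) {
		/* Largest chunk for which unreduced 32-bit s1/s2 can't overflow. */
		size_t chunk = size < 5552 ? size : 5552;

		size -= chunk;
		do {
			s1 += *p++;
			s2 += s1;
		} while (--chunk);
		s1 %= 65521;
		s2 %= 65521;
	}
	return (s2 << 16) | s1;
}

The removed file's MAX_BYTES_PER_CHUNK plays the same role as the 5552 bound here, except that the vector versions carry s1 and s2 across multiple per-lane counters that are summed before each reduction.
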
data/ext/libdeflate/libdeflate/lib/aligned_malloc.c
@@ -1,57 +0,0 @@
-/*
- * aligned_malloc.c - aligned memory allocation
- *
- * Originally public domain; changes after 2016-09-07 are copyrighted.
- *
- * Copyright 2016 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/*
- * This file provides portable aligned memory allocation functions that only
- * use malloc() and free().  This avoids portability problems with
- * posix_memalign(), aligned_alloc(), etc.
- */
-
-#include <stdlib.h>
-
-#include "aligned_malloc.h"
-
-void *
-aligned_malloc(size_t alignment, size_t size)
-{
-	void *ptr = malloc(sizeof(void *) + alignment - 1 + size);
-	if (ptr) {
-		void *orig_ptr = ptr;
-		ptr = (void *)ALIGN((uintptr_t)ptr + sizeof(void *), alignment);
-		((void **)ptr)[-1] = orig_ptr;
-	}
-	return ptr;
-}
-
-void
-aligned_free(void *ptr)
-{
-	if (ptr)
-		free(((void **)ptr)[-1]);
-}
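
The trick in the aligned_malloc() above: over-allocate, round the pointer up, and stash malloc()'s original return value in the slot just before the aligned block so aligned_free() can recover it. A standalone illustration follows; it is a sketch, not the gem's code, and it inlines the round-up that the gem's ALIGN() macro (defined elsewhere in the bundled sources) performs:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	size_t alignment = 64, size = 100;

	/* Over-allocate: room for the saved pointer plus worst-case padding. */
	void *raw = malloc(sizeof(void *) + alignment - 1 + size);
	if (!raw)
		return 1;

	/* Round up past the saved-pointer slot to the next multiple of
	 * 'alignment' (the round-up that ALIGN() expresses in the gem). */
	uintptr_t addr = ((uintptr_t)raw + sizeof(void *) + alignment - 1) &
			 ~(uintptr_t)(alignment - 1);
	void *aligned = (void *)addr;

	/* Stash the original pointer just before the aligned block. */
	((void **)aligned)[-1] = raw;

	printf("raw=%p aligned=%p (address %% %zu == %zu)\n",
	       raw, aligned, alignment, (size_t)(addr % alignment));

	/* aligned_free() does exactly this: recover and free the original. */
	free(((void **)aligned)[-1]);
	return 0;
}
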
data/ext/libdeflate/libdeflate/lib/aligned_malloc.h
@@ -1,13 +0,0 @@
-/*
- * aligned_malloc.c - aligned memory allocation
- */
-
-#ifndef LIB_ALIGNED_MALLOC_H
-#define LIB_ALIGNED_MALLOC_H
-
-#include "lib_common.h"
-
-extern void *aligned_malloc(size_t alignment, size_t size);
-extern void aligned_free(void *ptr);
-
-#endif /* LIB_ALIGNED_MALLOC_H */
data/ext/libdeflate/libdeflate/lib/bt_matchfinder.h
@@ -1,357 +0,0 @@
-/*
- * bt_matchfinder.h - Lempel-Ziv matchfinding with a hash table of binary trees
- *
- * Originally public domain; changes after 2016-09-07 are copyrighted.
- *
- * Copyright 2016 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- *
- * ----------------------------------------------------------------------------
- *
- * This is a Binary Trees (bt) based matchfinder.
- *
- * The main data structure is a hash table where each hash bucket contains a
- * binary tree of sequences whose first 4 bytes share the same hash code.  Each
- * sequence is identified by its starting position in the input buffer.  Each
- * binary tree is always sorted such that each left child represents a sequence
- * lexicographically lesser than its parent and each right child represents a
- * sequence lexicographically greater than its parent.
- *
- * The algorithm processes the input buffer sequentially.  At each byte
- * position, the hash code of the first 4 bytes of the sequence beginning at
- * that position (the sequence being matched against) is computed.  This
- * identifies the hash bucket to use for that position.  Then, a new binary tree
- * node is created to represent the current sequence.  Then, in a single tree
- * traversal, the hash bucket's binary tree is searched for matches and is
- * re-rooted at the new node.
- *
- * Compared to the simpler algorithm that uses linked lists instead of binary
- * trees (see hc_matchfinder.h), the binary tree version gains more information
- * at each node visitation.  Ideally, the binary tree version will examine only
- * 'log(n)' nodes to find the same matches that the linked list version will
- * find by examining 'n' nodes.  In addition, the binary tree version can
- * examine fewer bytes at each node by taking advantage of the common prefixes
- * that result from the sort order, whereas the linked list version may have to
- * examine up to the full length of the match at each node.
- *
- * However, it is not always best to use the binary tree version.  It requires
- * nearly twice as much memory as the linked list version, and it takes time to
- * keep the binary trees sorted, even at positions where the compressor does not
- * need matches.  Generally, when doing fast compression on small buffers,
- * binary trees are the wrong approach.  They are best suited for thorough
- * compression and/or large buffers.
- *
- * ----------------------------------------------------------------------------
- */
-
-
-#include "matchfinder_common.h"
-
-#define BT_MATCHFINDER_HASH3_ORDER 16
-#define BT_MATCHFINDER_HASH3_WAYS  2
-#define BT_MATCHFINDER_HASH4_ORDER 16
-
-#define BT_MATCHFINDER_TOTAL_HASH_LENGTH		\
-	((1UL << BT_MATCHFINDER_HASH3_ORDER) * BT_MATCHFINDER_HASH3_WAYS + \
-	 (1UL << BT_MATCHFINDER_HASH4_ORDER))
-
-/* Representation of a match found by the bt_matchfinder  */
-struct lz_match {
-
-	/* The number of bytes matched. */
-	u16 length;
-
-	/* The offset back from the current position that was matched. */
-	u16 offset;
-};
-
-struct bt_matchfinder {
-
-	/* The hash table for finding length 3 matches */
-	mf_pos_t hash3_tab[1UL << BT_MATCHFINDER_HASH3_ORDER][BT_MATCHFINDER_HASH3_WAYS];
-
-	/* The hash table which contains the roots of the binary trees for
-	 * finding length 4+ matches */
-	mf_pos_t hash4_tab[1UL << BT_MATCHFINDER_HASH4_ORDER];
-
-	/* The child node references for the binary trees.  The left and right
-	 * children of the node for the sequence with position 'pos' are
-	 * 'child_tab[pos * 2]' and 'child_tab[pos * 2 + 1]', respectively. */
-	mf_pos_t child_tab[2UL * MATCHFINDER_WINDOW_SIZE];
-
-}
-#ifdef _aligned_attribute
-_aligned_attribute(MATCHFINDER_ALIGNMENT)
-#endif
-;
-
-/* Prepare the matchfinder for a new input buffer. */
-static forceinline void
-bt_matchfinder_init(struct bt_matchfinder *mf)
-{
-	matchfinder_init((mf_pos_t *)mf, BT_MATCHFINDER_TOTAL_HASH_LENGTH);
-}
-
-static forceinline void
-bt_matchfinder_slide_window(struct bt_matchfinder *mf)
-{
-	matchfinder_rebase((mf_pos_t *)mf,
-			   sizeof(struct bt_matchfinder) / sizeof(mf_pos_t));
-}
-
-static forceinline mf_pos_t *
-bt_left_child(struct bt_matchfinder *mf, s32 node)
-{
-	return &mf->child_tab[2 * (node & (MATCHFINDER_WINDOW_SIZE - 1)) + 0];
-}
-
-static forceinline mf_pos_t *
-bt_right_child(struct bt_matchfinder *mf, s32 node)
-{
-	return &mf->child_tab[2 * (node & (MATCHFINDER_WINDOW_SIZE - 1)) + 1];
-}
-
-/* The minimum permissible value of 'max_len' for bt_matchfinder_get_matches()
- * and bt_matchfinder_skip_position().  There must be sufficiently many bytes
- * remaining to load a 32-bit integer from the *next* position. */
-#define BT_MATCHFINDER_REQUIRED_NBYTES	5
-
-/* Advance the binary tree matchfinder by one byte, optionally recording
- * matches.  @record_matches should be a compile-time constant. */
-static forceinline struct lz_match *
-bt_matchfinder_advance_one_byte(struct bt_matchfinder * const restrict mf,
-				const u8 * const restrict in_base,
-				const ptrdiff_t cur_pos,
-				const u32 max_len,
-				const u32 nice_len,
-				const u32 max_search_depth,
-				u32 * const restrict next_hashes,
-				u32 * const restrict best_len_ret,
-				struct lz_match * restrict lz_matchptr,
-				const bool record_matches)
-{
-	const u8 *in_next = in_base + cur_pos;
-	u32 depth_remaining = max_search_depth;
-	const s32 cutoff = cur_pos - MATCHFINDER_WINDOW_SIZE;
-	u32 next_seq4;
-	u32 next_seq3;
-	u32 hash3;
-	u32 hash4;
-	s32 cur_node;
-#if BT_MATCHFINDER_HASH3_WAYS >= 2
-	s32 cur_node_2;
-#endif
-	const u8 *matchptr;
-	mf_pos_t *pending_lt_ptr, *pending_gt_ptr;
-	u32 best_lt_len, best_gt_len;
-	u32 len;
-	u32 best_len = 3;
-
-	STATIC_ASSERT(BT_MATCHFINDER_HASH3_WAYS >= 1 &&
-		      BT_MATCHFINDER_HASH3_WAYS <= 2);
-
-	next_seq4 = load_u32_unaligned(in_next + 1);
-	next_seq3 = loaded_u32_to_u24(next_seq4);
-
-	hash3 = next_hashes[0];
-	hash4 = next_hashes[1];
-
-	next_hashes[0] = lz_hash(next_seq3, BT_MATCHFINDER_HASH3_ORDER);
-	next_hashes[1] = lz_hash(next_seq4, BT_MATCHFINDER_HASH4_ORDER);
-	prefetchw(&mf->hash3_tab[next_hashes[0]]);
-	prefetchw(&mf->hash4_tab[next_hashes[1]]);
-
-	cur_node = mf->hash3_tab[hash3][0];
-	mf->hash3_tab[hash3][0] = cur_pos;
-#if BT_MATCHFINDER_HASH3_WAYS >= 2
-	cur_node_2 = mf->hash3_tab[hash3][1];
-	mf->hash3_tab[hash3][1] = cur_node;
-#endif
-	if (record_matches && cur_node > cutoff) {
-		u32 seq3 = load_u24_unaligned(in_next);
-		if (seq3 == load_u24_unaligned(&in_base[cur_node])) {
-			lz_matchptr->length = 3;
-			lz_matchptr->offset = in_next - &in_base[cur_node];
-			lz_matchptr++;
-		}
-	#if BT_MATCHFINDER_HASH3_WAYS >= 2
-		else if (cur_node_2 > cutoff &&
-			 seq3 == load_u24_unaligned(&in_base[cur_node_2]))
-		{
-			lz_matchptr->length = 3;
-			lz_matchptr->offset = in_next - &in_base[cur_node_2];
-			lz_matchptr++;
-		}
-	#endif
-	}
-
-	cur_node = mf->hash4_tab[hash4];
-	mf->hash4_tab[hash4] = cur_pos;
-
-	pending_lt_ptr = bt_left_child(mf, cur_pos);
-	pending_gt_ptr = bt_right_child(mf, cur_pos);
-
-	if (cur_node <= cutoff) {
-		*pending_lt_ptr = MATCHFINDER_INITVAL;
-		*pending_gt_ptr = MATCHFINDER_INITVAL;
-		*best_len_ret = best_len;
-		return lz_matchptr;
-	}
-
-	best_lt_len = 0;
-	best_gt_len = 0;
-	len = 0;
-
-	for (;;) {
-		matchptr = &in_base[cur_node];
-
-		if (matchptr[len] == in_next[len]) {
-			len = lz_extend(in_next, matchptr, len + 1, max_len);
-			if (!record_matches || len > best_len) {
-				if (record_matches) {
-					best_len = len;
-					lz_matchptr->length = len;
-					lz_matchptr->offset = in_next - matchptr;
-					lz_matchptr++;
-				}
-				if (len >= nice_len) {
-					*pending_lt_ptr = *bt_left_child(mf, cur_node);
-					*pending_gt_ptr = *bt_right_child(mf, cur_node);
-					*best_len_ret = best_len;
-					return lz_matchptr;
-				}
-			}
-		}
-
-		if (matchptr[len] < in_next[len]) {
-			*pending_lt_ptr = cur_node;
-			pending_lt_ptr = bt_right_child(mf, cur_node);
-			cur_node = *pending_lt_ptr;
-			best_lt_len = len;
-			if (best_gt_len < len)
-				len = best_gt_len;
-		} else {
-			*pending_gt_ptr = cur_node;
-			pending_gt_ptr = bt_left_child(mf, cur_node);
-			cur_node = *pending_gt_ptr;
-			best_gt_len = len;
-			if (best_lt_len < len)
-				len = best_lt_len;
-		}
-
-		if (cur_node <= cutoff || !--depth_remaining) {
-			*pending_lt_ptr = MATCHFINDER_INITVAL;
-			*pending_gt_ptr = MATCHFINDER_INITVAL;
-			*best_len_ret = best_len;
-			return lz_matchptr;
-		}
-	}
-}
-
-/*
- * Retrieve a list of matches with the current position.
- *
- * @mf
- *	The matchfinder structure.
- * @in_base
- *	Pointer to the next byte in the input buffer to process _at the last
- *	time bt_matchfinder_init() or bt_matchfinder_slide_window() was called_.
- * @cur_pos
- *	The current position in the input buffer relative to @in_base (the
- *	position of the sequence being matched against).
- * @max_len
- *	The maximum permissible match length at this position.  Must be >=
- *	BT_MATCHFINDER_REQUIRED_NBYTES.
- * @nice_len
- *	Stop searching if a match of at least this length is found.
- *	Must be <= @max_len.
- * @max_search_depth
- *	Limit on the number of potential matches to consider.  Must be >= 1.
- * @next_hashes
- *	The precomputed hash codes for the sequence beginning at @in_next.
- *	These will be used and then updated with the precomputed hashcodes for
- *	the sequence beginning at @in_next + 1.
- * @best_len_ret
- *	If a match of length >= 4 was found, then the length of the longest such
- *	match is written here; otherwise 3 is written here.  (Note: this is
- *	redundant with the 'struct lz_match' array, but this is easier for the
- *	compiler to optimize when inlined and the caller immediately does a
- *	check against 'best_len'.)
- * @lz_matchptr
- *	An array in which this function will record the matches.  The recorded
- *	matches will be sorted by strictly increasing length and (non-strictly)
- *	increasing offset.  The maximum number of matches that may be found is
- *	'nice_len - 2'.
- *
- * The return value is a pointer to the next available slot in the @lz_matchptr
- * array.  (If no matches were found, this will be the same as @lz_matchptr.)
- */
-static forceinline struct lz_match *
-bt_matchfinder_get_matches(struct bt_matchfinder *mf,
-			   const u8 *in_base,
-			   ptrdiff_t cur_pos,
-			   u32 max_len,
-			   u32 nice_len,
-			   u32 max_search_depth,
-			   u32 next_hashes[2],
-			   u32 *best_len_ret,
-			   struct lz_match *lz_matchptr)
-{
-	return bt_matchfinder_advance_one_byte(mf,
-					       in_base,
-					       cur_pos,
-					       max_len,
-					       nice_len,
-					       max_search_depth,
-					       next_hashes,
-					       best_len_ret,
-					       lz_matchptr,
-					       true);
-}
-
-/*
- * Advance the matchfinder, but don't record any matches.
- *
- * This is very similar to bt_matchfinder_get_matches() because both functions
- * must do hashing and tree re-rooting.
- */
-static forceinline void
-bt_matchfinder_skip_position(struct bt_matchfinder *mf,
-			     const u8 *in_base,
-			     ptrdiff_t cur_pos,
-			     u32 nice_len,
-			     u32 max_search_depth,
-			     u32 next_hashes[2])
-{
-	u32 best_len;
-	bt_matchfinder_advance_one_byte(mf,
-					in_base,
-					cur_pos,
-					nice_len,
-					nice_len,
-					max_search_depth,
-					next_hashes,
-					&best_len,
-					NULL,
-					false);
-}
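
The "search and re-root in a single traversal" step in the deleted file is the classic binary-search-tree split operation: the new node becomes the root, and every visited node is hooked onto either a pending "lesser" link or a pending "greater" link, just as pending_lt_ptr and pending_gt_ptr do above (there they point into child_tab rather than at heap nodes). A toy standalone sketch, using strings in place of window positions and pointers in place of mf_pos_t indices, purely for illustration:

#include <stdio.h>
#include <string.h>

struct node {
	const char *key;
	struct node *left, *right;
};

/* Make 'new_node' the root in one pass over the old tree, attaching each
 * visited node to the chain of keys lesser than the new key or to the chain
 * of keys greater than it. */
static struct node *
insert_and_reroot(struct node *root, struct node *new_node)
{
	struct node **pending_lt = &new_node->left;
	struct node **pending_gt = &new_node->right;

	while (root) {
		if (strcmp(root->key, new_node->key) < 0) {
			*pending_lt = root;	/* root and its left subtree are lesser */
			pending_lt = &root->right;
			root = root->right;	/* right subtree still unsorted vs. new key */
		} else {
			*pending_gt = root;	/* root and its right subtree are greater */
			pending_gt = &root->left;
			root = root->left;
		}
	}
	*pending_lt = NULL;	/* the deleted code writes MATCHFINDER_INITVAL here */
	*pending_gt = NULL;
	return new_node;
}

static void
inorder(const struct node *n)
{
	if (n) {
		inorder(n->left);
		printf("%s ", n->key);
		inorder(n->right);
	}
}

int main(void)
{
	const char *keys[] = { "banana", "apple", "cherry", "blueberry" };
	struct node nodes[4] = { { 0 } };
	struct node *root = NULL;

	for (int i = 0; i < 4; i++) {
		nodes[i].key = keys[i];
		root = insert_and_reroot(root, &nodes[i]);
	}
	inorder(root);	/* prints the keys in sorted (lexicographic) order */
	printf("\n");
	return 0;
}

In the real matchfinder the comparison is done byte by byte on the input sequences, and best_lt_len/best_gt_len let the search resume each comparison past the prefix already known to match, which is the "examine fewer bytes at each node" advantage the file's header comment describes.
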