libdeflate 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. checksums.yaml +5 -5
  2. data/.github/workflows/test.yml +34 -0
  3. data/README.md +1 -6
  4. data/ext/libdeflate/extconf.rb +18 -7
  5. data/ext/libdeflate/libdeflate_ext.c +17 -17
  6. data/lib/libdeflate/version.rb +1 -1
  7. data/libdeflate.gemspec +2 -1
  8. metadata +13 -84
  9. data/.gitmodules +0 -3
  10. data/.travis.yml +0 -5
  11. data/ext/libdeflate/libdeflate/.gitignore +0 -19
  12. data/ext/libdeflate/libdeflate/COPYING +0 -21
  13. data/ext/libdeflate/libdeflate/Makefile +0 -231
  14. data/ext/libdeflate/libdeflate/Makefile.msc +0 -64
  15. data/ext/libdeflate/libdeflate/NEWS +0 -57
  16. data/ext/libdeflate/libdeflate/README.md +0 -170
  17. data/ext/libdeflate/libdeflate/common/common_defs.h +0 -351
  18. data/ext/libdeflate/libdeflate/common/compiler_gcc.h +0 -134
  19. data/ext/libdeflate/libdeflate/common/compiler_msc.h +0 -95
  20. data/ext/libdeflate/libdeflate/lib/adler32.c +0 -213
  21. data/ext/libdeflate/libdeflate/lib/adler32_impl.h +0 -281
  22. data/ext/libdeflate/libdeflate/lib/aligned_malloc.c +0 -57
  23. data/ext/libdeflate/libdeflate/lib/aligned_malloc.h +0 -13
  24. data/ext/libdeflate/libdeflate/lib/bt_matchfinder.h +0 -357
  25. data/ext/libdeflate/libdeflate/lib/crc32.c +0 -368
  26. data/ext/libdeflate/libdeflate/lib/crc32_impl.h +0 -286
  27. data/ext/libdeflate/libdeflate/lib/crc32_table.h +0 -526
  28. data/ext/libdeflate/libdeflate/lib/decompress_impl.h +0 -404
  29. data/ext/libdeflate/libdeflate/lib/deflate_compress.c +0 -2817
  30. data/ext/libdeflate/libdeflate/lib/deflate_compress.h +0 -14
  31. data/ext/libdeflate/libdeflate/lib/deflate_constants.h +0 -66
  32. data/ext/libdeflate/libdeflate/lib/deflate_decompress.c +0 -889
  33. data/ext/libdeflate/libdeflate/lib/gzip_compress.c +0 -95
  34. data/ext/libdeflate/libdeflate/lib/gzip_constants.h +0 -45
  35. data/ext/libdeflate/libdeflate/lib/gzip_decompress.c +0 -130
  36. data/ext/libdeflate/libdeflate/lib/hc_matchfinder.h +0 -405
  37. data/ext/libdeflate/libdeflate/lib/lib_common.h +0 -35
  38. data/ext/libdeflate/libdeflate/lib/matchfinder_avx2.h +0 -53
  39. data/ext/libdeflate/libdeflate/lib/matchfinder_common.h +0 -205
  40. data/ext/libdeflate/libdeflate/lib/matchfinder_neon.h +0 -61
  41. data/ext/libdeflate/libdeflate/lib/matchfinder_sse2.h +0 -53
  42. data/ext/libdeflate/libdeflate/lib/unaligned.h +0 -202
  43. data/ext/libdeflate/libdeflate/lib/x86_cpu_features.c +0 -169
  44. data/ext/libdeflate/libdeflate/lib/x86_cpu_features.h +0 -48
  45. data/ext/libdeflate/libdeflate/lib/zlib_compress.c +0 -87
  46. data/ext/libdeflate/libdeflate/lib/zlib_constants.h +0 -21
  47. data/ext/libdeflate/libdeflate/lib/zlib_decompress.c +0 -91
  48. data/ext/libdeflate/libdeflate/libdeflate.h +0 -274
  49. data/ext/libdeflate/libdeflate/programs/benchmark.c +0 -558
  50. data/ext/libdeflate/libdeflate/programs/checksum.c +0 -197
  51. data/ext/libdeflate/libdeflate/programs/detect.sh +0 -62
  52. data/ext/libdeflate/libdeflate/programs/gzip.c +0 -603
  53. data/ext/libdeflate/libdeflate/programs/prog_util.c +0 -530
  54. data/ext/libdeflate/libdeflate/programs/prog_util.h +0 -162
  55. data/ext/libdeflate/libdeflate/programs/test_checksums.c +0 -135
  56. data/ext/libdeflate/libdeflate/programs/tgetopt.c +0 -118
  57. data/ext/libdeflate/libdeflate/tools/afl-fuzz/Makefile +0 -12
  58. data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_compress/fuzz.c +0 -40
  59. data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_compress/inputs/0 +0 -0
  60. data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_decompress/fuzz.c +0 -28
  61. data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_decompress/inputs/0 +0 -3
  62. data/ext/libdeflate/libdeflate/tools/afl-fuzz/gzip_decompress/fuzz.c +0 -28
  63. data/ext/libdeflate/libdeflate/tools/afl-fuzz/gzip_decompress/inputs/0 +0 -0
  64. data/ext/libdeflate/libdeflate/tools/afl-fuzz/prepare_for_fuzz.sh +0 -14
  65. data/ext/libdeflate/libdeflate/tools/afl-fuzz/zlib_decompress/fuzz.c +0 -28
  66. data/ext/libdeflate/libdeflate/tools/afl-fuzz/zlib_decompress/inputs/0 +0 -3
  67. data/ext/libdeflate/libdeflate/tools/android_build.sh +0 -104
  68. data/ext/libdeflate/libdeflate/tools/checksum_benchmarks.sh +0 -76
  69. data/ext/libdeflate/libdeflate/tools/exec_tests.sh +0 -30
  70. data/ext/libdeflate/libdeflate/tools/gen_crc32_multipliers.c +0 -108
  71. data/ext/libdeflate/libdeflate/tools/gen_crc32_table.c +0 -100
  72. data/ext/libdeflate/libdeflate/tools/gzip_tests.sh +0 -412
  73. data/ext/libdeflate/libdeflate/tools/make-windows-releases +0 -21
  74. data/ext/libdeflate/libdeflate/tools/mips_build.sh +0 -9
  75. data/ext/libdeflate/libdeflate/tools/msc_test.bat +0 -3
  76. data/ext/libdeflate/libdeflate/tools/pgo_build.sh +0 -23
  77. data/ext/libdeflate/libdeflate/tools/produce_gzip_benchmark_table.sh +0 -37
  78. data/ext/libdeflate/libdeflate/tools/run_tests.sh +0 -305
  79. data/ext/libdeflate/libdeflate/tools/windows_build.sh +0 -10
@@ -1,281 +0,0 @@
1
- /*
2
- * adler32_impl.h
3
- *
4
- * Originally public domain; changes after 2016-09-07 are copyrighted.
5
- *
6
- * Copyright 2016 Eric Biggers
7
- *
8
- * Permission is hereby granted, free of charge, to any person
9
- * obtaining a copy of this software and associated documentation
10
- * files (the "Software"), to deal in the Software without
11
- * restriction, including without limitation the rights to use,
12
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
13
- * copies of the Software, and to permit persons to whom the
14
- * Software is furnished to do so, subject to the following
15
- * conditions:
16
- *
17
- * The above copyright notice and this permission notice shall be
18
- * included in all copies or substantial portions of the Software.
19
- *
20
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27
- * OTHER DEALINGS IN THE SOFTWARE.
28
- */
29
-
30
- /*
31
- * This file contains a template for vectorized Adler-32 implementations.
32
- *
33
- * The inner loop between reductions modulo 65521 of an unvectorized Adler-32
34
- * implementation looks something like this:
35
- *
36
- * do {
37
- * s1 += *p;
38
- * s2 += s1;
39
- * } while (++p != chunk_end);
40
- *
41
- * For vectorized calculation of s1, we only need to sum the input bytes. They
42
- * can be accumulated into multiple counters which are eventually summed
43
- * together.
44
- *
45
- * For vectorized calculation of s2, the basic idea is that for each iteration
46
- * that processes N bytes, we can perform the following vectorizable
47
- * calculation:
48
- *
49
- * s2 += N*byte_1 + (N-1)*byte_2 + (N-2)*byte_3 + ... + 1*byte_N
50
- *
51
- * Or, equivalently, we can sum the byte_1...byte_N for each iteration into N
52
- * separate counters, then do the multiplications by N...1 just once at the end
53
- * rather than once per iteration.
54
- *
55
- * Also, we must account for how previous bytes will affect s2 by doing the
56
- * following at the beginning of each iteration:
57
- *
58
- * s2 += s1 * N
59
- *
60
- * Furthermore, like s1, "s2" can actually be multiple counters which are
61
- * eventually summed together.
62
- */
63
-
64
- static u32 ATTRIBUTES
65
- FUNCNAME(u32 adler, const void *buffer, size_t size)
66
- {
67
- u32 s1 = adler & 0xFFFF;
68
- u32 s2 = adler >> 16;
69
- const u8 *p = buffer;
70
- const u8 * const end = p + size;
71
- const u8 *vend;
72
-
73
- /* Process a byte at a time until the required alignment is reached. */
74
- if (p != end && (uintptr_t)p % ALIGNMENT_REQUIRED) {
75
- do {
76
- s1 += *p++;
77
- s2 += s1;
78
- } while (p != end && (uintptr_t)p % ALIGNMENT_REQUIRED);
79
- s1 %= DIVISOR;
80
- s2 %= DIVISOR;
81
- }
82
-
83
- /*
84
- * Process "chunks" of bytes using vector instructions. Chunk sizes are
85
- * limited to MAX_BYTES_PER_CHUNK, which guarantees that s1 and s2 never
86
- * overflow before being reduced modulo DIVISOR. For vector processing,
87
- * chunk sizes are also made evenly divisible by BYTES_PER_ITERATION.
88
- */
89
- STATIC_ASSERT(BYTES_PER_ITERATION % ALIGNMENT_REQUIRED == 0);
90
- vend = end - ((size_t)(end - p) % BYTES_PER_ITERATION);
91
- while (p != vend) {
92
- size_t chunk_size;
93
- const u8 *chunk_end;
94
-
95
- chunk_size = MIN((size_t)(vend - p), MAX_BYTES_PER_CHUNK);
96
- #if TARGET == TARGET_SSE2
97
- /* SSE2: the 16-bit precision byte counters must not undergo
98
- * *signed* overflow, otherwise the signed multiplication at the
99
- * end will not behave as desired. */
100
- chunk_size = MIN(chunk_size, BYTES_PER_ITERATION * (0x7FFF / 0xFF));
101
- #elif TARGET == TARGET_NEON
102
- /* NEON: the 16-bit precision counters must not undergo
103
- * *unsigned* overflow. */
104
- chunk_size = MIN(chunk_size, BYTES_PER_ITERATION * (0xFFFF / 0xFF));
105
- #endif
106
- chunk_size -= chunk_size % BYTES_PER_ITERATION;
107
-
108
- chunk_end = p + chunk_size;
109
-
110
- s2 += s1 * chunk_size;
111
- {
112
- #if TARGET == TARGET_AVX2
113
- /* AVX2 implementation */
114
- const __m256i zeroes = _mm256_setzero_si256();
115
- const __v32qi multipliers = (__v32qi) { 32, 31, 30, 29, 28, 27, 26, 25,
116
- 24, 23, 22, 21, 20, 19, 18, 17,
117
- 16, 15, 14, 13, 12, 11, 10, 9,
118
- 8, 7, 6, 5, 4, 3, 2, 1 };
119
- const __v16hi ones = (__v16hi)_mm256_set1_epi16(1);
120
- __v8si v_s1 = (__v8si)zeroes;
121
- __v8si v_s1_sums = (__v8si)zeroes;
122
- __v8si v_s2 = (__v8si)zeroes;
123
- STATIC_ASSERT(ALIGNMENT_REQUIRED == 32 && BYTES_PER_ITERATION == 32);
124
- do {
125
- __m256i bytes = *(const __m256i *)p;
126
- __v16hi sums = (__v16hi)_mm256_maddubs_epi16(
127
- bytes, (__m256i)multipliers);
128
- v_s1_sums += v_s1;
129
- v_s1 += (__v8si)_mm256_sad_epu8(bytes, zeroes);
130
- v_s2 += (__v8si)_mm256_madd_epi16((__m256i)sums, (__m256i)ones);
131
- } while ((p += BYTES_PER_ITERATION) != chunk_end);
132
-
133
- v_s1 = (__v8si)_mm256_hadd_epi32((__m256i)v_s1, zeroes);
134
- v_s1 = (__v8si)_mm256_hadd_epi32((__m256i)v_s1, zeroes);
135
- s1 += v_s1[0] + v_s1[4];
136
-
137
- v_s2 += (__v8si)_mm256_slli_epi32((__m256i)v_s1_sums, 5);
138
- v_s2 = (__v8si)_mm256_hadd_epi32((__m256i)v_s2, zeroes);
139
- v_s2 = (__v8si)_mm256_hadd_epi32((__m256i)v_s2, zeroes);
140
- s2 += v_s2[0] + v_s2[4];
141
-
142
- #elif TARGET == TARGET_SSE2
143
- /* SSE2 implementation */
144
- const __m128i zeroes = _mm_setzero_si128();
145
-
146
- /* s1 counters: 32-bit, sum of bytes */
147
- __v4si v_s1 = (__v4si)zeroes;
148
-
149
- /* s2 counters: 32-bit, sum of s1 values */
150
- __v4si v_s2 = (__v4si)zeroes;
151
-
152
- /*
153
- * Thirty-two 16-bit counters for byte sums. Each accumulates
154
- * the bytes that eventually need to be multiplied by a number
155
- * 32...1 for addition into s2.
156
- */
157
- __v8hi v_byte_sums_a = (__v8hi)zeroes;
158
- __v8hi v_byte_sums_b = (__v8hi)zeroes;
159
- __v8hi v_byte_sums_c = (__v8hi)zeroes;
160
- __v8hi v_byte_sums_d = (__v8hi)zeroes;
161
-
162
- STATIC_ASSERT(ALIGNMENT_REQUIRED == 16 && BYTES_PER_ITERATION == 32);
163
- do {
164
- /* Load the next 32 bytes. */
165
- const __m128i bytes1 = *(const __m128i *)p;
166
- const __m128i bytes2 = *(const __m128i *)(p + 16);
167
-
168
- /*
169
- * Accumulate the previous s1 counters into the s2
170
- * counters. Logically, this really should be
171
- * v_s2 += v_s1 * BYTES_PER_ITERATION, but we can do the
172
- * multiplication (or left shift) later.
173
- */
174
- v_s2 += v_s1;
175
-
176
- /*
177
- * s1 update: use "Packed Sum of Absolute Differences"
178
- * to add the bytes horizontally with 8 bytes per sum.
179
- * Then add the sums to the s1 counters.
180
- */
181
- v_s1 += (__v4si)_mm_sad_epu8(bytes1, zeroes);
182
- v_s1 += (__v4si)_mm_sad_epu8(bytes2, zeroes);
183
-
184
- /*
185
- * Also accumulate the bytes into 32 separate counters
186
- * that have 16-bit precision.
187
- */
188
- v_byte_sums_a += (__v8hi)_mm_unpacklo_epi8(bytes1, zeroes);
189
- v_byte_sums_b += (__v8hi)_mm_unpackhi_epi8(bytes1, zeroes);
190
- v_byte_sums_c += (__v8hi)_mm_unpacklo_epi8(bytes2, zeroes);
191
- v_byte_sums_d += (__v8hi)_mm_unpackhi_epi8(bytes2, zeroes);
192
-
193
- } while ((p += BYTES_PER_ITERATION) != chunk_end);
194
-
195
- /* Finish calculating the s2 counters. */
196
- v_s2 = (__v4si)_mm_slli_epi32((__m128i)v_s2, 5);
197
- v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_a,
198
- (__m128i)(__v8hi){ 32, 31, 30, 29, 28, 27, 26, 25 });
199
- v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_b,
200
- (__m128i)(__v8hi){ 24, 23, 22, 21, 20, 19, 18, 17 });
201
- v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_c,
202
- (__m128i)(__v8hi){ 16, 15, 14, 13, 12, 11, 10, 9 });
203
- v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_d,
204
- (__m128i)(__v8hi){ 8, 7, 6, 5, 4, 3, 2, 1 });
205
-
206
- /* Now accumulate what we computed into the real s1 and s2. */
207
- v_s1 += (__v4si)_mm_shuffle_epi32((__m128i)v_s1, 0x31);
208
- v_s1 += (__v4si)_mm_shuffle_epi32((__m128i)v_s1, 0x02);
209
- s1 += _mm_cvtsi128_si32((__m128i)v_s1);
210
-
211
- v_s2 += (__v4si)_mm_shuffle_epi32((__m128i)v_s2, 0x31);
212
- v_s2 += (__v4si)_mm_shuffle_epi32((__m128i)v_s2, 0x02);
213
- s2 += _mm_cvtsi128_si32((__m128i)v_s2);
214
-
215
- #elif TARGET == TARGET_NEON
216
- /* ARM NEON (Advanced SIMD) implementation */
217
- uint32x4_t v_s1 = (uint32x4_t) { 0, 0, 0, 0 };
218
- uint32x4_t v_s2 = (uint32x4_t) { 0, 0, 0, 0 };
219
- uint16x8_t v_byte_sums_a = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
220
- uint16x8_t v_byte_sums_b = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
221
- uint16x8_t v_byte_sums_c = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
222
- uint16x8_t v_byte_sums_d = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
223
-
224
- STATIC_ASSERT(ALIGNMENT_REQUIRED == 16 && BYTES_PER_ITERATION == 32);
225
- do {
226
- const uint8x16_t bytes1 = *(const uint8x16_t *)p;
227
- const uint8x16_t bytes2 = *(const uint8x16_t *)(p + 16);
228
- uint16x8_t tmp;
229
-
230
- v_s2 += v_s1;
231
-
232
- tmp = vpaddlq_u8(bytes1);
233
- tmp = vpadalq_u8(tmp, bytes2);
234
- v_s1 = vpadalq_u16(v_s1, tmp);
235
-
236
- v_byte_sums_a = vaddw_u8(v_byte_sums_a, vget_low_u8(bytes1));
237
- v_byte_sums_b = vaddw_u8(v_byte_sums_b, vget_high_u8(bytes1));
238
- v_byte_sums_c = vaddw_u8(v_byte_sums_c, vget_low_u8(bytes2));
239
- v_byte_sums_d = vaddw_u8(v_byte_sums_d, vget_high_u8(bytes2));
240
-
241
- } while ((p += BYTES_PER_ITERATION) != chunk_end);
242
-
243
- v_s2 = vqshlq_n_u32(v_s2, 5);
244
- v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_a), (uint16x4_t) { 32, 31, 30, 29 });
245
- v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_a), (uint16x4_t) { 28, 27, 26, 25 });
246
- v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_b), (uint16x4_t) { 24, 23, 22, 21 });
247
- v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_b), (uint16x4_t) { 20, 19, 18, 17 });
248
- v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_c), (uint16x4_t) { 16, 15, 14, 13 });
249
- v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_c), (uint16x4_t) { 12, 11, 10, 9 });
250
- v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_byte_sums_d), (uint16x4_t) { 8, 7, 6, 5 });
251
- v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_d), (uint16x4_t) { 4, 3, 2, 1 });
252
-
253
- s1 += v_s1[0] + v_s1[1] + v_s1[2] + v_s1[3];
254
- s2 += v_s2[0] + v_s2[1] + v_s2[2] + v_s2[3];
255
- #else
256
- # error "BUG: unknown target"
257
- #endif
258
- }
259
-
260
- s1 %= DIVISOR;
261
- s2 %= DIVISOR;
262
- }
263
-
264
- /* Process any remaining bytes. */
265
- if (p != end) {
266
- do {
267
- s1 += *p++;
268
- s2 += s1;
269
- } while (p != end);
270
- s1 %= DIVISOR;
271
- s2 %= DIVISOR;
272
- }
273
-
274
- return (s2 << 16) | s1;
275
- }
276
-
277
- #undef FUNCNAME
278
- #undef TARGET
279
- #undef ALIGNMENT_REQUIRED
280
- #undef BYTES_PER_ITERATION
281
- #undef ATTRIBUTES
@@ -1,57 +0,0 @@
1
- /*
2
- * aligned_malloc.c - aligned memory allocation
3
- *
4
- * Originally public domain; changes after 2016-09-07 are copyrighted.
5
- *
6
- * Copyright 2016 Eric Biggers
7
- *
8
- * Permission is hereby granted, free of charge, to any person
9
- * obtaining a copy of this software and associated documentation
10
- * files (the "Software"), to deal in the Software without
11
- * restriction, including without limitation the rights to use,
12
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
13
- * copies of the Software, and to permit persons to whom the
14
- * Software is furnished to do so, subject to the following
15
- * conditions:
16
- *
17
- * The above copyright notice and this permission notice shall be
18
- * included in all copies or substantial portions of the Software.
19
- *
20
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27
- * OTHER DEALINGS IN THE SOFTWARE.
28
- */
29
-
30
- /*
31
- * This file provides portable aligned memory allocation functions that only
32
- * use malloc() and free(). This avoids portability problems with
33
- * posix_memalign(), aligned_alloc(), etc.
34
- */
35
-
36
- #include <stdlib.h>
37
-
38
- #include "aligned_malloc.h"
39
-
40
- void *
41
- aligned_malloc(size_t alignment, size_t size)
42
- {
43
- void *ptr = malloc(sizeof(void *) + alignment - 1 + size);
44
- if (ptr) {
45
- void *orig_ptr = ptr;
46
- ptr = (void *)ALIGN((uintptr_t)ptr + sizeof(void *), alignment);
47
- ((void **)ptr)[-1] = orig_ptr;
48
- }
49
- return ptr;
50
- }
51
-
52
- void
53
- aligned_free(void *ptr)
54
- {
55
- if (ptr)
56
- free(((void **)ptr)[-1]);
57
- }
@@ -1,13 +0,0 @@
1
- /*
2
- * aligned_malloc.h - aligned memory allocation
3
- */
4
-
5
- #ifndef LIB_ALIGNED_MALLOC_H
6
- #define LIB_ALIGNED_MALLOC_H
7
-
8
- #include "lib_common.h"
9
-
10
- extern void *aligned_malloc(size_t alignment, size_t size);
11
- extern void aligned_free(void *ptr);
12
-
13
- #endif /* LIB_ALIGNED_MALLOC_H */
@@ -1,357 +0,0 @@
1
- /*
2
- * bt_matchfinder.h - Lempel-Ziv matchfinding with a hash table of binary trees
3
- *
4
- * Originally public domain; changes after 2016-09-07 are copyrighted.
5
- *
6
- * Copyright 2016 Eric Biggers
7
- *
8
- * Permission is hereby granted, free of charge, to any person
9
- * obtaining a copy of this software and associated documentation
10
- * files (the "Software"), to deal in the Software without
11
- * restriction, including without limitation the rights to use,
12
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
13
- * copies of the Software, and to permit persons to whom the
14
- * Software is furnished to do so, subject to the following
15
- * conditions:
16
- *
17
- * The above copyright notice and this permission notice shall be
18
- * included in all copies or substantial portions of the Software.
19
- *
20
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27
- * OTHER DEALINGS IN THE SOFTWARE.
28
- *
29
- * ----------------------------------------------------------------------------
30
- *
31
- * This is a Binary Trees (bt) based matchfinder.
32
- *
33
- * The main data structure is a hash table where each hash bucket contains a
34
- * binary tree of sequences whose first 4 bytes share the same hash code. Each
35
- * sequence is identified by its starting position in the input buffer. Each
36
- * binary tree is always sorted such that each left child represents a sequence
37
- * lexicographically lesser than its parent and each right child represents a
38
- * sequence lexicographically greater than its parent.
39
- *
40
- * The algorithm processes the input buffer sequentially. At each byte
41
- * position, the hash code of the first 4 bytes of the sequence beginning at
42
- * that position (the sequence being matched against) is computed. This
43
- * identifies the hash bucket to use for that position. Then, a new binary tree
44
- * node is created to represent the current sequence. Then, in a single tree
45
- * traversal, the hash bucket's binary tree is searched for matches and is
46
- * re-rooted at the new node.
47
- *
48
- * Compared to the simpler algorithm that uses linked lists instead of binary
49
- * trees (see hc_matchfinder.h), the binary tree version gains more information
50
- * at each node visitation. Ideally, the binary tree version will examine only
51
- * 'log(n)' nodes to find the same matches that the linked list version will
52
- * find by examining 'n' nodes. In addition, the binary tree version can
53
- * examine fewer bytes at each node by taking advantage of the common prefixes
54
- * that result from the sort order, whereas the linked list version may have to
55
- * examine up to the full length of the match at each node.
56
- *
57
- * However, it is not always best to use the binary tree version. It requires
58
- * nearly twice as much memory as the linked list version, and it takes time to
59
- * keep the binary trees sorted, even at positions where the compressor does not
60
- * need matches. Generally, when doing fast compression on small buffers,
61
- * binary trees are the wrong approach. They are best suited for thorough
62
- * compression and/or large buffers.
63
- *
64
- * ----------------------------------------------------------------------------
65
- */
66
-
67
-
68
- #include "matchfinder_common.h"
69
-
70
- #define BT_MATCHFINDER_HASH3_ORDER 16
71
- #define BT_MATCHFINDER_HASH3_WAYS 2
72
- #define BT_MATCHFINDER_HASH4_ORDER 16
73
-
74
- #define BT_MATCHFINDER_TOTAL_HASH_LENGTH \
75
- ((1UL << BT_MATCHFINDER_HASH3_ORDER) * BT_MATCHFINDER_HASH3_WAYS + \
76
- (1UL << BT_MATCHFINDER_HASH4_ORDER))
77
-
78
- /* Representation of a match found by the bt_matchfinder */
79
- struct lz_match {
80
-
81
- /* The number of bytes matched. */
82
- u16 length;
83
-
84
- /* The offset back from the current position that was matched. */
85
- u16 offset;
86
- };
87
-
88
- struct bt_matchfinder {
89
-
90
- /* The hash table for finding length 3 matches */
91
- mf_pos_t hash3_tab[1UL << BT_MATCHFINDER_HASH3_ORDER][BT_MATCHFINDER_HASH3_WAYS];
92
-
93
- /* The hash table which contains the roots of the binary trees for
94
- * finding length 4+ matches */
95
- mf_pos_t hash4_tab[1UL << BT_MATCHFINDER_HASH4_ORDER];
96
-
97
- /* The child node references for the binary trees. The left and right
98
- * children of the node for the sequence with position 'pos' are
99
- * 'child_tab[pos * 2]' and 'child_tab[pos * 2 + 1]', respectively. */
100
- mf_pos_t child_tab[2UL * MATCHFINDER_WINDOW_SIZE];
101
-
102
- }
103
- #ifdef _aligned_attribute
104
- _aligned_attribute(MATCHFINDER_ALIGNMENT)
105
- #endif
106
- ;
107
-
108
- /* Prepare the matchfinder for a new input buffer. */
109
- static forceinline void
110
- bt_matchfinder_init(struct bt_matchfinder *mf)
111
- {
112
- matchfinder_init((mf_pos_t *)mf, BT_MATCHFINDER_TOTAL_HASH_LENGTH);
113
- }
114
-
115
- static forceinline void
116
- bt_matchfinder_slide_window(struct bt_matchfinder *mf)
117
- {
118
- matchfinder_rebase((mf_pos_t *)mf,
119
- sizeof(struct bt_matchfinder) / sizeof(mf_pos_t));
120
- }
121
-
122
- static forceinline mf_pos_t *
123
- bt_left_child(struct bt_matchfinder *mf, s32 node)
124
- {
125
- return &mf->child_tab[2 * (node & (MATCHFINDER_WINDOW_SIZE - 1)) + 0];
126
- }
127
-
128
- static forceinline mf_pos_t *
129
- bt_right_child(struct bt_matchfinder *mf, s32 node)
130
- {
131
- return &mf->child_tab[2 * (node & (MATCHFINDER_WINDOW_SIZE - 1)) + 1];
132
- }
133
-
134
- /* The minimum permissible value of 'max_len' for bt_matchfinder_get_matches()
135
- * and bt_matchfinder_skip_position(). There must be sufficiently many bytes
136
- * remaining to load a 32-bit integer from the *next* position. */
137
- #define BT_MATCHFINDER_REQUIRED_NBYTES 5
138
-
139
- /* Advance the binary tree matchfinder by one byte, optionally recording
140
- * matches. @record_matches should be a compile-time constant. */
141
- static forceinline struct lz_match *
142
- bt_matchfinder_advance_one_byte(struct bt_matchfinder * const restrict mf,
143
- const u8 * const restrict in_base,
144
- const ptrdiff_t cur_pos,
145
- const u32 max_len,
146
- const u32 nice_len,
147
- const u32 max_search_depth,
148
- u32 * const restrict next_hashes,
149
- u32 * const restrict best_len_ret,
150
- struct lz_match * restrict lz_matchptr,
151
- const bool record_matches)
152
- {
153
- const u8 *in_next = in_base + cur_pos;
154
- u32 depth_remaining = max_search_depth;
155
- const s32 cutoff = cur_pos - MATCHFINDER_WINDOW_SIZE;
156
- u32 next_seq4;
157
- u32 next_seq3;
158
- u32 hash3;
159
- u32 hash4;
160
- s32 cur_node;
161
- #if BT_MATCHFINDER_HASH3_WAYS >= 2
162
- s32 cur_node_2;
163
- #endif
164
- const u8 *matchptr;
165
- mf_pos_t *pending_lt_ptr, *pending_gt_ptr;
166
- u32 best_lt_len, best_gt_len;
167
- u32 len;
168
- u32 best_len = 3;
169
-
170
- STATIC_ASSERT(BT_MATCHFINDER_HASH3_WAYS >= 1 &&
171
- BT_MATCHFINDER_HASH3_WAYS <= 2);
172
-
173
- next_seq4 = load_u32_unaligned(in_next + 1);
174
- next_seq3 = loaded_u32_to_u24(next_seq4);
175
-
176
- hash3 = next_hashes[0];
177
- hash4 = next_hashes[1];
178
-
179
- next_hashes[0] = lz_hash(next_seq3, BT_MATCHFINDER_HASH3_ORDER);
180
- next_hashes[1] = lz_hash(next_seq4, BT_MATCHFINDER_HASH4_ORDER);
181
- prefetchw(&mf->hash3_tab[next_hashes[0]]);
182
- prefetchw(&mf->hash4_tab[next_hashes[1]]);
183
-
184
- cur_node = mf->hash3_tab[hash3][0];
185
- mf->hash3_tab[hash3][0] = cur_pos;
186
- #if BT_MATCHFINDER_HASH3_WAYS >= 2
187
- cur_node_2 = mf->hash3_tab[hash3][1];
188
- mf->hash3_tab[hash3][1] = cur_node;
189
- #endif
190
- if (record_matches && cur_node > cutoff) {
191
- u32 seq3 = load_u24_unaligned(in_next);
192
- if (seq3 == load_u24_unaligned(&in_base[cur_node])) {
193
- lz_matchptr->length = 3;
194
- lz_matchptr->offset = in_next - &in_base[cur_node];
195
- lz_matchptr++;
196
- }
197
- #if BT_MATCHFINDER_HASH3_WAYS >= 2
198
- else if (cur_node_2 > cutoff &&
199
- seq3 == load_u24_unaligned(&in_base[cur_node_2]))
200
- {
201
- lz_matchptr->length = 3;
202
- lz_matchptr->offset = in_next - &in_base[cur_node_2];
203
- lz_matchptr++;
204
- }
205
- #endif
206
- }
207
-
208
- cur_node = mf->hash4_tab[hash4];
209
- mf->hash4_tab[hash4] = cur_pos;
210
-
211
- pending_lt_ptr = bt_left_child(mf, cur_pos);
212
- pending_gt_ptr = bt_right_child(mf, cur_pos);
213
-
214
- if (cur_node <= cutoff) {
215
- *pending_lt_ptr = MATCHFINDER_INITVAL;
216
- *pending_gt_ptr = MATCHFINDER_INITVAL;
217
- *best_len_ret = best_len;
218
- return lz_matchptr;
219
- }
220
-
221
- best_lt_len = 0;
222
- best_gt_len = 0;
223
- len = 0;
224
-
225
- for (;;) {
226
- matchptr = &in_base[cur_node];
227
-
228
- if (matchptr[len] == in_next[len]) {
229
- len = lz_extend(in_next, matchptr, len + 1, max_len);
230
- if (!record_matches || len > best_len) {
231
- if (record_matches) {
232
- best_len = len;
233
- lz_matchptr->length = len;
234
- lz_matchptr->offset = in_next - matchptr;
235
- lz_matchptr++;
236
- }
237
- if (len >= nice_len) {
238
- *pending_lt_ptr = *bt_left_child(mf, cur_node);
239
- *pending_gt_ptr = *bt_right_child(mf, cur_node);
240
- *best_len_ret = best_len;
241
- return lz_matchptr;
242
- }
243
- }
244
- }
245
-
246
- if (matchptr[len] < in_next[len]) {
247
- *pending_lt_ptr = cur_node;
248
- pending_lt_ptr = bt_right_child(mf, cur_node);
249
- cur_node = *pending_lt_ptr;
250
- best_lt_len = len;
251
- if (best_gt_len < len)
252
- len = best_gt_len;
253
- } else {
254
- *pending_gt_ptr = cur_node;
255
- pending_gt_ptr = bt_left_child(mf, cur_node);
256
- cur_node = *pending_gt_ptr;
257
- best_gt_len = len;
258
- if (best_lt_len < len)
259
- len = best_lt_len;
260
- }
261
-
262
- if (cur_node <= cutoff || !--depth_remaining) {
263
- *pending_lt_ptr = MATCHFINDER_INITVAL;
264
- *pending_gt_ptr = MATCHFINDER_INITVAL;
265
- *best_len_ret = best_len;
266
- return lz_matchptr;
267
- }
268
- }
269
- }
270
-
271
- /*
272
- * Retrieve a list of matches with the current position.
273
- *
274
- * @mf
275
- * The matchfinder structure.
276
- * @in_base
277
- * Pointer to the next byte in the input buffer to process _at the last
278
- * time bt_matchfinder_init() or bt_matchfinder_slide_window() was called_.
279
- * @cur_pos
280
- * The current position in the input buffer relative to @in_base (the
281
- * position of the sequence being matched against).
282
- * @max_len
283
- * The maximum permissible match length at this position. Must be >=
284
- * BT_MATCHFINDER_REQUIRED_NBYTES.
285
- * @nice_len
286
- * Stop searching if a match of at least this length is found.
287
- * Must be <= @max_len.
288
- * @max_search_depth
289
- * Limit on the number of potential matches to consider. Must be >= 1.
290
- * @next_hashes
291
- * The precomputed hash codes for the sequence beginning at @in_base + @cur_pos.
292
- * These will be used and then updated with the precomputed hash codes for
293
- * the sequence beginning at @in_base + @cur_pos + 1.
294
- * @best_len_ret
295
- * If a match of length >= 4 was found, then the length of the longest such
296
- * match is written here; otherwise 3 is written here. (Note: this is
297
- * redundant with the 'struct lz_match' array, but this is easier for the
298
- * compiler to optimize when inlined and the caller immediately does a
299
- * check against 'best_len'.)
300
- * @lz_matchptr
301
- * An array in which this function will record the matches. The recorded
302
- * matches will be sorted by strictly increasing length and (non-strictly)
303
- * increasing offset. The maximum number of matches that may be found is
304
- * 'nice_len - 2'.
305
- *
306
- * The return value is a pointer to the next available slot in the @lz_matchptr
307
- * array. (If no matches were found, this will be the same as @lz_matchptr.)
308
- */
309
- static forceinline struct lz_match *
310
- bt_matchfinder_get_matches(struct bt_matchfinder *mf,
311
- const u8 *in_base,
312
- ptrdiff_t cur_pos,
313
- u32 max_len,
314
- u32 nice_len,
315
- u32 max_search_depth,
316
- u32 next_hashes[2],
317
- u32 *best_len_ret,
318
- struct lz_match *lz_matchptr)
319
- {
320
- return bt_matchfinder_advance_one_byte(mf,
321
- in_base,
322
- cur_pos,
323
- max_len,
324
- nice_len,
325
- max_search_depth,
326
- next_hashes,
327
- best_len_ret,
328
- lz_matchptr,
329
- true);
330
- }
331
-
332
- /*
333
- * Advance the matchfinder, but don't record any matches.
334
- *
335
- * This is very similar to bt_matchfinder_get_matches() because both functions
336
- * must do hashing and tree re-rooting.
337
- */
338
- static forceinline void
339
- bt_matchfinder_skip_position(struct bt_matchfinder *mf,
340
- const u8 *in_base,
341
- ptrdiff_t cur_pos,
342
- u32 nice_len,
343
- u32 max_search_depth,
344
- u32 next_hashes[2])
345
- {
346
- u32 best_len;
347
- bt_matchfinder_advance_one_byte(mf,
348
- in_base,
349
- cur_pos,
350
- nice_len,
351
- nice_len,
352
- max_search_depth,
353
- next_hashes,
354
- &best_len,
355
- NULL,
356
- false);
357
- }