libdeflate 0.1.0

Files changed (89)
  1. checksums.yaml +7 -0
  2. data/.gitignore +17 -0
  3. data/.gitmodules +3 -0
  4. data/.rspec +2 -0
  5. data/.rubocop.yml +1 -0
  6. data/.rubocop_todo.yml +9 -0
  7. data/.travis.yml +5 -0
  8. data/Gemfile +4 -0
  9. data/LICENSE.txt +21 -0
  10. data/README.md +52 -0
  11. data/Rakefile +15 -0
  12. data/bin/console +14 -0
  13. data/bin/setup +8 -0
  14. data/ext/libdeflate/extconf.rb +14 -0
  15. data/ext/libdeflate/libdeflate/.gitignore +19 -0
  16. data/ext/libdeflate/libdeflate/COPYING +21 -0
  17. data/ext/libdeflate/libdeflate/Makefile +231 -0
  18. data/ext/libdeflate/libdeflate/Makefile.msc +64 -0
  19. data/ext/libdeflate/libdeflate/NEWS +57 -0
  20. data/ext/libdeflate/libdeflate/README.md +170 -0
  21. data/ext/libdeflate/libdeflate/common/common_defs.h +351 -0
  22. data/ext/libdeflate/libdeflate/common/compiler_gcc.h +134 -0
  23. data/ext/libdeflate/libdeflate/common/compiler_msc.h +95 -0
  24. data/ext/libdeflate/libdeflate/lib/adler32.c +213 -0
  25. data/ext/libdeflate/libdeflate/lib/adler32_impl.h +281 -0
  26. data/ext/libdeflate/libdeflate/lib/aligned_malloc.c +57 -0
  27. data/ext/libdeflate/libdeflate/lib/aligned_malloc.h +13 -0
  28. data/ext/libdeflate/libdeflate/lib/bt_matchfinder.h +357 -0
  29. data/ext/libdeflate/libdeflate/lib/crc32.c +368 -0
  30. data/ext/libdeflate/libdeflate/lib/crc32_impl.h +286 -0
  31. data/ext/libdeflate/libdeflate/lib/crc32_table.h +526 -0
  32. data/ext/libdeflate/libdeflate/lib/decompress_impl.h +404 -0
  33. data/ext/libdeflate/libdeflate/lib/deflate_compress.c +2817 -0
  34. data/ext/libdeflate/libdeflate/lib/deflate_compress.h +14 -0
  35. data/ext/libdeflate/libdeflate/lib/deflate_constants.h +66 -0
  36. data/ext/libdeflate/libdeflate/lib/deflate_decompress.c +889 -0
  37. data/ext/libdeflate/libdeflate/lib/gzip_compress.c +95 -0
  38. data/ext/libdeflate/libdeflate/lib/gzip_constants.h +45 -0
  39. data/ext/libdeflate/libdeflate/lib/gzip_decompress.c +130 -0
  40. data/ext/libdeflate/libdeflate/lib/hc_matchfinder.h +405 -0
  41. data/ext/libdeflate/libdeflate/lib/lib_common.h +35 -0
  42. data/ext/libdeflate/libdeflate/lib/matchfinder_avx2.h +53 -0
  43. data/ext/libdeflate/libdeflate/lib/matchfinder_common.h +205 -0
  44. data/ext/libdeflate/libdeflate/lib/matchfinder_neon.h +61 -0
  45. data/ext/libdeflate/libdeflate/lib/matchfinder_sse2.h +53 -0
  46. data/ext/libdeflate/libdeflate/lib/unaligned.h +202 -0
  47. data/ext/libdeflate/libdeflate/lib/x86_cpu_features.c +169 -0
  48. data/ext/libdeflate/libdeflate/lib/x86_cpu_features.h +48 -0
  49. data/ext/libdeflate/libdeflate/lib/zlib_compress.c +87 -0
  50. data/ext/libdeflate/libdeflate/lib/zlib_constants.h +21 -0
  51. data/ext/libdeflate/libdeflate/lib/zlib_decompress.c +91 -0
  52. data/ext/libdeflate/libdeflate/libdeflate.h +274 -0
  53. data/ext/libdeflate/libdeflate/programs/benchmark.c +558 -0
  54. data/ext/libdeflate/libdeflate/programs/checksum.c +197 -0
  55. data/ext/libdeflate/libdeflate/programs/detect.sh +62 -0
  56. data/ext/libdeflate/libdeflate/programs/gzip.c +603 -0
  57. data/ext/libdeflate/libdeflate/programs/prog_util.c +530 -0
  58. data/ext/libdeflate/libdeflate/programs/prog_util.h +162 -0
  59. data/ext/libdeflate/libdeflate/programs/test_checksums.c +135 -0
  60. data/ext/libdeflate/libdeflate/programs/tgetopt.c +118 -0
  61. data/ext/libdeflate/libdeflate/tools/afl-fuzz/Makefile +12 -0
  62. data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_compress/fuzz.c +40 -0
  63. data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_compress/inputs/0 +0 -0
  64. data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_decompress/fuzz.c +28 -0
  65. data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_decompress/inputs/0 +3 -0
  66. data/ext/libdeflate/libdeflate/tools/afl-fuzz/gzip_decompress/fuzz.c +28 -0
  67. data/ext/libdeflate/libdeflate/tools/afl-fuzz/gzip_decompress/inputs/0 +0 -0
  68. data/ext/libdeflate/libdeflate/tools/afl-fuzz/prepare_for_fuzz.sh +14 -0
  69. data/ext/libdeflate/libdeflate/tools/afl-fuzz/zlib_decompress/fuzz.c +28 -0
  70. data/ext/libdeflate/libdeflate/tools/afl-fuzz/zlib_decompress/inputs/0 +3 -0
  71. data/ext/libdeflate/libdeflate/tools/android_build.sh +104 -0
  72. data/ext/libdeflate/libdeflate/tools/checksum_benchmarks.sh +76 -0
  73. data/ext/libdeflate/libdeflate/tools/exec_tests.sh +30 -0
  74. data/ext/libdeflate/libdeflate/tools/gen_crc32_multipliers.c +108 -0
  75. data/ext/libdeflate/libdeflate/tools/gen_crc32_table.c +100 -0
  76. data/ext/libdeflate/libdeflate/tools/gzip_tests.sh +412 -0
  77. data/ext/libdeflate/libdeflate/tools/make-windows-releases +21 -0
  78. data/ext/libdeflate/libdeflate/tools/mips_build.sh +9 -0
  79. data/ext/libdeflate/libdeflate/tools/msc_test.bat +3 -0
  80. data/ext/libdeflate/libdeflate/tools/pgo_build.sh +23 -0
  81. data/ext/libdeflate/libdeflate/tools/produce_gzip_benchmark_table.sh +37 -0
  82. data/ext/libdeflate/libdeflate/tools/run_tests.sh +305 -0
  83. data/ext/libdeflate/libdeflate/tools/windows_build.sh +10 -0
  84. data/ext/libdeflate/libdeflate_ext.c +389 -0
  85. data/ext/libdeflate/libdeflate_ext.h +8 -0
  86. data/lib/libdeflate.rb +2 -0
  87. data/lib/libdeflate/version.rb +3 -0
  88. data/libdeflate.gemspec +33 -0
  89. metadata +230 -0
data/ext/libdeflate/libdeflate/lib/aligned_malloc.c
@@ -0,0 +1,57 @@
+ /*
+  * aligned_malloc.c - aligned memory allocation
+  *
+  * Originally public domain; changes after 2016-09-07 are copyrighted.
+  *
+  * Copyright 2016 Eric Biggers
+  *
+  * Permission is hereby granted, free of charge, to any person
+  * obtaining a copy of this software and associated documentation
+  * files (the "Software"), to deal in the Software without
+  * restriction, including without limitation the rights to use,
+  * copy, modify, merge, publish, distribute, sublicense, and/or sell
+  * copies of the Software, and to permit persons to whom the
+  * Software is furnished to do so, subject to the following
+  * conditions:
+  *
+  * The above copyright notice and this permission notice shall be
+  * included in all copies or substantial portions of the Software.
+  *
+  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+  * OTHER DEALINGS IN THE SOFTWARE.
+  */
+
+ /*
+  * This file provides portable aligned memory allocation functions that only
+  * use malloc() and free(). This avoids portability problems with
+  * posix_memalign(), aligned_alloc(), etc.
+  */
+
+ #include <stdlib.h>
+
+ #include "aligned_malloc.h"
+
+ void *
+ aligned_malloc(size_t alignment, size_t size)
+ {
+         void *ptr = malloc(sizeof(void *) + alignment - 1 + size);
+         if (ptr) {
+                 void *orig_ptr = ptr;
+                 ptr = (void *)ALIGN((uintptr_t)ptr + sizeof(void *), alignment);
+                 ((void **)ptr)[-1] = orig_ptr;
+         }
+         return ptr;
+ }
+
+ void
+ aligned_free(void *ptr)
+ {
+         if (ptr)
+                 free(((void **)ptr)[-1]);
+ }
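
The scheme above over-allocates by sizeof(void *) + alignment - 1 bytes, rounds the block pointer up to the requested alignment, and stashes the original malloc() pointer in the word just below the returned block so aligned_free() can recover it. Below is a minimal standalone sketch of the same technique; ALIGN_UP is a stand-in for libdeflate's ALIGN macro from lib_common.h, which is assumed to round up to a power-of-two boundary, and the demo_* names are ours, not the library's:

/* aligned_demo.c - standalone sketch of the over-allocate-and-stash scheme */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Assumed semantics of libdeflate's ALIGN: round 'p' up to the next
 * multiple of the power-of-two 'a'. */
#define ALIGN_UP(p, a) (((p) + (a) - 1) & ~((uintptr_t)(a) - 1))

static void *demo_aligned_malloc(size_t alignment, size_t size)
{
        /* Room for the saved pointer plus worst-case alignment slack. */
        void *ptr = malloc(sizeof(void *) + alignment - 1 + size);
        if (ptr) {
                void *orig_ptr = ptr;
                ptr = (void *)ALIGN_UP((uintptr_t)ptr + sizeof(void *), alignment);
                ((void **)ptr)[-1] = orig_ptr;  /* stash for the free path */
        }
        return ptr;
}

static void demo_aligned_free(void *ptr)
{
        if (ptr)
                free(((void **)ptr)[-1]);       /* recover the original pointer */
}

int main(void)
{
        void *buf = demo_aligned_malloc(64, 1000);
        assert(((uintptr_t)buf & 63) == 0);     /* 64-byte aligned */
        printf("aligned block at %p\n", buf);
        demo_aligned_free(buf);
        return 0;
}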
data/ext/libdeflate/libdeflate/lib/aligned_malloc.h
@@ -0,0 +1,13 @@
+ /*
+  * aligned_malloc.h - aligned memory allocation
+  */
+
+ #ifndef LIB_ALIGNED_MALLOC_H
+ #define LIB_ALIGNED_MALLOC_H
+
+ #include "lib_common.h"
+
+ extern void *aligned_malloc(size_t alignment, size_t size);
+ extern void aligned_free(void *ptr);
+
+ #endif /* LIB_ALIGNED_MALLOC_H */
data/ext/libdeflate/libdeflate/lib/bt_matchfinder.h
@@ -0,0 +1,357 @@
+ /*
+  * bt_matchfinder.h - Lempel-Ziv matchfinding with a hash table of binary trees
+  *
+  * Originally public domain; changes after 2016-09-07 are copyrighted.
+  *
+  * Copyright 2016 Eric Biggers
+  *
+  * Permission is hereby granted, free of charge, to any person
+  * obtaining a copy of this software and associated documentation
+  * files (the "Software"), to deal in the Software without
+  * restriction, including without limitation the rights to use,
+  * copy, modify, merge, publish, distribute, sublicense, and/or sell
+  * copies of the Software, and to permit persons to whom the
+  * Software is furnished to do so, subject to the following
+  * conditions:
+  *
+  * The above copyright notice and this permission notice shall be
+  * included in all copies or substantial portions of the Software.
+  *
+  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+  * OTHER DEALINGS IN THE SOFTWARE.
+  *
+  * ----------------------------------------------------------------------------
+  *
+  * This is a Binary Trees (bt) based matchfinder.
+  *
+  * The main data structure is a hash table where each hash bucket contains a
+  * binary tree of sequences whose first 4 bytes share the same hash code. Each
+  * sequence is identified by its starting position in the input buffer. Each
+  * binary tree is always sorted such that each left child represents a sequence
+  * lexicographically lesser than its parent and each right child represents a
+  * sequence lexicographically greater than its parent.
+  *
+  * The algorithm processes the input buffer sequentially. At each byte
+  * position, the hash code of the first 4 bytes of the sequence beginning at
+  * that position (the sequence being matched against) is computed. This
+  * identifies the hash bucket to use for that position. Then, a new binary tree
+  * node is created to represent the current sequence. Then, in a single tree
+  * traversal, the hash bucket's binary tree is searched for matches and is
+  * re-rooted at the new node.
+  *
+  * Compared to the simpler algorithm that uses linked lists instead of binary
+  * trees (see hc_matchfinder.h), the binary tree version gains more information
+  * at each node visitation. Ideally, the binary tree version will examine only
+  * 'log(n)' nodes to find the same matches that the linked list version will
+  * find by examining 'n' nodes. In addition, the binary tree version can
+  * examine fewer bytes at each node by taking advantage of the common prefixes
+  * that result from the sort order, whereas the linked list version may have to
+  * examine up to the full length of the match at each node.
+  *
+  * However, it is not always best to use the binary tree version. It requires
+  * nearly twice as much memory as the linked list version, and it takes time to
+  * keep the binary trees sorted, even at positions where the compressor does not
+  * need matches. Generally, when doing fast compression on small buffers,
+  * binary trees are the wrong approach. They are best suited for thorough
+  * compression and/or large buffers.
+  *
+  * ----------------------------------------------------------------------------
+  */
+
+
+ #include "matchfinder_common.h"
+
+ #define BT_MATCHFINDER_HASH3_ORDER 16
+ #define BT_MATCHFINDER_HASH3_WAYS 2
+ #define BT_MATCHFINDER_HASH4_ORDER 16
+
+ #define BT_MATCHFINDER_TOTAL_HASH_LENGTH \
+         ((1UL << BT_MATCHFINDER_HASH3_ORDER) * BT_MATCHFINDER_HASH3_WAYS + \
+          (1UL << BT_MATCHFINDER_HASH4_ORDER))
+
+ /* Representation of a match found by the bt_matchfinder */
+ struct lz_match {
+
+         /* The number of bytes matched. */
+         u16 length;
+
+         /* The offset back from the current position that was matched. */
+         u16 offset;
+ };
+
+ struct bt_matchfinder {
+
+         /* The hash table for finding length 3 matches */
+         mf_pos_t hash3_tab[1UL << BT_MATCHFINDER_HASH3_ORDER][BT_MATCHFINDER_HASH3_WAYS];
+
+         /* The hash table which contains the roots of the binary trees for
+          * finding length 4+ matches */
+         mf_pos_t hash4_tab[1UL << BT_MATCHFINDER_HASH4_ORDER];
+
+         /* The child node references for the binary trees. The left and right
+          * children of the node for the sequence with position 'pos' are
+          * 'child_tab[pos * 2]' and 'child_tab[pos * 2 + 1]', respectively. */
+         mf_pos_t child_tab[2UL * MATCHFINDER_WINDOW_SIZE];
+
+ }
+ #ifdef _aligned_attribute
+ _aligned_attribute(MATCHFINDER_ALIGNMENT)
+ #endif
+ ;
+
+ /* Prepare the matchfinder for a new input buffer. */
+ static forceinline void
+ bt_matchfinder_init(struct bt_matchfinder *mf)
+ {
+         matchfinder_init((mf_pos_t *)mf, BT_MATCHFINDER_TOTAL_HASH_LENGTH);
+ }
+
+ static forceinline void
+ bt_matchfinder_slide_window(struct bt_matchfinder *mf)
+ {
+         matchfinder_rebase((mf_pos_t *)mf,
+                            sizeof(struct bt_matchfinder) / sizeof(mf_pos_t));
+ }
+
+ static forceinline mf_pos_t *
+ bt_left_child(struct bt_matchfinder *mf, s32 node)
+ {
+         return &mf->child_tab[2 * (node & (MATCHFINDER_WINDOW_SIZE - 1)) + 0];
+ }
+
+ static forceinline mf_pos_t *
+ bt_right_child(struct bt_matchfinder *mf, s32 node)
+ {
+         return &mf->child_tab[2 * (node & (MATCHFINDER_WINDOW_SIZE - 1)) + 1];
+ }
+
+ /* The minimum permissible value of 'max_len' for bt_matchfinder_get_matches()
+  * and bt_matchfinder_skip_position(). There must be sufficiently many bytes
+  * remaining to load a 32-bit integer from the *next* position. */
+ #define BT_MATCHFINDER_REQUIRED_NBYTES 5
+
+ /* Advance the binary tree matchfinder by one byte, optionally recording
+  * matches. @record_matches should be a compile-time constant. */
+ static forceinline struct lz_match *
+ bt_matchfinder_advance_one_byte(struct bt_matchfinder * const restrict mf,
+                                 const u8 * const restrict in_base,
+                                 const ptrdiff_t cur_pos,
+                                 const u32 max_len,
+                                 const u32 nice_len,
+                                 const u32 max_search_depth,
+                                 u32 * const restrict next_hashes,
+                                 u32 * const restrict best_len_ret,
+                                 struct lz_match * restrict lz_matchptr,
+                                 const bool record_matches)
+ {
+         const u8 *in_next = in_base + cur_pos;
+         u32 depth_remaining = max_search_depth;
+         const s32 cutoff = cur_pos - MATCHFINDER_WINDOW_SIZE;
+         u32 next_seq4;
+         u32 next_seq3;
+         u32 hash3;
+         u32 hash4;
+         s32 cur_node;
+ #if BT_MATCHFINDER_HASH3_WAYS >= 2
+         s32 cur_node_2;
+ #endif
+         const u8 *matchptr;
+         mf_pos_t *pending_lt_ptr, *pending_gt_ptr;
+         u32 best_lt_len, best_gt_len;
+         u32 len;
+         u32 best_len = 3;
+
+         STATIC_ASSERT(BT_MATCHFINDER_HASH3_WAYS >= 1 &&
+                       BT_MATCHFINDER_HASH3_WAYS <= 2);
+
+         next_seq4 = load_u32_unaligned(in_next + 1);
+         next_seq3 = loaded_u32_to_u24(next_seq4);
+
+         hash3 = next_hashes[0];
+         hash4 = next_hashes[1];
+
+         next_hashes[0] = lz_hash(next_seq3, BT_MATCHFINDER_HASH3_ORDER);
+         next_hashes[1] = lz_hash(next_seq4, BT_MATCHFINDER_HASH4_ORDER);
+         prefetchw(&mf->hash3_tab[next_hashes[0]]);
+         prefetchw(&mf->hash4_tab[next_hashes[1]]);
+
+         cur_node = mf->hash3_tab[hash3][0];
+         mf->hash3_tab[hash3][0] = cur_pos;
+ #if BT_MATCHFINDER_HASH3_WAYS >= 2
+         cur_node_2 = mf->hash3_tab[hash3][1];
+         mf->hash3_tab[hash3][1] = cur_node;
+ #endif
+         if (record_matches && cur_node > cutoff) {
+                 u32 seq3 = load_u24_unaligned(in_next);
+                 if (seq3 == load_u24_unaligned(&in_base[cur_node])) {
+                         lz_matchptr->length = 3;
+                         lz_matchptr->offset = in_next - &in_base[cur_node];
+                         lz_matchptr++;
+                 }
+ #if BT_MATCHFINDER_HASH3_WAYS >= 2
+                 else if (cur_node_2 > cutoff &&
+                          seq3 == load_u24_unaligned(&in_base[cur_node_2]))
+                 {
+                         lz_matchptr->length = 3;
+                         lz_matchptr->offset = in_next - &in_base[cur_node_2];
+                         lz_matchptr++;
+                 }
+ #endif
+         }
+
+         cur_node = mf->hash4_tab[hash4];
+         mf->hash4_tab[hash4] = cur_pos;
+
+         pending_lt_ptr = bt_left_child(mf, cur_pos);
+         pending_gt_ptr = bt_right_child(mf, cur_pos);
+
+         if (cur_node <= cutoff) {
+                 *pending_lt_ptr = MATCHFINDER_INITVAL;
+                 *pending_gt_ptr = MATCHFINDER_INITVAL;
+                 *best_len_ret = best_len;
+                 return lz_matchptr;
+         }
+
+         best_lt_len = 0;
+         best_gt_len = 0;
+         len = 0;
+
+         for (;;) {
+                 matchptr = &in_base[cur_node];
+
+                 if (matchptr[len] == in_next[len]) {
+                         len = lz_extend(in_next, matchptr, len + 1, max_len);
+                         if (!record_matches || len > best_len) {
+                                 if (record_matches) {
+                                         best_len = len;
+                                         lz_matchptr->length = len;
+                                         lz_matchptr->offset = in_next - matchptr;
+                                         lz_matchptr++;
+                                 }
+                                 if (len >= nice_len) {
+                                         *pending_lt_ptr = *bt_left_child(mf, cur_node);
+                                         *pending_gt_ptr = *bt_right_child(mf, cur_node);
+                                         *best_len_ret = best_len;
+                                         return lz_matchptr;
+                                 }
+                         }
+                 }
+
+                 if (matchptr[len] < in_next[len]) {
+                         *pending_lt_ptr = cur_node;
+                         pending_lt_ptr = bt_right_child(mf, cur_node);
+                         cur_node = *pending_lt_ptr;
+                         best_lt_len = len;
+                         if (best_gt_len < len)
+                                 len = best_gt_len;
+                 } else {
+                         *pending_gt_ptr = cur_node;
+                         pending_gt_ptr = bt_left_child(mf, cur_node);
+                         cur_node = *pending_gt_ptr;
+                         best_gt_len = len;
+                         if (best_lt_len < len)
+                                 len = best_lt_len;
+                 }
+
+                 if (cur_node <= cutoff || !--depth_remaining) {
+                         *pending_lt_ptr = MATCHFINDER_INITVAL;
+                         *pending_gt_ptr = MATCHFINDER_INITVAL;
+                         *best_len_ret = best_len;
+                         return lz_matchptr;
+                 }
+         }
+ }
+
+ /*
+  * Retrieve a list of matches at the current position.
+  *
+  * @mf
+  *      The matchfinder structure.
+  * @in_base
+  *      Pointer to the next byte in the input buffer to process _at the last
+  *      time bt_matchfinder_init() or bt_matchfinder_slide_window() was called_.
+  * @cur_pos
+  *      The current position in the input buffer relative to @in_base (the
+  *      position of the sequence being matched against).
+  * @max_len
+  *      The maximum permissible match length at this position. Must be >=
+  *      BT_MATCHFINDER_REQUIRED_NBYTES.
+  * @nice_len
+  *      Stop searching if a match of at least this length is found.
+  *      Must be <= @max_len.
+  * @max_search_depth
+  *      Limit on the number of potential matches to consider. Must be >= 1.
+  * @next_hashes
+  *      The precomputed hash codes for the sequence beginning at @in_next.
+  *      These will be used and then updated with the precomputed hash codes for
+  *      the sequence beginning at @in_next + 1.
+  * @best_len_ret
+  *      If a match of length >= 4 was found, then the length of the longest such
+  *      match is written here; otherwise 3 is written here. (Note: this is
+  *      redundant with the 'struct lz_match' array, but this is easier for the
+  *      compiler to optimize when inlined and the caller immediately does a
+  *      check against 'best_len'.)
+  * @lz_matchptr
+  *      An array in which this function will record the matches. The recorded
+  *      matches will be sorted by strictly increasing length and (non-strictly)
+  *      increasing offset. The maximum number of matches that may be found is
+  *      'nice_len - 2'.
+  *
+  * The return value is a pointer to the next available slot in the @lz_matchptr
+  * array. (If no matches were found, this will be the same as @lz_matchptr.)
+  */
+ static forceinline struct lz_match *
+ bt_matchfinder_get_matches(struct bt_matchfinder *mf,
+                            const u8 *in_base,
+                            ptrdiff_t cur_pos,
+                            u32 max_len,
+                            u32 nice_len,
+                            u32 max_search_depth,
+                            u32 next_hashes[2],
+                            u32 *best_len_ret,
+                            struct lz_match *lz_matchptr)
+ {
+         return bt_matchfinder_advance_one_byte(mf,
+                                                in_base,
+                                                cur_pos,
+                                                max_len,
+                                                nice_len,
+                                                max_search_depth,
+                                                next_hashes,
+                                                best_len_ret,
+                                                lz_matchptr,
+                                                true);
+ }
+
+ /*
+  * Advance the matchfinder, but don't record any matches.
+  *
+  * This is very similar to bt_matchfinder_get_matches() because both functions
+  * must do hashing and tree re-rooting.
+  */
+ static forceinline void
+ bt_matchfinder_skip_position(struct bt_matchfinder *mf,
+                              const u8 *in_base,
+                              ptrdiff_t cur_pos,
+                              u32 nice_len,
+                              u32 max_search_depth,
+                              u32 next_hashes[2])
+ {
+         u32 best_len;
+         bt_matchfinder_advance_one_byte(mf,
+                                         in_base,
+                                         cur_pos,
+                                         nice_len,
+                                         nice_len,
+                                         max_search_depth,
+                                         next_hashes,
+                                         &best_len,
+                                         NULL,
+                                         false);
+ }
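
The core of the file above is bt_matchfinder_advance_one_byte(): one downward traversal both collects matches and re-roots the tree at the current position, splitting the old tree into a "less than" subtree and a "greater than" subtree through the two pending child pointers. The following is a toy, single-bucket sketch of just that traversal (standalone C; the names insert_and_search, child, MAXPOS are ours, not libdeflate's). Hashing, the sliding window, the search-depth limit, and the best_lt_len/best_gt_len prefix-skipping are all omitted, so each node is re-compared from byte 0:

/* toy_bt.c - single-tree sketch of the re-rooting traversal used above */
#include <stdio.h>
#include <string.h>

#define MAXPOS 256
static const unsigned char *buf;
static size_t buflen;
static int child[MAXPOS][2];   /* [0] = left (lesser), [1] = right (greater) */
static int root = -1;

/* Insert the suffix starting at 'pos' as the new root. While walking down,
 * record the longest match seen and re-link the visited nodes into the new
 * root's subtrees, exactly as the pending_lt/pending_gt pointers do. */
static int insert_and_search(int pos, int *match_pos)
{
        int *pending_lt = &child[pos][0];
        int *pending_gt = &child[pos][1];
        int node = root, best_len = 0;

        root = pos;
        *match_pos = -1;
        while (node >= 0) {
                size_t len = 0, max = buflen - pos;
                while (len < max && buf[node + len] == buf[pos + len])
                        len++;
                if ((int)len > best_len) {
                        best_len = (int)len;
                        *match_pos = node;
                }
                if (len == max) {
                        /* Whole remaining suffix matched; cannot descend
                         * further, so adopt this node's children (the
                         * 'nice_len' early-out path in the real code). */
                        *pending_lt = child[node][0];
                        *pending_gt = child[node][1];
                        return best_len;
                }
                if (buf[node + len] < buf[pos + len]) {
                        *pending_lt = node;             /* sorts before 'pos' */
                        pending_lt = &child[node][1];
                        node = *pending_lt;
                } else {
                        *pending_gt = node;             /* sorts after 'pos' */
                        pending_gt = &child[node][0];
                        node = *pending_gt;
                }
        }
        *pending_lt = *pending_gt = -1;                 /* reached a leaf */
        return best_len;
}

int main(void)
{
        buf = (const unsigned char *)"abracadabra";
        buflen = strlen((const char *)buf);
        for (int pos = 0; pos < (int)buflen; pos++) {
                int match_pos;
                int len = insert_and_search(pos, &match_pos);
                if (len > 0)
                        printf("pos %2d: match of length %d at offset %d\n",
                               pos, len, pos - match_pos);
        }
        return 0;
}

On "abracadabra" this reports, among others, the length-4 match at offset 7 when position 7 ("abra") is inserted, illustrating how the search and the re-rooting happen in the same pass.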
data/ext/libdeflate/libdeflate/lib/crc32.c
@@ -0,0 +1,368 @@
+ /*
+  * crc32.c - CRC-32 checksum algorithm for the gzip format
+  *
+  * Originally public domain; changes after 2016-09-07 are copyrighted.
+  *
+  * Copyright 2016 Eric Biggers
+  *
+  * Permission is hereby granted, free of charge, to any person
+  * obtaining a copy of this software and associated documentation
+  * files (the "Software"), to deal in the Software without
+  * restriction, including without limitation the rights to use,
+  * copy, modify, merge, publish, distribute, sublicense, and/or sell
+  * copies of the Software, and to permit persons to whom the
+  * Software is furnished to do so, subject to the following
+  * conditions:
+  *
+  * The above copyright notice and this permission notice shall be
+  * included in all copies or substantial portions of the Software.
+  *
+  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+  * OTHER DEALINGS IN THE SOFTWARE.
+  */
+
+ /*
+  * High-level description of CRC
+  * =============================
+  *
+  * Consider a bit sequence 'bits[1...len]'. Interpret 'bits' as the "message"
+  * polynomial M(x) with coefficients in GF(2) (the field of integers modulo 2),
+  * where the coefficient of 'x^i' is 'bits[len - i]'. Then, compute:
+  *
+  *                     R(x) = M(x)*x^n mod G(x)
+  *
+  * where G(x) is a selected "generator" polynomial of degree 'n'. The remainder
+  * R(x) is a polynomial of max degree 'n - 1'. The CRC of 'bits' is R(x)
+  * interpreted as a bitstring of length 'n'.
+  *
+  * CRC used in gzip
+  * ================
+  *
+  * In the gzip format (RFC 1952):
+  *
+  * - The bitstring to checksum is formed from the bytes of the uncompressed
+  *   data by concatenating the bits from the bytes in order, proceeding
+  *   from the low-order bit to the high-order bit within each byte.
+  *
+  * - The generator polynomial G(x) is: x^32 + x^26 + x^23 + x^22 + x^16 +
+  *   x^12 + x^11 + x^10 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1.
+  *   Consequently, the CRC length is 32 bits ("CRC-32").
+  *
+  * - The highest order 32 coefficients of M(x)*x^n are inverted.
+  *
+  * - All 32 coefficients of R(x) are inverted.
+  *
+  * The two inversions cause added leading and trailing zero bits to affect the
+  * resulting CRC, whereas with a regular CRC such bits would have no effect on
+  * the CRC.
+  *
+  * Computation and optimizations
+  * =============================
+  *
+  * We can compute R(x) through "long division", maintaining only 32 bits of
+  * state at any given time. Multiplication by 'x' can be implemented as
+  * right-shifting by 1 (assuming the polynomial<=>bitstring mapping where the
+  * highest order bit represents the coefficient of x^0), and both addition and
+  * subtraction can be implemented as bitwise exclusive OR (since we are working
+  * in GF(2)). Here is an unoptimized implementation:
+  *
+  *     static u32 crc32_gzip(const u8 *buffer, size_t nbytes)
+  *     {
+  *             u32 remainder = 0;
+  *             const u32 divisor = 0xEDB88320;
+  *
+  *             for (size_t i = 0; i < nbytes * 8 + 32; i++) {
+  *                     int bit;
+  *                     u32 multiple;
+  *
+  *                     if (i < nbytes * 8)
+  *                             bit = (buffer[i / 8] >> (i % 8)) & 1;
+  *                     else
+  *                             bit = 0; // one of the 32 appended 0 bits
+  *
+  *                     if (i < 32) // the first 32 bits are inverted
+  *                             bit ^= 1;
+  *
+  *                     if (remainder & 1)
+  *                             multiple = divisor;
+  *                     else
+  *                             multiple = 0;
+  *
+  *                     remainder >>= 1;
+  *                     remainder |= (u32)bit << 31;
+  *                     remainder ^= multiple;
+  *             }
+  *
+  *             return ~remainder;
+  *     }
+  *
+  * In this implementation, the 32-bit integer 'remainder' maintains the
+  * remainder of the currently processed portion of the message (with 32 zero
+  * bits appended) when divided by the generator polynomial. 'remainder' is the
+  * representation of R(x), and 'divisor' is the representation of G(x) excluding
+  * the x^32 coefficient. For each bit to process, we multiply R(x) by 'x^1',
+  * then add 'x^0' if the new bit is a 1. If this causes R(x) to gain a nonzero
+  * x^32 term, then we subtract G(x) from R(x).
+  *
+  * We can speed this up by taking advantage of the fact that XOR is commutative
+  * and associative, so the order in which we combine the inputs into 'remainder'
+  * is unimportant. And since each message bit we add doesn't affect the choice
+  * of 'multiple' until 32 bits later, we need not actually add each message bit
+  * until that point:
+  *
+  *     static u32 crc32_gzip(const u8 *buffer, size_t nbytes)
+  *     {
+  *             u32 remainder = ~0;
+  *             const u32 divisor = 0xEDB88320;
+  *
+  *             for (size_t i = 0; i < nbytes * 8; i++) {
+  *                     int bit;
+  *                     u32 multiple;
+  *
+  *                     bit = (buffer[i / 8] >> (i % 8)) & 1;
+  *                     remainder ^= bit;
+  *                     if (remainder & 1)
+  *                             multiple = divisor;
+  *                     else
+  *                             multiple = 0;
+  *                     remainder >>= 1;
+  *                     remainder ^= multiple;
+  *             }
+  *
+  *             return ~remainder;
+  *     }
+  *
+  * With the above implementation we get the effect of 32 appended 0 bits for
+  * free; they never affect the choice of a divisor, nor would they change the
+  * value of 'remainder' if they were to be actually XOR'ed in. And by starting
+  * with a remainder of all 1 bits, we get the effect of complementing the first
+  * 32 message bits.
+  *
+  * The next optimization is to process the input in multi-bit units. Suppose
+  * that we insert the next 'n' message bits into the remainder. Then we get an
+  * intermediate remainder of length '32 + n' bits, and the CRC of the extra 'n'
+  * bits is the amount by which the low 32 bits of the remainder will change as a
+  * result of cancelling out those 'n' bits. Taking n=8 (one byte) and
+  * precomputing a table containing the CRC of each possible byte, we get
+  * crc32_slice1() defined below.
+  *
+  * As a further optimization, we could increase the multi-bit unit size to 16.
+  * However, that is inefficient because the table size explodes from 256 entries
+  * (1024 bytes) to 65536 entries (262144 bytes), which wastes memory and won't
+  * fit in L1 cache on typical processors.
+  *
+  * However, we can actually process 4 bytes at a time using 4 different tables
+  * with 256 entries each. Logically, we form a 64-bit intermediate remainder
+  * and cancel out the high 32 bits in 8-bit chunks. Bits 32-39 are cancelled
+  * out by the CRC of those bits, whereas bits 40-47 are cancelled out by the
+  * CRC of those bits with 8 zero bits appended, and so on. This method is
+  * implemented in crc32_slice4(), defined below.
+  *
+  * In crc32_slice8(), this method is extended to 8 bytes at a time. The
+  * intermediate remainder (which we never actually store explicitly) is 96 bits.
+  *
+  * On CPUs that support fast carryless multiplication, CRCs can be computed even
+  * more quickly via "folding". See crc32_pclmul() for an example.
+  */
+
+ #include "x86_cpu_features.h"
+
+ #include "libdeflate.h"
+
+ /* Select the implementations to compile in. */
+
+ #define NEED_GENERIC_IMPL 1 /* include generic impl unless overridden */
+ #define DEFAULT_IMPL crc32_slice8
+
+ /* Include the PCLMUL implementation? */
+ #define NEED_PCLMUL_IMPL 0
+ #if defined(__PCLMUL__) || \
+         (X86_CPU_FEATURES_ENABLED && COMPILER_SUPPORTS_PCLMUL_TARGET && \
+          COMPILER_SUPPORTS_TARGET_INTRINSICS)
+ # include <wmmintrin.h>
+ # undef NEED_PCLMUL_IMPL
+ # define NEED_PCLMUL_IMPL 1
+ # ifdef __PCLMUL__ /* compiling for PCLMUL, i.e. can we assume it's there? */
+ # undef NEED_GENERIC_IMPL
+ # define NEED_GENERIC_IMPL 0 /* generic impl not needed */
+ # undef DEFAULT_IMPL
+ # define DEFAULT_IMPL crc32_pclmul
+ # endif /* otherwise, we can build a PCLMUL version, but we won't know whether
+            we can use it until runtime */
+ #endif
+
+ /*
+  * Include the PCLMUL/AVX implementation? Although our PCLMUL-optimized CRC-32
+  * function doesn't use any AVX intrinsics specifically, it can benefit a lot
+  * from being compiled for an AVX target: on Skylake, ~16700 MB/s vs. ~10100
+  * MB/s. I expect this is related to the PCLMULQDQ instructions being assembled
+  * in the newer three-operand form rather than the older two-operand form.
+  *
+  * Note: this is only needed if __AVX__ is *not* defined, since otherwise the
+  * "regular" PCLMUL implementation would already be AVX enabled.
+  */
+ #define NEED_PCLMUL_AVX_IMPL 0
+ #if NEED_PCLMUL_IMPL && !defined(__AVX__) && \
+         X86_CPU_FEATURES_ENABLED && COMPILER_SUPPORTS_AVX_TARGET
+ # undef NEED_PCLMUL_AVX_IMPL
+ # define NEED_PCLMUL_AVX_IMPL 1
+ #endif
+
+ #define NUM_IMPLS (NEED_GENERIC_IMPL + NEED_PCLMUL_IMPL + NEED_PCLMUL_AVX_IMPL)
+
+ /* Define the CRC-32 table */
+ #if NEED_GENERIC_IMPL
+ # define CRC32_SLICE8
+ #else
+ # define CRC32_SLICE1 /* only need short table for unaligned ends */
+ #endif
+ #include "crc32_table.h"
+
+ static forceinline u32
+ crc32_update_byte(u32 remainder, u8 next_byte)
+ {
+         return (remainder >> 8) ^ crc32_table[(u8)remainder ^ next_byte];
+ }
+
+ #if defined(CRC32_SLICE1) || (NUM_IMPLS > NEED_GENERIC_IMPL)
+ static u32
+ crc32_slice1(u32 remainder, const u8 *buffer, size_t nbytes)
+ {
+         size_t i;
+
+         STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x100);
+
+         for (i = 0; i < nbytes; i++)
+                 remainder = crc32_update_byte(remainder, buffer[i]);
+         return remainder;
+ }
+ #endif
+
+ #ifdef CRC32_SLICE4
+ static u32
+ crc32_slice4(u32 remainder, const u8 *buffer, size_t nbytes)
+ {
+         const u8 *p = buffer;
+         const u8 *end = buffer + nbytes;
+         const u8 *end32;
+
+         STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x400);
+
+         for (; ((uintptr_t)p & 3) && p != end; p++)
+                 remainder = crc32_update_byte(remainder, *p);
+
+         end32 = p + ((end - p) & ~3);
+         for (; p != end32; p += 4) {
+                 u32 v = le32_bswap(*(const u32 *)p);
+                 remainder =
+                         crc32_table[0x300 + (u8)((remainder ^ v) >> 0)] ^
+                         crc32_table[0x200 + (u8)((remainder ^ v) >> 8)] ^
+                         crc32_table[0x100 + (u8)((remainder ^ v) >> 16)] ^
+                         crc32_table[0x000 + (u8)((remainder ^ v) >> 24)];
+         }
+
+         for (; p != end; p++)
+                 remainder = crc32_update_byte(remainder, *p);
+
+         return remainder;
+ }
+ #endif
+
+ #ifdef CRC32_SLICE8
+ static u32
+ crc32_slice8(u32 remainder, const u8 *buffer, size_t nbytes)
+ {
+         const u8 *p = buffer;
+         const u8 *end = buffer + nbytes;
+         const u8 *end64;
+
+         STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x800);
+
+         for (; ((uintptr_t)p & 7) && p != end; p++)
+                 remainder = crc32_update_byte(remainder, *p);
+
+         end64 = p + ((end - p) & ~7);
+         for (; p != end64; p += 8) {
+                 u32 v1 = le32_bswap(*(const u32 *)(p + 0));
+                 u32 v2 = le32_bswap(*(const u32 *)(p + 4));
+                 remainder =
+                         crc32_table[0x700 + (u8)((remainder ^ v1) >> 0)] ^
+                         crc32_table[0x600 + (u8)((remainder ^ v1) >> 8)] ^
+                         crc32_table[0x500 + (u8)((remainder ^ v1) >> 16)] ^
+                         crc32_table[0x400 + (u8)((remainder ^ v1) >> 24)] ^
+                         crc32_table[0x300 + (u8)(v2 >> 0)] ^
+                         crc32_table[0x200 + (u8)(v2 >> 8)] ^
+                         crc32_table[0x100 + (u8)(v2 >> 16)] ^
+                         crc32_table[0x000 + (u8)(v2 >> 24)];
+         }
+
+         for (; p != end; p++)
+                 remainder = crc32_update_byte(remainder, *p);
+
+         return remainder;
+ }
+ #endif
+
+ /* Define the PCLMUL implementation if needed. */
+ #if NEED_PCLMUL_IMPL
+ # define FUNCNAME crc32_pclmul
+ # define FUNCNAME_ALIGNED crc32_pclmul_aligned
+ # ifdef __PCLMUL__
+ # define ATTRIBUTES
+ # else
+ # define ATTRIBUTES __attribute__((target("pclmul")))
+ # endif
+ # include "crc32_impl.h"
+ #endif
+
+ /* Define the PCLMUL/AVX implementation if needed. */
+ #if NEED_PCLMUL_AVX_IMPL
+ # define FUNCNAME crc32_pclmul_avx
+ # define FUNCNAME_ALIGNED crc32_pclmul_avx_aligned
+ # define ATTRIBUTES __attribute__((target("pclmul,avx")))
+ # include "crc32_impl.h"
+ #endif
+
+ typedef u32 (*crc32_func_t)(u32, const u8 *, size_t);
+
+ /*
+  * If multiple implementations are available, then dispatch among them based on
+  * CPU features at runtime. Otherwise just call the single one directly.
+  */
+ #if NUM_IMPLS == 1
+ # define crc32_impl DEFAULT_IMPL
+ #else
+ static u32 dispatch(u32, const u8 *, size_t);
+
+ static crc32_func_t crc32_impl = dispatch;
+
+ static u32 dispatch(u32 remainder, const u8 *buffer, size_t nbytes)
+ {
+         crc32_func_t f = DEFAULT_IMPL;
+ #if NEED_PCLMUL_IMPL && !defined(__PCLMUL__)
+         if (x86_have_cpu_features(X86_CPU_FEATURE_PCLMULQDQ))
+                 f = crc32_pclmul;
+ #endif
+ #if NEED_PCLMUL_AVX_IMPL
+         if (x86_have_cpu_features(X86_CPU_FEATURE_PCLMULQDQ |
+                                   X86_CPU_FEATURE_AVX))
+                 f = crc32_pclmul_avx;
+ #endif
+         crc32_impl = f;
+         return crc32_impl(remainder, buffer, nbytes);
+ }
+ #endif /* NUM_IMPLS != 1 */
+
+ LIBDEFLATEAPI u32
+ libdeflate_crc32(u32 remainder, const void *buffer, size_t nbytes)
+ {
+         if (buffer == NULL) /* return initial value */
+                 return 0;
+         return ~crc32_impl(~remainder, buffer, nbytes);
+ }
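
To make the table-driven method concrete, here is a standalone sketch of the slice-by-1 loop together with the table generation it relies on. In libdeflate the equivalent table is pregenerated into crc32_table.h; build_table and the other names here are ours. The nine ASCII bytes "123456789" are the standard CRC check input, and gzip's CRC-32 of them is the well-known check value 0xCBF43926:

/* crc32_demo.c - standalone slice-by-1 CRC-32 (gzip polynomial) */
#include <stdint.h>
#include <stdio.h>

static uint32_t table[256];

/* Each table entry is the CRC of one byte value, i.e. the amount by which
 * that byte changes the low 32 bits of the remainder once divided out. */
static void build_table(void)
{
        for (uint32_t byte = 0; byte < 256; byte++) {
                uint32_t r = byte;
                for (int i = 0; i < 8; i++)     /* divide out 8 bits */
                        r = (r >> 1) ^ ((r & 1) ? 0xEDB88320 : 0);
                table[byte] = r;
        }
}

/* Same structure as crc32_slice1()/crc32_update_byte() above. */
static uint32_t crc32_slice1(uint32_t remainder, const uint8_t *p, size_t n)
{
        while (n--)
                remainder = (remainder >> 8) ^ table[(uint8_t)remainder ^ *p++];
        return remainder;
}

int main(void)
{
        const uint8_t msg[] = "123456789";

        build_table();
        /* Pre- and post-inversion, as in libdeflate_crc32(). */
        uint32_t crc = ~crc32_slice1(~0u, msg, 9);
        printf("%08x\n", crc);  /* prints cbf43926, the standard check value */
        return 0;
}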