libdeflate 0.1.0
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/.gitmodules +3 -0
- data/.rspec +2 -0
- data/.rubocop.yml +1 -0
- data/.rubocop_todo.yml +9 -0
- data/.travis.yml +5 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +52 -0
- data/Rakefile +15 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/ext/libdeflate/extconf.rb +14 -0
- data/ext/libdeflate/libdeflate/.gitignore +19 -0
- data/ext/libdeflate/libdeflate/COPYING +21 -0
- data/ext/libdeflate/libdeflate/Makefile +231 -0
- data/ext/libdeflate/libdeflate/Makefile.msc +64 -0
- data/ext/libdeflate/libdeflate/NEWS +57 -0
- data/ext/libdeflate/libdeflate/README.md +170 -0
- data/ext/libdeflate/libdeflate/common/common_defs.h +351 -0
- data/ext/libdeflate/libdeflate/common/compiler_gcc.h +134 -0
- data/ext/libdeflate/libdeflate/common/compiler_msc.h +95 -0
- data/ext/libdeflate/libdeflate/lib/adler32.c +213 -0
- data/ext/libdeflate/libdeflate/lib/adler32_impl.h +281 -0
- data/ext/libdeflate/libdeflate/lib/aligned_malloc.c +57 -0
- data/ext/libdeflate/libdeflate/lib/aligned_malloc.h +13 -0
- data/ext/libdeflate/libdeflate/lib/bt_matchfinder.h +357 -0
- data/ext/libdeflate/libdeflate/lib/crc32.c +368 -0
- data/ext/libdeflate/libdeflate/lib/crc32_impl.h +286 -0
- data/ext/libdeflate/libdeflate/lib/crc32_table.h +526 -0
- data/ext/libdeflate/libdeflate/lib/decompress_impl.h +404 -0
- data/ext/libdeflate/libdeflate/lib/deflate_compress.c +2817 -0
- data/ext/libdeflate/libdeflate/lib/deflate_compress.h +14 -0
- data/ext/libdeflate/libdeflate/lib/deflate_constants.h +66 -0
- data/ext/libdeflate/libdeflate/lib/deflate_decompress.c +889 -0
- data/ext/libdeflate/libdeflate/lib/gzip_compress.c +95 -0
- data/ext/libdeflate/libdeflate/lib/gzip_constants.h +45 -0
- data/ext/libdeflate/libdeflate/lib/gzip_decompress.c +130 -0
- data/ext/libdeflate/libdeflate/lib/hc_matchfinder.h +405 -0
- data/ext/libdeflate/libdeflate/lib/lib_common.h +35 -0
- data/ext/libdeflate/libdeflate/lib/matchfinder_avx2.h +53 -0
- data/ext/libdeflate/libdeflate/lib/matchfinder_common.h +205 -0
- data/ext/libdeflate/libdeflate/lib/matchfinder_neon.h +61 -0
- data/ext/libdeflate/libdeflate/lib/matchfinder_sse2.h +53 -0
- data/ext/libdeflate/libdeflate/lib/unaligned.h +202 -0
- data/ext/libdeflate/libdeflate/lib/x86_cpu_features.c +169 -0
- data/ext/libdeflate/libdeflate/lib/x86_cpu_features.h +48 -0
- data/ext/libdeflate/libdeflate/lib/zlib_compress.c +87 -0
- data/ext/libdeflate/libdeflate/lib/zlib_constants.h +21 -0
- data/ext/libdeflate/libdeflate/lib/zlib_decompress.c +91 -0
- data/ext/libdeflate/libdeflate/libdeflate.h +274 -0
- data/ext/libdeflate/libdeflate/programs/benchmark.c +558 -0
- data/ext/libdeflate/libdeflate/programs/checksum.c +197 -0
- data/ext/libdeflate/libdeflate/programs/detect.sh +62 -0
- data/ext/libdeflate/libdeflate/programs/gzip.c +603 -0
- data/ext/libdeflate/libdeflate/programs/prog_util.c +530 -0
- data/ext/libdeflate/libdeflate/programs/prog_util.h +162 -0
- data/ext/libdeflate/libdeflate/programs/test_checksums.c +135 -0
- data/ext/libdeflate/libdeflate/programs/tgetopt.c +118 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/Makefile +12 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_compress/fuzz.c +40 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_compress/inputs/0 +0 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_decompress/fuzz.c +28 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_decompress/inputs/0 +3 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/gzip_decompress/fuzz.c +28 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/gzip_decompress/inputs/0 +0 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/prepare_for_fuzz.sh +14 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/zlib_decompress/fuzz.c +28 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/zlib_decompress/inputs/0 +3 -0
- data/ext/libdeflate/libdeflate/tools/android_build.sh +104 -0
- data/ext/libdeflate/libdeflate/tools/checksum_benchmarks.sh +76 -0
- data/ext/libdeflate/libdeflate/tools/exec_tests.sh +30 -0
- data/ext/libdeflate/libdeflate/tools/gen_crc32_multipliers.c +108 -0
- data/ext/libdeflate/libdeflate/tools/gen_crc32_table.c +100 -0
- data/ext/libdeflate/libdeflate/tools/gzip_tests.sh +412 -0
- data/ext/libdeflate/libdeflate/tools/make-windows-releases +21 -0
- data/ext/libdeflate/libdeflate/tools/mips_build.sh +9 -0
- data/ext/libdeflate/libdeflate/tools/msc_test.bat +3 -0
- data/ext/libdeflate/libdeflate/tools/pgo_build.sh +23 -0
- data/ext/libdeflate/libdeflate/tools/produce_gzip_benchmark_table.sh +37 -0
- data/ext/libdeflate/libdeflate/tools/run_tests.sh +305 -0
- data/ext/libdeflate/libdeflate/tools/windows_build.sh +10 -0
- data/ext/libdeflate/libdeflate_ext.c +389 -0
- data/ext/libdeflate/libdeflate_ext.h +8 -0
- data/lib/libdeflate.rb +2 -0
- data/lib/libdeflate/version.rb +3 -0
- data/libdeflate.gemspec +33 -0
- metadata +230 -0
data/ext/libdeflate/libdeflate/lib/aligned_malloc.c
@@ -0,0 +1,57 @@
+/*
+ * aligned_malloc.c - aligned memory allocation
+ *
+ * Originally public domain; changes after 2016-09-07 are copyrighted.
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * This file provides portable aligned memory allocation functions that only
+ * use malloc() and free(). This avoids portability problems with
+ * posix_memalign(), aligned_alloc(), etc.
+ */
+
+#include <stdlib.h>
+
+#include "aligned_malloc.h"
+
+void *
+aligned_malloc(size_t alignment, size_t size)
+{
+        void *ptr = malloc(sizeof(void *) + alignment - 1 + size);
+        if (ptr) {
+                void *orig_ptr = ptr;
+                ptr = (void *)ALIGN((uintptr_t)ptr + sizeof(void *), alignment);
+                ((void **)ptr)[-1] = orig_ptr;
+        }
+        return ptr;
+}
+
+void
+aligned_free(void *ptr)
+{
+        if (ptr)
+                free(((void **)ptr)[-1]);
+}
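The allocator above over-allocates by sizeof(void *) + alignment - 1 bytes, rounds the result up to the requested alignment (via the library's ALIGN macro), and stashes the original malloc() pointer in the word just below the returned address so that aligned_free() can recover it. A minimal usage sketch under that contract; the 64-byte alignment here is just an illustrative cache-line figure, not something the library prescribes:

#include <stdint.h>
#include <stdio.h>

#include "aligned_malloc.h"     /* the header added in this gem */

int main(void)
{
        /* Request 1 KiB aligned to 64 bytes (e.g. a cache line). */
        unsigned char *buf = aligned_malloc(64, 1024);

        if (buf == NULL)
                return 1;

        /* The returned address is a multiple of the requested alignment. */
        printf("aligned: %d\n", (int)((uintptr_t)buf % 64 == 0));

        /* Must be paired with aligned_free(), never plain free(), because
         * the real malloc() pointer is hidden just before 'buf'. */
        aligned_free(buf);
        return 0;
}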
data/ext/libdeflate/libdeflate/lib/aligned_malloc.h
@@ -0,0 +1,13 @@
+/*
+ * aligned_malloc.h - aligned memory allocation
+ */
+
+#ifndef LIB_ALIGNED_MALLOC_H
+#define LIB_ALIGNED_MALLOC_H
+
+#include "lib_common.h"
+
+extern void *aligned_malloc(size_t alignment, size_t size);
+extern void aligned_free(void *ptr);
+
+#endif /* LIB_ALIGNED_MALLOC_H */
data/ext/libdeflate/libdeflate/lib/bt_matchfinder.h
@@ -0,0 +1,357 @@
+/*
+ * bt_matchfinder.h - Lempel-Ziv matchfinding with a hash table of binary trees
+ *
+ * Originally public domain; changes after 2016-09-07 are copyrighted.
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ----------------------------------------------------------------------------
+ *
+ * This is a Binary Trees (bt) based matchfinder.
+ *
+ * The main data structure is a hash table where each hash bucket contains a
+ * binary tree of sequences whose first 4 bytes share the same hash code. Each
+ * sequence is identified by its starting position in the input buffer. Each
+ * binary tree is always sorted such that each left child represents a sequence
+ * lexicographically lesser than its parent and each right child represents a
+ * sequence lexicographically greater than its parent.
+ *
+ * The algorithm processes the input buffer sequentially. At each byte
+ * position, the hash code of the first 4 bytes of the sequence beginning at
+ * that position (the sequence being matched against) is computed. This
+ * identifies the hash bucket to use for that position. Then, a new binary tree
+ * node is created to represent the current sequence. Then, in a single tree
+ * traversal, the hash bucket's binary tree is searched for matches and is
+ * re-rooted at the new node.
+ *
+ * Compared to the simpler algorithm that uses linked lists instead of binary
+ * trees (see hc_matchfinder.h), the binary tree version gains more information
+ * at each node visitation. Ideally, the binary tree version will examine only
+ * 'log(n)' nodes to find the same matches that the linked list version will
+ * find by examining 'n' nodes. In addition, the binary tree version can
+ * examine fewer bytes at each node by taking advantage of the common prefixes
+ * that result from the sort order, whereas the linked list version may have to
+ * examine up to the full length of the match at each node.
+ *
+ * However, it is not always best to use the binary tree version. It requires
+ * nearly twice as much memory as the linked list version, and it takes time to
+ * keep the binary trees sorted, even at positions where the compressor does not
+ * need matches. Generally, when doing fast compression on small buffers,
+ * binary trees are the wrong approach. They are best suited for thorough
+ * compression and/or large buffers.
+ *
+ * ----------------------------------------------------------------------------
+ */
+
+
+#include "matchfinder_common.h"
+
+#define BT_MATCHFINDER_HASH3_ORDER 16
+#define BT_MATCHFINDER_HASH3_WAYS  2
+#define BT_MATCHFINDER_HASH4_ORDER 16
+
+#define BT_MATCHFINDER_TOTAL_HASH_LENGTH                \
+        ((1UL << BT_MATCHFINDER_HASH3_ORDER) * BT_MATCHFINDER_HASH3_WAYS + \
+         (1UL << BT_MATCHFINDER_HASH4_ORDER))
+
+/* Representation of a match found by the bt_matchfinder */
+struct lz_match {
+
+        /* The number of bytes matched. */
+        u16 length;
+
+        /* The offset back from the current position that was matched. */
+        u16 offset;
+};
+
+struct bt_matchfinder {
+
+        /* The hash table for finding length 3 matches */
+        mf_pos_t hash3_tab[1UL << BT_MATCHFINDER_HASH3_ORDER][BT_MATCHFINDER_HASH3_WAYS];
+
+        /* The hash table which contains the roots of the binary trees for
+         * finding length 4+ matches */
+        mf_pos_t hash4_tab[1UL << BT_MATCHFINDER_HASH4_ORDER];
+
+        /* The child node references for the binary trees. The left and right
+         * children of the node for the sequence with position 'pos' are
+         * 'child_tab[pos * 2]' and 'child_tab[pos * 2 + 1]', respectively. */
+        mf_pos_t child_tab[2UL * MATCHFINDER_WINDOW_SIZE];
+
+}
+#ifdef _aligned_attribute
+_aligned_attribute(MATCHFINDER_ALIGNMENT)
+#endif
+;
+
+/* Prepare the matchfinder for a new input buffer. */
+static forceinline void
+bt_matchfinder_init(struct bt_matchfinder *mf)
+{
+        matchfinder_init((mf_pos_t *)mf, BT_MATCHFINDER_TOTAL_HASH_LENGTH);
+}
+
+static forceinline void
+bt_matchfinder_slide_window(struct bt_matchfinder *mf)
+{
+        matchfinder_rebase((mf_pos_t *)mf,
+                           sizeof(struct bt_matchfinder) / sizeof(mf_pos_t));
+}
+
+static forceinline mf_pos_t *
+bt_left_child(struct bt_matchfinder *mf, s32 node)
+{
+        return &mf->child_tab[2 * (node & (MATCHFINDER_WINDOW_SIZE - 1)) + 0];
+}
+
+static forceinline mf_pos_t *
+bt_right_child(struct bt_matchfinder *mf, s32 node)
+{
+        return &mf->child_tab[2 * (node & (MATCHFINDER_WINDOW_SIZE - 1)) + 1];
+}
+
+/* The minimum permissible value of 'max_len' for bt_matchfinder_get_matches()
+ * and bt_matchfinder_skip_position(). There must be sufficiently many bytes
+ * remaining to load a 32-bit integer from the *next* position. */
+#define BT_MATCHFINDER_REQUIRED_NBYTES  5
+
+/* Advance the binary tree matchfinder by one byte, optionally recording
+ * matches. @record_matches should be a compile-time constant. */
+static forceinline struct lz_match *
+bt_matchfinder_advance_one_byte(struct bt_matchfinder * const restrict mf,
+                                const u8 * const restrict in_base,
+                                const ptrdiff_t cur_pos,
+                                const u32 max_len,
+                                const u32 nice_len,
+                                const u32 max_search_depth,
+                                u32 * const restrict next_hashes,
+                                u32 * const restrict best_len_ret,
+                                struct lz_match * restrict lz_matchptr,
+                                const bool record_matches)
+{
+        const u8 *in_next = in_base + cur_pos;
+        u32 depth_remaining = max_search_depth;
+        const s32 cutoff = cur_pos - MATCHFINDER_WINDOW_SIZE;
+        u32 next_seq4;
+        u32 next_seq3;
+        u32 hash3;
+        u32 hash4;
+        s32 cur_node;
+#if BT_MATCHFINDER_HASH3_WAYS >= 2
+        s32 cur_node_2;
+#endif
+        const u8 *matchptr;
+        mf_pos_t *pending_lt_ptr, *pending_gt_ptr;
+        u32 best_lt_len, best_gt_len;
+        u32 len;
+        u32 best_len = 3;
+
+        STATIC_ASSERT(BT_MATCHFINDER_HASH3_WAYS >= 1 &&
+                      BT_MATCHFINDER_HASH3_WAYS <= 2);
+
+        next_seq4 = load_u32_unaligned(in_next + 1);
+        next_seq3 = loaded_u32_to_u24(next_seq4);
+
+        hash3 = next_hashes[0];
+        hash4 = next_hashes[1];
+
+        next_hashes[0] = lz_hash(next_seq3, BT_MATCHFINDER_HASH3_ORDER);
+        next_hashes[1] = lz_hash(next_seq4, BT_MATCHFINDER_HASH4_ORDER);
+        prefetchw(&mf->hash3_tab[next_hashes[0]]);
+        prefetchw(&mf->hash4_tab[next_hashes[1]]);
+
+        cur_node = mf->hash3_tab[hash3][0];
+        mf->hash3_tab[hash3][0] = cur_pos;
+#if BT_MATCHFINDER_HASH3_WAYS >= 2
+        cur_node_2 = mf->hash3_tab[hash3][1];
+        mf->hash3_tab[hash3][1] = cur_node;
+#endif
+        if (record_matches && cur_node > cutoff) {
+                u32 seq3 = load_u24_unaligned(in_next);
+                if (seq3 == load_u24_unaligned(&in_base[cur_node])) {
+                        lz_matchptr->length = 3;
+                        lz_matchptr->offset = in_next - &in_base[cur_node];
+                        lz_matchptr++;
+                }
+        #if BT_MATCHFINDER_HASH3_WAYS >= 2
+                else if (cur_node_2 > cutoff &&
+                         seq3 == load_u24_unaligned(&in_base[cur_node_2]))
+                {
+                        lz_matchptr->length = 3;
+                        lz_matchptr->offset = in_next - &in_base[cur_node_2];
+                        lz_matchptr++;
+                }
+        #endif
+        }
+
+        cur_node = mf->hash4_tab[hash4];
+        mf->hash4_tab[hash4] = cur_pos;
+
+        pending_lt_ptr = bt_left_child(mf, cur_pos);
+        pending_gt_ptr = bt_right_child(mf, cur_pos);
+
+        if (cur_node <= cutoff) {
+                *pending_lt_ptr = MATCHFINDER_INITVAL;
+                *pending_gt_ptr = MATCHFINDER_INITVAL;
+                *best_len_ret = best_len;
+                return lz_matchptr;
+        }
+
+        best_lt_len = 0;
+        best_gt_len = 0;
+        len = 0;
+
+        for (;;) {
+                matchptr = &in_base[cur_node];
+
+                if (matchptr[len] == in_next[len]) {
+                        len = lz_extend(in_next, matchptr, len + 1, max_len);
+                        if (!record_matches || len > best_len) {
+                                if (record_matches) {
+                                        best_len = len;
+                                        lz_matchptr->length = len;
+                                        lz_matchptr->offset = in_next - matchptr;
+                                        lz_matchptr++;
+                                }
+                                if (len >= nice_len) {
+                                        *pending_lt_ptr = *bt_left_child(mf, cur_node);
+                                        *pending_gt_ptr = *bt_right_child(mf, cur_node);
+                                        *best_len_ret = best_len;
+                                        return lz_matchptr;
+                                }
+                        }
+                }
+
+                if (matchptr[len] < in_next[len]) {
+                        *pending_lt_ptr = cur_node;
+                        pending_lt_ptr = bt_right_child(mf, cur_node);
+                        cur_node = *pending_lt_ptr;
+                        best_lt_len = len;
+                        if (best_gt_len < len)
+                                len = best_gt_len;
+                } else {
+                        *pending_gt_ptr = cur_node;
+                        pending_gt_ptr = bt_left_child(mf, cur_node);
+                        cur_node = *pending_gt_ptr;
+                        best_gt_len = len;
+                        if (best_lt_len < len)
+                                len = best_lt_len;
+                }
+
+                if (cur_node <= cutoff || !--depth_remaining) {
+                        *pending_lt_ptr = MATCHFINDER_INITVAL;
+                        *pending_gt_ptr = MATCHFINDER_INITVAL;
+                        *best_len_ret = best_len;
+                        return lz_matchptr;
+                }
+        }
+}
+
+/*
+ * Retrieve a list of matches with the current position.
+ *
+ * @mf
+ *      The matchfinder structure.
+ * @in_base
+ *      Pointer to the next byte in the input buffer to process _at the last
+ *      time bt_matchfinder_init() or bt_matchfinder_slide_window() was called_.
+ * @cur_pos
+ *      The current position in the input buffer relative to @in_base (the
+ *      position of the sequence being matched against).
+ * @max_len
+ *      The maximum permissible match length at this position. Must be >=
+ *      BT_MATCHFINDER_REQUIRED_NBYTES.
+ * @nice_len
+ *      Stop searching if a match of at least this length is found.
+ *      Must be <= @max_len.
+ * @max_search_depth
+ *      Limit on the number of potential matches to consider. Must be >= 1.
+ * @next_hashes
+ *      The precomputed hash codes for the sequence beginning at @in_next.
+ *      These will be used and then updated with the precomputed hashcodes for
+ *      the sequence beginning at @in_next + 1.
+ * @best_len_ret
+ *      If a match of length >= 4 was found, then the length of the longest such
+ *      match is written here; otherwise 3 is written here. (Note: this is
+ *      redundant with the 'struct lz_match' array, but this is easier for the
+ *      compiler to optimize when inlined and the caller immediately does a
+ *      check against 'best_len'.)
+ * @lz_matchptr
+ *      An array in which this function will record the matches. The recorded
+ *      matches will be sorted by strictly increasing length and (non-strictly)
+ *      increasing offset. The maximum number of matches that may be found is
+ *      'nice_len - 2'.
+ *
+ * The return value is a pointer to the next available slot in the @lz_matchptr
+ * array. (If no matches were found, this will be the same as @lz_matchptr.)
+ */
+static forceinline struct lz_match *
+bt_matchfinder_get_matches(struct bt_matchfinder *mf,
+                           const u8 *in_base,
+                           ptrdiff_t cur_pos,
+                           u32 max_len,
+                           u32 nice_len,
+                           u32 max_search_depth,
+                           u32 next_hashes[2],
+                           u32 *best_len_ret,
+                           struct lz_match *lz_matchptr)
+{
+        return bt_matchfinder_advance_one_byte(mf,
+                                               in_base,
+                                               cur_pos,
+                                               max_len,
+                                               nice_len,
+                                               max_search_depth,
+                                               next_hashes,
+                                               best_len_ret,
+                                               lz_matchptr,
+                                               true);
+}
+
+/*
+ * Advance the matchfinder, but don't record any matches.
+ *
+ * This is very similar to bt_matchfinder_get_matches() because both functions
+ * must do hashing and tree re-rooting.
+ */
+static forceinline void
+bt_matchfinder_skip_position(struct bt_matchfinder *mf,
+                             const u8 *in_base,
+                             ptrdiff_t cur_pos,
+                             u32 nice_len,
+                             u32 max_search_depth,
+                             u32 next_hashes[2])
+{
+        u32 best_len;
+        bt_matchfinder_advance_one_byte(mf,
+                                        in_base,
+                                        cur_pos,
+                                        nice_len,
+                                        nice_len,
+                                        max_search_depth,
+                                        next_hashes,
+                                        &best_len,
+                                        NULL,
+                                        false);
+}
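Since bt_matchfinder.h is an internal header, its calling convention is easiest to see in a sketch. The following is illustrative only: the real driver lives in lib/deflate_compress.c; the nice_len and max_search_depth values are made-up tuning numbers; 'mf' is assumed to have been allocated with aligned_malloc(MATCHFINDER_ALIGNMENT, sizeof(struct bt_matchfinder)); and the u8/u32 types plus the MIN macro come from the library's common headers.

/* Sketch: scan a buffer, collecting the match list at each position. */
u32 next_hashes[2] = {0, 0};      /* hash state carried between positions */
struct lz_match matches[64];      /* needs >= nice_len - 2 slots */
struct lz_match *end;
u32 best_len;
ptrdiff_t pos;

bt_matchfinder_init(mf);
for (pos = 0; in_len - pos >= BT_MATCHFINDER_REQUIRED_NBYTES; pos++) {
        end = bt_matchfinder_get_matches(mf, in_buf, pos,
                                         in_len - pos,          /* max_len */
                                         MIN(32, in_len - pos), /* nice_len (illustrative) */
                                         16,                    /* depth (illustrative) */
                                         next_hashes, &best_len, matches);
        /* matches..end is now sorted by strictly increasing length; a
         * compressor would pick a match or emit a literal here, and would
         * call bt_matchfinder_skip_position() for bytes it skips over. */
}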
data/ext/libdeflate/libdeflate/lib/crc32.c
@@ -0,0 +1,368 @@
+/*
+ * crc32.c - CRC-32 checksum algorithm for the gzip format
+ *
+ * Originally public domain; changes after 2016-09-07 are copyrighted.
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * High-level description of CRC
+ * =============================
+ *
+ * Consider a bit sequence 'bits[1...len]'. Interpret 'bits' as the "message"
+ * polynomial M(x) with coefficients in GF(2) (the field of integers modulo 2),
+ * where the coefficient of 'x^i' is 'bits[len - i]'. Then, compute:
+ *
+ *                  R(x) = M(x)*x^n mod G(x)
+ *
+ * where G(x) is a selected "generator" polynomial of degree 'n'. The remainder
+ * R(x) is a polynomial of max degree 'n - 1'. The CRC of 'bits' is R(x)
+ * interpreted as a bitstring of length 'n'.
+ *
+ * CRC used in gzip
+ * ================
+ *
+ * In the gzip format (RFC 1952):
+ *
+ *    - The bitstring to checksum is formed from the bytes of the uncompressed
+ *      data by concatenating the bits from the bytes in order, proceeding
+ *      from the low-order bit to the high-order bit within each byte.
+ *
+ *    - The generator polynomial G(x) is: x^32 + x^26 + x^23 + x^22 + x^16 +
+ *      x^12 + x^11 + x^10 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1.
+ *      Consequently, the CRC length is 32 bits ("CRC-32").
+ *
+ *    - The highest order 32 coefficients of M(x)*x^n are inverted.
+ *
+ *    - All 32 coefficients of R(x) are inverted.
+ *
+ * The two inversions cause added leading and trailing zero bits to affect the
+ * resulting CRC, whereas with a regular CRC such bits would have no effect on
+ * the CRC.
+ *
+ * Computation and optimizations
+ * =============================
+ *
+ * We can compute R(x) through "long division", maintaining only 32 bits of
+ * state at any given time. Multiplication by 'x' can be implemented as
+ * right-shifting by 1 (assuming the polynomial<=>bitstring mapping where the
+ * highest order bit represents the coefficient of x^0), and both addition and
+ * subtraction can be implemented as bitwise exclusive OR (since we are working
+ * in GF(2)). Here is an unoptimized implementation:
+ *
+ *      static u32 crc32_gzip(const u8 *buffer, size_t nbytes)
+ *      {
+ *              u32 remainder = 0;
+ *              const u32 divisor = 0xEDB88320;
+ *
+ *              for (size_t i = 0; i < nbytes * 8 + 32; i++) {
+ *                      int bit;
+ *                      u32 multiple;
+ *
+ *                      if (i < nbytes * 8)
+ *                              bit = (buffer[i / 8] >> (i % 8)) & 1;
+ *                      else
+ *                              bit = 0; // one of the 32 appended 0 bits
+ *
+ *                      if (i < 32) // the first 32 bits are inverted
+ *                              bit ^= 1;
+ *
+ *                      if (remainder & 1)
+ *                              multiple = divisor;
+ *                      else
+ *                              multiple = 0;
+ *
+ *                      remainder >>= 1;
+ *                      remainder |= (u32)bit << 31;
+ *                      remainder ^= multiple;
+ *              }
+ *
+ *              return ~remainder;
+ *      }
+ *
+ * In this implementation, the 32-bit integer 'remainder' maintains the
+ * remainder of the currently processed portion of the message (with 32 zero
+ * bits appended) when divided by the generator polynomial. 'remainder' is the
+ * representation of R(x), and 'divisor' is the representation of G(x) excluding
+ * the x^32 coefficient. For each bit to process, we multiply R(x) by 'x^1',
+ * then add 'x^0' if the new bit is a 1. If this causes R(x) to gain a nonzero
+ * x^32 term, then we subtract G(x) from R(x).
+ *
+ * We can speed this up by taking advantage of the fact that XOR is commutative
+ * and associative, so the order in which we combine the inputs into 'remainder'
+ * is unimportant. And since each message bit we add doesn't affect the choice
+ * of 'multiple' until 32 bits later, we need not actually add each message bit
+ * until that point:
+ *
+ *      static u32 crc32_gzip(const u8 *buffer, size_t nbytes)
+ *      {
+ *              u32 remainder = ~0;
+ *              const u32 divisor = 0xEDB88320;
+ *
+ *              for (size_t i = 0; i < nbytes * 8; i++) {
+ *                      int bit;
+ *                      u32 multiple;
+ *
+ *                      bit = (buffer[i / 8] >> (i % 8)) & 1;
+ *                      remainder ^= bit;
+ *                      if (remainder & 1)
+ *                              multiple = divisor;
+ *                      else
+ *                              multiple = 0;
+ *                      remainder >>= 1;
+ *                      remainder ^= multiple;
+ *              }
+ *
+ *              return ~remainder;
+ *      }
+ *
+ * With the above implementation we get the effect of 32 appended 0 bits for
+ * free; they never affect the choice of a divisor, nor would they change the
+ * value of 'remainder' if they were to be actually XOR'ed in. And by starting
+ * with a remainder of all 1 bits, we get the effect of complementing the first
+ * 32 message bits.
+ *
+ * The next optimization is to process the input in multi-bit units. Suppose
+ * that we insert the next 'n' message bits into the remainder. Then we get an
+ * intermediate remainder of length '32 + n' bits, and the CRC of the extra 'n'
+ * bits is the amount by which the low 32 bits of the remainder will change as a
+ * result of cancelling out those 'n' bits. Taking n=8 (one byte) and
+ * precomputing a table containing the CRC of each possible byte, we get
+ * crc32_slice1() defined below.
+ *
+ * As a further optimization, we could increase the multi-bit unit size to 16.
+ * However, that is inefficient because the table size explodes from 256 entries
+ * (1024 bytes) to 65536 entries (262144 bytes), which wastes memory and won't
+ * fit in L1 cache on typical processors.
+ *
+ * However, we can actually process 4 bytes at a time using 4 different tables
+ * with 256 entries each. Logically, we form a 64-bit intermediate remainder
+ * and cancel out the high 32 bits in 8-bit chunks. Bits 32-39 are cancelled
+ * out by the CRC of those bits, whereas bits 40-47 are cancelled out by the
+ * CRC of those bits with 8 zero bits appended, and so on. This method is
+ * implemented in crc32_slice4(), defined below.
+ *
+ * In crc32_slice8(), this method is extended to 8 bytes at a time. The
+ * intermediate remainder (which we never actually store explicitly) is 96 bits.
+ *
+ * On CPUs that support fast carryless multiplication, CRCs can be computed even
+ * more quickly via "folding". See crc32_pclmul() for an example.
+ */
+
+#include "x86_cpu_features.h"
+
+#include "libdeflate.h"
+
+/* Select the implementations to compile in. */
+
+#define NEED_GENERIC_IMPL 1 /* include generic impl unless overridden */
+#define DEFAULT_IMPL crc32_slice8
+
+/* Include the PCLMUL implementation? */
+#define NEED_PCLMUL_IMPL 0
+#if defined(__PCLMUL__) || \
+        (X86_CPU_FEATURES_ENABLED && COMPILER_SUPPORTS_PCLMUL_TARGET && \
+         COMPILER_SUPPORTS_TARGET_INTRINSICS)
+#  include <wmmintrin.h>
+#  undef NEED_PCLMUL_IMPL
+#  define NEED_PCLMUL_IMPL 1
+#  ifdef __PCLMUL__ /* compiling for PCLMUL, i.e. can we assume it's there? */
+#    undef NEED_GENERIC_IMPL
+#    define NEED_GENERIC_IMPL 0 /* generic impl not needed */
+#    undef DEFAULT_IMPL
+#    define DEFAULT_IMPL crc32_pclmul
+#  endif /* otherwise, we can build a PCLMUL version, but we won't know whether
+            we can use it until runtime */
+#endif
+
+/*
+ * Include the PCLMUL/AVX implementation? Although our PCLMUL-optimized CRC-32
+ * function doesn't use any AVX intrinsics specifically, it can benefit a lot
+ * from being compiled for an AVX target: on Skylake, ~16700 MB/s vs. ~10100
+ * MB/s. I expect this is related to the PCLMULQDQ instructions being assembled
+ * in the newer three-operand form rather than the older two-operand form.
+ *
+ * Note: this is only needed if __AVX__ is *not* defined, since otherwise the
+ * "regular" PCLMUL implementation would already be AVX enabled.
+ */
+#define NEED_PCLMUL_AVX_IMPL 0
+#if NEED_PCLMUL_IMPL && !defined(__AVX__) && \
+        X86_CPU_FEATURES_ENABLED && COMPILER_SUPPORTS_AVX_TARGET
+#  undef NEED_PCLMUL_AVX_IMPL
+#  define NEED_PCLMUL_AVX_IMPL 1
+#endif
+
+#define NUM_IMPLS (NEED_GENERIC_IMPL + NEED_PCLMUL_IMPL + NEED_PCLMUL_AVX_IMPL)
+
+/* Define the CRC-32 table */
+#if NEED_GENERIC_IMPL
+#  define CRC32_SLICE8
+#else
+#  define CRC32_SLICE1 /* only need short table for unaligned ends */
+#endif
+#include "crc32_table.h"
+
+static forceinline u32
+crc32_update_byte(u32 remainder, u8 next_byte)
+{
+        return (remainder >> 8) ^ crc32_table[(u8)remainder ^ next_byte];
+}
+
+#if defined(CRC32_SLICE1) || (NUM_IMPLS > NEED_GENERIC_IMPL)
+static u32
+crc32_slice1(u32 remainder, const u8 *buffer, size_t nbytes)
+{
+        size_t i;
+
+        STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x100);
+
+        for (i = 0; i < nbytes; i++)
+                remainder = crc32_update_byte(remainder, buffer[i]);
+        return remainder;
+}
+#endif
+
+#ifdef CRC32_SLICE4
+static u32
+crc32_slice4(u32 remainder, const u8 *buffer, size_t nbytes)
+{
+        const u8 *p = buffer;
+        const u8 *end = buffer + nbytes;
+        const u8 *end32;
+
+        STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x400);
+
+        for (; ((uintptr_t)p & 3) && p != end; p++)
+                remainder = crc32_update_byte(remainder, *p);
+
+        end32 = p + ((end - p) & ~3);
+        for (; p != end32; p += 4) {
+                u32 v = le32_bswap(*(const u32 *)p);
+                remainder =
+                    crc32_table[0x300 + (u8)((remainder ^ v) >> 0)] ^
+                    crc32_table[0x200 + (u8)((remainder ^ v) >> 8)] ^
+                    crc32_table[0x100 + (u8)((remainder ^ v) >> 16)] ^
+                    crc32_table[0x000 + (u8)((remainder ^ v) >> 24)];
+        }
+
+        for (; p != end; p++)
+                remainder = crc32_update_byte(remainder, *p);
+
+        return remainder;
+}
+#endif
+
+#ifdef CRC32_SLICE8
+static u32
+crc32_slice8(u32 remainder, const u8 *buffer, size_t nbytes)
+{
+        const u8 *p = buffer;
+        const u8 *end = buffer + nbytes;
+        const u8 *end64;
+
+        STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x800);
+
+        for (; ((uintptr_t)p & 7) && p != end; p++)
+                remainder = crc32_update_byte(remainder, *p);
+
+        end64 = p + ((end - p) & ~7);
+        for (; p != end64; p += 8) {
+                u32 v1 = le32_bswap(*(const u32 *)(p + 0));
+                u32 v2 = le32_bswap(*(const u32 *)(p + 4));
+                remainder =
+                    crc32_table[0x700 + (u8)((remainder ^ v1) >> 0)] ^
+                    crc32_table[0x600 + (u8)((remainder ^ v1) >> 8)] ^
+                    crc32_table[0x500 + (u8)((remainder ^ v1) >> 16)] ^
+                    crc32_table[0x400 + (u8)((remainder ^ v1) >> 24)] ^
+                    crc32_table[0x300 + (u8)(v2 >> 0)] ^
+                    crc32_table[0x200 + (u8)(v2 >> 8)] ^
+                    crc32_table[0x100 + (u8)(v2 >> 16)] ^
+                    crc32_table[0x000 + (u8)(v2 >> 24)];
+        }
+
+        for (; p != end; p++)
+                remainder = crc32_update_byte(remainder, *p);
+
+        return remainder;
+}
+#endif
+
+/* Define the PCLMUL implementation if needed. */
+#if NEED_PCLMUL_IMPL
+#  define FUNCNAME              crc32_pclmul
+#  define FUNCNAME_ALIGNED      crc32_pclmul_aligned
+#  ifdef __PCLMUL__
+#    define ATTRIBUTES
+#  else
+#    define ATTRIBUTES          __attribute__((target("pclmul")))
+#  endif
+#  include "crc32_impl.h"
+#endif
+
+/* Define the PCLMUL/AVX implementation if needed. */
+#if NEED_PCLMUL_AVX_IMPL
+#  define FUNCNAME              crc32_pclmul_avx
+#  define FUNCNAME_ALIGNED      crc32_pclmul_avx_aligned
+#  define ATTRIBUTES            __attribute__((target("pclmul,avx")))
+#  include "crc32_impl.h"
+#endif
+
+typedef u32 (*crc32_func_t)(u32, const u8 *, size_t);
+
+/*
+ * If multiple implementations are available, then dispatch among them based on
+ * CPU features at runtime. Otherwise just call the single one directly.
+ */
+#if NUM_IMPLS == 1
+#  define crc32_impl DEFAULT_IMPL
+#else
+static u32 dispatch(u32, const u8 *, size_t);
+
+static crc32_func_t crc32_impl = dispatch;
+
+static u32 dispatch(u32 remainder, const u8 *buffer, size_t nbytes)
+{
+        crc32_func_t f = DEFAULT_IMPL;
+#if NEED_PCLMUL_IMPL && !defined(__PCLMUL__)
+        if (x86_have_cpu_features(X86_CPU_FEATURE_PCLMULQDQ))
+                f = crc32_pclmul;
+#endif
+#if NEED_PCLMUL_AVX_IMPL
+        if (x86_have_cpu_features(X86_CPU_FEATURE_PCLMULQDQ |
+                                  X86_CPU_FEATURE_AVX))
+                f = crc32_pclmul_avx;
+#endif
+        crc32_impl = f;
+        return crc32_impl(remainder, buffer, nbytes);
+}
+#endif /* NUM_IMPLS != 1 */
+
+LIBDEFLATEAPI u32
+libdeflate_crc32(u32 remainder, const void *buffer, size_t nbytes)
+{
+        if (buffer == NULL) /* return initial value */
+                return 0;
+        return ~crc32_impl(~remainder, buffer, nbytes);
+}
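The exported libdeflate_crc32() is incremental: a NULL buffer returns the initial value (0), and passing the previous return value as 'remainder' continues the checksum across chunks; internally the result of each call is un-inverted (~remainder) before more bytes are folded in, which is what makes chaining work. A short sketch, assuming the public libdeflate.h prototype uses uint32_t as this file's u32 suggests:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#include "libdeflate.h"

int main(void)
{
        const char *msg = "hello, libdeflate";
        size_t n = strlen(msg), half = n / 2;

        /* One-shot CRC, starting from the initial value 0. */
        uint32_t whole = libdeflate_crc32(0, msg, n);

        /* Same bytes fed in two chunks; the result must match. */
        uint32_t crc = libdeflate_crc32(0, NULL, 0);    /* == 0, initial value */
        crc = libdeflate_crc32(crc, msg, half);
        crc = libdeflate_crc32(crc, msg + half, n - half);

        printf("match: %d\n", crc == whole);            /* prints 1 */
        return 0;
}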