libdeflate 0.1.1 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.github/workflows/test.yml +34 -0
- data/README.md +1 -6
- data/ext/libdeflate/extconf.rb +18 -7
- data/ext/libdeflate/libdeflate_ext.c +17 -17
- data/lib/libdeflate/version.rb +1 -1
- data/libdeflate.gemspec +2 -1
- metadata +13 -84
- data/.gitmodules +0 -3
- data/.travis.yml +0 -5
- data/ext/libdeflate/libdeflate/.gitignore +0 -19
- data/ext/libdeflate/libdeflate/COPYING +0 -21
- data/ext/libdeflate/libdeflate/Makefile +0 -231
- data/ext/libdeflate/libdeflate/Makefile.msc +0 -64
- data/ext/libdeflate/libdeflate/NEWS +0 -57
- data/ext/libdeflate/libdeflate/README.md +0 -170
- data/ext/libdeflate/libdeflate/common/common_defs.h +0 -351
- data/ext/libdeflate/libdeflate/common/compiler_gcc.h +0 -134
- data/ext/libdeflate/libdeflate/common/compiler_msc.h +0 -95
- data/ext/libdeflate/libdeflate/lib/adler32.c +0 -213
- data/ext/libdeflate/libdeflate/lib/adler32_impl.h +0 -281
- data/ext/libdeflate/libdeflate/lib/aligned_malloc.c +0 -57
- data/ext/libdeflate/libdeflate/lib/aligned_malloc.h +0 -13
- data/ext/libdeflate/libdeflate/lib/bt_matchfinder.h +0 -357
- data/ext/libdeflate/libdeflate/lib/crc32.c +0 -368
- data/ext/libdeflate/libdeflate/lib/crc32_impl.h +0 -286
- data/ext/libdeflate/libdeflate/lib/crc32_table.h +0 -526
- data/ext/libdeflate/libdeflate/lib/decompress_impl.h +0 -404
- data/ext/libdeflate/libdeflate/lib/deflate_compress.c +0 -2817
- data/ext/libdeflate/libdeflate/lib/deflate_compress.h +0 -14
- data/ext/libdeflate/libdeflate/lib/deflate_constants.h +0 -66
- data/ext/libdeflate/libdeflate/lib/deflate_decompress.c +0 -889
- data/ext/libdeflate/libdeflate/lib/gzip_compress.c +0 -95
- data/ext/libdeflate/libdeflate/lib/gzip_constants.h +0 -45
- data/ext/libdeflate/libdeflate/lib/gzip_decompress.c +0 -130
- data/ext/libdeflate/libdeflate/lib/hc_matchfinder.h +0 -405
- data/ext/libdeflate/libdeflate/lib/lib_common.h +0 -35
- data/ext/libdeflate/libdeflate/lib/matchfinder_avx2.h +0 -53
- data/ext/libdeflate/libdeflate/lib/matchfinder_common.h +0 -205
- data/ext/libdeflate/libdeflate/lib/matchfinder_neon.h +0 -61
- data/ext/libdeflate/libdeflate/lib/matchfinder_sse2.h +0 -53
- data/ext/libdeflate/libdeflate/lib/unaligned.h +0 -202
- data/ext/libdeflate/libdeflate/lib/x86_cpu_features.c +0 -169
- data/ext/libdeflate/libdeflate/lib/x86_cpu_features.h +0 -48
- data/ext/libdeflate/libdeflate/lib/zlib_compress.c +0 -87
- data/ext/libdeflate/libdeflate/lib/zlib_constants.h +0 -21
- data/ext/libdeflate/libdeflate/lib/zlib_decompress.c +0 -91
- data/ext/libdeflate/libdeflate/libdeflate.h +0 -274
- data/ext/libdeflate/libdeflate/programs/benchmark.c +0 -558
- data/ext/libdeflate/libdeflate/programs/checksum.c +0 -197
- data/ext/libdeflate/libdeflate/programs/detect.sh +0 -62
- data/ext/libdeflate/libdeflate/programs/gzip.c +0 -603
- data/ext/libdeflate/libdeflate/programs/prog_util.c +0 -530
- data/ext/libdeflate/libdeflate/programs/prog_util.h +0 -162
- data/ext/libdeflate/libdeflate/programs/test_checksums.c +0 -135
- data/ext/libdeflate/libdeflate/programs/tgetopt.c +0 -118
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/Makefile +0 -12
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_compress/fuzz.c +0 -40
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_compress/inputs/0 +0 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_decompress/fuzz.c +0 -28
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_decompress/inputs/0 +0 -3
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/gzip_decompress/fuzz.c +0 -28
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/gzip_decompress/inputs/0 +0 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/prepare_for_fuzz.sh +0 -14
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/zlib_decompress/fuzz.c +0 -28
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/zlib_decompress/inputs/0 +0 -3
- data/ext/libdeflate/libdeflate/tools/android_build.sh +0 -104
- data/ext/libdeflate/libdeflate/tools/checksum_benchmarks.sh +0 -76
- data/ext/libdeflate/libdeflate/tools/exec_tests.sh +0 -30
- data/ext/libdeflate/libdeflate/tools/gen_crc32_multipliers.c +0 -108
- data/ext/libdeflate/libdeflate/tools/gen_crc32_table.c +0 -100
- data/ext/libdeflate/libdeflate/tools/gzip_tests.sh +0 -412
- data/ext/libdeflate/libdeflate/tools/make-windows-releases +0 -21
- data/ext/libdeflate/libdeflate/tools/mips_build.sh +0 -9
- data/ext/libdeflate/libdeflate/tools/msc_test.bat +0 -3
- data/ext/libdeflate/libdeflate/tools/pgo_build.sh +0 -23
- data/ext/libdeflate/libdeflate/tools/produce_gzip_benchmark_table.sh +0 -37
- data/ext/libdeflate/libdeflate/tools/run_tests.sh +0 -305
- data/ext/libdeflate/libdeflate/tools/windows_build.sh +0 -10
@@ -1,2817 +0,0 @@
|
|
1
|
-
/*
|
2
|
-
* deflate_compress.c - a compressor for DEFLATE
|
3
|
-
*
|
4
|
-
* Originally public domain; changes after 2016-09-07 are copyrighted.
|
5
|
-
*
|
6
|
-
* Copyright 2016 Eric Biggers
|
7
|
-
*
|
8
|
-
* Permission is hereby granted, free of charge, to any person
|
9
|
-
* obtaining a copy of this software and associated documentation
|
10
|
-
* files (the "Software"), to deal in the Software without
|
11
|
-
* restriction, including without limitation the rights to use,
|
12
|
-
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
13
|
-
* copies of the Software, and to permit persons to whom the
|
14
|
-
* Software is furnished to do so, subject to the following
|
15
|
-
* conditions:
|
16
|
-
*
|
17
|
-
* The above copyright notice and this permission notice shall be
|
18
|
-
* included in all copies or substantial portions of the Software.
|
19
|
-
*
|
20
|
-
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
21
|
-
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
22
|
-
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
23
|
-
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
24
|
-
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
25
|
-
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
26
|
-
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
27
|
-
* OTHER DEALINGS IN THE SOFTWARE.
|
28
|
-
*/
|
29
|
-
|
30
|
-
#include <stdlib.h>
|
31
|
-
#include <string.h>
|
32
|
-
|
33
|
-
#include "aligned_malloc.h"
|
34
|
-
#include "deflate_compress.h"
|
35
|
-
#include "deflate_constants.h"
|
36
|
-
#include "unaligned.h"
|
37
|
-
|
38
|
-
#include "libdeflate.h"
|
39
|
-
|
40
|
-
/*
|
41
|
-
* By default, the near-optimal parsing algorithm is enabled at compression
|
42
|
-
* level 8 and above. The near-optimal parsing algorithm produces a compression
|
43
|
-
* ratio significantly better than the greedy and lazy algorithms implemented
|
44
|
-
* here, and also the algorithm used by zlib at level 9. However, it is slow.
|
45
|
-
*/
|
46
|
-
#define SUPPORT_NEAR_OPTIMAL_PARSING 1
|
47
|
-
|
48
|
-
/*
|
49
|
-
* Define to 1 to maintain the full map from match offsets to offset slots.
|
50
|
-
* This slightly speeds up translations of match offsets to offset slots, but it
|
51
|
-
* uses 32769 bytes of memory rather than the 512 bytes used by the condensed
|
52
|
-
* map. The speedup provided by the larger map is most helpful when the
|
53
|
-
* near-optimal parsing algorithm is being used.
|
54
|
-
*/
|
55
|
-
#define USE_FULL_OFFSET_SLOT_FAST SUPPORT_NEAR_OPTIMAL_PARSING
|
56
|
-
|
57
|
-
/*
|
58
|
-
* DEFLATE uses a 32768 byte sliding window; set the matchfinder parameters
|
59
|
-
* appropriately.
|
60
|
-
*/
|
61
|
-
#define MATCHFINDER_WINDOW_ORDER 15
|
62
|
-
|
63
|
-
#include "hc_matchfinder.h"
|
64
|
-
#if SUPPORT_NEAR_OPTIMAL_PARSING
|
65
|
-
# include "bt_matchfinder.h"
|
66
|
-
#endif
|
67
|
-
|
68
|
-
/*
|
69
|
-
* The compressor always chooses a block of at least MIN_BLOCK_LENGTH bytes,
|
70
|
-
* except if the last block has to be shorter.
|
71
|
-
*/
|
72
|
-
#define MIN_BLOCK_LENGTH 10000
|
73
|
-
|
74
|
-
/*
|
75
|
-
* The compressor attempts to end blocks after SOFT_MAX_BLOCK_LENGTH bytes, but
|
76
|
-
* the final length might be slightly longer due to matches extending beyond
|
77
|
-
* this limit.
|
78
|
-
*/
|
79
|
-
#define SOFT_MAX_BLOCK_LENGTH 300000
|
80
|
-
|
81
|
-
/*
|
82
|
-
* The number of observed matches or literals that represents sufficient data to
|
83
|
-
* decide whether the current block should be terminated or not.
|
84
|
-
*/
|
85
|
-
#define NUM_OBSERVATIONS_PER_BLOCK_CHECK 512
|
86
|
-
|
87
|
-
|
88
|
-
#if SUPPORT_NEAR_OPTIMAL_PARSING
|
89
|
-
/* Constants specific to the near-optimal parsing algorithm */
|
90
|
-
|
91
|
-
/*
|
92
|
-
* The maximum number of matches the matchfinder can find at a single position.
|
93
|
-
* Since the matchfinder never finds more than one match for the same length,
|
94
|
-
* presuming one of each possible length is sufficient for an upper bound.
|
95
|
-
* (This says nothing about whether it is worthwhile to consider so many
|
96
|
-
* matches; this is just defining the worst case.)
|
97
|
-
*/
|
98
|
-
# define MAX_MATCHES_PER_POS (DEFLATE_MAX_MATCH_LEN - DEFLATE_MIN_MATCH_LEN + 1)
|
99
|
-
|
100
|
-
/*
|
101
|
-
* The number of lz_match structures in the match cache, excluding the extra
|
102
|
-
* "overflow" entries. This value should be high enough so that nearly the
|
103
|
-
* time, all matches found in a given block can fit in the match cache.
|
104
|
-
* However, fallback behavior (immediately terminating the block) on cache
|
105
|
-
* overflow is still required.
|
106
|
-
*/
|
107
|
-
# define CACHE_LENGTH (SOFT_MAX_BLOCK_LENGTH * 5)
|
108
|
-
|
109
|
-
#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
|
110
|
-
|
111
|
-
/*
|
112
|
-
* These are the compressor-side limits on the codeword lengths for each Huffman
|
113
|
-
* code. To make outputting bits slightly faster, some of these limits are
|
114
|
-
* lower than the limits defined by the DEFLATE format. This does not
|
115
|
-
* significantly affect the compression ratio, at least for the block lengths we
|
116
|
-
* use.
|
117
|
-
*/
|
118
|
-
#define MAX_LITLEN_CODEWORD_LEN 14
|
119
|
-
#define MAX_OFFSET_CODEWORD_LEN DEFLATE_MAX_OFFSET_CODEWORD_LEN
|
120
|
-
#define MAX_PRE_CODEWORD_LEN DEFLATE_MAX_PRE_CODEWORD_LEN
|
121
|
-
|
122
|
-
/* Table: length slot => length slot base value */
|
123
|
-
static const unsigned deflate_length_slot_base[] = {
|
124
|
-
3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 ,
|
125
|
-
11 , 13 , 15 , 17 , 19 , 23 , 27 , 31 ,
|
126
|
-
35 , 43 , 51 , 59 , 67 , 83 , 99 , 115 ,
|
127
|
-
131 , 163 , 195 , 227 , 258 ,
|
128
|
-
};
|
129
|
-
|
130
|
-
/* Table: length slot => number of extra length bits */
|
131
|
-
static const u8 deflate_extra_length_bits[] = {
|
132
|
-
0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
|
133
|
-
1 , 1 , 1 , 1 , 2 , 2 , 2 , 2 ,
|
134
|
-
3 , 3 , 3 , 3 , 4 , 4 , 4 , 4 ,
|
135
|
-
5 , 5 , 5 , 5 , 0 ,
|
136
|
-
};
|
137
|
-
|
138
|
-
/* Table: offset slot => offset slot base value */
|
139
|
-
static const unsigned deflate_offset_slot_base[] = {
|
140
|
-
1 , 2 , 3 , 4 , 5 , 7 , 9 , 13 ,
|
141
|
-
17 , 25 , 33 , 49 , 65 , 97 , 129 , 193 ,
|
142
|
-
257 , 385 , 513 , 769 , 1025 , 1537 , 2049 , 3073 ,
|
143
|
-
4097 , 6145 , 8193 , 12289 , 16385 , 24577 ,
|
144
|
-
};
|
145
|
-
|
146
|
-
/* Table: offset slot => number of extra offset bits */
|
147
|
-
static const u8 deflate_extra_offset_bits[] = {
|
148
|
-
0 , 0 , 0 , 0 , 1 , 1 , 2 , 2 ,
|
149
|
-
3 , 3 , 4 , 4 , 5 , 5 , 6 , 6 ,
|
150
|
-
7 , 7 , 8 , 8 , 9 , 9 , 10 , 10 ,
|
151
|
-
11 , 11 , 12 , 12 , 13 , 13 ,
|
152
|
-
};
|
153
|
-
|
154
|
-
/* Table: length => length slot */
|
155
|
-
static const u8 deflate_length_slot[DEFLATE_MAX_MATCH_LEN + 1] = {
|
156
|
-
0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12,
|
157
|
-
12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16,
|
158
|
-
16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18,
|
159
|
-
18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20,
|
160
|
-
20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
|
161
|
-
21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
|
162
|
-
22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
|
163
|
-
23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
|
164
|
-
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25,
|
165
|
-
25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
|
166
|
-
25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26,
|
167
|
-
26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
|
168
|
-
26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
|
169
|
-
27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
|
170
|
-
27, 27, 28,
|
171
|
-
};
|
172
|
-
|
173
|
-
/* The order in which precode codeword lengths are stored */
|
174
|
-
static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
|
175
|
-
16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
|
176
|
-
};
|
177
|
-
|
178
|
-
/* Codewords for the DEFLATE Huffman codes. */
|
179
|
-
struct deflate_codewords {
|
180
|
-
u32 litlen[DEFLATE_NUM_LITLEN_SYMS];
|
181
|
-
u32 offset[DEFLATE_NUM_OFFSET_SYMS];
|
182
|
-
};
|
183
|
-
|
184
|
-
/* Codeword lengths (in bits) for the DEFLATE Huffman codes.
|
185
|
-
* A zero length means the corresponding symbol had zero frequency. */
|
186
|
-
struct deflate_lens {
|
187
|
-
u8 litlen[DEFLATE_NUM_LITLEN_SYMS];
|
188
|
-
u8 offset[DEFLATE_NUM_OFFSET_SYMS];
|
189
|
-
};
|
190
|
-
|
191
|
-
/* Codewords and lengths for the DEFLATE Huffman codes. */
|
192
|
-
struct deflate_codes {
|
193
|
-
struct deflate_codewords codewords;
|
194
|
-
struct deflate_lens lens;
|
195
|
-
};
|
196
|
-
|
197
|
-
/* Symbol frequency counters for the DEFLATE Huffman codes. */
|
198
|
-
struct deflate_freqs {
|
199
|
-
u32 litlen[DEFLATE_NUM_LITLEN_SYMS];
|
200
|
-
u32 offset[DEFLATE_NUM_OFFSET_SYMS];
|
201
|
-
};
|
202
|
-
|
203
|
-
#if SUPPORT_NEAR_OPTIMAL_PARSING
|
204
|
-
|
205
|
-
/* Costs for the near-optimal parsing algorithm. */
|
206
|
-
struct deflate_costs {
|
207
|
-
|
208
|
-
/* The cost to output each possible literal. */
|
209
|
-
u32 literal[DEFLATE_NUM_LITERALS];
|
210
|
-
|
211
|
-
/* The cost to output each possible match length. */
|
212
|
-
u32 length[DEFLATE_MAX_MATCH_LEN + 1];
|
213
|
-
|
214
|
-
/* The cost to output a match offset of each possible offset slot. */
|
215
|
-
u32 offset_slot[DEFLATE_NUM_OFFSET_SYMS];
|
216
|
-
};
|
217
|
-
|
218
|
-
/*
|
219
|
-
* COST_SHIFT is a scaling factor that makes it possible to consider fractional
|
220
|
-
* bit costs. A token requiring 'n' bits to represent has cost n << COST_SHIFT.
|
221
|
-
*
|
222
|
-
* Note: this is only useful as a statistical trick for when the true costs are
|
223
|
-
* unknown. In reality, each token in DEFLATE requires a whole number of bits
|
224
|
-
* to output.
|
225
|
-
*/
|
226
|
-
#define COST_SHIFT 3
|
227
|
-
|
228
|
-
/*
|
229
|
-
* The NOSTAT_BITS value for a given alphabet is the number of bits assumed to
|
230
|
-
* be needed to output a symbol that was unused in the previous optimization
|
231
|
-
* pass. Assigning a default cost allows the symbol to be used in the next
|
232
|
-
* optimization pass. However, the cost should be relatively high because the
|
233
|
-
* symbol probably won't be used very many times (if at all).
|
234
|
-
*/
|
235
|
-
#define LITERAL_NOSTAT_BITS 13
|
236
|
-
#define LENGTH_NOSTAT_BITS 13
|
237
|
-
#define OFFSET_NOSTAT_BITS 10
|
238
|
-
|
239
|
-
#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
|
240
|
-
|
241
|
-
/*
|
242
|
-
* Represents a run of literals followed by a match or end-of-block. This
|
243
|
-
* struct is needed to temporarily store items chosen by the parser, since items
|
244
|
-
* cannot be written until all items for the block have been chosen and the
|
245
|
-
* block's Huffman codes have been computed.
|
246
|
-
*/
|
247
|
-
struct deflate_sequence {
|
248
|
-
|
249
|
-
/* Bits 0..22: the number of literals in this run. This may be 0 and
|
250
|
-
* can be at most about SOFT_MAX_BLOCK_LENGTH. The literals are not
|
251
|
-
* stored explicitly in this structure; instead, they are read directly
|
252
|
-
* from the uncompressed data.
|
253
|
-
*
|
254
|
-
* Bits 23..31: the length of the match which follows the literals, or 0
|
255
|
-
* if this literal run was the last in the block, so there is no match
|
256
|
-
* which follows it. */
|
257
|
-
u32 litrunlen_and_length;
|
258
|
-
|
259
|
-
/* If 'length' doesn't indicate end-of-block, then this is the offset of
|
260
|
-
* the match which follows the literals. */
|
261
|
-
u16 offset;
|
262
|
-
|
263
|
-
/* If 'length' doesn't indicate end-of-block, then this is the offset
|
264
|
-
* symbol of the match which follows the literals. */
|
265
|
-
u8 offset_symbol;
|
266
|
-
|
267
|
-
/* If 'length' doesn't indicate end-of-block, then this is the length
|
268
|
-
* slot of the match which follows the literals. */
|
269
|
-
u8 length_slot;
|
270
|
-
};
|
271
|
-
|
272
|
-
#if SUPPORT_NEAR_OPTIMAL_PARSING
|
273
|
-
|
274
|
-
/*
|
275
|
-
* This structure represents a byte position in the input data and a node in the
|
276
|
-
* graph of possible match/literal choices for the current block.
|
277
|
-
*
|
278
|
-
* Logically, each incoming edge to this node is labeled with a literal or a
|
279
|
-
* match that can be taken to reach this position from an earlier position; and
|
280
|
-
* each outgoing edge from this node is labeled with a literal or a match that
|
281
|
-
* can be taken to advance from this position to a later position.
|
282
|
-
*
|
283
|
-
* But these "edges" are actually stored elsewhere (in 'match_cache'). Here we
|
284
|
-
* associate with each node just two pieces of information:
|
285
|
-
*
|
286
|
-
* 'cost_to_end' is the minimum cost to reach the end of the block from
|
287
|
-
* this position.
|
288
|
-
*
|
289
|
-
* 'item' represents the literal or match that must be chosen from here to
|
290
|
-
* reach the end of the block with the minimum cost. Equivalently, this
|
291
|
-
* can be interpreted as the label of the outgoing edge on the minimum-cost
|
292
|
-
* path to the "end of block" node from this node.
|
293
|
-
*/
|
294
|
-
struct deflate_optimum_node {
|
295
|
-
|
296
|
-
u32 cost_to_end;
|
297
|
-
|
298
|
-
/*
|
299
|
-
* Notes on the match/literal representation used here:
|
300
|
-
*
|
301
|
-
* The low bits of 'item' are the length: 1 if this is a literal,
|
302
|
-
* or the match length if this is a match.
|
303
|
-
*
|
304
|
-
* The high bits of 'item' are the actual literal byte if this is a
|
305
|
-
* literal, or the match offset if this is a match.
|
306
|
-
*/
|
307
|
-
#define OPTIMUM_OFFSET_SHIFT 9
|
308
|
-
#define OPTIMUM_LEN_MASK (((u32)1 << OPTIMUM_OFFSET_SHIFT) - 1)
|
309
|
-
u32 item;
|
310
|
-
|
311
|
-
};
|
312
|
-
|
313
|
-
#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
|
314
|
-
|
315
|
-
/* Block split statistics. See "Block splitting algorithm" below. */
|
316
|
-
#define NUM_LITERAL_OBSERVATION_TYPES 8
|
317
|
-
#define NUM_MATCH_OBSERVATION_TYPES 2
|
318
|
-
#define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + NUM_MATCH_OBSERVATION_TYPES)
|
319
|
-
struct block_split_stats {
|
320
|
-
u32 new_observations[NUM_OBSERVATION_TYPES];
|
321
|
-
u32 observations[NUM_OBSERVATION_TYPES];
|
322
|
-
u32 num_new_observations;
|
323
|
-
u32 num_observations;
|
324
|
-
};
|
325
|
-
|
326
|
-
/* The main DEFLATE compressor structure */
|
327
|
-
struct libdeflate_compressor {
|
328
|
-
|
329
|
-
/* Pointer to the compress() implementation chosen at allocation time */
|
330
|
-
size_t (*impl)(struct libdeflate_compressor *,
|
331
|
-
const u8 *, size_t, u8 *, size_t);
|
332
|
-
|
333
|
-
/* Frequency counters for the current block */
|
334
|
-
struct deflate_freqs freqs;
|
335
|
-
|
336
|
-
/* Dynamic Huffman codes for the current block */
|
337
|
-
struct deflate_codes codes;
|
338
|
-
|
339
|
-
/* Static Huffman codes */
|
340
|
-
struct deflate_codes static_codes;
|
341
|
-
|
342
|
-
/* Block split statistics for the currently pending block */
|
343
|
-
struct block_split_stats split_stats;
|
344
|
-
|
345
|
-
/* A table for fast lookups of offset slot by match offset.
|
346
|
-
*
|
347
|
-
* If the full table is being used, it is a direct mapping from offset
|
348
|
-
* to offset slot.
|
349
|
-
*
|
350
|
-
* If the condensed table is being used, the first 256 entries map
|
351
|
-
* directly to the offset slots of offsets 1 through 256. The next 256
|
352
|
-
* entries map to the offset slots for the remaining offsets, stepping
|
353
|
-
* through the offsets with a stride of 128. This relies on the fact
|
354
|
-
* that each of the remaining offset slots contains at least 128 offsets
|
355
|
-
* and has an offset base that is a multiple of 128. */
|
356
|
-
#if USE_FULL_OFFSET_SLOT_FAST
|
357
|
-
u8 offset_slot_fast[DEFLATE_MAX_MATCH_OFFSET + 1];
|
358
|
-
#else
|
359
|
-
u8 offset_slot_fast[512];
|
360
|
-
#endif
|
361
|
-
|
362
|
-
/* The "nice" match length: if a match of this length is found, choose
|
363
|
-
* it immediately without further consideration. */
|
364
|
-
unsigned nice_match_length;
|
365
|
-
|
366
|
-
/* The maximum search depth: consider at most this many potential
|
367
|
-
* matches at each position. */
|
368
|
-
unsigned max_search_depth;
|
369
|
-
|
370
|
-
/* The compression level with which this compressor was created. */
|
371
|
-
unsigned compression_level;
|
372
|
-
|
373
|
-
/* Temporary space for Huffman code output */
|
374
|
-
u32 precode_freqs[DEFLATE_NUM_PRECODE_SYMS];
|
375
|
-
u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS];
|
376
|
-
u32 precode_codewords[DEFLATE_NUM_PRECODE_SYMS];
|
377
|
-
unsigned precode_items[DEFLATE_NUM_LITLEN_SYMS + DEFLATE_NUM_OFFSET_SYMS];
|
378
|
-
unsigned num_litlen_syms;
|
379
|
-
unsigned num_offset_syms;
|
380
|
-
unsigned num_explicit_lens;
|
381
|
-
unsigned num_precode_items;
|
382
|
-
|
383
|
-
union {
|
384
|
-
/* Data for greedy or lazy parsing */
|
385
|
-
struct {
|
386
|
-
/* Hash chain matchfinder */
|
387
|
-
struct hc_matchfinder hc_mf;
|
388
|
-
|
389
|
-
/* The matches and literals that the parser has chosen
|
390
|
-
* for the current block. The required length of this
|
391
|
-
* array is limited by the maximum number of matches
|
392
|
-
* that can ever be chosen for a single block, plus one
|
393
|
-
* for the special entry at the end. */
|
394
|
-
struct deflate_sequence sequences[
|
395
|
-
DIV_ROUND_UP(SOFT_MAX_BLOCK_LENGTH,
|
396
|
-
DEFLATE_MIN_MATCH_LEN) + 1];
|
397
|
-
} g; /* (g)reedy */
|
398
|
-
|
399
|
-
#if SUPPORT_NEAR_OPTIMAL_PARSING
|
400
|
-
/* Data for near-optimal parsing */
|
401
|
-
struct {
|
402
|
-
|
403
|
-
/* Binary tree matchfinder */
|
404
|
-
struct bt_matchfinder bt_mf;
|
405
|
-
|
406
|
-
/*
|
407
|
-
* Cached matches for the current block. This array
|
408
|
-
* contains the matches that were found at each position
|
409
|
-
* in the block. Specifically, for each position, there
|
410
|
-
* is a list of matches found at that position, if any,
|
411
|
-
* sorted by strictly increasing length. In addition,
|
412
|
-
* following the matches for each position, there is a
|
413
|
-
* special 'struct lz_match' whose 'length' member
|
414
|
-
* contains the number of matches found at that
|
415
|
-
* position, and whose 'offset' member contains the
|
416
|
-
* literal at that position.
|
417
|
-
*
|
418
|
-
* Note: in rare cases, there will be a very high number
|
419
|
-
* of matches in the block and this array will overflow.
|
420
|
-
* If this happens, we force the end of the current
|
421
|
-
* block. CACHE_LENGTH is the length at which we
|
422
|
-
* actually check for overflow. The extra slots beyond
|
423
|
-
* this are enough to absorb the worst case overflow,
|
424
|
-
* which occurs if starting at &match_cache[CACHE_LENGTH
|
425
|
-
* - 1], we write MAX_MATCHES_PER_POS matches and a
|
426
|
-
* match count header, then skip searching for matches
|
427
|
-
* at 'DEFLATE_MAX_MATCH_LEN - 1' positions and write
|
428
|
-
* the match count header for each.
|
429
|
-
*/
|
430
|
-
struct lz_match match_cache[CACHE_LENGTH +
|
431
|
-
MAX_MATCHES_PER_POS +
|
432
|
-
DEFLATE_MAX_MATCH_LEN - 1];
|
433
|
-
|
434
|
-
/*
|
435
|
-
* Array of nodes, one per position, for running the
|
436
|
-
* minimum-cost path algorithm.
|
437
|
-
*
|
438
|
-
* This array must be large enough to accommodate the
|
439
|
-
* worst-case number of nodes, which occurs if we find a
|
440
|
-
* match of length DEFLATE_MAX_MATCH_LEN at position
|
441
|
-
* SOFT_MAX_BLOCK_LENGTH - 1, producing a block of
|
442
|
-
* length SOFT_MAX_BLOCK_LENGTH - 1 +
|
443
|
-
* DEFLATE_MAX_MATCH_LEN. Add one for the end-of-block
|
444
|
-
* node.
|
445
|
-
*/
|
446
|
-
struct deflate_optimum_node optimum_nodes[SOFT_MAX_BLOCK_LENGTH - 1 +
|
447
|
-
DEFLATE_MAX_MATCH_LEN + 1];
|
448
|
-
|
449
|
-
/* The current cost model being used. */
|
450
|
-
struct deflate_costs costs;
|
451
|
-
|
452
|
-
unsigned num_optim_passes;
|
453
|
-
} n; /* (n)ear-optimal */
|
454
|
-
#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
|
455
|
-
|
456
|
-
} p; /* (p)arser */
|
457
|
-
};
|
458
|
-
|
459
|
-
/*
|
460
|
-
* The type for the bitbuffer variable, which temporarily holds bits that are
|
461
|
-
* being packed into bytes and written to the output buffer. For best
|
462
|
-
* performance, this should have size equal to a machine word.
|
463
|
-
*/
|
464
|
-
typedef machine_word_t bitbuf_t;
|
465
|
-
#define BITBUF_NBITS (8 * sizeof(bitbuf_t))
|
466
|
-
|
467
|
-
/* Can the specified number of bits always be added to 'bitbuf' after any
|
468
|
-
* pending bytes have been flushed? */
|
469
|
-
#define CAN_BUFFER(n) ((n) <= BITBUF_NBITS - 7)
|
470
|
-
|
471
|
-
/*
|
472
|
-
* Structure to keep track of the current state of sending bits to the
|
473
|
-
* compressed output buffer.
|
474
|
-
*/
|
475
|
-
struct deflate_output_bitstream {
|
476
|
-
|
477
|
-
/* Bits that haven't yet been written to the output buffer. */
|
478
|
-
bitbuf_t bitbuf;
|
479
|
-
|
480
|
-
/* Number of bits currently held in @bitbuf. */
|
481
|
-
unsigned bitcount;
|
482
|
-
|
483
|
-
/* Pointer to the beginning of the output buffer. */
|
484
|
-
u8 *begin;
|
485
|
-
|
486
|
-
/* Pointer to the position in the output buffer at which the next byte
|
487
|
-
* should be written. */
|
488
|
-
u8 *next;
|
489
|
-
|
490
|
-
/* Pointer just past the end of the output buffer. */
|
491
|
-
u8 *end;
|
492
|
-
};
|
493
|
-
|
494
|
-
#define MIN_OUTPUT_SIZE (UNALIGNED_ACCESS_IS_FAST ? sizeof(bitbuf_t) : 1)
|
495
|
-
|
496
|
-
/* Initialize the output bitstream. 'size' is assumed to be at least
|
497
|
-
* MIN_OUTPUT_SIZE. */
|
498
|
-
static void
|
499
|
-
deflate_init_output(struct deflate_output_bitstream *os,
|
500
|
-
void *buffer, size_t size)
|
501
|
-
{
|
502
|
-
os->bitbuf = 0;
|
503
|
-
os->bitcount = 0;
|
504
|
-
os->begin = buffer;
|
505
|
-
os->next = os->begin;
|
506
|
-
os->end = os->begin + size - MIN_OUTPUT_SIZE;
|
507
|
-
}
|
508
|
-
|
509
|
-
/* Add some bits to the bitbuffer variable of the output bitstream. The caller
|
510
|
-
* must make sure there is enough room. */
|
511
|
-
static forceinline void
|
512
|
-
deflate_add_bits(struct deflate_output_bitstream *os,
|
513
|
-
const bitbuf_t bits, const unsigned num_bits)
|
514
|
-
{
|
515
|
-
os->bitbuf |= bits << os->bitcount;
|
516
|
-
os->bitcount += num_bits;
|
517
|
-
}
|
518
|
-
|
519
|
-
/* Flush bits from the bitbuffer variable to the output buffer. */
|
520
|
-
static forceinline void
|
521
|
-
deflate_flush_bits(struct deflate_output_bitstream *os)
|
522
|
-
{
|
523
|
-
if (UNALIGNED_ACCESS_IS_FAST) {
|
524
|
-
/* Flush a whole word (branchlessly). */
|
525
|
-
put_unaligned_leword(os->bitbuf, os->next);
|
526
|
-
os->bitbuf >>= os->bitcount & ~7;
|
527
|
-
os->next += MIN(os->end - os->next, os->bitcount >> 3);
|
528
|
-
os->bitcount &= 7;
|
529
|
-
} else {
|
530
|
-
/* Flush a byte at a time. */
|
531
|
-
while (os->bitcount >= 8) {
|
532
|
-
*os->next = os->bitbuf;
|
533
|
-
if (os->next != os->end)
|
534
|
-
os->next++;
|
535
|
-
os->bitcount -= 8;
|
536
|
-
os->bitbuf >>= 8;
|
537
|
-
}
|
538
|
-
}
|
539
|
-
}
|
540
|
-
|
541
|
-
/* Align the bitstream on a byte boundary. */
|
542
|
-
static forceinline void
|
543
|
-
deflate_align_bitstream(struct deflate_output_bitstream *os)
|
544
|
-
{
|
545
|
-
os->bitcount += -os->bitcount & 7;
|
546
|
-
deflate_flush_bits(os);
|
547
|
-
}
|
548
|
-
|
549
|
-
/*
|
550
|
-
* Flush any remaining bits to the output buffer if needed. Return the total
|
551
|
-
* number of bytes written to the output buffer, or 0 if an overflow occurred.
|
552
|
-
*/
|
553
|
-
static u32
|
554
|
-
deflate_flush_output(struct deflate_output_bitstream *os)
|
555
|
-
{
|
556
|
-
if (os->next == os->end) /* overflow? */
|
557
|
-
return 0;
|
558
|
-
|
559
|
-
while ((int)os->bitcount > 0) {
|
560
|
-
*os->next++ = os->bitbuf;
|
561
|
-
os->bitcount -= 8;
|
562
|
-
os->bitbuf >>= 8;
|
563
|
-
}
|
564
|
-
|
565
|
-
return os->next - os->begin;
|
566
|
-
}
|
567
|
-
|
568
|
-
/* Given the binary tree node A[subtree_idx] whose children already
|
569
|
-
* satisfy the maxheap property, swap the node with its greater child
|
570
|
-
* until it is greater than both its children, so that the maxheap
|
571
|
-
* property is satisfied in the subtree rooted at A[subtree_idx]. */
|
572
|
-
static void
|
573
|
-
heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx)
|
574
|
-
{
|
575
|
-
unsigned parent_idx;
|
576
|
-
unsigned child_idx;
|
577
|
-
u32 v;
|
578
|
-
|
579
|
-
v = A[subtree_idx];
|
580
|
-
parent_idx = subtree_idx;
|
581
|
-
while ((child_idx = parent_idx * 2) <= length) {
|
582
|
-
if (child_idx < length && A[child_idx + 1] > A[child_idx])
|
583
|
-
child_idx++;
|
584
|
-
if (v >= A[child_idx])
|
585
|
-
break;
|
586
|
-
A[parent_idx] = A[child_idx];
|
587
|
-
parent_idx = child_idx;
|
588
|
-
}
|
589
|
-
A[parent_idx] = v;
|
590
|
-
}
|
591
|
-
|
592
|
-
/* Rearrange the array 'A' so that it satisfies the maxheap property.
|
593
|
-
* 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1].
|
594
|
-
*/
|
595
|
-
static void
|
596
|
-
heapify_array(u32 A[], unsigned length)
|
597
|
-
{
|
598
|
-
unsigned subtree_idx;
|
599
|
-
|
600
|
-
for (subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--)
|
601
|
-
heapify_subtree(A, length, subtree_idx);
|
602
|
-
}
|
603
|
-
|
604
|
-
/*
|
605
|
-
* Sort the array 'A', which contains 'length' unsigned 32-bit integers.
|
606
|
-
*
|
607
|
-
* Note: name this function heap_sort() instead of heapsort() to avoid colliding
|
608
|
-
* with heapsort() from stdlib.h on BSD-derived systems --- though this isn't
|
609
|
-
* necessary when compiling with -D_ANSI_SOURCE, which is the better solution.
|
610
|
-
*/
|
611
|
-
static void
|
612
|
-
heap_sort(u32 A[], unsigned length)
|
613
|
-
{
|
614
|
-
A--; /* Use 1-based indices */
|
615
|
-
|
616
|
-
heapify_array(A, length);
|
617
|
-
|
618
|
-
while (length >= 2) {
|
619
|
-
u32 tmp = A[length];
|
620
|
-
A[length] = A[1];
|
621
|
-
A[1] = tmp;
|
622
|
-
length--;
|
623
|
-
heapify_subtree(A, length, 1);
|
624
|
-
}
|
625
|
-
}
|
626
|
-
|
627
|
-
/* Number of bits used to pack a symbol value alongside other data
 * (frequency, parent index, or depth) in a single u32 array entry. */
#define NUM_SYMBOL_BITS 10
/* Mask that extracts the packed symbol value from such an entry. */
#define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1)

/*
 * Number of count-sort counters sort_symbols() uses for 'num_syms' symbols:
 * 'num_syms' rounded up to a multiple of 4.
 *
 * NOTE(review): the previous definition was
 * '((((num_syms) + 3 / 4) + 3) & ~3)', in which '3 / 4' is integer
 * division and equals 0 — the '/ 4' (apparently meant to realize the
 * "about 1 counter per 4 symbols" idea mentioned in sort_symbols())
 * never took effect.  The form below is value-for-value identical to
 * the old macro; it just states the actual behavior explicitly.
 */
#define GET_NUM_COUNTERS(num_syms) (((num_syms) + 3) & ~3)
|
631
|
-
/*
|
632
|
-
* Sort the symbols primarily by frequency and secondarily by symbol
|
633
|
-
* value. Discard symbols with zero frequency and fill in an array with
|
634
|
-
* the remaining symbols, along with their frequencies. The low
|
635
|
-
* NUM_SYMBOL_BITS bits of each array entry will contain the symbol
|
636
|
-
* value, and the remaining bits will contain the frequency.
|
637
|
-
*
|
638
|
-
* @num_syms
|
639
|
-
* Number of symbols in the alphabet.
|
640
|
-
* Can't be greater than (1 << NUM_SYMBOL_BITS).
|
641
|
-
*
|
642
|
-
* @freqs[num_syms]
|
643
|
-
* The frequency of each symbol.
|
644
|
-
*
|
645
|
-
* @lens[num_syms]
|
646
|
-
* An array that eventually will hold the length of each codeword.
|
647
|
-
* This function only fills in the codeword lengths for symbols that
|
648
|
-
* have zero frequency, which are not well defined per se but will
|
649
|
-
* be set to 0.
|
650
|
-
*
|
651
|
-
* @symout[num_syms]
|
652
|
-
* The output array, described above.
|
653
|
-
*
|
654
|
-
* Returns the number of entries in 'symout' that were filled. This is
|
655
|
-
* the number of symbols that have nonzero frequency.
|
656
|
-
*/
|
657
|
-
static unsigned
sort_symbols(unsigned num_syms, const u32 freqs[restrict],
	     u8 lens[restrict], u32 symout[restrict])
{
	unsigned sym;
	unsigned i;
	unsigned num_used_syms;
	unsigned num_counters;
	unsigned counters[GET_NUM_COUNTERS(DEFLATE_MAX_NUM_SYMS)];

	/* We rely on heapsort, but with an added optimization.  Since
	 * it's common for most symbol frequencies to be low, we first do
	 * a count sort using a limited number of counters.  High
	 * frequencies will be counted in the last counter, and only they
	 * will be sorted with heapsort.
	 *
	 * Note: with more symbols, it is generally beneficial to have more
	 * counters.  About 1 counter per 4 symbols seems fast.
	 *
	 * Note: I also tested radix sort, but even for large symbol
	 * counts (> 255) and frequencies bounded at 16 bits (enabling
	 * radix sort by just two base-256 digits), it didn't seem any
	 * faster than the method implemented here.
	 *
	 * Note: I tested the optimized quicksort implementation from
	 * glibc (with indirection overhead removed), but it was only
	 * marginally faster than the simple heapsort implemented here.
	 *
	 * Tests were done with building the codes for LZX.  Results may
	 * vary for different compression algorithms...! */

	num_counters = GET_NUM_COUNTERS(num_syms);

	memset(counters, 0, num_counters * sizeof(counters[0]));

	/* Count the frequencies.  Frequency 'f' goes in counter
	 * min(f, num_counters - 1); counter 0 therefore counts the
	 * zero-frequency symbols and the last counter catches all
	 * "high" frequencies. */
	for (sym = 0; sym < num_syms; sym++)
		counters[MIN(freqs[sym], num_counters - 1)]++;

	/* Make the counters cumulative, ignoring the zero-th, which
	 * counted symbols with zero frequency.  As a side effect, this
	 * calculates the number of symbols with nonzero frequency.
	 * After this loop, counters[i] is the starting output index for
	 * symbols whose (clamped) frequency is i. */
	num_used_syms = 0;
	for (i = 1; i < num_counters; i++) {
		unsigned count = counters[i];
		counters[i] = num_used_syms;
		num_used_syms += count;
	}

	/* Sort nonzero-frequency symbols using the counters.  At the
	 * same time, set the codeword lengths of zero-frequency symbols
	 * to 0.  Each output entry packs the symbol into the low
	 * NUM_SYMBOL_BITS bits and the frequency into the remaining
	 * bits.  (NOTE(review): frequencies >= 1 << (32 - NUM_SYMBOL_BITS)
	 * would be truncated by the shift — presumably the caller's
	 * frequencies are bounded well below that; confirm.)  Iterating
	 * in symbol order makes the sort stable by symbol value within
	 * each frequency bucket. */
	for (sym = 0; sym < num_syms; sym++) {
		u32 freq = freqs[sym];
		if (freq != 0) {
			symout[counters[MIN(freq, num_counters - 1)]++] =
				sym | (freq << NUM_SYMBOL_BITS);
		} else {
			lens[sym] = 0;
		}
	}

	/* Sort the symbols counted in the last counter.  After the
	 * placement loop above, counters[num_counters - 2] is the start
	 * index of the last (catch-all) bucket and
	 * counters[num_counters - 1] is its end, so this heapsorts
	 * exactly the entries whose frequency reached the clamp. */
	heap_sort(symout + counters[num_counters - 2],
		  counters[num_counters - 1] - counters[num_counters - 2]);

	return num_used_syms;
}
|
725
|
-
|
726
|
-
/*
|
727
|
-
* Build the Huffman tree.
|
728
|
-
*
|
729
|
-
* This is an optimized implementation that
|
730
|
-
* (a) takes advantage of the frequencies being already sorted;
|
731
|
-
* (b) only generates non-leaf nodes, since the non-leaf nodes of a
|
732
|
-
* Huffman tree are sufficient to generate a canonical code;
|
733
|
-
* (c) Only stores parent pointers, not child pointers;
|
734
|
-
* (d) Produces the nodes in the same memory used for input
|
735
|
-
* frequency information.
|
736
|
-
*
|
737
|
-
* Array 'A', which contains 'sym_count' entries, is used for both input
|
738
|
-
* and output. For this function, 'sym_count' must be at least 2.
|
739
|
-
*
|
740
|
-
* For input, the array must contain the frequencies of the symbols,
|
741
|
-
* sorted in increasing order. Specifically, each entry must contain a
|
742
|
-
* frequency left shifted by NUM_SYMBOL_BITS bits. Any data in the low
|
743
|
-
* NUM_SYMBOL_BITS bits of the entries will be ignored by this function.
|
744
|
-
* Although these bits will, in fact, contain the symbols that correspond
|
745
|
-
* to the frequencies, this function is concerned with frequencies only
|
746
|
-
* and keeps the symbols as-is.
|
747
|
-
*
|
748
|
-
* For output, this function will produce the non-leaf nodes of the
|
749
|
-
* Huffman tree. These nodes will be stored in the first (sym_count - 1)
|
750
|
-
* entries of the array. Entry A[sym_count - 2] will represent the root
|
751
|
-
* node. Each other node will contain the zero-based index of its parent
|
752
|
-
* node in 'A', left shifted by NUM_SYMBOL_BITS bits. The low
|
753
|
-
* NUM_SYMBOL_BITS bits of each entry in A will be kept as-is. Again,
|
754
|
-
* note that although these low bits will, in fact, contain a symbol
|
755
|
-
* value, this symbol will have *no relationship* with the Huffman tree
|
756
|
-
* node that happens to occupy the same slot. This is because this
|
757
|
-
* implementation only generates the non-leaf nodes of the tree.
|
758
|
-
*/
|
759
|
-
static void
build_tree(u32 A[], unsigned sym_count)
{
	/* Index, in 'A', of next lowest frequency symbol that has not
	 * yet been processed. */
	unsigned i = 0;

	/* Index, in 'A', of next lowest frequency parentless non-leaf
	 * node; or, if equal to 'e', then no such node exists yet. */
	unsigned b = 0;

	/* Index, in 'A', of next node to allocate as a non-leaf. */
	unsigned e = 0;

	do {
		unsigned m, n;
		u32 freq_shifted;

		/* Choose the two next lowest frequency entries.  Each is
		 * either the next unprocessed leaf (at 'i') or the next
		 * parentless non-leaf (at 'b'); both sequences are in
		 * nondecreasing frequency order, so no heap is needed.
		 * Ties favor the leaf (note the '<='). */

		if (i != sym_count &&
		    (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
			m = i++;
		else
			m = b++;

		if (i != sym_count &&
		    (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
			n = i++;
		else
			n = b++;

		/* Allocate a non-leaf node and link the entries to it.
		 *
		 * If we link an entry that we're visiting for the first
		 * time (via index 'i'), then we're actually linking a
		 * leaf node and it will have no effect, since the leaf
		 * will be overwritten with a non-leaf when index 'e'
		 * catches up to it.  But it's not any slower to
		 * unconditionally set the parent index.
		 *
		 * We also compute the frequency of the non-leaf node as
		 * the sum of its two children's frequencies. */

		freq_shifted = (A[m] & ~SYMBOL_MASK) + (A[n] & ~SYMBOL_MASK);

		/* Store the parent index 'e' in the high bits of both
		 * children; keep each entry's low NUM_SYMBOL_BITS
		 * (symbol value) untouched. */
		A[m] = (A[m] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
		A[n] = (A[n] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
		A[e] = (A[e] & SYMBOL_MASK) | freq_shifted;
		e++;
	} while (sym_count - e > 1);
	/* Loop ends once node sym_count - 2 (the root) has been
	 * allocated.  When just one entry remains, it is a "leaf" that was
	 * linked to some other node.  We ignore it, since the
	 * rest of the array contains the non-leaves which we
	 * need.  (Note that we're assuming the cases with 0 or 1
	 * symbols were handled separately.) */
}
|
816
|
-
|
817
|
-
/*
|
818
|
-
* Given the stripped-down Huffman tree constructed by build_tree(),
|
819
|
-
* determine the number of codewords that should be assigned each
|
820
|
-
* possible length, taking into account the length-limited constraint.
|
821
|
-
*
|
822
|
-
* @A
|
823
|
-
* The array produced by build_tree(), containing parent index
|
824
|
-
* information for the non-leaf nodes of the Huffman tree. Each
|
825
|
-
* entry in this array is a node; a node's parent always has a
|
826
|
-
* greater index than that node itself. This function will
|
827
|
-
* overwrite the parent index information in this array, so
|
828
|
-
* essentially it will destroy the tree. However, the data in the
|
829
|
-
* low NUM_SYMBOL_BITS of each entry will be preserved.
|
830
|
-
*
|
831
|
-
* @root_idx
|
832
|
-
* The 0-based index of the root node in 'A', and consequently one
|
833
|
-
* less than the number of tree node entries in 'A'. (Or, really 2
|
834
|
-
* less than the actual length of 'A'.)
|
835
|
-
*
|
836
|
-
* @len_counts
|
837
|
-
* An array of length ('max_codeword_len' + 1) in which the number of
|
838
|
-
* codewords having each length <= max_codeword_len will be
|
839
|
-
* returned.
|
840
|
-
*
|
841
|
-
* @max_codeword_len
|
842
|
-
* The maximum permissible codeword length.
|
843
|
-
*/
|
844
|
-
static void
compute_length_counts(u32 A[restrict], unsigned root_idx,
		      unsigned len_counts[restrict], unsigned max_codeword_len)
{
	unsigned len;
	int node;

	/* The key observations are:
	 *
	 * (1) We can traverse the non-leaf nodes of the tree, always
	 * visiting a parent before its children, by simply iterating
	 * through the array in reverse order.  Consequently, we can
	 * compute the depth of each node in one pass, overwriting the
	 * parent indices with depths.
	 *
	 * (2) We can initially assume that in the real Huffman tree,
	 * both children of the root are leaves.  This corresponds to two
	 * codewords of length 1.  Then, whenever we visit a (non-leaf)
	 * node during the traversal, we modify this assumption to
	 * account for the current node *not* being a leaf, but rather
	 * its two children being leaves.  This causes the loss of one
	 * codeword for the current depth and the addition of two
	 * codewords for the current depth plus one.
	 *
	 * (3) We can handle the length-limited constraint fairly easily
	 * by simply using the largest length available when a depth
	 * exceeds max_codeword_len.
	 */

	for (len = 0; len <= max_codeword_len; len++)
		len_counts[len] = 0;
	/* Initial assumption (2): the root's two children are leaves. */
	len_counts[1] = 2;

	/* Set the root node's depth to 0. */
	A[root_idx] &= SYMBOL_MASK;

	/* 'node' is int (not unsigned) so the >= 0 test terminates. */
	for (node = root_idx - 1; node >= 0; node--) {

		/* Calculate the depth of this node.  Its parent's depth
		 * is already in the high bits of A[parent] because
		 * parents have higher indices and were visited first. */

		unsigned parent = A[node] >> NUM_SYMBOL_BITS;
		unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS;
		unsigned depth = parent_depth + 1;
		unsigned len = depth;	/* NOTE: intentionally shadows the
					 * outer 'len', which is only used
					 * in the zeroing loop above. */

		/* Set the depth of this node so that it is available
		 * when its children (if any) are processed. */

		A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS);

		/* If needed, decrease the length to meet the
		 * length-limited constraint.  This is not the optimal
		 * method for generating length-limited Huffman codes!
		 * But it should be good enough.  The do/while walks down
		 * to the largest length that still has codewords
		 * available. */
		if (len >= max_codeword_len) {
			len = max_codeword_len;
			do {
				len--;
			} while (len_counts[len] == 0);
		}

		/* Account for the fact that we have a non-leaf node at
		 * the current depth: remove its assumed leaf codeword and
		 * add two codewords one level deeper for its children. */
		len_counts[len]--;
		len_counts[len + 1] += 2;
	}
}
|
911
|
-
|
912
|
-
/*
|
913
|
-
* Generate the codewords for a canonical Huffman code.
|
914
|
-
*
|
915
|
-
* @A
|
916
|
-
* The output array for codewords. In addition, initially this
|
917
|
-
* array must contain the symbols, sorted primarily by frequency and
|
918
|
-
* secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of
|
919
|
-
* each entry.
|
920
|
-
*
|
921
|
-
* @len
|
922
|
-
* Output array for codeword lengths.
|
923
|
-
*
|
924
|
-
* @len_counts
|
925
|
-
* An array that provides the number of codewords that will have
|
926
|
-
* each possible length <= max_codeword_len.
|
927
|
-
*
|
928
|
-
* @max_codeword_len
|
929
|
-
* Maximum length, in bits, of each codeword.
|
930
|
-
*
|
931
|
-
* @num_syms
|
932
|
-
* Number of symbols in the alphabet, including symbols with zero
|
933
|
-
* frequency. This is the length of the 'A' and 'len' arrays.
|
934
|
-
*/
|
935
|
-
static void
gen_codewords(u32 A[restrict], u8 lens[restrict],
	      const unsigned len_counts[restrict],
	      unsigned max_codeword_len, unsigned num_syms)
{
	u32 next_codewords[DEFLATE_MAX_CODEWORD_LEN + 1];
	unsigned i;
	unsigned len;
	unsigned sym;

	/* Given the number of codewords that will have each length,
	 * assign codeword lengths to symbols.  We do this by assigning
	 * the lengths in decreasing order to the symbols sorted
	 * primarily by increasing frequency and secondarily by
	 * increasing symbol value.  (The sorted symbols live in the low
	 * NUM_SYMBOL_BITS bits of the first entries of 'A'.) */
	for (i = 0, len = max_codeword_len; len >= 1; len--) {
		unsigned count = len_counts[len];
		while (count--)
			lens[A[i++] & SYMBOL_MASK] = len;
	}

	/* Generate the codewords themselves.  We initialize the
	 * 'next_codewords' array to provide the lexicographically first
	 * codeword of each length, then assign codewords in symbol
	 * order.  This produces a canonical code.
	 *
	 * Note: zero-frequency symbols have lens[sym] == 0 (set by
	 * sort_symbols()), so they draw from next_codewords[0]; their
	 * "codewords" are documented as undefined and are never used. */
	next_codewords[0] = 0;
	next_codewords[1] = 0;
	for (len = 2; len <= max_codeword_len; len++)
		next_codewords[len] =
			(next_codewords[len - 1] + len_counts[len - 1]) << 1;

	for (sym = 0; sym < num_syms; sym++)
		A[sym] = next_codewords[lens[sym]]++;
}
|
969
|
-
|
970
|
-
/*
|
971
|
-
* ---------------------------------------------------------------------
|
972
|
-
* make_canonical_huffman_code()
|
973
|
-
* ---------------------------------------------------------------------
|
974
|
-
*
|
975
|
-
* Given an alphabet and the frequency of each symbol in it, construct a
|
976
|
-
* length-limited canonical Huffman code.
|
977
|
-
*
|
978
|
-
* @num_syms
|
979
|
-
* The number of symbols in the alphabet. The symbols are the
|
980
|
-
* integers in the range [0, num_syms - 1]. This parameter must be
|
981
|
-
* at least 2 and can't be greater than (1 << NUM_SYMBOL_BITS).
|
982
|
-
*
|
983
|
-
* @max_codeword_len
|
984
|
-
* The maximum permissible codeword length.
|
985
|
-
*
|
986
|
-
* @freqs
|
987
|
-
* An array of @num_syms entries, each of which specifies the
|
988
|
-
* frequency of the corresponding symbol. It is valid for some,
|
989
|
-
* none, or all of the frequencies to be 0.
|
990
|
-
*
|
991
|
-
* @lens
|
992
|
-
* An array of @num_syms entries in which this function will return
|
993
|
-
* the length, in bits, of the codeword assigned to each symbol.
|
994
|
-
* Symbols with 0 frequency will not have codewords per se, but
|
995
|
-
* their entries in this array will be set to 0. No lengths greater
|
996
|
-
* than @max_codeword_len will be assigned.
|
997
|
-
*
|
998
|
-
* @codewords
|
999
|
-
* An array of @num_syms entries in which this function will return
|
1000
|
-
* the codeword for each symbol, right-justified and padded on the
|
1001
|
-
* left with zeroes. Codewords for symbols with 0 frequency will be
|
1002
|
-
* undefined.
|
1003
|
-
*
|
1004
|
-
* ---------------------------------------------------------------------
|
1005
|
-
*
|
1006
|
-
* This function builds a length-limited canonical Huffman code.
|
1007
|
-
*
|
1008
|
-
* A length-limited Huffman code contains no codewords longer than some
|
1009
|
-
* specified length, and has exactly (with some algorithms) or
|
1010
|
-
* approximately (with the algorithm used here) the minimum weighted path
|
1011
|
-
* length from the root, given this constraint.
|
1012
|
-
*
|
1013
|
-
* A canonical Huffman code satisfies the properties that a longer
|
1014
|
-
* codeword never lexicographically precedes a shorter codeword, and the
|
1015
|
-
* lexicographic ordering of codewords of the same length is the same as
|
1016
|
-
* the lexicographic ordering of the corresponding symbols. A canonical
|
1017
|
-
* Huffman code, or more generally a canonical prefix code, can be
|
1018
|
-
* reconstructed from only a list containing the codeword length of each
|
1019
|
-
* symbol.
|
1020
|
-
*
|
1021
|
-
* The classic algorithm to generate a Huffman code creates a node for
|
1022
|
-
* each symbol, then inserts these nodes into a min-heap keyed by symbol
|
1023
|
-
* frequency. Then, repeatedly, the two lowest-frequency nodes are
|
1024
|
-
* removed from the min-heap and added as the children of a new node
|
1025
|
-
* having frequency equal to the sum of its two children, which is then
|
1026
|
-
* inserted into the min-heap. When only a single node remains in the
|
1027
|
-
* min-heap, it is the root of the Huffman tree. The codeword for each
|
1028
|
-
* symbol is determined by the path needed to reach the corresponding
|
1029
|
-
* node from the root. Descending to the left child appends a 0 bit,
|
1030
|
-
* whereas descending to the right child appends a 1 bit.
|
1031
|
-
*
|
1032
|
-
* The classic algorithm is relatively easy to understand, but it is
|
1033
|
-
* subject to a number of inefficiencies. In practice, it is fastest to
|
1034
|
-
* first sort the symbols by frequency. (This itself can be subject to
|
1035
|
-
* an optimization based on the fact that most frequencies tend to be
|
1036
|
-
* low.) At the same time, we sort secondarily by symbol value, which
|
1037
|
-
* aids the process of generating a canonical code. Then, during tree
|
1038
|
-
* construction, no heap is necessary because both the leaf nodes and the
|
1039
|
-
* unparented non-leaf nodes can be easily maintained in sorted order.
|
1040
|
-
* Consequently, there can never be more than two possibilities for the
|
1041
|
-
* next-lowest-frequency node.
|
1042
|
-
*
|
1043
|
-
* In addition, because we're generating a canonical code, we actually
|
1044
|
-
* don't need the leaf nodes of the tree at all, only the non-leaf nodes.
|
1045
|
-
* This is because for canonical code generation we don't need to know
|
1046
|
-
* where the symbols are in the tree. Rather, we only need to know how
|
1047
|
-
* many leaf nodes have each depth (codeword length). And this
|
1048
|
-
* information can, in fact, be quickly generated from the tree of
|
1049
|
-
* non-leaves only.
|
1050
|
-
*
|
1051
|
-
* Furthermore, we can build this stripped-down Huffman tree directly in
|
1052
|
-
* the array in which the codewords are to be generated, provided that
|
1053
|
-
* these array slots are large enough to hold a symbol and frequency
|
1054
|
-
* value.
|
1055
|
-
*
|
1056
|
-
* Still furthermore, we don't even need to maintain explicit child
|
1057
|
-
* pointers. We only need the parent pointers, and even those can be
|
1058
|
-
* overwritten in-place with depth information as part of the process of
|
1059
|
-
* extracting codeword lengths from the tree. So in summary, we do NOT
|
1060
|
-
* need a big structure like:
|
1061
|
-
*
|
1062
|
-
* struct huffman_tree_node {
|
1063
|
-
* unsigned int symbol;
|
1064
|
-
* unsigned int frequency;
|
1065
|
-
* unsigned int depth;
|
1066
|
-
* struct huffman_tree_node *left_child;
|
1067
|
-
* struct huffman_tree_node *right_child;
|
1068
|
-
* };
|
1069
|
-
*
|
1070
|
-
*
|
1071
|
-
* ... which often gets used in "naive" implementations of Huffman code
|
1072
|
-
* generation.
|
1073
|
-
*
|
1074
|
-
* Many of these optimizations are based on the implementation in 7-Zip
|
1075
|
-
* (source file: C/HuffEnc.c), which has been placed in the public domain
|
1076
|
-
* by Igor Pavlov.
|
1077
|
-
*/
|
1078
|
-
static void
make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len,
			    const u32 freqs[restrict],
			    u8 lens[restrict], u32 codewords[restrict])
{
	/* 'A' shares storage with the codewords output array; it holds
	 * packed (frequency, symbol) entries, then tree nodes, then
	 * finally the codewords themselves. */
	u32 *A = codewords;
	unsigned num_used_syms;

	/* Every symbol value must fit in NUM_SYMBOL_BITS for the packing
	 * scheme to work. */
	STATIC_ASSERT(DEFLATE_MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS);

	/* We begin by sorting the symbols primarily by frequency and
	 * secondarily by symbol value.  As an optimization, the array
	 * used for this purpose ('A') shares storage with the space in
	 * which we will eventually return the codewords. */

	num_used_syms = sort_symbols(num_syms, freqs, lens, A);

	/* 'num_used_syms' is the number of symbols with nonzero
	 * frequency.  This may be less than @num_syms.  'num_used_syms'
	 * is also the number of entries in 'A' that are valid.  Each
	 * entry consists of a distinct symbol and a nonzero frequency
	 * packed into a 32-bit integer. */

	/* Handle special cases where only 0 or 1 symbols were used (had
	 * nonzero frequency). */

	if (unlikely(num_used_syms == 0)) {
		/* Code is empty.  sort_symbols() already set all lengths
		 * to 0, so there is nothing more to do. */
		return;
	}

	if (unlikely(num_used_syms == 1)) {
		/* Only one symbol was used, so we only need one
		 * codeword.  But two codewords are needed to form the
		 * smallest complete Huffman code, which uses codewords 0
		 * and 1.  Therefore, we choose another symbol to which
		 * to assign a codeword.  We use 0 (if the used symbol is
		 * not 0) or 1 (if the used symbol is 0).  In either
		 * case, the lesser-valued symbol must be assigned
		 * codeword 0 so that the resulting code is canonical. */

		unsigned sym = A[0] & SYMBOL_MASK;
		unsigned nonzero_idx = sym ? sym : 1;

		codewords[0] = 0;
		lens[0] = 1;
		codewords[nonzero_idx] = 1;
		lens[nonzero_idx] = 1;
		return;
	}

	/* Build a stripped-down version of the Huffman tree, sharing the
	 * array 'A' with the symbol values.  Then extract length counts
	 * from the tree and use them to generate the final codewords. */

	build_tree(A, num_used_syms);

	{
		unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1];

		/* build_tree() put the root at index num_used_syms - 2. */
		compute_length_counts(A, num_used_syms - 2,
				      len_counts, max_codeword_len);

		gen_codewords(A, lens, len_counts, max_codeword_len, num_syms);
	}
}
|
1145
|
-
|
1146
|
-
/*
|
1147
|
-
* Clear the Huffman symbol frequency counters.
|
1148
|
-
* This must be called when starting a new DEFLATE block.
|
1149
|
-
*/
|
1150
|
-
static void
|
1151
|
-
deflate_reset_symbol_frequencies(struct libdeflate_compressor *c)
|
1152
|
-
{
|
1153
|
-
memset(&c->freqs, 0, sizeof(c->freqs));
|
1154
|
-
}
|
1155
|
-
|
1156
|
-
/* Reverse the Huffman codeword 'codeword', which is 'len' bits in length. */
|
1157
|
-
static u32
|
1158
|
-
deflate_reverse_codeword(u32 codeword, u8 len)
|
1159
|
-
{
|
1160
|
-
/* The following branchless algorithm is faster than going bit by bit.
|
1161
|
-
* Note: since no codewords are longer than 16 bits, we only need to
|
1162
|
-
* reverse the low 16 bits of the 'u32'. */
|
1163
|
-
STATIC_ASSERT(DEFLATE_MAX_CODEWORD_LEN <= 16);
|
1164
|
-
|
1165
|
-
/* Flip adjacent 1-bit fields */
|
1166
|
-
codeword = ((codeword & 0x5555) << 1) | ((codeword & 0xAAAA) >> 1);
|
1167
|
-
|
1168
|
-
/* Flip adjacent 2-bit fields */
|
1169
|
-
codeword = ((codeword & 0x3333) << 2) | ((codeword & 0xCCCC) >> 2);
|
1170
|
-
|
1171
|
-
/* Flip adjacent 4-bit fields */
|
1172
|
-
codeword = ((codeword & 0x0F0F) << 4) | ((codeword & 0xF0F0) >> 4);
|
1173
|
-
|
1174
|
-
/* Flip adjacent 8-bit fields */
|
1175
|
-
codeword = ((codeword & 0x00FF) << 8) | ((codeword & 0xFF00) >> 8);
|
1176
|
-
|
1177
|
-
/* Return the high 'len' bits of the bit-reversed 16 bit value. */
|
1178
|
-
return codeword >> (16 - len);
|
1179
|
-
}
|
1180
|
-
|
1181
|
-
/* Make a canonical Huffman code with bit-reversed codewords. */
|
1182
|
-
static void
|
1183
|
-
deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len,
|
1184
|
-
const u32 freqs[], u8 lens[], u32 codewords[])
|
1185
|
-
{
|
1186
|
-
unsigned sym;
|
1187
|
-
|
1188
|
-
make_canonical_huffman_code(num_syms, max_codeword_len,
|
1189
|
-
freqs, lens, codewords);
|
1190
|
-
|
1191
|
-
for (sym = 0; sym < num_syms; sym++)
|
1192
|
-
codewords[sym] = deflate_reverse_codeword(codewords[sym], lens[sym]);
|
1193
|
-
}
|
1194
|
-
|
1195
|
-
/*
 * Build the literal/length and offset Huffman codes for a DEFLATE block.
 *
 * This takes as input the frequency tables for each code and produces as output
 * a set of tables that map symbols to codewords and codeword lengths.
 */
static void
deflate_make_huffman_codes(const struct deflate_freqs *freqs,
			   struct deflate_codes *codes)
{
	/* The compressor's chosen length limits must not exceed what the
	 * DEFLATE format itself allows. */
	STATIC_ASSERT(MAX_LITLEN_CODEWORD_LEN <= DEFLATE_MAX_LITLEN_CODEWORD_LEN);
	STATIC_ASSERT(MAX_OFFSET_CODEWORD_LEN <= DEFLATE_MAX_OFFSET_CODEWORD_LEN);

	/* Literal/length code. */
	deflate_make_huffman_code(DEFLATE_NUM_LITLEN_SYMS,
				  MAX_LITLEN_CODEWORD_LEN,
				  freqs->litlen,
				  codes->lens.litlen,
				  codes->codewords.litlen);

	/* Offset code. */
	deflate_make_huffman_code(DEFLATE_NUM_OFFSET_SYMS,
				  MAX_OFFSET_CODEWORD_LEN,
				  freqs->offset,
				  codes->lens.offset,
				  codes->codewords.offset);
}
|
1220
|
-
|
1221
|
-
/* Initialize c->static_codes. */
static void
deflate_init_static_codes(struct libdeflate_compressor *c)
{
	unsigned i;

	/* Assign each symbol the frequency 1 << (9 - len), where 'len' is
	 * the codeword length the DEFLATE static Huffman code gives that
	 * symbol range (RFC 1951: litlen 0-143 => 8 bits, 144-255 => 9 bits,
	 * 256-279 => 7 bits, 280-287 => 8 bits; all 32 offset symbols => 5
	 * bits).  With these frequencies the generic Huffman code builder
	 * should reproduce exactly those codeword lengths, yielding the
	 * static codes. */
	for (i = 0; i < 144; i++)
		c->freqs.litlen[i] = 1 << (9 - 8);
	for (; i < 256; i++)
		c->freqs.litlen[i] = 1 << (9 - 9);
	for (; i < 280; i++)
		c->freqs.litlen[i] = 1 << (9 - 7);
	for (; i < 288; i++)
		c->freqs.litlen[i] = 1 << (9 - 8);

	for (i = 0; i < 32; i++)
		c->freqs.offset[i] = 1 << (5 - 5);

	deflate_make_huffman_codes(&c->freqs, &c->static_codes);
}
|
1241
|
-
|
1242
|
-
/* Return the offset slot for the specified match offset. */
static forceinline unsigned
deflate_get_offset_slot(struct libdeflate_compressor *c, unsigned offset)
{
#if USE_FULL_OFFSET_SLOT_FAST
	/* Direct lookup: one table entry per possible offset. */
	return c->offset_slot_fast[offset];
#else
	/* Compact table: offsets <= 256 are looked up directly (entry
	 * offset - 1); larger offsets are looked up in groups of 128
	 * via (offset - 1) >> 7.  NOTE(review): grouping by 128 is only
	 * valid if every offset-slot boundary above 256 is a multiple
	 * of 128 — confirm against the offset_slot_fast initialization. */
	if (offset <= 256)
		return c->offset_slot_fast[offset - 1];
	else
		return c->offset_slot_fast[256 + ((offset - 1) >> 7)];
#endif
}
|
1255
|
-
|
1256
|
-
/* Write the header fields common to all DEFLATE block types. */
static void
deflate_write_block_header(struct deflate_output_bitstream *os,
			   bool is_final_block, unsigned block_type)
{
	/* BFINAL: 1 bit — set on the last block of the stream. */
	deflate_add_bits(os, is_final_block, 1);
	/* BTYPE: 2 bits — the block type code. */
	deflate_add_bits(os, block_type, 2);
	deflate_flush_bits(os);
}
|
1265
|
-
|
1266
|
-
/*
 * Convert the codeword lengths 'lens[0..num_lens-1]' into a sequence of
 * "precode items", accumulating precode symbol frequencies along the way.
 *
 * Each item packs a precode symbol into its low 5 bits; for the RLE
 * symbols (16, 17, 18) the extra-bits value is packed above that
 * (shifted left by 5).  Returns the number of items written to
 * 'precode_items'.
 */
static unsigned
deflate_compute_precode_items(const u8 lens[restrict],
			      const unsigned num_lens,
			      u32 precode_freqs[restrict],
			      unsigned precode_items[restrict])
{
	unsigned *itemptr;
	unsigned run_start;
	unsigned run_end;
	unsigned extra_bits;
	u8 len;

	memset(precode_freqs, 0,
	       DEFLATE_NUM_PRECODE_SYMS * sizeof(precode_freqs[0]));

	itemptr = precode_items;
	run_start = 0;
	do {
		/* Find the next run of codeword lengths. */

		/* len = the length being repeated */
		len = lens[run_start];

		/* Extend the run. */
		run_end = run_start;
		do {
			run_end++;
		} while (run_end != num_lens && len == lens[run_end]);

		if (len == 0) {
			/* Run of zeroes. */

			/* Symbol 18: RLE 11 to 138 zeroes at a time.
			 * extra_bits holds the repeat count minus 11,
			 * capped at 0x7F (so 11 + 0x7F = 138 max). */
			while ((run_end - run_start) >= 11) {
				extra_bits = MIN((run_end - run_start) - 11, 0x7F);
				precode_freqs[18]++;
				*itemptr++ = 18 | (extra_bits << 5);
				run_start += 11 + extra_bits;
			}

			/* Symbol 17: RLE 3 to 10 zeroes at a time. */
			if ((run_end - run_start) >= 3) {
				extra_bits = MIN((run_end - run_start) - 3, 0x7);
				precode_freqs[17]++;
				*itemptr++ = 17 | (extra_bits << 5);
				run_start += 3 + extra_bits;
			}
		} else {

			/* A run of nonzero lengths. */

			/* Symbol 16: RLE 3 to 6 of the previous length.
			 * The run must be at least 4 long because the
			 * first occurrence of 'len' has to be output
			 * literally before it can be repeated. */
			if ((run_end - run_start) >= 4) {
				precode_freqs[len]++;
				*itemptr++ = len;
				run_start++;
				do {
					extra_bits = MIN((run_end - run_start) - 3, 0x3);
					precode_freqs[16]++;
					*itemptr++ = 16 | (extra_bits << 5);
					run_start += 3 + extra_bits;
				} while ((run_end - run_start) >= 3);
			}
		}

		/* Output any remaining lengths without RLE (runs shorter
		 * than the minimum each RLE symbol can represent). */
		while (run_start != run_end) {
			precode_freqs[len]++;
			*itemptr++ = len;
			run_start++;
		}
	} while (run_start != num_lens);

	return itemptr - precode_items;
}
|
1341
|
-
|
1342
|
-
/*
|
1343
|
-
* Huffman codeword lengths for dynamic Huffman blocks are compressed using a
|
1344
|
-
* separate Huffman code, the "precode", which contains a symbol for each
|
1345
|
-
* possible codeword length in the larger code as well as several special
|
1346
|
-
* symbols to represent repeated codeword lengths (a form of run-length
|
1347
|
-
* encoding). The precode is itself constructed in canonical form, and its
|
1348
|
-
* codeword lengths are represented literally in 19 3-bit fields that
|
1349
|
-
* immediately precede the compressed codeword lengths of the larger code.
|
1350
|
-
*/
|
1351
|
-
|
1352
|
-
/* Precompute the information needed to output Huffman codes. */
|
1353
|
-
/*
 * Precompute the information needed to output the Huffman codes of a dynamic
 * Huffman block: the number of litlen/offset symbols to send, the precode
 * (code lengths code), and how many precode lengths must be sent explicitly.
 * Results are stored in the compressor fields read by
 * deflate_write_huffman_header().
 */
static void
deflate_precompute_huffman_header(struct libdeflate_compressor *c)
{
	/* Compute how many litlen and offset symbols are needed.
	 * Trailing symbols with a zero codeword length need not be sent,
	 * but DEFLATE requires at least 257 litlen and 1 offset symbol. */

	for (c->num_litlen_syms = DEFLATE_NUM_LITLEN_SYMS;
	     c->num_litlen_syms > 257;
	     c->num_litlen_syms--)
		if (c->codes.lens.litlen[c->num_litlen_syms - 1] != 0)
			break;

	for (c->num_offset_syms = DEFLATE_NUM_OFFSET_SYMS;
	     c->num_offset_syms > 1;
	     c->num_offset_syms--)
		if (c->codes.lens.offset[c->num_offset_syms - 1] != 0)
			break;

	/* If we're not using the full set of literal/length codeword lengths,
	 * then temporarily move the offset codeword lengths over so that the
	 * literal/length and offset codeword lengths are contiguous. */

	/* The move below treats the lens struct as one contiguous byte array;
	 * this assert guarantees the offset lens start right after the
	 * litlen lens. */
	STATIC_ASSERT(offsetof(struct deflate_lens, offset) ==
		      DEFLATE_NUM_LITLEN_SYMS);

	if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) {
		memmove((u8 *)&c->codes.lens + c->num_litlen_syms,
			(u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS,
			c->num_offset_syms);
	}

	/* Compute the "items" (RLE / literal tokens and extra bits) with which
	 * the codeword lengths in the larger code will be output. */
	c->num_precode_items =
		deflate_compute_precode_items((u8 *)&c->codes.lens,
					      c->num_litlen_syms +
							c->num_offset_syms,
					      c->precode_freqs,
					      c->precode_items);

	/* Build the precode. */
	STATIC_ASSERT(MAX_PRE_CODEWORD_LEN <= DEFLATE_MAX_PRE_CODEWORD_LEN);
	deflate_make_huffman_code(DEFLATE_NUM_PRECODE_SYMS,
				  MAX_PRE_CODEWORD_LEN,
				  c->precode_freqs, c->precode_lens,
				  c->precode_codewords);

	/* Count how many precode lengths we actually need to output.
	 * They are sent in a special permuted order, and trailing zeroes
	 * (in that order) can be omitted, down to a minimum of 4. */
	for (c->num_explicit_lens = DEFLATE_NUM_PRECODE_SYMS;
	     c->num_explicit_lens > 4;
	     c->num_explicit_lens--)
		if (c->precode_lens[deflate_precode_lens_permutation[
						c->num_explicit_lens - 1]] != 0)
			break;

	/* Restore the offset codeword lengths if needed. */
	if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) {
		memmove((u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS,
			(u8 *)&c->codes.lens + c->num_litlen_syms,
			c->num_offset_syms);
	}
}
|
1414
|
-
|
1415
|
-
/* Output the Huffman codes. */
|
1416
|
-
static void
|
1417
|
-
deflate_write_huffman_header(struct libdeflate_compressor *c,
|
1418
|
-
struct deflate_output_bitstream *os)
|
1419
|
-
{
|
1420
|
-
unsigned i;
|
1421
|
-
|
1422
|
-
deflate_add_bits(os, c->num_litlen_syms - 257, 5);
|
1423
|
-
deflate_add_bits(os, c->num_offset_syms - 1, 5);
|
1424
|
-
deflate_add_bits(os, c->num_explicit_lens - 4, 4);
|
1425
|
-
deflate_flush_bits(os);
|
1426
|
-
|
1427
|
-
/* Output the lengths of the codewords in the precode. */
|
1428
|
-
for (i = 0; i < c->num_explicit_lens; i++) {
|
1429
|
-
deflate_add_bits(os, c->precode_lens[
|
1430
|
-
deflate_precode_lens_permutation[i]], 3);
|
1431
|
-
deflate_flush_bits(os);
|
1432
|
-
}
|
1433
|
-
|
1434
|
-
/* Output the encoded lengths of the codewords in the larger code. */
|
1435
|
-
for (i = 0; i < c->num_precode_items; i++) {
|
1436
|
-
unsigned precode_item = c->precode_items[i];
|
1437
|
-
unsigned precode_sym = precode_item & 0x1F;
|
1438
|
-
deflate_add_bits(os, c->precode_codewords[precode_sym],
|
1439
|
-
c->precode_lens[precode_sym]);
|
1440
|
-
if (precode_sym >= 16) {
|
1441
|
-
if (precode_sym == 16)
|
1442
|
-
deflate_add_bits(os, precode_item >> 5, 2);
|
1443
|
-
else if (precode_sym == 17)
|
1444
|
-
deflate_add_bits(os, precode_item >> 5, 3);
|
1445
|
-
else
|
1446
|
-
deflate_add_bits(os, precode_item >> 5, 7);
|
1447
|
-
}
|
1448
|
-
STATIC_ASSERT(CAN_BUFFER(DEFLATE_MAX_PRE_CODEWORD_LEN + 7));
|
1449
|
-
deflate_flush_bits(os);
|
1450
|
-
}
|
1451
|
-
}
|
1452
|
-
|
1453
|
-
/*
 * Output the literals and matches of a block as recorded in the 'sequences'
 * array (built by deflate_choose_literal()/deflate_choose_match()). Each
 * sequence is a run of literals (read directly from 'in_next') followed by
 * at most one match; a sequence with match length 0 terminates the list.
 *
 * The CAN_BUFFER() checks are compile-time constants: flushes are only
 * emitted where the bit buffer might not have room for the next codeword(s).
 */
static void
deflate_write_sequences(struct deflate_output_bitstream * restrict os,
			const struct deflate_codes * restrict codes,
			const struct deflate_sequence sequences[restrict],
			const u8 * restrict in_next)
{
	const struct deflate_sequence *seq = sequences;

	for (;;) {
		/* Low 23 bits: literal run length; high bits: match length. */
		u32 litrunlen = seq->litrunlen_and_length & 0x7FFFFF;
		unsigned length = seq->litrunlen_and_length >> 23;
		unsigned length_slot;
		unsigned litlen_symbol;
		unsigned offset_symbol;

		if (litrunlen) {
#if 1
			/* Unrolled: 4 literals per iteration, flushing only
			 * when the bit buffer could not hold the next
			 * codewords (decided at compile time). */
			while (litrunlen >= 4) {
				unsigned lit0 = in_next[0];
				unsigned lit1 = in_next[1];
				unsigned lit2 = in_next[2];
				unsigned lit3 = in_next[3];

				deflate_add_bits(os, codes->codewords.litlen[lit0],
						 codes->lens.litlen[lit0]);
				if (!CAN_BUFFER(2 * MAX_LITLEN_CODEWORD_LEN))
					deflate_flush_bits(os);

				deflate_add_bits(os, codes->codewords.litlen[lit1],
						 codes->lens.litlen[lit1]);
				if (!CAN_BUFFER(4 * MAX_LITLEN_CODEWORD_LEN))
					deflate_flush_bits(os);

				deflate_add_bits(os, codes->codewords.litlen[lit2],
						 codes->lens.litlen[lit2]);
				if (!CAN_BUFFER(2 * MAX_LITLEN_CODEWORD_LEN))
					deflate_flush_bits(os);

				deflate_add_bits(os, codes->codewords.litlen[lit3],
						 codes->lens.litlen[lit3]);
				deflate_flush_bits(os);
				in_next += 4;
				litrunlen -= 4;
			}
			/* Handle the 0-3 remaining literals. */
			if (litrunlen-- != 0) {
				deflate_add_bits(os, codes->codewords.litlen[*in_next],
						 codes->lens.litlen[*in_next]);
				if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
					deflate_flush_bits(os);
				in_next++;
				if (litrunlen-- != 0) {
					deflate_add_bits(os, codes->codewords.litlen[*in_next],
							 codes->lens.litlen[*in_next]);
					if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
						deflate_flush_bits(os);
					in_next++;
					if (litrunlen-- != 0) {
						deflate_add_bits(os, codes->codewords.litlen[*in_next],
								 codes->lens.litlen[*in_next]);
						if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
							deflate_flush_bits(os);
						in_next++;
					}
				}
				if (CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
					deflate_flush_bits(os);
			}
#else
			/* Simple (non-unrolled) reference version. */
			do {
				unsigned lit = *in_next++;
				deflate_add_bits(os, codes->codewords.litlen[lit],
						 codes->lens.litlen[lit]);
				deflate_flush_bits(os);
			} while (--litrunlen);
#endif
		}

		/* Match length 0 marks the end of the sequence list. */
		if (length == 0)
			return;

		in_next += length;

		length_slot = seq->length_slot;
		litlen_symbol = 257 + length_slot;

		/* Litlen symbol */
		deflate_add_bits(os, codes->codewords.litlen[litlen_symbol],
				 codes->lens.litlen[litlen_symbol]);

		/* Extra length bits */
		STATIC_ASSERT(CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +
					 DEFLATE_MAX_EXTRA_LENGTH_BITS));
		deflate_add_bits(os, length - deflate_length_slot_base[length_slot],
				 deflate_extra_length_bits[length_slot]);

		if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +
				DEFLATE_MAX_EXTRA_LENGTH_BITS +
				MAX_OFFSET_CODEWORD_LEN +
				DEFLATE_MAX_EXTRA_OFFSET_BITS))
			deflate_flush_bits(os);

		/* Offset symbol */
		offset_symbol = seq->offset_symbol;
		deflate_add_bits(os, codes->codewords.offset[offset_symbol],
				 codes->lens.offset[offset_symbol]);

		if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN +
				DEFLATE_MAX_EXTRA_OFFSET_BITS))
			deflate_flush_bits(os);

		/* Extra offset bits */
		deflate_add_bits(os, seq->offset - deflate_offset_slot_base[offset_symbol],
				 deflate_extra_offset_bits[offset_symbol]);

		deflate_flush_bits(os);

		seq++;
	}
}
|
1572
|
-
|
1573
|
-
#if SUPPORT_NEAR_OPTIMAL_PARSING
|
1574
|
-
/*
|
1575
|
-
* Follow the minimum-cost path in the graph of possible match/literal choices
|
1576
|
-
* for the current block and write out the matches/literals using the specified
|
1577
|
-
* Huffman codes.
|
1578
|
-
*
|
1579
|
-
* Note: this is slightly duplicated with deflate_write_sequences(), the reason
|
1580
|
-
* being that we don't want to waste time translating between intermediate
|
1581
|
-
* match/literal representations.
|
1582
|
-
*/
|
1583
|
-
/*
 * Follow the minimum-cost path in the graph of possible match/literal choices
 * for the current block and write out the matches/literals using the specified
 * Huffman codes.
 *
 * Note: this is slightly duplicated with deflate_write_sequences(), the reason
 * being that we don't want to waste time translating between intermediate
 * match/literal representations.
 */
static void
deflate_write_item_list(struct deflate_output_bitstream *os,
			const struct deflate_codes *codes,
			struct libdeflate_compressor *c,
			u32 block_length)
{
	struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0];
	struct deflate_optimum_node * const end_node = &c->p.n.optimum_nodes[block_length];
	do {
		/* Each node's item packs the length (low bits) and either the
		 * literal byte (length == 1) or the match offset. */
		unsigned length = cur_node->item & OPTIMUM_LEN_MASK;
		unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
		unsigned litlen_symbol;
		unsigned length_slot;
		unsigned offset_slot;

		if (length == 1) {
			/* Literal: 'offset' holds the literal byte. */
			litlen_symbol = offset;
			deflate_add_bits(os, codes->codewords.litlen[litlen_symbol],
					 codes->lens.litlen[litlen_symbol]);
			deflate_flush_bits(os);
		} else {
			/* Match length */
			length_slot = deflate_length_slot[length];
			litlen_symbol = 257 + length_slot;
			deflate_add_bits(os, codes->codewords.litlen[litlen_symbol],
					 codes->lens.litlen[litlen_symbol]);

			deflate_add_bits(os, length - deflate_length_slot_base[length_slot],
					 deflate_extra_length_bits[length_slot]);

			/* Flush only if the bit buffer can't hold the whole
			 * match (checked at compile time). */
			if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +
					DEFLATE_MAX_EXTRA_LENGTH_BITS +
					MAX_OFFSET_CODEWORD_LEN +
					DEFLATE_MAX_EXTRA_OFFSET_BITS))
				deflate_flush_bits(os);


			/* Match offset */
			offset_slot = deflate_get_offset_slot(c, offset);
			deflate_add_bits(os, codes->codewords.offset[offset_slot],
					 codes->lens.offset[offset_slot]);

			if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN +
					DEFLATE_MAX_EXTRA_OFFSET_BITS))
				deflate_flush_bits(os);

			deflate_add_bits(os, offset - deflate_offset_slot_base[offset_slot],
					 deflate_extra_offset_bits[offset_slot]);

			deflate_flush_bits(os);
		}
		/* Advance by the number of input bytes this item covers. */
		cur_node += length;
	} while (cur_node != end_node);
}
|
1638
|
-
#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
|
1639
|
-
|
1640
|
-
/* Output the end-of-block symbol. */
|
1641
|
-
static void
|
1642
|
-
deflate_write_end_of_block(struct deflate_output_bitstream *os,
|
1643
|
-
const struct deflate_codes *codes)
|
1644
|
-
{
|
1645
|
-
deflate_add_bits(os, codes->codewords.litlen[DEFLATE_END_OF_BLOCK],
|
1646
|
-
codes->lens.litlen[DEFLATE_END_OF_BLOCK]);
|
1647
|
-
deflate_flush_bits(os);
|
1648
|
-
}
|
1649
|
-
|
1650
|
-
/*
 * Write one uncompressed DEFLATE block: the block header, then (after byte
 * alignment) the LEN/NLEN fields and the raw data. 'len' is at most
 * UINT16_MAX, enforced by the caller (deflate_write_uncompressed_blocks()).
 */
static void
deflate_write_uncompressed_block(struct deflate_output_bitstream *os,
				 const u8 *data, u16 len,
				 bool is_final_block)
{
	deflate_write_block_header(os, is_final_block,
				   DEFLATE_BLOCKTYPE_UNCOMPRESSED);
	deflate_align_bitstream(os);

	/* Need 4 bytes for LEN/NLEN plus 'len' bytes of data. On overflow,
	 * mark the output exhausted by setting next == end and bail.
	 * NOTE(review): '>=' also rejects an exact fit (exactly 4 + len bytes
	 * left) — presumably deliberate slack; confirm against upstream. */
	if (4 + (u32)len >= os->end - os->next) {
		os->next = os->end;
		return;
	}

	/* LEN and its one's complement NLEN, little-endian (RFC 1951). */
	put_unaligned_le16(len, os->next);
	os->next += 2;
	put_unaligned_le16(~len, os->next);
	os->next += 2;
	memcpy(os->next, data, len);
	os->next += len;
}
|
1671
|
-
|
1672
|
-
static void
|
1673
|
-
deflate_write_uncompressed_blocks(struct deflate_output_bitstream *os,
|
1674
|
-
const u8 *data, u32 data_length,
|
1675
|
-
bool is_final_block)
|
1676
|
-
{
|
1677
|
-
do {
|
1678
|
-
u16 len = MIN(data_length, UINT16_MAX);
|
1679
|
-
|
1680
|
-
deflate_write_uncompressed_block(os, data, len,
|
1681
|
-
is_final_block && len == data_length);
|
1682
|
-
data += len;
|
1683
|
-
data_length -= len;
|
1684
|
-
} while (data_length != 0);
|
1685
|
-
}
|
1686
|
-
|
1687
|
-
/*
|
1688
|
-
* Choose the best type of block to use (dynamic Huffman, static Huffman, or
|
1689
|
-
* uncompressed), then output it.
|
1690
|
-
*/
|
1691
|
-
/*
 * Choose the best type of block to use (dynamic Huffman, static Huffman, or
 * uncompressed), then output it.
 */
static void
deflate_flush_block(struct libdeflate_compressor * restrict c,
		    struct deflate_output_bitstream * restrict os,
		    const u8 * restrict block_begin, u32 block_length,
		    bool is_final_block, bool use_item_list)
{
	/* Extra bits carried by each precode symbol (16, 17, 18 only). */
	static const u8 deflate_extra_precode_bits[DEFLATE_NUM_PRECODE_SYMS] = {
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 7,
	};

	/* Costs are measured in bits */
	u32 dynamic_cost = 0;
	u32 static_cost = 0;
	u32 uncompressed_cost = 0;
	struct deflate_codes *codes;
	int block_type;
	unsigned sym;

	/* Tally the end-of-block symbol. */
	c->freqs.litlen[DEFLATE_END_OF_BLOCK]++;

	/* Build dynamic Huffman codes. */
	deflate_make_huffman_codes(&c->freqs, &c->codes);

	/* Account for the cost of sending dynamic Huffman codes:
	 * 5+5+4 bits of symbol counts, 3 bits per explicit precode length,
	 * plus one precode codeword (and extra bits) per precode item. */
	deflate_precompute_huffman_header(c);
	dynamic_cost += 5 + 5 + 4 + (3 * c->num_explicit_lens);
	for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) {
		u32 extra = deflate_extra_precode_bits[sym];
		dynamic_cost += c->precode_freqs[sym] *
				(extra + c->precode_lens[sym]);
	}

	/* Account for the cost of encoding literals.
	 * Static code: literals 0-143 use 8 bits, 144-255 use 9 bits. */
	for (sym = 0; sym < 256; sym++) {
		dynamic_cost += c->freqs.litlen[sym] *
				c->codes.lens.litlen[sym];
	}
	for (sym = 0; sym < 144; sym++)
		static_cost += c->freqs.litlen[sym] * 8;
	for (; sym < 256; sym++)
		static_cost += c->freqs.litlen[sym] * 9;

	/* Account for the cost of encoding the end-of-block symbol. */
	dynamic_cost += c->codes.lens.litlen[256];
	static_cost += 7;

	/* Account for the cost of encoding lengths. */
	for (sym = 257; sym < 257 + ARRAY_LEN(deflate_extra_length_bits); sym++) {
		u32 extra = deflate_extra_length_bits[sym - 257];
		dynamic_cost += c->freqs.litlen[sym] *
				(extra + c->codes.lens.litlen[sym]);
		static_cost += c->freqs.litlen[sym] *
				(extra + c->static_codes.lens.litlen[sym]);
	}

	/* Account for the cost of encoding offsets.
	 * Static code: every offset symbol uses 5 bits. */
	for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) {
		u32 extra = deflate_extra_offset_bits[sym];
		dynamic_cost += c->freqs.offset[sym] *
				(extra + c->codes.lens.offset[sym]);
		static_cost += c->freqs.offset[sym] * (extra + 5);
	}

	/* Compute the cost of using uncompressed blocks: alignment padding,
	 * 32 header bits per block (LEN/NLEN), 40 bits for each additional
	 * block beyond the first, plus 8 bits per data byte. */
	uncompressed_cost += (-(os->bitcount + 3) & 7) + 32 +
			     (40 * (DIV_ROUND_UP(block_length,
						 UINT16_MAX) - 1)) +
			     (8 * block_length);

	/* Choose the cheapest block type. */
	if (dynamic_cost < MIN(static_cost, uncompressed_cost)) {
		block_type = DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN;
		codes = &c->codes;
	} else if (static_cost < uncompressed_cost) {
		block_type = DEFLATE_BLOCKTYPE_STATIC_HUFFMAN;
		codes = &c->static_codes;
	} else {
		block_type = DEFLATE_BLOCKTYPE_UNCOMPRESSED;
	}

	/* Now actually output the block. */

	if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) {
		/* Note: the length being flushed may exceed the maximum length
		 * of an uncompressed block (65535 bytes). Therefore, more than
		 * one uncompressed block might be needed. */
		deflate_write_uncompressed_blocks(os, block_begin, block_length,
						  is_final_block);
	} else {
		/* Output the block header. */
		deflate_write_block_header(os, is_final_block, block_type);

		/* Output the Huffman codes (dynamic Huffman blocks only). */
		if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN)
			deflate_write_huffman_header(c, os);

		/* Output the literals, matches, and end-of-block symbol. */
#if SUPPORT_NEAR_OPTIMAL_PARSING
		if (use_item_list)
			deflate_write_item_list(os, codes, c, block_length);
		else
#endif
			deflate_write_sequences(os, codes, c->p.g.sequences,
						block_begin);
		deflate_write_end_of_block(os, codes);
	}
}
|
1799
|
-
|
1800
|
-
static forceinline void
|
1801
|
-
deflate_choose_literal(struct libdeflate_compressor *c, unsigned literal,
|
1802
|
-
u32 *litrunlen_p)
|
1803
|
-
{
|
1804
|
-
c->freqs.litlen[literal]++;
|
1805
|
-
++*litrunlen_p;
|
1806
|
-
}
|
1807
|
-
|
1808
|
-
static forceinline void
|
1809
|
-
deflate_choose_match(struct libdeflate_compressor *c,
|
1810
|
-
unsigned length, unsigned offset,
|
1811
|
-
u32 *litrunlen_p, struct deflate_sequence **next_seq_p)
|
1812
|
-
{
|
1813
|
-
struct deflate_sequence *seq = *next_seq_p;
|
1814
|
-
unsigned length_slot = deflate_length_slot[length];
|
1815
|
-
unsigned offset_slot = deflate_get_offset_slot(c, offset);
|
1816
|
-
|
1817
|
-
c->freqs.litlen[257 + length_slot]++;
|
1818
|
-
c->freqs.offset[offset_slot]++;
|
1819
|
-
|
1820
|
-
seq->litrunlen_and_length = ((u32)length << 23) | *litrunlen_p;
|
1821
|
-
seq->offset = offset;
|
1822
|
-
seq->length_slot = length_slot;
|
1823
|
-
seq->offset_symbol = offset_slot;
|
1824
|
-
|
1825
|
-
*litrunlen_p = 0;
|
1826
|
-
*next_seq_p = seq + 1;
|
1827
|
-
}
|
1828
|
-
|
1829
|
-
static forceinline void
|
1830
|
-
deflate_finish_sequence(struct deflate_sequence *seq, u32 litrunlen)
|
1831
|
-
{
|
1832
|
-
seq->litrunlen_and_length = litrunlen; /* length = 0 */
|
1833
|
-
}
|
1834
|
-
|
1835
|
-
/******************************************************************************/
|
1836
|
-
|
1837
|
-
/*
|
1838
|
-
* Block splitting algorithm. The problem is to decide when it is worthwhile to
|
1839
|
-
* start a new block with new Huffman codes. There is a theoretically optimal
|
1840
|
-
* solution: recursively consider every possible block split, considering the
|
1841
|
-
* exact cost of each block, and choose the minimum cost approach. But this is
|
1842
|
-
* far too slow. Instead, as an approximation, we can count symbols and after
|
1843
|
-
* every N symbols, compare the expected distribution of symbols based on the
|
1844
|
-
* previous data with the actual distribution. If they differ "by enough", then
|
1845
|
-
* start a new block.
|
1846
|
-
*
|
1847
|
-
* As an optimization and heuristic, we don't distinguish between every symbol
|
1848
|
-
* but rather we combine many symbols into a single "observation type". For
|
1849
|
-
* literals we only look at the high bits and low bits, and for matches we only
|
1850
|
-
* look at whether the match is long or not. The assumption is that for typical
|
1851
|
-
* "real" data, places that are good block boundaries will tend to be noticable
|
1852
|
-
* based only on changes in these aggregate frequencies, without looking for
|
1853
|
-
* subtle differences in individual symbols. For example, a change from ASCII
|
1854
|
-
* bytes to non-ASCII bytes, or from few matches (generally less compressible)
|
1855
|
-
* to many matches (generally more compressible), would be easily noticed based
|
1856
|
-
* on the aggregates.
|
1857
|
-
*
|
1858
|
-
* For determining whether the frequency distributions are "different enough" to
|
1859
|
-
 * start a new block, the simple heuristic of splitting when the sum of absolute
|
1860
|
-
* differences exceeds a constant seems to be good enough. We also add a number
|
1861
|
-
* proportional to the block length so that the algorithm is more likely to end
|
1862
|
-
* long blocks than short blocks. This reflects the general expectation that it
|
1863
|
-
* will become increasingly beneficial to start a new block as the current
|
1864
|
-
* block grows longer.
|
1865
|
-
*
|
1866
|
-
* Finally, for an approximation, it is not strictly necessary that the exact
|
1867
|
-
* symbols being used are considered. With "near-optimal parsing", for example,
|
1868
|
-
* the actual symbols that will be used are unknown until after the block
|
1869
|
-
* boundary is chosen and the block has been optimized. Since the final choices
|
1870
|
-
* cannot be used, we can use preliminary "greedy" choices instead.
|
1871
|
-
*/
|
1872
|
-
|
1873
|
-
/* Initialize the block split statistics when starting a new block. */
|
1874
|
-
static void
|
1875
|
-
init_block_split_stats(struct block_split_stats *stats)
|
1876
|
-
{
|
1877
|
-
int i;
|
1878
|
-
|
1879
|
-
for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
|
1880
|
-
stats->new_observations[i] = 0;
|
1881
|
-
stats->observations[i] = 0;
|
1882
|
-
}
|
1883
|
-
stats->num_new_observations = 0;
|
1884
|
-
stats->num_observations = 0;
|
1885
|
-
}
|
1886
|
-
|
1887
|
-
/* Literal observation. Heuristic: use the top 2 bits and low 1 bits of the
|
1888
|
-
* literal, for 8 possible literal observation types. */
|
1889
|
-
static forceinline void
|
1890
|
-
observe_literal(struct block_split_stats *stats, u8 lit)
|
1891
|
-
{
|
1892
|
-
stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++;
|
1893
|
-
stats->num_new_observations++;
|
1894
|
-
}
|
1895
|
-
|
1896
|
-
/* Match observation. Heuristic: use one observation type for "short match" and
|
1897
|
-
* one observation type for "long match". */
|
1898
|
-
static forceinline void
|
1899
|
-
observe_match(struct block_split_stats *stats, unsigned length)
|
1900
|
-
{
|
1901
|
-
stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + (length >= 9)]++;
|
1902
|
-
stats->num_new_observations++;
|
1903
|
-
}
|
1904
|
-
|
1905
|
-
/*
 * Compare the new observations against the block's accumulated distribution
 * and decide whether the block should end here. If not ending the block,
 * fold the new observations into the accumulated totals and return false.
 */
static bool
do_end_block_check(struct block_split_stats *stats, u32 block_length)
{
	int i;

	if (stats->num_observations > 0) {

		/* Note: to avoid slow divisions, we do not divide by
		 * 'num_observations', but rather do all math with the numbers
		 * multiplied by 'num_observations'. */
		u32 total_delta = 0;
		for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
			/* Cross-multiplied so both sides share the scale
			 * num_observations * num_new_observations. */
			u32 expected = stats->observations[i] * stats->num_new_observations;
			u32 actual = stats->new_observations[i] * stats->num_observations;
			u32 delta = (actual > expected) ? actual - expected :
							  expected - actual;
			total_delta += delta;
		}

		/* Ready to end the block?
		 * The block_length/4096 term biases toward ending longer
		 * blocks; the right side is the split threshold scaled by
		 * num_observations to match the left side's scale. */
		if (total_delta + (block_length / 4096) * stats->num_observations >=
		    NUM_OBSERVATIONS_PER_BLOCK_CHECK * 200 / 512 * stats->num_observations)
			return true;
	}

	/* Not splitting: merge the new observations into the totals. */
	for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
		stats->num_observations += stats->new_observations[i];
		stats->observations[i] += stats->new_observations[i];
		stats->new_observations[i] = 0;
	}
	stats->num_new_observations = 0;
	return false;
}
|
1938
|
-
|
1939
|
-
static forceinline bool
|
1940
|
-
should_end_block(struct block_split_stats *stats,
|
1941
|
-
const u8 *in_block_begin, const u8 *in_next, const u8 *in_end)
|
1942
|
-
{
|
1943
|
-
/* Ready to check block split statistics? */
|
1944
|
-
if (stats->num_new_observations < NUM_OBSERVATIONS_PER_BLOCK_CHECK ||
|
1945
|
-
in_next - in_block_begin < MIN_BLOCK_LENGTH ||
|
1946
|
-
in_end - in_next < MIN_BLOCK_LENGTH)
|
1947
|
-
return false;
|
1948
|
-
|
1949
|
-
return do_end_block_check(stats, in_next - in_block_begin);
|
1950
|
-
}
|
1951
|
-
|
1952
|
-
/******************************************************************************/
|
1953
|
-
|
1954
|
-
/*
|
1955
|
-
* This is the "greedy" DEFLATE compressor. It always chooses the longest match.
|
1956
|
-
*/
|
1957
|
-
/*
 * This is the "greedy" DEFLATE compressor. It always chooses the longest match.
 *
 * Compresses 'in' (in_nbytes bytes) into 'out' (out_nbytes_avail bytes) and
 * returns the compressed size as reported by deflate_flush_output().
 */
static size_t
deflate_compress_greedy(struct libdeflate_compressor * restrict c,
			const u8 * restrict in, size_t in_nbytes,
			u8 * restrict out, size_t out_nbytes_avail)
{
	const u8 *in_next = in;
	const u8 *in_end = in_next + in_nbytes;
	struct deflate_output_bitstream os;
	const u8 *in_cur_base = in_next;
	unsigned max_len = DEFLATE_MAX_MATCH_LEN;
	unsigned nice_len = MIN(c->nice_match_length, max_len);
	u32 next_hashes[2] = {0, 0};

	deflate_init_output(&os, out, out_nbytes_avail);
	hc_matchfinder_init(&c->p.g.hc_mf);

	do {
		/* Starting a new DEFLATE block. */

		const u8 * const in_block_begin = in_next;
		const u8 * const in_max_block_end =
			in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
		u32 litrunlen = 0;
		struct deflate_sequence *next_seq = c->p.g.sequences;

		init_block_split_stats(&c->split_stats);
		deflate_reset_symbol_frequencies(c);

		do {
			u32 length;
			u32 offset;

			/* Decrease the maximum and nice match lengths if we're
			 * approaching the end of the input buffer. */
			if (unlikely(max_len > in_end - in_next)) {
				max_len = in_end - in_next;
				nice_len = MIN(nice_len, max_len);
			}

			length = hc_matchfinder_longest_match(&c->p.g.hc_mf,
							      &in_cur_base,
							      in_next,
							      DEFLATE_MIN_MATCH_LEN - 1,
							      max_len,
							      nice_len,
							      c->max_search_depth,
							      next_hashes,
							      &offset);

			if (length >= DEFLATE_MIN_MATCH_LEN) {
				/* Match found: record it, then skip the
				 * matchfinder past the matched bytes. */
				deflate_choose_match(c, length, offset,
						     &litrunlen, &next_seq);
				observe_match(&c->split_stats, length);
				in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf,
									&in_cur_base,
									in_next + 1,
									in_end,
									length - 1,
									next_hashes);
			} else {
				/* No match found: emit a literal. */
				deflate_choose_literal(c, *in_next, &litrunlen);
				observe_literal(&c->split_stats, *in_next);
				in_next++;
			}

			/* Check if it's time to output another block. */
		} while (in_next < in_max_block_end &&
			 !should_end_block(&c->split_stats, in_block_begin, in_next, in_end));

		deflate_finish_sequence(next_seq, litrunlen);
		deflate_flush_block(c, &os, in_block_begin,
				    in_next - in_block_begin,
				    in_next == in_end, false);
	} while (in_next != in_end);

	return deflate_flush_output(&os);
}
|
2036
|
-
|
2037
|
-
/*
|
2038
|
-
* This is the "lazy" DEFLATE compressor. Before choosing a match, it checks to
|
2039
|
-
* see if there's a longer match at the next position. If yes, it outputs a
|
2040
|
-
* literal and continues to the next position. If no, it outputs the match.
|
2041
|
-
*/
|
2042
|
-
/*
 * This is the "lazy" DEFLATE compressor. Before choosing a match, it checks to
 * see if there's a longer match at the next position. If yes, it outputs a
 * literal and continues to the next position. If no, it outputs the match.
 */
static size_t
deflate_compress_lazy(struct libdeflate_compressor * restrict c,
		      const u8 * restrict in, size_t in_nbytes,
		      u8 * restrict out, size_t out_nbytes_avail)
{
	const u8 *in_next = in;
	const u8 *in_end = in_next + in_nbytes;
	struct deflate_output_bitstream os;
	const u8 *in_cur_base = in_next;
	unsigned max_len = DEFLATE_MAX_MATCH_LEN;
	unsigned nice_len = MIN(c->nice_match_length, max_len);
	u32 next_hashes[2] = {0, 0};

	deflate_init_output(&os, out, out_nbytes_avail);
	hc_matchfinder_init(&c->p.g.hc_mf);

	do {
		/* Starting a new DEFLATE block. */

		const u8 * const in_block_begin = in_next;
		const u8 * const in_max_block_end =
			in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
		u32 litrunlen = 0;
		struct deflate_sequence *next_seq = c->p.g.sequences;

		init_block_split_stats(&c->split_stats);
		deflate_reset_symbol_frequencies(c);

		do {
			unsigned cur_len;
			unsigned cur_offset;
			unsigned next_len;
			unsigned next_offset;

			/* Clamp match lengths near the end of the input. */
			if (unlikely(in_end - in_next < DEFLATE_MAX_MATCH_LEN)) {
				max_len = in_end - in_next;
				nice_len = MIN(nice_len, max_len);
			}

			/* Find the longest match at the current position. */
			cur_len = hc_matchfinder_longest_match(&c->p.g.hc_mf,
							       &in_cur_base,
							       in_next,
							       DEFLATE_MIN_MATCH_LEN - 1,
							       max_len,
							       nice_len,
							       c->max_search_depth,
							       next_hashes,
							       &cur_offset);
			in_next += 1;

			if (cur_len < DEFLATE_MIN_MATCH_LEN) {
				/* No match found. Choose a literal. */
				deflate_choose_literal(c, *(in_next - 1), &litrunlen);
				observe_literal(&c->split_stats, *(in_next - 1));
				continue;
			}

		have_cur_match:
			observe_match(&c->split_stats, cur_len);

			/* We have a match at the current position. */

			/* If the current match is very long, choose it
			 * immediately. */
			if (cur_len >= nice_len) {
				deflate_choose_match(c, cur_len, cur_offset,
						     &litrunlen, &next_seq);
				in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf,
									&in_cur_base,
									in_next,
									in_end,
									cur_len - 1,
									next_hashes);
				continue;
			}

			/*
			 * Try to find a match at the next position.
			 *
			 * Note: since we already have a match at the *current*
			 * position, we use only half the 'max_search_depth'
			 * when checking the *next* position. This is a useful
			 * trade-off because it's more worthwhile to use a
			 * greater search depth on the initial match.
			 *
			 * Note: it's possible to structure the code such that
			 * there's only one call to longest_match(), which
			 * handles both the "find the initial match" and "try to
			 * find a longer match" cases. However, it is faster to
			 * have two call sites, with longest_match() inlined at
			 * each.
			 */
			if (unlikely(in_end - in_next < DEFLATE_MAX_MATCH_LEN)) {
				max_len = in_end - in_next;
				nice_len = MIN(nice_len, max_len);
			}
			next_len = hc_matchfinder_longest_match(&c->p.g.hc_mf,
								&in_cur_base,
								in_next,
								cur_len,
								max_len,
								nice_len,
								c->max_search_depth / 2,
								next_hashes,
								&next_offset);
			in_next += 1;

			if (next_len > cur_len) {
				/* Found a longer match at the next position.
				 * Output a literal. Then the next match
				 * becomes the current match. */
				deflate_choose_literal(c, *(in_next - 2), &litrunlen);
				cur_len = next_len;
				cur_offset = next_offset;
				goto have_cur_match;
			}

			/* No longer match at the next position.
			 * Output the current match. */
			deflate_choose_match(c, cur_len, cur_offset,
					     &litrunlen, &next_seq);
			/* Two positions were already consumed by the two
			 * longest_match() calls, hence 'cur_len - 2'. */
			in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf,
								&in_cur_base,
								in_next,
								in_end,
								cur_len - 2,
								next_hashes);

			/* Check if it's time to output another block. */
		} while (in_next < in_max_block_end &&
			 !should_end_block(&c->split_stats, in_block_begin, in_next, in_end));

		deflate_finish_sequence(next_seq, litrunlen);
		deflate_flush_block(c, &os, in_block_begin,
				    in_next - in_block_begin,
				    in_next == in_end, false);
	} while (in_next != in_end);

	return deflate_flush_output(&os);
}
|
2183
|
-
|
2184
|
-
#if SUPPORT_NEAR_OPTIMAL_PARSING
|
2185
|
-
|
2186
|
-
/*
|
2187
|
-
* Follow the minimum-cost path in the graph of possible match/literal choices
|
2188
|
-
* for the current block and compute the frequencies of the Huffman symbols that
|
2189
|
-
* would be needed to output those matches and literals.
|
2190
|
-
*/
|
2191
|
-
static void
|
2192
|
-
deflate_tally_item_list(struct libdeflate_compressor *c, u32 block_length)
|
2193
|
-
{
|
2194
|
-
struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0];
|
2195
|
-
struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length];
|
2196
|
-
do {
|
2197
|
-
unsigned length = cur_node->item & OPTIMUM_LEN_MASK;
|
2198
|
-
unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
|
2199
|
-
|
2200
|
-
if (length == 1) {
|
2201
|
-
/* Literal */
|
2202
|
-
c->freqs.litlen[offset]++;
|
2203
|
-
} else {
|
2204
|
-
/* Match */
|
2205
|
-
c->freqs.litlen[257 + deflate_length_slot[length]]++;
|
2206
|
-
c->freqs.offset[deflate_get_offset_slot(c, offset)]++;
|
2207
|
-
}
|
2208
|
-
cur_node += length;
|
2209
|
-
} while (cur_node != end_node);
|
2210
|
-
}
|
2211
|
-
|
2212
|
-
/* Set the current cost model from the codeword lengths specified in @lens. */
|
2213
|
-
static void
|
2214
|
-
deflate_set_costs_from_codes(struct libdeflate_compressor *c,
|
2215
|
-
const struct deflate_lens *lens)
|
2216
|
-
{
|
2217
|
-
unsigned i;
|
2218
|
-
|
2219
|
-
/* Literals */
|
2220
|
-
for (i = 0; i < DEFLATE_NUM_LITERALS; i++) {
|
2221
|
-
u32 bits = (lens->litlen[i] ? lens->litlen[i] : LITERAL_NOSTAT_BITS);
|
2222
|
-
c->p.n.costs.literal[i] = bits << COST_SHIFT;
|
2223
|
-
}
|
2224
|
-
|
2225
|
-
/* Lengths */
|
2226
|
-
for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) {
|
2227
|
-
unsigned length_slot = deflate_length_slot[i];
|
2228
|
-
unsigned litlen_sym = 257 + length_slot;
|
2229
|
-
u32 bits = (lens->litlen[litlen_sym] ? lens->litlen[litlen_sym] : LENGTH_NOSTAT_BITS);
|
2230
|
-
bits += deflate_extra_length_bits[length_slot];
|
2231
|
-
c->p.n.costs.length[i] = bits << COST_SHIFT;
|
2232
|
-
}
|
2233
|
-
|
2234
|
-
/* Offset slots */
|
2235
|
-
for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) {
|
2236
|
-
u32 bits = (lens->offset[i] ? lens->offset[i] : OFFSET_NOSTAT_BITS);
|
2237
|
-
bits += deflate_extra_offset_bits[i];
|
2238
|
-
c->p.n.costs.offset_slot[i] = bits << COST_SHIFT;
|
2239
|
-
}
|
2240
|
-
}
|
2241
|
-
|
2242
|
-
static forceinline u32
|
2243
|
-
deflate_default_literal_cost(unsigned literal)
|
2244
|
-
{
|
2245
|
-
STATIC_ASSERT(COST_SHIFT == 3);
|
2246
|
-
/* 66 is 8.25 bits/symbol */
|
2247
|
-
return 66;
|
2248
|
-
}
|
2249
|
-
|
2250
|
-
static forceinline u32
|
2251
|
-
deflate_default_length_slot_cost(unsigned length_slot)
|
2252
|
-
{
|
2253
|
-
STATIC_ASSERT(COST_SHIFT == 3);
|
2254
|
-
/* 60 is 7.5 bits/symbol */
|
2255
|
-
return 60 + ((u32)deflate_extra_length_bits[length_slot] << COST_SHIFT);
|
2256
|
-
}
|
2257
|
-
|
2258
|
-
static forceinline u32
|
2259
|
-
deflate_default_offset_slot_cost(unsigned offset_slot)
|
2260
|
-
{
|
2261
|
-
STATIC_ASSERT(COST_SHIFT == 3);
|
2262
|
-
/* 39 is 4.875 bits/symbol */
|
2263
|
-
return 39 + ((u32)deflate_extra_offset_bits[offset_slot] << COST_SHIFT);
|
2264
|
-
}
|
2265
|
-
|
2266
|
-
/*
|
2267
|
-
* Set default symbol costs for the first block's first optimization pass.
|
2268
|
-
*
|
2269
|
-
* It works well to assume that each symbol is equally probable. This results
|
2270
|
-
* in each symbol being assigned a cost of (-log2(1.0/num_syms) * (1 <<
|
2271
|
-
* COST_SHIFT)) where 'num_syms' is the number of symbols in the corresponding
|
2272
|
-
* alphabet. However, we intentionally bias the parse towards matches rather
|
2273
|
-
* than literals by using a slightly lower default cost for length symbols than
|
2274
|
-
* for literals. This often improves the compression ratio slightly.
|
2275
|
-
*/
|
2276
|
-
static void
|
2277
|
-
deflate_set_default_costs(struct libdeflate_compressor *c)
|
2278
|
-
{
|
2279
|
-
unsigned i;
|
2280
|
-
|
2281
|
-
/* Literals */
|
2282
|
-
for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
|
2283
|
-
c->p.n.costs.literal[i] = deflate_default_literal_cost(i);
|
2284
|
-
|
2285
|
-
/* Lengths */
|
2286
|
-
for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++)
|
2287
|
-
c->p.n.costs.length[i] = deflate_default_length_slot_cost(
|
2288
|
-
deflate_length_slot[i]);
|
2289
|
-
|
2290
|
-
/* Offset slots */
|
2291
|
-
for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++)
|
2292
|
-
c->p.n.costs.offset_slot[i] = deflate_default_offset_slot_cost(i);
|
2293
|
-
}
|
2294
|
-
|
2295
|
-
static forceinline void
|
2296
|
-
deflate_adjust_cost(u32 *cost_p, u32 default_cost)
|
2297
|
-
{
|
2298
|
-
*cost_p += ((s32)default_cost - (s32)*cost_p) >> 1;
|
2299
|
-
}
|
2300
|
-
|
2301
|
-
/*
|
2302
|
-
* Adjust the costs when beginning a new block.
|
2303
|
-
*
|
2304
|
-
* Since the current costs have been optimized for the data, it's undesirable to
|
2305
|
-
* throw them away and start over with the default costs. At the same time, we
|
2306
|
-
* don't want to bias the parse by assuming that the next block will be similar
|
2307
|
-
* to the current block. As a compromise, make the costs closer to the
|
2308
|
-
* defaults, but don't simply set them to the defaults.
|
2309
|
-
*/
|
2310
|
-
static void
|
2311
|
-
deflate_adjust_costs(struct libdeflate_compressor *c)
|
2312
|
-
{
|
2313
|
-
unsigned i;
|
2314
|
-
|
2315
|
-
/* Literals */
|
2316
|
-
for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
|
2317
|
-
deflate_adjust_cost(&c->p.n.costs.literal[i],
|
2318
|
-
deflate_default_literal_cost(i));
|
2319
|
-
|
2320
|
-
/* Lengths */
|
2321
|
-
for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++)
|
2322
|
-
deflate_adjust_cost(&c->p.n.costs.length[i],
|
2323
|
-
deflate_default_length_slot_cost(
|
2324
|
-
deflate_length_slot[i]));
|
2325
|
-
|
2326
|
-
/* Offset slots */
|
2327
|
-
for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++)
|
2328
|
-
deflate_adjust_cost(&c->p.n.costs.offset_slot[i],
|
2329
|
-
deflate_default_offset_slot_cost(i));
|
2330
|
-
}
|
2331
|
-
|
2332
|
-
/*
|
2333
|
-
* Find the minimum-cost path through the graph of possible match/literal
|
2334
|
-
* choices for this block.
|
2335
|
-
*
|
2336
|
-
* We find the minimum cost path from 'c->p.n.optimum_nodes[0]', which
|
2337
|
-
* represents the node at the beginning of the block, to
|
2338
|
-
* 'c->p.n.optimum_nodes[block_length]', which represents the node at the end of
|
2339
|
-
* the block. Edge costs are evaluated using the cost model 'c->p.n.costs'.
|
2340
|
-
*
|
2341
|
-
* The algorithm works backwards, starting at the end node and proceeding
|
2342
|
-
* backwards one node at a time. At each node, the minimum cost to reach the
|
2343
|
-
* end node is computed and the match/literal choice that begins that path is
|
2344
|
-
* saved.
|
2345
|
-
*/
|
2346
|
-
static void
|
2347
|
-
deflate_find_min_cost_path(struct libdeflate_compressor *c,
|
2348
|
-
const u32 block_length,
|
2349
|
-
const struct lz_match *cache_ptr)
|
2350
|
-
{
|
2351
|
-
struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length];
|
2352
|
-
struct deflate_optimum_node *cur_node = end_node;
|
2353
|
-
|
2354
|
-
cur_node->cost_to_end = 0;
|
2355
|
-
do {
|
2356
|
-
unsigned num_matches;
|
2357
|
-
unsigned literal;
|
2358
|
-
u32 best_cost_to_end;
|
2359
|
-
|
2360
|
-
cur_node--;
|
2361
|
-
cache_ptr--;
|
2362
|
-
|
2363
|
-
num_matches = cache_ptr->length;
|
2364
|
-
literal = cache_ptr->offset;
|
2365
|
-
|
2366
|
-
/* It's always possible to choose a literal. */
|
2367
|
-
best_cost_to_end = c->p.n.costs.literal[literal] +
|
2368
|
-
(cur_node + 1)->cost_to_end;
|
2369
|
-
cur_node->item = ((u32)literal << OPTIMUM_OFFSET_SHIFT) | 1;
|
2370
|
-
|
2371
|
-
/* Also consider matches if there are any. */
|
2372
|
-
if (num_matches) {
|
2373
|
-
const struct lz_match *match;
|
2374
|
-
unsigned len;
|
2375
|
-
unsigned offset;
|
2376
|
-
unsigned offset_slot;
|
2377
|
-
u32 offset_cost;
|
2378
|
-
u32 cost_to_end;
|
2379
|
-
|
2380
|
-
/*
|
2381
|
-
* Consider each length from the minimum
|
2382
|
-
* (DEFLATE_MIN_MATCH_LEN) to the length of the longest
|
2383
|
-
* match found at this position. For each length, we
|
2384
|
-
* consider only the smallest offset for which that
|
2385
|
-
* length is available. Although this is not guaranteed
|
2386
|
-
* to be optimal due to the possibility of a larger
|
2387
|
-
* offset costing less than a smaller offset to code,
|
2388
|
-
* this is a very useful heuristic.
|
2389
|
-
*/
|
2390
|
-
match = cache_ptr - num_matches;
|
2391
|
-
len = DEFLATE_MIN_MATCH_LEN;
|
2392
|
-
do {
|
2393
|
-
offset = match->offset;
|
2394
|
-
offset_slot = deflate_get_offset_slot(c, offset);
|
2395
|
-
offset_cost = c->p.n.costs.offset_slot[offset_slot];
|
2396
|
-
do {
|
2397
|
-
cost_to_end = offset_cost +
|
2398
|
-
c->p.n.costs.length[len] +
|
2399
|
-
(cur_node + len)->cost_to_end;
|
2400
|
-
if (cost_to_end < best_cost_to_end) {
|
2401
|
-
best_cost_to_end = cost_to_end;
|
2402
|
-
cur_node->item = ((u32)offset << OPTIMUM_OFFSET_SHIFT) | len;
|
2403
|
-
}
|
2404
|
-
} while (++len <= match->length);
|
2405
|
-
} while (++match != cache_ptr);
|
2406
|
-
cache_ptr -= num_matches;
|
2407
|
-
}
|
2408
|
-
cur_node->cost_to_end = best_cost_to_end;
|
2409
|
-
} while (cur_node != &c->p.n.optimum_nodes[0]);
|
2410
|
-
}
|
2411
|
-
|
2412
|
-
/*
|
2413
|
-
* Choose the literal/match sequence to use for the current block. The basic
|
2414
|
-
* algorithm finds a minimum-cost path through the block's graph of
|
2415
|
-
* literal/match choices, given a cost model. However, the cost of each symbol
|
2416
|
-
* is unknown until the Huffman codes have been built, but at the same time the
|
2417
|
-
* Huffman codes depend on the frequencies of chosen symbols. Consequently,
|
2418
|
-
* multiple passes must be used to try to approximate an optimal solution. The
|
2419
|
-
* first pass uses default costs, mixed with the costs from the previous block
|
2420
|
-
* if any. Later passes use the Huffman codeword lengths from the previous pass
|
2421
|
-
* as the costs.
|
2422
|
-
*/
|
2423
|
-
static void
|
2424
|
-
deflate_optimize_block(struct libdeflate_compressor *c, u32 block_length,
|
2425
|
-
const struct lz_match *cache_ptr, bool is_first_block)
|
2426
|
-
{
|
2427
|
-
unsigned num_passes_remaining = c->p.n.num_optim_passes;
|
2428
|
-
u32 i;
|
2429
|
-
|
2430
|
-
/* Force the block to really end at the desired length, even if some
|
2431
|
-
* matches extend beyond it. */
|
2432
|
-
for (i = block_length; i <= MIN(block_length - 1 + DEFLATE_MAX_MATCH_LEN,
|
2433
|
-
ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++)
|
2434
|
-
c->p.n.optimum_nodes[i].cost_to_end = 0x80000000;
|
2435
|
-
|
2436
|
-
/* Set the initial costs. */
|
2437
|
-
if (is_first_block)
|
2438
|
-
deflate_set_default_costs(c);
|
2439
|
-
else
|
2440
|
-
deflate_adjust_costs(c);
|
2441
|
-
|
2442
|
-
for (;;) {
|
2443
|
-
/* Find the minimum cost path for this pass. */
|
2444
|
-
deflate_find_min_cost_path(c, block_length, cache_ptr);
|
2445
|
-
|
2446
|
-
/* Compute frequencies of the chosen symbols. */
|
2447
|
-
deflate_reset_symbol_frequencies(c);
|
2448
|
-
deflate_tally_item_list(c, block_length);
|
2449
|
-
|
2450
|
-
if (--num_passes_remaining == 0)
|
2451
|
-
break;
|
2452
|
-
|
2453
|
-
/* At least one optimization pass remains; update the costs. */
|
2454
|
-
deflate_make_huffman_codes(&c->freqs, &c->codes);
|
2455
|
-
deflate_set_costs_from_codes(c, &c->codes.lens);
|
2456
|
-
}
|
2457
|
-
}
|
2458
|
-
|
2459
|
-
/*
|
2460
|
-
* This is the "near-optimal" DEFLATE compressor. It computes the optimal
|
2461
|
-
* representation of each DEFLATE block using a minimum-cost path search over
|
2462
|
-
* the graph of possible match/literal choices for that block, assuming a
|
2463
|
-
* certain cost for each Huffman symbol.
|
2464
|
-
*
|
2465
|
-
* For several reasons, the end result is not guaranteed to be optimal:
|
2466
|
-
*
|
2467
|
-
* - Nonoptimal choice of blocks
|
2468
|
-
* - Heuristic limitations on which matches are actually considered
|
2469
|
-
* - Symbol costs are unknown until the symbols have already been chosen
|
2470
|
-
* (so iterative optimization must be used)
|
2471
|
-
*/
|
2472
|
-
static size_t
|
2473
|
-
deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
|
2474
|
-
const u8 * restrict in, size_t in_nbytes,
|
2475
|
-
u8 * restrict out, size_t out_nbytes_avail)
|
2476
|
-
{
|
2477
|
-
const u8 *in_next = in;
|
2478
|
-
const u8 *in_end = in_next + in_nbytes;
|
2479
|
-
struct deflate_output_bitstream os;
|
2480
|
-
const u8 *in_cur_base = in_next;
|
2481
|
-
const u8 *in_next_slide = in_next + MIN(in_end - in_next, MATCHFINDER_WINDOW_SIZE);
|
2482
|
-
unsigned max_len = DEFLATE_MAX_MATCH_LEN;
|
2483
|
-
unsigned nice_len = MIN(c->nice_match_length, max_len);
|
2484
|
-
u32 next_hashes[2] = {0, 0};
|
2485
|
-
|
2486
|
-
deflate_init_output(&os, out, out_nbytes_avail);
|
2487
|
-
bt_matchfinder_init(&c->p.n.bt_mf);
|
2488
|
-
|
2489
|
-
do {
|
2490
|
-
/* Starting a new DEFLATE block. */
|
2491
|
-
|
2492
|
-
struct lz_match *cache_ptr = c->p.n.match_cache;
|
2493
|
-
const u8 * const in_block_begin = in_next;
|
2494
|
-
const u8 * const in_max_block_end =
|
2495
|
-
in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
|
2496
|
-
const u8 *next_observation = in_next;
|
2497
|
-
|
2498
|
-
init_block_split_stats(&c->split_stats);
|
2499
|
-
|
2500
|
-
/*
|
2501
|
-
* Find matches until we decide to end the block. We end the
|
2502
|
-
* block if any of the following is true:
|
2503
|
-
*
|
2504
|
-
* (1) Maximum block length has been reached
|
2505
|
-
* (2) Match catch may overflow.
|
2506
|
-
* (3) Block split heuristic says to split now.
|
2507
|
-
*/
|
2508
|
-
do {
|
2509
|
-
struct lz_match *matches;
|
2510
|
-
unsigned best_len;
|
2511
|
-
|
2512
|
-
/* Slide the window forward if needed. */
|
2513
|
-
if (in_next == in_next_slide) {
|
2514
|
-
bt_matchfinder_slide_window(&c->p.n.bt_mf);
|
2515
|
-
in_cur_base = in_next;
|
2516
|
-
in_next_slide = in_next + MIN(in_end - in_next,
|
2517
|
-
MATCHFINDER_WINDOW_SIZE);
|
2518
|
-
}
|
2519
|
-
|
2520
|
-
/* Decrease the maximum and nice match lengths if we're
|
2521
|
-
* approaching the end of the input buffer. */
|
2522
|
-
if (unlikely(max_len > in_end - in_next)) {
|
2523
|
-
max_len = in_end - in_next;
|
2524
|
-
nice_len = MIN(nice_len, max_len);
|
2525
|
-
}
|
2526
|
-
|
2527
|
-
/*
|
2528
|
-
* Find matches with the current position using the
|
2529
|
-
* binary tree matchfinder and save them in
|
2530
|
-
* 'match_cache'.
|
2531
|
-
*
|
2532
|
-
* Note: the binary tree matchfinder is more suited for
|
2533
|
-
* optimal parsing than the hash chain matchfinder. The
|
2534
|
-
* reasons for this include:
|
2535
|
-
*
|
2536
|
-
* - The binary tree matchfinder can find more matches
|
2537
|
-
* in the same number of steps.
|
2538
|
-
* - One of the major advantages of hash chains is that
|
2539
|
-
* skipping positions (not searching for matches at
|
2540
|
-
* them) is faster; however, with optimal parsing we
|
2541
|
-
* search for matches at almost all positions, so this
|
2542
|
-
* advantage of hash chains is negated.
|
2543
|
-
*/
|
2544
|
-
matches = cache_ptr;
|
2545
|
-
best_len = 0;
|
2546
|
-
if (likely(max_len >= BT_MATCHFINDER_REQUIRED_NBYTES)) {
|
2547
|
-
cache_ptr = bt_matchfinder_get_matches(&c->p.n.bt_mf,
|
2548
|
-
in_cur_base,
|
2549
|
-
in_next - in_cur_base,
|
2550
|
-
max_len,
|
2551
|
-
nice_len,
|
2552
|
-
c->max_search_depth,
|
2553
|
-
next_hashes,
|
2554
|
-
&best_len,
|
2555
|
-
matches);
|
2556
|
-
}
|
2557
|
-
|
2558
|
-
if (in_next >= next_observation) {
|
2559
|
-
if (best_len >= 4) {
|
2560
|
-
observe_match(&c->split_stats, best_len);
|
2561
|
-
next_observation = in_next + best_len;
|
2562
|
-
} else {
|
2563
|
-
observe_literal(&c->split_stats, *in_next);
|
2564
|
-
next_observation = in_next + 1;
|
2565
|
-
}
|
2566
|
-
}
|
2567
|
-
|
2568
|
-
cache_ptr->length = cache_ptr - matches;
|
2569
|
-
cache_ptr->offset = *in_next;
|
2570
|
-
in_next++;
|
2571
|
-
cache_ptr++;
|
2572
|
-
|
2573
|
-
/*
|
2574
|
-
* If there was a very long match found, don't cache any
|
2575
|
-
* matches for the bytes covered by that match. This
|
2576
|
-
* avoids degenerate behavior when compressing highly
|
2577
|
-
* redundant data, where the number of matches can be
|
2578
|
-
* very large.
|
2579
|
-
*
|
2580
|
-
* This heuristic doesn't actually hurt the compression
|
2581
|
-
* ratio very much. If there's a long match, then the
|
2582
|
-
* data must be highly compressible, so it doesn't
|
2583
|
-
* matter much what we do.
|
2584
|
-
*/
|
2585
|
-
if (best_len >= DEFLATE_MIN_MATCH_LEN && best_len >= nice_len) {
|
2586
|
-
--best_len;
|
2587
|
-
do {
|
2588
|
-
if (in_next == in_next_slide) {
|
2589
|
-
bt_matchfinder_slide_window(&c->p.n.bt_mf);
|
2590
|
-
in_cur_base = in_next;
|
2591
|
-
in_next_slide = in_next + MIN(in_end - in_next,
|
2592
|
-
MATCHFINDER_WINDOW_SIZE);
|
2593
|
-
}
|
2594
|
-
if (unlikely(max_len > in_end - in_next)) {
|
2595
|
-
max_len = in_end - in_next;
|
2596
|
-
nice_len = MIN(nice_len, max_len);
|
2597
|
-
}
|
2598
|
-
if (max_len >= BT_MATCHFINDER_REQUIRED_NBYTES) {
|
2599
|
-
bt_matchfinder_skip_position(&c->p.n.bt_mf,
|
2600
|
-
in_cur_base,
|
2601
|
-
in_next - in_cur_base,
|
2602
|
-
nice_len,
|
2603
|
-
c->max_search_depth,
|
2604
|
-
next_hashes);
|
2605
|
-
}
|
2606
|
-
cache_ptr->length = 0;
|
2607
|
-
cache_ptr->offset = *in_next;
|
2608
|
-
in_next++;
|
2609
|
-
cache_ptr++;
|
2610
|
-
} while (--best_len);
|
2611
|
-
}
|
2612
|
-
} while (in_next < in_max_block_end &&
|
2613
|
-
cache_ptr < &c->p.n.match_cache[CACHE_LENGTH] &&
|
2614
|
-
!should_end_block(&c->split_stats, in_block_begin, in_next, in_end));
|
2615
|
-
|
2616
|
-
/* All the matches for this block have been cached. Now choose
|
2617
|
-
* the sequence of items to output and flush the block. */
|
2618
|
-
deflate_optimize_block(c, in_next - in_block_begin, cache_ptr,
|
2619
|
-
in_block_begin == in);
|
2620
|
-
deflate_flush_block(c, &os, in_block_begin, in_next - in_block_begin,
|
2621
|
-
in_next == in_end, true);
|
2622
|
-
} while (in_next != in_end);
|
2623
|
-
|
2624
|
-
return deflate_flush_output(&os);
|
2625
|
-
}
|
2626
|
-
|
2627
|
-
#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
|
2628
|
-
|
2629
|
-
/* Initialize c->offset_slot_fast. */
|
2630
|
-
static void
|
2631
|
-
deflate_init_offset_slot_fast(struct libdeflate_compressor *c)
|
2632
|
-
{
|
2633
|
-
unsigned offset_slot;
|
2634
|
-
unsigned offset;
|
2635
|
-
unsigned offset_end;
|
2636
|
-
|
2637
|
-
for (offset_slot = 0;
|
2638
|
-
offset_slot < ARRAY_LEN(deflate_offset_slot_base);
|
2639
|
-
offset_slot++)
|
2640
|
-
{
|
2641
|
-
offset = deflate_offset_slot_base[offset_slot];
|
2642
|
-
#if USE_FULL_OFFSET_SLOT_FAST
|
2643
|
-
offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
|
2644
|
-
do {
|
2645
|
-
c->offset_slot_fast[offset] = offset_slot;
|
2646
|
-
} while (++offset != offset_end);
|
2647
|
-
#else
|
2648
|
-
if (offset <= 256) {
|
2649
|
-
offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
|
2650
|
-
do {
|
2651
|
-
c->offset_slot_fast[offset - 1] = offset_slot;
|
2652
|
-
} while (++offset != offset_end);
|
2653
|
-
} else {
|
2654
|
-
offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
|
2655
|
-
do {
|
2656
|
-
c->offset_slot_fast[256 + ((offset - 1) >> 7)] = offset_slot;
|
2657
|
-
} while ((offset += (1 << 7)) != offset_end);
|
2658
|
-
}
|
2659
|
-
#endif
|
2660
|
-
}
|
2661
|
-
}
|
2662
|
-
|
2663
|
-
LIBDEFLATEAPI struct libdeflate_compressor *
|
2664
|
-
libdeflate_alloc_compressor(int compression_level)
|
2665
|
-
{
|
2666
|
-
struct libdeflate_compressor *c;
|
2667
|
-
size_t size;
|
2668
|
-
|
2669
|
-
#if SUPPORT_NEAR_OPTIMAL_PARSING
|
2670
|
-
if (compression_level >= 8)
|
2671
|
-
size = offsetof(struct libdeflate_compressor, p) + sizeof(c->p.n);
|
2672
|
-
else
|
2673
|
-
#endif
|
2674
|
-
size = offsetof(struct libdeflate_compressor, p) + sizeof(c->p.g);
|
2675
|
-
|
2676
|
-
c = aligned_malloc(MATCHFINDER_ALIGNMENT, size);
|
2677
|
-
if (!c)
|
2678
|
-
return NULL;
|
2679
|
-
|
2680
|
-
switch (compression_level) {
|
2681
|
-
case 1:
|
2682
|
-
c->impl = deflate_compress_greedy;
|
2683
|
-
c->max_search_depth = 2;
|
2684
|
-
c->nice_match_length = 8;
|
2685
|
-
break;
|
2686
|
-
case 2:
|
2687
|
-
c->impl = deflate_compress_greedy;
|
2688
|
-
c->max_search_depth = 6;
|
2689
|
-
c->nice_match_length = 10;
|
2690
|
-
break;
|
2691
|
-
case 3:
|
2692
|
-
c->impl = deflate_compress_greedy;
|
2693
|
-
c->max_search_depth = 12;
|
2694
|
-
c->nice_match_length = 14;
|
2695
|
-
break;
|
2696
|
-
case 4:
|
2697
|
-
c->impl = deflate_compress_greedy;
|
2698
|
-
c->max_search_depth = 24;
|
2699
|
-
c->nice_match_length = 24;
|
2700
|
-
break;
|
2701
|
-
case 5:
|
2702
|
-
c->impl = deflate_compress_lazy;
|
2703
|
-
c->max_search_depth = 20;
|
2704
|
-
c->nice_match_length = 30;
|
2705
|
-
break;
|
2706
|
-
case 6:
|
2707
|
-
c->impl = deflate_compress_lazy;
|
2708
|
-
c->max_search_depth = 40;
|
2709
|
-
c->nice_match_length = 65;
|
2710
|
-
break;
|
2711
|
-
case 7:
|
2712
|
-
c->impl = deflate_compress_lazy;
|
2713
|
-
c->max_search_depth = 100;
|
2714
|
-
c->nice_match_length = 130;
|
2715
|
-
break;
|
2716
|
-
#if SUPPORT_NEAR_OPTIMAL_PARSING
|
2717
|
-
case 8:
|
2718
|
-
c->impl = deflate_compress_near_optimal;
|
2719
|
-
c->max_search_depth = 12;
|
2720
|
-
c->nice_match_length = 20;
|
2721
|
-
c->p.n.num_optim_passes = 1;
|
2722
|
-
break;
|
2723
|
-
case 9:
|
2724
|
-
c->impl = deflate_compress_near_optimal;
|
2725
|
-
c->max_search_depth = 16;
|
2726
|
-
c->nice_match_length = 26;
|
2727
|
-
c->p.n.num_optim_passes = 2;
|
2728
|
-
break;
|
2729
|
-
case 10:
|
2730
|
-
c->impl = deflate_compress_near_optimal;
|
2731
|
-
c->max_search_depth = 30;
|
2732
|
-
c->nice_match_length = 50;
|
2733
|
-
c->p.n.num_optim_passes = 2;
|
2734
|
-
break;
|
2735
|
-
case 11:
|
2736
|
-
c->impl = deflate_compress_near_optimal;
|
2737
|
-
c->max_search_depth = 60;
|
2738
|
-
c->nice_match_length = 80;
|
2739
|
-
c->p.n.num_optim_passes = 3;
|
2740
|
-
break;
|
2741
|
-
case 12:
|
2742
|
-
c->impl = deflate_compress_near_optimal;
|
2743
|
-
c->max_search_depth = 100;
|
2744
|
-
c->nice_match_length = 133;
|
2745
|
-
c->p.n.num_optim_passes = 4;
|
2746
|
-
break;
|
2747
|
-
#else
|
2748
|
-
case 8:
|
2749
|
-
c->impl = deflate_compress_lazy;
|
2750
|
-
c->max_search_depth = 150;
|
2751
|
-
c->nice_match_length = 200;
|
2752
|
-
break;
|
2753
|
-
case 9:
|
2754
|
-
c->impl = deflate_compress_lazy;
|
2755
|
-
c->max_search_depth = 200;
|
2756
|
-
c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
|
2757
|
-
break;
|
2758
|
-
#endif
|
2759
|
-
default:
|
2760
|
-
aligned_free(c);
|
2761
|
-
return NULL;
|
2762
|
-
}
|
2763
|
-
|
2764
|
-
c->compression_level = compression_level;
|
2765
|
-
|
2766
|
-
deflate_init_offset_slot_fast(c);
|
2767
|
-
deflate_init_static_codes(c);
|
2768
|
-
|
2769
|
-
return c;
|
2770
|
-
}
|
2771
|
-
|
2772
|
-
LIBDEFLATEAPI size_t
|
2773
|
-
libdeflate_deflate_compress(struct libdeflate_compressor *c,
|
2774
|
-
const void *in, size_t in_nbytes,
|
2775
|
-
void *out, size_t out_nbytes_avail)
|
2776
|
-
{
|
2777
|
-
if (unlikely(out_nbytes_avail < MIN_OUTPUT_SIZE))
|
2778
|
-
return 0;
|
2779
|
-
|
2780
|
-
/* For extremely small inputs just use a single uncompressed block. */
|
2781
|
-
if (unlikely(in_nbytes < 16)) {
|
2782
|
-
struct deflate_output_bitstream os;
|
2783
|
-
deflate_init_output(&os, out, out_nbytes_avail);
|
2784
|
-
if (in_nbytes == 0)
|
2785
|
-
in = &os; /* Avoid passing NULL to memcpy() */
|
2786
|
-
deflate_write_uncompressed_block(&os, in, in_nbytes, true);
|
2787
|
-
return deflate_flush_output(&os);
|
2788
|
-
}
|
2789
|
-
|
2790
|
-
return (*c->impl)(c, in, in_nbytes, out, out_nbytes_avail);
|
2791
|
-
}
|
2792
|
-
|
2793
|
-
LIBDEFLATEAPI void
|
2794
|
-
libdeflate_free_compressor(struct libdeflate_compressor *c)
|
2795
|
-
{
|
2796
|
-
aligned_free(c);
|
2797
|
-
}
|
2798
|
-
|
2799
|
-
unsigned int
|
2800
|
-
deflate_get_compression_level(struct libdeflate_compressor *c)
|
2801
|
-
{
|
2802
|
-
return c->compression_level;
|
2803
|
-
}
|
2804
|
-
|
2805
|
-
LIBDEFLATEAPI size_t
|
2806
|
-
libdeflate_deflate_compress_bound(struct libdeflate_compressor *c,
|
2807
|
-
size_t in_nbytes)
|
2808
|
-
{
|
2809
|
-
/*
|
2810
|
-
* The worst case is all uncompressed blocks where one block has length
|
2811
|
-
* <= MIN_BLOCK_LENGTH and the others have length MIN_BLOCK_LENGTH.
|
2812
|
-
* Each uncompressed block has 5 bytes of overhead: 1 for BFINAL, BTYPE,
|
2813
|
-
* and alignment to a byte boundary; 2 for LEN; and 2 for NLEN.
|
2814
|
-
*/
|
2815
|
-
size_t max_num_blocks = MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1);
|
2816
|
-
return (5 * max_num_blocks) + in_nbytes + 1 + MIN_OUTPUT_SIZE;
|
2817
|
-
}
|