libdeflate 0.1.1 → 0.2.0

Files changed (79)
  1. checksums.yaml +5 -5
  2. data/.github/workflows/test.yml +34 -0
  3. data/README.md +1 -6
  4. data/ext/libdeflate/extconf.rb +18 -7
  5. data/ext/libdeflate/libdeflate_ext.c +17 -17
  6. data/lib/libdeflate/version.rb +1 -1
  7. data/libdeflate.gemspec +2 -1
  8. metadata +13 -84
  9. data/.gitmodules +0 -3
  10. data/.travis.yml +0 -5
  11. data/ext/libdeflate/libdeflate/.gitignore +0 -19
  12. data/ext/libdeflate/libdeflate/COPYING +0 -21
  13. data/ext/libdeflate/libdeflate/Makefile +0 -231
  14. data/ext/libdeflate/libdeflate/Makefile.msc +0 -64
  15. data/ext/libdeflate/libdeflate/NEWS +0 -57
  16. data/ext/libdeflate/libdeflate/README.md +0 -170
  17. data/ext/libdeflate/libdeflate/common/common_defs.h +0 -351
  18. data/ext/libdeflate/libdeflate/common/compiler_gcc.h +0 -134
  19. data/ext/libdeflate/libdeflate/common/compiler_msc.h +0 -95
  20. data/ext/libdeflate/libdeflate/lib/adler32.c +0 -213
  21. data/ext/libdeflate/libdeflate/lib/adler32_impl.h +0 -281
  22. data/ext/libdeflate/libdeflate/lib/aligned_malloc.c +0 -57
  23. data/ext/libdeflate/libdeflate/lib/aligned_malloc.h +0 -13
  24. data/ext/libdeflate/libdeflate/lib/bt_matchfinder.h +0 -357
  25. data/ext/libdeflate/libdeflate/lib/crc32.c +0 -368
  26. data/ext/libdeflate/libdeflate/lib/crc32_impl.h +0 -286
  27. data/ext/libdeflate/libdeflate/lib/crc32_table.h +0 -526
  28. data/ext/libdeflate/libdeflate/lib/decompress_impl.h +0 -404
  29. data/ext/libdeflate/libdeflate/lib/deflate_compress.c +0 -2817
  30. data/ext/libdeflate/libdeflate/lib/deflate_compress.h +0 -14
  31. data/ext/libdeflate/libdeflate/lib/deflate_constants.h +0 -66
  32. data/ext/libdeflate/libdeflate/lib/deflate_decompress.c +0 -889
  33. data/ext/libdeflate/libdeflate/lib/gzip_compress.c +0 -95
  34. data/ext/libdeflate/libdeflate/lib/gzip_constants.h +0 -45
  35. data/ext/libdeflate/libdeflate/lib/gzip_decompress.c +0 -130
  36. data/ext/libdeflate/libdeflate/lib/hc_matchfinder.h +0 -405
  37. data/ext/libdeflate/libdeflate/lib/lib_common.h +0 -35
  38. data/ext/libdeflate/libdeflate/lib/matchfinder_avx2.h +0 -53
  39. data/ext/libdeflate/libdeflate/lib/matchfinder_common.h +0 -205
  40. data/ext/libdeflate/libdeflate/lib/matchfinder_neon.h +0 -61
  41. data/ext/libdeflate/libdeflate/lib/matchfinder_sse2.h +0 -53
  42. data/ext/libdeflate/libdeflate/lib/unaligned.h +0 -202
  43. data/ext/libdeflate/libdeflate/lib/x86_cpu_features.c +0 -169
  44. data/ext/libdeflate/libdeflate/lib/x86_cpu_features.h +0 -48
  45. data/ext/libdeflate/libdeflate/lib/zlib_compress.c +0 -87
  46. data/ext/libdeflate/libdeflate/lib/zlib_constants.h +0 -21
  47. data/ext/libdeflate/libdeflate/lib/zlib_decompress.c +0 -91
  48. data/ext/libdeflate/libdeflate/libdeflate.h +0 -274
  49. data/ext/libdeflate/libdeflate/programs/benchmark.c +0 -558
  50. data/ext/libdeflate/libdeflate/programs/checksum.c +0 -197
  51. data/ext/libdeflate/libdeflate/programs/detect.sh +0 -62
  52. data/ext/libdeflate/libdeflate/programs/gzip.c +0 -603
  53. data/ext/libdeflate/libdeflate/programs/prog_util.c +0 -530
  54. data/ext/libdeflate/libdeflate/programs/prog_util.h +0 -162
  55. data/ext/libdeflate/libdeflate/programs/test_checksums.c +0 -135
  56. data/ext/libdeflate/libdeflate/programs/tgetopt.c +0 -118
  57. data/ext/libdeflate/libdeflate/tools/afl-fuzz/Makefile +0 -12
  58. data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_compress/fuzz.c +0 -40
  59. data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_compress/inputs/0 +0 -0
  60. data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_decompress/fuzz.c +0 -28
  61. data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_decompress/inputs/0 +0 -3
  62. data/ext/libdeflate/libdeflate/tools/afl-fuzz/gzip_decompress/fuzz.c +0 -28
  63. data/ext/libdeflate/libdeflate/tools/afl-fuzz/gzip_decompress/inputs/0 +0 -0
  64. data/ext/libdeflate/libdeflate/tools/afl-fuzz/prepare_for_fuzz.sh +0 -14
  65. data/ext/libdeflate/libdeflate/tools/afl-fuzz/zlib_decompress/fuzz.c +0 -28
  66. data/ext/libdeflate/libdeflate/tools/afl-fuzz/zlib_decompress/inputs/0 +0 -3
  67. data/ext/libdeflate/libdeflate/tools/android_build.sh +0 -104
  68. data/ext/libdeflate/libdeflate/tools/checksum_benchmarks.sh +0 -76
  69. data/ext/libdeflate/libdeflate/tools/exec_tests.sh +0 -30
  70. data/ext/libdeflate/libdeflate/tools/gen_crc32_multipliers.c +0 -108
  71. data/ext/libdeflate/libdeflate/tools/gen_crc32_table.c +0 -100
  72. data/ext/libdeflate/libdeflate/tools/gzip_tests.sh +0 -412
  73. data/ext/libdeflate/libdeflate/tools/make-windows-releases +0 -21
  74. data/ext/libdeflate/libdeflate/tools/mips_build.sh +0 -9
  75. data/ext/libdeflate/libdeflate/tools/msc_test.bat +0 -3
  76. data/ext/libdeflate/libdeflate/tools/pgo_build.sh +0 -23
  77. data/ext/libdeflate/libdeflate/tools/produce_gzip_benchmark_table.sh +0 -37
  78. data/ext/libdeflate/libdeflate/tools/run_tests.sh +0 -305
  79. data/ext/libdeflate/libdeflate/tools/windows_build.sh +0 -10
@@ -1,2817 +0,0 @@
1
- /*
2
- * deflate_compress.c - a compressor for DEFLATE
3
- *
4
- * Originally public domain; changes after 2016-09-07 are copyrighted.
5
- *
6
- * Copyright 2016 Eric Biggers
7
- *
8
- * Permission is hereby granted, free of charge, to any person
9
- * obtaining a copy of this software and associated documentation
10
- * files (the "Software"), to deal in the Software without
11
- * restriction, including without limitation the rights to use,
12
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
13
- * copies of the Software, and to permit persons to whom the
14
- * Software is furnished to do so, subject to the following
15
- * conditions:
16
- *
17
- * The above copyright notice and this permission notice shall be
18
- * included in all copies or substantial portions of the Software.
19
- *
20
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27
- * OTHER DEALINGS IN THE SOFTWARE.
28
- */
29
-
30
- #include <stdlib.h>
31
- #include <string.h>
32
-
33
- #include "aligned_malloc.h"
34
- #include "deflate_compress.h"
35
- #include "deflate_constants.h"
36
- #include "unaligned.h"
37
-
38
- #include "libdeflate.h"
39
-
40
- /*
41
- * By default, the near-optimal parsing algorithm is enabled at compression
42
- * level 8 and above. The near-optimal parsing algorithm produces a compression
43
- * ratio significantly better than the greedy and lazy algorithms implemented
44
- * here, and also the algorithm used by zlib at level 9. However, it is slow.
45
- */
46
- #define SUPPORT_NEAR_OPTIMAL_PARSING 1
47
-
48
- /*
49
- * Define to 1 to maintain the full map from match offsets to offset slots.
50
- * This slightly speeds up translations of match offsets to offset slots, but it
51
- * uses 32769 bytes of memory rather than the 512 bytes used by the condensed
52
- * map. The speedup provided by the larger map is most helpful when the
53
- * near-optimal parsing algorithm is being used.
54
- */
55
- #define USE_FULL_OFFSET_SLOT_FAST SUPPORT_NEAR_OPTIMAL_PARSING
56
-
57
- /*
58
- * DEFLATE uses a 32768 byte sliding window; set the matchfinder parameters
59
- * appropriately.
60
- */
61
- #define MATCHFINDER_WINDOW_ORDER 15
62
-
63
- #include "hc_matchfinder.h"
64
- #if SUPPORT_NEAR_OPTIMAL_PARSING
65
- # include "bt_matchfinder.h"
66
- #endif
67
-
68
- /*
69
- * The compressor always chooses a block of at least MIN_BLOCK_LENGTH bytes,
70
- * except if the last block has to be shorter.
71
- */
72
- #define MIN_BLOCK_LENGTH 10000
73
-
74
- /*
75
- * The compressor attempts to end blocks after SOFT_MAX_BLOCK_LENGTH bytes, but
76
- * the final length might be slightly longer due to matches extending beyond
77
- * this limit.
78
- */
79
- #define SOFT_MAX_BLOCK_LENGTH 300000
80
-
81
- /*
82
- * The number of observed matches or literals that represents sufficient data to
83
- * decide whether the current block should be terminated or not.
84
- */
85
- #define NUM_OBSERVATIONS_PER_BLOCK_CHECK 512
86
-
87
-
88
- #if SUPPORT_NEAR_OPTIMAL_PARSING
89
- /* Constants specific to the near-optimal parsing algorithm */
90
-
91
- /*
92
- * The maximum number of matches the matchfinder can find at a single position.
93
- * Since the matchfinder never finds more than one match for the same length,
94
- * presuming one of each possible length is sufficient for an upper bound.
95
- * (This says nothing about whether it is worthwhile to consider so many
96
- * matches; this is just defining the worst case.)
97
- */
98
- # define MAX_MATCHES_PER_POS (DEFLATE_MAX_MATCH_LEN - DEFLATE_MIN_MATCH_LEN + 1)
99
-
100
- /*
101
- * The number of lz_match structures in the match cache, excluding the extra
102
- * "overflow" entries. This value should be high enough so that nearly the
103
- * time, all matches found in a given block can fit in the match cache.
104
- * However, fallback behavior (immediately terminating the block) on cache
105
- * overflow is still required.
106
- */
107
- # define CACHE_LENGTH (SOFT_MAX_BLOCK_LENGTH * 5)
108
-
109
- #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
110
-
111
- /*
112
- * These are the compressor-side limits on the codeword lengths for each Huffman
113
- * code. To make outputting bits slightly faster, some of these limits are
114
- * lower than the limits defined by the DEFLATE format. This does not
115
- * significantly affect the compression ratio, at least for the block lengths we
116
- * use.
117
- */
118
- #define MAX_LITLEN_CODEWORD_LEN 14
119
- #define MAX_OFFSET_CODEWORD_LEN DEFLATE_MAX_OFFSET_CODEWORD_LEN
120
- #define MAX_PRE_CODEWORD_LEN DEFLATE_MAX_PRE_CODEWORD_LEN
121
-
122
- /* Table: length slot => length slot base value */
123
- static const unsigned deflate_length_slot_base[] = {
124
- 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 ,
125
- 11 , 13 , 15 , 17 , 19 , 23 , 27 , 31 ,
126
- 35 , 43 , 51 , 59 , 67 , 83 , 99 , 115 ,
127
- 131 , 163 , 195 , 227 , 258 ,
128
- };
129
-
130
- /* Table: length slot => number of extra length bits */
131
- static const u8 deflate_extra_length_bits[] = {
132
- 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
133
- 1 , 1 , 1 , 1 , 2 , 2 , 2 , 2 ,
134
- 3 , 3 , 3 , 3 , 4 , 4 , 4 , 4 ,
135
- 5 , 5 , 5 , 5 , 0 ,
136
- };
137
-
138
- /* Table: offset slot => offset slot base value */
139
- static const unsigned deflate_offset_slot_base[] = {
140
- 1 , 2 , 3 , 4 , 5 , 7 , 9 , 13 ,
141
- 17 , 25 , 33 , 49 , 65 , 97 , 129 , 193 ,
142
- 257 , 385 , 513 , 769 , 1025 , 1537 , 2049 , 3073 ,
143
- 4097 , 6145 , 8193 , 12289 , 16385 , 24577 ,
144
- };
145
-
146
- /* Table: offset slot => number of extra offset bits */
147
- static const u8 deflate_extra_offset_bits[] = {
148
- 0 , 0 , 0 , 0 , 1 , 1 , 2 , 2 ,
149
- 3 , 3 , 4 , 4 , 5 , 5 , 6 , 6 ,
150
- 7 , 7 , 8 , 8 , 9 , 9 , 10 , 10 ,
151
- 11 , 11 , 12 , 12 , 13 , 13 ,
152
- };
153
-
154
- /* Table: length => length slot */
155
- static const u8 deflate_length_slot[DEFLATE_MAX_MATCH_LEN + 1] = {
156
- 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12,
157
- 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16,
158
- 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18,
159
- 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20,
160
- 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
161
- 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
162
- 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
163
- 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
164
- 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25,
165
- 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
166
- 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26,
167
- 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
168
- 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
169
- 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
170
- 27, 27, 28,
171
- };
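Worked example (for orientation, derived from the tables above): a match of length 100 falls in length slot 22, whose base is 99 with 4 extra bits, so it is coded as the slot-22 symbol followed by the extra-bits value 100 - 99 = 1. Likewise, an offset of 3000 falls in offset slot 22, whose base is 2049 with 10 extra bits, giving an extra-bits value of 951.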
172
-
173
- /* The order in which precode codeword lengths are stored */
174
- static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
175
- 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
176
- };
177
-
178
- /* Codewords for the DEFLATE Huffman codes. */
179
- struct deflate_codewords {
180
- u32 litlen[DEFLATE_NUM_LITLEN_SYMS];
181
- u32 offset[DEFLATE_NUM_OFFSET_SYMS];
182
- };
183
-
184
- /* Codeword lengths (in bits) for the DEFLATE Huffman codes.
185
- * A zero length means the corresponding symbol had zero frequency. */
186
- struct deflate_lens {
187
- u8 litlen[DEFLATE_NUM_LITLEN_SYMS];
188
- u8 offset[DEFLATE_NUM_OFFSET_SYMS];
189
- };
190
-
191
- /* Codewords and lengths for the DEFLATE Huffman codes. */
192
- struct deflate_codes {
193
- struct deflate_codewords codewords;
194
- struct deflate_lens lens;
195
- };
196
-
197
- /* Symbol frequency counters for the DEFLATE Huffman codes. */
198
- struct deflate_freqs {
199
- u32 litlen[DEFLATE_NUM_LITLEN_SYMS];
200
- u32 offset[DEFLATE_NUM_OFFSET_SYMS];
201
- };
202
-
203
- #if SUPPORT_NEAR_OPTIMAL_PARSING
204
-
205
- /* Costs for the near-optimal parsing algorithm. */
206
- struct deflate_costs {
207
-
208
- /* The cost to output each possible literal. */
209
- u32 literal[DEFLATE_NUM_LITERALS];
210
-
211
- /* The cost to output each possible match length. */
212
- u32 length[DEFLATE_MAX_MATCH_LEN + 1];
213
-
214
- /* The cost to output a match offset of each possible offset slot. */
215
- u32 offset_slot[DEFLATE_NUM_OFFSET_SYMS];
216
- };
217
-
218
- /*
219
- * COST_SHIFT is a scaling factor that makes it possible to consider fractional
220
- * bit costs. A token requiring 'n' bits to represent has cost n << COST_SHIFT.
221
- *
222
- * Note: this is only useful as a statistical trick for when the true costs are
223
- * unknown. In reality, each token in DEFLATE requires a whole number of bits
224
- * to output.
225
- */
226
- #define COST_SHIFT 3
227
-
228
- /*
229
- * The NOSTAT_BITS value for a given alphabet is the number of bits assumed to
230
- * be needed to output a symbol that was unused in the previous optimization
231
- * pass. Assigning a default cost allows the symbol to be used in the next
232
- * optimization pass. However, the cost should be relatively high because the
233
- * symbol probably won't be used very many times (if at all).
234
- */
235
- #define LITERAL_NOSTAT_BITS 13
236
- #define LENGTH_NOSTAT_BITS 13
237
- #define OFFSET_NOSTAT_BITS 10
238
-
239
- #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
240
-
241
- /*
242
- * Represents a run of literals followed by a match or end-of-block. This
243
- * struct is needed to temporarily store items chosen by the parser, since items
244
- * cannot be written until all items for the block have been chosen and the
245
- * block's Huffman codes have been computed.
246
- */
247
- struct deflate_sequence {
248
-
249
- /* Bits 0..22: the number of literals in this run. This may be 0 and
250
- * can be at most about SOFT_MAX_BLOCK_LENGTH. The literals are not
251
- * stored explicitly in this structure; instead, they are read directly
252
- * from the uncompressed data.
253
- *
254
- * Bits 23..31: the length of the match which follows the literals, or 0
255
- * if this literal run was the last in the block, so there is no match
256
- * which follows it. */
257
- u32 litrunlen_and_length;
258
-
259
- /* If 'length' doesn't indicate end-of-block, then this is the offset of
260
- * the match which follows the literals. */
261
- u16 offset;
262
-
263
- /* If 'length' doesn't indicate end-of-block, then this is the offset
264
- * symbol of the match which follows the literals. */
265
- u8 offset_symbol;
266
-
267
- /* If 'length' doesn't indicate end-of-block, then this is the length
268
- * slot of the match which follows the literals. */
269
- u8 length_slot;
270
- };
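The bit layout described in the comment above is easy to check in isolation. A minimal standalone sketch follows (not part of the removed file; the macro names here are invented for illustration, only the field layout comes from the comment):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical names; bits 0..22 hold the literal run length and
 * bits 23..31 hold the match length, per the comment above. */
#define SEQ_LENGTH_SHIFT   23
#define SEQ_LITRUNLEN_MASK (((uint32_t)1 << SEQ_LENGTH_SHIFT) - 1)

int main(void)
{
    /* A run of 1000 literals followed by a match of length 258. */
    uint32_t packed = ((uint32_t)258 << SEQ_LENGTH_SHIFT) | 1000;

    printf("litrunlen = %u, length = %u\n",
           (unsigned)(packed & SEQ_LITRUNLEN_MASK),
           (unsigned)(packed >> SEQ_LENGTH_SHIFT));
    return 0;
}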
271
-
272
- #if SUPPORT_NEAR_OPTIMAL_PARSING
273
-
274
- /*
275
- * This structure represents a byte position in the input data and a node in the
276
- * graph of possible match/literal choices for the current block.
277
- *
278
- * Logically, each incoming edge to this node is labeled with a literal or a
279
- * match that can be taken to reach this position from an earlier position; and
280
- * each outgoing edge from this node is labeled with a literal or a match that
281
- * can be taken to advance from this position to a later position.
282
- *
283
- * But these "edges" are actually stored elsewhere (in 'match_cache'). Here we
284
- * associate with each node just two pieces of information:
285
- *
286
- * 'cost_to_end' is the minimum cost to reach the end of the block from
287
- * this position.
288
- *
289
- * 'item' represents the literal or match that must be chosen from here to
290
- * reach the end of the block with the minimum cost. Equivalently, this
291
- * can be interpreted as the label of the outgoing edge on the minimum-cost
292
- * path to the "end of block" node from this node.
293
- */
294
- struct deflate_optimum_node {
295
-
296
- u32 cost_to_end;
297
-
298
- /*
299
- * Notes on the match/literal representation used here:
300
- *
301
- * The low bits of 'item' are the length: 1 if this is a literal,
302
- * or the match length if this is a match.
303
- *
304
- * The high bits of 'item' are the actual literal byte if this is a
305
- * literal, or the match offset if this is a match.
306
- */
307
- #define OPTIMUM_OFFSET_SHIFT 9
308
- #define OPTIMUM_LEN_MASK (((u32)1 << OPTIMUM_OFFSET_SHIFT) - 1)
309
- u32 item;
310
-
311
- };
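Worked example of the 'item' encoding: a literal byte 0x41 is stored as (0x41 << OPTIMUM_OFFSET_SHIFT) | 1, while a match of length 50 at offset 3000 is stored as (3000 << OPTIMUM_OFFSET_SHIFT) | 50.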
312
-
313
- #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
314
-
315
- /* Block split statistics. See "Block splitting algorithm" below. */
316
- #define NUM_LITERAL_OBSERVATION_TYPES 8
317
- #define NUM_MATCH_OBSERVATION_TYPES 2
318
- #define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + NUM_MATCH_OBSERVATION_TYPES)
319
- struct block_split_stats {
320
- u32 new_observations[NUM_OBSERVATION_TYPES];
321
- u32 observations[NUM_OBSERVATION_TYPES];
322
- u32 num_new_observations;
323
- u32 num_observations;
324
- };
325
-
326
- /* The main DEFLATE compressor structure */
327
- struct libdeflate_compressor {
328
-
329
- /* Pointer to the compress() implementation chosen at allocation time */
330
- size_t (*impl)(struct libdeflate_compressor *,
331
- const u8 *, size_t, u8 *, size_t);
332
-
333
- /* Frequency counters for the current block */
334
- struct deflate_freqs freqs;
335
-
336
- /* Dynamic Huffman codes for the current block */
337
- struct deflate_codes codes;
338
-
339
- /* Static Huffman codes */
340
- struct deflate_codes static_codes;
341
-
342
- /* Block split statistics for the currently pending block */
343
- struct block_split_stats split_stats;
344
-
345
- /* A table for fast lookups of offset slot by match offset.
346
- *
347
- * If the full table is being used, it is a direct mapping from offset
348
- * to offset slot.
349
- *
350
- * If the condensed table is being used, the first 256 entries map
351
- * directly to the offset slots of offsets 1 through 256. The next 256
352
- * entries map to the offset slots for the remaining offsets, stepping
353
- * through the offsets with a stride of 128. This relies on the fact
354
- * that each of the remaining offset slots contains at least 128 offsets
355
- * and has an offset base that is a multiple of 128. */
356
- #if USE_FULL_OFFSET_SLOT_FAST
357
- u8 offset_slot_fast[DEFLATE_MAX_MATCH_OFFSET + 1];
358
- #else
359
- u8 offset_slot_fast[512];
360
- #endif
361
-
362
- /* The "nice" match length: if a match of this length is found, choose
363
- * it immediately without further consideration. */
364
- unsigned nice_match_length;
365
-
366
- /* The maximum search depth: consider at most this many potential
367
- * matches at each position. */
368
- unsigned max_search_depth;
369
-
370
- /* The compression level with which this compressor was created. */
371
- unsigned compression_level;
372
-
373
- /* Temporary space for Huffman code output */
374
- u32 precode_freqs[DEFLATE_NUM_PRECODE_SYMS];
375
- u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS];
376
- u32 precode_codewords[DEFLATE_NUM_PRECODE_SYMS];
377
- unsigned precode_items[DEFLATE_NUM_LITLEN_SYMS + DEFLATE_NUM_OFFSET_SYMS];
378
- unsigned num_litlen_syms;
379
- unsigned num_offset_syms;
380
- unsigned num_explicit_lens;
381
- unsigned num_precode_items;
382
-
383
- union {
384
- /* Data for greedy or lazy parsing */
385
- struct {
386
- /* Hash chain matchfinder */
387
- struct hc_matchfinder hc_mf;
388
-
389
- /* The matches and literals that the parser has chosen
390
- * for the current block. The required length of this
391
- * array is limited by the maximum number of matches
392
- * that can ever be chosen for a single block, plus one
393
- * for the special entry at the end. */
394
- struct deflate_sequence sequences[
395
- DIV_ROUND_UP(SOFT_MAX_BLOCK_LENGTH,
396
- DEFLATE_MIN_MATCH_LEN) + 1];
397
- } g; /* (g)reedy */
398
-
399
- #if SUPPORT_NEAR_OPTIMAL_PARSING
400
- /* Data for near-optimal parsing */
401
- struct {
402
-
403
- /* Binary tree matchfinder */
404
- struct bt_matchfinder bt_mf;
405
-
406
- /*
407
- * Cached matches for the current block. This array
408
- * contains the matches that were found at each position
409
- * in the block. Specifically, for each position, there
410
- * is a list of matches found at that position, if any,
411
- * sorted by strictly increasing length. In addition,
412
- * following the matches for each position, there is a
413
- * special 'struct lz_match' whose 'length' member
414
- * contains the number of matches found at that
415
- * position, and whose 'offset' member contains the
416
- * literal at that position.
417
- *
418
- * Note: in rare cases, there will be a very high number
419
- * of matches in the block and this array will overflow.
420
- * If this happens, we force the end of the current
421
- * block. CACHE_LENGTH is the length at which we
422
- * actually check for overflow. The extra slots beyond
423
- * this are enough to absorb the worst case overflow,
424
- * which occurs if starting at &match_cache[CACHE_LENGTH
425
- * - 1], we write MAX_MATCHES_PER_POS matches and a
426
- * match count header, then skip searching for matches
427
- * at 'DEFLATE_MAX_MATCH_LEN - 1' positions and write
428
- * the match count header for each.
429
- */
430
- struct lz_match match_cache[CACHE_LENGTH +
431
- MAX_MATCHES_PER_POS +
432
- DEFLATE_MAX_MATCH_LEN - 1];
433
-
434
- /*
435
- * Array of nodes, one per position, for running the
436
- * minimum-cost path algorithm.
437
- *
438
- * This array must be large enough to accommodate the
439
- * worst-case number of nodes, which occurs if we find a
440
- * match of length DEFLATE_MAX_MATCH_LEN at position
441
- * SOFT_MAX_BLOCK_LENGTH - 1, producing a block of
442
- * length SOFT_MAX_BLOCK_LENGTH - 1 +
443
- * DEFLATE_MAX_MATCH_LEN. Add one for the end-of-block
444
- * node.
445
- */
446
- struct deflate_optimum_node optimum_nodes[SOFT_MAX_BLOCK_LENGTH - 1 +
447
- DEFLATE_MAX_MATCH_LEN + 1];
448
-
449
- /* The current cost model being used. */
450
- struct deflate_costs costs;
451
-
452
- unsigned num_optim_passes;
453
- } n; /* (n)ear-optimal */
454
- #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
455
-
456
- } p; /* (p)arser */
457
- };
458
-
459
- /*
460
- * The type for the bitbuffer variable, which temporarily holds bits that are
461
- * being packed into bytes and written to the output buffer. For best
462
- * performance, this should have size equal to a machine word.
463
- */
464
- typedef machine_word_t bitbuf_t;
465
- #define BITBUF_NBITS (8 * sizeof(bitbuf_t))
466
-
467
- /* Can the specified number of bits always be added to 'bitbuf' after any
468
- * pending bytes have been flushed? */
469
- #define CAN_BUFFER(n) ((n) <= BITBUF_NBITS - 7)
470
-
471
- /*
472
- * Structure to keep track of the current state of sending bits to the
473
- * compressed output buffer.
474
- */
475
- struct deflate_output_bitstream {
476
-
477
- /* Bits that haven't yet been written to the output buffer. */
478
- bitbuf_t bitbuf;
479
-
480
- /* Number of bits currently held in @bitbuf. */
481
- unsigned bitcount;
482
-
483
- /* Pointer to the beginning of the output buffer. */
484
- u8 *begin;
485
-
486
- /* Pointer to the position in the output buffer at which the next byte
487
- * should be written. */
488
- u8 *next;
489
-
490
- /* Pointer just past the end of the output buffer. */
491
- u8 *end;
492
- };
493
-
494
- #define MIN_OUTPUT_SIZE (UNALIGNED_ACCESS_IS_FAST ? sizeof(bitbuf_t) : 1)
495
-
496
- /* Initialize the output bitstream. 'size' is assumed to be at least
497
- * MIN_OUTPUT_SIZE. */
498
- static void
499
- deflate_init_output(struct deflate_output_bitstream *os,
500
- void *buffer, size_t size)
501
- {
502
- os->bitbuf = 0;
503
- os->bitcount = 0;
504
- os->begin = buffer;
505
- os->next = os->begin;
506
- os->end = os->begin + size - MIN_OUTPUT_SIZE;
507
- }
508
-
509
- /* Add some bits to the bitbuffer variable of the output bitstream. The caller
510
- * must make sure there is enough room. */
511
- static forceinline void
512
- deflate_add_bits(struct deflate_output_bitstream *os,
513
- const bitbuf_t bits, const unsigned num_bits)
514
- {
515
- os->bitbuf |= bits << os->bitcount;
516
- os->bitcount += num_bits;
517
- }
518
-
519
- /* Flush bits from the bitbuffer variable to the output buffer. */
520
- static forceinline void
521
- deflate_flush_bits(struct deflate_output_bitstream *os)
522
- {
523
- if (UNALIGNED_ACCESS_IS_FAST) {
524
- /* Flush a whole word (branchlessly). */
525
- put_unaligned_leword(os->bitbuf, os->next);
526
- os->bitbuf >>= os->bitcount & ~7;
527
- os->next += MIN(os->end - os->next, os->bitcount >> 3);
528
- os->bitcount &= 7;
529
- } else {
530
- /* Flush a byte at a time. */
531
- while (os->bitcount >= 8) {
532
- *os->next = os->bitbuf;
533
- if (os->next != os->end)
534
- os->next++;
535
- os->bitcount -= 8;
536
- os->bitbuf >>= 8;
537
- }
538
- }
539
- }
540
-
541
- /* Align the bitstream on a byte boundary. */
542
- static forceinline void
543
- deflate_align_bitstream(struct deflate_output_bitstream *os)
544
- {
545
- os->bitcount += -os->bitcount & 7;
546
- deflate_flush_bits(os);
547
- }
548
-
549
- /*
550
- * Flush any remaining bits to the output buffer if needed. Return the total
551
- * number of bytes written to the output buffer, or 0 if an overflow occurred.
552
- */
553
- static u32
554
- deflate_flush_output(struct deflate_output_bitstream *os)
555
- {
556
- if (os->next == os->end) /* overflow? */
557
- return 0;
558
-
559
- while ((int)os->bitcount > 0) {
560
- *os->next++ = os->bitbuf;
561
- os->bitcount -= 8;
562
- os->bitbuf >>= 8;
563
- }
564
-
565
- return os->next - os->begin;
566
- }
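As a concrete illustration of this scheme, here is a minimal standalone sketch (not taken from the removed file) of the same LSB-first buffering: new bits are OR'ed in above the bits already pending, and completed bytes are flushed from the low end, as in the byte-at-a-time path of deflate_flush_bits() above.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t bitbuf = 0;     /* pending bits; the oldest bit is the LSB */
    unsigned bitcount = 0;   /* number of valid bits in bitbuf */
    unsigned char out[8];
    unsigned n = 0;

    /* Add a 3-bit field, then a 9-bit field: new bits go above the
     * bits already buffered. */
    bitbuf |= (uint64_t)0x5 << bitcount;   bitcount += 3;
    bitbuf |= (uint64_t)0x1AB << bitcount; bitcount += 9;

    /* Flush completed bytes from the low end. */
    while (bitcount >= 8) {
        out[n++] = (unsigned char)bitbuf;
        bitbuf >>= 8;
        bitcount -= 8;
    }
    printf("flushed %u byte(s) (first = 0x%02X), %u bit(s) still buffered\n",
           n, (unsigned)out[0], bitcount);
    return 0;
}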
567
-
568
- /* Given the binary tree node A[subtree_idx] whose children already
569
- * satisfy the maxheap property, swap the node with its greater child
570
- * until it is greater than both its children, so that the maxheap
571
- * property is satisfied in the subtree rooted at A[subtree_idx]. */
572
- static void
573
- heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx)
574
- {
575
- unsigned parent_idx;
576
- unsigned child_idx;
577
- u32 v;
578
-
579
- v = A[subtree_idx];
580
- parent_idx = subtree_idx;
581
- while ((child_idx = parent_idx * 2) <= length) {
582
- if (child_idx < length && A[child_idx + 1] > A[child_idx])
583
- child_idx++;
584
- if (v >= A[child_idx])
585
- break;
586
- A[parent_idx] = A[child_idx];
587
- parent_idx = child_idx;
588
- }
589
- A[parent_idx] = v;
590
- }
591
-
592
- /* Rearrange the array 'A' so that it satisfies the maxheap property.
593
- * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1].
594
- */
595
- static void
596
- heapify_array(u32 A[], unsigned length)
597
- {
598
- unsigned subtree_idx;
599
-
600
- for (subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--)
601
- heapify_subtree(A, length, subtree_idx);
602
- }
603
-
604
- /*
605
- * Sort the array 'A', which contains 'length' unsigned 32-bit integers.
606
- *
607
- * Note: name this function heap_sort() instead of heapsort() to avoid colliding
608
- * with heapsort() from stdlib.h on BSD-derived systems --- though this isn't
609
- * necessary when compiling with -D_ANSI_SOURCE, which is the better solution.
610
- */
611
- static void
612
- heap_sort(u32 A[], unsigned length)
613
- {
614
- A--; /* Use 1-based indices */
615
-
616
- heapify_array(A, length);
617
-
618
- while (length >= 2) {
619
- u32 tmp = A[length];
620
- A[length] = A[1];
621
- A[1] = tmp;
622
- length--;
623
- heapify_subtree(A, length, 1);
624
- }
625
- }
626
-
627
- #define NUM_SYMBOL_BITS 10
628
- #define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1)
629
-
630
- #define GET_NUM_COUNTERS(num_syms) ((((num_syms) + 3 / 4) + 3) & ~3)
631
- /*
632
- * Sort the symbols primarily by frequency and secondarily by symbol
633
- * value. Discard symbols with zero frequency and fill in an array with
634
- * the remaining symbols, along with their frequencies. The low
635
- * NUM_SYMBOL_BITS bits of each array entry will contain the symbol
636
- * value, and the remaining bits will contain the frequency.
637
- *
638
- * @num_syms
639
- * Number of symbols in the alphabet.
640
- * Can't be greater than (1 << NUM_SYMBOL_BITS).
641
- *
642
- * @freqs[num_syms]
643
- * The frequency of each symbol.
644
- *
645
- * @lens[num_syms]
646
- * An array that eventually will hold the length of each codeword.
647
- * This function only fills in the codeword lengths for symbols that
648
- * have zero frequency, which are not well defined per se but will
649
- * be set to 0.
650
- *
651
- * @symout[num_syms]
652
- * The output array, described above.
653
- *
654
- * Returns the number of entries in 'symout' that were filled. This is
655
- * the number of symbols that have nonzero frequency.
656
- */
657
- static unsigned
658
- sort_symbols(unsigned num_syms, const u32 freqs[restrict],
659
- u8 lens[restrict], u32 symout[restrict])
660
- {
661
- unsigned sym;
662
- unsigned i;
663
- unsigned num_used_syms;
664
- unsigned num_counters;
665
- unsigned counters[GET_NUM_COUNTERS(DEFLATE_MAX_NUM_SYMS)];
666
-
667
- /* We rely on heapsort, but with an added optimization. Since
668
- * it's common for most symbol frequencies to be low, we first do
669
- * a count sort using a limited number of counters. High
670
- * frequencies will be counted in the last counter, and only they
671
- * will be sorted with heapsort.
672
- *
673
- * Note: with more symbols, it is generally beneficial to have more
674
- * counters. About 1 counter per 4 symbols seems fast.
675
- *
676
- * Note: I also tested radix sort, but even for large symbol
677
- * counts (> 255) and frequencies bounded at 16 bits (enabling
678
- * radix sort by just two base-256 digits), it didn't seem any
679
- * faster than the method implemented here.
680
- *
681
- * Note: I tested the optimized quicksort implementation from
682
- * glibc (with indirection overhead removed), but it was only
683
- * marginally faster than the simple heapsort implemented here.
684
- *
685
- * Tests were done with building the codes for LZX. Results may
686
- * vary for different compression algorithms...! */
687
-
688
- num_counters = GET_NUM_COUNTERS(num_syms);
689
-
690
- memset(counters, 0, num_counters * sizeof(counters[0]));
691
-
692
- /* Count the frequencies. */
693
- for (sym = 0; sym < num_syms; sym++)
694
- counters[MIN(freqs[sym], num_counters - 1)]++;
695
-
696
- /* Make the counters cumulative, ignoring the zero-th, which
697
- * counted symbols with zero frequency. As a side effect, this
698
- * calculates the number of symbols with nonzero frequency. */
699
- num_used_syms = 0;
700
- for (i = 1; i < num_counters; i++) {
701
- unsigned count = counters[i];
702
- counters[i] = num_used_syms;
703
- num_used_syms += count;
704
- }
705
-
706
- /* Sort nonzero-frequency symbols using the counters. At the
707
- * same time, set the codeword lengths of zero-frequency symbols
708
- * to 0. */
709
- for (sym = 0; sym < num_syms; sym++) {
710
- u32 freq = freqs[sym];
711
- if (freq != 0) {
712
- symout[counters[MIN(freq, num_counters - 1)]++] =
713
- sym | (freq << NUM_SYMBOL_BITS);
714
- } else {
715
- lens[sym] = 0;
716
- }
717
- }
718
-
719
- /* Sort the symbols counted in the last counter. */
720
- heap_sort(symout + counters[num_counters - 2],
721
- counters[num_counters - 1] - counters[num_counters - 2]);
722
-
723
- return num_used_syms;
724
- }
725
-
726
- /*
727
- * Build the Huffman tree.
728
- *
729
- * This is an optimized implementation that
730
- * (a) takes advantage of the frequencies being already sorted;
731
- * (b) only generates non-leaf nodes, since the non-leaf nodes of a
732
- * Huffman tree are sufficient to generate a canonical code;
733
- * (c) Only stores parent pointers, not child pointers;
734
- * (d) Produces the nodes in the same memory used for input
735
- * frequency information.
736
- *
737
- * Array 'A', which contains 'sym_count' entries, is used for both input
738
- * and output. For this function, 'sym_count' must be at least 2.
739
- *
740
- * For input, the array must contain the frequencies of the symbols,
741
- * sorted in increasing order. Specifically, each entry must contain a
742
- * frequency left shifted by NUM_SYMBOL_BITS bits. Any data in the low
743
- * NUM_SYMBOL_BITS bits of the entries will be ignored by this function.
744
- * Although these bits will, in fact, contain the symbols that correspond
745
- * to the frequencies, this function is concerned with frequencies only
746
- * and keeps the symbols as-is.
747
- *
748
- * For output, this function will produce the non-leaf nodes of the
749
- * Huffman tree. These nodes will be stored in the first (sym_count - 1)
750
- * entries of the array. Entry A[sym_count - 2] will represent the root
751
- * node. Each other node will contain the zero-based index of its parent
752
- * node in 'A', left shifted by NUM_SYMBOL_BITS bits. The low
753
- * NUM_SYMBOL_BITS bits of each entry in A will be kept as-is. Again,
754
- * note that although these low bits will, in fact, contain a symbol
755
- * value, this symbol will have *no relationship* with the Huffman tree
756
- * node that happens to occupy the same slot. This is because this
757
- * implementation only generates the non-leaf nodes of the tree.
758
- */
759
- static void
760
- build_tree(u32 A[], unsigned sym_count)
761
- {
762
- /* Index, in 'A', of next lowest frequency symbol that has not
763
- * yet been processed. */
764
- unsigned i = 0;
765
-
766
- /* Index, in 'A', of next lowest frequency parentless non-leaf
767
- * node; or, if equal to 'e', then no such node exists yet. */
768
- unsigned b = 0;
769
-
770
- /* Index, in 'A', of next node to allocate as a non-leaf. */
771
- unsigned e = 0;
772
-
773
- do {
774
- unsigned m, n;
775
- u32 freq_shifted;
776
-
777
- /* Choose the two next lowest frequency entries. */
778
-
779
- if (i != sym_count &&
780
- (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
781
- m = i++;
782
- else
783
- m = b++;
784
-
785
- if (i != sym_count &&
786
- (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
787
- n = i++;
788
- else
789
- n = b++;
790
-
791
- /* Allocate a non-leaf node and link the entries to it.
792
- *
793
- * If we link an entry that we're visiting for the first
794
- * time (via index 'i'), then we're actually linking a
795
- * leaf node and it will have no effect, since the leaf
796
- * will be overwritten with a non-leaf when index 'e'
797
- * catches up to it. But it's not any slower to
798
- * unconditionally set the parent index.
799
- *
800
- * We also compute the frequency of the non-leaf node as
801
- * the sum of its two children's frequencies. */
802
-
803
- freq_shifted = (A[m] & ~SYMBOL_MASK) + (A[n] & ~SYMBOL_MASK);
804
-
805
- A[m] = (A[m] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
806
- A[n] = (A[n] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
807
- A[e] = (A[e] & SYMBOL_MASK) | freq_shifted;
808
- e++;
809
- } while (sym_count - e > 1);
810
- /* When just one entry remains, it is a "leaf" that was
811
- * linked to some other node. We ignore it, since the
812
- * rest of the array contains the non-leaves which we
813
- * need. (Note that we're assuming the cases with 0 or 1
814
- * symbols were handled separately.) */
815
- }
816
-
817
- /*
818
- * Given the stripped-down Huffman tree constructed by build_tree(),
819
- * determine the number of codewords that should be assigned each
820
- * possible length, taking into account the length-limited constraint.
821
- *
822
- * @A
823
- * The array produced by build_tree(), containing parent index
824
- * information for the non-leaf nodes of the Huffman tree. Each
825
- * entry in this array is a node; a node's parent always has a
826
- * greater index than that node itself. This function will
827
- * overwrite the parent index information in this array, so
828
- * essentially it will destroy the tree. However, the data in the
829
- * low NUM_SYMBOL_BITS of each entry will be preserved.
830
- *
831
- * @root_idx
832
- * The 0-based index of the root node in 'A', and consequently one
833
- * less than the number of tree node entries in 'A'. (Or, really 2
834
- * less than the actual length of 'A'.)
835
- *
836
- * @len_counts
837
- * An array of length ('max_codeword_len' + 1) in which the number of
838
- * codewords having each length <= max_codeword_len will be
839
- * returned.
840
- *
841
- * @max_codeword_len
842
- * The maximum permissible codeword length.
843
- */
844
- static void
845
- compute_length_counts(u32 A[restrict], unsigned root_idx,
846
- unsigned len_counts[restrict], unsigned max_codeword_len)
847
- {
848
- unsigned len;
849
- int node;
850
-
851
- /* The key observations are:
852
- *
853
- * (1) We can traverse the non-leaf nodes of the tree, always
854
- * visiting a parent before its children, by simply iterating
855
- * through the array in reverse order. Consequently, we can
856
- * compute the depth of each node in one pass, overwriting the
857
- * parent indices with depths.
858
- *
859
- * (2) We can initially assume that in the real Huffman tree,
860
- * both children of the root are leaves. This corresponds to two
861
- * codewords of length 1. Then, whenever we visit a (non-leaf)
862
- * node during the traversal, we modify this assumption to
863
- * account for the current node *not* being a leaf, but rather
864
- * its two children being leaves. This causes the loss of one
865
- * codeword for the current depth and the addition of two
866
- * codewords for the current depth plus one.
867
- *
868
- * (3) We can handle the length-limited constraint fairly easily
869
- * by simply using the largest length available when a depth
870
- * exceeds max_codeword_len.
871
- */
872
-
873
- for (len = 0; len <= max_codeword_len; len++)
874
- len_counts[len] = 0;
875
- len_counts[1] = 2;
876
-
877
- /* Set the root node's depth to 0. */
878
- A[root_idx] &= SYMBOL_MASK;
879
-
880
- for (node = root_idx - 1; node >= 0; node--) {
881
-
882
- /* Calculate the depth of this node. */
883
-
884
- unsigned parent = A[node] >> NUM_SYMBOL_BITS;
885
- unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS;
886
- unsigned depth = parent_depth + 1;
887
- unsigned len = depth;
888
-
889
- /* Set the depth of this node so that it is available
890
- * when its children (if any) are processed. */
891
-
892
- A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS);
893
-
894
- /* If needed, decrease the length to meet the
895
- * length-limited constraint. This is not the optimal
896
- * method for generating length-limited Huffman codes!
897
- * But it should be good enough. */
898
- if (len >= max_codeword_len) {
899
- len = max_codeword_len;
900
- do {
901
- len--;
902
- } while (len_counts[len] == 0);
903
- }
904
-
905
- /* Account for the fact that we have a non-leaf node at
906
- * the current depth. */
907
- len_counts[len]--;
908
- len_counts[len + 1] += 2;
909
- }
910
- }
911
-
912
- /*
913
- * Generate the codewords for a canonical Huffman code.
914
- *
915
- * @A
916
- * The output array for codewords. In addition, initially this
917
- * array must contain the symbols, sorted primarily by frequency and
918
- * secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of
919
- * each entry.
920
- *
921
- * @len
922
- * Output array for codeword lengths.
923
- *
924
- * @len_counts
925
- * An array that provides the number of codewords that will have
926
- * each possible length <= max_codeword_len.
927
- *
928
- * @max_codeword_len
929
- * Maximum length, in bits, of each codeword.
930
- *
931
- * @num_syms
932
- * Number of symbols in the alphabet, including symbols with zero
933
- * frequency. This is the length of the 'A' and 'len' arrays.
934
- */
935
- static void
936
- gen_codewords(u32 A[restrict], u8 lens[restrict],
937
- const unsigned len_counts[restrict],
938
- unsigned max_codeword_len, unsigned num_syms)
939
- {
940
- u32 next_codewords[DEFLATE_MAX_CODEWORD_LEN + 1];
941
- unsigned i;
942
- unsigned len;
943
- unsigned sym;
944
-
945
- /* Given the number of codewords that will have each length,
946
- * assign codeword lengths to symbols. We do this by assigning
947
- * the lengths in decreasing order to the symbols sorted
948
- * primarily by increasing frequency and secondarily by
949
- * increasing symbol value. */
950
- for (i = 0, len = max_codeword_len; len >= 1; len--) {
951
- unsigned count = len_counts[len];
952
- while (count--)
953
- lens[A[i++] & SYMBOL_MASK] = len;
954
- }
955
-
956
- /* Generate the codewords themselves. We initialize the
957
- * 'next_codewords' array to provide the lexicographically first
958
- * codeword of each length, then assign codewords in symbol
959
- * order. This produces a canonical code. */
960
- next_codewords[0] = 0;
961
- next_codewords[1] = 0;
962
- for (len = 2; len <= max_codeword_len; len++)
963
- next_codewords[len] =
964
- (next_codewords[len - 1] + len_counts[len - 1]) << 1;
965
-
966
- for (sym = 0; sym < num_syms; sym++)
967
- A[sym] = next_codewords[lens[sym]]++;
968
- }
969
-
970
- /*
971
- * ---------------------------------------------------------------------
972
- * make_canonical_huffman_code()
973
- * ---------------------------------------------------------------------
974
- *
975
- * Given an alphabet and the frequency of each symbol in it, construct a
976
- * length-limited canonical Huffman code.
977
- *
978
- * @num_syms
979
- * The number of symbols in the alphabet. The symbols are the
980
- * integers in the range [0, num_syms - 1]. This parameter must be
981
- * at least 2 and can't be greater than (1 << NUM_SYMBOL_BITS).
982
- *
983
- * @max_codeword_len
984
- * The maximum permissible codeword length.
985
- *
986
- * @freqs
987
- * An array of @num_syms entries, each of which specifies the
988
- * frequency of the corresponding symbol. It is valid for some,
989
- * none, or all of the frequencies to be 0.
990
- *
991
- * @lens
992
- * An array of @num_syms entries in which this function will return
993
- * the length, in bits, of the codeword assigned to each symbol.
994
- * Symbols with 0 frequency will not have codewords per se, but
995
- * their entries in this array will be set to 0. No lengths greater
996
- * than @max_codeword_len will be assigned.
997
- *
998
- * @codewords
999
- * An array of @num_syms entries in which this function will return
1000
- * the codeword for each symbol, right-justified and padded on the
1001
- * left with zeroes. Codewords for symbols with 0 frequency will be
1002
- * undefined.
1003
- *
1004
- * ---------------------------------------------------------------------
1005
- *
1006
- * This function builds a length-limited canonical Huffman code.
1007
- *
1008
- * A length-limited Huffman code contains no codewords longer than some
1009
- * specified length, and has exactly (with some algorithms) or
1010
- * approximately (with the algorithm used here) the minimum weighted path
1011
- * length from the root, given this constraint.
1012
- *
1013
- * A canonical Huffman code satisfies the properties that a longer
1014
- * codeword never lexicographically precedes a shorter codeword, and the
1015
- * lexicographic ordering of codewords of the same length is the same as
1016
- * the lexicographic ordering of the corresponding symbols. A canonical
1017
- * Huffman code, or more generally a canonical prefix code, can be
1018
- * reconstructed from only a list containing the codeword length of each
1019
- * symbol.
1020
- *
1021
- * The classic algorithm to generate a Huffman code creates a node for
1022
- * each symbol, then inserts these nodes into a min-heap keyed by symbol
1023
- * frequency. Then, repeatedly, the two lowest-frequency nodes are
1024
- * removed from the min-heap and added as the children of a new node
1025
- * having frequency equal to the sum of its two children, which is then
1026
- * inserted into the min-heap. When only a single node remains in the
1027
- * min-heap, it is the root of the Huffman tree. The codeword for each
1028
- * symbol is determined by the path needed to reach the corresponding
1029
- * node from the root. Descending to the left child appends a 0 bit,
1030
- * whereas descending to the right child appends a 1 bit.
1031
- *
1032
- * The classic algorithm is relatively easy to understand, but it is
1033
- * subject to a number of inefficiencies. In practice, it is fastest to
1034
- * first sort the symbols by frequency. (This itself can be subject to
1035
- * an optimization based on the fact that most frequencies tend to be
1036
- * low.) At the same time, we sort secondarily by symbol value, which
1037
- * aids the process of generating a canonical code. Then, during tree
1038
- * construction, no heap is necessary because both the leaf nodes and the
1039
- * unparented non-leaf nodes can be easily maintained in sorted order.
1040
- * Consequently, there can never be more than two possibilities for the
1041
- * next-lowest-frequency node.
1042
- *
1043
- * In addition, because we're generating a canonical code, we actually
1044
- * don't need the leaf nodes of the tree at all, only the non-leaf nodes.
1045
- * This is because for canonical code generation we don't need to know
1046
- * where the symbols are in the tree. Rather, we only need to know how
1047
- * many leaf nodes have each depth (codeword length). And this
1048
- * information can, in fact, be quickly generated from the tree of
1049
- * non-leaves only.
1050
- *
1051
- * Furthermore, we can build this stripped-down Huffman tree directly in
1052
- * the array in which the codewords are to be generated, provided that
1053
- * these array slots are large enough to hold a symbol and frequency
1054
- * value.
1055
- *
1056
- * Still furthermore, we don't even need to maintain explicit child
1057
- * pointers. We only need the parent pointers, and even those can be
1058
- * overwritten in-place with depth information as part of the process of
1059
- * extracting codeword lengths from the tree. So in summary, we do NOT
1060
- * need a big structure like:
1061
- *
1062
- * struct huffman_tree_node {
1063
- * unsigned int symbol;
1064
- * unsigned int frequency;
1065
- * unsigned int depth;
1066
- * struct huffman_tree_node *left_child;
1067
- * struct huffman_tree_node *right_child;
1068
- * };
1069
- *
1070
- *
1071
- * ... which often gets used in "naive" implementations of Huffman code
1072
- * generation.
1073
- *
1074
- * Many of these optimizations are based on the implementation in 7-Zip
1075
- * (source file: C/HuffEnc.c), which has been placed in the public domain
1076
- * by Igor Pavlov.
1077
- */
1078
- static void
1079
- make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len,
1080
- const u32 freqs[restrict],
1081
- u8 lens[restrict], u32 codewords[restrict])
1082
- {
1083
- u32 *A = codewords;
1084
- unsigned num_used_syms;
1085
-
1086
- STATIC_ASSERT(DEFLATE_MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS);
1087
-
1088
- /* We begin by sorting the symbols primarily by frequency and
1089
- * secondarily by symbol value. As an optimization, the array
1090
- * used for this purpose ('A') shares storage with the space in
1091
- * which we will eventually return the codewords. */
1092
-
1093
- num_used_syms = sort_symbols(num_syms, freqs, lens, A);
1094
-
1095
- /* 'num_used_syms' is the number of symbols with nonzero
1096
- * frequency. This may be less than @num_syms. 'num_used_syms'
1097
- * is also the number of entries in 'A' that are valid. Each
1098
- * entry consists of a distinct symbol and a nonzero frequency
1099
- * packed into a 32-bit integer. */
1100
-
1101
- /* Handle special cases where only 0 or 1 symbols were used (had
1102
- * nonzero frequency). */
1103
-
1104
- if (unlikely(num_used_syms == 0)) {
1105
- /* Code is empty. sort_symbols() already set all lengths
1106
- * to 0, so there is nothing more to do. */
1107
- return;
1108
- }
1109
-
1110
- if (unlikely(num_used_syms == 1)) {
1111
- /* Only one symbol was used, so we only need one
1112
- * codeword. But two codewords are needed to form the
1113
- * smallest complete Huffman code, which uses codewords 0
1114
- * and 1. Therefore, we choose another symbol to which
1115
- * to assign a codeword. We use 0 (if the used symbol is
1116
- * not 0) or 1 (if the used symbol is 0). In either
1117
- * case, the lesser-valued symbol must be assigned
1118
- * codeword 0 so that the resulting code is canonical. */
1119
-
1120
- unsigned sym = A[0] & SYMBOL_MASK;
1121
- unsigned nonzero_idx = sym ? sym : 1;
1122
-
1123
- codewords[0] = 0;
1124
- lens[0] = 1;
1125
- codewords[nonzero_idx] = 1;
1126
- lens[nonzero_idx] = 1;
1127
- return;
1128
- }
1129
-
1130
- /* Build a stripped-down version of the Huffman tree, sharing the
1131
- * array 'A' with the symbol values. Then extract length counts
1132
- * from the tree and use them to generate the final codewords. */
1133
-
1134
- build_tree(A, num_used_syms);
1135
-
1136
- {
1137
- unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1];
1138
-
1139
- compute_length_counts(A, num_used_syms - 2,
1140
- len_counts, max_codeword_len);
1141
-
1142
- gen_codewords(A, lens, len_counts, max_codeword_len, num_syms);
1143
- }
1144
- }
1145
-
1146
- /*
1147
- * Clear the Huffman symbol frequency counters.
1148
- * This must be called when starting a new DEFLATE block.
1149
- */
1150
- static void
1151
- deflate_reset_symbol_frequencies(struct libdeflate_compressor *c)
1152
- {
1153
- memset(&c->freqs, 0, sizeof(c->freqs));
1154
- }
1155
-
1156
- /* Reverse the Huffman codeword 'codeword', which is 'len' bits in length. */
1157
- static u32
1158
- deflate_reverse_codeword(u32 codeword, u8 len)
1159
- {
1160
- /* The following branchless algorithm is faster than going bit by bit.
1161
- * Note: since no codewords are longer than 16 bits, we only need to
1162
- * reverse the low 16 bits of the 'u32'. */
1163
- STATIC_ASSERT(DEFLATE_MAX_CODEWORD_LEN <= 16);
1164
-
1165
- /* Flip adjacent 1-bit fields */
1166
- codeword = ((codeword & 0x5555) << 1) | ((codeword & 0xAAAA) >> 1);
1167
-
1168
- /* Flip adjacent 2-bit fields */
1169
- codeword = ((codeword & 0x3333) << 2) | ((codeword & 0xCCCC) >> 2);
1170
-
1171
- /* Flip adjacent 4-bit fields */
1172
- codeword = ((codeword & 0x0F0F) << 4) | ((codeword & 0xF0F0) >> 4);
1173
-
1174
- /* Flip adjacent 8-bit fields */
1175
- codeword = ((codeword & 0x00FF) << 8) | ((codeword & 0xFF00) >> 8);
1176
-
1177
- /* Return the high 'len' bits of the bit-reversed 16 bit value. */
1178
- return codeword >> (16 - len);
1179
- }
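Worked example: for the 3-bit codeword 110 (value 6), the 16-bit reversal yields 0x6000, and shifting right by 16 - 3 leaves 011 (value 3), i.e. the codeword with its bit order reversed.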
1180
-
1181
- /* Make a canonical Huffman code with bit-reversed codewords. */
1182
- static void
1183
- deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len,
1184
- const u32 freqs[], u8 lens[], u32 codewords[])
1185
- {
1186
- unsigned sym;
1187
-
1188
- make_canonical_huffman_code(num_syms, max_codeword_len,
1189
- freqs, lens, codewords);
1190
-
1191
- for (sym = 0; sym < num_syms; sym++)
1192
- codewords[sym] = deflate_reverse_codeword(codewords[sym], lens[sym]);
1193
- }
1194
-
1195
- /*
1196
- * Build the literal/length and offset Huffman codes for a DEFLATE block.
1197
- *
1198
- * This takes as input the frequency tables for each code and produces as output
1199
- * a set of tables that map symbols to codewords and codeword lengths.
1200
- */
1201
- static void
1202
- deflate_make_huffman_codes(const struct deflate_freqs *freqs,
1203
- struct deflate_codes *codes)
1204
- {
1205
- STATIC_ASSERT(MAX_LITLEN_CODEWORD_LEN <= DEFLATE_MAX_LITLEN_CODEWORD_LEN);
1206
- STATIC_ASSERT(MAX_OFFSET_CODEWORD_LEN <= DEFLATE_MAX_OFFSET_CODEWORD_LEN);
1207
-
1208
- deflate_make_huffman_code(DEFLATE_NUM_LITLEN_SYMS,
1209
- MAX_LITLEN_CODEWORD_LEN,
1210
- freqs->litlen,
1211
- codes->lens.litlen,
1212
- codes->codewords.litlen);
1213
-
1214
- deflate_make_huffman_code(DEFLATE_NUM_OFFSET_SYMS,
1215
- MAX_OFFSET_CODEWORD_LEN,
1216
- freqs->offset,
1217
- codes->lens.offset,
1218
- codes->codewords.offset);
1219
- }
1220
-
1221
- /* Initialize c->static_codes. */
1222
- static void
1223
- deflate_init_static_codes(struct libdeflate_compressor *c)
1224
- {
1225
- unsigned i;
1226
-
1227
- for (i = 0; i < 144; i++)
1228
- c->freqs.litlen[i] = 1 << (9 - 8);
1229
- for (; i < 256; i++)
1230
- c->freqs.litlen[i] = 1 << (9 - 9);
1231
- for (; i < 280; i++)
1232
- c->freqs.litlen[i] = 1 << (9 - 7);
1233
- for (; i < 288; i++)
1234
- c->freqs.litlen[i] = 1 << (9 - 8);
1235
-
1236
- for (i = 0; i < 32; i++)
1237
- c->freqs.offset[i] = 1 << (5 - 5);
1238
-
1239
- deflate_make_huffman_codes(&c->freqs, &c->static_codes);
1240
- }
1241
-
1242
- /* Return the offset slot for the specified match offset. */
1243
- static forceinline unsigned
1244
- deflate_get_offset_slot(struct libdeflate_compressor *c, unsigned offset)
1245
- {
1246
- #if USE_FULL_OFFSET_SLOT_FAST
1247
- return c->offset_slot_fast[offset];
1248
- #else
1249
- if (offset <= 256)
1250
- return c->offset_slot_fast[offset - 1];
1251
- else
1252
- return c->offset_slot_fast[256 + ((offset - 1) >> 7)];
1253
- #endif
1254
- }
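The condensed offset_slot_fast layout described in the compressor-struct comment earlier can be seen in miniature in the following standalone sketch (hypothetical code, not from the removed file; it reuses the offset slot base values from the table near the top of this file):

#include <stdio.h>

static const unsigned base[30] = {
    1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193,
    257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145, 8193,
    12289, 16385, 24577
};

static unsigned char slot_fast[512];

/* Reference mapping: highest slot whose base does not exceed the offset. */
static unsigned slot_of(unsigned offset)
{
    unsigned s = 29;
    while (base[s] > offset)
        s--;
    return s;
}

int main(void)
{
    unsigned off, s;

    /* Entries 0..255: offsets 1..256 map directly. */
    for (off = 1; off <= 256; off++)
        slot_fast[off - 1] = (unsigned char)slot_of(off);

    /* Entries 256..511: one entry per 128 offsets.  This works because every
     * slot above offset 256 spans at least 128 offsets and its base minus 1
     * is a multiple of 128, so all offsets in a 128-wide bucket share a slot. */
    for (off = 257; off <= 32768; off += 128)
        slot_fast[256 + ((off - 1) >> 7)] = (unsigned char)slot_of(off);

    off = 3000;
    s = (off <= 256) ? slot_fast[off - 1]
                     : slot_fast[256 + ((off - 1) >> 7)];
    printf("offset %u -> offset slot %u\n", off, s);  /* expect slot 22 */
    return 0;
}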
1255
-
1256
- /* Write the header fields common to all DEFLATE block types. */
1257
- static void
1258
- deflate_write_block_header(struct deflate_output_bitstream *os,
1259
- bool is_final_block, unsigned block_type)
1260
- {
1261
- deflate_add_bits(os, is_final_block, 1);
1262
- deflate_add_bits(os, block_type, 2);
1263
- deflate_flush_bits(os);
1264
- }
1265
-
1266
- static unsigned
1267
- deflate_compute_precode_items(const u8 lens[restrict],
1268
- const unsigned num_lens,
1269
- u32 precode_freqs[restrict],
1270
- unsigned precode_items[restrict])
1271
- {
1272
- unsigned *itemptr;
1273
- unsigned run_start;
1274
- unsigned run_end;
1275
- unsigned extra_bits;
1276
- u8 len;
1277
-
1278
- memset(precode_freqs, 0,
1279
- DEFLATE_NUM_PRECODE_SYMS * sizeof(precode_freqs[0]));
1280
-
1281
- itemptr = precode_items;
1282
- run_start = 0;
1283
- do {
1284
- /* Find the next run of codeword lengths. */
1285
-
1286
- /* len = the length being repeated */
1287
- len = lens[run_start];
1288
-
1289
- /* Extend the run. */
1290
- run_end = run_start;
1291
- do {
1292
- run_end++;
1293
- } while (run_end != num_lens && len == lens[run_end]);
1294
-
1295
- if (len == 0) {
1296
- /* Run of zeroes. */
1297
-
1298
- /* Symbol 18: RLE 11 to 138 zeroes at a time. */
1299
- while ((run_end - run_start) >= 11) {
1300
- extra_bits = MIN((run_end - run_start) - 11, 0x7F);
1301
- precode_freqs[18]++;
1302
- *itemptr++ = 18 | (extra_bits << 5);
1303
- run_start += 11 + extra_bits;
1304
- }
1305
-
1306
- /* Symbol 17: RLE 3 to 10 zeroes at a time. */
1307
- if ((run_end - run_start) >= 3) {
1308
- extra_bits = MIN((run_end - run_start) - 3, 0x7);
1309
- precode_freqs[17]++;
1310
- *itemptr++ = 17 | (extra_bits << 5);
1311
- run_start += 3 + extra_bits;
1312
- }
1313
- } else {
1314
-
1315
- /* A run of nonzero lengths. */
1316
-
1317
- /* Symbol 16: RLE 3 to 6 of the previous length. */
1318
- if ((run_end - run_start) >= 4) {
1319
- precode_freqs[len]++;
1320
- *itemptr++ = len;
1321
- run_start++;
1322
- do {
1323
- extra_bits = MIN((run_end - run_start) - 3, 0x3);
1324
- precode_freqs[16]++;
1325
- *itemptr++ = 16 | (extra_bits << 5);
1326
- run_start += 3 + extra_bits;
1327
- } while ((run_end - run_start) >= 3);
1328
- }
1329
- }
1330
-
1331
- /* Output any remaining lengths without RLE. */
1332
- while (run_start != run_end) {
1333
- precode_freqs[len]++;
1334
- *itemptr++ = len;
1335
- run_start++;
1336
- }
1337
- } while (run_start != num_lens);
1338
-
1339
- return itemptr - precode_items;
1340
- }
1341
-
1342
- /*
1343
- * Huffman codeword lengths for dynamic Huffman blocks are compressed using a
1344
- * separate Huffman code, the "precode", which contains a symbol for each
1345
- * possible codeword length in the larger code as well as several special
1346
- * symbols to represent repeated codeword lengths (a form of run-length
1347
- * encoding). The precode is itself constructed in canonical form, and its
1348
- * codeword lengths are represented literally in 19 3-bit fields that
1349
- * immediately precede the compressed codeword lengths of the larger code.
1350
- */
1351
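/*
 * [Editorial sketch, not part of libdeflate] Each "precode item" produced by
 * deflate_compute_precode_items() packs a precode symbol in its low 5 bits and
 * that symbol's extra-bits value in the remaining high bits. The hypothetical
 * decoder below expands items back into codeword lengths, purely to illustrate
 * the meaning of symbols 16, 17, and 18.
 */
static unsigned
illustrate_expand_precode_items(const unsigned *items, unsigned num_items,
				unsigned char *lens_out)
{
	unsigned n = 0;	/* number of codeword lengths produced so far */
	unsigned i;

	for (i = 0; i < num_items; i++) {
		unsigned sym = items[i] & 0x1F;	/* precode symbol */
		unsigned extra = items[i] >> 5;	/* value of the extra bits */
		unsigned count, j;

		if (sym < 16) {
			/* Explicit codeword length 0..15 */
			lens_out[n++] = (unsigned char)sym;
		} else if (sym == 16) {
			/* Repeat the previous length 3..6 times */
			count = 3 + extra;
			for (j = 0; j < count; j++, n++)
				lens_out[n] = lens_out[n - 1];
		} else if (sym == 17) {
			/* 3..10 zero lengths */
			count = 3 + extra;
			for (j = 0; j < count; j++)
				lens_out[n++] = 0;
		} else {
			/* Symbol 18: 11..138 zero lengths */
			count = 11 + extra;
			for (j = 0; j < count; j++)
				lens_out[n++] = 0;
		}
	}
	return n;	/* should equal num_litlen_syms + num_offset_syms */
}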
-
1352
- /* Precompute the information needed to output Huffman codes. */
1353
- static void
1354
- deflate_precompute_huffman_header(struct libdeflate_compressor *c)
1355
- {
1356
- /* Compute how many litlen and offset symbols are needed. */
1357
-
1358
- for (c->num_litlen_syms = DEFLATE_NUM_LITLEN_SYMS;
1359
- c->num_litlen_syms > 257;
1360
- c->num_litlen_syms--)
1361
- if (c->codes.lens.litlen[c->num_litlen_syms - 1] != 0)
1362
- break;
1363
-
1364
- for (c->num_offset_syms = DEFLATE_NUM_OFFSET_SYMS;
1365
- c->num_offset_syms > 1;
1366
- c->num_offset_syms--)
1367
- if (c->codes.lens.offset[c->num_offset_syms - 1] != 0)
1368
- break;
1369
-
1370
- /* If we're not using the full set of literal/length codeword lengths,
1371
- * then temporarily move the offset codeword lengths over so that the
1372
- * literal/length and offset codeword lengths are contiguous. */
1373
-
1374
- STATIC_ASSERT(offsetof(struct deflate_lens, offset) ==
1375
- DEFLATE_NUM_LITLEN_SYMS);
1376
-
1377
- if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) {
1378
- memmove((u8 *)&c->codes.lens + c->num_litlen_syms,
1379
- (u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS,
1380
- c->num_offset_syms);
1381
- }
1382
-
1383
- /* Compute the "items" (RLE / literal tokens and extra bits) with which
1384
- * the codeword lengths in the larger code will be output. */
1385
- c->num_precode_items =
1386
- deflate_compute_precode_items((u8 *)&c->codes.lens,
1387
- c->num_litlen_syms +
1388
- c->num_offset_syms,
1389
- c->precode_freqs,
1390
- c->precode_items);
1391
-
1392
- /* Build the precode. */
1393
- STATIC_ASSERT(MAX_PRE_CODEWORD_LEN <= DEFLATE_MAX_PRE_CODEWORD_LEN);
1394
- deflate_make_huffman_code(DEFLATE_NUM_PRECODE_SYMS,
1395
- MAX_PRE_CODEWORD_LEN,
1396
- c->precode_freqs, c->precode_lens,
1397
- c->precode_codewords);
1398
-
1399
- /* Count how many precode lengths we actually need to output. */
1400
- for (c->num_explicit_lens = DEFLATE_NUM_PRECODE_SYMS;
1401
- c->num_explicit_lens > 4;
1402
- c->num_explicit_lens--)
1403
- if (c->precode_lens[deflate_precode_lens_permutation[
1404
- c->num_explicit_lens - 1]] != 0)
1405
- break;
1406
-
1407
- /* Restore the offset codeword lengths if needed. */
1408
- if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) {
1409
- memmove((u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS,
1410
- (u8 *)&c->codes.lens + c->num_litlen_syms,
1411
- c->num_offset_syms);
1412
- }
1413
- }
1414
-
1415
- /* Output the Huffman codes. */
1416
- static void
1417
- deflate_write_huffman_header(struct libdeflate_compressor *c,
1418
- struct deflate_output_bitstream *os)
1419
- {
1420
- unsigned i;
1421
-
1422
- deflate_add_bits(os, c->num_litlen_syms - 257, 5);
1423
- deflate_add_bits(os, c->num_offset_syms - 1, 5);
1424
- deflate_add_bits(os, c->num_explicit_lens - 4, 4);
1425
- deflate_flush_bits(os);
1426
-
1427
- /* Output the lengths of the codewords in the precode. */
1428
- for (i = 0; i < c->num_explicit_lens; i++) {
1429
- deflate_add_bits(os, c->precode_lens[
1430
- deflate_precode_lens_permutation[i]], 3);
1431
- deflate_flush_bits(os);
1432
- }
1433
-
1434
- /* Output the encoded lengths of the codewords in the larger code. */
1435
- for (i = 0; i < c->num_precode_items; i++) {
1436
- unsigned precode_item = c->precode_items[i];
1437
- unsigned precode_sym = precode_item & 0x1F;
1438
- deflate_add_bits(os, c->precode_codewords[precode_sym],
1439
- c->precode_lens[precode_sym]);
1440
- if (precode_sym >= 16) {
1441
- if (precode_sym == 16)
1442
- deflate_add_bits(os, precode_item >> 5, 2);
1443
- else if (precode_sym == 17)
1444
- deflate_add_bits(os, precode_item >> 5, 3);
1445
- else
1446
- deflate_add_bits(os, precode_item >> 5, 7);
1447
- }
1448
- STATIC_ASSERT(CAN_BUFFER(DEFLATE_MAX_PRE_CODEWORD_LEN + 7));
1449
- deflate_flush_bits(os);
1450
- }
1451
- }
1452
-
1453
- static void
1454
- deflate_write_sequences(struct deflate_output_bitstream * restrict os,
1455
- const struct deflate_codes * restrict codes,
1456
- const struct deflate_sequence sequences[restrict],
1457
- const u8 * restrict in_next)
1458
- {
1459
- const struct deflate_sequence *seq = sequences;
1460
-
1461
- for (;;) {
1462
- u32 litrunlen = seq->litrunlen_and_length & 0x7FFFFF;
1463
- unsigned length = seq->litrunlen_and_length >> 23;
1464
- unsigned length_slot;
1465
- unsigned litlen_symbol;
1466
- unsigned offset_symbol;
1467
-
1468
- if (litrunlen) {
1469
- #if 1
1470
- while (litrunlen >= 4) {
1471
- unsigned lit0 = in_next[0];
1472
- unsigned lit1 = in_next[1];
1473
- unsigned lit2 = in_next[2];
1474
- unsigned lit3 = in_next[3];
1475
-
1476
- deflate_add_bits(os, codes->codewords.litlen[lit0],
1477
- codes->lens.litlen[lit0]);
1478
- if (!CAN_BUFFER(2 * MAX_LITLEN_CODEWORD_LEN))
1479
- deflate_flush_bits(os);
1480
-
1481
- deflate_add_bits(os, codes->codewords.litlen[lit1],
1482
- codes->lens.litlen[lit1]);
1483
- if (!CAN_BUFFER(4 * MAX_LITLEN_CODEWORD_LEN))
1484
- deflate_flush_bits(os);
1485
-
1486
- deflate_add_bits(os, codes->codewords.litlen[lit2],
1487
- codes->lens.litlen[lit2]);
1488
- if (!CAN_BUFFER(2 * MAX_LITLEN_CODEWORD_LEN))
1489
- deflate_flush_bits(os);
1490
-
1491
- deflate_add_bits(os, codes->codewords.litlen[lit3],
1492
- codes->lens.litlen[lit3]);
1493
- deflate_flush_bits(os);
1494
- in_next += 4;
1495
- litrunlen -= 4;
1496
- }
1497
- if (litrunlen-- != 0) {
1498
- deflate_add_bits(os, codes->codewords.litlen[*in_next],
1499
- codes->lens.litlen[*in_next]);
1500
- if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
1501
- deflate_flush_bits(os);
1502
- in_next++;
1503
- if (litrunlen-- != 0) {
1504
- deflate_add_bits(os, codes->codewords.litlen[*in_next],
1505
- codes->lens.litlen[*in_next]);
1506
- if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
1507
- deflate_flush_bits(os);
1508
- in_next++;
1509
- if (litrunlen-- != 0) {
1510
- deflate_add_bits(os, codes->codewords.litlen[*in_next],
1511
- codes->lens.litlen[*in_next]);
1512
- if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
1513
- deflate_flush_bits(os);
1514
- in_next++;
1515
- }
1516
- }
1517
- if (CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
1518
- deflate_flush_bits(os);
1519
- }
1520
- #else
1521
- do {
1522
- unsigned lit = *in_next++;
1523
- deflate_add_bits(os, codes->codewords.litlen[lit],
1524
- codes->lens.litlen[lit]);
1525
- deflate_flush_bits(os);
1526
- } while (--litrunlen);
1527
- #endif
1528
- }
1529
-
1530
- if (length == 0)
1531
- return;
1532
-
1533
- in_next += length;
1534
-
1535
- length_slot = seq->length_slot;
1536
- litlen_symbol = 257 + length_slot;
1537
-
1538
- /* Litlen symbol */
1539
- deflate_add_bits(os, codes->codewords.litlen[litlen_symbol],
1540
- codes->lens.litlen[litlen_symbol]);
1541
-
1542
- /* Extra length bits */
1543
- STATIC_ASSERT(CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +
1544
- DEFLATE_MAX_EXTRA_LENGTH_BITS));
1545
- deflate_add_bits(os, length - deflate_length_slot_base[length_slot],
1546
- deflate_extra_length_bits[length_slot]);
1547
-
1548
- if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +
1549
- DEFLATE_MAX_EXTRA_LENGTH_BITS +
1550
- MAX_OFFSET_CODEWORD_LEN +
1551
- DEFLATE_MAX_EXTRA_OFFSET_BITS))
1552
- deflate_flush_bits(os);
1553
-
1554
- /* Offset symbol */
1555
- offset_symbol = seq->offset_symbol;
1556
- deflate_add_bits(os, codes->codewords.offset[offset_symbol],
1557
- codes->lens.offset[offset_symbol]);
1558
-
1559
- if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN +
1560
- DEFLATE_MAX_EXTRA_OFFSET_BITS))
1561
- deflate_flush_bits(os);
1562
-
1563
- /* Extra offset bits */
1564
- deflate_add_bits(os, seq->offset - deflate_offset_slot_base[offset_symbol],
1565
- deflate_extra_offset_bits[offset_symbol]);
1566
-
1567
- deflate_flush_bits(os);
1568
-
1569
- seq++;
1570
- }
1571
- }
1572
-
1573
- #if SUPPORT_NEAR_OPTIMAL_PARSING
1574
- /*
1575
- * Follow the minimum-cost path in the graph of possible match/literal choices
1576
- * for the current block and write out the matches/literals using the specified
1577
- * Huffman codes.
1578
- *
1579
- * Note: this is slightly duplicated with deflate_write_sequences(), the reason
1580
- * being that we don't want to waste time translating between intermediate
1581
- * match/literal representations.
1582
- */
1583
- static void
1584
- deflate_write_item_list(struct deflate_output_bitstream *os,
1585
- const struct deflate_codes *codes,
1586
- struct libdeflate_compressor *c,
1587
- u32 block_length)
1588
- {
1589
- struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0];
1590
- struct deflate_optimum_node * const end_node = &c->p.n.optimum_nodes[block_length];
1591
- do {
1592
- unsigned length = cur_node->item & OPTIMUM_LEN_MASK;
1593
- unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
1594
- unsigned litlen_symbol;
1595
- unsigned length_slot;
1596
- unsigned offset_slot;
1597
-
1598
- if (length == 1) {
1599
- /* Literal */
1600
- litlen_symbol = offset;
1601
- deflate_add_bits(os, codes->codewords.litlen[litlen_symbol],
1602
- codes->lens.litlen[litlen_symbol]);
1603
- deflate_flush_bits(os);
1604
- } else {
1605
- /* Match length */
1606
- length_slot = deflate_length_slot[length];
1607
- litlen_symbol = 257 + length_slot;
1608
- deflate_add_bits(os, codes->codewords.litlen[litlen_symbol],
1609
- codes->lens.litlen[litlen_symbol]);
1610
-
1611
- deflate_add_bits(os, length - deflate_length_slot_base[length_slot],
1612
- deflate_extra_length_bits[length_slot]);
1613
-
1614
- if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +
1615
- DEFLATE_MAX_EXTRA_LENGTH_BITS +
1616
- MAX_OFFSET_CODEWORD_LEN +
1617
- DEFLATE_MAX_EXTRA_OFFSET_BITS))
1618
- deflate_flush_bits(os);
1619
-
1620
-
1621
- /* Match offset */
1622
- offset_slot = deflate_get_offset_slot(c, offset);
1623
- deflate_add_bits(os, codes->codewords.offset[offset_slot],
1624
- codes->lens.offset[offset_slot]);
1625
-
1626
- if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN +
1627
- DEFLATE_MAX_EXTRA_OFFSET_BITS))
1628
- deflate_flush_bits(os);
1629
-
1630
- deflate_add_bits(os, offset - deflate_offset_slot_base[offset_slot],
1631
- deflate_extra_offset_bits[offset_slot]);
1632
-
1633
- deflate_flush_bits(os);
1634
- }
1635
- cur_node += length;
1636
- } while (cur_node != end_node);
1637
- }
1638
- #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
1639
-
1640
- /* Output the end-of-block symbol. */
1641
- static void
1642
- deflate_write_end_of_block(struct deflate_output_bitstream *os,
1643
- const struct deflate_codes *codes)
1644
- {
1645
- deflate_add_bits(os, codes->codewords.litlen[DEFLATE_END_OF_BLOCK],
1646
- codes->lens.litlen[DEFLATE_END_OF_BLOCK]);
1647
- deflate_flush_bits(os);
1648
- }
1649
-
1650
- static void
1651
- deflate_write_uncompressed_block(struct deflate_output_bitstream *os,
1652
- const u8 *data, u16 len,
1653
- bool is_final_block)
1654
- {
1655
- deflate_write_block_header(os, is_final_block,
1656
- DEFLATE_BLOCKTYPE_UNCOMPRESSED);
1657
- deflate_align_bitstream(os);
1658
-
1659
- if (4 + (u32)len >= os->end - os->next) {
1660
- os->next = os->end;
1661
- return;
1662
- }
1663
-
1664
- put_unaligned_le16(len, os->next);
1665
- os->next += 2;
1666
- put_unaligned_le16(~len, os->next);
1667
- os->next += 2;
1668
- memcpy(os->next, data, len);
1669
- os->next += len;
1670
- }
1671
-
1672
- static void
1673
- deflate_write_uncompressed_blocks(struct deflate_output_bitstream *os,
1674
- const u8 *data, u32 data_length,
1675
- bool is_final_block)
1676
- {
1677
- do {
1678
- u16 len = MIN(data_length, UINT16_MAX);
1679
-
1680
- deflate_write_uncompressed_block(os, data, len,
1681
- is_final_block && len == data_length);
1682
- data += len;
1683
- data_length -= len;
1684
- } while (data_length != 0);
1685
- }
1686
-
1687
- /*
1688
- * Choose the best type of block to use (dynamic Huffman, static Huffman, or
1689
- * uncompressed), then output it.
1690
- */
1691
- static void
1692
- deflate_flush_block(struct libdeflate_compressor * restrict c,
1693
- struct deflate_output_bitstream * restrict os,
1694
- const u8 * restrict block_begin, u32 block_length,
1695
- bool is_final_block, bool use_item_list)
1696
- {
1697
- static const u8 deflate_extra_precode_bits[DEFLATE_NUM_PRECODE_SYMS] = {
1698
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 7,
1699
- };
1700
-
1701
- /* Costs are measured in bits */
1702
- u32 dynamic_cost = 0;
1703
- u32 static_cost = 0;
1704
- u32 uncompressed_cost = 0;
1705
- struct deflate_codes *codes;
1706
- int block_type;
1707
- unsigned sym;
1708
-
1709
- /* Tally the end-of-block symbol. */
1710
- c->freqs.litlen[DEFLATE_END_OF_BLOCK]++;
1711
-
1712
- /* Build dynamic Huffman codes. */
1713
- deflate_make_huffman_codes(&c->freqs, &c->codes);
1714
-
1715
- /* Account for the cost of sending dynamic Huffman codes. */
1716
- deflate_precompute_huffman_header(c);
1717
- dynamic_cost += 5 + 5 + 4 + (3 * c->num_explicit_lens);
1718
- for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) {
1719
- u32 extra = deflate_extra_precode_bits[sym];
1720
- dynamic_cost += c->precode_freqs[sym] *
1721
- (extra + c->precode_lens[sym]);
1722
- }
1723
-
1724
- /* Account for the cost of encoding literals. */
1725
- for (sym = 0; sym < 256; sym++) {
1726
- dynamic_cost += c->freqs.litlen[sym] *
1727
- c->codes.lens.litlen[sym];
1728
- }
1729
- for (sym = 0; sym < 144; sym++)
1730
- static_cost += c->freqs.litlen[sym] * 8;
1731
- for (; sym < 256; sym++)
1732
- static_cost += c->freqs.litlen[sym] * 9;
1733
-
1734
- /* Account for the cost of encoding the end-of-block symbol. */
1735
- dynamic_cost += c->codes.lens.litlen[256];
1736
- static_cost += 7;
1737
-
1738
- /* Account for the cost of encoding lengths. */
1739
- for (sym = 257; sym < 257 + ARRAY_LEN(deflate_extra_length_bits); sym++) {
1740
- u32 extra = deflate_extra_length_bits[sym - 257];
1741
- dynamic_cost += c->freqs.litlen[sym] *
1742
- (extra + c->codes.lens.litlen[sym]);
1743
- static_cost += c->freqs.litlen[sym] *
1744
- (extra + c->static_codes.lens.litlen[sym]);
1745
- }
1746
-
1747
- /* Account for the cost of encoding offsets. */
1748
- for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) {
1749
- u32 extra = deflate_extra_offset_bits[sym];
1750
- dynamic_cost += c->freqs.offset[sym] *
1751
- (extra + c->codes.lens.offset[sym]);
1752
- static_cost += c->freqs.offset[sym] * (extra + 5);
1753
- }
1754
-
1755
- /* Compute the cost of using uncompressed blocks. */
1756
- uncompressed_cost += (-(os->bitcount + 3) & 7) + 32 +
1757
- (40 * (DIV_ROUND_UP(block_length,
1758
- UINT16_MAX) - 1)) +
1759
- (8 * block_length);
1760
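/* [Editorial note] The expression above counts: the padding bits needed to
 * reach a byte boundary after the 3 block-header bits, the 32-bit LEN/NLEN
 * header of the first uncompressed block, a further 40 bits (5 bytes) of
 * header for each additional 65535-byte block that would be required, and
 * 8 bits per literal byte of data. */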
-
1761
- /* Choose the cheapest block type. */
1762
- if (dynamic_cost < MIN(static_cost, uncompressed_cost)) {
1763
- block_type = DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN;
1764
- codes = &c->codes;
1765
- } else if (static_cost < uncompressed_cost) {
1766
- block_type = DEFLATE_BLOCKTYPE_STATIC_HUFFMAN;
1767
- codes = &c->static_codes;
1768
- } else {
1769
- block_type = DEFLATE_BLOCKTYPE_UNCOMPRESSED;
1770
- }
1771
-
1772
- /* Now actually output the block. */
1773
-
1774
- if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) {
1775
- /* Note: the length being flushed may exceed the maximum length
1776
- * of an uncompressed block (65535 bytes). Therefore, more than
1777
- * one uncompressed block might be needed. */
1778
- deflate_write_uncompressed_blocks(os, block_begin, block_length,
1779
- is_final_block);
1780
- } else {
1781
- /* Output the block header. */
1782
- deflate_write_block_header(os, is_final_block, block_type);
1783
-
1784
- /* Output the Huffman codes (dynamic Huffman blocks only). */
1785
- if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN)
1786
- deflate_write_huffman_header(c, os);
1787
-
1788
- /* Output the literals, matches, and end-of-block symbol. */
1789
- #if SUPPORT_NEAR_OPTIMAL_PARSING
1790
- if (use_item_list)
1791
- deflate_write_item_list(os, codes, c, block_length);
1792
- else
1793
- #endif
1794
- deflate_write_sequences(os, codes, c->p.g.sequences,
1795
- block_begin);
1796
- deflate_write_end_of_block(os, codes);
1797
- }
1798
- }
1799
-
1800
- static forceinline void
1801
- deflate_choose_literal(struct libdeflate_compressor *c, unsigned literal,
1802
- u32 *litrunlen_p)
1803
- {
1804
- c->freqs.litlen[literal]++;
1805
- ++*litrunlen_p;
1806
- }
1807
-
1808
- static forceinline void
1809
- deflate_choose_match(struct libdeflate_compressor *c,
1810
- unsigned length, unsigned offset,
1811
- u32 *litrunlen_p, struct deflate_sequence **next_seq_p)
1812
- {
1813
- struct deflate_sequence *seq = *next_seq_p;
1814
- unsigned length_slot = deflate_length_slot[length];
1815
- unsigned offset_slot = deflate_get_offset_slot(c, offset);
1816
-
1817
- c->freqs.litlen[257 + length_slot]++;
1818
- c->freqs.offset[offset_slot]++;
1819
-
1820
- seq->litrunlen_and_length = ((u32)length << 23) | *litrunlen_p;
1821
- seq->offset = offset;
1822
- seq->length_slot = length_slot;
1823
- seq->offset_symbol = offset_slot;
1824
-
1825
- *litrunlen_p = 0;
1826
- *next_seq_p = seq + 1;
1827
- }
1828
-
1829
- static forceinline void
1830
- deflate_finish_sequence(struct deflate_sequence *seq, u32 litrunlen)
1831
- {
1832
- seq->litrunlen_and_length = litrunlen; /* length = 0 */
1833
- }
1834
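/*
 * [Editorial note] A deflate_sequence packs the literal-run length into the
 * low 23 bits of 'litrunlen_and_length' and the match length into the high
 * bits, which is why deflate_write_sequences() recovers them with '& 0x7FFFFF'
 * and '>> 23'. A length of 0, as set by deflate_finish_sequence(), marks the
 * final, match-less sequence of the block.
 */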
-
1835
- /******************************************************************************/
1836
-
1837
- /*
1838
- * Block splitting algorithm. The problem is to decide when it is worthwhile to
1839
- * start a new block with new Huffman codes. There is a theoretically optimal
1840
- * solution: recursively consider every possible block split, considering the
1841
- * exact cost of each block, and choose the minimum cost approach. But this is
1842
- * far too slow. Instead, as an approximation, we can count symbols and after
1843
- * every N symbols, compare the expected distribution of symbols based on the
1844
- * previous data with the actual distribution. If they differ "by enough", then
1845
- * start a new block.
1846
- *
1847
- * As an optimization and heuristic, we don't distinguish between every symbol
1848
- * but rather we combine many symbols into a single "observation type". For
1849
- * literals we only look at the high bits and low bits, and for matches we only
1850
- * look at whether the match is long or not. The assumption is that for typical
1851
- * "real" data, places that are good block boundaries will tend to be noticable
1852
- * based only on changes in these aggregate frequencies, without looking for
1853
- * subtle differences in individual symbols. For example, a change from ASCII
1854
- * bytes to non-ASCII bytes, or from few matches (generally less compressible)
1855
- * to many matches (generally more compressible), would be easily noticed based
1856
- * on the aggregates.
1857
- *
1858
- * For determining whether the frequency distributions are "different enough" to
1859
- * start a new block, the simple heuristic of splitting when the sum of absolute
1860
- * differences exceeds a constant seems to be good enough. We also add a number
1861
- * proportional to the block length so that the algorithm is more likely to end
1862
- * long blocks than short blocks. This reflects the general expectation that it
1863
- * will become increasingly beneficial to start a new block as the current
1864
- * block grows longer.
1865
- *
1866
- * Finally, for an approximation, it is not strictly necessary that the exact
1867
- * symbols being used are considered. With "near-optimal parsing", for example,
1868
- * the actual symbols that will be used are unknown until after the block
1869
- * boundary is chosen and the block has been optimized. Since the final choices
1870
- * cannot be used, we can use preliminary "greedy" choices instead.
1871
- */
1872
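/*
 * [Editorial sketch, not part of libdeflate] A stripped-down illustration of
 * the comparison performed by do_end_block_check() below. To compare the old
 * distribution observations[i] / num_observations against the new one
 * new_observations[i] / num_new_observations without dividing, both sides are
 * cross-multiplied; the resulting sum of absolute differences is then compared
 * against a threshold that is scaled by num_observations in the same way. The
 * threshold used here (one quarter of the new observations) is hypothetical.
 */
static int
illustrate_distributions_differ(const unsigned *observations,
				unsigned num_observations,
				const unsigned *new_observations,
				unsigned num_new_observations,
				unsigned num_types)
{
	unsigned long total_delta = 0;
	unsigned i;

	for (i = 0; i < num_types; i++) {
		unsigned long expected =
			(unsigned long)observations[i] * num_new_observations;
		unsigned long actual =
			(unsigned long)new_observations[i] * num_observations;

		total_delta += (actual > expected) ? actual - expected
						   : expected - actual;
	}

	/* Hypothetical threshold: report a split when the scaled delta exceeds
	 * one quarter of the (equally scaled) number of new observations. */
	return total_delta >= ((unsigned long)num_new_observations / 4) *
			      num_observations;
}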
-
1873
- /* Initialize the block split statistics when starting a new block. */
1874
- static void
1875
- init_block_split_stats(struct block_split_stats *stats)
1876
- {
1877
- int i;
1878
-
1879
- for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
1880
- stats->new_observations[i] = 0;
1881
- stats->observations[i] = 0;
1882
- }
1883
- stats->num_new_observations = 0;
1884
- stats->num_observations = 0;
1885
- }
1886
-
1887
- /* Literal observation. Heuristic: use the top 2 bits and low 1 bit of the
1888
- * literal, for 8 possible literal observation types. */
1889
- static forceinline void
1890
- observe_literal(struct block_split_stats *stats, u8 lit)
1891
- {
1892
- stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++;
1893
- stats->num_new_observations++;
1894
- }
1895
-
1896
- /* Match observation. Heuristic: use one observation type for "short match" and
1897
- * one observation type for "long match". */
1898
- static forceinline void
1899
- observe_match(struct block_split_stats *stats, unsigned length)
1900
- {
1901
- stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + (length >= 9)]++;
1902
- stats->num_new_observations++;
1903
- }
1904
-
1905
- static bool
1906
- do_end_block_check(struct block_split_stats *stats, u32 block_length)
1907
- {
1908
- int i;
1909
-
1910
- if (stats->num_observations > 0) {
1911
-
1912
- /* Note: to avoid slow divisions, we do not divide by
1913
- * 'num_observations', but rather do all math with the numbers
1914
- * multiplied by 'num_observations'. */
1915
- u32 total_delta = 0;
1916
- for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
1917
- u32 expected = stats->observations[i] * stats->num_new_observations;
1918
- u32 actual = stats->new_observations[i] * stats->num_observations;
1919
- u32 delta = (actual > expected) ? actual - expected :
1920
- expected - actual;
1921
- total_delta += delta;
1922
- }
1923
-
1924
- /* Ready to end the block? */
1925
- if (total_delta + (block_length / 4096) * stats->num_observations >=
1926
- NUM_OBSERVATIONS_PER_BLOCK_CHECK * 200 / 512 * stats->num_observations)
1927
- return true;
1928
- }
1929
-
1930
- for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
1931
- stats->num_observations += stats->new_observations[i];
1932
- stats->observations[i] += stats->new_observations[i];
1933
- stats->new_observations[i] = 0;
1934
- }
1935
- stats->num_new_observations = 0;
1936
- return false;
1937
- }
1938
-
1939
- static forceinline bool
1940
- should_end_block(struct block_split_stats *stats,
1941
- const u8 *in_block_begin, const u8 *in_next, const u8 *in_end)
1942
- {
1943
- /* Ready to check block split statistics? */
1944
- if (stats->num_new_observations < NUM_OBSERVATIONS_PER_BLOCK_CHECK ||
1945
- in_next - in_block_begin < MIN_BLOCK_LENGTH ||
1946
- in_end - in_next < MIN_BLOCK_LENGTH)
1947
- return false;
1948
-
1949
- return do_end_block_check(stats, in_next - in_block_begin);
1950
- }
1951
-
1952
- /******************************************************************************/
1953
-
1954
- /*
1955
- * This is the "greedy" DEFLATE compressor. It always chooses the longest match.
1956
- */
1957
- static size_t
1958
- deflate_compress_greedy(struct libdeflate_compressor * restrict c,
1959
- const u8 * restrict in, size_t in_nbytes,
1960
- u8 * restrict out, size_t out_nbytes_avail)
1961
- {
1962
- const u8 *in_next = in;
1963
- const u8 *in_end = in_next + in_nbytes;
1964
- struct deflate_output_bitstream os;
1965
- const u8 *in_cur_base = in_next;
1966
- unsigned max_len = DEFLATE_MAX_MATCH_LEN;
1967
- unsigned nice_len = MIN(c->nice_match_length, max_len);
1968
- u32 next_hashes[2] = {0, 0};
1969
-
1970
- deflate_init_output(&os, out, out_nbytes_avail);
1971
- hc_matchfinder_init(&c->p.g.hc_mf);
1972
-
1973
- do {
1974
- /* Starting a new DEFLATE block. */
1975
-
1976
- const u8 * const in_block_begin = in_next;
1977
- const u8 * const in_max_block_end =
1978
- in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
1979
- u32 litrunlen = 0;
1980
- struct deflate_sequence *next_seq = c->p.g.sequences;
1981
-
1982
- init_block_split_stats(&c->split_stats);
1983
- deflate_reset_symbol_frequencies(c);
1984
-
1985
- do {
1986
- u32 length;
1987
- u32 offset;
1988
-
1989
- /* Decrease the maximum and nice match lengths if we're
1990
- * approaching the end of the input buffer. */
1991
- if (unlikely(max_len > in_end - in_next)) {
1992
- max_len = in_end - in_next;
1993
- nice_len = MIN(nice_len, max_len);
1994
- }
1995
-
1996
- length = hc_matchfinder_longest_match(&c->p.g.hc_mf,
1997
- &in_cur_base,
1998
- in_next,
1999
- DEFLATE_MIN_MATCH_LEN - 1,
2000
- max_len,
2001
- nice_len,
2002
- c->max_search_depth,
2003
- next_hashes,
2004
- &offset);
2005
-
2006
- if (length >= DEFLATE_MIN_MATCH_LEN) {
2007
- /* Match found. */
2008
- deflate_choose_match(c, length, offset,
2009
- &litrunlen, &next_seq);
2010
- observe_match(&c->split_stats, length);
2011
- in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf,
2012
- &in_cur_base,
2013
- in_next + 1,
2014
- in_end,
2015
- length - 1,
2016
- next_hashes);
2017
- } else {
2018
- /* No match found. */
2019
- deflate_choose_literal(c, *in_next, &litrunlen);
2020
- observe_literal(&c->split_stats, *in_next);
2021
- in_next++;
2022
- }
2023
-
2024
- /* Check if it's time to output another block. */
2025
- } while (in_next < in_max_block_end &&
2026
- !should_end_block(&c->split_stats, in_block_begin, in_next, in_end));
2027
-
2028
- deflate_finish_sequence(next_seq, litrunlen);
2029
- deflate_flush_block(c, &os, in_block_begin,
2030
- in_next - in_block_begin,
2031
- in_next == in_end, false);
2032
- } while (in_next != in_end);
2033
-
2034
- return deflate_flush_output(&os);
2035
- }
2036
-
2037
- /*
2038
- * This is the "lazy" DEFLATE compressor. Before choosing a match, it checks to
2039
- * see if there's a longer match at the next position. If yes, it outputs a
2040
- * literal and continues to the next position. If no, it outputs the match.
2041
- */
2042
- static size_t
2043
- deflate_compress_lazy(struct libdeflate_compressor * restrict c,
2044
- const u8 * restrict in, size_t in_nbytes,
2045
- u8 * restrict out, size_t out_nbytes_avail)
2046
- {
2047
- const u8 *in_next = in;
2048
- const u8 *in_end = in_next + in_nbytes;
2049
- struct deflate_output_bitstream os;
2050
- const u8 *in_cur_base = in_next;
2051
- unsigned max_len = DEFLATE_MAX_MATCH_LEN;
2052
- unsigned nice_len = MIN(c->nice_match_length, max_len);
2053
- u32 next_hashes[2] = {0, 0};
2054
-
2055
- deflate_init_output(&os, out, out_nbytes_avail);
2056
- hc_matchfinder_init(&c->p.g.hc_mf);
2057
-
2058
- do {
2059
- /* Starting a new DEFLATE block. */
2060
-
2061
- const u8 * const in_block_begin = in_next;
2062
- const u8 * const in_max_block_end =
2063
- in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
2064
- u32 litrunlen = 0;
2065
- struct deflate_sequence *next_seq = c->p.g.sequences;
2066
-
2067
- init_block_split_stats(&c->split_stats);
2068
- deflate_reset_symbol_frequencies(c);
2069
-
2070
- do {
2071
- unsigned cur_len;
2072
- unsigned cur_offset;
2073
- unsigned next_len;
2074
- unsigned next_offset;
2075
-
2076
- if (unlikely(in_end - in_next < DEFLATE_MAX_MATCH_LEN)) {
2077
- max_len = in_end - in_next;
2078
- nice_len = MIN(nice_len, max_len);
2079
- }
2080
-
2081
- /* Find the longest match at the current position. */
2082
- cur_len = hc_matchfinder_longest_match(&c->p.g.hc_mf,
2083
- &in_cur_base,
2084
- in_next,
2085
- DEFLATE_MIN_MATCH_LEN - 1,
2086
- max_len,
2087
- nice_len,
2088
- c->max_search_depth,
2089
- next_hashes,
2090
- &cur_offset);
2091
- in_next += 1;
2092
-
2093
- if (cur_len < DEFLATE_MIN_MATCH_LEN) {
2094
- /* No match found. Choose a literal. */
2095
- deflate_choose_literal(c, *(in_next - 1), &litrunlen);
2096
- observe_literal(&c->split_stats, *(in_next - 1));
2097
- continue;
2098
- }
2099
-
2100
- have_cur_match:
2101
- observe_match(&c->split_stats, cur_len);
2102
-
2103
- /* We have a match at the current position. */
2104
-
2105
- /* If the current match is very long, choose it
2106
- * immediately. */
2107
- if (cur_len >= nice_len) {
2108
- deflate_choose_match(c, cur_len, cur_offset,
2109
- &litrunlen, &next_seq);
2110
- in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf,
2111
- &in_cur_base,
2112
- in_next,
2113
- in_end,
2114
- cur_len - 1,
2115
- next_hashes);
2116
- continue;
2117
- }
2118
-
2119
- /*
2120
- * Try to find a match at the next position.
2121
- *
2122
- * Note: since we already have a match at the *current*
2123
- * position, we use only half the 'max_search_depth'
2124
- * when checking the *next* position. This is a useful
2125
- * trade-off because it's more worthwhile to use a
2126
- * greater search depth on the initial match.
2127
- *
2128
- * Note: it's possible to structure the code such that
2129
- * there's only one call to longest_match(), which
2130
- * handles both the "find the initial match" and "try to
2131
- * find a longer match" cases. However, it is faster to
2132
- * have two call sites, with longest_match() inlined at
2133
- * each.
2134
- */
2135
- if (unlikely(in_end - in_next < DEFLATE_MAX_MATCH_LEN)) {
2136
- max_len = in_end - in_next;
2137
- nice_len = MIN(nice_len, max_len);
2138
- }
2139
- next_len = hc_matchfinder_longest_match(&c->p.g.hc_mf,
2140
- &in_cur_base,
2141
- in_next,
2142
- cur_len,
2143
- max_len,
2144
- nice_len,
2145
- c->max_search_depth / 2,
2146
- next_hashes,
2147
- &next_offset);
2148
- in_next += 1;
2149
-
2150
- if (next_len > cur_len) {
2151
- /* Found a longer match at the next position.
2152
- * Output a literal. Then the next match
2153
- * becomes the current match. */
2154
- deflate_choose_literal(c, *(in_next - 2), &litrunlen);
2155
- cur_len = next_len;
2156
- cur_offset = next_offset;
2157
- goto have_cur_match;
2158
- }
2159
-
2160
- /* No longer match at the next position.
2161
- * Output the current match. */
2162
- deflate_choose_match(c, cur_len, cur_offset,
2163
- &litrunlen, &next_seq);
2164
- in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf,
2165
- &in_cur_base,
2166
- in_next,
2167
- in_end,
2168
- cur_len - 2,
2169
- next_hashes);
2170
-
2171
- /* Check if it's time to output another block. */
2172
- } while (in_next < in_max_block_end &&
2173
- !should_end_block(&c->split_stats, in_block_begin, in_next, in_end));
2174
-
2175
- deflate_finish_sequence(next_seq, litrunlen);
2176
- deflate_flush_block(c, &os, in_block_begin,
2177
- in_next - in_block_begin,
2178
- in_next == in_end, false);
2179
- } while (in_next != in_end);
2180
-
2181
- return deflate_flush_output(&os);
2182
- }
2183
-
2184
- #if SUPPORT_NEAR_OPTIMAL_PARSING
2185
-
2186
- /*
2187
- * Follow the minimum-cost path in the graph of possible match/literal choices
2188
- * for the current block and compute the frequencies of the Huffman symbols that
2189
- * would be needed to output those matches and literals.
2190
- */
2191
- static void
2192
- deflate_tally_item_list(struct libdeflate_compressor *c, u32 block_length)
2193
- {
2194
- struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0];
2195
- struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length];
2196
- do {
2197
- unsigned length = cur_node->item & OPTIMUM_LEN_MASK;
2198
- unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
2199
-
2200
- if (length == 1) {
2201
- /* Literal */
2202
- c->freqs.litlen[offset]++;
2203
- } else {
2204
- /* Match */
2205
- c->freqs.litlen[257 + deflate_length_slot[length]]++;
2206
- c->freqs.offset[deflate_get_offset_slot(c, offset)]++;
2207
- }
2208
- cur_node += length;
2209
- } while (cur_node != end_node);
2210
- }
2211
-
2212
- /* Set the current cost model from the codeword lengths specified in @lens. */
2213
- static void
2214
- deflate_set_costs_from_codes(struct libdeflate_compressor *c,
2215
- const struct deflate_lens *lens)
2216
- {
2217
- unsigned i;
2218
-
2219
- /* Literals */
2220
- for (i = 0; i < DEFLATE_NUM_LITERALS; i++) {
2221
- u32 bits = (lens->litlen[i] ? lens->litlen[i] : LITERAL_NOSTAT_BITS);
2222
- c->p.n.costs.literal[i] = bits << COST_SHIFT;
2223
- }
2224
-
2225
- /* Lengths */
2226
- for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) {
2227
- unsigned length_slot = deflate_length_slot[i];
2228
- unsigned litlen_sym = 257 + length_slot;
2229
- u32 bits = (lens->litlen[litlen_sym] ? lens->litlen[litlen_sym] : LENGTH_NOSTAT_BITS);
2230
- bits += deflate_extra_length_bits[length_slot];
2231
- c->p.n.costs.length[i] = bits << COST_SHIFT;
2232
- }
2233
-
2234
- /* Offset slots */
2235
- for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) {
2236
- u32 bits = (lens->offset[i] ? lens->offset[i] : OFFSET_NOSTAT_BITS);
2237
- bits += deflate_extra_offset_bits[i];
2238
- c->p.n.costs.offset_slot[i] = bits << COST_SHIFT;
2239
- }
2240
- }
2241
-
2242
- static forceinline u32
2243
- deflate_default_literal_cost(unsigned literal)
2244
- {
2245
- STATIC_ASSERT(COST_SHIFT == 3);
2246
- /* 66 is 8.25 bits/symbol */
2247
- return 66;
2248
- }
2249
-
2250
- static forceinline u32
2251
- deflate_default_length_slot_cost(unsigned length_slot)
2252
- {
2253
- STATIC_ASSERT(COST_SHIFT == 3);
2254
- /* 60 is 7.5 bits/symbol */
2255
- return 60 + ((u32)deflate_extra_length_bits[length_slot] << COST_SHIFT);
2256
- }
2257
-
2258
- static forceinline u32
2259
- deflate_default_offset_slot_cost(unsigned offset_slot)
2260
- {
2261
- STATIC_ASSERT(COST_SHIFT == 3);
2262
- /* 39 is 4.875 bits/symbol */
2263
- return 39 + ((u32)deflate_extra_offset_bits[offset_slot] << COST_SHIFT);
2264
- }
2265
-
2266
- /*
2267
- * Set default symbol costs for the first block's first optimization pass.
2268
- *
2269
- * It works well to assume that each symbol is equally probable. This results
2270
- * in each symbol being assigned a cost of (-log2(1.0/num_syms) * (1 <<
2271
- * COST_SHIFT)) where 'num_syms' is the number of symbols in the corresponding
2272
- * alphabet. However, we intentionally bias the parse towards matches rather
2273
- * than literals by using a slightly lower default cost for length symbols than
2274
- * for literals. This often improves the compression ratio slightly.
2275
- */
2276
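/*
 * [Editorial note] With COST_SHIFT == 3, costs are stored in units of 1/8 bit.
 * For the 288-symbol literal/length alphabet the equal-probability cost would
 * be about log2(288) ~= 8.17 bits, i.e. roughly 65 in these units; literals
 * default to 66 (8.25 bits), while length symbols get 60 (7.5 bits) to nudge
 * the parse towards matches. Offset slots default to 39 (4.875 bits), close to
 * log2 of the roughly 30-slot offset alphabet.
 */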
- static void
2277
- deflate_set_default_costs(struct libdeflate_compressor *c)
2278
- {
2279
- unsigned i;
2280
-
2281
- /* Literals */
2282
- for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
2283
- c->p.n.costs.literal[i] = deflate_default_literal_cost(i);
2284
-
2285
- /* Lengths */
2286
- for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++)
2287
- c->p.n.costs.length[i] = deflate_default_length_slot_cost(
2288
- deflate_length_slot[i]);
2289
-
2290
- /* Offset slots */
2291
- for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++)
2292
- c->p.n.costs.offset_slot[i] = deflate_default_offset_slot_cost(i);
2293
- }
2294
-
2295
- static forceinline void
2296
- deflate_adjust_cost(u32 *cost_p, u32 default_cost)
2297
- {
2298
- *cost_p += ((s32)default_cost - (s32)*cost_p) >> 1;
2299
- }
2300
-
2301
- /*
2302
- * Adjust the costs when beginning a new block.
2303
- *
2304
- * Since the current costs have been optimized for the data, it's undesirable to
2305
- * throw them away and start over with the default costs. At the same time, we
2306
- * don't want to bias the parse by assuming that the next block will be similar
2307
- * to the current block. As a compromise, make the costs closer to the
2308
- * defaults, but don't simply set them to the defaults.
2309
- */
2310
- static void
2311
- deflate_adjust_costs(struct libdeflate_compressor *c)
2312
- {
2313
- unsigned i;
2314
-
2315
- /* Literals */
2316
- for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
2317
- deflate_adjust_cost(&c->p.n.costs.literal[i],
2318
- deflate_default_literal_cost(i));
2319
-
2320
- /* Lengths */
2321
- for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++)
2322
- deflate_adjust_cost(&c->p.n.costs.length[i],
2323
- deflate_default_length_slot_cost(
2324
- deflate_length_slot[i]));
2325
-
2326
- /* Offset slots */
2327
- for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++)
2328
- deflate_adjust_cost(&c->p.n.costs.offset_slot[i],
2329
- deflate_default_offset_slot_cost(i));
2330
- }
2331
-
2332
- /*
2333
- * Find the minimum-cost path through the graph of possible match/literal
2334
- * choices for this block.
2335
- *
2336
- * We find the minimum cost path from 'c->p.n.optimum_nodes[0]', which
2337
- * represents the node at the beginning of the block, to
2338
- * 'c->p.n.optimum_nodes[block_length]', which represents the node at the end of
2339
- * the block. Edge costs are evaluated using the cost model 'c->p.n.costs'.
2340
- *
2341
- * The algorithm works backwards, starting at the end node and proceeding
2342
- * backwards one node at a time. At each node, the minimum cost to reach the
2343
- * end node is computed and the match/literal choice that begins that path is
2344
- * saved.
2345
- */
2346
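/*
 * [Editorial sketch, not part of libdeflate] The backward dynamic program used
 * by deflate_find_min_cost_path(), reduced to a toy setting: every position
 * has a literal cost and, optionally, a single match with a given length and
 * cost. cost_to_end[i] is the cheapest way to encode positions i..n-1; the
 * cost_to_end array must have n+1 entries. All names and the data layout here
 * are hypothetical.
 */
struct toy_pos {
	unsigned literal_cost;	/* cost of coding this byte as a literal */
	unsigned match_len;	/* 0 if no match starts here */
	unsigned match_cost;	/* cost of that match (symbol + extra bits) */
};

static void
illustrate_min_cost_path(const struct toy_pos *pos, unsigned n,
			 unsigned *cost_to_end, unsigned *choice_len)
{
	unsigned i;

	cost_to_end[n] = 0;
	for (i = n; i-- > 0; ) {
		/* Coding the byte as a literal is always possible. */
		unsigned best = pos[i].literal_cost + cost_to_end[i + 1];

		choice_len[i] = 1;

		/* A match, if present and fully inside the block, may be cheaper. */
		if (pos[i].match_len >= 3 && i + pos[i].match_len <= n) {
			unsigned cost = pos[i].match_cost +
					cost_to_end[i + pos[i].match_len];
			if (cost < best) {
				best = cost;
				choice_len[i] = pos[i].match_len;
			}
		}
		cost_to_end[i] = best;
	}
	/* The chosen items are then read forward: i += choice_len[i]. */
}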
- static void
2347
- deflate_find_min_cost_path(struct libdeflate_compressor *c,
2348
- const u32 block_length,
2349
- const struct lz_match *cache_ptr)
2350
- {
2351
- struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length];
2352
- struct deflate_optimum_node *cur_node = end_node;
2353
-
2354
- cur_node->cost_to_end = 0;
2355
- do {
2356
- unsigned num_matches;
2357
- unsigned literal;
2358
- u32 best_cost_to_end;
2359
-
2360
- cur_node--;
2361
- cache_ptr--;
2362
-
2363
- num_matches = cache_ptr->length;
2364
- literal = cache_ptr->offset;
2365
-
2366
- /* It's always possible to choose a literal. */
2367
- best_cost_to_end = c->p.n.costs.literal[literal] +
2368
- (cur_node + 1)->cost_to_end;
2369
- cur_node->item = ((u32)literal << OPTIMUM_OFFSET_SHIFT) | 1;
2370
-
2371
- /* Also consider matches if there are any. */
2372
- if (num_matches) {
2373
- const struct lz_match *match;
2374
- unsigned len;
2375
- unsigned offset;
2376
- unsigned offset_slot;
2377
- u32 offset_cost;
2378
- u32 cost_to_end;
2379
-
2380
- /*
2381
- * Consider each length from the minimum
2382
- * (DEFLATE_MIN_MATCH_LEN) to the length of the longest
2383
- * match found at this position. For each length, we
2384
- * consider only the smallest offset for which that
2385
- * length is available. Although this is not guaranteed
2386
- * to be optimal due to the possibility of a larger
2387
- * offset costing less than a smaller offset to code,
2388
- * this is a very useful heuristic.
2389
- */
2390
- match = cache_ptr - num_matches;
2391
- len = DEFLATE_MIN_MATCH_LEN;
2392
- do {
2393
- offset = match->offset;
2394
- offset_slot = deflate_get_offset_slot(c, offset);
2395
- offset_cost = c->p.n.costs.offset_slot[offset_slot];
2396
- do {
2397
- cost_to_end = offset_cost +
2398
- c->p.n.costs.length[len] +
2399
- (cur_node + len)->cost_to_end;
2400
- if (cost_to_end < best_cost_to_end) {
2401
- best_cost_to_end = cost_to_end;
2402
- cur_node->item = ((u32)offset << OPTIMUM_OFFSET_SHIFT) | len;
2403
- }
2404
- } while (++len <= match->length);
2405
- } while (++match != cache_ptr);
2406
- cache_ptr -= num_matches;
2407
- }
2408
- cur_node->cost_to_end = best_cost_to_end;
2409
- } while (cur_node != &c->p.n.optimum_nodes[0]);
2410
- }
2411
-
2412
- /*
2413
- * Choose the literal/match sequence to use for the current block. The basic
2414
- * algorithm finds a minimum-cost path through the block's graph of
2415
- * literal/match choices, given a cost model. However, the cost of each symbol
2416
- * is unknown until the Huffman codes have been built, but at the same time the
2417
- * Huffman codes depend on the frequencies of chosen symbols. Consequently,
2418
- * multiple passes must be used to try to approximate an optimal solution. The
2419
- * first pass uses default costs, mixed with the costs from the previous block
2420
- * if any. Later passes use the Huffman codeword lengths from the previous pass
2421
- * as the costs.
2422
- */
2423
- static void
2424
- deflate_optimize_block(struct libdeflate_compressor *c, u32 block_length,
2425
- const struct lz_match *cache_ptr, bool is_first_block)
2426
- {
2427
- unsigned num_passes_remaining = c->p.n.num_optim_passes;
2428
- u32 i;
2429
-
2430
- /* Force the block to really end at the desired length, even if some
2431
- * matches extend beyond it. */
2432
- for (i = block_length; i <= MIN(block_length - 1 + DEFLATE_MAX_MATCH_LEN,
2433
- ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++)
2434
- c->p.n.optimum_nodes[i].cost_to_end = 0x80000000;
2435
-
2436
- /* Set the initial costs. */
2437
- if (is_first_block)
2438
- deflate_set_default_costs(c);
2439
- else
2440
- deflate_adjust_costs(c);
2441
-
2442
- for (;;) {
2443
- /* Find the minimum cost path for this pass. */
2444
- deflate_find_min_cost_path(c, block_length, cache_ptr);
2445
-
2446
- /* Compute frequencies of the chosen symbols. */
2447
- deflate_reset_symbol_frequencies(c);
2448
- deflate_tally_item_list(c, block_length);
2449
-
2450
- if (--num_passes_remaining == 0)
2451
- break;
2452
-
2453
- /* At least one optimization pass remains; update the costs. */
2454
- deflate_make_huffman_codes(&c->freqs, &c->codes);
2455
- deflate_set_costs_from_codes(c, &c->codes.lens);
2456
- }
2457
- }
2458
-
2459
- /*
2460
- * This is the "near-optimal" DEFLATE compressor. It computes the optimal
2461
- * representation of each DEFLATE block using a minimum-cost path search over
2462
- * the graph of possible match/literal choices for that block, assuming a
2463
- * certain cost for each Huffman symbol.
2464
- *
2465
- * For several reasons, the end result is not guaranteed to be optimal:
2466
- *
2467
- * - Nonoptimal choice of blocks
2468
- * - Heuristic limitations on which matches are actually considered
2469
- * - Symbol costs are unknown until the symbols have already been chosen
2470
- * (so iterative optimization must be used)
2471
- */
2472
- static size_t
2473
- deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
2474
- const u8 * restrict in, size_t in_nbytes,
2475
- u8 * restrict out, size_t out_nbytes_avail)
2476
- {
2477
- const u8 *in_next = in;
2478
- const u8 *in_end = in_next + in_nbytes;
2479
- struct deflate_output_bitstream os;
2480
- const u8 *in_cur_base = in_next;
2481
- const u8 *in_next_slide = in_next + MIN(in_end - in_next, MATCHFINDER_WINDOW_SIZE);
2482
- unsigned max_len = DEFLATE_MAX_MATCH_LEN;
2483
- unsigned nice_len = MIN(c->nice_match_length, max_len);
2484
- u32 next_hashes[2] = {0, 0};
2485
-
2486
- deflate_init_output(&os, out, out_nbytes_avail);
2487
- bt_matchfinder_init(&c->p.n.bt_mf);
2488
-
2489
- do {
2490
- /* Starting a new DEFLATE block. */
2491
-
2492
- struct lz_match *cache_ptr = c->p.n.match_cache;
2493
- const u8 * const in_block_begin = in_next;
2494
- const u8 * const in_max_block_end =
2495
- in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
2496
- const u8 *next_observation = in_next;
2497
-
2498
- init_block_split_stats(&c->split_stats);
2499
-
2500
- /*
2501
- * Find matches until we decide to end the block. We end the
2502
- * block if any of the following is true:
2503
- *
2504
- * (1) Maximum block length has been reached
2505
- * (2) Match cache may overflow.
2506
- * (3) Block split heuristic says to split now.
2507
- */
2508
- do {
2509
- struct lz_match *matches;
2510
- unsigned best_len;
2511
-
2512
- /* Slide the window forward if needed. */
2513
- if (in_next == in_next_slide) {
2514
- bt_matchfinder_slide_window(&c->p.n.bt_mf);
2515
- in_cur_base = in_next;
2516
- in_next_slide = in_next + MIN(in_end - in_next,
2517
- MATCHFINDER_WINDOW_SIZE);
2518
- }
2519
-
2520
- /* Decrease the maximum and nice match lengths if we're
2521
- * approaching the end of the input buffer. */
2522
- if (unlikely(max_len > in_end - in_next)) {
2523
- max_len = in_end - in_next;
2524
- nice_len = MIN(nice_len, max_len);
2525
- }
2526
-
2527
- /*
2528
- * Find matches with the current position using the
2529
- * binary tree matchfinder and save them in
2530
- * 'match_cache'.
2531
- *
2532
- * Note: the binary tree matchfinder is more suited for
2533
- * optimal parsing than the hash chain matchfinder. The
2534
- * reasons for this include:
2535
- *
2536
- * - The binary tree matchfinder can find more matches
2537
- * in the same number of steps.
2538
- * - One of the major advantages of hash chains is that
2539
- * skipping positions (not searching for matches at
2540
- * them) is faster; however, with optimal parsing we
2541
- * search for matches at almost all positions, so this
2542
- * advantage of hash chains is negated.
2543
- */
2544
- matches = cache_ptr;
2545
- best_len = 0;
2546
- if (likely(max_len >= BT_MATCHFINDER_REQUIRED_NBYTES)) {
2547
- cache_ptr = bt_matchfinder_get_matches(&c->p.n.bt_mf,
2548
- in_cur_base,
2549
- in_next - in_cur_base,
2550
- max_len,
2551
- nice_len,
2552
- c->max_search_depth,
2553
- next_hashes,
2554
- &best_len,
2555
- matches);
2556
- }
2557
-
2558
- if (in_next >= next_observation) {
2559
- if (best_len >= 4) {
2560
- observe_match(&c->split_stats, best_len);
2561
- next_observation = in_next + best_len;
2562
- } else {
2563
- observe_literal(&c->split_stats, *in_next);
2564
- next_observation = in_next + 1;
2565
- }
2566
- }
2567
-
2568
- cache_ptr->length = cache_ptr - matches;
2569
- cache_ptr->offset = *in_next;
2570
- in_next++;
2571
- cache_ptr++;
2572
-
2573
- /*
2574
- * If there was a very long match found, don't cache any
2575
- * matches for the bytes covered by that match. This
2576
- * avoids degenerate behavior when compressing highly
2577
- * redundant data, where the number of matches can be
2578
- * very large.
2579
- *
2580
- * This heuristic doesn't actually hurt the compression
2581
- * ratio very much. If there's a long match, then the
2582
- * data must be highly compressible, so it doesn't
2583
- * matter much what we do.
2584
- */
2585
- if (best_len >= DEFLATE_MIN_MATCH_LEN && best_len >= nice_len) {
2586
- --best_len;
2587
- do {
2588
- if (in_next == in_next_slide) {
2589
- bt_matchfinder_slide_window(&c->p.n.bt_mf);
2590
- in_cur_base = in_next;
2591
- in_next_slide = in_next + MIN(in_end - in_next,
2592
- MATCHFINDER_WINDOW_SIZE);
2593
- }
2594
- if (unlikely(max_len > in_end - in_next)) {
2595
- max_len = in_end - in_next;
2596
- nice_len = MIN(nice_len, max_len);
2597
- }
2598
- if (max_len >= BT_MATCHFINDER_REQUIRED_NBYTES) {
2599
- bt_matchfinder_skip_position(&c->p.n.bt_mf,
2600
- in_cur_base,
2601
- in_next - in_cur_base,
2602
- nice_len,
2603
- c->max_search_depth,
2604
- next_hashes);
2605
- }
2606
- cache_ptr->length = 0;
2607
- cache_ptr->offset = *in_next;
2608
- in_next++;
2609
- cache_ptr++;
2610
- } while (--best_len);
2611
- }
2612
- } while (in_next < in_max_block_end &&
2613
- cache_ptr < &c->p.n.match_cache[CACHE_LENGTH] &&
2614
- !should_end_block(&c->split_stats, in_block_begin, in_next, in_end));
2615
-
2616
- /* All the matches for this block have been cached. Now choose
2617
- * the sequence of items to output and flush the block. */
2618
- deflate_optimize_block(c, in_next - in_block_begin, cache_ptr,
2619
- in_block_begin == in);
2620
- deflate_flush_block(c, &os, in_block_begin, in_next - in_block_begin,
2621
- in_next == in_end, true);
2622
- } while (in_next != in_end);
2623
-
2624
- return deflate_flush_output(&os);
2625
- }
2626
-
2627
- #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
2628
-
2629
- /* Initialize c->offset_slot_fast. */
2630
- static void
2631
- deflate_init_offset_slot_fast(struct libdeflate_compressor *c)
2632
- {
2633
- unsigned offset_slot;
2634
- unsigned offset;
2635
- unsigned offset_end;
2636
-
2637
- for (offset_slot = 0;
2638
- offset_slot < ARRAY_LEN(deflate_offset_slot_base);
2639
- offset_slot++)
2640
- {
2641
- offset = deflate_offset_slot_base[offset_slot];
2642
- #if USE_FULL_OFFSET_SLOT_FAST
2643
- offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
2644
- do {
2645
- c->offset_slot_fast[offset] = offset_slot;
2646
- } while (++offset != offset_end);
2647
- #else
2648
- if (offset <= 256) {
2649
- offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
2650
- do {
2651
- c->offset_slot_fast[offset - 1] = offset_slot;
2652
- } while (++offset != offset_end);
2653
- } else {
2654
- offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
2655
- do {
2656
- c->offset_slot_fast[256 + ((offset - 1) >> 7)] = offset_slot;
2657
- } while ((offset += (1 << 7)) != offset_end);
2658
- }
2659
- #endif
2660
- }
2661
- }
2662
-
2663
- LIBDEFLATEAPI struct libdeflate_compressor *
2664
- libdeflate_alloc_compressor(int compression_level)
2665
- {
2666
- struct libdeflate_compressor *c;
2667
- size_t size;
2668
-
2669
- #if SUPPORT_NEAR_OPTIMAL_PARSING
2670
- if (compression_level >= 8)
2671
- size = offsetof(struct libdeflate_compressor, p) + sizeof(c->p.n);
2672
- else
2673
- #endif
2674
- size = offsetof(struct libdeflate_compressor, p) + sizeof(c->p.g);
2675
-
2676
- c = aligned_malloc(MATCHFINDER_ALIGNMENT, size);
2677
- if (!c)
2678
- return NULL;
2679
-
2680
- switch (compression_level) {
2681
- case 1:
2682
- c->impl = deflate_compress_greedy;
2683
- c->max_search_depth = 2;
2684
- c->nice_match_length = 8;
2685
- break;
2686
- case 2:
2687
- c->impl = deflate_compress_greedy;
2688
- c->max_search_depth = 6;
2689
- c->nice_match_length = 10;
2690
- break;
2691
- case 3:
2692
- c->impl = deflate_compress_greedy;
2693
- c->max_search_depth = 12;
2694
- c->nice_match_length = 14;
2695
- break;
2696
- case 4:
2697
- c->impl = deflate_compress_greedy;
2698
- c->max_search_depth = 24;
2699
- c->nice_match_length = 24;
2700
- break;
2701
- case 5:
2702
- c->impl = deflate_compress_lazy;
2703
- c->max_search_depth = 20;
2704
- c->nice_match_length = 30;
2705
- break;
2706
- case 6:
2707
- c->impl = deflate_compress_lazy;
2708
- c->max_search_depth = 40;
2709
- c->nice_match_length = 65;
2710
- break;
2711
- case 7:
2712
- c->impl = deflate_compress_lazy;
2713
- c->max_search_depth = 100;
2714
- c->nice_match_length = 130;
2715
- break;
2716
- #if SUPPORT_NEAR_OPTIMAL_PARSING
2717
- case 8:
2718
- c->impl = deflate_compress_near_optimal;
2719
- c->max_search_depth = 12;
2720
- c->nice_match_length = 20;
2721
- c->p.n.num_optim_passes = 1;
2722
- break;
2723
- case 9:
2724
- c->impl = deflate_compress_near_optimal;
2725
- c->max_search_depth = 16;
2726
- c->nice_match_length = 26;
2727
- c->p.n.num_optim_passes = 2;
2728
- break;
2729
- case 10:
2730
- c->impl = deflate_compress_near_optimal;
2731
- c->max_search_depth = 30;
2732
- c->nice_match_length = 50;
2733
- c->p.n.num_optim_passes = 2;
2734
- break;
2735
- case 11:
2736
- c->impl = deflate_compress_near_optimal;
2737
- c->max_search_depth = 60;
2738
- c->nice_match_length = 80;
2739
- c->p.n.num_optim_passes = 3;
2740
- break;
2741
- case 12:
2742
- c->impl = deflate_compress_near_optimal;
2743
- c->max_search_depth = 100;
2744
- c->nice_match_length = 133;
2745
- c->p.n.num_optim_passes = 4;
2746
- break;
2747
- #else
2748
- case 8:
2749
- c->impl = deflate_compress_lazy;
2750
- c->max_search_depth = 150;
2751
- c->nice_match_length = 200;
2752
- break;
2753
- case 9:
2754
- c->impl = deflate_compress_lazy;
2755
- c->max_search_depth = 200;
2756
- c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
2757
- break;
2758
- #endif
2759
- default:
2760
- aligned_free(c);
2761
- return NULL;
2762
- }
2763
-
2764
- c->compression_level = compression_level;
2765
-
2766
- deflate_init_offset_slot_fast(c);
2767
- deflate_init_static_codes(c);
2768
-
2769
- return c;
2770
- }
2771
-
2772
- LIBDEFLATEAPI size_t
2773
- libdeflate_deflate_compress(struct libdeflate_compressor *c,
2774
- const void *in, size_t in_nbytes,
2775
- void *out, size_t out_nbytes_avail)
2776
- {
2777
- if (unlikely(out_nbytes_avail < MIN_OUTPUT_SIZE))
2778
- return 0;
2779
-
2780
- /* For extremely small inputs just use a single uncompressed block. */
2781
- if (unlikely(in_nbytes < 16)) {
2782
- struct deflate_output_bitstream os;
2783
- deflate_init_output(&os, out, out_nbytes_avail);
2784
- if (in_nbytes == 0)
2785
- in = &os; /* Avoid passing NULL to memcpy() */
2786
- deflate_write_uncompressed_block(&os, in, in_nbytes, true);
2787
- return deflate_flush_output(&os);
2788
- }
2789
-
2790
- return (*c->impl)(c, in, in_nbytes, out, out_nbytes_avail);
2791
- }
2792
-
2793
- LIBDEFLATEAPI void
2794
- libdeflate_free_compressor(struct libdeflate_compressor *c)
2795
- {
2796
- aligned_free(c);
2797
- }
2798
-
2799
- unsigned int
2800
- deflate_get_compression_level(struct libdeflate_compressor *c)
2801
- {
2802
- return c->compression_level;
2803
- }
2804
-
2805
- LIBDEFLATEAPI size_t
2806
- libdeflate_deflate_compress_bound(struct libdeflate_compressor *c,
2807
- size_t in_nbytes)
2808
- {
2809
- /*
2810
- * The worst case is all uncompressed blocks where one block has length
2811
- * <= MIN_BLOCK_LENGTH and the others have length MIN_BLOCK_LENGTH.
2812
- * Each uncompressed block has 5 bytes of overhead: 1 for BFINAL, BTYPE,
2813
- * and alignment to a byte boundary; 2 for LEN; and 2 for NLEN.
2814
- */
2815
- size_t max_num_blocks = MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1);
2816
- return (5 * max_num_blocks) + in_nbytes + 1 + MIN_OUTPUT_SIZE;
2817
- }
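/*
 * [Editorial sketch] Typical use of the compressor API removed above, assuming
 * the library's public header "libdeflate.h". libdeflate_deflate_compress()
 * returns 0 when the output buffer is too small, so the buffer is sized with
 * libdeflate_deflate_compress_bound().
 */
#include <stdio.h>
#include <stdlib.h>
#include "libdeflate.h"

int
main(void)
{
	static const char msg[] = "example input for raw DEFLATE compression";
	struct libdeflate_compressor *c = libdeflate_alloc_compressor(6);
	size_t bound, out_len;
	void *out;

	if (c == NULL)
		return 1;

	bound = libdeflate_deflate_compress_bound(c, sizeof(msg));
	out = malloc(bound);
	if (out == NULL) {
		libdeflate_free_compressor(c);
		return 1;
	}

	out_len = libdeflate_deflate_compress(c, msg, sizeof(msg), out, bound);
	if (out_len == 0)
		fprintf(stderr, "output buffer too small\n");
	else
		printf("compressed %zu bytes to %zu bytes\n", sizeof(msg), out_len);

	free(out);
	libdeflate_free_compressor(c);
	return 0;
}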