libdeflate 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. checksums.yaml +7 -0
  2. data/.gitignore +17 -0
  3. data/.gitmodules +3 -0
  4. data/.rspec +2 -0
  5. data/.rubocop.yml +1 -0
  6. data/.rubocop_todo.yml +9 -0
  7. data/.travis.yml +5 -0
  8. data/Gemfile +4 -0
  9. data/LICENSE.txt +21 -0
  10. data/README.md +52 -0
  11. data/Rakefile +15 -0
  12. data/bin/console +14 -0
  13. data/bin/setup +8 -0
  14. data/ext/libdeflate/extconf.rb +14 -0
  15. data/ext/libdeflate/libdeflate/.gitignore +19 -0
  16. data/ext/libdeflate/libdeflate/COPYING +21 -0
  17. data/ext/libdeflate/libdeflate/Makefile +231 -0
  18. data/ext/libdeflate/libdeflate/Makefile.msc +64 -0
  19. data/ext/libdeflate/libdeflate/NEWS +57 -0
  20. data/ext/libdeflate/libdeflate/README.md +170 -0
  21. data/ext/libdeflate/libdeflate/common/common_defs.h +351 -0
  22. data/ext/libdeflate/libdeflate/common/compiler_gcc.h +134 -0
  23. data/ext/libdeflate/libdeflate/common/compiler_msc.h +95 -0
  24. data/ext/libdeflate/libdeflate/lib/adler32.c +213 -0
  25. data/ext/libdeflate/libdeflate/lib/adler32_impl.h +281 -0
  26. data/ext/libdeflate/libdeflate/lib/aligned_malloc.c +57 -0
  27. data/ext/libdeflate/libdeflate/lib/aligned_malloc.h +13 -0
  28. data/ext/libdeflate/libdeflate/lib/bt_matchfinder.h +357 -0
  29. data/ext/libdeflate/libdeflate/lib/crc32.c +368 -0
  30. data/ext/libdeflate/libdeflate/lib/crc32_impl.h +286 -0
  31. data/ext/libdeflate/libdeflate/lib/crc32_table.h +526 -0
  32. data/ext/libdeflate/libdeflate/lib/decompress_impl.h +404 -0
  33. data/ext/libdeflate/libdeflate/lib/deflate_compress.c +2817 -0
  34. data/ext/libdeflate/libdeflate/lib/deflate_compress.h +14 -0
  35. data/ext/libdeflate/libdeflate/lib/deflate_constants.h +66 -0
  36. data/ext/libdeflate/libdeflate/lib/deflate_decompress.c +889 -0
  37. data/ext/libdeflate/libdeflate/lib/gzip_compress.c +95 -0
  38. data/ext/libdeflate/libdeflate/lib/gzip_constants.h +45 -0
  39. data/ext/libdeflate/libdeflate/lib/gzip_decompress.c +130 -0
  40. data/ext/libdeflate/libdeflate/lib/hc_matchfinder.h +405 -0
  41. data/ext/libdeflate/libdeflate/lib/lib_common.h +35 -0
  42. data/ext/libdeflate/libdeflate/lib/matchfinder_avx2.h +53 -0
  43. data/ext/libdeflate/libdeflate/lib/matchfinder_common.h +205 -0
  44. data/ext/libdeflate/libdeflate/lib/matchfinder_neon.h +61 -0
  45. data/ext/libdeflate/libdeflate/lib/matchfinder_sse2.h +53 -0
  46. data/ext/libdeflate/libdeflate/lib/unaligned.h +202 -0
  47. data/ext/libdeflate/libdeflate/lib/x86_cpu_features.c +169 -0
  48. data/ext/libdeflate/libdeflate/lib/x86_cpu_features.h +48 -0
  49. data/ext/libdeflate/libdeflate/lib/zlib_compress.c +87 -0
  50. data/ext/libdeflate/libdeflate/lib/zlib_constants.h +21 -0
  51. data/ext/libdeflate/libdeflate/lib/zlib_decompress.c +91 -0
  52. data/ext/libdeflate/libdeflate/libdeflate.h +274 -0
  53. data/ext/libdeflate/libdeflate/programs/benchmark.c +558 -0
  54. data/ext/libdeflate/libdeflate/programs/checksum.c +197 -0
  55. data/ext/libdeflate/libdeflate/programs/detect.sh +62 -0
  56. data/ext/libdeflate/libdeflate/programs/gzip.c +603 -0
  57. data/ext/libdeflate/libdeflate/programs/prog_util.c +530 -0
  58. data/ext/libdeflate/libdeflate/programs/prog_util.h +162 -0
  59. data/ext/libdeflate/libdeflate/programs/test_checksums.c +135 -0
  60. data/ext/libdeflate/libdeflate/programs/tgetopt.c +118 -0
  61. data/ext/libdeflate/libdeflate/tools/afl-fuzz/Makefile +12 -0
  62. data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_compress/fuzz.c +40 -0
  63. data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_compress/inputs/0 +0 -0
  64. data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_decompress/fuzz.c +28 -0
  65. data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_decompress/inputs/0 +3 -0
  66. data/ext/libdeflate/libdeflate/tools/afl-fuzz/gzip_decompress/fuzz.c +28 -0
  67. data/ext/libdeflate/libdeflate/tools/afl-fuzz/gzip_decompress/inputs/0 +0 -0
  68. data/ext/libdeflate/libdeflate/tools/afl-fuzz/prepare_for_fuzz.sh +14 -0
  69. data/ext/libdeflate/libdeflate/tools/afl-fuzz/zlib_decompress/fuzz.c +28 -0
  70. data/ext/libdeflate/libdeflate/tools/afl-fuzz/zlib_decompress/inputs/0 +3 -0
  71. data/ext/libdeflate/libdeflate/tools/android_build.sh +104 -0
  72. data/ext/libdeflate/libdeflate/tools/checksum_benchmarks.sh +76 -0
  73. data/ext/libdeflate/libdeflate/tools/exec_tests.sh +30 -0
  74. data/ext/libdeflate/libdeflate/tools/gen_crc32_multipliers.c +108 -0
  75. data/ext/libdeflate/libdeflate/tools/gen_crc32_table.c +100 -0
  76. data/ext/libdeflate/libdeflate/tools/gzip_tests.sh +412 -0
  77. data/ext/libdeflate/libdeflate/tools/make-windows-releases +21 -0
  78. data/ext/libdeflate/libdeflate/tools/mips_build.sh +9 -0
  79. data/ext/libdeflate/libdeflate/tools/msc_test.bat +3 -0
  80. data/ext/libdeflate/libdeflate/tools/pgo_build.sh +23 -0
  81. data/ext/libdeflate/libdeflate/tools/produce_gzip_benchmark_table.sh +37 -0
  82. data/ext/libdeflate/libdeflate/tools/run_tests.sh +305 -0
  83. data/ext/libdeflate/libdeflate/tools/windows_build.sh +10 -0
  84. data/ext/libdeflate/libdeflate_ext.c +389 -0
  85. data/ext/libdeflate/libdeflate_ext.h +8 -0
  86. data/lib/libdeflate.rb +2 -0
  87. data/lib/libdeflate/version.rb +3 -0
  88. data/libdeflate.gemspec +33 -0
  89. metadata +230 -0
@@ -0,0 +1,404 @@
+ /*
+ * decompress_impl.h
+ *
+ * Originally public domain; changes after 2016-09-07 are copyrighted.
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+ /*
+ * This is the actual DEFLATE decompression routine, lifted out of
+ * deflate_decompress.c so that it can be compiled multiple times with different
+ * target instruction sets.
+ */
+
+ static enum libdeflate_result ATTRIBUTES
+ FUNCNAME(struct libdeflate_decompressor * restrict d,
+ const void * restrict in, size_t in_nbytes,
+ void * restrict out, size_t out_nbytes_avail,
+ size_t *actual_out_nbytes_ret)
+ {
+ u8 *out_next = out;
+ u8 * const out_end = out_next + out_nbytes_avail;
+ const u8 *in_next = in;
+ const u8 * const in_end = in_next + in_nbytes;
+ bitbuf_t bitbuf = 0;
+ unsigned bitsleft = 0;
+ size_t overrun_count = 0;
+ unsigned i;
+ unsigned is_final_block;
+ unsigned block_type;
+ u16 len;
+ u16 nlen;
+ unsigned num_litlen_syms;
+ unsigned num_offset_syms;
+ u16 tmp16;
+ u32 tmp32;
+
+ next_block:
+ /* Starting to read the next block. */
+ ;
+
+ STATIC_ASSERT(CAN_ENSURE(1 + 2 + 5 + 5 + 4));
+ ENSURE_BITS(1 + 2 + 5 + 5 + 4);
+
+ /* BFINAL: 1 bit */
+ is_final_block = POP_BITS(1);
+
+ /* BTYPE: 2 bits */
+ block_type = POP_BITS(2);
+
+ if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) {
+
+ /* Dynamic Huffman block. */
+
+ /* The order in which precode lengths are stored. */
+ static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
+ 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
+ };
+
+ unsigned num_explicit_precode_lens;
+
+ /* Read the codeword length counts. */
+
+ STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == ((1 << 5) - 1) + 257);
+ num_litlen_syms = POP_BITS(5) + 257;
+
+ STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == ((1 << 5) - 1) + 1);
+ num_offset_syms = POP_BITS(5) + 1;
+
+ STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == ((1 << 4) - 1) + 4);
+ num_explicit_precode_lens = POP_BITS(4) + 4;
+
+ /* Read the precode codeword lengths. */
+ STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1);
+ if (CAN_ENSURE(DEFLATE_NUM_PRECODE_SYMS * 3)) {
+
+ ENSURE_BITS(DEFLATE_NUM_PRECODE_SYMS * 3);
+
+ for (i = 0; i < num_explicit_precode_lens; i++)
+ d->u.precode_lens[deflate_precode_lens_permutation[i]] = POP_BITS(3);
+ } else {
+ for (i = 0; i < num_explicit_precode_lens; i++) {
+ ENSURE_BITS(3);
+ d->u.precode_lens[deflate_precode_lens_permutation[i]] = POP_BITS(3);
+ }
+ }
+
+ for (; i < DEFLATE_NUM_PRECODE_SYMS; i++)
+ d->u.precode_lens[deflate_precode_lens_permutation[i]] = 0;
+
+ /* Build the decode table for the precode. */
+ SAFETY_CHECK(build_precode_decode_table(d));
+
+ /* Expand the literal/length and offset codeword lengths. */
+ for (i = 0; i < num_litlen_syms + num_offset_syms; ) {
+ u32 entry;
+ unsigned presym;
+ u8 rep_val;
+ unsigned rep_count;
+
+ ENSURE_BITS(DEFLATE_MAX_PRE_CODEWORD_LEN + 7);
+
+ /* (The code below assumes that the precode decode table
+ * does not have any subtables.) */
+ STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN);
+
+ /* Read the next precode symbol. */
+ entry = d->u.l.precode_decode_table[BITS(DEFLATE_MAX_PRE_CODEWORD_LEN)];
+ REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
+ presym = entry >> HUFFDEC_RESULT_SHIFT;
+
+ if (presym < 16) {
+ /* Explicit codeword length */
+ d->u.l.lens[i++] = presym;
+ continue;
+ }
+
+ /* Run-length encoded codeword lengths */
+
+ /* Note: we don't need to verify that the repeat count
+ * doesn't overflow the number of elements, since we
+ * have enough extra entries to allow for the worst-case
+ * overflow (138 zeroes when only 1 length was
+ * remaining).
+ *
+ * In the case of the small repeat counts (presyms 16
+ * and 17), it is fastest to always write the maximum
+ * number of entries. That gets rid of branches that
+ * would otherwise be required.
+ *
+ * It is not just because of the numerical order that
+ * our checks go in the order 'presym < 16', 'presym ==
+ * 16', and 'presym == 17'. For typical data this is
+ * ordered from most frequent to least frequent case.
+ */
+ STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1);
+
+ if (presym == 16) {
+ /* Repeat the previous length 3 - 6 times */
+ SAFETY_CHECK(i != 0);
+ rep_val = d->u.l.lens[i - 1];
+ STATIC_ASSERT(3 + ((1 << 2) - 1) == 6);
+ rep_count = 3 + POP_BITS(2);
+ d->u.l.lens[i + 0] = rep_val;
+ d->u.l.lens[i + 1] = rep_val;
+ d->u.l.lens[i + 2] = rep_val;
+ d->u.l.lens[i + 3] = rep_val;
+ d->u.l.lens[i + 4] = rep_val;
+ d->u.l.lens[i + 5] = rep_val;
+ i += rep_count;
+ } else if (presym == 17) {
+ /* Repeat zero 3 - 10 times */
+ STATIC_ASSERT(3 + ((1 << 3) - 1) == 10);
+ rep_count = 3 + POP_BITS(3);
+ d->u.l.lens[i + 0] = 0;
+ d->u.l.lens[i + 1] = 0;
+ d->u.l.lens[i + 2] = 0;
+ d->u.l.lens[i + 3] = 0;
+ d->u.l.lens[i + 4] = 0;
+ d->u.l.lens[i + 5] = 0;
+ d->u.l.lens[i + 6] = 0;
+ d->u.l.lens[i + 7] = 0;
+ d->u.l.lens[i + 8] = 0;
+ d->u.l.lens[i + 9] = 0;
+ i += rep_count;
+ } else {
+ /* Repeat zero 11 - 138 times */
+ STATIC_ASSERT(11 + ((1 << 7) - 1) == 138);
+ rep_count = 11 + POP_BITS(7);
+ memset(&d->u.l.lens[i], 0,
+ rep_count * sizeof(d->u.l.lens[i]));
+ i += rep_count;
+ }
+ }
+ } else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) {
+
+ /* Uncompressed block: copy 'len' bytes literally from the input
+ * buffer to the output buffer. */
+
+ ALIGN_INPUT();
+
+ SAFETY_CHECK(in_end - in_next >= 4);
+
+ len = READ_U16();
+ nlen = READ_U16();
+
+ SAFETY_CHECK(len == (u16)~nlen);
+ if (unlikely(len > out_end - out_next))
+ return LIBDEFLATE_INSUFFICIENT_SPACE;
+ SAFETY_CHECK(len <= in_end - in_next);
+
+ memcpy(out_next, in_next, len);
+ in_next += len;
+ out_next += len;
+
+ goto block_done;
+
+ } else {
+ SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN);
+
+ /* Static Huffman block: set the static Huffman codeword
+ * lengths. Then the remainder is the same as decompressing a
+ * dynamic Huffman block. */
+
+ STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 288);
+ STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 32);
+
+ for (i = 0; i < 144; i++)
+ d->u.l.lens[i] = 8;
+ for (; i < 256; i++)
+ d->u.l.lens[i] = 9;
+ for (; i < 280; i++)
+ d->u.l.lens[i] = 7;
+ for (; i < 288; i++)
+ d->u.l.lens[i] = 8;
+
+ for (; i < 288 + 32; i++)
+ d->u.l.lens[i] = 5;
+
+ num_litlen_syms = 288;
+ num_offset_syms = 32;
+
+ }
+
+ /* Decompressing a Huffman block (either dynamic or static) */
+
+ SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms));
+ SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms));
+
+ /* The main DEFLATE decode loop */
+ for (;;) {
+ u32 entry;
+ u32 length;
+ u32 offset;
+
+ /* Decode a litlen symbol. */
+ ENSURE_BITS(DEFLATE_MAX_LITLEN_CODEWORD_LEN);
+ entry = d->u.litlen_decode_table[BITS(LITLEN_TABLEBITS)];
+ if (entry & HUFFDEC_SUBTABLE_POINTER) {
+ /* Litlen subtable required (uncommon case) */
+ REMOVE_BITS(LITLEN_TABLEBITS);
+ entry = d->u.litlen_decode_table[
+ ((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) +
+ BITS(entry & HUFFDEC_LENGTH_MASK)];
+ }
+ REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
+ if (entry & HUFFDEC_LITERAL) {
+ /* Literal */
+ if (unlikely(out_next == out_end))
+ return LIBDEFLATE_INSUFFICIENT_SPACE;
+ *out_next++ = (u8)(entry >> HUFFDEC_RESULT_SHIFT);
+ continue;
+ }
+
+ /* Match or end-of-block */
+
+ entry >>= HUFFDEC_RESULT_SHIFT;
+ ENSURE_BITS(MAX_ENSURE);
+
+ /* Pop the extra length bits and add them to the length base to
+ * produce the full length. */
+ length = (entry >> HUFFDEC_LENGTH_BASE_SHIFT) +
+ POP_BITS(entry & HUFFDEC_EXTRA_LENGTH_BITS_MASK);
+
+ /* The match destination must not end after the end of the
+ * output buffer. For efficiency, combine this check with the
+ * end-of-block check. We're using 0 for the special
+ * end-of-block length, so subtracting 1 turns it into
+ * SIZE_MAX. */
+ STATIC_ASSERT(HUFFDEC_END_OF_BLOCK_LENGTH == 0);
+ if (unlikely((size_t)length - 1 >= out_end - out_next)) {
+ if (unlikely(length != HUFFDEC_END_OF_BLOCK_LENGTH))
+ return LIBDEFLATE_INSUFFICIENT_SPACE;
+ goto block_done;
+ }
+
+ /* Decode the match offset. */
+
+ entry = d->offset_decode_table[BITS(OFFSET_TABLEBITS)];
+ if (entry & HUFFDEC_SUBTABLE_POINTER) {
+ /* Offset subtable required (uncommon case) */
+ REMOVE_BITS(OFFSET_TABLEBITS);
+ entry = d->offset_decode_table[
+ ((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) +
+ BITS(entry & HUFFDEC_LENGTH_MASK)];
+ }
+ REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
+ entry >>= HUFFDEC_RESULT_SHIFT;
+
+ STATIC_ASSERT(CAN_ENSURE(DEFLATE_MAX_EXTRA_LENGTH_BITS +
+ DEFLATE_MAX_OFFSET_CODEWORD_LEN) &&
+ CAN_ENSURE(DEFLATE_MAX_EXTRA_OFFSET_BITS));
+ if (!CAN_ENSURE(DEFLATE_MAX_EXTRA_LENGTH_BITS +
+ DEFLATE_MAX_OFFSET_CODEWORD_LEN +
+ DEFLATE_MAX_EXTRA_OFFSET_BITS))
+ ENSURE_BITS(DEFLATE_MAX_EXTRA_OFFSET_BITS);
+
+ /* Pop the extra offset bits and add them to the offset base to
+ * produce the full offset. */
+ offset = (entry & HUFFDEC_OFFSET_BASE_MASK) +
+ POP_BITS(entry >> HUFFDEC_EXTRA_OFFSET_BITS_SHIFT);
+
+ /* The match source must not begin before the beginning of the
+ * output buffer. */
+ SAFETY_CHECK(offset <= out_next - (const u8 *)out);
+
+ /* Copy the match: 'length' bytes at 'out_next - offset' to
+ * 'out_next'. */
+
+ if (UNALIGNED_ACCESS_IS_FAST &&
+ length <= (3 * WORDBYTES) &&
+ offset >= WORDBYTES &&
+ length + (3 * WORDBYTES) <= out_end - out_next)
+ {
+ /* Fast case: short length, no overlaps if we copy one
+ * word at a time, and we aren't getting too close to
+ * the end of the output array. */
+ copy_word_unaligned(out_next - offset + (0 * WORDBYTES),
+ out_next + (0 * WORDBYTES));
+ copy_word_unaligned(out_next - offset + (1 * WORDBYTES),
+ out_next + (1 * WORDBYTES));
+ copy_word_unaligned(out_next - offset + (2 * WORDBYTES),
+ out_next + (2 * WORDBYTES));
+ } else {
+ const u8 *src = out_next - offset;
+ u8 *dst = out_next;
+ u8 *end = out_next + length;
+
+ if (UNALIGNED_ACCESS_IS_FAST &&
+ likely(out_end - end >= WORDBYTES - 1)) {
+ if (offset >= WORDBYTES) {
+ copy_word_unaligned(src, dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ if (dst < end) {
+ do {
+ copy_word_unaligned(src, dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ } while (dst < end);
+ }
+ } else if (offset == 1) {
+ machine_word_t v = repeat_byte(*(dst - 1));
+ do {
+ store_word_unaligned(v, dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ } while (dst < end);
+ } else {
+ *dst++ = *src++;
+ *dst++ = *src++;
+ do {
+ *dst++ = *src++;
+ } while (dst < end);
+ }
+ } else {
+ *dst++ = *src++;
+ *dst++ = *src++;
+ do {
+ *dst++ = *src++;
+ } while (dst < end);
+ }
+ }
+
+ out_next += length;
+ }
+
+ block_done:
+ /* Finished decoding a block. */
+
+ if (!is_final_block)
+ goto next_block;
+
+ /* That was the last block. */
+
+ if (actual_out_nbytes_ret) {
+ *actual_out_nbytes_ret = out_next - (u8 *)out;
+ } else {
+ if (out_next != out_end)
+ return LIBDEFLATE_SHORT_OUTPUT;
+ }
+ return LIBDEFLATE_SUCCESS;
+ }
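
In application code this routine is reached through libdeflate's public API (declared in libdeflate.h, also included in this gem) rather than called directly. A minimal sketch of one-shot raw-DEFLATE decompression through that API; buffer sizing and error handling here are illustrative:

    #include "libdeflate.h"

    int decompress_example(const void *in, size_t in_nbytes,
                           void *out, size_t out_capacity)
    {
        struct libdeflate_decompressor *d = libdeflate_alloc_decompressor();
        size_t actual_nbytes;
        enum libdeflate_result res;

        if (d == NULL)
            return -1; /* allocation failed */
        res = libdeflate_deflate_decompress(d, in, in_nbytes,
                                            out, out_capacity, &actual_nbytes);
        libdeflate_free_decompressor(d);
        /* On success, actual_nbytes holds the decompressed size. */
        return (res == LIBDEFLATE_SUCCESS) ? 0 : -1;
    }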
@@ -0,0 +1,2817 @@
+ /*
+ * deflate_compress.c - a compressor for DEFLATE
+ *
+ * Originally public domain; changes after 2016-09-07 are copyrighted.
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+ #include <stdlib.h>
+ #include <string.h>
+
+ #include "aligned_malloc.h"
+ #include "deflate_compress.h"
+ #include "deflate_constants.h"
+ #include "unaligned.h"
+
+ #include "libdeflate.h"
+
+ /*
+ * By default, the near-optimal parsing algorithm is enabled at compression
+ * level 8 and above. The near-optimal parsing algorithm produces a compression
+ * ratio significantly better than the greedy and lazy algorithms implemented
+ * here, and also the algorithm used by zlib at level 9. However, it is slow.
+ */
+ #define SUPPORT_NEAR_OPTIMAL_PARSING 1
+
+ /*
+ * Define to 1 to maintain the full map from match offsets to offset slots.
+ * This slightly speeds up translations of match offsets to offset slots, but it
+ * uses 32769 bytes of memory rather than the 512 bytes used by the condensed
+ * map. The speedup provided by the larger map is most helpful when the
+ * near-optimal parsing algorithm is being used.
+ */
+ #define USE_FULL_OFFSET_SLOT_FAST SUPPORT_NEAR_OPTIMAL_PARSING
+
+ /*
+ * DEFLATE uses a 32768 byte sliding window; set the matchfinder parameters
+ * appropriately.
+ */
+ #define MATCHFINDER_WINDOW_ORDER 15
+
+ #include "hc_matchfinder.h"
+ #if SUPPORT_NEAR_OPTIMAL_PARSING
+ # include "bt_matchfinder.h"
+ #endif
+
+ /*
+ * The compressor always chooses a block of at least MIN_BLOCK_LENGTH bytes,
+ * except if the last block has to be shorter.
+ */
+ #define MIN_BLOCK_LENGTH 10000
+
+ /*
+ * The compressor attempts to end blocks after SOFT_MAX_BLOCK_LENGTH bytes, but
+ * the final length might be slightly longer due to matches extending beyond
+ * this limit.
+ */
+ #define SOFT_MAX_BLOCK_LENGTH 300000
+
+ /*
+ * The number of observed matches or literals that represents sufficient data to
+ * decide whether the current block should be terminated or not.
+ */
+ #define NUM_OBSERVATIONS_PER_BLOCK_CHECK 512
+
+
+ #if SUPPORT_NEAR_OPTIMAL_PARSING
+ /* Constants specific to the near-optimal parsing algorithm */
+
+ /*
+ * The maximum number of matches the matchfinder can find at a single position.
+ * Since the matchfinder never finds more than one match for the same length,
+ * presuming one of each possible length is sufficient for an upper bound.
+ * (This says nothing about whether it is worthwhile to consider so many
+ * matches; this is just defining the worst case.)
+ */
+ # define MAX_MATCHES_PER_POS (DEFLATE_MAX_MATCH_LEN - DEFLATE_MIN_MATCH_LEN + 1)
+
+ /*
+ * The number of lz_match structures in the match cache, excluding the extra
+ * "overflow" entries. This value should be high enough so that nearly all the
+ * time, all matches found in a given block can fit in the match cache.
+ * However, fallback behavior (immediately terminating the block) on cache
+ * overflow is still required.
+ */
+ # define CACHE_LENGTH (SOFT_MAX_BLOCK_LENGTH * 5)
+
+ #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
+
+ /*
+ * These are the compressor-side limits on the codeword lengths for each Huffman
+ * code. To make outputting bits slightly faster, some of these limits are
+ * lower than the limits defined by the DEFLATE format. This does not
+ * significantly affect the compression ratio, at least for the block lengths we
+ * use.
+ */
+ #define MAX_LITLEN_CODEWORD_LEN 14
+ #define MAX_OFFSET_CODEWORD_LEN DEFLATE_MAX_OFFSET_CODEWORD_LEN
+ #define MAX_PRE_CODEWORD_LEN DEFLATE_MAX_PRE_CODEWORD_LEN
+
+ /* Table: length slot => length slot base value */
+ static const unsigned deflate_length_slot_base[] = {
+ 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 ,
+ 11 , 13 , 15 , 17 , 19 , 23 , 27 , 31 ,
+ 35 , 43 , 51 , 59 , 67 , 83 , 99 , 115 ,
+ 131 , 163 , 195 , 227 , 258 ,
+ };
+
+ /* Table: length slot => number of extra length bits */
+ static const u8 deflate_extra_length_bits[] = {
+ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+ 1 , 1 , 1 , 1 , 2 , 2 , 2 , 2 ,
+ 3 , 3 , 3 , 3 , 4 , 4 , 4 , 4 ,
+ 5 , 5 , 5 , 5 , 0 ,
+ };
+
+ /* Table: offset slot => offset slot base value */
+ static const unsigned deflate_offset_slot_base[] = {
+ 1 , 2 , 3 , 4 , 5 , 7 , 9 , 13 ,
+ 17 , 25 , 33 , 49 , 65 , 97 , 129 , 193 ,
+ 257 , 385 , 513 , 769 , 1025 , 1537 , 2049 , 3073 ,
+ 4097 , 6145 , 8193 , 12289 , 16385 , 24577 ,
+ };
+
+ /* Table: offset slot => number of extra offset bits */
+ static const u8 deflate_extra_offset_bits[] = {
+ 0 , 0 , 0 , 0 , 1 , 1 , 2 , 2 ,
+ 3 , 3 , 4 , 4 , 5 , 5 , 6 , 6 ,
+ 7 , 7 , 8 , 8 , 9 , 9 , 10 , 10 ,
+ 11 , 11 , 12 , 12 , 13 , 13 ,
+ };
+
+ /* Table: length => length slot */
+ static const u8 deflate_length_slot[DEFLATE_MAX_MATCH_LEN + 1] = {
+ 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12,
+ 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16,
+ 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18,
+ 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
+ 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 28,
+ };
+
+ /* The order in which precode codeword lengths are stored */
+ static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
+ 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
+ };
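
A worked example of how these tables fit together (illustrative, not code from this file): encoding a match of length 13 proceeds as

    unsigned length = 13;
    unsigned slot = deflate_length_slot[length];      /* == 9 */
    unsigned base = deflate_length_slot_base[slot];   /* == 13 */
    unsigned extra = length - base;                   /* == 0 */
    unsigned nbits = deflate_extra_length_bits[slot]; /* == 1 */
    /* Emit litlen symbol 257 + slot, then write 'extra' in 'nbits' bits. */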
+
+ /* Codewords for the DEFLATE Huffman codes. */
+ struct deflate_codewords {
+ u32 litlen[DEFLATE_NUM_LITLEN_SYMS];
+ u32 offset[DEFLATE_NUM_OFFSET_SYMS];
+ };
+
+ /* Codeword lengths (in bits) for the DEFLATE Huffman codes.
+ * A zero length means the corresponding symbol had zero frequency. */
+ struct deflate_lens {
+ u8 litlen[DEFLATE_NUM_LITLEN_SYMS];
+ u8 offset[DEFLATE_NUM_OFFSET_SYMS];
+ };
+
+ /* Codewords and lengths for the DEFLATE Huffman codes. */
+ struct deflate_codes {
+ struct deflate_codewords codewords;
+ struct deflate_lens lens;
+ };
+
+ /* Symbol frequency counters for the DEFLATE Huffman codes. */
+ struct deflate_freqs {
+ u32 litlen[DEFLATE_NUM_LITLEN_SYMS];
+ u32 offset[DEFLATE_NUM_OFFSET_SYMS];
+ };
+
+ #if SUPPORT_NEAR_OPTIMAL_PARSING
+
+ /* Costs for the near-optimal parsing algorithm. */
+ struct deflate_costs {
+
+ /* The cost to output each possible literal. */
+ u32 literal[DEFLATE_NUM_LITERALS];
+
+ /* The cost to output each possible match length. */
+ u32 length[DEFLATE_MAX_MATCH_LEN + 1];
+
+ /* The cost to output a match offset of each possible offset slot. */
+ u32 offset_slot[DEFLATE_NUM_OFFSET_SYMS];
+ };
+
+ /*
+ * COST_SHIFT is a scaling factor that makes it possible to consider fractional
+ * bit costs. A token requiring 'n' bits to represent has cost n << COST_SHIFT.
+ *
+ * Note: this is only useful as a statistical trick for when the true costs are
+ * unknown. In reality, each token in DEFLATE requires a whole number of bits
+ * to output.
+ */
+ #define COST_SHIFT 3
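
For example, with COST_SHIFT == 3, a token that truly costs 7 bits is stored as 7 << 3 == 56 cost units, and an estimated fractional cost of 6.5 bits can be represented exactly as 52.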
+
+ /*
+ * The NOSTAT_BITS value for a given alphabet is the number of bits assumed to
+ * be needed to output a symbol that was unused in the previous optimization
+ * pass. Assigning a default cost allows the symbol to be used in the next
+ * optimization pass. However, the cost should be relatively high because the
+ * symbol probably won't be used very many times (if at all).
+ */
+ #define LITERAL_NOSTAT_BITS 13
+ #define LENGTH_NOSTAT_BITS 13
+ #define OFFSET_NOSTAT_BITS 10
+
+ #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
+
+ /*
+ * Represents a run of literals followed by a match or end-of-block. This
+ * struct is needed to temporarily store items chosen by the parser, since items
+ * cannot be written until all items for the block have been chosen and the
+ * block's Huffman codes have been computed.
+ */
+ struct deflate_sequence {
+
+ /* Bits 0..22: the number of literals in this run. This may be 0 and
+ * can be at most about SOFT_MAX_BLOCK_LENGTH. The literals are not
+ * stored explicitly in this structure; instead, they are read directly
+ * from the uncompressed data.
+ *
+ * Bits 23..31: the length of the match which follows the literals, or 0
+ * if this literal run was the last in the block, so there is no match
+ * which follows it. */
+ u32 litrunlen_and_length;
+
+ /* If 'length' doesn't indicate end-of-block, then this is the offset of
+ * the match which follows the literals. */
+ u16 offset;
+
+ /* If 'length' doesn't indicate end-of-block, then this is the offset
+ * symbol of the match which follows the literals. */
+ u8 offset_symbol;
+
+ /* If 'length' doesn't indicate end-of-block, then this is the length
+ * slot of the match which follows the literals. */
+ u8 length_slot;
+ };
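
A sketch of how a consumer might unpack litrunlen_and_length; the constant names below are hypothetical but follow the bit layout described in the struct comment:

    #define SEQ_LENGTH_SHIFT 23
    #define SEQ_LITRUNLEN_MASK (((u32)1 << SEQ_LENGTH_SHIFT) - 1)

    u32 litrunlen = seq.litrunlen_and_length & SEQ_LITRUNLEN_MASK;
    u32 length = seq.litrunlen_and_length >> SEQ_LENGTH_SHIFT; /* 0 => end of block */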
+
+ #if SUPPORT_NEAR_OPTIMAL_PARSING
+
+ /*
+ * This structure represents a byte position in the input data and a node in the
+ * graph of possible match/literal choices for the current block.
+ *
+ * Logically, each incoming edge to this node is labeled with a literal or a
+ * match that can be taken to reach this position from an earlier position; and
+ * each outgoing edge from this node is labeled with a literal or a match that
+ * can be taken to advance from this position to a later position.
+ *
+ * But these "edges" are actually stored elsewhere (in 'match_cache'). Here we
+ * associate with each node just two pieces of information:
+ *
+ * 'cost_to_end' is the minimum cost to reach the end of the block from
+ * this position.
+ *
+ * 'item' represents the literal or match that must be chosen from here to
+ * reach the end of the block with the minimum cost. Equivalently, this
+ * can be interpreted as the label of the outgoing edge on the minimum-cost
+ * path to the "end of block" node from this node.
+ */
+ struct deflate_optimum_node {
+
+ u32 cost_to_end;
+
+ /*
+ * Notes on the match/literal representation used here:
+ *
+ * The low bits of 'item' are the length: 1 if this is a literal,
+ * or the match length if this is a match.
+ *
+ * The high bits of 'item' are the actual literal byte if this is a
+ * literal, or the match offset if this is a match.
+ */
+ #define OPTIMUM_OFFSET_SHIFT 9
+ #define OPTIMUM_LEN_MASK (((u32)1 << OPTIMUM_OFFSET_SHIFT) - 1)
+ u32 item;
+
+ };
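
Decoding 'item' is then a mask and a shift (illustrative):

    u32 len = node.item & OPTIMUM_LEN_MASK;                /* 1 => literal */
    u32 offset_or_literal = node.item >> OPTIMUM_OFFSET_SHIFT;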
+
+ #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
+
+ /* Block split statistics. See "Block splitting algorithm" below. */
+ #define NUM_LITERAL_OBSERVATION_TYPES 8
+ #define NUM_MATCH_OBSERVATION_TYPES 2
+ #define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + NUM_MATCH_OBSERVATION_TYPES)
+ struct block_split_stats {
+ u32 new_observations[NUM_OBSERVATION_TYPES];
+ u32 observations[NUM_OBSERVATION_TYPES];
+ u32 num_new_observations;
+ u32 num_observations;
+ };
+
+ /* The main DEFLATE compressor structure */
+ struct libdeflate_compressor {
+
+ /* Pointer to the compress() implementation chosen at allocation time */
+ size_t (*impl)(struct libdeflate_compressor *,
+ const u8 *, size_t, u8 *, size_t);
+
+ /* Frequency counters for the current block */
+ struct deflate_freqs freqs;
+
+ /* Dynamic Huffman codes for the current block */
+ struct deflate_codes codes;
+
+ /* Static Huffman codes */
+ struct deflate_codes static_codes;
+
+ /* Block split statistics for the currently pending block */
+ struct block_split_stats split_stats;
+
+ /* A table for fast lookups of offset slot by match offset.
+ *
+ * If the full table is being used, it is a direct mapping from offset
+ * to offset slot.
+ *
+ * If the condensed table is being used, the first 256 entries map
+ * directly to the offset slots of offsets 1 through 256. The next 256
+ * entries map to the offset slots for the remaining offsets, stepping
+ * through the offsets with a stride of 128. This relies on the fact
+ * that each of the remaining offset slots contains at least 128 offsets
+ * and has an offset base that is a multiple of 128. */
+ #if USE_FULL_OFFSET_SLOT_FAST
+ u8 offset_slot_fast[DEFLATE_MAX_MATCH_OFFSET + 1];
+ #else
+ u8 offset_slot_fast[512];
+ #endif
+
+ /* The "nice" match length: if a match of this length is found, choose
+ * it immediately without further consideration. */
+ unsigned nice_match_length;
+
+ /* The maximum search depth: consider at most this many potential
+ * matches at each position. */
+ unsigned max_search_depth;
+
+ /* The compression level with which this compressor was created. */
+ unsigned compression_level;
+
+ /* Temporary space for Huffman code output */
+ u32 precode_freqs[DEFLATE_NUM_PRECODE_SYMS];
+ u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS];
+ u32 precode_codewords[DEFLATE_NUM_PRECODE_SYMS];
+ unsigned precode_items[DEFLATE_NUM_LITLEN_SYMS + DEFLATE_NUM_OFFSET_SYMS];
+ unsigned num_litlen_syms;
+ unsigned num_offset_syms;
+ unsigned num_explicit_lens;
+ unsigned num_precode_items;
+
+ union {
+ /* Data for greedy or lazy parsing */
+ struct {
+ /* Hash chain matchfinder */
+ struct hc_matchfinder hc_mf;
+
+ /* The matches and literals that the parser has chosen
+ * for the current block. The required length of this
+ * array is limited by the maximum number of matches
+ * that can ever be chosen for a single block, plus one
+ * for the special entry at the end. */
+ struct deflate_sequence sequences[
+ DIV_ROUND_UP(SOFT_MAX_BLOCK_LENGTH,
+ DEFLATE_MIN_MATCH_LEN) + 1];
+ } g; /* (g)reedy */
+
+ #if SUPPORT_NEAR_OPTIMAL_PARSING
+ /* Data for near-optimal parsing */
+ struct {
+
+ /* Binary tree matchfinder */
+ struct bt_matchfinder bt_mf;
+
+ /*
+ * Cached matches for the current block. This array
+ * contains the matches that were found at each position
+ * in the block. Specifically, for each position, there
+ * is a list of matches found at that position, if any,
+ * sorted by strictly increasing length. In addition,
+ * following the matches for each position, there is a
+ * special 'struct lz_match' whose 'length' member
+ * contains the number of matches found at that
+ * position, and whose 'offset' member contains the
+ * literal at that position.
+ *
+ * Note: in rare cases, there will be a very high number
+ * of matches in the block and this array will overflow.
+ * If this happens, we force the end of the current
+ * block. CACHE_LENGTH is the length at which we
+ * actually check for overflow. The extra slots beyond
+ * this are enough to absorb the worst case overflow,
+ * which occurs if starting at &match_cache[CACHE_LENGTH
+ * - 1], we write MAX_MATCHES_PER_POS matches and a
+ * match count header, then skip searching for matches
+ * at 'DEFLATE_MAX_MATCH_LEN - 1' positions and write
+ * the match count header for each.
+ */
+ struct lz_match match_cache[CACHE_LENGTH +
+ MAX_MATCHES_PER_POS +
+ DEFLATE_MAX_MATCH_LEN - 1];
+
+ /*
+ * Array of nodes, one per position, for running the
+ * minimum-cost path algorithm.
+ *
+ * This array must be large enough to accommodate the
+ * worst-case number of nodes, which occurs if we find a
+ * match of length DEFLATE_MAX_MATCH_LEN at position
+ * SOFT_MAX_BLOCK_LENGTH - 1, producing a block of
+ * length SOFT_MAX_BLOCK_LENGTH - 1 +
+ * DEFLATE_MAX_MATCH_LEN. Add one for the end-of-block
+ * node.
+ */
+ struct deflate_optimum_node optimum_nodes[SOFT_MAX_BLOCK_LENGTH - 1 +
+ DEFLATE_MAX_MATCH_LEN + 1];
+
+ /* The current cost model being used. */
+ struct deflate_costs costs;
+
+ unsigned num_optim_passes;
+ } n; /* (n)ear-optimal */
+ #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
+
+ } p; /* (p)arser */
+ };
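
A sketch of the lookup that offset_slot_fast supports, following the condensed-table description in the struct comment above (the helper name is illustrative and may not match the one used later in the full file):

    static unsigned
    offset_slot_lookup(const struct libdeflate_compressor *c, unsigned offset)
    {
    #if USE_FULL_OFFSET_SLOT_FAST
        return c->offset_slot_fast[offset];       /* direct map */
    #else
        if (offset <= 256)
            return c->offset_slot_fast[offset - 1];
        else
            return c->offset_slot_fast[256 + ((offset - 257) >> 7)];
    #endif
    }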
+
+ /*
+ * The type for the bitbuffer variable, which temporarily holds bits that are
+ * being packed into bytes and written to the output buffer. For best
+ * performance, this should have size equal to a machine word.
+ */
+ typedef machine_word_t bitbuf_t;
+ #define BITBUF_NBITS (8 * sizeof(bitbuf_t))
+
+ /* Can the specified number of bits always be added to 'bitbuf' after any
+ * pending bytes have been flushed? */
+ #define CAN_BUFFER(n) ((n) <= BITBUF_NBITS - 7)
+
+ /*
+ * Structure to keep track of the current state of sending bits to the
+ * compressed output buffer.
+ */
+ struct deflate_output_bitstream {
+
+ /* Bits that haven't yet been written to the output buffer. */
+ bitbuf_t bitbuf;
+
+ /* Number of bits currently held in @bitbuf. */
+ unsigned bitcount;
+
+ /* Pointer to the beginning of the output buffer. */
+ u8 *begin;
+
+ /* Pointer to the position in the output buffer at which the next byte
+ * should be written. */
+ u8 *next;
+
+ /* Pointer just past the end of the output buffer. */
+ u8 *end;
+ };
+
+ #define MIN_OUTPUT_SIZE (UNALIGNED_ACCESS_IS_FAST ? sizeof(bitbuf_t) : 1)
+
+ /* Initialize the output bitstream. 'size' is assumed to be at least
+ * MIN_OUTPUT_SIZE. */
+ static void
+ deflate_init_output(struct deflate_output_bitstream *os,
+ void *buffer, size_t size)
+ {
+ os->bitbuf = 0;
+ os->bitcount = 0;
+ os->begin = buffer;
+ os->next = os->begin;
+ os->end = os->begin + size - MIN_OUTPUT_SIZE;
+ }
+
+ /* Add some bits to the bitbuffer variable of the output bitstream. The caller
+ * must make sure there is enough room. */
+ static forceinline void
+ deflate_add_bits(struct deflate_output_bitstream *os,
+ const bitbuf_t bits, const unsigned num_bits)
+ {
+ os->bitbuf |= bits << os->bitcount;
+ os->bitcount += num_bits;
+ }
+
+ /* Flush bits from the bitbuffer variable to the output buffer. */
+ static forceinline void
+ deflate_flush_bits(struct deflate_output_bitstream *os)
+ {
+ if (UNALIGNED_ACCESS_IS_FAST) {
+ /* Flush a whole word (branchlessly). */
+ put_unaligned_leword(os->bitbuf, os->next);
+ os->bitbuf >>= os->bitcount & ~7;
+ os->next += MIN(os->end - os->next, os->bitcount >> 3);
+ os->bitcount &= 7;
+ } else {
+ /* Flush a byte at a time. */
+ while (os->bitcount >= 8) {
+ *os->next = os->bitbuf;
+ if (os->next != os->end)
+ os->next++;
+ os->bitcount -= 8;
+ os->bitbuf >>= 8;
+ }
+ }
+ }
+
+ /* Align the bitstream on a byte boundary. */
+ static forceinline void
+ deflate_align_bitstream(struct deflate_output_bitstream *os)
+ {
+ os->bitcount += -os->bitcount & 7;
+ deflate_flush_bits(os);
+ }
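
For example, if bitcount is 3, then -bitcount & 7 evaluates to 5, padding bitcount up to 8 so the flush emits a whole byte; if bitcount is already a multiple of 8, the expression adds nothing.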
+
+ /*
+ * Flush any remaining bits to the output buffer if needed. Return the total
+ * number of bytes written to the output buffer, or 0 if an overflow occurred.
+ */
+ static u32
+ deflate_flush_output(struct deflate_output_bitstream *os)
+ {
+ if (os->next == os->end) /* overflow? */
+ return 0;
+
+ while ((int)os->bitcount > 0) {
+ *os->next++ = os->bitbuf;
+ os->bitcount -= 8;
+ os->bitbuf >>= 8;
+ }
+
+ return os->next - os->begin;
+ }
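
This 0-on-overflow convention surfaces in the public API: libdeflate_deflate_compress() returns 0 when the output buffer is too small. A minimal sketch of one-shot compression with that API (the level and buffer handling are illustrative):

    #include "libdeflate.h"

    size_t compress_example(const void *in, size_t in_nbytes,
                            void *out, size_t out_capacity)
    {
        struct libdeflate_compressor *c = libdeflate_alloc_compressor(6);
        size_t out_nbytes;

        if (c == NULL)
            return 0;
        /* libdeflate_deflate_compress_bound() gives a safe 'out_capacity'. */
        out_nbytes = libdeflate_deflate_compress(c, in, in_nbytes,
                                                 out, out_capacity);
        libdeflate_free_compressor(c);
        return out_nbytes; /* 0 means the output buffer was too small */
    }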
+
+ /* Given the binary tree node A[subtree_idx] whose children already
+ * satisfy the maxheap property, swap the node with its greater child
+ * until it is greater than both its children, so that the maxheap
+ * property is satisfied in the subtree rooted at A[subtree_idx]. */
+ static void
+ heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx)
+ {
+ unsigned parent_idx;
+ unsigned child_idx;
+ u32 v;
+
+ v = A[subtree_idx];
+ parent_idx = subtree_idx;
+ while ((child_idx = parent_idx * 2) <= length) {
+ if (child_idx < length && A[child_idx + 1] > A[child_idx])
+ child_idx++;
+ if (v >= A[child_idx])
+ break;
+ A[parent_idx] = A[child_idx];
+ parent_idx = child_idx;
+ }
+ A[parent_idx] = v;
+ }
+
+ /* Rearrange the array 'A' so that it satisfies the maxheap property.
+ * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1].
+ */
+ static void
+ heapify_array(u32 A[], unsigned length)
+ {
+ unsigned subtree_idx;
+
+ for (subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--)
+ heapify_subtree(A, length, subtree_idx);
+ }
+
+ /*
+ * Sort the array 'A', which contains 'length' unsigned 32-bit integers.
+ *
+ * Note: name this function heap_sort() instead of heapsort() to avoid colliding
+ * with heapsort() from stdlib.h on BSD-derived systems --- though this isn't
+ * necessary when compiling with -D_ANSI_SOURCE, which is the better solution.
+ */
+ static void
+ heap_sort(u32 A[], unsigned length)
+ {
+ A--; /* Use 1-based indices */
+
+ heapify_array(A, length);
+
+ while (length >= 2) {
+ u32 tmp = A[length];
+ A[length] = A[1];
+ A[1] = tmp;
+ length--;
+ heapify_subtree(A, length, 1);
+ }
+ }
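
A quick illustration of the sort (hypothetical driver; heap_sort is internal to this file):

    u32 a[] = { 5, 1, 4, 2, 3 };
    heap_sort(a, 5); /* a is now { 1, 2, 3, 4, 5 } */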
+
+ #define NUM_SYMBOL_BITS 10
+ #define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1)
+
+ #define GET_NUM_COUNTERS(num_syms) (((((num_syms) + 3) / 4) + 3) & ~3)
+ /*
+ * Sort the symbols primarily by frequency and secondarily by symbol
+ * value. Discard symbols with zero frequency and fill in an array with
+ * the remaining symbols, along with their frequencies. The low
+ * NUM_SYMBOL_BITS bits of each array entry will contain the symbol
+ * value, and the remaining bits will contain the frequency.
+ *
+ * @num_syms
+ * Number of symbols in the alphabet.
+ * Can't be greater than (1 << NUM_SYMBOL_BITS).
+ *
+ * @freqs[num_syms]
+ * The frequency of each symbol.
+ *
+ * @lens[num_syms]
+ * An array that eventually will hold the length of each codeword.
+ * This function only fills in the codeword lengths for symbols that
+ * have zero frequency, which are not well defined per se but will
+ * be set to 0.
+ *
+ * @symout[num_syms]
+ * The output array, described above.
+ *
+ * Returns the number of entries in 'symout' that were filled. This is
+ * the number of symbols that have nonzero frequency.
+ */
+ static unsigned
+ sort_symbols(unsigned num_syms, const u32 freqs[restrict],
+ u8 lens[restrict], u32 symout[restrict])
+ {
+ unsigned sym;
+ unsigned i;
+ unsigned num_used_syms;
+ unsigned num_counters;
+ unsigned counters[GET_NUM_COUNTERS(DEFLATE_MAX_NUM_SYMS)];
+
+ /* We rely on heapsort, but with an added optimization. Since
+ * it's common for most symbol frequencies to be low, we first do
+ * a count sort using a limited number of counters. High
+ * frequencies will be counted in the last counter, and only they
+ * will be sorted with heapsort.
+ *
+ * Note: with more symbols, it is generally beneficial to have more
+ * counters. About 1 counter per 4 symbols seems fast.
+ *
+ * Note: I also tested radix sort, but even for large symbol
+ * counts (> 255) and frequencies bounded at 16 bits (enabling
+ * radix sort by just two base-256 digits), it didn't seem any
+ * faster than the method implemented here.
+ *
+ * Note: I tested the optimized quicksort implementation from
+ * glibc (with indirection overhead removed), but it was only
+ * marginally faster than the simple heapsort implemented here.
+ *
+ * Tests were done with building the codes for LZX. Results may
+ * vary for different compression algorithms...! */
+
+ num_counters = GET_NUM_COUNTERS(num_syms);
+
+ memset(counters, 0, num_counters * sizeof(counters[0]));
+
+ /* Count the frequencies. */
+ for (sym = 0; sym < num_syms; sym++)
+ counters[MIN(freqs[sym], num_counters - 1)]++;
+
+ /* Make the counters cumulative, ignoring the zero-th, which
+ * counted symbols with zero frequency. As a side effect, this
+ * calculates the number of symbols with nonzero frequency. */
+ num_used_syms = 0;
+ for (i = 1; i < num_counters; i++) {
+ unsigned count = counters[i];
+ counters[i] = num_used_syms;
+ num_used_syms += count;
+ }
+
+ /* Sort nonzero-frequency symbols using the counters. At the
+ * same time, set the codeword lengths of zero-frequency symbols
+ * to 0. */
+ for (sym = 0; sym < num_syms; sym++) {
+ u32 freq = freqs[sym];
+ if (freq != 0) {
+ symout[counters[MIN(freq, num_counters - 1)]++] =
+ sym | (freq << NUM_SYMBOL_BITS);
+ } else {
+ lens[sym] = 0;
+ }
+ }
+
+ /* Sort the symbols counted in the last counter. */
+ heap_sort(symout + counters[num_counters - 2],
+ counters[num_counters - 1] - counters[num_counters - 2]);
+
+ return num_used_syms;
+ }
+
+ /*
+ * Build the Huffman tree.
+ *
+ * This is an optimized implementation that
+ * (a) takes advantage of the frequencies being already sorted;
+ * (b) only generates non-leaf nodes, since the non-leaf nodes of a
+ * Huffman tree are sufficient to generate a canonical code;
+ * (c) Only stores parent pointers, not child pointers;
+ * (d) Produces the nodes in the same memory used for input
+ * frequency information.
+ *
+ * Array 'A', which contains 'sym_count' entries, is used for both input
+ * and output. For this function, 'sym_count' must be at least 2.
+ *
+ * For input, the array must contain the frequencies of the symbols,
+ * sorted in increasing order. Specifically, each entry must contain a
+ * frequency left shifted by NUM_SYMBOL_BITS bits. Any data in the low
+ * NUM_SYMBOL_BITS bits of the entries will be ignored by this function.
+ * Although these bits will, in fact, contain the symbols that correspond
+ * to the frequencies, this function is concerned with frequencies only
+ * and keeps the symbols as-is.
+ *
+ * For output, this function will produce the non-leaf nodes of the
+ * Huffman tree. These nodes will be stored in the first (sym_count - 1)
+ * entries of the array. Entry A[sym_count - 2] will represent the root
+ * node. Each other node will contain the zero-based index of its parent
+ * node in 'A', left shifted by NUM_SYMBOL_BITS bits. The low
+ * NUM_SYMBOL_BITS bits of each entry in A will be kept as-is. Again,
+ * note that although these low bits will, in fact, contain a symbol
+ * value, this symbol will have *no relationship* with the Huffman tree
+ * node that happens to occupy the same slot. This is because this
+ * implementation only generates the non-leaf nodes of the tree.
+ */
+ static void
+ build_tree(u32 A[], unsigned sym_count)
+ {
+ /* Index, in 'A', of next lowest frequency symbol that has not
+ * yet been processed. */
+ unsigned i = 0;
+
+ /* Index, in 'A', of next lowest frequency parentless non-leaf
+ * node; or, if equal to 'e', then no such node exists yet. */
+ unsigned b = 0;
+
+ /* Index, in 'A', of next node to allocate as a non-leaf. */
+ unsigned e = 0;
+
+ do {
+ unsigned m, n;
+ u32 freq_shifted;
+
+ /* Choose the two next lowest frequency entries. */
+
+ if (i != sym_count &&
+ (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
+ m = i++;
+ else
+ m = b++;
+
+ if (i != sym_count &&
+ (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
+ n = i++;
+ else
+ n = b++;
+
+ /* Allocate a non-leaf node and link the entries to it.
+ *
+ * If we link an entry that we're visiting for the first
+ * time (via index 'i'), then we're actually linking a
+ * leaf node and it will have no effect, since the leaf
+ * will be overwritten with a non-leaf when index 'e'
+ * catches up to it. But it's not any slower to
+ * unconditionally set the parent index.
+ *
+ * We also compute the frequency of the non-leaf node as
+ * the sum of its two children's frequencies. */
+
+ freq_shifted = (A[m] & ~SYMBOL_MASK) + (A[n] & ~SYMBOL_MASK);
+
+ A[m] = (A[m] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
+ A[n] = (A[n] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
+ A[e] = (A[e] & SYMBOL_MASK) | freq_shifted;
+ e++;
+ } while (sym_count - e > 1);
+ /* When just one entry remains, it is a "leaf" that was
+ * linked to some other node. We ignore it, since the
+ * rest of the array contains the non-leaves which we
+ * need. (Note that we're assuming the cases with 0 or 1
+ * symbols were handled separately.) */
+ }
+
+ /*
+ * Given the stripped-down Huffman tree constructed by build_tree(),
+ * determine the number of codewords that should be assigned each
+ * possible length, taking into account the length-limited constraint.
+ *
+ * @A
+ * The array produced by build_tree(), containing parent index
+ * information for the non-leaf nodes of the Huffman tree. Each
+ * entry in this array is a node; a node's parent always has a
+ * greater index than that node itself. This function will
+ * overwrite the parent index information in this array, so
+ * essentially it will destroy the tree. However, the data in the
+ * low NUM_SYMBOL_BITS of each entry will be preserved.
+ *
+ * @root_idx
+ * The 0-based index of the root node in 'A', and consequently one
+ * less than the number of tree node entries in 'A'. (Or, really 2
+ * less than the actual length of 'A'.)
+ *
+ * @len_counts
+ * An array of length ('max_codeword_len' + 1) in which the number of
+ * codewords having each length <= max_codeword_len will be
+ * returned.
+ *
+ * @max_codeword_len
+ * The maximum permissible codeword length.
+ */
+ static void
+ compute_length_counts(u32 A[restrict], unsigned root_idx,
+ unsigned len_counts[restrict], unsigned max_codeword_len)
+ {
+ unsigned len;
+ int node;
+
+ /* The key observations are:
+ *
+ * (1) We can traverse the non-leaf nodes of the tree, always
+ * visiting a parent before its children, by simply iterating
+ * through the array in reverse order. Consequently, we can
+ * compute the depth of each node in one pass, overwriting the
+ * parent indices with depths.
+ *
+ * (2) We can initially assume that in the real Huffman tree,
+ * both children of the root are leaves. This corresponds to two
+ * codewords of length 1. Then, whenever we visit a (non-leaf)
+ * node during the traversal, we modify this assumption to
+ * account for the current node *not* being a leaf, but rather
+ * its two children being leaves. This causes the loss of one
+ * codeword for the current depth and the addition of two
+ * codewords for the current depth plus one.
+ *
+ * (3) We can handle the length-limited constraint fairly easily
+ * by simply using the largest length available when a depth
+ * exceeds max_codeword_len.
+ */
+
+ for (len = 0; len <= max_codeword_len; len++)
+ len_counts[len] = 0;
+ len_counts[1] = 2;
+
+ /* Set the root node's depth to 0. */
+ A[root_idx] &= SYMBOL_MASK;
+
+ for (node = root_idx - 1; node >= 0; node--) {
+
+ /* Calculate the depth of this node. */
+
+ unsigned parent = A[node] >> NUM_SYMBOL_BITS;
+ unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS;
+ unsigned depth = parent_depth + 1;
+ unsigned len = depth;
+
+ /* Set the depth of this node so that it is available
+ * when its children (if any) are processed. */
+
+ A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS);
+
+ /* If needed, decrease the length to meet the
+ * length-limited constraint. This is not the optimal
+ * method for generating length-limited Huffman codes!
+ * But it should be good enough. */
+ if (len >= max_codeword_len) {
+ len = max_codeword_len;
+ do {
+ len--;
+ } while (len_counts[len] == 0);
+ }
+
+ /* Account for the fact that we have a non-leaf node at
+ * the current depth. */
+ len_counts[len]--;
+ len_counts[len + 1] += 2;
+ }
+ }
+
+
+ /*
+ * Generate the codewords for a canonical Huffman code.
+ *
+ * @A
+ * The output array for codewords. In addition, initially this
+ * array must contain the symbols, sorted primarily by frequency and
+ * secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of
+ * each entry.
+ *
+ * @lens
+ * Output array for codeword lengths.
+ *
+ * @len_counts
+ * An array that provides the number of codewords that will have
+ * each possible length <= max_codeword_len.
+ *
+ * @max_codeword_len
+ * Maximum length, in bits, of each codeword.
+ *
+ * @num_syms
+ * Number of symbols in the alphabet, including symbols with zero
+ * frequency. This is the length of the 'A' and 'lens' arrays.
+ */
+ static void
+ gen_codewords(u32 A[restrict], u8 lens[restrict],
+ const unsigned len_counts[restrict],
+ unsigned max_codeword_len, unsigned num_syms)
+ {
+ u32 next_codewords[DEFLATE_MAX_CODEWORD_LEN + 1];
+ unsigned i;
+ unsigned len;
+ unsigned sym;
+
+ /* Given the number of codewords that will have each length,
+ * assign codeword lengths to symbols. We do this by assigning
+ * the lengths in decreasing order to the symbols sorted
+ * primarily by increasing frequency and secondarily by
+ * increasing symbol value. */
+ for (i = 0, len = max_codeword_len; len >= 1; len--) {
+ unsigned count = len_counts[len];
+ while (count--)
+ lens[A[i++] & SYMBOL_MASK] = len;
+ }
+
+ /* Generate the codewords themselves. We initialize the
+ * 'next_codewords' array to provide the lexicographically first
+ * codeword of each length, then assign codewords in symbol
+ * order. This produces a canonical code. */
+ next_codewords[0] = 0;
+ next_codewords[1] = 0;
+ for (len = 2; len <= max_codeword_len; len++)
+ next_codewords[len] =
+ (next_codewords[len - 1] + len_counts[len - 1]) << 1;
+
+ for (sym = 0; sym < num_syms; sym++)
+ A[sym] = next_codewords[lens[sym]]++;
+ }
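
A worked example of the canonical construction (illustrative): with len_counts[] = { 0, 0, 3, 2 }, i.e. three 2-bit and two 3-bit codewords, the loop computes

    next_codewords[2] = (0 + 0) << 1 = 0
    next_codewords[3] = (0 + 3) << 1 = 6

so the 2-bit codewords assigned in symbol order are 00, 01, 10 and the 3-bit codewords are 110, 111, which together form a valid prefix code.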
+
+ /*
+ * ---------------------------------------------------------------------
+ * make_canonical_huffman_code()
+ * ---------------------------------------------------------------------
+ *
+ * Given an alphabet and the frequency of each symbol in it, construct a
+ * length-limited canonical Huffman code.
+ *
+ * @num_syms
+ * The number of symbols in the alphabet. The symbols are the
+ * integers in the range [0, num_syms - 1]. This parameter must be
+ * at least 2 and can't be greater than (1 << NUM_SYMBOL_BITS).
+ *
+ * @max_codeword_len
+ * The maximum permissible codeword length.
+ *
+ * @freqs
+ * An array of @num_syms entries, each of which specifies the
+ * frequency of the corresponding symbol. It is valid for some,
+ * none, or all of the frequencies to be 0.
+ *
+ * @lens
+ * An array of @num_syms entries in which this function will return
+ * the length, in bits, of the codeword assigned to each symbol.
+ * Symbols with 0 frequency will not have codewords per se, but
+ * their entries in this array will be set to 0. No lengths greater
+ * than @max_codeword_len will be assigned.
+ *
+ * @codewords
+ * An array of @num_syms entries in which this function will return
+ * the codeword for each symbol, right-justified and padded on the
+ * left with zeroes. Codewords for symbols with 0 frequency will be
+ * undefined.
+ *
+ * ---------------------------------------------------------------------
+ *
+ * This function builds a length-limited canonical Huffman code.
+ *
+ * A length-limited Huffman code contains no codewords longer than some
+ * specified length, and has exactly (with some algorithms) or
+ * approximately (with the algorithm used here) the minimum weighted path
+ * length from the root, given this constraint.
+ *
+ * A canonical Huffman code satisfies the properties that a longer
+ * codeword never lexicographically precedes a shorter codeword, and the
+ * lexicographic ordering of codewords of the same length is the same as
+ * the lexicographic ordering of the corresponding symbols. A canonical
+ * Huffman code, or more generally a canonical prefix code, can be
+ * reconstructed from only a list containing the codeword length of each
+ * symbol.
+ *
+ * The classic algorithm to generate a Huffman code creates a node for
+ * each symbol, then inserts these nodes into a min-heap keyed by symbol
+ * frequency. Then, repeatedly, the two lowest-frequency nodes are
+ * removed from the min-heap and added as the children of a new node
+ * having frequency equal to the sum of its two children, which is then
+ * inserted into the min-heap. When only a single node remains in the
+ * min-heap, it is the root of the Huffman tree. The codeword for each
+ * symbol is determined by the path needed to reach the corresponding
+ * node from the root. Descending to the left child appends a 0 bit,
+ * whereas descending to the right child appends a 1 bit.
+ *
+ * The classic algorithm is relatively easy to understand, but it is
+ * subject to a number of inefficiencies. In practice, it is fastest to
+ * first sort the symbols by frequency. (This itself can be subject to
+ * an optimization based on the fact that most frequencies tend to be
+ * low.) At the same time, we sort secondarily by symbol value, which
+ * aids the process of generating a canonical code. Then, during tree
1038
+ * construction, no heap is necessary because both the leaf nodes and the
1039
+ * unparented non-leaf nodes can be easily maintained in sorted order.
1040
+ * Consequently, there can never be more than two possibilities for the
1041
+ * next-lowest-frequency node.
1042
+ *
1043
+ * In addition, because we're generating a canonical code, we actually
1044
+ * don't need the leaf nodes of the tree at all, only the non-leaf nodes.
1045
+ * This is because for canonical code generation we don't need to know
1046
+ * where the symbols are in the tree. Rather, we only need to know how
1047
+ * many leaf nodes have each depth (codeword length). And this
1048
+ * information can, in fact, be quickly generated from the tree of
1049
+ * non-leaves only.
1050
+ *
1051
+ * Furthermore, we can build this stripped-down Huffman tree directly in
1052
+ * the array in which the codewords are to be generated, provided that
1053
+ * these array slots are large enough to hold a symbol and frequency
1054
+ * value.
1055
+ *
1056
+ * Still furthermore, we don't even need to maintain explicit child
1057
+ * pointers. We only need the parent pointers, and even those can be
1058
+ * overwritten in-place with depth information as part of the process of
1059
+ * extracting codeword lengths from the tree. So in summary, we do NOT
1060
+ * need a big structure like:
1061
+ *
1062
+ * struct huffman_tree_node {
1063
+ * unsigned int symbol;
1064
+ * unsigned int frequency;
1065
+ * unsigned int depth;
1066
+ * struct huffman_tree_node *left_child;
1067
+ * struct huffman_tree_node *right_child;
1068
+ * };
1069
+ *
1070
+ *
1071
+ * ... which often gets used in "naive" implementations of Huffman code
1072
+ * generation.
1073
+ *
1074
+ * Many of these optimizations are based on the implementation in 7-Zip
1075
+ * (source file: C/HuffEnc.c), which has been placed in the public domain
1076
+ * by Igor Pavlov.
1077
+ */
1078
+ static void
1079
+ make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len,
1080
+ const u32 freqs[restrict],
1081
+ u8 lens[restrict], u32 codewords[restrict])
1082
+ {
1083
+ u32 *A = codewords;
1084
+ unsigned num_used_syms;
1085
+
1086
+ STATIC_ASSERT(DEFLATE_MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS);
1087
+
1088
+ /* We begin by sorting the symbols primarily by frequency and
1089
+ * secondarily by symbol value. As an optimization, the array
1090
+ * used for this purpose ('A') shares storage with the space in
1091
+ * which we will eventually return the codewords. */
1092
+
1093
+ num_used_syms = sort_symbols(num_syms, freqs, lens, A);
1094
+
1095
+ /* 'num_used_syms' is the number of symbols with nonzero
1096
+ * frequency. This may be less than @num_syms. 'num_used_syms'
1097
+ * is also the number of entries in 'A' that are valid. Each
1098
+ * entry consists of a distinct symbol and a nonzero frequency
1099
+ * packed into a 32-bit integer. */
1100
+
1101
+ /* Handle special cases where only 0 or 1 symbols were used (had
1102
+ * nonzero frequency). */
1103
+
1104
+ if (unlikely(num_used_syms == 0)) {
1105
+ /* Code is empty. sort_symbols() already set all lengths
1106
+ * to 0, so there is nothing more to do. */
1107
+ return;
1108
+ }
1109
+
1110
+ if (unlikely(num_used_syms == 1)) {
1111
+ /* Only one symbol was used, so we only need one
1112
+ * codeword. But two codewords are needed to form the
1113
+ * smallest complete Huffman code, which uses codewords 0
1114
+ * and 1. Therefore, we choose another symbol to which
1115
+ * to assign a codeword. We use 0 (if the used symbol is
1116
+ * not 0) or 1 (if the used symbol is 0). In either
1117
+ * case, the lesser-valued symbol must be assigned
1118
+ * codeword 0 so that the resulting code is canonical. */
1119
+
1120
+ unsigned sym = A[0] & SYMBOL_MASK;
1121
+ unsigned nonzero_idx = sym ? sym : 1;
1122
+
1123
+ codewords[0] = 0;
1124
+ lens[0] = 1;
1125
+ codewords[nonzero_idx] = 1;
1126
+ lens[nonzero_idx] = 1;
1127
+ return;
1128
+ }
1129
+
1130
+ /* Build a stripped-down version of the Huffman tree, sharing the
1131
+ * array 'A' with the symbol values. Then extract length counts
1132
+ * from the tree and use them to generate the final codewords. */
1133
+
1134
+ build_tree(A, num_used_syms);
1135
+
1136
+ {
1137
+ unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1];
1138
+
1139
+ compute_length_counts(A, num_used_syms - 2,
1140
+ len_counts, max_codeword_len);
1141
+
1142
+ gen_codewords(A, lens, len_counts, max_codeword_len, num_syms);
1143
+ }
1144
+ }
1145
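To make the "no heap is necessary" observation above concrete, here is a minimal standalone sketch (hypothetical, independent of libdeflate's packed in-array representation) of the two-queue construction: the leaves sit pre-sorted by frequency in one queue, and the internal nodes, which are created in nondecreasing frequency order, form the other. Only parent pointers are kept, and codeword lengths fall out as node depths:

    #include <stdio.h>

    #define NSYMS 6

    int main(void)
    {
        /* Leaf frequencies, pre-sorted ascending (the job that
         * sort_symbols() performs in the real code). */
        unsigned leaf_freq[NSYMS] = { 1, 1, 2, 3, 5, 8 };
        unsigned node_freq[NSYMS - 1];          /* internal nodes, FIFO */
        unsigned leaf_parent[NSYMS], node_parent[NSYMS - 1];
        unsigned depth[NSYMS - 1];
        unsigned li = 0, head = 0, tail = 0;
        unsigned n, c;
        int i;

        /* Build NSYMS - 1 internal nodes.  The two lowest-frequency
         * unparented nodes are always either the next unused leaf or
         * the oldest unparented internal node, so no heap is needed. */
        for (n = 0; n < NSYMS - 1; n++) {
            unsigned freq = 0;

            for (c = 0; c < 2; c++) {
                if (head == tail ||
                    (li < NSYMS && leaf_freq[li] <= node_freq[head])) {
                    freq += leaf_freq[li];
                    leaf_parent[li++] = n;
                } else {
                    freq += node_freq[head];
                    node_parent[head++] = n;
                }
            }
            node_freq[tail++] = freq;
        }

        /* The last node created is the root, and a node's parent is
         * always created after it, so one backwards pass gives depths. */
        depth[NSYMS - 2] = 0;
        for (i = NSYMS - 3; i >= 0; i--)
            depth[i] = depth[node_parent[i]] + 1;

        /* A leaf's codeword length is its parent's depth plus one.
         * Prints lengths 5, 5, 4, 3, 2, 1 for these frequencies. */
        for (i = 0; i < NSYMS; i++)
            printf("freq %u -> length %u\n",
                   leaf_freq[i], depth[leaf_parent[i]] + 1);
        return 0;
    }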
+
1146
+ /*
1147
+ * Clear the Huffman symbol frequency counters.
1148
+ * This must be called when starting a new DEFLATE block.
1149
+ */
1150
+ static void
1151
+ deflate_reset_symbol_frequencies(struct libdeflate_compressor *c)
1152
+ {
1153
+ memset(&c->freqs, 0, sizeof(c->freqs));
1154
+ }
1155
+
1156
+ /* Reverse the Huffman codeword 'codeword', which is 'len' bits in length. */
1157
+ static u32
1158
+ deflate_reverse_codeword(u32 codeword, u8 len)
1159
+ {
1160
+ /* The following branchless algorithm is faster than going bit by bit.
1161
+ * Note: since no codewords are longer than 16 bits, we only need to
1162
+ * reverse the low 16 bits of the 'u32'. */
1163
+ STATIC_ASSERT(DEFLATE_MAX_CODEWORD_LEN <= 16);
1164
+
1165
+ /* Flip adjacent 1-bit fields */
1166
+ codeword = ((codeword & 0x5555) << 1) | ((codeword & 0xAAAA) >> 1);
1167
+
1168
+ /* Flip adjacent 2-bit fields */
1169
+ codeword = ((codeword & 0x3333) << 2) | ((codeword & 0xCCCC) >> 2);
1170
+
1171
+ /* Flip adjacent 4-bit fields */
1172
+ codeword = ((codeword & 0x0F0F) << 4) | ((codeword & 0xF0F0) >> 4);
1173
+
1174
+ /* Flip adjacent 8-bit fields */
1175
+ codeword = ((codeword & 0x00FF) << 8) | ((codeword & 0xFF00) >> 8);
1176
+
1177
+ /* Return the high 'len' bits of the bit-reversed 16 bit value. */
1178
+ return codeword >> (16 - len);
1179
+ }
1180
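The reversal is needed because DEFLATE packs Huffman codewords into the bitstream starting from each codeword's most-significant bit, while the bitstream writer here emits bits from the least-significant end of its buffer; storing pre-reversed codewords reconciles the two. As a worked example: for codeword 0b00101 with len = 5, the four swap steps reverse the full 16-bit value 0x0005 to 0xA000, and 0xA000 >> (16 - 5) = 0b10100 (20), the 5-bit reversal of 0b00101.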
+
1181
+ /* Make a canonical Huffman code with bit-reversed codewords. */
1182
+ static void
1183
+ deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len,
1184
+ const u32 freqs[], u8 lens[], u32 codewords[])
1185
+ {
1186
+ unsigned sym;
1187
+
1188
+ make_canonical_huffman_code(num_syms, max_codeword_len,
1189
+ freqs, lens, codewords);
1190
+
1191
+ for (sym = 0; sym < num_syms; sym++)
1192
+ codewords[sym] = deflate_reverse_codeword(codewords[sym], lens[sym]);
1193
+ }
1194
+
1195
+ /*
1196
+ * Build the literal/length and offset Huffman codes for a DEFLATE block.
1197
+ *
1198
+ * This takes as input the frequency tables for each code and produces as output
1199
+ * a set of tables that map symbols to codewords and codeword lengths.
1200
+ */
1201
+ static void
1202
+ deflate_make_huffman_codes(const struct deflate_freqs *freqs,
1203
+ struct deflate_codes *codes)
1204
+ {
1205
+ STATIC_ASSERT(MAX_LITLEN_CODEWORD_LEN <= DEFLATE_MAX_LITLEN_CODEWORD_LEN);
1206
+ STATIC_ASSERT(MAX_OFFSET_CODEWORD_LEN <= DEFLATE_MAX_OFFSET_CODEWORD_LEN);
1207
+
1208
+ deflate_make_huffman_code(DEFLATE_NUM_LITLEN_SYMS,
1209
+ MAX_LITLEN_CODEWORD_LEN,
1210
+ freqs->litlen,
1211
+ codes->lens.litlen,
1212
+ codes->codewords.litlen);
1213
+
1214
+ deflate_make_huffman_code(DEFLATE_NUM_OFFSET_SYMS,
1215
+ MAX_OFFSET_CODEWORD_LEN,
1216
+ freqs->offset,
1217
+ codes->lens.offset,
1218
+ codes->codewords.offset);
1219
+ }
1220
+
1221
+ /* Initialize c->static_codes. */
1222
+ static void
1223
+ deflate_init_static_codes(struct libdeflate_compressor *c)
1224
+ {
1225
+ unsigned i;
1226
+
1227
+ for (i = 0; i < 144; i++)
1228
+ c->freqs.litlen[i] = 1 << (9 - 8);
1229
+ for (; i < 256; i++)
1230
+ c->freqs.litlen[i] = 1 << (9 - 9);
1231
+ for (; i < 280; i++)
1232
+ c->freqs.litlen[i] = 1 << (9 - 7);
1233
+ for (; i < 288; i++)
1234
+ c->freqs.litlen[i] = 1 << (9 - 8);
1235
+
1236
+ for (i = 0; i < 32; i++)
1237
+ c->freqs.offset[i] = 1 << (5 - 5);
1238
+
1239
+ deflate_make_huffman_codes(&c->freqs, &c->static_codes);
1240
+ }
1241
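A brief note on why this works: the synthetic frequencies above are 2^(9 - len) and 2^(5 - len), i.e. proportional to 2^-len, and for such a dyadic distribution the only optimal codeword lengths are exactly len bits per symbol. Feeding these frequencies through the ordinary code builder therefore reproduces the fixed codes of RFC 1951: literals 0-143 get 8-bit codewords, 144-255 get 9, length symbols 256-279 get 7, 280-287 get 8, and all 32 offset symbols get 5 (the RFC defines only 30 offset codes, but its fixed code covers 32).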
+
1242
+ /* Return the offset slot for the specified match offset. */
1243
+ static forceinline unsigned
1244
+ deflate_get_offset_slot(struct libdeflate_compressor *c, unsigned offset)
1245
+ {
1246
+ #if USE_FULL_OFFSET_SLOT_FAST
1247
+ return c->offset_slot_fast[offset];
1248
+ #else
1249
+ if (offset <= 256)
1250
+ return c->offset_slot_fast[offset - 1];
1251
+ else
1252
+ return c->offset_slot_fast[256 + ((offset - 1) >> 7)];
1253
+ #endif
1254
+ }
1255
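A note on the #else path: every DEFLATE offset slot base above 256 (257, 385, 513, 769, 1025, ...) is one more than a multiple of 128, so for offsets larger than 256 the slot is fully determined by (offset - 1) >> 7. The first 256 entries map offsets 1..256 directly, and index 256 + ((offset - 1) >> 7) tops out at 511 for offset 32768, so a 512-entry table covers the whole window.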
+
1256
+ /* Write the header fields common to all DEFLATE block types. */
1257
+ static void
1258
+ deflate_write_block_header(struct deflate_output_bitstream *os,
1259
+ bool is_final_block, unsigned block_type)
1260
+ {
1261
+ deflate_add_bits(os, is_final_block, 1);
1262
+ deflate_add_bits(os, block_type, 2);
1263
+ deflate_flush_bits(os);
1264
+ }
1265
+
1266
+ static unsigned
1267
+ deflate_compute_precode_items(const u8 lens[restrict],
1268
+ const unsigned num_lens,
1269
+ u32 precode_freqs[restrict],
1270
+ unsigned precode_items[restrict])
1271
+ {
1272
+ unsigned *itemptr;
1273
+ unsigned run_start;
1274
+ unsigned run_end;
1275
+ unsigned extra_bits;
1276
+ u8 len;
1277
+
1278
+ memset(precode_freqs, 0,
1279
+ DEFLATE_NUM_PRECODE_SYMS * sizeof(precode_freqs[0]));
1280
+
1281
+ itemptr = precode_items;
1282
+ run_start = 0;
1283
+ do {
1284
+ /* Find the next run of codeword lengths. */
1285
+
1286
+ /* len = the length being repeated */
1287
+ len = lens[run_start];
1288
+
1289
+ /* Extend the run. */
1290
+ run_end = run_start;
1291
+ do {
1292
+ run_end++;
1293
+ } while (run_end != num_lens && len == lens[run_end]);
1294
+
1295
+ if (len == 0) {
1296
+ /* Run of zeroes. */
1297
+
1298
+ /* Symbol 18: RLE 11 to 138 zeroes at a time. */
1299
+ while ((run_end - run_start) >= 11) {
1300
+ extra_bits = MIN((run_end - run_start) - 11, 0x7F);
1301
+ precode_freqs[18]++;
1302
+ *itemptr++ = 18 | (extra_bits << 5);
1303
+ run_start += 11 + extra_bits;
1304
+ }
1305
+
1306
+ /* Symbol 17: RLE 3 to 10 zeroes at a time. */
1307
+ if ((run_end - run_start) >= 3) {
1308
+ extra_bits = MIN((run_end - run_start) - 3, 0x7);
1309
+ precode_freqs[17]++;
1310
+ *itemptr++ = 17 | (extra_bits << 5);
1311
+ run_start += 3 + extra_bits;
1312
+ }
1313
+ } else {
1314
+
1315
+ /* A run of nonzero lengths. */
1316
+
1317
+ /* Symbol 16: RLE 3 to 6 of the previous length. */
1318
+ if ((run_end - run_start) >= 4) {
1319
+ precode_freqs[len]++;
1320
+ *itemptr++ = len;
1321
+ run_start++;
1322
+ do {
1323
+ extra_bits = MIN((run_end - run_start) - 3, 0x3);
1324
+ precode_freqs[16]++;
1325
+ *itemptr++ = 16 | (extra_bits << 5);
1326
+ run_start += 3 + extra_bits;
1327
+ } while ((run_end - run_start) >= 3);
1328
+ }
1329
+ }
1330
+
1331
+ /* Output any remaining lengths without RLE. */
1332
+ while (run_start != run_end) {
1333
+ precode_freqs[len]++;
1334
+ *itemptr++ = len;
1335
+ run_start++;
1336
+ }
1337
+ } while (run_start != num_lens);
1338
+
1339
+ return itemptr - precode_items;
1340
+ }
1341
+
1342
+ /*
1343
+ * Huffman codeword lengths for dynamic Huffman blocks are compressed using a
1344
+ * separate Huffman code, the "precode", which contains a symbol for each
1345
+ * possible codeword length in the larger code as well as several special
1346
+ * symbols to represent repeated codeword lengths (a form of run-length
1347
+ * encoding). The precode is itself constructed in canonical form, and its
1348
+ * codeword lengths are represented literally in 19 3-bit fields that
1349
+ * immediately precede the compressed codeword lengths of the larger code.
1350
+ */
1351
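To make the item format concrete, here is a hypothetical standalone decoder for the items produced by deflate_compute_precode_items() above; each item packs a precode symbol into its low 5 bits and that symbol's extra-bits value above them:

    #include <stdio.h>

    static void show_precode_items(const unsigned *items, unsigned n)
    {
        unsigned i;

        for (i = 0; i < n; i++) {
            unsigned sym = items[i] & 0x1F;
            unsigned extra = items[i] >> 5;

            if (sym < 16)
                printf("explicit codeword length %u\n", sym);
            else if (sym == 16)
                printf("repeat previous length %u times\n", 3 + extra);
            else if (sym == 17)
                printf("run of %u zeroes\n", 3 + extra);
            else    /* sym == 18 */
                printf("run of %u zeroes\n", 11 + extra);
        }
    }

    int main(void)
    {
        /* The items the encoder above would emit for the lengths
         * 8, 8, 8, 8, 8, 0, 0, 0, 0. */
        unsigned items[3] = { 8, 16 | (1 << 5), 17 | (1 << 5) };

        show_precode_items(items, 3);
        return 0;
    }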
+
1352
+ /* Precompute the information needed to output Huffman codes. */
1353
+ static void
1354
+ deflate_precompute_huffman_header(struct libdeflate_compressor *c)
1355
+ {
1356
+ /* Compute how many litlen and offset symbols are needed. */
1357
+
1358
+ for (c->num_litlen_syms = DEFLATE_NUM_LITLEN_SYMS;
1359
+ c->num_litlen_syms > 257;
1360
+ c->num_litlen_syms--)
1361
+ if (c->codes.lens.litlen[c->num_litlen_syms - 1] != 0)
1362
+ break;
1363
+
1364
+ for (c->num_offset_syms = DEFLATE_NUM_OFFSET_SYMS;
1365
+ c->num_offset_syms > 1;
1366
+ c->num_offset_syms--)
1367
+ if (c->codes.lens.offset[c->num_offset_syms - 1] != 0)
1368
+ break;
1369
+
1370
+ /* If we're not using the full set of literal/length codeword lengths,
1371
+ * then temporarily move the offset codeword lengths over so that the
1372
+ * literal/length and offset codeword lengths are contiguous. */
1373
+
1374
+ STATIC_ASSERT(offsetof(struct deflate_lens, offset) ==
1375
+ DEFLATE_NUM_LITLEN_SYMS);
1376
+
1377
+ if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) {
1378
+ memmove((u8 *)&c->codes.lens + c->num_litlen_syms,
1379
+ (u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS,
1380
+ c->num_offset_syms);
1381
+ }
1382
+
1383
+ /* Compute the "items" (RLE / literal tokens and extra bits) with which
1384
+ * the codeword lengths in the larger code will be output. */
1385
+ c->num_precode_items =
1386
+ deflate_compute_precode_items((u8 *)&c->codes.lens,
1387
+ c->num_litlen_syms +
1388
+ c->num_offset_syms,
1389
+ c->precode_freqs,
1390
+ c->precode_items);
1391
+
1392
+ /* Build the precode. */
1393
+ STATIC_ASSERT(MAX_PRE_CODEWORD_LEN <= DEFLATE_MAX_PRE_CODEWORD_LEN);
1394
+ deflate_make_huffman_code(DEFLATE_NUM_PRECODE_SYMS,
1395
+ MAX_PRE_CODEWORD_LEN,
1396
+ c->precode_freqs, c->precode_lens,
1397
+ c->precode_codewords);
1398
+
1399
+ /* Count how many precode lengths we actually need to output. */
1400
+ for (c->num_explicit_lens = DEFLATE_NUM_PRECODE_SYMS;
1401
+ c->num_explicit_lens > 4;
1402
+ c->num_explicit_lens--)
1403
+ if (c->precode_lens[deflate_precode_lens_permutation[
1404
+ c->num_explicit_lens - 1]] != 0)
1405
+ break;
1406
+
1407
+ /* Restore the offset codeword lengths if needed. */
1408
+ if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) {
1409
+ memmove((u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS,
1410
+ (u8 *)&c->codes.lens + c->num_litlen_syms,
1411
+ c->num_offset_syms);
1412
+ }
1413
+ }
1414
+
1415
+ /* Output the Huffman codes. */
1416
+ static void
1417
+ deflate_write_huffman_header(struct libdeflate_compressor *c,
1418
+ struct deflate_output_bitstream *os)
1419
+ {
1420
+ unsigned i;
1421
+
1422
+ deflate_add_bits(os, c->num_litlen_syms - 257, 5);
1423
+ deflate_add_bits(os, c->num_offset_syms - 1, 5);
1424
+ deflate_add_bits(os, c->num_explicit_lens - 4, 4);
1425
+ deflate_flush_bits(os);
1426
+
1427
+ /* Output the lengths of the codewords in the precode. */
1428
+ for (i = 0; i < c->num_explicit_lens; i++) {
1429
+ deflate_add_bits(os, c->precode_lens[
1430
+ deflate_precode_lens_permutation[i]], 3);
1431
+ deflate_flush_bits(os);
1432
+ }
1433
+
1434
+ /* Output the encoded lengths of the codewords in the larger code. */
1435
+ for (i = 0; i < c->num_precode_items; i++) {
1436
+ unsigned precode_item = c->precode_items[i];
1437
+ unsigned precode_sym = precode_item & 0x1F;
1438
+ deflate_add_bits(os, c->precode_codewords[precode_sym],
1439
+ c->precode_lens[precode_sym]);
1440
+ if (precode_sym >= 16) {
1441
+ if (precode_sym == 16)
1442
+ deflate_add_bits(os, precode_item >> 5, 2);
1443
+ else if (precode_sym == 17)
1444
+ deflate_add_bits(os, precode_item >> 5, 3);
1445
+ else
1446
+ deflate_add_bits(os, precode_item >> 5, 7);
1447
+ }
1448
+ STATIC_ASSERT(CAN_BUFFER(DEFLATE_MAX_PRE_CODEWORD_LEN + 7));
1449
+ deflate_flush_bits(os);
1450
+ }
1451
+ }
1452
+
1453
+ static void
1454
+ deflate_write_sequences(struct deflate_output_bitstream * restrict os,
1455
+ const struct deflate_codes * restrict codes,
1456
+ const struct deflate_sequence sequences[restrict],
1457
+ const u8 * restrict in_next)
1458
+ {
1459
+ const struct deflate_sequence *seq = sequences;
1460
+
1461
+ for (;;) {
1462
+ u32 litrunlen = seq->litrunlen_and_length & 0x7FFFFF;
1463
+ unsigned length = seq->litrunlen_and_length >> 23;
1464
+ unsigned length_slot;
1465
+ unsigned litlen_symbol;
1466
+ unsigned offset_symbol;
1467
+
1468
+ if (litrunlen) {
1469
+ #if 1
1470
+ while (litrunlen >= 4) {
1471
+ unsigned lit0 = in_next[0];
1472
+ unsigned lit1 = in_next[1];
1473
+ unsigned lit2 = in_next[2];
1474
+ unsigned lit3 = in_next[3];
1475
+
1476
+ deflate_add_bits(os, codes->codewords.litlen[lit0],
1477
+ codes->lens.litlen[lit0]);
1478
+ if (!CAN_BUFFER(2 * MAX_LITLEN_CODEWORD_LEN))
1479
+ deflate_flush_bits(os);
1480
+
1481
+ deflate_add_bits(os, codes->codewords.litlen[lit1],
1482
+ codes->lens.litlen[lit1]);
1483
+ if (!CAN_BUFFER(4 * MAX_LITLEN_CODEWORD_LEN))
1484
+ deflate_flush_bits(os);
1485
+
1486
+ deflate_add_bits(os, codes->codewords.litlen[lit2],
1487
+ codes->lens.litlen[lit2]);
1488
+ if (!CAN_BUFFER(2 * MAX_LITLEN_CODEWORD_LEN))
1489
+ deflate_flush_bits(os);
1490
+
1491
+ deflate_add_bits(os, codes->codewords.litlen[lit3],
1492
+ codes->lens.litlen[lit3]);
1493
+ deflate_flush_bits(os);
1494
+ in_next += 4;
1495
+ litrunlen -= 4;
1496
+ }
1497
+ if (litrunlen-- != 0) {
1498
+ deflate_add_bits(os, codes->codewords.litlen[*in_next],
1499
+ codes->lens.litlen[*in_next]);
1500
+ if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
1501
+ deflate_flush_bits(os);
1502
+ in_next++;
1503
+ if (litrunlen-- != 0) {
1504
+ deflate_add_bits(os, codes->codewords.litlen[*in_next],
1505
+ codes->lens.litlen[*in_next]);
1506
+ if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
1507
+ deflate_flush_bits(os);
1508
+ in_next++;
1509
+ if (litrunlen-- != 0) {
1510
+ deflate_add_bits(os, codes->codewords.litlen[*in_next],
1511
+ codes->lens.litlen[*in_next]);
1512
+ if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
1513
+ deflate_flush_bits(os);
1514
+ in_next++;
1515
+ }
1516
+ }
1517
+ if (CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
1518
+ deflate_flush_bits(os);
1519
+ }
1520
+ #else
1521
+ do {
1522
+ unsigned lit = *in_next++;
1523
+ deflate_add_bits(os, codes->codewords.litlen[lit],
1524
+ codes->lens.litlen[lit]);
1525
+ deflate_flush_bits(os);
1526
+ } while (--litrunlen);
1527
+ #endif
1528
+ }
1529
+
1530
+ if (length == 0)
1531
+ return;
1532
+
1533
+ in_next += length;
1534
+
1535
+ length_slot = seq->length_slot;
1536
+ litlen_symbol = 257 + length_slot;
1537
+
1538
+ /* Litlen symbol */
1539
+ deflate_add_bits(os, codes->codewords.litlen[litlen_symbol],
1540
+ codes->lens.litlen[litlen_symbol]);
1541
+
1542
+ /* Extra length bits */
1543
+ STATIC_ASSERT(CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +
1544
+ DEFLATE_MAX_EXTRA_LENGTH_BITS));
1545
+ deflate_add_bits(os, length - deflate_length_slot_base[length_slot],
1546
+ deflate_extra_length_bits[length_slot]);
1547
+
1548
+ if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +
1549
+ DEFLATE_MAX_EXTRA_LENGTH_BITS +
1550
+ MAX_OFFSET_CODEWORD_LEN +
1551
+ DEFLATE_MAX_EXTRA_OFFSET_BITS))
1552
+ deflate_flush_bits(os);
1553
+
1554
+ /* Offset symbol */
1555
+ offset_symbol = seq->offset_symbol;
1556
+ deflate_add_bits(os, codes->codewords.offset[offset_symbol],
1557
+ codes->lens.offset[offset_symbol]);
1558
+
1559
+ if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN +
1560
+ DEFLATE_MAX_EXTRA_OFFSET_BITS))
1561
+ deflate_flush_bits(os);
1562
+
1563
+ /* Extra offset bits */
1564
+ deflate_add_bits(os, seq->offset - deflate_offset_slot_base[offset_symbol],
1565
+ deflate_extra_offset_bits[offset_symbol]);
1566
+
1567
+ deflate_flush_bits(os);
1568
+
1569
+ seq++;
1570
+ }
1571
+ }
1572
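The CAN_BUFFER() tests above are compile-time constants, so the unrolled loop specializes per target: assuming MAX_LITLEN_CODEWORD_LEN is 14 (its value in upstream libdeflate), four litlen codewords need at most 56 bits, which fits the bit buffer of a 64-bit target, and the loop reduces to four buffer writes and a single flush per iteration; a narrower target keeps the intermediate flushes instead.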
+
1573
+ #if SUPPORT_NEAR_OPTIMAL_PARSING
1574
+ /*
1575
+ * Follow the minimum-cost path in the graph of possible match/literal choices
1576
+ * for the current block and write out the matches/literals using the specified
1577
+ * Huffman codes.
1578
+ *
1579
+ * Note: this duplicates some logic in deflate_write_sequences(); the two are
1580
+ * kept separate because we don't want to waste time translating between
1581
+ * intermediate match/literal representations.
1582
+ */
1583
+ static void
1584
+ deflate_write_item_list(struct deflate_output_bitstream *os,
1585
+ const struct deflate_codes *codes,
1586
+ struct libdeflate_compressor *c,
1587
+ u32 block_length)
1588
+ {
1589
+ struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0];
1590
+ struct deflate_optimum_node * const end_node = &c->p.n.optimum_nodes[block_length];
1591
+ do {
1592
+ unsigned length = cur_node->item & OPTIMUM_LEN_MASK;
1593
+ unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
1594
+ unsigned litlen_symbol;
1595
+ unsigned length_slot;
1596
+ unsigned offset_slot;
1597
+
1598
+ if (length == 1) {
1599
+ /* Literal */
1600
+ litlen_symbol = offset;
1601
+ deflate_add_bits(os, codes->codewords.litlen[litlen_symbol],
1602
+ codes->lens.litlen[litlen_symbol]);
1603
+ deflate_flush_bits(os);
1604
+ } else {
1605
+ /* Match length */
1606
+ length_slot = deflate_length_slot[length];
1607
+ litlen_symbol = 257 + length_slot;
1608
+ deflate_add_bits(os, codes->codewords.litlen[litlen_symbol],
1609
+ codes->lens.litlen[litlen_symbol]);
1610
+
1611
+ deflate_add_bits(os, length - deflate_length_slot_base[length_slot],
1612
+ deflate_extra_length_bits[length_slot]);
1613
+
1614
+ if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +
1615
+ DEFLATE_MAX_EXTRA_LENGTH_BITS +
1616
+ MAX_OFFSET_CODEWORD_LEN +
1617
+ DEFLATE_MAX_EXTRA_OFFSET_BITS))
1618
+ deflate_flush_bits(os);
1619
+
1620
+
1621
+ /* Match offset */
1622
+ offset_slot = deflate_get_offset_slot(c, offset);
1623
+ deflate_add_bits(os, codes->codewords.offset[offset_slot],
1624
+ codes->lens.offset[offset_slot]);
1625
+
1626
+ if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN +
1627
+ DEFLATE_MAX_EXTRA_OFFSET_BITS))
1628
+ deflate_flush_bits(os);
1629
+
1630
+ deflate_add_bits(os, offset - deflate_offset_slot_base[offset_slot],
1631
+ deflate_extra_offset_bits[offset_slot]);
1632
+
1633
+ deflate_flush_bits(os);
1634
+ }
1635
+ cur_node += length;
1636
+ } while (cur_node != end_node);
1637
+ }
1638
+ #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
1639
+
1640
+ /* Output the end-of-block symbol. */
1641
+ static void
1642
+ deflate_write_end_of_block(struct deflate_output_bitstream *os,
1643
+ const struct deflate_codes *codes)
1644
+ {
1645
+ deflate_add_bits(os, codes->codewords.litlen[DEFLATE_END_OF_BLOCK],
1646
+ codes->lens.litlen[DEFLATE_END_OF_BLOCK]);
1647
+ deflate_flush_bits(os);
1648
+ }
1649
+
1650
+ static void
1651
+ deflate_write_uncompressed_block(struct deflate_output_bitstream *os,
1652
+ const u8 *data, u16 len,
1653
+ bool is_final_block)
1654
+ {
1655
+ deflate_write_block_header(os, is_final_block,
1656
+ DEFLATE_BLOCKTYPE_UNCOMPRESSED);
1657
+ deflate_align_bitstream(os);
1658
+
1659
+ if (4 + (u32)len >= os->end - os->next) {
1660
+ os->next = os->end;
1661
+ return;
1662
+ }
1663
+
1664
+ put_unaligned_le16(len, os->next);
1665
+ os->next += 2;
1666
+ put_unaligned_le16(~len, os->next);
1667
+ os->next += 2;
1668
+ memcpy(os->next, data, len);
1669
+ os->next += len;
1670
+ }
1671
+
1672
+ static void
1673
+ deflate_write_uncompressed_blocks(struct deflate_output_bitstream *os,
1674
+ const u8 *data, u32 data_length,
1675
+ bool is_final_block)
1676
+ {
1677
+ do {
1678
+ u16 len = MIN(data_length, UINT16_MAX);
1679
+
1680
+ deflate_write_uncompressed_block(os, data, len,
1681
+ is_final_block && len == data_length);
1682
+ data += len;
1683
+ data_length -= len;
1684
+ } while (data_length != 0);
1685
+ }
1686
+
1687
+ /*
1688
+ * Choose the best type of block to use (dynamic Huffman, static Huffman, or
1689
+ * uncompressed), then output it.
1690
+ */
1691
+ static void
1692
+ deflate_flush_block(struct libdeflate_compressor * restrict c,
1693
+ struct deflate_output_bitstream * restrict os,
1694
+ const u8 * restrict block_begin, u32 block_length,
1695
+ bool is_final_block, bool use_item_list)
1696
+ {
1697
+ static const u8 deflate_extra_precode_bits[DEFLATE_NUM_PRECODE_SYMS] = {
1698
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 7,
1699
+ };
1700
+
1701
+ /* Costs are measured in bits */
1702
+ u32 dynamic_cost = 0;
1703
+ u32 static_cost = 0;
1704
+ u32 uncompressed_cost = 0;
1705
+ struct deflate_codes *codes;
1706
+ int block_type;
1707
+ unsigned sym;
1708
+
1709
+ /* Tally the end-of-block symbol. */
1710
+ c->freqs.litlen[DEFLATE_END_OF_BLOCK]++;
1711
+
1712
+ /* Build dynamic Huffman codes. */
1713
+ deflate_make_huffman_codes(&c->freqs, &c->codes);
1714
+
1715
+ /* Account for the cost of sending dynamic Huffman codes. */
1716
+ deflate_precompute_huffman_header(c);
1717
+ dynamic_cost += 5 + 5 + 4 + (3 * c->num_explicit_lens);
1718
+ for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) {
1719
+ u32 extra = deflate_extra_precode_bits[sym];
1720
+ dynamic_cost += c->precode_freqs[sym] *
1721
+ (extra + c->precode_lens[sym]);
1722
+ }
1723
+
1724
+ /* Account for the cost of encoding literals. */
1725
+ for (sym = 0; sym < 256; sym++) {
1726
+ dynamic_cost += c->freqs.litlen[sym] *
1727
+ c->codes.lens.litlen[sym];
1728
+ }
1729
+ for (sym = 0; sym < 144; sym++)
1730
+ static_cost += c->freqs.litlen[sym] * 8;
1731
+ for (; sym < 256; sym++)
1732
+ static_cost += c->freqs.litlen[sym] * 9;
1733
+
1734
+ /* Account for the cost of encoding the end-of-block symbol. */
1735
+ dynamic_cost += c->codes.lens.litlen[256];
1736
+ static_cost += 7;
1737
+
1738
+ /* Account for the cost of encoding lengths. */
1739
+ for (sym = 257; sym < 257 + ARRAY_LEN(deflate_extra_length_bits); sym++) {
1740
+ u32 extra = deflate_extra_length_bits[sym - 257];
1741
+ dynamic_cost += c->freqs.litlen[sym] *
1742
+ (extra + c->codes.lens.litlen[sym]);
1743
+ static_cost += c->freqs.litlen[sym] *
1744
+ (extra + c->static_codes.lens.litlen[sym]);
1745
+ }
1746
+
1747
+ /* Account for the cost of encoding offsets. */
1748
+ for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) {
1749
+ u32 extra = deflate_extra_offset_bits[sym];
1750
+ dynamic_cost += c->freqs.offset[sym] *
1751
+ (extra + c->codes.lens.offset[sym]);
1752
+ static_cost += c->freqs.offset[sym] * (extra + 5);
1753
+ }
1754
+
1755
+ /* Compute the cost of using uncompressed blocks. */
1756
+ uncompressed_cost += (-(os->bitcount + 3) & 7) + 32 +
1757
+ (40 * (DIV_ROUND_UP(block_length,
1758
+ UINT16_MAX) - 1)) +
1759
+ (8 * block_length);
1760
+
1761
+ /* Choose the cheapest block type. */
1762
+ if (dynamic_cost < MIN(static_cost, uncompressed_cost)) {
1763
+ block_type = DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN;
1764
+ codes = &c->codes;
1765
+ } else if (static_cost < uncompressed_cost) {
1766
+ block_type = DEFLATE_BLOCKTYPE_STATIC_HUFFMAN;
1767
+ codes = &c->static_codes;
1768
+ } else {
1769
+ block_type = DEFLATE_BLOCKTYPE_UNCOMPRESSED;
1770
+ }
1771
+
1772
+ /* Now actually output the block. */
1773
+
1774
+ if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) {
1775
+ /* Note: the length being flushed may exceed the maximum length
1776
+ * of an uncompressed block (65535 bytes). Therefore, more than
1777
+ * one uncompressed block might be needed. */
1778
+ deflate_write_uncompressed_blocks(os, block_begin, block_length,
1779
+ is_final_block);
1780
+ } else {
1781
+ /* Output the block header. */
1782
+ deflate_write_block_header(os, is_final_block, block_type);
1783
+
1784
+ /* Output the Huffman codes (dynamic Huffman blocks only). */
1785
+ if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN)
1786
+ deflate_write_huffman_header(c, os);
1787
+
1788
+ /* Output the literals, matches, and end-of-block symbol. */
1789
+ #if SUPPORT_NEAR_OPTIMAL_PARSING
1790
+ if (use_item_list)
1791
+ deflate_write_item_list(os, codes, c, block_length);
1792
+ else
1793
+ #endif
1794
+ deflate_write_sequences(os, codes, c->p.g.sequences,
1795
+ block_begin);
1796
+ deflate_write_end_of_block(os, codes);
1797
+ }
1798
+ }
1799
+
1800
+ static forceinline void
1801
+ deflate_choose_literal(struct libdeflate_compressor *c, unsigned literal,
1802
+ u32 *litrunlen_p)
1803
+ {
1804
+ c->freqs.litlen[literal]++;
1805
+ ++*litrunlen_p;
1806
+ }
1807
+
1808
+ static forceinline void
1809
+ deflate_choose_match(struct libdeflate_compressor *c,
1810
+ unsigned length, unsigned offset,
1811
+ u32 *litrunlen_p, struct deflate_sequence **next_seq_p)
1812
+ {
1813
+ struct deflate_sequence *seq = *next_seq_p;
1814
+ unsigned length_slot = deflate_length_slot[length];
1815
+ unsigned offset_slot = deflate_get_offset_slot(c, offset);
1816
+
1817
+ c->freqs.litlen[257 + length_slot]++;
1818
+ c->freqs.offset[offset_slot]++;
1819
+
1820
+ seq->litrunlen_and_length = ((u32)length << 23) | *litrunlen_p;
1821
+ seq->offset = offset;
1822
+ seq->length_slot = length_slot;
1823
+ seq->offset_symbol = offset_slot;
1824
+
1825
+ *litrunlen_p = 0;
1826
+ *next_seq_p = seq + 1;
1827
+ }
1828
+
1829
+ static forceinline void
1830
+ deflate_finish_sequence(struct deflate_sequence *seq, u32 litrunlen)
1831
+ {
1832
+ seq->litrunlen_and_length = litrunlen; /* length = 0 */
1833
+ }
1834
+
1835
+ /******************************************************************************/
1836
+
1837
+ /*
1838
+ * Block splitting algorithm. The problem is to decide when it is worthwhile to
1839
+ * start a new block with new Huffman codes. There is a theoretically optimal
1840
+ * solution: recursively consider every possible block split, considering the
1841
+ * exact cost of each block, and choose the minimum cost approach. But this is
1842
+ * far too slow. Instead, as an approximation, we can count symbols and after
1843
+ * every N symbols, compare the expected distribution of symbols based on the
1844
+ * previous data with the actual distribution. If they differ "by enough", then
1845
+ * start a new block.
1846
+ *
1847
+ * As an optimization and heuristic, we don't distinguish between every symbol
1848
+ * but rather we combine many symbols into a single "observation type". For
1849
+ * literals we only look at the high bits and low bits, and for matches we only
1850
+ * look at whether the match is long or not. The assumption is that for typical
1851
+ * "real" data, places that are good block boundaries will tend to be noticable
1852
+ * based only on changes in these aggregate frequencies, without looking for
1853
+ * subtle differences in individual symbols. For example, a change from ASCII
1854
+ * bytes to non-ASCII bytes, or from few matches (generally less compressible)
1855
+ * to many matches (generally more compressible), would be easily noticed based
1856
+ * on the aggregates.
1857
+ *
1858
+ * For determining whether the frequency distributions are "different enough" to
1859
+ * start a new block, the simple heuristic of splitting when the sum of absolute
1860
+ * differences exceeds a constant seems to be good enough. We also add a number
1861
+ * proportional to the block length so that the algorithm is more likely to end
1862
+ * long blocks than short blocks. This reflects the general expectation that it
1863
+ * will become increasingly beneficial to start a new block as the current
1864
+ * block grows longer.
1865
+ *
1866
+ * Finally, for an approximation, it is not strictly necessary that the exact
1867
+ * symbols being used are considered. With "near-optimal parsing", for example,
1868
+ * the actual symbols that will be used are unknown until after the block
1869
+ * boundary is chosen and the block has been optimized. Since the final choices
1870
+ * cannot be used, we can use preliminary "greedy" choices instead.
1871
+ */
1872
+
1873
+ /* Initialize the block split statistics when starting a new block. */
1874
+ static void
1875
+ init_block_split_stats(struct block_split_stats *stats)
1876
+ {
1877
+ int i;
1878
+
1879
+ for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
1880
+ stats->new_observations[i] = 0;
1881
+ stats->observations[i] = 0;
1882
+ }
1883
+ stats->num_new_observations = 0;
1884
+ stats->num_observations = 0;
1885
+ }
1886
+
1887
+ /* Literal observation. Heuristic: use the top 2 bits and the low bit of the
1888
+ * literal, for 8 possible literal observation types. */
1889
+ static forceinline void
1890
+ observe_literal(struct block_split_stats *stats, u8 lit)
1891
+ {
1892
+ stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++;
1893
+ stats->num_new_observations++;
1894
+ }
1895
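In other words, the bucket index is (bits 7-6 of the literal) * 2 + (bit 0 of the literal). For example, the byte 'e' (0x65, binary 01100101) has top bits 01 and low bit 1, so it lands in bucket 0b011 = 3 of the NUM_LITERAL_OBSERVATION_TYPES = 8 buckets, together with every other byte sharing that top-two/bottom-one bit pattern.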
+
1896
+ /* Match observation. Heuristic: use one observation type for "short match" and
1897
+ * one observation type for "long match". */
1898
+ static forceinline void
1899
+ observe_match(struct block_split_stats *stats, unsigned length)
1900
+ {
1901
+ stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + (length >= 9)]++;
1902
+ stats->num_new_observations++;
1903
+ }
1904
+
1905
+ static bool
1906
+ do_end_block_check(struct block_split_stats *stats, u32 block_length)
1907
+ {
1908
+ int i;
1909
+
1910
+ if (stats->num_observations > 0) {
1911
+
1912
+ /* Note: to avoid slow divisions, we do not divide by
1913
+ * 'num_observations', but rather do all math with the numbers
1914
+ * multiplied by 'num_observations'. */
1915
+ u32 total_delta = 0;
1916
+ for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
1917
+ u32 expected = stats->observations[i] * stats->num_new_observations;
1918
+ u32 actual = stats->new_observations[i] * stats->num_observations;
1919
+ u32 delta = (actual > expected) ? actual - expected :
1920
+ expected - actual;
1921
+ total_delta += delta;
1922
+ }
1923
+
1924
+ /* Ready to end the block? */
1925
+ if (total_delta + (block_length / 4096) * stats->num_observations >=
1926
+ NUM_OBSERVATIONS_PER_BLOCK_CHECK * 200 / 512 * stats->num_observations)
1927
+ return true;
1928
+ }
1929
+
1930
+ for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
1931
+ stats->num_observations += stats->new_observations[i];
1932
+ stats->observations[i] += stats->new_observations[i];
1933
+ stats->new_observations[i] = 0;
1934
+ }
1935
+ stats->num_new_observations = 0;
1936
+ return false;
1937
+ }
1938
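Dividing the check through by num_observations * num_new_observations shows what it measures: because 'expected' and 'actual' are cross-multiplied, total_delta is the L1 distance between the old and new observation distributions, scaled by both counts. Assuming NUM_OBSERVATIONS_PER_BLOCK_CHECK is 512 (its value in upstream libdeflate), the block is ended once that distance, plus a bonus of (block_length / 4096) / 512, exceeds 200/512, roughly 0.39.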
+
1939
+ static forceinline bool
1940
+ should_end_block(struct block_split_stats *stats,
1941
+ const u8 *in_block_begin, const u8 *in_next, const u8 *in_end)
1942
+ {
1943
+ /* Ready to check block split statistics? */
1944
+ if (stats->num_new_observations < NUM_OBSERVATIONS_PER_BLOCK_CHECK ||
1945
+ in_next - in_block_begin < MIN_BLOCK_LENGTH ||
1946
+ in_end - in_next < MIN_BLOCK_LENGTH)
1947
+ return false;
1948
+
1949
+ return do_end_block_check(stats, in_next - in_block_begin);
1950
+ }
1951
+
1952
+ /******************************************************************************/
1953
+
1954
+ /*
1955
+ * This is the "greedy" DEFLATE compressor. It always chooses the longest match.
1956
+ */
1957
+ static size_t
1958
+ deflate_compress_greedy(struct libdeflate_compressor * restrict c,
1959
+ const u8 * restrict in, size_t in_nbytes,
1960
+ u8 * restrict out, size_t out_nbytes_avail)
1961
+ {
1962
+ const u8 *in_next = in;
1963
+ const u8 *in_end = in_next + in_nbytes;
1964
+ struct deflate_output_bitstream os;
1965
+ const u8 *in_cur_base = in_next;
1966
+ unsigned max_len = DEFLATE_MAX_MATCH_LEN;
1967
+ unsigned nice_len = MIN(c->nice_match_length, max_len);
1968
+ u32 next_hashes[2] = {0, 0};
1969
+
1970
+ deflate_init_output(&os, out, out_nbytes_avail);
1971
+ hc_matchfinder_init(&c->p.g.hc_mf);
1972
+
1973
+ do {
1974
+ /* Starting a new DEFLATE block. */
1975
+
1976
+ const u8 * const in_block_begin = in_next;
1977
+ const u8 * const in_max_block_end =
1978
+ in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
1979
+ u32 litrunlen = 0;
1980
+ struct deflate_sequence *next_seq = c->p.g.sequences;
1981
+
1982
+ init_block_split_stats(&c->split_stats);
1983
+ deflate_reset_symbol_frequencies(c);
1984
+
1985
+ do {
1986
+ u32 length;
1987
+ u32 offset;
1988
+
1989
+ /* Decrease the maximum and nice match lengths if we're
1990
+ * approaching the end of the input buffer. */
1991
+ if (unlikely(max_len > in_end - in_next)) {
1992
+ max_len = in_end - in_next;
1993
+ nice_len = MIN(nice_len, max_len);
1994
+ }
1995
+
1996
+ length = hc_matchfinder_longest_match(&c->p.g.hc_mf,
1997
+ &in_cur_base,
1998
+ in_next,
1999
+ DEFLATE_MIN_MATCH_LEN - 1,
2000
+ max_len,
2001
+ nice_len,
2002
+ c->max_search_depth,
2003
+ next_hashes,
2004
+ &offset);
2005
+
2006
+ if (length >= DEFLATE_MIN_MATCH_LEN) {
2007
+ /* Match found. */
2008
+ deflate_choose_match(c, length, offset,
2009
+ &litrunlen, &next_seq);
2010
+ observe_match(&c->split_stats, length);
2011
+ in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf,
2012
+ &in_cur_base,
2013
+ in_next + 1,
2014
+ in_end,
2015
+ length - 1,
2016
+ next_hashes);
2017
+ } else {
2018
+ /* No match found. */
2019
+ deflate_choose_literal(c, *in_next, &litrunlen);
2020
+ observe_literal(&c->split_stats, *in_next);
2021
+ in_next++;
2022
+ }
2023
+
2024
+ /* Check if it's time to output another block. */
2025
+ } while (in_next < in_max_block_end &&
2026
+ !should_end_block(&c->split_stats, in_block_begin, in_next, in_end));
2027
+
2028
+ deflate_finish_sequence(next_seq, litrunlen);
2029
+ deflate_flush_block(c, &os, in_block_begin,
2030
+ in_next - in_block_begin,
2031
+ in_next == in_end, false);
2032
+ } while (in_next != in_end);
2033
+
2034
+ return deflate_flush_output(&os);
2035
+ }
2036
+
2037
+ /*
2038
+ * This is the "lazy" DEFLATE compressor. Before choosing a match, it checks to
2039
+ * see if there's a longer match at the next position. If yes, it outputs a
2040
+ * literal and continues to the next position. If no, it outputs the match.
2041
+ */
2042
+ static size_t
2043
+ deflate_compress_lazy(struct libdeflate_compressor * restrict c,
2044
+ const u8 * restrict in, size_t in_nbytes,
2045
+ u8 * restrict out, size_t out_nbytes_avail)
2046
+ {
2047
+ const u8 *in_next = in;
2048
+ const u8 *in_end = in_next + in_nbytes;
2049
+ struct deflate_output_bitstream os;
2050
+ const u8 *in_cur_base = in_next;
2051
+ unsigned max_len = DEFLATE_MAX_MATCH_LEN;
2052
+ unsigned nice_len = MIN(c->nice_match_length, max_len);
2053
+ u32 next_hashes[2] = {0, 0};
2054
+
2055
+ deflate_init_output(&os, out, out_nbytes_avail);
2056
+ hc_matchfinder_init(&c->p.g.hc_mf);
2057
+
2058
+ do {
2059
+ /* Starting a new DEFLATE block. */
2060
+
2061
+ const u8 * const in_block_begin = in_next;
2062
+ const u8 * const in_max_block_end =
2063
+ in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
2064
+ u32 litrunlen = 0;
2065
+ struct deflate_sequence *next_seq = c->p.g.sequences;
2066
+
2067
+ init_block_split_stats(&c->split_stats);
2068
+ deflate_reset_symbol_frequencies(c);
2069
+
2070
+ do {
2071
+ unsigned cur_len;
2072
+ unsigned cur_offset;
2073
+ unsigned next_len;
2074
+ unsigned next_offset;
2075
+
2076
+ if (unlikely(in_end - in_next < DEFLATE_MAX_MATCH_LEN)) {
2077
+ max_len = in_end - in_next;
2078
+ nice_len = MIN(nice_len, max_len);
2079
+ }
2080
+
2081
+ /* Find the longest match at the current position. */
2082
+ cur_len = hc_matchfinder_longest_match(&c->p.g.hc_mf,
2083
+ &in_cur_base,
2084
+ in_next,
2085
+ DEFLATE_MIN_MATCH_LEN - 1,
2086
+ max_len,
2087
+ nice_len,
2088
+ c->max_search_depth,
2089
+ next_hashes,
2090
+ &cur_offset);
2091
+ in_next += 1;
2092
+
2093
+ if (cur_len < DEFLATE_MIN_MATCH_LEN) {
2094
+ /* No match found. Choose a literal. */
2095
+ deflate_choose_literal(c, *(in_next - 1), &litrunlen);
2096
+ observe_literal(&c->split_stats, *(in_next - 1));
2097
+ continue;
2098
+ }
2099
+
2100
+ have_cur_match:
2101
+ observe_match(&c->split_stats, cur_len);
2102
+
2103
+ /* We have a match at the current position. */
2104
+
2105
+ /* If the current match is very long, choose it
2106
+ * immediately. */
2107
+ if (cur_len >= nice_len) {
2108
+ deflate_choose_match(c, cur_len, cur_offset,
2109
+ &litrunlen, &next_seq);
2110
+ in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf,
2111
+ &in_cur_base,
2112
+ in_next,
2113
+ in_end,
2114
+ cur_len - 1,
2115
+ next_hashes);
2116
+ continue;
2117
+ }
2118
+
2119
+ /*
2120
+ * Try to find a match at the next position.
2121
+ *
2122
+ * Note: since we already have a match at the *current*
2123
+ * position, we use only half the 'max_search_depth'
2124
+ * when checking the *next* position. This is a useful
2125
+ * trade-off because it's more worthwhile to use a
2126
+ * greater search depth on the initial match.
2127
+ *
2128
+ * Note: it's possible to structure the code such that
2129
+ * there's only one call to longest_match(), which
2130
+ * handles both the "find the initial match" and "try to
2131
+ * find a longer match" cases. However, it is faster to
2132
+ * have two call sites, with longest_match() inlined at
2133
+ * each.
2134
+ */
2135
+ if (unlikely(in_end - in_next < DEFLATE_MAX_MATCH_LEN)) {
2136
+ max_len = in_end - in_next;
2137
+ nice_len = MIN(nice_len, max_len);
2138
+ }
2139
+ next_len = hc_matchfinder_longest_match(&c->p.g.hc_mf,
2140
+ &in_cur_base,
2141
+ in_next,
2142
+ cur_len,
2143
+ max_len,
2144
+ nice_len,
2145
+ c->max_search_depth / 2,
2146
+ next_hashes,
2147
+ &next_offset);
2148
+ in_next += 1;
2149
+
2150
+ if (next_len > cur_len) {
2151
+ /* Found a longer match at the next position.
2152
+ * Output a literal. Then the next match
2153
+ * becomes the current match. */
2154
+ deflate_choose_literal(c, *(in_next - 2), &litrunlen);
2155
+ cur_len = next_len;
2156
+ cur_offset = next_offset;
2157
+ goto have_cur_match;
2158
+ }
2159
+
2160
+ /* No longer match was found at the next position.
2161
+ * Output the current match. */
2162
+ deflate_choose_match(c, cur_len, cur_offset,
2163
+ &litrunlen, &next_seq);
2164
+ in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf,
2165
+ &in_cur_base,
2166
+ in_next,
2167
+ in_end,
2168
+ cur_len - 2,
2169
+ next_hashes);
2170
+
2171
+ /* Check if it's time to output another block. */
2172
+ } while (in_next < in_max_block_end &&
2173
+ !should_end_block(&c->split_stats, in_block_begin, in_next, in_end));
2174
+
2175
+ deflate_finish_sequence(next_seq, litrunlen);
2176
+ deflate_flush_block(c, &os, in_block_begin,
2177
+ in_next - in_block_begin,
2178
+ in_next == in_end, false);
2179
+ } while (in_next != in_end);
2180
+
2181
+ return deflate_flush_output(&os);
2182
+ }
2183
+
2184
+ #if SUPPORT_NEAR_OPTIMAL_PARSING
2185
+
2186
+ /*
2187
+ * Follow the minimum-cost path in the graph of possible match/literal choices
2188
+ * for the current block and compute the frequencies of the Huffman symbols that
2189
+ * would be needed to output those matches and literals.
2190
+ */
2191
+ static void
2192
+ deflate_tally_item_list(struct libdeflate_compressor *c, u32 block_length)
2193
+ {
2194
+ struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0];
2195
+ struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length];
2196
+ do {
2197
+ unsigned length = cur_node->item & OPTIMUM_LEN_MASK;
2198
+ unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
2199
+
2200
+ if (length == 1) {
2201
+ /* Literal */
2202
+ c->freqs.litlen[offset]++;
2203
+ } else {
2204
+ /* Match */
2205
+ c->freqs.litlen[257 + deflate_length_slot[length]]++;
2206
+ c->freqs.offset[deflate_get_offset_slot(c, offset)]++;
2207
+ }
2208
+ cur_node += length;
2209
+ } while (cur_node != end_node);
2210
+ }
2211
+
2212
+ /* Set the current cost model from the codeword lengths specified in @lens. */
2213
+ static void
2214
+ deflate_set_costs_from_codes(struct libdeflate_compressor *c,
2215
+ const struct deflate_lens *lens)
2216
+ {
2217
+ unsigned i;
2218
+
2219
+ /* Literals */
2220
+ for (i = 0; i < DEFLATE_NUM_LITERALS; i++) {
2221
+ u32 bits = (lens->litlen[i] ? lens->litlen[i] : LITERAL_NOSTAT_BITS);
2222
+ c->p.n.costs.literal[i] = bits << COST_SHIFT;
2223
+ }
2224
+
2225
+ /* Lengths */
2226
+ for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) {
2227
+ unsigned length_slot = deflate_length_slot[i];
2228
+ unsigned litlen_sym = 257 + length_slot;
2229
+ u32 bits = (lens->litlen[litlen_sym] ? lens->litlen[litlen_sym] : LENGTH_NOSTAT_BITS);
2230
+ bits += deflate_extra_length_bits[length_slot];
2231
+ c->p.n.costs.length[i] = bits << COST_SHIFT;
2232
+ }
2233
+
2234
+ /* Offset slots */
2235
+ for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) {
2236
+ u32 bits = (lens->offset[i] ? lens->offset[i] : OFFSET_NOSTAT_BITS);
2237
+ bits += deflate_extra_offset_bits[i];
2238
+ c->p.n.costs.offset_slot[i] = bits << COST_SHIFT;
2239
+ }
2240
+ }
2241
+
2242
+ static forceinline u32
2243
+ deflate_default_literal_cost(unsigned literal)
2244
+ {
2245
+ STATIC_ASSERT(COST_SHIFT == 3);
2246
+ /* 66 is 8.25 bits/symbol */
2247
+ return 66;
2248
+ }
2249
+
2250
+ static forceinline u32
2251
+ deflate_default_length_slot_cost(unsigned length_slot)
2252
+ {
2253
+ STATIC_ASSERT(COST_SHIFT == 3);
2254
+ /* 60 is 7.5 bits/symbol */
2255
+ return 60 + ((u32)deflate_extra_length_bits[length_slot] << COST_SHIFT);
2256
+ }
2257
+
2258
+ static forceinline u32
2259
+ deflate_default_offset_slot_cost(unsigned offset_slot)
2260
+ {
2261
+ STATIC_ASSERT(COST_SHIFT == 3);
2262
+ /* 39 is 4.875 bits/symbol */
2263
+ return 39 + ((u32)deflate_extra_offset_bits[offset_slot] << COST_SHIFT);
2264
+ }
2265
+
2266
+ /*
2267
+ * Set default symbol costs for the first block's first optimization pass.
2268
+ *
2269
+ * It works well to assume that each symbol is equally probable. This results
2270
+ * in each symbol being assigned a cost of (-log2(1.0/num_syms) * (1 <<
2271
+ * COST_SHIFT)) where 'num_syms' is the number of symbols in the corresponding
2272
+ * alphabet. However, we intentionally bias the parse towards matches rather
2273
+ * than literals by using a slightly lower default cost for length symbols than
2274
+ * for literals. This often improves the compression ratio slightly.
2275
+ */
2276
+ static void
2277
+ deflate_set_default_costs(struct libdeflate_compressor *c)
2278
+ {
2279
+ unsigned i;
2280
+
2281
+ /* Literals */
2282
+ for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
2283
+ c->p.n.costs.literal[i] = deflate_default_literal_cost(i);
2284
+
2285
+ /* Lengths */
2286
+ for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++)
2287
+ c->p.n.costs.length[i] = deflate_default_length_slot_cost(
2288
+ deflate_length_slot[i]);
2289
+
2290
+ /* Offset slots */
2291
+ for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++)
2292
+ c->p.n.costs.offset_slot[i] = deflate_default_offset_slot_cost(i);
2293
+ }
2294
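Since COST_SHIFT is 3, costs are expressed in eighths of a bit: the default literal cost of 66 is 8.25 bits, slightly above the log2(288) ≈ 8.17 bits implied by a uniform 288-symbol litlen alphabet, while length slots start at 60/8 = 7.5 bits. That gap is what biases the parse toward matches, as described above.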
+
2295
+ static forceinline void
2296
+ deflate_adjust_cost(u32 *cost_p, u32 default_cost)
2297
+ {
2298
+ *cost_p += ((s32)default_cost - (s32)*cost_p) >> 1;
2299
+ }
2300
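For example, a literal whose cost had been optimized down to 40 (5 bits) moves to 40 + ((66 - 40) >> 1) = 53, i.e. 6.625 bits: halfway back toward the 8.25-bit default.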
+
2301
+ /*
2302
+ * Adjust the costs when beginning a new block.
2303
+ *
2304
+ * Since the current costs have been optimized for the data, it's undesirable to
2305
+ * throw them away and start over with the default costs. At the same time, we
2306
+ * don't want to bias the parse by assuming that the next block will be similar
2307
+ * to the current block. As a compromise, make the costs closer to the
2308
+ * defaults, but don't simply set them to the defaults.
2309
+ */
2310
+ static void
2311
+ deflate_adjust_costs(struct libdeflate_compressor *c)
2312
+ {
2313
+ unsigned i;
2314
+
2315
+ /* Literals */
2316
+ for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
2317
+ deflate_adjust_cost(&c->p.n.costs.literal[i],
2318
+ deflate_default_literal_cost(i));
2319
+
2320
+ /* Lengths */
2321
+ for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++)
2322
+ deflate_adjust_cost(&c->p.n.costs.length[i],
2323
+ deflate_default_length_slot_cost(
2324
+ deflate_length_slot[i]));
2325
+
2326
+ /* Offset slots */
2327
+ for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++)
2328
+ deflate_adjust_cost(&c->p.n.costs.offset_slot[i],
2329
+ deflate_default_offset_slot_cost(i));
2330
+ }
2331
+
2332
+ /*
2333
+ * Find the minimum-cost path through the graph of possible match/literal
2334
+ * choices for this block.
2335
+ *
2336
+ * We find the minimum cost path from 'c->p.n.optimum_nodes[0]', which
2337
+ * represents the node at the beginning of the block, to
2338
+ * 'c->p.n.optimum_nodes[block_length]', which represents the node at the end of
2339
+ * the block. Edge costs are evaluated using the cost model 'c->p.n.costs'.
2340
+ *
2341
+ * The algorithm works backwards, starting at the end node and proceeding
2342
+ * backwards one node at a time. At each node, the minimum cost to reach the
2343
+ * end node is computed and the match/literal choice that begins that path is
2344
+ * saved.
2345
+ */
2346
+ static void
2347
+ deflate_find_min_cost_path(struct libdeflate_compressor *c,
2348
+ const u32 block_length,
2349
+ const struct lz_match *cache_ptr)
2350
+ {
2351
+ struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length];
2352
+ struct deflate_optimum_node *cur_node = end_node;
2353
+
2354
+ cur_node->cost_to_end = 0;
2355
+ do {
2356
+ unsigned num_matches;
2357
+ unsigned literal;
2358
+ u32 best_cost_to_end;
2359
+
2360
+ cur_node--;
2361
+ cache_ptr--;
2362
+
2363
+ num_matches = cache_ptr->length;
2364
+ literal = cache_ptr->offset;
2365
+
2366
+ /* It's always possible to choose a literal. */
2367
+ best_cost_to_end = c->p.n.costs.literal[literal] +
2368
+ (cur_node + 1)->cost_to_end;
2369
+ cur_node->item = ((u32)literal << OPTIMUM_OFFSET_SHIFT) | 1;
2370
+
2371
+ /* Also consider matches if there are any. */
2372
+ if (num_matches) {
2373
+ const struct lz_match *match;
2374
+ unsigned len;
2375
+ unsigned offset;
2376
+ unsigned offset_slot;
2377
+ u32 offset_cost;
2378
+ u32 cost_to_end;
2379
+
2380
+ /*
2381
+ * Consider each length from the minimum
2382
+ * (DEFLATE_MIN_MATCH_LEN) to the length of the longest
2383
+ * match found at this position. For each length, we
2384
+ * consider only the smallest offset for which that
2385
+ * length is available. Although this is not guaranteed
2386
+ * to be optimal due to the possibility of a larger
2387
+ * offset costing less than a smaller offset to code,
2388
+ * this is a very useful heuristic.
2389
+ */
2390
+ match = cache_ptr - num_matches;
2391
+ len = DEFLATE_MIN_MATCH_LEN;
2392
+ do {
2393
+ offset = match->offset;
2394
+ offset_slot = deflate_get_offset_slot(c, offset);
2395
+ offset_cost = c->p.n.costs.offset_slot[offset_slot];
2396
+ do {
2397
+ cost_to_end = offset_cost +
2398
+ c->p.n.costs.length[len] +
2399
+ (cur_node + len)->cost_to_end;
2400
+ if (cost_to_end < best_cost_to_end) {
2401
+ best_cost_to_end = cost_to_end;
2402
+ cur_node->item = ((u32)offset << OPTIMUM_OFFSET_SHIFT) | len;
2403
+ }
2404
+ } while (++len <= match->length);
2405
+ } while (++match != cache_ptr);
2406
+ cache_ptr -= num_matches;
2407
+ }
2408
+ cur_node->cost_to_end = best_cost_to_end;
2409
+ } while (cur_node != &c->p.n.optimum_nodes[0]);
2410
+ }
2411
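A stripped-down standalone sketch of the same backward dynamic program may help; it uses made-up fixed costs and at most one candidate match per position, rather than libdeflate's cost model and match cache:

    #include <stdio.h>

    #define N 8

    int main(void)
    {
        /* match_len[i] != 0 means a match of that length starts at byte i. */
        unsigned match_len[N] = { 0, 5, 0, 0, 0, 3, 0, 0 };
        unsigned literal_cost = 8, match_cost = 12;  /* hypothetical bit costs */
        unsigned cost_to_end[N + 1];
        unsigned choice[N];  /* 1 = literal, otherwise the match length */
        int i;

        cost_to_end[N] = 0;
        for (i = N - 1; i >= 0; i--) {
            /* It's always possible to choose a literal. */
            cost_to_end[i] = literal_cost + cost_to_end[i + 1];
            choice[i] = 1;

            /* Also consider the match, if one fits in the block. */
            if (match_len[i] != 0 && i + match_len[i] <= N) {
                unsigned c = match_cost + cost_to_end[i + match_len[i]];

                if (c < cost_to_end[i]) {
                    cost_to_end[i] = c;
                    choice[i] = match_len[i];
                }
            }
        }

        /* Walk the chosen path forwards, as deflate_tally_item_list()
         * and deflate_write_item_list() do with 'optimum_nodes'.
         * Prints: literal at 0, match at 1, literals at 6 and 7; 36 bits. */
        for (i = 0; i < N; i += choice[i])
            printf("pos %d: %s\n", i, choice[i] == 1 ? "literal" : "match");
        printf("cost of block: %u bits\n", cost_to_end[0]);
        return 0;
    }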
+
2412
+ /*
2413
+ * Choose the literal/match sequence to use for the current block. The basic
2414
+ * algorithm finds a minimum-cost path through the block's graph of
2415
+ * literal/match choices, given a cost model. However, the cost of each symbol
2416
+ * is unknown until the Huffman codes have been built, but at the same time the
2417
+ * Huffman codes depend on the frequencies of chosen symbols. Consequently,
2418
+ * multiple passes must be used to try to approximate an optimal solution. The
2419
+ * first pass uses default costs, mixed with the costs from the previous block
2420
+ * if any. Later passes use the Huffman codeword lengths from the previous pass
2421
+ * as the costs.
2422
+ */
2423
+ static void
2424
+ deflate_optimize_block(struct libdeflate_compressor *c, u32 block_length,
2425
+ const struct lz_match *cache_ptr, bool is_first_block)
2426
+ {
2427
+ unsigned num_passes_remaining = c->p.n.num_optim_passes;
2428
+ u32 i;
2429
+
2430
+ /* Force the block to really end at the desired length, even if some
2431
+ * matches extend beyond it. */
2432
+ for (i = block_length; i <= MIN(block_length - 1 + DEFLATE_MAX_MATCH_LEN,
2433
+ ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++)
2434
+ c->p.n.optimum_nodes[i].cost_to_end = 0x80000000;
2435
+
2436
+ /* Set the initial costs. */
2437
+ if (is_first_block)
2438
+ deflate_set_default_costs(c);
2439
+ else
2440
+ deflate_adjust_costs(c);
2441
+
2442
+ for (;;) {
2443
+ /* Find the minimum cost path for this pass. */
2444
+ deflate_find_min_cost_path(c, block_length, cache_ptr);
2445
+
2446
+ /* Compute frequencies of the chosen symbols. */
2447
+ deflate_reset_symbol_frequencies(c);
2448
+ deflate_tally_item_list(c, block_length);
2449
+
2450
+ if (--num_passes_remaining == 0)
2451
+ break;
2452
+
2453
+ /* At least one optimization pass remains; update the costs. */
2454
+ deflate_make_huffman_codes(&c->freqs, &c->codes);
2455
+ deflate_set_costs_from_codes(c, &c->codes.lens);
2456
+ }
2457
+ }
2458
+
2459
+ /*
2460
+ * This is the "near-optimal" DEFLATE compressor. It computes the optimal
2461
+ * representation of each DEFLATE block using a minimum-cost path search over
2462
+ * the graph of possible match/literal choices for that block, assuming a
2463
+ * certain cost for each Huffman symbol.
2464
+ *
2465
+ * For several reasons, the end result is not guaranteed to be optimal:
2466
+ *
2467
+ * - Nonoptimal choice of blocks
2468
+ * - Heuristic limitations on which matches are actually considered
2469
+ * - Symbol costs are unknown until the symbols have already been chosen
2470
+ * (so iterative optimization must be used)
2471
+ */
+ static size_t
+ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
+                               const u8 * restrict in, size_t in_nbytes,
+                               u8 * restrict out, size_t out_nbytes_avail)
+ {
+     const u8 *in_next = in;
+     const u8 *in_end = in_next + in_nbytes;
+     struct deflate_output_bitstream os;
+     const u8 *in_cur_base = in_next;
+     const u8 *in_next_slide = in_next + MIN(in_end - in_next,
+                                             MATCHFINDER_WINDOW_SIZE);
+     unsigned max_len = DEFLATE_MAX_MATCH_LEN;
+     unsigned nice_len = MIN(c->nice_match_length, max_len);
+     u32 next_hashes[2] = {0, 0};
+
+     deflate_init_output(&os, out, out_nbytes_avail);
+     bt_matchfinder_init(&c->p.n.bt_mf);
+
+     do {
+         /* Starting a new DEFLATE block. */
+
+         struct lz_match *cache_ptr = c->p.n.match_cache;
+         const u8 * const in_block_begin = in_next;
+         const u8 * const in_max_block_end =
+             in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
+         const u8 *next_observation = in_next;
+
+         init_block_split_stats(&c->split_stats);
+
+         /*
+          * Find matches until we decide to end the block.  We end the
+          * block if any of the following is true:
+          *
+          * (1) Maximum block length has been reached
+          * (2) Match cache may overflow
+          * (3) Block split heuristic says to split now
+          */
+         do {
+             struct lz_match *matches;
+             unsigned best_len;
+
+             /* Slide the window forward if needed. */
+             if (in_next == in_next_slide) {
+                 bt_matchfinder_slide_window(&c->p.n.bt_mf);
+                 in_cur_base = in_next;
+                 in_next_slide = in_next + MIN(in_end - in_next,
+                                               MATCHFINDER_WINDOW_SIZE);
+             }
+
+             /* Decrease the maximum and nice match lengths if we're
+              * approaching the end of the input buffer. */
+             if (unlikely(max_len > in_end - in_next)) {
+                 max_len = in_end - in_next;
+                 nice_len = MIN(nice_len, max_len);
+             }
+
+             /*
+              * Find matches at the current position using the
+              * binary tree matchfinder and save them in
+              * 'match_cache'.
+              *
+              * Note: the binary tree matchfinder is better suited
+              * for optimal parsing than the hash chain matchfinder.
+              * The reasons for this include:
+              *
+              * - The binary tree matchfinder can find more matches
+              *   in the same number of steps.
+              * - One of the major advantages of hash chains is that
+              *   skipping positions (not searching for matches at
+              *   them) is faster; however, with optimal parsing we
+              *   search for matches at almost all positions, so this
+              *   advantage of hash chains is negated.
+              */
+             matches = cache_ptr;
+             best_len = 0;
+             if (likely(max_len >= BT_MATCHFINDER_REQUIRED_NBYTES)) {
+                 cache_ptr = bt_matchfinder_get_matches(&c->p.n.bt_mf,
+                                                        in_cur_base,
+                                                        in_next - in_cur_base,
+                                                        max_len,
+                                                        nice_len,
+                                                        c->max_search_depth,
+                                                        next_hashes,
+                                                        &best_len,
+                                                        matches);
+             }
+
+             if (in_next >= next_observation) {
+                 if (best_len >= 4) {
+                     observe_match(&c->split_stats, best_len);
+                     next_observation = in_next + best_len;
+                 } else {
+                     observe_literal(&c->split_stats, *in_next);
+                     next_observation = in_next + 1;
+                 }
+             }
+
+             cache_ptr->length = cache_ptr - matches;
+             cache_ptr->offset = *in_next;
+             in_next++;
+             cache_ptr++;
+
+             /*
+              * If a very long match was found, don't cache any
+              * matches for the bytes covered by that match.  This
+              * avoids degenerate behavior when compressing highly
+              * redundant data, where the number of matches can be
+              * very large.
+              *
+              * This heuristic doesn't actually hurt the compression
+              * ratio very much.  If there's a long match, then the
+              * data must be highly compressible, so it doesn't
+              * matter much what we do.
+              */
+             if (best_len >= DEFLATE_MIN_MATCH_LEN && best_len >= nice_len) {
+                 --best_len;
+                 do {
+                     if (in_next == in_next_slide) {
+                         bt_matchfinder_slide_window(&c->p.n.bt_mf);
+                         in_cur_base = in_next;
+                         in_next_slide = in_next + MIN(in_end - in_next,
+                                                       MATCHFINDER_WINDOW_SIZE);
+                     }
+                     if (unlikely(max_len > in_end - in_next)) {
+                         max_len = in_end - in_next;
+                         nice_len = MIN(nice_len, max_len);
+                     }
+                     if (max_len >= BT_MATCHFINDER_REQUIRED_NBYTES) {
+                         bt_matchfinder_skip_position(&c->p.n.bt_mf,
+                                                      in_cur_base,
+                                                      in_next - in_cur_base,
+                                                      nice_len,
+                                                      c->max_search_depth,
+                                                      next_hashes);
+                     }
+                     cache_ptr->length = 0;
+                     cache_ptr->offset = *in_next;
+                     in_next++;
+                     cache_ptr++;
+                 } while (--best_len);
+             }
+         } while (in_next < in_max_block_end &&
+                  cache_ptr < &c->p.n.match_cache[CACHE_LENGTH] &&
+                  !should_end_block(&c->split_stats, in_block_begin, in_next, in_end));
+
+         /* All the matches for this block have been cached.  Now choose
+          * the sequence of items to output and flush the block. */
+         deflate_optimize_block(c, in_next - in_block_begin, cache_ptr,
+                                in_block_begin == in);
+         deflate_flush_block(c, &os, in_block_begin, in_next - in_block_begin,
+                             in_next == in_end, true);
+     } while (in_next != in_end);
+
+     return deflate_flush_output(&os);
+ }
+
+ #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
+
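A detail worth spelling out: `match_cache` is a flat array of `struct lz_match` records in which each input position contributes its candidate matches followed by one trailer record, written above as `cache_ptr->length = cache_ptr - matches` (the match count) and `cache_ptr->offset = *in_next` (the byte at that position). Because the count comes after the matches, the cache can only be parsed backward, which is why `deflate_find_min_cost_path()` steps with `cache_ptr -= num_matches`. A self-contained sketch of a backward reader under those assumptions; the struct here only mirrors the two-field shape of libdeflate's `lz_match`:

#include <stdio.h>

struct lz_match { unsigned short length; unsigned short offset; };

static void
read_match_cache_backward(const struct lz_match *cache_begin,
                          const struct lz_match *cache_ptr)
{
    /* 'cache_ptr' starts one past the final trailer, exactly where the
     * writer loop left it. */
    while (cache_ptr != cache_begin) {
        --cache_ptr;                          /* trailer record */
        unsigned num_matches = cache_ptr->length;
        unsigned literal = cache_ptr->offset; /* byte at this position */
        const struct lz_match *matches = cache_ptr - num_matches;

        printf("byte 0x%02x: %u match(es)\n", literal, num_matches);
        for (unsigned i = 0; i < num_matches; i++)
            printf("  len %u, offset %u\n",
                   (unsigned)matches[i].length, (unsigned)matches[i].offset);

        cache_ptr = matches; /* step over this position's match list */
    }
}

int main(void)
{
    /* One position with two candidate matches, then its trailer record. */
    struct lz_match cache[] = {
        { 3, 100 }, { 5, 4000 }, /* matches for the position */
        { 2, 'a' },              /* trailer: count = 2, byte = 'a' */
    };
    read_match_cache_backward(cache, cache + 3);
    return 0;
}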
+ /* Initialize c->offset_slot_fast. */
+ static void
+ deflate_init_offset_slot_fast(struct libdeflate_compressor *c)
+ {
+     unsigned offset_slot;
+     unsigned offset;
+     unsigned offset_end;
+
+     for (offset_slot = 0;
+          offset_slot < ARRAY_LEN(deflate_offset_slot_base);
+          offset_slot++)
+     {
+         offset = deflate_offset_slot_base[offset_slot];
+ #if USE_FULL_OFFSET_SLOT_FAST
+         offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
+         do {
+             c->offset_slot_fast[offset] = offset_slot;
+         } while (++offset != offset_end);
+ #else
+         if (offset <= 256) {
+             offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
+             do {
+                 c->offset_slot_fast[offset - 1] = offset_slot;
+             } while (++offset != offset_end);
+         } else {
+             offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
+             do {
+                 c->offset_slot_fast[256 + ((offset - 1) >> 7)] = offset_slot;
+             } while ((offset += (1 << 7)) != offset_end);
+         }
+ #endif
+     }
+ }
+
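The table built above gives O(1) offset-to-slot mapping. In the compact (non-`USE_FULL_OFFSET_SLOT_FAST`) layout, offsets 1..256 index the first 256 entries directly via `offset - 1`, while larger offsets are quantized into 128-offset buckets starting at entry 256; that works because every DEFLATE offset slot above 256 spans a multiple of 128 offsets. A sketch of the lookup this layout implies, with `table` standing in for `c->offset_slot_fast` (the real helper in this file may differ in name and qualifiers):

/* Illustrative lookup for the compact table layout built above. */
static inline unsigned
toy_offset_to_slot(const unsigned char *table, unsigned offset)
{
    if (offset <= 256)
        return table[offset - 1];            /* direct index */
    return table[256 + ((offset - 1) >> 7)]; /* 128-offset buckets */
}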
+ LIBDEFLATEAPI struct libdeflate_compressor *
+ libdeflate_alloc_compressor(int compression_level)
+ {
+     struct libdeflate_compressor *c;
+     size_t size;
+
+ #if SUPPORT_NEAR_OPTIMAL_PARSING
+     if (compression_level >= 8)
+         size = offsetof(struct libdeflate_compressor, p) + sizeof(c->p.n);
+     else
+ #endif
+         size = offsetof(struct libdeflate_compressor, p) + sizeof(c->p.g);
+
+     c = aligned_malloc(MATCHFINDER_ALIGNMENT, size);
+     if (!c)
+         return NULL;
+
+     switch (compression_level) {
+     case 1:
+         c->impl = deflate_compress_greedy;
+         c->max_search_depth = 2;
+         c->nice_match_length = 8;
+         break;
+     case 2:
+         c->impl = deflate_compress_greedy;
+         c->max_search_depth = 6;
+         c->nice_match_length = 10;
+         break;
+     case 3:
+         c->impl = deflate_compress_greedy;
+         c->max_search_depth = 12;
+         c->nice_match_length = 14;
+         break;
+     case 4:
+         c->impl = deflate_compress_greedy;
+         c->max_search_depth = 24;
+         c->nice_match_length = 24;
+         break;
+     case 5:
+         c->impl = deflate_compress_lazy;
+         c->max_search_depth = 20;
+         c->nice_match_length = 30;
+         break;
+     case 6:
+         c->impl = deflate_compress_lazy;
+         c->max_search_depth = 40;
+         c->nice_match_length = 65;
+         break;
+     case 7:
+         c->impl = deflate_compress_lazy;
+         c->max_search_depth = 100;
+         c->nice_match_length = 130;
+         break;
+ #if SUPPORT_NEAR_OPTIMAL_PARSING
+     case 8:
+         c->impl = deflate_compress_near_optimal;
+         c->max_search_depth = 12;
+         c->nice_match_length = 20;
+         c->p.n.num_optim_passes = 1;
+         break;
+     case 9:
+         c->impl = deflate_compress_near_optimal;
+         c->max_search_depth = 16;
+         c->nice_match_length = 26;
+         c->p.n.num_optim_passes = 2;
+         break;
+     case 10:
+         c->impl = deflate_compress_near_optimal;
+         c->max_search_depth = 30;
+         c->nice_match_length = 50;
+         c->p.n.num_optim_passes = 2;
+         break;
+     case 11:
+         c->impl = deflate_compress_near_optimal;
+         c->max_search_depth = 60;
+         c->nice_match_length = 80;
+         c->p.n.num_optim_passes = 3;
+         break;
+     case 12:
+         c->impl = deflate_compress_near_optimal;
+         c->max_search_depth = 100;
+         c->nice_match_length = 133;
+         c->p.n.num_optim_passes = 4;
+         break;
+ #else
+     case 8:
+         c->impl = deflate_compress_lazy;
+         c->max_search_depth = 150;
+         c->nice_match_length = 200;
+         break;
+     case 9:
+         c->impl = deflate_compress_lazy;
+         c->max_search_depth = 200;
+         c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
+         break;
+ #endif
+     default:
+         aligned_free(c);
+         return NULL;
+     }
+
+     c->compression_level = compression_level;
+
+     deflate_init_offset_slot_fast(c);
+     deflate_init_static_codes(c);
+
+     return c;
+ }
+
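Note the allocation-size computation above: because `p` is a trailing union of per-parser state, the greedy/lazy levels allocate only up to the end of the smaller member rather than the full `sizeof(struct libdeflate_compressor)`. A self-contained illustration of the same `offsetof` trick on a toy struct (all names here are illustrative):

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_compressor {
    int level;
    union {
        char greedy_state[64];         /* small parser state */
        char near_optimal_state[4096]; /* large parser state */
    } p;
};

int main(void)
{
    /* sizeof does not evaluate its operand, so the null pointer here is
     * only notional. */
    size_t small = offsetof(struct toy_compressor, p) +
                   sizeof(((struct toy_compressor *)0)->p.greedy_state);
    size_t full = sizeof(struct toy_compressor);

    struct toy_compressor *c = malloc(small); /* enough for greedy levels */
    if (!c)
        return 1;
    c->level = 1;
    printf("small: %zu bytes, full: %zu bytes\n", small, full);
    free(c);
    return 0;
}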
+ LIBDEFLATEAPI size_t
+ libdeflate_deflate_compress(struct libdeflate_compressor *c,
+                             const void *in, size_t in_nbytes,
+                             void *out, size_t out_nbytes_avail)
+ {
+     if (unlikely(out_nbytes_avail < MIN_OUTPUT_SIZE))
+         return 0;
+
+     /* For extremely small inputs, just use a single uncompressed block. */
+     if (unlikely(in_nbytes < 16)) {
+         struct deflate_output_bitstream os;
+
+         deflate_init_output(&os, out, out_nbytes_avail);
+         if (in_nbytes == 0)
+             in = &os; /* Avoid passing NULL to memcpy() */
+         deflate_write_uncompressed_block(&os, in, in_nbytes, true);
+         return deflate_flush_output(&os);
+     }
+
+     return (*c->impl)(c, in, in_nbytes, out, out_nbytes_avail);
+ }
+
+ LIBDEFLATEAPI void
+ libdeflate_free_compressor(struct libdeflate_compressor *c)
+ {
+     aligned_free(c);
+ }
+
+ unsigned int
+ deflate_get_compression_level(struct libdeflate_compressor *c)
+ {
+     return c->compression_level;
+ }
+
+ LIBDEFLATEAPI size_t
+ libdeflate_deflate_compress_bound(struct libdeflate_compressor *c,
+                                   size_t in_nbytes)
+ {
+     /*
+      * The worst case is all uncompressed blocks, where one block has
+      * length <= MIN_BLOCK_LENGTH and the others have length
+      * MIN_BLOCK_LENGTH.  Each uncompressed block has 5 bytes of overhead:
+      * 1 for BFINAL, BTYPE, and alignment to a byte boundary; 2 for LEN;
+      * and 2 for NLEN.
+      */
+     size_t max_num_blocks = MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1);
+
+     return (5 * max_num_blocks) + in_nbytes + 1 + MIN_OUTPUT_SIZE;
+ }
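
Taken together, the entry points in this file compose into the usual allocate / bound / compress / free pattern. A minimal caller, assuming only the public declarations from libdeflate.h:

#include <stdio.h>
#include <stdlib.h>
#include "libdeflate.h"

int main(void)
{
    const char msg[] = "hello, hello, hello, hello: redundant enough to shrink";
    size_t in_nbytes = sizeof(msg) - 1;

    /* Level 9 selects the near-optimal parser when it is compiled in. */
    struct libdeflate_compressor *c = libdeflate_alloc_compressor(9);
    if (!c)
        return 1;

    /* Size the output buffer from the worst-case bound above, so the
     * compressor can never return 0 for "output too small". */
    size_t bound = libdeflate_deflate_compress_bound(c, in_nbytes);
    void *out = malloc(bound);
    if (!out) {
        libdeflate_free_compressor(c);
        return 1;
    }

    size_t out_nbytes = libdeflate_deflate_compress(c, msg, in_nbytes,
                                                    out, bound);
    printf("%zu -> %zu bytes\n", in_nbytes, out_nbytes);

    free(out);
    libdeflate_free_compressor(c);
    return 0;
}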