libdeflate 0.1.0

Files changed (89)
  1. checksums.yaml +7 -0
  2. data/.gitignore +17 -0
  3. data/.gitmodules +3 -0
  4. data/.rspec +2 -0
  5. data/.rubocop.yml +1 -0
  6. data/.rubocop_todo.yml +9 -0
  7. data/.travis.yml +5 -0
  8. data/Gemfile +4 -0
  9. data/LICENSE.txt +21 -0
  10. data/README.md +52 -0
  11. data/Rakefile +15 -0
  12. data/bin/console +14 -0
  13. data/bin/setup +8 -0
  14. data/ext/libdeflate/extconf.rb +14 -0
  15. data/ext/libdeflate/libdeflate/.gitignore +19 -0
  16. data/ext/libdeflate/libdeflate/COPYING +21 -0
  17. data/ext/libdeflate/libdeflate/Makefile +231 -0
  18. data/ext/libdeflate/libdeflate/Makefile.msc +64 -0
  19. data/ext/libdeflate/libdeflate/NEWS +57 -0
  20. data/ext/libdeflate/libdeflate/README.md +170 -0
  21. data/ext/libdeflate/libdeflate/common/common_defs.h +351 -0
  22. data/ext/libdeflate/libdeflate/common/compiler_gcc.h +134 -0
  23. data/ext/libdeflate/libdeflate/common/compiler_msc.h +95 -0
  24. data/ext/libdeflate/libdeflate/lib/adler32.c +213 -0
  25. data/ext/libdeflate/libdeflate/lib/adler32_impl.h +281 -0
  26. data/ext/libdeflate/libdeflate/lib/aligned_malloc.c +57 -0
  27. data/ext/libdeflate/libdeflate/lib/aligned_malloc.h +13 -0
  28. data/ext/libdeflate/libdeflate/lib/bt_matchfinder.h +357 -0
  29. data/ext/libdeflate/libdeflate/lib/crc32.c +368 -0
  30. data/ext/libdeflate/libdeflate/lib/crc32_impl.h +286 -0
  31. data/ext/libdeflate/libdeflate/lib/crc32_table.h +526 -0
  32. data/ext/libdeflate/libdeflate/lib/decompress_impl.h +404 -0
  33. data/ext/libdeflate/libdeflate/lib/deflate_compress.c +2817 -0
  34. data/ext/libdeflate/libdeflate/lib/deflate_compress.h +14 -0
  35. data/ext/libdeflate/libdeflate/lib/deflate_constants.h +66 -0
  36. data/ext/libdeflate/libdeflate/lib/deflate_decompress.c +889 -0
  37. data/ext/libdeflate/libdeflate/lib/gzip_compress.c +95 -0
  38. data/ext/libdeflate/libdeflate/lib/gzip_constants.h +45 -0
  39. data/ext/libdeflate/libdeflate/lib/gzip_decompress.c +130 -0
  40. data/ext/libdeflate/libdeflate/lib/hc_matchfinder.h +405 -0
  41. data/ext/libdeflate/libdeflate/lib/lib_common.h +35 -0
  42. data/ext/libdeflate/libdeflate/lib/matchfinder_avx2.h +53 -0
  43. data/ext/libdeflate/libdeflate/lib/matchfinder_common.h +205 -0
  44. data/ext/libdeflate/libdeflate/lib/matchfinder_neon.h +61 -0
  45. data/ext/libdeflate/libdeflate/lib/matchfinder_sse2.h +53 -0
  46. data/ext/libdeflate/libdeflate/lib/unaligned.h +202 -0
  47. data/ext/libdeflate/libdeflate/lib/x86_cpu_features.c +169 -0
  48. data/ext/libdeflate/libdeflate/lib/x86_cpu_features.h +48 -0
  49. data/ext/libdeflate/libdeflate/lib/zlib_compress.c +87 -0
  50. data/ext/libdeflate/libdeflate/lib/zlib_constants.h +21 -0
  51. data/ext/libdeflate/libdeflate/lib/zlib_decompress.c +91 -0
  52. data/ext/libdeflate/libdeflate/libdeflate.h +274 -0
  53. data/ext/libdeflate/libdeflate/programs/benchmark.c +558 -0
  54. data/ext/libdeflate/libdeflate/programs/checksum.c +197 -0
  55. data/ext/libdeflate/libdeflate/programs/detect.sh +62 -0
  56. data/ext/libdeflate/libdeflate/programs/gzip.c +603 -0
  57. data/ext/libdeflate/libdeflate/programs/prog_util.c +530 -0
  58. data/ext/libdeflate/libdeflate/programs/prog_util.h +162 -0
  59. data/ext/libdeflate/libdeflate/programs/test_checksums.c +135 -0
  60. data/ext/libdeflate/libdeflate/programs/tgetopt.c +118 -0
  61. data/ext/libdeflate/libdeflate/tools/afl-fuzz/Makefile +12 -0
  62. data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_compress/fuzz.c +40 -0
  63. data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_compress/inputs/0 +0 -0
  64. data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_decompress/fuzz.c +28 -0
  65. data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_decompress/inputs/0 +3 -0
  66. data/ext/libdeflate/libdeflate/tools/afl-fuzz/gzip_decompress/fuzz.c +28 -0
  67. data/ext/libdeflate/libdeflate/tools/afl-fuzz/gzip_decompress/inputs/0 +0 -0
  68. data/ext/libdeflate/libdeflate/tools/afl-fuzz/prepare_for_fuzz.sh +14 -0
  69. data/ext/libdeflate/libdeflate/tools/afl-fuzz/zlib_decompress/fuzz.c +28 -0
  70. data/ext/libdeflate/libdeflate/tools/afl-fuzz/zlib_decompress/inputs/0 +3 -0
  71. data/ext/libdeflate/libdeflate/tools/android_build.sh +104 -0
  72. data/ext/libdeflate/libdeflate/tools/checksum_benchmarks.sh +76 -0
  73. data/ext/libdeflate/libdeflate/tools/exec_tests.sh +30 -0
  74. data/ext/libdeflate/libdeflate/tools/gen_crc32_multipliers.c +108 -0
  75. data/ext/libdeflate/libdeflate/tools/gen_crc32_table.c +100 -0
  76. data/ext/libdeflate/libdeflate/tools/gzip_tests.sh +412 -0
  77. data/ext/libdeflate/libdeflate/tools/make-windows-releases +21 -0
  78. data/ext/libdeflate/libdeflate/tools/mips_build.sh +9 -0
  79. data/ext/libdeflate/libdeflate/tools/msc_test.bat +3 -0
  80. data/ext/libdeflate/libdeflate/tools/pgo_build.sh +23 -0
  81. data/ext/libdeflate/libdeflate/tools/produce_gzip_benchmark_table.sh +37 -0
  82. data/ext/libdeflate/libdeflate/tools/run_tests.sh +305 -0
  83. data/ext/libdeflate/libdeflate/tools/windows_build.sh +10 -0
  84. data/ext/libdeflate/libdeflate_ext.c +389 -0
  85. data/ext/libdeflate/libdeflate_ext.h +8 -0
  86. data/lib/libdeflate.rb +2 -0
  87. data/lib/libdeflate/version.rb +3 -0
  88. data/libdeflate.gemspec +33 -0
  89. metadata +230 -0
data/ext/libdeflate/libdeflate/lib/decompress_impl.h
@@ -0,0 +1,404 @@
1
+ /*
2
+ * decompress_impl.h
3
+ *
4
+ * Originally public domain; changes after 2016-09-07 are copyrighted.
5
+ *
6
+ * Copyright 2016 Eric Biggers
7
+ *
8
+ * Permission is hereby granted, free of charge, to any person
9
+ * obtaining a copy of this software and associated documentation
10
+ * files (the "Software"), to deal in the Software without
11
+ * restriction, including without limitation the rights to use,
12
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
13
+ * copies of the Software, and to permit persons to whom the
14
+ * Software is furnished to do so, subject to the following
15
+ * conditions:
16
+ *
17
+ * The above copyright notice and this permission notice shall be
18
+ * included in all copies or substantial portions of the Software.
19
+ *
20
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27
+ * OTHER DEALINGS IN THE SOFTWARE.
28
+ */
29
+
30
+ /*
31
+ * This is the actual DEFLATE decompression routine, lifted out of
32
+ * deflate_decompress.c so that it can be compiled multiple times with different
33
+ * target instruction sets.
34
+ */
35
+
36
+ static enum libdeflate_result ATTRIBUTES
37
+ FUNCNAME(struct libdeflate_decompressor * restrict d,
38
+ const void * restrict in, size_t in_nbytes,
39
+ void * restrict out, size_t out_nbytes_avail,
40
+ size_t *actual_out_nbytes_ret)
41
+ {
42
+ u8 *out_next = out;
43
+ u8 * const out_end = out_next + out_nbytes_avail;
44
+ const u8 *in_next = in;
45
+ const u8 * const in_end = in_next + in_nbytes;
46
+ bitbuf_t bitbuf = 0;
47
+ unsigned bitsleft = 0;
48
+ size_t overrun_count = 0;
49
+ unsigned i;
50
+ unsigned is_final_block;
51
+ unsigned block_type;
52
+ u16 len;
53
+ u16 nlen;
54
+ unsigned num_litlen_syms;
55
+ unsigned num_offset_syms;
56
+ u16 tmp16;
57
+ u32 tmp32;
58
+
59
+ next_block:
60
+ /* Starting to read the next block. */
61
+ ;
62
+
63
+ STATIC_ASSERT(CAN_ENSURE(1 + 2 + 5 + 5 + 4));
64
+ ENSURE_BITS(1 + 2 + 5 + 5 + 4);
65
+
66
+ /* BFINAL: 1 bit */
67
+ is_final_block = POP_BITS(1);
68
+
69
+ /* BTYPE: 2 bits */
70
+ block_type = POP_BITS(2);
71
+
72
+ if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) {
73
+
74
+ /* Dynamic Huffman block. */
75
+
76
+ /* The order in which precode lengths are stored. */
77
+ static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
78
+ 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
79
+ };
80
+
81
+ unsigned num_explicit_precode_lens;
82
+
83
+ /* Read the codeword length counts. */
84
+
85
+ STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == ((1 << 5) - 1) + 257);
86
+ num_litlen_syms = POP_BITS(5) + 257;
87
+
88
+ STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == ((1 << 5) - 1) + 1);
89
+ num_offset_syms = POP_BITS(5) + 1;
90
+
91
+ STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == ((1 << 4) - 1) + 4);
92
+ num_explicit_precode_lens = POP_BITS(4) + 4;
93
+
94
+ /* Read the precode codeword lengths. */
95
+ STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1);
96
+ if (CAN_ENSURE(DEFLATE_NUM_PRECODE_SYMS * 3)) {
97
+
98
+ ENSURE_BITS(DEFLATE_NUM_PRECODE_SYMS * 3);
99
+
100
+ for (i = 0; i < num_explicit_precode_lens; i++)
101
+ d->u.precode_lens[deflate_precode_lens_permutation[i]] = POP_BITS(3);
102
+ } else {
103
+ for (i = 0; i < num_explicit_precode_lens; i++) {
104
+ ENSURE_BITS(3);
105
+ d->u.precode_lens[deflate_precode_lens_permutation[i]] = POP_BITS(3);
106
+ }
107
+ }
108
+
109
+ for (; i < DEFLATE_NUM_PRECODE_SYMS; i++)
110
+ d->u.precode_lens[deflate_precode_lens_permutation[i]] = 0;
111
+
112
+ /* Build the decode table for the precode. */
113
+ SAFETY_CHECK(build_precode_decode_table(d));
114
+
115
+ /* Expand the literal/length and offset codeword lengths. */
116
+ for (i = 0; i < num_litlen_syms + num_offset_syms; ) {
117
+ u32 entry;
118
+ unsigned presym;
119
+ u8 rep_val;
120
+ unsigned rep_count;
121
+
122
+ ENSURE_BITS(DEFLATE_MAX_PRE_CODEWORD_LEN + 7);
123
+
124
+ /* (The code below assumes that the precode decode table
125
+ * does not have any subtables.) */
126
+ STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN);
127
+
128
+ /* Read the next precode symbol. */
129
+ entry = d->u.l.precode_decode_table[BITS(DEFLATE_MAX_PRE_CODEWORD_LEN)];
130
+ REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
131
+ presym = entry >> HUFFDEC_RESULT_SHIFT;
132
+
133
+ if (presym < 16) {
134
+ /* Explicit codeword length */
135
+ d->u.l.lens[i++] = presym;
136
+ continue;
137
+ }
138
+
139
+ /* Run-length encoded codeword lengths */
140
+
141
+ /* Note: we don't need to verify that the repeat count
142
+ * doesn't overflow the number of elements, since we
143
+ * have enough extra spaces to allow for the worst-case
144
+ * overflow (138 zeroes when only 1 length was
145
+ * remaining).
146
+ *
147
+ * In the case of the small repeat counts (presyms 16
148
+ * and 17), it is fastest to always write the maximum
149
+ * number of entries. That gets rid of branches that
150
+ * would otherwise be required.
151
+ *
152
+ * It is not just because of the numerical order that
153
+ * our checks go in the order 'presym < 16', 'presym ==
154
+ * 16', and 'presym == 17'. For typical data this is
155
+ * ordered from most frequent to least frequent case.
156
+ */
157
+ STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1);
158
+
159
+ if (presym == 16) {
160
+ /* Repeat the previous length 3 - 6 times */
161
+ SAFETY_CHECK(i != 0);
162
+ rep_val = d->u.l.lens[i - 1];
163
+ STATIC_ASSERT(3 + ((1 << 2) - 1) == 6);
164
+ rep_count = 3 + POP_BITS(2);
165
+ d->u.l.lens[i + 0] = rep_val;
166
+ d->u.l.lens[i + 1] = rep_val;
167
+ d->u.l.lens[i + 2] = rep_val;
168
+ d->u.l.lens[i + 3] = rep_val;
169
+ d->u.l.lens[i + 4] = rep_val;
170
+ d->u.l.lens[i + 5] = rep_val;
171
+ i += rep_count;
172
+ } else if (presym == 17) {
173
+ /* Repeat zero 3 - 10 times */
174
+ STATIC_ASSERT(3 + ((1 << 3) - 1) == 10);
175
+ rep_count = 3 + POP_BITS(3);
176
+ d->u.l.lens[i + 0] = 0;
177
+ d->u.l.lens[i + 1] = 0;
178
+ d->u.l.lens[i + 2] = 0;
179
+ d->u.l.lens[i + 3] = 0;
180
+ d->u.l.lens[i + 4] = 0;
181
+ d->u.l.lens[i + 5] = 0;
182
+ d->u.l.lens[i + 6] = 0;
183
+ d->u.l.lens[i + 7] = 0;
184
+ d->u.l.lens[i + 8] = 0;
185
+ d->u.l.lens[i + 9] = 0;
186
+ i += rep_count;
187
+ } else {
188
+ /* Repeat zero 11 - 138 times */
189
+ STATIC_ASSERT(11 + ((1 << 7) - 1) == 138);
190
+ rep_count = 11 + POP_BITS(7);
191
+ memset(&d->u.l.lens[i], 0,
192
+ rep_count * sizeof(d->u.l.lens[i]));
193
+ i += rep_count;
194
+ }
195
+ }
196
+ } else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) {
197
+
198
+ /* Uncompressed block: copy 'len' bytes literally from the input
199
+ * buffer to the output buffer. */
200
+
201
+ ALIGN_INPUT();
202
+
203
+ SAFETY_CHECK(in_end - in_next >= 4);
204
+
205
+ len = READ_U16();
206
+ nlen = READ_U16();
207
+
208
+ SAFETY_CHECK(len == (u16)~nlen);
209
+ if (unlikely(len > out_end - out_next))
210
+ return LIBDEFLATE_INSUFFICIENT_SPACE;
211
+ SAFETY_CHECK(len <= in_end - in_next);
212
+
213
+ memcpy(out_next, in_next, len);
214
+ in_next += len;
215
+ out_next += len;
216
+
217
+ goto block_done;
218
+
219
+ } else {
220
+ SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN);
221
+
222
+ /* Static Huffman block: set the static Huffman codeword
223
+ * lengths. Then the remainder is the same as decompressing a
224
+ * dynamic Huffman block. */
225
+
226
+ STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 288);
227
+ STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 32);
228
+
229
+ for (i = 0; i < 144; i++)
230
+ d->u.l.lens[i] = 8;
231
+ for (; i < 256; i++)
232
+ d->u.l.lens[i] = 9;
233
+ for (; i < 280; i++)
234
+ d->u.l.lens[i] = 7;
235
+ for (; i < 288; i++)
236
+ d->u.l.lens[i] = 8;
237
+
238
+ for (; i < 288 + 32; i++)
239
+ d->u.l.lens[i] = 5;
240
+
241
+ num_litlen_syms = 288;
242
+ num_offset_syms = 32;
243
+
244
+ }
245
+
246
+ /* Decompressing a Huffman block (either dynamic or static) */
247
+
248
+ SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms));
249
+ SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms));
250
+
251
+ /* The main DEFLATE decode loop */
252
+ for (;;) {
253
+ u32 entry;
254
+ u32 length;
255
+ u32 offset;
256
+
257
+ /* Decode a litlen symbol. */
258
+ ENSURE_BITS(DEFLATE_MAX_LITLEN_CODEWORD_LEN);
259
+ entry = d->u.litlen_decode_table[BITS(LITLEN_TABLEBITS)];
260
+ if (entry & HUFFDEC_SUBTABLE_POINTER) {
261
+ /* Litlen subtable required (uncommon case) */
262
+ REMOVE_BITS(LITLEN_TABLEBITS);
263
+ entry = d->u.litlen_decode_table[
264
+ ((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) +
265
+ BITS(entry & HUFFDEC_LENGTH_MASK)];
266
+ }
267
+ REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
268
+ if (entry & HUFFDEC_LITERAL) {
269
+ /* Literal */
270
+ if (unlikely(out_next == out_end))
271
+ return LIBDEFLATE_INSUFFICIENT_SPACE;
272
+ *out_next++ = (u8)(entry >> HUFFDEC_RESULT_SHIFT);
273
+ continue;
274
+ }
275
+
276
+ /* Match or end-of-block */
277
+
278
+ entry >>= HUFFDEC_RESULT_SHIFT;
279
+ ENSURE_BITS(MAX_ENSURE);
280
+
281
+ /* Pop the extra length bits and add them to the length base to
282
+ * produce the full length. */
283
+ length = (entry >> HUFFDEC_LENGTH_BASE_SHIFT) +
284
+ POP_BITS(entry & HUFFDEC_EXTRA_LENGTH_BITS_MASK);
285
+
286
+ /* The match destination must not end after the end of the
287
+ * output buffer. For efficiency, combine this check with the
288
+ * end-of-block check. We're using 0 for the special
289
+ * end-of-block length, so subtracting 1 turns it into
290
+ * SIZE_MAX. */
291
+ STATIC_ASSERT(HUFFDEC_END_OF_BLOCK_LENGTH == 0);
292
+ if (unlikely((size_t)length - 1 >= out_end - out_next)) {
293
+ if (unlikely(length != HUFFDEC_END_OF_BLOCK_LENGTH))
294
+ return LIBDEFLATE_INSUFFICIENT_SPACE;
295
+ goto block_done;
296
+ }
297
+
298
+ /* Decode the match offset. */
299
+
300
+ entry = d->offset_decode_table[BITS(OFFSET_TABLEBITS)];
301
+ if (entry & HUFFDEC_SUBTABLE_POINTER) {
302
+ /* Offset subtable required (uncommon case) */
303
+ REMOVE_BITS(OFFSET_TABLEBITS);
304
+ entry = d->offset_decode_table[
305
+ ((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) +
306
+ BITS(entry & HUFFDEC_LENGTH_MASK)];
307
+ }
308
+ REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
309
+ entry >>= HUFFDEC_RESULT_SHIFT;
310
+
311
+ STATIC_ASSERT(CAN_ENSURE(DEFLATE_MAX_EXTRA_LENGTH_BITS +
312
+ DEFLATE_MAX_OFFSET_CODEWORD_LEN) &&
313
+ CAN_ENSURE(DEFLATE_MAX_EXTRA_OFFSET_BITS));
314
+ if (!CAN_ENSURE(DEFLATE_MAX_EXTRA_LENGTH_BITS +
315
+ DEFLATE_MAX_OFFSET_CODEWORD_LEN +
316
+ DEFLATE_MAX_EXTRA_OFFSET_BITS))
317
+ ENSURE_BITS(DEFLATE_MAX_EXTRA_OFFSET_BITS);
318
+
319
+ /* Pop the extra offset bits and add them to the offset base to
320
+ * produce the full offset. */
321
+ offset = (entry & HUFFDEC_OFFSET_BASE_MASK) +
322
+ POP_BITS(entry >> HUFFDEC_EXTRA_OFFSET_BITS_SHIFT);
323
+
324
+ /* The match source must not begin before the beginning of the
325
+ * output buffer. */
326
+ SAFETY_CHECK(offset <= out_next - (const u8 *)out);
327
+
328
+ /* Copy the match: 'length' bytes at 'out_next - offset' to
329
+ * 'out_next'. */
330
+
331
+ if (UNALIGNED_ACCESS_IS_FAST &&
332
+ length <= (3 * WORDBYTES) &&
333
+ offset >= WORDBYTES &&
334
+ length + (3 * WORDBYTES) <= out_end - out_next)
335
+ {
336
+ /* Fast case: short length, no overlaps if we copy one
337
+ * word at a time, and we aren't getting too close to
338
+ * the end of the output array. */
339
+ copy_word_unaligned(out_next - offset + (0 * WORDBYTES),
340
+ out_next + (0 * WORDBYTES));
341
+ copy_word_unaligned(out_next - offset + (1 * WORDBYTES),
342
+ out_next + (1 * WORDBYTES));
343
+ copy_word_unaligned(out_next - offset + (2 * WORDBYTES),
344
+ out_next + (2 * WORDBYTES));
345
+ } else {
346
+ const u8 *src = out_next - offset;
347
+ u8 *dst = out_next;
348
+ u8 *end = out_next + length;
349
+
350
+ if (UNALIGNED_ACCESS_IS_FAST &&
351
+ likely(out_end - end >= WORDBYTES - 1)) {
352
+ if (offset >= WORDBYTES) {
353
+ copy_word_unaligned(src, dst);
354
+ src += WORDBYTES;
355
+ dst += WORDBYTES;
356
+ if (dst < end) {
357
+ do {
358
+ copy_word_unaligned(src, dst);
359
+ src += WORDBYTES;
360
+ dst += WORDBYTES;
361
+ } while (dst < end);
362
+ }
363
+ } else if (offset == 1) {
364
+ machine_word_t v = repeat_byte(*(dst - 1));
365
+ do {
366
+ store_word_unaligned(v, dst);
367
+ src += WORDBYTES;
368
+ dst += WORDBYTES;
369
+ } while (dst < end);
370
+ } else {
371
+ *dst++ = *src++;
372
+ *dst++ = *src++;
373
+ do {
374
+ *dst++ = *src++;
375
+ } while (dst < end);
376
+ }
377
+ } else {
378
+ *dst++ = *src++;
379
+ *dst++ = *src++;
380
+ do {
381
+ *dst++ = *src++;
382
+ } while (dst < end);
383
+ }
384
+ }
385
+
386
+ out_next += length;
387
+ }
388
+
389
+ block_done:
390
+ /* Finished decoding a block. */
391
+
392
+ if (!is_final_block)
393
+ goto next_block;
394
+
395
+ /* That was the last block. */
396
+
397
+ if (actual_out_nbytes_ret) {
398
+ *actual_out_nbytes_ret = out_next - (u8 *)out;
399
+ } else {
400
+ if (out_next != out_end)
401
+ return LIBDEFLATE_SHORT_OUTPUT;
402
+ }
403
+ return LIBDEFLATE_SUCCESS;
404
+ }
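The routine above is parameterized by the FUNCNAME and ATTRIBUTES macros so that deflate_decompress.c can include this header once per target instruction set. A minimal sketch of that include-based pattern; the macro values and symbol names here are illustrative, not taken from this gem:

    #define FUNCNAME deflate_decompress_default
    #define ATTRIBUTES
    #include "decompress_impl.h"
    #undef FUNCNAME
    #undef ATTRIBUTES

    #if defined(__GNUC__) && defined(__x86_64__)
    #  define FUNCNAME deflate_decompress_bmi2
    #  define ATTRIBUTES __attribute__((target("bmi2")))
    #  include "decompress_impl.h"
    #  undef FUNCNAME
    #  undef ATTRIBUTES
    #endif
    /* A runtime dispatcher can then select the fastest variant the CPU supports. */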
data/ext/libdeflate/libdeflate/lib/deflate_compress.c
@@ -0,0 +1,2817 @@
1
+ /*
2
+ * deflate_compress.c - a compressor for DEFLATE
3
+ *
4
+ * Originally public domain; changes after 2016-09-07 are copyrighted.
5
+ *
6
+ * Copyright 2016 Eric Biggers
7
+ *
8
+ * Permission is hereby granted, free of charge, to any person
9
+ * obtaining a copy of this software and associated documentation
10
+ * files (the "Software"), to deal in the Software without
11
+ * restriction, including without limitation the rights to use,
12
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
13
+ * copies of the Software, and to permit persons to whom the
14
+ * Software is furnished to do so, subject to the following
15
+ * conditions:
16
+ *
17
+ * The above copyright notice and this permission notice shall be
18
+ * included in all copies or substantial portions of the Software.
19
+ *
20
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27
+ * OTHER DEALINGS IN THE SOFTWARE.
28
+ */
29
+
30
+ #include <stdlib.h>
31
+ #include <string.h>
32
+
33
+ #include "aligned_malloc.h"
34
+ #include "deflate_compress.h"
35
+ #include "deflate_constants.h"
36
+ #include "unaligned.h"
37
+
38
+ #include "libdeflate.h"
39
+
40
+ /*
41
+ * By default, the near-optimal parsing algorithm is enabled at compression
42
+ * level 8 and above. The near-optimal parsing algorithm produces a compression
43
+ * ratio significantly better than the greedy and lazy algorithms implemented
44
+ * here, and also the algorithm used by zlib at level 9. However, it is slow.
45
+ */
46
+ #define SUPPORT_NEAR_OPTIMAL_PARSING 1
47
+
48
+ /*
49
+ * Define to 1 to maintain the full map from match offsets to offset slots.
50
+ * This slightly speeds up translations of match offsets to offset slots, but it
51
+ * uses 32769 bytes of memory rather than the 512 bytes used by the condensed
52
+ * map. The speedup provided by the larger map is most helpful when the
53
+ * near-optimal parsing algorithm is being used.
54
+ */
55
+ #define USE_FULL_OFFSET_SLOT_FAST SUPPORT_NEAR_OPTIMAL_PARSING
56
+
57
+ /*
58
+ * DEFLATE uses a 32768 byte sliding window; set the matchfinder parameters
59
+ * appropriately.
60
+ */
61
+ #define MATCHFINDER_WINDOW_ORDER 15
62
+
63
+ #include "hc_matchfinder.h"
64
+ #if SUPPORT_NEAR_OPTIMAL_PARSING
65
+ # include "bt_matchfinder.h"
66
+ #endif
67
+
68
+ /*
69
+ * The compressor always chooses a block of at least MIN_BLOCK_LENGTH bytes,
70
+ * except if the last block has to be shorter.
71
+ */
72
+ #define MIN_BLOCK_LENGTH 10000
73
+
74
+ /*
75
+ * The compressor attempts to end blocks after SOFT_MAX_BLOCK_LENGTH bytes, but
76
+ * the final length might be slightly longer due to matches extending beyond
77
+ * this limit.
78
+ */
79
+ #define SOFT_MAX_BLOCK_LENGTH 300000
80
+
81
+ /*
82
+ * The number of observed matches or literals that represents sufficient data to
83
+ * decide whether the current block should be terminated or not.
84
+ */
85
+ #define NUM_OBSERVATIONS_PER_BLOCK_CHECK 512
86
+
87
+
88
+ #if SUPPORT_NEAR_OPTIMAL_PARSING
89
+ /* Constants specific to the near-optimal parsing algorithm */
90
+
91
+ /*
92
+ * The maximum number of matches the matchfinder can find at a single position.
93
+ * Since the matchfinder never finds more than one match for the same length,
94
+ * presuming one of each possible length is sufficient for an upper bound.
95
+ * (This says nothing about whether it is worthwhile to consider so many
96
+ * matches; this is just defining the worst case.)
97
+ */
98
+ # define MAX_MATCHES_PER_POS (DEFLATE_MAX_MATCH_LEN - DEFLATE_MIN_MATCH_LEN + 1)
99
+
100
+ /*
101
+ * The number of lz_match structures in the match cache, excluding the extra
102
+ * "overflow" entries. This value should be high enough so that nearly all the
103
+ * time, all matches found in a given block can fit in the match cache.
104
+ * However, fallback behavior (immediately terminating the block) on cache
105
+ * overflow is still required.
106
+ */
107
+ # define CACHE_LENGTH (SOFT_MAX_BLOCK_LENGTH * 5)
108
+
109
+ #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
110
+
111
+ /*
112
+ * These are the compressor-side limits on the codeword lengths for each Huffman
113
+ * code. To make outputting bits slightly faster, some of these limits are
114
+ * lower than the limits defined by the DEFLATE format. This does not
115
+ * significantly affect the compression ratio, at least for the block lengths we
116
+ * use.
117
+ */
118
+ #define MAX_LITLEN_CODEWORD_LEN 14
119
+ #define MAX_OFFSET_CODEWORD_LEN DEFLATE_MAX_OFFSET_CODEWORD_LEN
120
+ #define MAX_PRE_CODEWORD_LEN DEFLATE_MAX_PRE_CODEWORD_LEN
121
+
122
+ /* Table: length slot => length slot base value */
123
+ static const unsigned deflate_length_slot_base[] = {
124
+ 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 ,
125
+ 11 , 13 , 15 , 17 , 19 , 23 , 27 , 31 ,
126
+ 35 , 43 , 51 , 59 , 67 , 83 , 99 , 115 ,
127
+ 131 , 163 , 195 , 227 , 258 ,
128
+ };
129
+
130
+ /* Table: length slot => number of extra length bits */
131
+ static const u8 deflate_extra_length_bits[] = {
132
+ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
133
+ 1 , 1 , 1 , 1 , 2 , 2 , 2 , 2 ,
134
+ 3 , 3 , 3 , 3 , 4 , 4 , 4 , 4 ,
135
+ 5 , 5 , 5 , 5 , 0 ,
136
+ };
137
+
138
+ /* Table: offset slot => offset slot base value */
139
+ static const unsigned deflate_offset_slot_base[] = {
140
+ 1 , 2 , 3 , 4 , 5 , 7 , 9 , 13 ,
141
+ 17 , 25 , 33 , 49 , 65 , 97 , 129 , 193 ,
142
+ 257 , 385 , 513 , 769 , 1025 , 1537 , 2049 , 3073 ,
143
+ 4097 , 6145 , 8193 , 12289 , 16385 , 24577 ,
144
+ };
145
+
146
+ /* Table: offset slot => number of extra offset bits */
147
+ static const u8 deflate_extra_offset_bits[] = {
148
+ 0 , 0 , 0 , 0 , 1 , 1 , 2 , 2 ,
149
+ 3 , 3 , 4 , 4 , 5 , 5 , 6 , 6 ,
150
+ 7 , 7 , 8 , 8 , 9 , 9 , 10 , 10 ,
151
+ 11 , 11 , 12 , 12 , 13 , 13 ,
152
+ };
153
+
154
+ /* Table: length => length slot */
155
+ static const u8 deflate_length_slot[DEFLATE_MAX_MATCH_LEN + 1] = {
156
+ 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12,
157
+ 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16,
158
+ 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18,
159
+ 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20,
160
+ 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
161
+ 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
162
+ 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
163
+ 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
164
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25,
165
+ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
166
+ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26,
167
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
168
+ 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
169
+ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
170
+ 27, 27, 28,
171
+ };
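The four tables above are used together to turn a raw match length into a (length slot, extra bits) pair and back. A small sketch, assuming an example match length of 100; it is not compiled into the gem:

    /* Sketch only: length -> (slot, extra bits) using the tables above. */
    unsigned length = 100;                                      /* example match length */
    unsigned slot   = deflate_length_slot[length];              /* slot for this length */
    unsigned extra  = length - deflate_length_slot_base[slot];  /* value of the extra bits */
    unsigned nbits  = deflate_extra_length_bits[slot];          /* number of extra bits to emit */
    /* A decoder recovers the length as deflate_length_slot_base[slot] + extra. */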
172
+
173
+ /* The order in which precode codeword lengths are stored */
174
+ static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
175
+ 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
176
+ };
177
+
178
+ /* Codewords for the DEFLATE Huffman codes. */
179
+ struct deflate_codewords {
180
+ u32 litlen[DEFLATE_NUM_LITLEN_SYMS];
181
+ u32 offset[DEFLATE_NUM_OFFSET_SYMS];
182
+ };
183
+
184
+ /* Codeword lengths (in bits) for the DEFLATE Huffman codes.
185
+ * A zero length means the corresponding symbol had zero frequency. */
186
+ struct deflate_lens {
187
+ u8 litlen[DEFLATE_NUM_LITLEN_SYMS];
188
+ u8 offset[DEFLATE_NUM_OFFSET_SYMS];
189
+ };
190
+
191
+ /* Codewords and lengths for the DEFLATE Huffman codes. */
192
+ struct deflate_codes {
193
+ struct deflate_codewords codewords;
194
+ struct deflate_lens lens;
195
+ };
196
+
197
+ /* Symbol frequency counters for the DEFLATE Huffman codes. */
198
+ struct deflate_freqs {
199
+ u32 litlen[DEFLATE_NUM_LITLEN_SYMS];
200
+ u32 offset[DEFLATE_NUM_OFFSET_SYMS];
201
+ };
202
+
203
+ #if SUPPORT_NEAR_OPTIMAL_PARSING
204
+
205
+ /* Costs for the near-optimal parsing algorithm. */
206
+ struct deflate_costs {
207
+
208
+ /* The cost to output each possible literal. */
209
+ u32 literal[DEFLATE_NUM_LITERALS];
210
+
211
+ /* The cost to output each possible match length. */
212
+ u32 length[DEFLATE_MAX_MATCH_LEN + 1];
213
+
214
+ /* The cost to output a match offset of each possible offset slot. */
215
+ u32 offset_slot[DEFLATE_NUM_OFFSET_SYMS];
216
+ };
217
+
218
+ /*
219
+ * COST_SHIFT is a scaling factor that makes it possible to consider fractional
220
+ * bit costs. A token requiring 'n' bits to represent has cost n << COST_SHIFT.
221
+ *
222
+ * Note: this is only useful as a statistical trick for when the true costs are
223
+ * unknown. In reality, each token in DEFLATE requires a whole number of bits
224
+ * to output.
225
+ */
226
+ #define COST_SHIFT 3
227
+
228
+ /*
229
+ * The NOSTAT_BITS value for a given alphabet is the number of bits assumed to
230
+ * be needed to output a symbol that was unused in the previous optimization
231
+ * pass. Assigning a default cost allows the symbol to be used in the next
232
+ * optimization pass. However, the cost should be relatively high because the
233
+ * symbol probably won't be used very many times (if at all).
234
+ */
235
+ #define LITERAL_NOSTAT_BITS 13
236
+ #define LENGTH_NOSTAT_BITS 13
237
+ #define OFFSET_NOSTAT_BITS 10
238
+
239
+ #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
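With COST_SHIFT == 3, costs are held in units of 1/8 bit, so whole-bit costs are simply shifted left. A tiny worked example, not part of the source:

    u32 literal_cost = 7u << COST_SHIFT;                  /* a 7-bit codeword costs 56 units  */
    u32 unused_cost  = LITERAL_NOSTAT_BITS << COST_SHIFT; /* default 13 << 3 == 104 units     */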
240
+
241
+ /*
242
+ * Represents a run of literals followed by a match or end-of-block. This
243
+ * struct is needed to temporarily store items chosen by the parser, since items
244
+ * cannot be written until all items for the block have been chosen and the
245
+ * block's Huffman codes have been computed.
246
+ */
247
+ struct deflate_sequence {
248
+
249
+ /* Bits 0..22: the number of literals in this run. This may be 0 and
250
+ * can be at most about SOFT_MAX_BLOCK_LENGTH. The literals are not
251
+ * stored explicitly in this structure; instead, they are read directly
252
+ * from the uncompressed data.
253
+ *
254
+ * Bits 23..31: the length of the match which follows the literals, or 0
255
+ * if this literal run was the last in the block, so there is no match
256
+ * which follows it. */
257
+ u32 litrunlen_and_length;
258
+
259
+ /* If 'length' doesn't indicate end-of-block, then this is the offset of
260
+ * the match which follows the literals. */
261
+ u16 offset;
262
+
263
+ /* If 'length' doesn't indicate end-of-block, then this is the offset
264
+ * symbol of the match which follows the literals. */
265
+ u8 offset_symbol;
266
+
267
+ /* If 'length' doesn't indicate end-of-block, then this is the length
268
+ * slot of the match which follows the literals. */
269
+ u8 length_slot;
270
+ };
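A small sketch of how the bitfields described above pack into litrunlen_and_length; the shift and mask follow directly from "bits 0..22" and "bits 23..31", and the compressor itself may use its own helpers for this:

    /* Sketch only, not from the source. */
    u32 litrunlen = 1000;                            /* example literal run length        */
    u32 length    = 50;                              /* example match length, 0 = end of block */
    u32 packed    = (length << 23) | litrunlen;
    u32 run_back  = packed & (((u32)1 << 23) - 1);   /* bits 0..22  -> 1000 */
    u32 len_back  = packed >> 23;                    /* bits 23..31 -> 50   */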
271
+
272
+ #if SUPPORT_NEAR_OPTIMAL_PARSING
273
+
274
+ /*
275
+ * This structure represents a byte position in the input data and a node in the
276
+ * graph of possible match/literal choices for the current block.
277
+ *
278
+ * Logically, each incoming edge to this node is labeled with a literal or a
279
+ * match that can be taken to reach this position from an earlier position; and
280
+ * each outgoing edge from this node is labeled with a literal or a match that
281
+ * can be taken to advance from this position to a later position.
282
+ *
283
+ * But these "edges" are actually stored elsewhere (in 'match_cache'). Here we
284
+ * associate with each node just two pieces of information:
285
+ *
286
+ * 'cost_to_end' is the minimum cost to reach the end of the block from
287
+ * this position.
288
+ *
289
+ * 'item' represents the literal or match that must be chosen from here to
290
+ * reach the end of the block with the minimum cost. Equivalently, this
291
+ * can be interpreted as the label of the outgoing edge on the minimum-cost
292
+ * path to the "end of block" node from this node.
293
+ */
294
+ struct deflate_optimum_node {
295
+
296
+ u32 cost_to_end;
297
+
298
+ /*
299
+ * Notes on the match/literal representation used here:
300
+ *
301
+ * The low bits of 'item' are the length: 1 if this is a literal,
302
+ * or the match length if this is a match.
303
+ *
304
+ * The high bits of 'item' are the actual literal byte if this is a
305
+ * literal, or the match offset if this is a match.
306
+ */
307
+ #define OPTIMUM_OFFSET_SHIFT 9
308
+ #define OPTIMUM_LEN_MASK (((u32)1 << OPTIMUM_OFFSET_SHIFT) - 1)
309
+ u32 item;
310
+
311
+ };
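Hypothetical helpers (not present in this file) showing how 'item' encodes a literal or a match according to the comment above:

    static inline u32 make_literal_item(u8 lit)         { return ((u32)lit << OPTIMUM_OFFSET_SHIFT) | 1; }
    static inline u32 make_match_item(u32 off, u32 len) { return (off << OPTIMUM_OFFSET_SHIFT) | len; }
    static inline u32 item_length(u32 item)             { return item & OPTIMUM_LEN_MASK; }  /* 1 => literal */
    static inline u32 item_offset_or_byte(u32 item)     { return item >> OPTIMUM_OFFSET_SHIFT; }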
312
+
313
+ #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
314
+
315
+ /* Block split statistics. See "Block splitting algorithm" below. */
316
+ #define NUM_LITERAL_OBSERVATION_TYPES 8
317
+ #define NUM_MATCH_OBSERVATION_TYPES 2
318
+ #define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + NUM_MATCH_OBSERVATION_TYPES)
319
+ struct block_split_stats {
320
+ u32 new_observations[NUM_OBSERVATION_TYPES];
321
+ u32 observations[NUM_OBSERVATION_TYPES];
322
+ u32 num_new_observations;
323
+ u32 num_observations;
324
+ };
325
+
326
+ /* The main DEFLATE compressor structure */
327
+ struct libdeflate_compressor {
328
+
329
+ /* Pointer to the compress() implementation chosen at allocation time */
330
+ size_t (*impl)(struct libdeflate_compressor *,
331
+ const u8 *, size_t, u8 *, size_t);
332
+
333
+ /* Frequency counters for the current block */
334
+ struct deflate_freqs freqs;
335
+
336
+ /* Dynamic Huffman codes for the current block */
337
+ struct deflate_codes codes;
338
+
339
+ /* Static Huffman codes */
340
+ struct deflate_codes static_codes;
341
+
342
+ /* Block split statistics for the currently pending block */
343
+ struct block_split_stats split_stats;
344
+
345
+ /* A table for fast lookups of offset slot by match offset.
346
+ *
347
+ * If the full table is being used, it is a direct mapping from offset
348
+ * to offset slot.
349
+ *
350
+ * If the condensed table is being used, the first 256 entries map
351
+ * directly to the offset slots of offsets 1 through 256. The next 256
352
+ * entries map to the offset slots for the remaining offsets, stepping
353
+ * through the offsets with a stride of 128. This relies on the fact
354
+ * that each of the remaining offset slots contains at least 128 offsets
355
+ * and has an offset base that is a multiple of 128. */
356
+ #if USE_FULL_OFFSET_SLOT_FAST
357
+ u8 offset_slot_fast[DEFLATE_MAX_MATCH_OFFSET + 1];
358
+ #else
359
+ u8 offset_slot_fast[512];
360
+ #endif
361
+
362
+ /* The "nice" match length: if a match of this length is found, choose
363
+ * it immediately without further consideration. */
364
+ unsigned nice_match_length;
365
+
366
+ /* The maximum search depth: consider at most this many potential
367
+ * matches at each position. */
368
+ unsigned max_search_depth;
369
+
370
+ /* The compression level with which this compressor was created. */
371
+ unsigned compression_level;
372
+
373
+ /* Temporary space for Huffman code output */
374
+ u32 precode_freqs[DEFLATE_NUM_PRECODE_SYMS];
375
+ u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS];
376
+ u32 precode_codewords[DEFLATE_NUM_PRECODE_SYMS];
377
+ unsigned precode_items[DEFLATE_NUM_LITLEN_SYMS + DEFLATE_NUM_OFFSET_SYMS];
378
+ unsigned num_litlen_syms;
379
+ unsigned num_offset_syms;
380
+ unsigned num_explicit_lens;
381
+ unsigned num_precode_items;
382
+
383
+ union {
384
+ /* Data for greedy or lazy parsing */
385
+ struct {
386
+ /* Hash chain matchfinder */
387
+ struct hc_matchfinder hc_mf;
388
+
389
+ /* The matches and literals that the parser has chosen
390
+ * for the current block. The required length of this
391
+ * array is limited by the maximum number of matches
392
+ * that can ever be chosen for a single block, plus one
393
+ * for the special entry at the end. */
394
+ struct deflate_sequence sequences[
395
+ DIV_ROUND_UP(SOFT_MAX_BLOCK_LENGTH,
396
+ DEFLATE_MIN_MATCH_LEN) + 1];
397
+ } g; /* (g)reedy */
398
+
399
+ #if SUPPORT_NEAR_OPTIMAL_PARSING
400
+ /* Data for near-optimal parsing */
401
+ struct {
402
+
403
+ /* Binary tree matchfinder */
404
+ struct bt_matchfinder bt_mf;
405
+
406
+ /*
407
+ * Cached matches for the current block. This array
408
+ * contains the matches that were found at each position
409
+ * in the block. Specifically, for each position, there
410
+ * is a list of matches found at that position, if any,
411
+ * sorted by strictly increasing length. In addition,
412
+ * following the matches for each position, there is a
413
+ * special 'struct lz_match' whose 'length' member
414
+ * contains the number of matches found at that
415
+ * position, and whose 'offset' member contains the
416
+ * literal at that position.
417
+ *
418
+ * Note: in rare cases, there will be a very high number
419
+ * of matches in the block and this array will overflow.
420
+ * If this happens, we force the end of the current
421
+ * block. CACHE_LENGTH is the length at which we
422
+ * actually check for overflow. The extra slots beyond
423
+ * this are enough to absorb the worst case overflow,
424
+ * which occurs if starting at &match_cache[CACHE_LENGTH
425
+ * - 1], we write MAX_MATCHES_PER_POS matches and a
426
+ * match count header, then skip searching for matches
427
+ * at 'DEFLATE_MAX_MATCH_LEN - 1' positions and write
428
+ * the match count header for each.
429
+ */
430
+ struct lz_match match_cache[CACHE_LENGTH +
431
+ MAX_MATCHES_PER_POS +
432
+ DEFLATE_MAX_MATCH_LEN - 1];
433
+
434
+ /*
435
+ * Array of nodes, one per position, for running the
436
+ * minimum-cost path algorithm.
437
+ *
438
+ * This array must be large enough to accommodate the
439
+ * worst-case number of nodes, which occurs if we find a
440
+ * match of length DEFLATE_MAX_MATCH_LEN at position
441
+ * SOFT_MAX_BLOCK_LENGTH - 1, producing a block of
442
+ * length SOFT_MAX_BLOCK_LENGTH - 1 +
443
+ * DEFLATE_MAX_MATCH_LEN. Add one for the end-of-block
444
+ * node.
445
+ */
446
+ struct deflate_optimum_node optimum_nodes[SOFT_MAX_BLOCK_LENGTH - 1 +
447
+ DEFLATE_MAX_MATCH_LEN + 1];
448
+
449
+ /* The current cost model being used. */
450
+ struct deflate_costs costs;
451
+
452
+ unsigned num_optim_passes;
453
+ } n; /* (n)ear-optimal */
454
+ #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
455
+
456
+ } p; /* (p)arser */
457
+ };
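A hedged sketch of how the condensed offset_slot_fast table described above can be queried. The index arithmetic follows from the comment (256 direct entries, then groups of 128 offsets) and is not copied from this gem; the real lookup helper is defined elsewhere in this file:

    static inline unsigned
    offset_to_slot(const struct libdeflate_compressor *c, unsigned offset)
    {
    #if USE_FULL_OFFSET_SLOT_FAST
    	return c->offset_slot_fast[offset];                     /* direct map, offsets 1..32768 */
    #else
    	if (offset <= 256)
    		return c->offset_slot_fast[offset - 1];             /* first 256 entries            */
    	return c->offset_slot_fast[256 + ((offset - 1) >> 7)];  /* stride-128 groups            */
    #endif
    }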
458
+
459
+ /*
460
+ * The type for the bitbuffer variable, which temporarily holds bits that are
461
+ * being packed into bytes and written to the output buffer. For best
462
+ * performance, this should have size equal to a machine word.
463
+ */
464
+ typedef machine_word_t bitbuf_t;
465
+ #define BITBUF_NBITS (8 * sizeof(bitbuf_t))
466
+
467
+ /* Can the specified number of bits always be added to 'bitbuf' after any
468
+ * pending bytes have been flushed? */
469
+ #define CAN_BUFFER(n) ((n) <= BITBUF_NBITS - 7)
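On a 64-bit target bitbuf_t is 8 bytes, so BITBUF_NBITS is 64 and CAN_BUFFER(n) holds for n <= 57; on a 32-bit target it holds only for n <= 25.

    /* e.g. on a 64-bit build: CAN_BUFFER(57) is true, CAN_BUFFER(58) is false. */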
470
+
471
+ /*
472
+ * Structure to keep track of the current state of sending bits to the
473
+ * compressed output buffer.
474
+ */
475
+ struct deflate_output_bitstream {
476
+
477
+ /* Bits that haven't yet been written to the output buffer. */
478
+ bitbuf_t bitbuf;
479
+
480
+ /* Number of bits currently held in @bitbuf. */
481
+ unsigned bitcount;
482
+
483
+ /* Pointer to the beginning of the output buffer. */
484
+ u8 *begin;
485
+
486
+ /* Pointer to the position in the output buffer at which the next byte
487
+ * should be written. */
488
+ u8 *next;
489
+
490
+ /* Pointer just past the end of the output buffer. */
491
+ u8 *end;
492
+ };
493
+
494
+ #define MIN_OUTPUT_SIZE (UNALIGNED_ACCESS_IS_FAST ? sizeof(bitbuf_t) : 1)
495
+
496
+ /* Initialize the output bitstream. 'size' is assumed to be at least
497
+ * MIN_OUTPUT_SIZE. */
498
+ static void
499
+ deflate_init_output(struct deflate_output_bitstream *os,
500
+ void *buffer, size_t size)
501
+ {
502
+ os->bitbuf = 0;
503
+ os->bitcount = 0;
504
+ os->begin = buffer;
505
+ os->next = os->begin;
506
+ os->end = os->begin + size - MIN_OUTPUT_SIZE;
507
+ }
508
+
509
+ /* Add some bits to the bitbuffer variable of the output bitstream. The caller
510
+ * must make sure there is enough room. */
511
+ static forceinline void
512
+ deflate_add_bits(struct deflate_output_bitstream *os,
513
+ const bitbuf_t bits, const unsigned num_bits)
514
+ {
515
+ os->bitbuf |= bits << os->bitcount;
516
+ os->bitcount += num_bits;
517
+ }
518
+
519
+ /* Flush bits from the bitbuffer variable to the output buffer. */
520
+ static forceinline void
521
+ deflate_flush_bits(struct deflate_output_bitstream *os)
522
+ {
523
+ if (UNALIGNED_ACCESS_IS_FAST) {
524
+ /* Flush a whole word (branchlessly). */
525
+ put_unaligned_leword(os->bitbuf, os->next);
526
+ os->bitbuf >>= os->bitcount & ~7;
527
+ os->next += MIN(os->end - os->next, os->bitcount >> 3);
528
+ os->bitcount &= 7;
529
+ } else {
530
+ /* Flush a byte at a time. */
531
+ while (os->bitcount >= 8) {
532
+ *os->next = os->bitbuf;
533
+ if (os->next != os->end)
534
+ os->next++;
535
+ os->bitcount -= 8;
536
+ os->bitbuf >>= 8;
537
+ }
538
+ }
539
+ }
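A hypothetical helper, not from this file, showing the basic add-then-flush pattern for emitting one codeword with the two routines above; the real output path can batch several deflate_add_bits() calls per flush when CAN_BUFFER() allows:

    static void
    emit_litlen_sym(struct deflate_output_bitstream *os,
                    const struct deflate_codes *codes, unsigned sym)
    {
    	deflate_add_bits(os, codes->codewords.litlen[sym], codes->lens.litlen[sym]);
    	deflate_flush_bits(os);
    }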
540
+
541
+ /* Align the bitstream on a byte boundary. */
542
+ static forceinline void
543
+ deflate_align_bitstream(struct deflate_output_bitstream *os)
544
+ {
545
+ os->bitcount += -os->bitcount & 7;
546
+ deflate_flush_bits(os);
547
+ }
548
+
549
+ /*
550
+ * Flush any remaining bits to the output buffer if needed. Return the total
551
+ * number of bytes written to the output buffer, or 0 if an overflow occurred.
552
+ */
553
+ static u32
554
+ deflate_flush_output(struct deflate_output_bitstream *os)
555
+ {
556
+ if (os->next == os->end) /* overflow? */
557
+ return 0;
558
+
559
+ while ((int)os->bitcount > 0) {
560
+ *os->next++ = os->bitbuf;
561
+ os->bitcount -= 8;
562
+ os->bitbuf >>= 8;
563
+ }
564
+
565
+ return os->next - os->begin;
566
+ }
567
+
568
+ /* Given the binary tree node A[subtree_idx] whose children already
569
+ * satisfy the maxheap property, swap the node with its greater child
570
+ * until it is greater than both its children, so that the maxheap
571
+ * property is satisfied in the subtree rooted at A[subtree_idx]. */
572
+ static void
573
+ heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx)
574
+ {
575
+ unsigned parent_idx;
576
+ unsigned child_idx;
577
+ u32 v;
578
+
579
+ v = A[subtree_idx];
580
+ parent_idx = subtree_idx;
581
+ while ((child_idx = parent_idx * 2) <= length) {
582
+ if (child_idx < length && A[child_idx + 1] > A[child_idx])
583
+ child_idx++;
584
+ if (v >= A[child_idx])
585
+ break;
586
+ A[parent_idx] = A[child_idx];
587
+ parent_idx = child_idx;
588
+ }
589
+ A[parent_idx] = v;
590
+ }
591
+
592
+ /* Rearrange the array 'A' so that it satisfies the maxheap property.
593
+ * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1].
594
+ */
595
+ static void
596
+ heapify_array(u32 A[], unsigned length)
597
+ {
598
+ unsigned subtree_idx;
599
+
600
+ for (subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--)
601
+ heapify_subtree(A, length, subtree_idx);
602
+ }
603
+
604
+ /*
605
+ * Sort the array 'A', which contains 'length' unsigned 32-bit integers.
606
+ *
607
+ * Note: name this function heap_sort() instead of heapsort() to avoid colliding
608
+ * with heapsort() from stdlib.h on BSD-derived systems --- though this isn't
609
+ * necessary when compiling with -D_ANSI_SOURCE, which is the better solution.
610
+ */
611
+ static void
612
+ heap_sort(u32 A[], unsigned length)
613
+ {
614
+ A--; /* Use 1-based indices */
615
+
616
+ heapify_array(A, length);
617
+
618
+ while (length >= 2) {
619
+ u32 tmp = A[length];
620
+ A[length] = A[1];
621
+ A[1] = tmp;
622
+ length--;
623
+ heapify_subtree(A, length, 1);
624
+ }
625
+ }
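heap_sort() sorts a plain u32 array in place, in ascending order. A minimal usage example, not part of the source:

    u32 vals[] = { 50, 7, 300, 7, 1 };
    heap_sort(vals, 5);        /* vals is now { 1, 7, 7, 50, 300 } */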
626
+
627
+ #define NUM_SYMBOL_BITS 10
628
+ #define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1)
629
+
630
+ #define GET_NUM_COUNTERS(num_syms) ((((num_syms) + 3 / 4) + 3) & ~3)
631
+ /*
632
+ * Sort the symbols primarily by frequency and secondarily by symbol
633
+ * value. Discard symbols with zero frequency and fill in an array with
634
+ * the remaining symbols, along with their frequencies. The low
635
+ * NUM_SYMBOL_BITS bits of each array entry will contain the symbol
636
+ * value, and the remaining bits will contain the frequency.
637
+ *
638
+ * @num_syms
639
+ * Number of symbols in the alphabet.
640
+ * Can't be greater than (1 << NUM_SYMBOL_BITS).
641
+ *
642
+ * @freqs[num_syms]
643
+ * The frequency of each symbol.
644
+ *
645
+ * @lens[num_syms]
646
+ * An array that eventually will hold the length of each codeword.
647
+ * This function only fills in the codeword lengths for symbols that
648
+ * have zero frequency, which are not well defined per se but will
649
+ * be set to 0.
650
+ *
651
+ * @symout[num_syms]
652
+ * The output array, described above.
653
+ *
654
+ * Returns the number of entries in 'symout' that were filled. This is
655
+ * the number of symbols that have nonzero frequency.
656
+ */
657
+ static unsigned
658
+ sort_symbols(unsigned num_syms, const u32 freqs[restrict],
659
+ u8 lens[restrict], u32 symout[restrict])
660
+ {
661
+ unsigned sym;
662
+ unsigned i;
663
+ unsigned num_used_syms;
664
+ unsigned num_counters;
665
+ unsigned counters[GET_NUM_COUNTERS(DEFLATE_MAX_NUM_SYMS)];
666
+
667
+ /* We rely on heapsort, but with an added optimization. Since
668
+ * it's common for most symbol frequencies to be low, we first do
669
+ * a count sort using a limited number of counters. High
670
+ * frequencies will be counted in the last counter, and only they
671
+ * will be sorted with heapsort.
672
+ *
673
+ * Note: with more symbols, it is generally beneficial to have more
674
+ * counters. About 1 counter per 4 symbols seems fast.
675
+ *
676
+ * Note: I also tested radix sort, but even for large symbol
677
+ * counts (> 255) and frequencies bounded at 16 bits (enabling
678
+ * radix sort by just two base-256 digits), it didn't seem any
679
+ * faster than the method implemented here.
680
+ *
681
+ * Note: I tested the optimized quicksort implementation from
682
+ * glibc (with indirection overhead removed), but it was only
683
+ * marginally faster than the simple heapsort implemented here.
684
+ *
685
+ * Tests were done with building the codes for LZX. Results may
686
+ * vary for different compression algorithms...! */
687
+
688
+ num_counters = GET_NUM_COUNTERS(num_syms);
689
+
690
+ memset(counters, 0, num_counters * sizeof(counters[0]));
691
+
692
+ /* Count the frequencies. */
693
+ for (sym = 0; sym < num_syms; sym++)
694
+ counters[MIN(freqs[sym], num_counters - 1)]++;
695
+
696
+ /* Make the counters cumulative, ignoring the zero-th, which
697
+ * counted symbols with zero frequency. As a side effect, this
698
+ * calculates the number of symbols with nonzero frequency. */
699
+ num_used_syms = 0;
700
+ for (i = 1; i < num_counters; i++) {
701
+ unsigned count = counters[i];
702
+ counters[i] = num_used_syms;
703
+ num_used_syms += count;
704
+ }
705
+
706
+ /* Sort nonzero-frequency symbols using the counters. At the
707
+ * same time, set the codeword lengths of zero-frequency symbols
708
+ * to 0. */
709
+ for (sym = 0; sym < num_syms; sym++) {
710
+ u32 freq = freqs[sym];
711
+ if (freq != 0) {
712
+ symout[counters[MIN(freq, num_counters - 1)]++] =
713
+ sym | (freq << NUM_SYMBOL_BITS);
714
+ } else {
715
+ lens[sym] = 0;
716
+ }
717
+ }
718
+
719
+ /* Sort the symbols counted in the last counter. */
720
+ heap_sort(symout + counters[num_counters - 2],
721
+ counters[num_counters - 1] - counters[num_counters - 2]);
722
+
723
+ return num_used_syms;
724
+ }
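Each entry that sort_symbols() writes to 'symout' packs the symbol value into the low NUM_SYMBOL_BITS bits and its frequency into the remaining bits. A short sketch with example values:

    unsigned sym  = 65;                               /* example symbol    */
    u32      freq = 1234;                             /* example frequency */
    u32 entry     = (u32)sym | (freq << NUM_SYMBOL_BITS);
    unsigned sym_back  = entry & SYMBOL_MASK;         /* == 65   */
    u32      freq_back = entry >> NUM_SYMBOL_BITS;    /* == 1234 */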
725
+
726
+ /*
727
+ * Build the Huffman tree.
728
+ *
729
+ * This is an optimized implementation that
730
+ * (a) takes advantage of the frequencies being already sorted;
731
+ * (b) only generates non-leaf nodes, since the non-leaf nodes of a
732
+ * Huffman tree are sufficient to generate a canonical code;
733
+ * (c) Only stores parent pointers, not child pointers;
734
+ * (d) Produces the nodes in the same memory used for input
735
+ * frequency information.
736
+ *
737
+ * Array 'A', which contains 'sym_count' entries, is used for both input
738
+ * and output. For this function, 'sym_count' must be at least 2.
739
+ *
740
+ * For input, the array must contain the frequencies of the symbols,
741
+ * sorted in increasing order. Specifically, each entry must contain a
742
+ * frequency left shifted by NUM_SYMBOL_BITS bits. Any data in the low
743
+ * NUM_SYMBOL_BITS bits of the entries will be ignored by this function.
744
+ * Although these bits will, in fact, contain the symbols that correspond
745
+ * to the frequencies, this function is concerned with frequencies only
746
+ * and keeps the symbols as-is.
747
+ *
748
+ * For output, this function will produce the non-leaf nodes of the
749
+ * Huffman tree. These nodes will be stored in the first (sym_count - 1)
750
+ * entries of the array. Entry A[sym_count - 2] will represent the root
751
+ * node. Each other node will contain the zero-based index of its parent
752
+ * node in 'A', left shifted by NUM_SYMBOL_BITS bits. The low
753
+ * NUM_SYMBOL_BITS bits of each entry in A will be kept as-is. Again,
754
+ * note that although these low bits will, in fact, contain a symbol
755
+ * value, this symbol will have *no relationship* with the Huffman tree
756
+ * node that happens to occupy the same slot. This is because this
757
+ * implementation only generates the non-leaf nodes of the tree.
758
+ */
759
+ static void
760
+ build_tree(u32 A[], unsigned sym_count)
761
+ {
762
+ /* Index, in 'A', of next lowest frequency symbol that has not
763
+ * yet been processed. */
764
+ unsigned i = 0;
765
+
766
+ /* Index, in 'A', of next lowest frequency parentless non-leaf
767
+ * node; or, if equal to 'e', then no such node exists yet. */
768
+ unsigned b = 0;
769
+
770
+ /* Index, in 'A', of next node to allocate as a non-leaf. */
771
+ unsigned e = 0;
772
+
773
+ do {
774
+ unsigned m, n;
775
+ u32 freq_shifted;
776
+
777
+ /* Choose the two next lowest frequency entries. */
778
+
779
+ if (i != sym_count &&
780
+ (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
781
+ m = i++;
782
+ else
783
+ m = b++;
784
+
785
+ if (i != sym_count &&
786
+ (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
787
+ n = i++;
788
+ else
789
+ n = b++;
790
+
791
+ /* Allocate a non-leaf node and link the entries to it.
792
+ *
793
+ * If we link an entry that we're visiting for the first
794
+ * time (via index 'i'), then we're actually linking a
795
+ * leaf node and it will have no effect, since the leaf
796
+ * will be overwritten with a non-leaf when index 'e'
797
+ * catches up to it. But it's not any slower to
798
+ * unconditionally set the parent index.
799
+ *
800
+ * We also compute the frequency of the non-leaf node as
801
+ * the sum of its two children's frequencies. */
802
+
803
+ freq_shifted = (A[m] & ~SYMBOL_MASK) + (A[n] & ~SYMBOL_MASK);
804
+
805
+ A[m] = (A[m] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
806
+ A[n] = (A[n] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
807
+ A[e] = (A[e] & SYMBOL_MASK) | freq_shifted;
808
+ e++;
809
+ } while (sym_count - e > 1);
810
+ /* When just one entry remains, it is a "leaf" that was
811
+ * linked to some other node. We ignore it, since the
812
+ * rest of the array contains the non-leaves which we
813
+ * need. (Note that we're assuming the cases with 0 or 1
814
+ * symbols were handled separately.) */
815
+ }
816
+
817
+ /*
818
+ * Given the stripped-down Huffman tree constructed by build_tree(),
819
+ * determine the number of codewords that should be assigned each
820
+ * possible length, taking into account the length-limited constraint.
821
+ *
822
+ * @A
823
+ * The array produced by build_tree(), containing parent index
824
+ * information for the non-leaf nodes of the Huffman tree. Each
825
+ * entry in this array is a node; a node's parent always has a
826
+ * greater index than that node itself. This function will
827
+ * overwrite the parent index information in this array, so
828
+ * essentially it will destroy the tree. However, the data in the
829
+ * low NUM_SYMBOL_BITS of each entry will be preserved.
830
+ *
831
+ * @root_idx
832
+ * The 0-based index of the root node in 'A', and consequently one
833
+ * less than the number of tree node entries in 'A'. (Or, really 2
834
+ * less than the actual length of 'A'.)
835
+ *
836
+ * @len_counts
837
+ * An array of length ('max_codeword_len' + 1) in which the number of
838
+ * codewords having each length <= max_codeword_len will be
839
+ * returned.
840
+ *
841
+ * @max_codeword_len
842
+ * The maximum permissible codeword length.
843
+ */
844
+ static void
845
+ compute_length_counts(u32 A[restrict], unsigned root_idx,
846
+ unsigned len_counts[restrict], unsigned max_codeword_len)
847
+ {
848
+ unsigned len;
849
+ int node;
850
+
851
+ /* The key observations are:
852
+ *
853
+ * (1) We can traverse the non-leaf nodes of the tree, always
854
+ * visiting a parent before its children, by simply iterating
855
+ * through the array in reverse order. Consequently, we can
856
+ * compute the depth of each node in one pass, overwriting the
857
+ * parent indices with depths.
858
+ *
859
+ * (2) We can initially assume that in the real Huffman tree,
860
+ * both children of the root are leaves. This corresponds to two
861
+ * codewords of length 1. Then, whenever we visit a (non-leaf)
862
+ * node during the traversal, we modify this assumption to
863
+ * account for the current node *not* being a leaf, but rather
864
+ * its two children being leaves. This causes the loss of one
865
+ * codeword for the current depth and the addition of two
866
+ * codewords for the current depth plus one.
867
+ *
868
+ * (3) We can handle the length-limited constraint fairly easily
869
+ * by simply using the largest length available when a depth
870
+ * exceeds max_codeword_len.
871
+ */
872
+
873
+ for (len = 0; len <= max_codeword_len; len++)
874
+ len_counts[len] = 0;
875
+ len_counts[1] = 2;
876
+
877
+ /* Set the root node's depth to 0. */
878
+ A[root_idx] &= SYMBOL_MASK;
879
+
880
+ for (node = root_idx - 1; node >= 0; node--) {
881
+
882
+ /* Calculate the depth of this node. */
883
+
884
+ unsigned parent = A[node] >> NUM_SYMBOL_BITS;
885
+ unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS;
886
+ unsigned depth = parent_depth + 1;
887
+ unsigned len = depth;
888
+
889
+ /* Set the depth of this node so that it is available
890
+ * when its children (if any) are processed. */
891
+
892
+ A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS);
893
+
894
+ /* If needed, decrease the length to meet the
895
+ * length-limited constraint. This is not the optimal
896
+ * method for generating length-limited Huffman codes!
897
+ * But it should be good enough. */
898
+ if (len >= max_codeword_len) {
899
+ len = max_codeword_len;
900
+ do {
901
+ len--;
902
+ } while (len_counts[len] == 0);
903
+ }
904
+
905
+ /* Account for the fact that we have a non-leaf node at
906
+ * the current depth. */
907
+ len_counts[len]--;
908
+ len_counts[len + 1] += 2;
909
+ }
910
+ }
911
+
912
+ /*
913
+ * Generate the codewords for a canonical Huffman code.
914
+ *
915
+ * @A
916
+ * The output array for codewords. In addition, initially this
917
+ * array must contain the symbols, sorted primarily by frequency and
918
+ * secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of
919
+ * each entry.
920
+ *
921
+ * @lens
922
+ * Output array for codeword lengths.
923
+ *
924
+ * @len_counts
925
+ * An array that provides the number of codewords that will have
926
+ * each possible length <= max_codeword_len.
927
+ *
928
+ * @max_codeword_len
929
+ * Maximum length, in bits, of each codeword.
930
+ *
931
+ * @num_syms
932
+ * Number of symbols in the alphabet, including symbols with zero
933
+ * frequency. This is the length of the 'A' and 'len' arrays.
934
+ */
935
+ static void
936
+ gen_codewords(u32 A[restrict], u8 lens[restrict],
937
+ const unsigned len_counts[restrict],
938
+ unsigned max_codeword_len, unsigned num_syms)
939
+ {
940
+ u32 next_codewords[DEFLATE_MAX_CODEWORD_LEN + 1];
941
+ unsigned i;
942
+ unsigned len;
943
+ unsigned sym;
944
+
945
+ /* Given the number of codewords that will have each length,
946
+ * assign codeword lengths to symbols. We do this by assigning
947
+ * the lengths in decreasing order to the symbols sorted
948
+ * primarily by increasing frequency and secondarily by
949
+ * increasing symbol value. */
950
+ for (i = 0, len = max_codeword_len; len >= 1; len--) {
951
+ unsigned count = len_counts[len];
952
+ while (count--)
953
+ lens[A[i++] & SYMBOL_MASK] = len;
954
+ }
955
+
956
+ /* Generate the codewords themselves. We initialize the
957
+ * 'next_codewords' array to provide the lexicographically first
958
+ * codeword of each length, then assign codewords in symbol
959
+ * order. This produces a canonical code. */
960
+ next_codewords[0] = 0;
961
+ next_codewords[1] = 0;
962
+ for (len = 2; len <= max_codeword_len; len++)
963
+ next_codewords[len] =
964
+ (next_codewords[len - 1] + len_counts[len - 1]) << 1;
965
+
966
+ for (sym = 0; sym < num_syms; sym++)
967
+ A[sym] = next_codewords[lens[sym]]++;
968
+ }
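A worked example of the canonical construction above, not taken from the source: four symbols with codeword lengths {2, 1, 3, 3}, so len_counts[1] = 1, len_counts[2] = 1, len_counts[3] = 2.

    next_codewords[1] = 0
    next_codewords[2] = (0 + 1) << 1 = 2   (binary 10)
    next_codewords[3] = (2 + 1) << 1 = 6   (binary 110)

Assigning in symbol order gives: sym0 (len 2) -> 10, sym1 (len 1) -> 0, sym2 (len 3) -> 110, sym3 (len 3) -> 111, which is the canonical code of RFC 1951.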
969
+
970
+ /*
971
+ * ---------------------------------------------------------------------
972
+ * make_canonical_huffman_code()
973
+ * ---------------------------------------------------------------------
974
+ *
975
+ * Given an alphabet and the frequency of each symbol in it, construct a
976
+ * length-limited canonical Huffman code.
977
+ *
978
+ * @num_syms
979
+ * The number of symbols in the alphabet. The symbols are the
980
+ * integers in the range [0, num_syms - 1]. This parameter must be
981
+ * at least 2 and can't be greater than (1 << NUM_SYMBOL_BITS).
982
+ *
983
+ * @max_codeword_len
984
+ * The maximum permissible codeword length.
985
+ *
986
+ * @freqs
987
+ * An array of @num_syms entries, each of which specifies the
988
+ * frequency of the corresponding symbol. It is valid for some,
989
+ * none, or all of the frequencies to be 0.
990
+ *
991
+ * @lens
992
+ * An array of @num_syms entries in which this function will return
993
+ * the length, in bits, of the codeword assigned to each symbol.
994
+ * Symbols with 0 frequency will not have codewords per se, but
995
+ * their entries in this array will be set to 0. No lengths greater
996
+ * than @max_codeword_len will be assigned.
997
+ *
998
+ * @codewords
999
+ * An array of @num_syms entries in which this function will return
1000
+ * the codeword for each symbol, right-justified and padded on the
1001
+ * left with zeroes. Codewords for symbols with 0 frequency will be
1002
+ * undefined.
1003
+ *
1004
+ * ---------------------------------------------------------------------
1005
+ *
1006
+ * This function builds a length-limited canonical Huffman code.
1007
+ *
1008
+ * A length-limited Huffman code contains no codewords longer than some
1009
+ * specified length, and has exactly (with some algorithms) or
1010
+ * approximately (with the algorithm used here) the minimum weighted path
1011
+ * length from the root, given this constraint.
1012
+ *
1013
+ * A canonical Huffman code satisfies the properties that a longer
1014
+ * codeword never lexicographically precedes a shorter codeword, and the
1015
+ * lexicographic ordering of codewords of the same length is the same as
1016
+ * the lexicographic ordering of the corresponding symbols. A canonical
1017
+ * Huffman code, or more generally a canonical prefix code, can be
1018
+ * reconstructed from only a list containing the codeword length of each
1019
+ * symbol.
1020
+ *
1021
+ * The classic algorithm to generate a Huffman code creates a node for
1022
+ * each symbol, then inserts these nodes into a min-heap keyed by symbol
1023
+ * frequency. Then, repeatedly, the two lowest-frequency nodes are
1024
+ * removed from the min-heap and added as the children of a new node
1025
+ * having frequency equal to the sum of its two children, which is then
1026
+ * inserted into the min-heap. When only a single node remains in the
1027
+ * min-heap, it is the root of the Huffman tree. The codeword for each
1028
+ * symbol is determined by the path needed to reach the corresponding
1029
+ * node from the root. Descending to the left child appends a 0 bit,
1030
+ * whereas descending to the right child appends a 1 bit.
1031
+ *
1032
+ * The classic algorithm is relatively easy to understand, but it is
1033
+ * subject to a number of inefficiencies. In practice, it is fastest to
1034
+ * first sort the symbols by frequency. (This itself can be subject to
1035
+ * an optimization based on the fact that most frequencies tend to be
1036
+ * low.) At the same time, we sort secondarily by symbol value, which
1037
+ * aids the process of generating a canonical code. Then, during tree
1038
+ * construction, no heap is necessary because both the leaf nodes and the
1039
+ * unparented non-leaf nodes can be easily maintained in sorted order.
1040
+ * Consequently, there can never be more than two possibilities for the
1041
+ * next-lowest-frequency node.
1042
+ *
1043
+ * In addition, because we're generating a canonical code, we actually
1044
+ * don't need the leaf nodes of the tree at all, only the non-leaf nodes.
1045
+ * This is because for canonical code generation we don't need to know
1046
+ * where the symbols are in the tree. Rather, we only need to know how
1047
+ * many leaf nodes have each depth (codeword length). And this
1048
+ * information can, in fact, be quickly generated from the tree of
1049
+ * non-leaves only.
1050
+ *
1051
+ * Furthermore, we can build this stripped-down Huffman tree directly in
1052
+ * the array in which the codewords are to be generated, provided that
1053
+ * these array slots are large enough to hold a symbol and frequency
1054
+ * value.
1055
+ *
1056
+ * Still furthermore, we don't even need to maintain explicit child
1057
+ * pointers. We only need the parent pointers, and even those can be
1058
+ * overwritten in-place with depth information as part of the process of
1059
+ * extracting codeword lengths from the tree. So in summary, we do NOT
1060
+ * need a big structure like:
1061
+ *
1062
+ * struct huffman_tree_node {
1063
+ * unsigned int symbol;
1064
+ * unsigned int frequency;
1065
+ * unsigned int depth;
1066
+ * struct huffman_tree_node *left_child;
1067
+ * struct huffman_tree_node *right_child;
1068
+ * };
1069
+ *
1070
+ *
1071
+ * ... which often gets used in "naive" implementations of Huffman code
1072
+ * generation.
1073
+ *
1074
+ * Many of these optimizations are based on the implementation in 7-Zip
1075
+ * (source file: C/HuffEnc.c), which has been placed in the public domain
1076
+ * by Igor Pavlov.
1077
+ */
1078
+ static void
1079
+ make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len,
1080
+ const u32 freqs[restrict],
1081
+ u8 lens[restrict], u32 codewords[restrict])
1082
+ {
1083
+ u32 *A = codewords;
1084
+ unsigned num_used_syms;
1085
+
1086
+ STATIC_ASSERT(DEFLATE_MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS);
1087
+
1088
+ /* We begin by sorting the symbols primarily by frequency and
1089
+ * secondarily by symbol value. As an optimization, the array
1090
+ * used for this purpose ('A') shares storage with the space in
1091
+ * which we will eventually return the codewords. */
1092
+
1093
+ num_used_syms = sort_symbols(num_syms, freqs, lens, A);
1094
+
1095
+ /* 'num_used_syms' is the number of symbols with nonzero
1096
+ * frequency. This may be less than @num_syms. 'num_used_syms'
1097
+ * is also the number of entries in 'A' that are valid. Each
1098
+ * entry consists of a distinct symbol and a nonzero frequency
1099
+ * packed into a 32-bit integer. */
1100
+
1101
+ /* Handle special cases where only 0 or 1 symbols were used (had
1102
+ * nonzero frequency). */
1103
+
1104
+ if (unlikely(num_used_syms == 0)) {
1105
+ /* Code is empty. sort_symbols() already set all lengths
1106
+ * to 0, so there is nothing more to do. */
1107
+ return;
1108
+ }
1109
+
1110
+ if (unlikely(num_used_syms == 1)) {
1111
+ /* Only one symbol was used, so we only need one
1112
+ * codeword. But two codewords are needed to form the
1113
+ * smallest complete Huffman code, which uses codewords 0
1114
+ * and 1. Therefore, we choose another symbol to which
1115
+ * to assign a codeword. We use 0 (if the used symbol is
1116
+ * not 0) or 1 (if the used symbol is 0). In either
1117
+ * case, the lesser-valued symbol must be assigned
1118
+ * codeword 0 so that the resulting code is canonical. */
1119
+
1120
+ unsigned sym = A[0] & SYMBOL_MASK;
1121
+ unsigned nonzero_idx = sym ? sym : 1;
1122
+
1123
+ codewords[0] = 0;
1124
+ lens[0] = 1;
1125
+ codewords[nonzero_idx] = 1;
1126
+ lens[nonzero_idx] = 1;
1127
+ return;
1128
+ }
1129
+
1130
+ /* Build a stripped-down version of the Huffman tree, sharing the
1131
+ * array 'A' with the symbol values. Then extract length counts
1132
+ * from the tree and use them to generate the final codewords. */
1133
+
1134
+ build_tree(A, num_used_syms);
1135
+
1136
+ {
1137
+ unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1];
1138
+
1139
+ compute_length_counts(A, num_used_syms - 2,
1140
+ len_counts, max_codeword_len);
1141
+
1142
+ gen_codewords(A, lens, len_counts, max_codeword_len, num_syms);
1143
+ }
1144
+ }
1145
+
1146
+ /*
1147
+ * Clear the Huffman symbol frequency counters.
1148
+ * This must be called when starting a new DEFLATE block.
1149
+ */
1150
+ static void
1151
+ deflate_reset_symbol_frequencies(struct libdeflate_compressor *c)
1152
+ {
1153
+ memset(&c->freqs, 0, sizeof(c->freqs));
1154
+ }
1155
+
1156
+ /* Reverse the Huffman codeword 'codeword', which is 'len' bits in length. */
1157
+ static u32
1158
+ deflate_reverse_codeword(u32 codeword, u8 len)
1159
+ {
1160
+ /* The following branchless algorithm is faster than going bit by bit.
1161
+ * Note: since no codewords are longer than 16 bits, we only need to
1162
+ * reverse the low 16 bits of the 'u32'. */
1163
+ STATIC_ASSERT(DEFLATE_MAX_CODEWORD_LEN <= 16);
1164
+
1165
+ /* Flip adjacent 1-bit fields */
1166
+ codeword = ((codeword & 0x5555) << 1) | ((codeword & 0xAAAA) >> 1);
1167
+
1168
+ /* Flip adjacent 2-bit fields */
1169
+ codeword = ((codeword & 0x3333) << 2) | ((codeword & 0xCCCC) >> 2);
1170
+
1171
+ /* Flip adjacent 4-bit fields */
1172
+ codeword = ((codeword & 0x0F0F) << 4) | ((codeword & 0xF0F0) >> 4);
1173
+
1174
+ /* Flip adjacent 8-bit fields */
1175
+ codeword = ((codeword & 0x00FF) << 8) | ((codeword & 0xFF00) >> 8);
1176
+
1177
+ /* Return the high 'len' bits of the bit-reversed 16 bit value. */
1178
+ return codeword >> (16 - len);
1179
+ }
1180
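As a quick sanity check of the branchless reversal above (an editor's copy of the same transformation, not library code): the 16-bit value is fully bit-reversed and the top 'len' bits are kept, which is the same as reversing just the low 'len' bits of the codeword.

#include <stdio.h>
#include <stdint.h>

/* Editorial copy of the transformation in deflate_reverse_codeword(). */
static uint32_t reverse16(uint32_t codeword, unsigned len)
{
	codeword = ((codeword & 0x5555) << 1) | ((codeword & 0xAAAA) >> 1);
	codeword = ((codeword & 0x3333) << 2) | ((codeword & 0xCCCC) >> 2);
	codeword = ((codeword & 0x0F0F) << 4) | ((codeword & 0xF0F0) >> 4);
	codeword = ((codeword & 0x00FF) << 8) | ((codeword & 0xFF00) >> 8);
	return codeword >> (16 - len);
}

int main(void)
{
	/* 1011 (4 bits) reversed is 1101: prints "b -> d". */
	printf("%x -> %x\n", 0xB, reverse16(0xB, 4));
	return 0;
}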
+
1181
+ /* Make a canonical Huffman code with bit-reversed codewords. */
1182
+ static void
1183
+ deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len,
1184
+ const u32 freqs[], u8 lens[], u32 codewords[])
1185
+ {
1186
+ unsigned sym;
1187
+
1188
+ make_canonical_huffman_code(num_syms, max_codeword_len,
1189
+ freqs, lens, codewords);
1190
+
1191
+ for (sym = 0; sym < num_syms; sym++)
1192
+ codewords[sym] = deflate_reverse_codeword(codewords[sym], lens[sym]);
1193
+ }
1194
+
1195
+ /*
1196
+ * Build the literal/length and offset Huffman codes for a DEFLATE block.
1197
+ *
1198
+ * This takes as input the frequency tables for each code and produces as output
1199
+ * a set of tables that map symbols to codewords and codeword lengths.
1200
+ */
1201
+ static void
1202
+ deflate_make_huffman_codes(const struct deflate_freqs *freqs,
1203
+ struct deflate_codes *codes)
1204
+ {
1205
+ STATIC_ASSERT(MAX_LITLEN_CODEWORD_LEN <= DEFLATE_MAX_LITLEN_CODEWORD_LEN);
1206
+ STATIC_ASSERT(MAX_OFFSET_CODEWORD_LEN <= DEFLATE_MAX_OFFSET_CODEWORD_LEN);
1207
+
1208
+ deflate_make_huffman_code(DEFLATE_NUM_LITLEN_SYMS,
1209
+ MAX_LITLEN_CODEWORD_LEN,
1210
+ freqs->litlen,
1211
+ codes->lens.litlen,
1212
+ codes->codewords.litlen);
1213
+
1214
+ deflate_make_huffman_code(DEFLATE_NUM_OFFSET_SYMS,
1215
+ MAX_OFFSET_CODEWORD_LEN,
1216
+ freqs->offset,
1217
+ codes->lens.offset,
1218
+ codes->codewords.offset);
1219
+ }
1220
+
1221
+ /* Initialize c->static_codes. */
1222
+ static void
1223
+ deflate_init_static_codes(struct libdeflate_compressor *c)
1224
+ {
1225
+ unsigned i;
1226
+
1227
+ for (i = 0; i < 144; i++)
1228
+ c->freqs.litlen[i] = 1 << (9 - 8);
1229
+ for (; i < 256; i++)
1230
+ c->freqs.litlen[i] = 1 << (9 - 9);
1231
+ for (; i < 280; i++)
1232
+ c->freqs.litlen[i] = 1 << (9 - 7);
1233
+ for (; i < 288; i++)
1234
+ c->freqs.litlen[i] = 1 << (9 - 8);
1235
+
1236
+ for (i = 0; i < 32; i++)
1237
+ c->freqs.offset[i] = 1 << (5 - 5);
1238
+
1239
+ deflate_make_huffman_codes(&c->freqs, &c->static_codes);
1240
+ }
1241
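deflate_init_static_codes() feeds synthetic frequencies proportional to 2^(9 - len) into the generic code builder. Because those correspond to exact power-of-two probabilities, the only optimal code lengths are those of the RFC 1951 fixed codes (8 bits for literals 0-143, 9 for 144-255, 7 for 256-279, 8 for 280-287, and 5 bits for every offset symbol), so the generic builder reproduces the fixed codes. A small editor's check that those litlen lengths form a complete code (Kraft sum exactly 1):

#include <stdio.h>

int main(void)
{
	/* Sum of 2^(15 - len) over all 288 litlen symbols must equal 2^15. */
	unsigned kraft = 0, i;

	for (i = 0; i < 144; i++) kraft += 1 << (15 - 8);
	for (; i < 256; i++)      kraft += 1 << (15 - 9);
	for (; i < 280; i++)      kraft += 1 << (15 - 7);
	for (; i < 288; i++)      kraft += 1 << (15 - 8);

	printf("%u (expect %u)\n", kraft, 1u << 15);	/* 32768 == 32768 */
	return 0;
}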
+
1242
+ /* Return the offset slot for the specified match offset. */
1243
+ static forceinline unsigned
1244
+ deflate_get_offset_slot(struct libdeflate_compressor *c, unsigned offset)
1245
+ {
1246
+ #if USE_FULL_OFFSET_SLOT_FAST
1247
+ return c->offset_slot_fast[offset];
1248
+ #else
1249
+ if (offset <= 256)
1250
+ return c->offset_slot_fast[offset - 1];
1251
+ else
1252
+ return c->offset_slot_fast[256 + ((offset - 1) >> 7)];
1253
+ #endif
1254
+ }
1255
+
1256
+ /* Write the header fields common to all DEFLATE block types. */
1257
+ static void
1258
+ deflate_write_block_header(struct deflate_output_bitstream *os,
1259
+ bool is_final_block, unsigned block_type)
1260
+ {
1261
+ deflate_add_bits(os, is_final_block, 1);
1262
+ deflate_add_bits(os, block_type, 2);
1263
+ deflate_flush_bits(os);
1264
+ }
1265
+
1266
+ static unsigned
1267
+ deflate_compute_precode_items(const u8 lens[restrict],
1268
+ const unsigned num_lens,
1269
+ u32 precode_freqs[restrict],
1270
+ unsigned precode_items[restrict])
1271
+ {
1272
+ unsigned *itemptr;
1273
+ unsigned run_start;
1274
+ unsigned run_end;
1275
+ unsigned extra_bits;
1276
+ u8 len;
1277
+
1278
+ memset(precode_freqs, 0,
1279
+ DEFLATE_NUM_PRECODE_SYMS * sizeof(precode_freqs[0]));
1280
+
1281
+ itemptr = precode_items;
1282
+ run_start = 0;
1283
+ do {
1284
+ /* Find the next run of codeword lengths. */
1285
+
1286
+ /* len = the length being repeated */
1287
+ len = lens[run_start];
1288
+
1289
+ /* Extend the run. */
1290
+ run_end = run_start;
1291
+ do {
1292
+ run_end++;
1293
+ } while (run_end != num_lens && len == lens[run_end]);
1294
+
1295
+ if (len == 0) {
1296
+ /* Run of zeroes. */
1297
+
1298
+ /* Symbol 18: RLE 11 to 138 zeroes at a time. */
1299
+ while ((run_end - run_start) >= 11) {
1300
+ extra_bits = MIN((run_end - run_start) - 11, 0x7F);
1301
+ precode_freqs[18]++;
1302
+ *itemptr++ = 18 | (extra_bits << 5);
1303
+ run_start += 11 + extra_bits;
1304
+ }
1305
+
1306
+ /* Symbol 17: RLE 3 to 10 zeroes at a time. */
1307
+ if ((run_end - run_start) >= 3) {
1308
+ extra_bits = MIN((run_end - run_start) - 3, 0x7);
1309
+ precode_freqs[17]++;
1310
+ *itemptr++ = 17 | (extra_bits << 5);
1311
+ run_start += 3 + extra_bits;
1312
+ }
1313
+ } else {
1314
+
1315
+ /* A run of nonzero lengths. */
1316
+
1317
+ /* Symbol 16: RLE 3 to 6 of the previous length. */
1318
+ if ((run_end - run_start) >= 4) {
1319
+ precode_freqs[len]++;
1320
+ *itemptr++ = len;
1321
+ run_start++;
1322
+ do {
1323
+ extra_bits = MIN((run_end - run_start) - 3, 0x3);
1324
+ precode_freqs[16]++;
1325
+ *itemptr++ = 16 | (extra_bits << 5);
1326
+ run_start += 3 + extra_bits;
1327
+ } while ((run_end - run_start) >= 3);
1328
+ }
1329
+ }
1330
+
1331
+ /* Output any remaining lengths without RLE. */
1332
+ while (run_start != run_end) {
1333
+ precode_freqs[len]++;
1334
+ *itemptr++ = len;
1335
+ run_start++;
1336
+ }
1337
+ } while (run_start != num_lens);
1338
+
1339
+ return itemptr - precode_items;
1340
+ }
1341
+
1342
+ /*
1343
+ * Huffman codeword lengths for dynamic Huffman blocks are compressed using a
1344
+ * separate Huffman code, the "precode", which contains a symbol for each
1345
+ * possible codeword length in the larger code as well as several special
1346
+ * symbols to represent repeated codeword lengths (a form of run-length
1347
+ * encoding). The precode is itself constructed in canonical form, and its
1348
+ * codeword lengths are represented literally in up to 19 3-bit fields that
1349
+ * immediately precede the compressed codeword lengths of the larger code.
1350
+ */
1351
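For a concrete picture of the items produced by deflate_compute_precode_items(), consider the hypothetical run of codeword lengths 8 8 8 8 8 0 0 0 0 0 (an editor's example, not data from the library). The run of five 8s is output as one literal precode symbol followed by a "repeat previous length" symbol, and the run of five zeroes becomes a single "repeat zero" symbol:

/* Editorial example: precode items for the length run 8 8 8 8 8 0 0 0 0 0.
 *
 *   8              -> literal precode symbol 8
 *   16, extra = 1  -> repeat previous length 3 + 1 = 4 times
 *   17, extra = 2  -> repeat zero 3 + 2 = 5 times
 *
 * Each item is stored as (symbol | extra_bits << 5), so the item list is: */
static const unsigned example_precode_items[] = {
	8,
	16 | (1 << 5),
	17 | (2 << 5),
};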
+
1352
+ /* Precompute the information needed to output Huffman codes. */
1353
+ static void
1354
+ deflate_precompute_huffman_header(struct libdeflate_compressor *c)
1355
+ {
1356
+ /* Compute how many litlen and offset symbols are needed. */
1357
+
1358
+ for (c->num_litlen_syms = DEFLATE_NUM_LITLEN_SYMS;
1359
+ c->num_litlen_syms > 257;
1360
+ c->num_litlen_syms--)
1361
+ if (c->codes.lens.litlen[c->num_litlen_syms - 1] != 0)
1362
+ break;
1363
+
1364
+ for (c->num_offset_syms = DEFLATE_NUM_OFFSET_SYMS;
1365
+ c->num_offset_syms > 1;
1366
+ c->num_offset_syms--)
1367
+ if (c->codes.lens.offset[c->num_offset_syms - 1] != 0)
1368
+ break;
1369
+
1370
+ /* If we're not using the full set of literal/length codeword lengths,
1371
+ * then temporarily move the offset codeword lengths over so that the
1372
+ * literal/length and offset codeword lengths are contiguous. */
1373
+
1374
+ STATIC_ASSERT(offsetof(struct deflate_lens, offset) ==
1375
+ DEFLATE_NUM_LITLEN_SYMS);
1376
+
1377
+ if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) {
1378
+ memmove((u8 *)&c->codes.lens + c->num_litlen_syms,
1379
+ (u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS,
1380
+ c->num_offset_syms);
1381
+ }
1382
+
1383
+ /* Compute the "items" (RLE / literal tokens and extra bits) with which
1384
+ * the codeword lengths in the larger code will be output. */
1385
+ c->num_precode_items =
1386
+ deflate_compute_precode_items((u8 *)&c->codes.lens,
1387
+ c->num_litlen_syms +
1388
+ c->num_offset_syms,
1389
+ c->precode_freqs,
1390
+ c->precode_items);
1391
+
1392
+ /* Build the precode. */
1393
+ STATIC_ASSERT(MAX_PRE_CODEWORD_LEN <= DEFLATE_MAX_PRE_CODEWORD_LEN);
1394
+ deflate_make_huffman_code(DEFLATE_NUM_PRECODE_SYMS,
1395
+ MAX_PRE_CODEWORD_LEN,
1396
+ c->precode_freqs, c->precode_lens,
1397
+ c->precode_codewords);
1398
+
1399
+ /* Count how many precode lengths we actually need to output. */
1400
+ for (c->num_explicit_lens = DEFLATE_NUM_PRECODE_SYMS;
1401
+ c->num_explicit_lens > 4;
1402
+ c->num_explicit_lens--)
1403
+ if (c->precode_lens[deflate_precode_lens_permutation[
1404
+ c->num_explicit_lens - 1]] != 0)
1405
+ break;
1406
+
1407
+ /* Restore the offset codeword lengths if needed. */
1408
+ if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) {
1409
+ memmove((u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS,
1410
+ (u8 *)&c->codes.lens + c->num_litlen_syms,
1411
+ c->num_offset_syms);
1412
+ }
1413
+ }
1414
+
1415
+ /* Output the Huffman codes. */
1416
+ static void
1417
+ deflate_write_huffman_header(struct libdeflate_compressor *c,
1418
+ struct deflate_output_bitstream *os)
1419
+ {
1420
+ unsigned i;
1421
+
1422
+ deflate_add_bits(os, c->num_litlen_syms - 257, 5);
1423
+ deflate_add_bits(os, c->num_offset_syms - 1, 5);
1424
+ deflate_add_bits(os, c->num_explicit_lens - 4, 4);
1425
+ deflate_flush_bits(os);
1426
+
1427
+ /* Output the lengths of the codewords in the precode. */
1428
+ for (i = 0; i < c->num_explicit_lens; i++) {
1429
+ deflate_add_bits(os, c->precode_lens[
1430
+ deflate_precode_lens_permutation[i]], 3);
1431
+ deflate_flush_bits(os);
1432
+ }
1433
+
1434
+ /* Output the encoded lengths of the codewords in the larger code. */
1435
+ for (i = 0; i < c->num_precode_items; i++) {
1436
+ unsigned precode_item = c->precode_items[i];
1437
+ unsigned precode_sym = precode_item & 0x1F;
1438
+ deflate_add_bits(os, c->precode_codewords[precode_sym],
1439
+ c->precode_lens[precode_sym]);
1440
+ if (precode_sym >= 16) {
1441
+ if (precode_sym == 16)
1442
+ deflate_add_bits(os, precode_item >> 5, 2);
1443
+ else if (precode_sym == 17)
1444
+ deflate_add_bits(os, precode_item >> 5, 3);
1445
+ else
1446
+ deflate_add_bits(os, precode_item >> 5, 7);
1447
+ }
1448
+ STATIC_ASSERT(CAN_BUFFER(DEFLATE_MAX_PRE_CODEWORD_LEN + 7));
1449
+ deflate_flush_bits(os);
1450
+ }
1451
+ }
1452
+
1453
+ static void
1454
+ deflate_write_sequences(struct deflate_output_bitstream * restrict os,
1455
+ const struct deflate_codes * restrict codes,
1456
+ const struct deflate_sequence sequences[restrict],
1457
+ const u8 * restrict in_next)
1458
+ {
1459
+ const struct deflate_sequence *seq = sequences;
1460
+
1461
+ for (;;) {
1462
+ u32 litrunlen = seq->litrunlen_and_length & 0x7FFFFF;
1463
+ unsigned length = seq->litrunlen_and_length >> 23;
1464
+ unsigned length_slot;
1465
+ unsigned litlen_symbol;
1466
+ unsigned offset_symbol;
1467
+
1468
+ if (litrunlen) {
1469
+ #if 1
1470
+ while (litrunlen >= 4) {
1471
+ unsigned lit0 = in_next[0];
1472
+ unsigned lit1 = in_next[1];
1473
+ unsigned lit2 = in_next[2];
1474
+ unsigned lit3 = in_next[3];
1475
+
1476
+ deflate_add_bits(os, codes->codewords.litlen[lit0],
1477
+ codes->lens.litlen[lit0]);
1478
+ if (!CAN_BUFFER(2 * MAX_LITLEN_CODEWORD_LEN))
1479
+ deflate_flush_bits(os);
1480
+
1481
+ deflate_add_bits(os, codes->codewords.litlen[lit1],
1482
+ codes->lens.litlen[lit1]);
1483
+ if (!CAN_BUFFER(4 * MAX_LITLEN_CODEWORD_LEN))
1484
+ deflate_flush_bits(os);
1485
+
1486
+ deflate_add_bits(os, codes->codewords.litlen[lit2],
1487
+ codes->lens.litlen[lit2]);
1488
+ if (!CAN_BUFFER(2 * MAX_LITLEN_CODEWORD_LEN))
1489
+ deflate_flush_bits(os);
1490
+
1491
+ deflate_add_bits(os, codes->codewords.litlen[lit3],
1492
+ codes->lens.litlen[lit3]);
1493
+ deflate_flush_bits(os);
1494
+ in_next += 4;
1495
+ litrunlen -= 4;
1496
+ }
1497
+ if (litrunlen-- != 0) {
1498
+ deflate_add_bits(os, codes->codewords.litlen[*in_next],
1499
+ codes->lens.litlen[*in_next]);
1500
+ if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
1501
+ deflate_flush_bits(os);
1502
+ in_next++;
1503
+ if (litrunlen-- != 0) {
1504
+ deflate_add_bits(os, codes->codewords.litlen[*in_next],
1505
+ codes->lens.litlen[*in_next]);
1506
+ if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
1507
+ deflate_flush_bits(os);
1508
+ in_next++;
1509
+ if (litrunlen-- != 0) {
1510
+ deflate_add_bits(os, codes->codewords.litlen[*in_next],
1511
+ codes->lens.litlen[*in_next]);
1512
+ if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
1513
+ deflate_flush_bits(os);
1514
+ in_next++;
1515
+ }
1516
+ }
1517
+ if (CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
1518
+ deflate_flush_bits(os);
1519
+ }
1520
+ #else
1521
+ do {
1522
+ unsigned lit = *in_next++;
1523
+ deflate_add_bits(os, codes->codewords.litlen[lit],
1524
+ codes->lens.litlen[lit]);
1525
+ deflate_flush_bits(os);
1526
+ } while (--litrunlen);
1527
+ #endif
1528
+ }
1529
+
1530
+ if (length == 0)
1531
+ return;
1532
+
1533
+ in_next += length;
1534
+
1535
+ length_slot = seq->length_slot;
1536
+ litlen_symbol = 257 + length_slot;
1537
+
1538
+ /* Litlen symbol */
1539
+ deflate_add_bits(os, codes->codewords.litlen[litlen_symbol],
1540
+ codes->lens.litlen[litlen_symbol]);
1541
+
1542
+ /* Extra length bits */
1543
+ STATIC_ASSERT(CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +
1544
+ DEFLATE_MAX_EXTRA_LENGTH_BITS));
1545
+ deflate_add_bits(os, length - deflate_length_slot_base[length_slot],
1546
+ deflate_extra_length_bits[length_slot]);
1547
+
1548
+ if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +
1549
+ DEFLATE_MAX_EXTRA_LENGTH_BITS +
1550
+ MAX_OFFSET_CODEWORD_LEN +
1551
+ DEFLATE_MAX_EXTRA_OFFSET_BITS))
1552
+ deflate_flush_bits(os);
1553
+
1554
+ /* Offset symbol */
1555
+ offset_symbol = seq->offset_symbol;
1556
+ deflate_add_bits(os, codes->codewords.offset[offset_symbol],
1557
+ codes->lens.offset[offset_symbol]);
1558
+
1559
+ if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN +
1560
+ DEFLATE_MAX_EXTRA_OFFSET_BITS))
1561
+ deflate_flush_bits(os);
1562
+
1563
+ /* Extra offset bits */
1564
+ deflate_add_bits(os, seq->offset - deflate_offset_slot_base[offset_symbol],
1565
+ deflate_extra_offset_bits[offset_symbol]);
1566
+
1567
+ deflate_flush_bits(os);
1568
+
1569
+ seq++;
1570
+ }
1571
+ }
1572
+
1573
+ #if SUPPORT_NEAR_OPTIMAL_PARSING
1574
+ /*
1575
+ * Follow the minimum-cost path in the graph of possible match/literal choices
1576
+ * for the current block and write out the matches/literals using the specified
1577
+ * Huffman codes.
1578
+ *
1579
+ * Note: this is slightly duplicated with deflate_write_sequences(), the reason
1580
+ * being that we don't want to waste time translating between intermediate
1581
+ * match/literal representations.
1582
+ */
1583
+ static void
1584
+ deflate_write_item_list(struct deflate_output_bitstream *os,
1585
+ const struct deflate_codes *codes,
1586
+ struct libdeflate_compressor *c,
1587
+ u32 block_length)
1588
+ {
1589
+ struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0];
1590
+ struct deflate_optimum_node * const end_node = &c->p.n.optimum_nodes[block_length];
1591
+ do {
1592
+ unsigned length = cur_node->item & OPTIMUM_LEN_MASK;
1593
+ unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
1594
+ unsigned litlen_symbol;
1595
+ unsigned length_slot;
1596
+ unsigned offset_slot;
1597
+
1598
+ if (length == 1) {
1599
+ /* Literal */
1600
+ litlen_symbol = offset;
1601
+ deflate_add_bits(os, codes->codewords.litlen[litlen_symbol],
1602
+ codes->lens.litlen[litlen_symbol]);
1603
+ deflate_flush_bits(os);
1604
+ } else {
1605
+ /* Match length */
1606
+ length_slot = deflate_length_slot[length];
1607
+ litlen_symbol = 257 + length_slot;
1608
+ deflate_add_bits(os, codes->codewords.litlen[litlen_symbol],
1609
+ codes->lens.litlen[litlen_symbol]);
1610
+
1611
+ deflate_add_bits(os, length - deflate_length_slot_base[length_slot],
1612
+ deflate_extra_length_bits[length_slot]);
1613
+
1614
+ if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +
1615
+ DEFLATE_MAX_EXTRA_LENGTH_BITS +
1616
+ MAX_OFFSET_CODEWORD_LEN +
1617
+ DEFLATE_MAX_EXTRA_OFFSET_BITS))
1618
+ deflate_flush_bits(os);
1619
+
1620
+
1621
+ /* Match offset */
1622
+ offset_slot = deflate_get_offset_slot(c, offset);
1623
+ deflate_add_bits(os, codes->codewords.offset[offset_slot],
1624
+ codes->lens.offset[offset_slot]);
1625
+
1626
+ if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN +
1627
+ DEFLATE_MAX_EXTRA_OFFSET_BITS))
1628
+ deflate_flush_bits(os);
1629
+
1630
+ deflate_add_bits(os, offset - deflate_offset_slot_base[offset_slot],
1631
+ deflate_extra_offset_bits[offset_slot]);
1632
+
1633
+ deflate_flush_bits(os);
1634
+ }
1635
+ cur_node += length;
1636
+ } while (cur_node != end_node);
1637
+ }
1638
+ #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
1639
+
1640
+ /* Output the end-of-block symbol. */
1641
+ static void
1642
+ deflate_write_end_of_block(struct deflate_output_bitstream *os,
1643
+ const struct deflate_codes *codes)
1644
+ {
1645
+ deflate_add_bits(os, codes->codewords.litlen[DEFLATE_END_OF_BLOCK],
1646
+ codes->lens.litlen[DEFLATE_END_OF_BLOCK]);
1647
+ deflate_flush_bits(os);
1648
+ }
1649
+
1650
+ static void
1651
+ deflate_write_uncompressed_block(struct deflate_output_bitstream *os,
1652
+ const u8 *data, u16 len,
1653
+ bool is_final_block)
1654
+ {
1655
+ deflate_write_block_header(os, is_final_block,
1656
+ DEFLATE_BLOCKTYPE_UNCOMPRESSED);
1657
+ deflate_align_bitstream(os);
1658
+
1659
+ if (4 + (u32)len >= os->end - os->next) {
1660
+ os->next = os->end;
1661
+ return;
1662
+ }
1663
+
1664
+ put_unaligned_le16(len, os->next);
1665
+ os->next += 2;
1666
+ put_unaligned_le16(~len, os->next);
1667
+ os->next += 2;
1668
+ memcpy(os->next, data, len);
1669
+ os->next += len;
1670
+ }
1671
+
1672
+ static void
1673
+ deflate_write_uncompressed_blocks(struct deflate_output_bitstream *os,
1674
+ const u8 *data, u32 data_length,
1675
+ bool is_final_block)
1676
+ {
1677
+ do {
1678
+ u16 len = MIN(data_length, UINT16_MAX);
1679
+
1680
+ deflate_write_uncompressed_block(os, data, len,
1681
+ is_final_block && len == data_length);
1682
+ data += len;
1683
+ data_length -= len;
1684
+ } while (data_length != 0);
1685
+ }
1686
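deflate_write_uncompressed_blocks() chops the data into stored blocks of at most UINT16_MAX bytes, and only the last piece carries the final-block flag; per the cost formula used later in deflate_flush_block(), each extra stored block costs roughly 5 bytes of header/LEN/NLEN overhead. A small editor's sketch of the split for a hypothetical 100,000-byte input:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t remaining = 100000;

	while (remaining != 0) {
		uint32_t len = remaining < UINT16_MAX ? remaining : UINT16_MAX;

		/* Prints "65535 bytes" then "34465 bytes (final)". */
		printf("%u bytes%s\n", len,
		       len == remaining ? " (final)" : "");
		remaining -= len;
	}
	return 0;
}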
+
1687
+ /*
1688
+ * Choose the best type of block to use (dynamic Huffman, static Huffman, or
1689
+ * uncompressed), then output it.
1690
+ */
1691
+ static void
1692
+ deflate_flush_block(struct libdeflate_compressor * restrict c,
1693
+ struct deflate_output_bitstream * restrict os,
1694
+ const u8 * restrict block_begin, u32 block_length,
1695
+ bool is_final_block, bool use_item_list)
1696
+ {
1697
+ static const u8 deflate_extra_precode_bits[DEFLATE_NUM_PRECODE_SYMS] = {
1698
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 7,
1699
+ };
1700
+
1701
+ /* Costs are measured in bits */
1702
+ u32 dynamic_cost = 0;
1703
+ u32 static_cost = 0;
1704
+ u32 uncompressed_cost = 0;
1705
+ struct deflate_codes *codes;
1706
+ int block_type;
1707
+ unsigned sym;
1708
+
1709
+ /* Tally the end-of-block symbol. */
1710
+ c->freqs.litlen[DEFLATE_END_OF_BLOCK]++;
1711
+
1712
+ /* Build dynamic Huffman codes. */
1713
+ deflate_make_huffman_codes(&c->freqs, &c->codes);
1714
+
1715
+ /* Account for the cost of sending dynamic Huffman codes. */
1716
+ deflate_precompute_huffman_header(c);
1717
+ dynamic_cost += 5 + 5 + 4 + (3 * c->num_explicit_lens);
1718
+ for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) {
1719
+ u32 extra = deflate_extra_precode_bits[sym];
1720
+ dynamic_cost += c->precode_freqs[sym] *
1721
+ (extra + c->precode_lens[sym]);
1722
+ }
1723
+
1724
+ /* Account for the cost of encoding literals. */
1725
+ for (sym = 0; sym < 256; sym++) {
1726
+ dynamic_cost += c->freqs.litlen[sym] *
1727
+ c->codes.lens.litlen[sym];
1728
+ }
1729
+ for (sym = 0; sym < 144; sym++)
1730
+ static_cost += c->freqs.litlen[sym] * 8;
1731
+ for (; sym < 256; sym++)
1732
+ static_cost += c->freqs.litlen[sym] * 9;
1733
+
1734
+ /* Account for the cost of encoding the end-of-block symbol. */
1735
+ dynamic_cost += c->codes.lens.litlen[256];
1736
+ static_cost += 7;
1737
+
1738
+ /* Account for the cost of encoding lengths. */
1739
+ for (sym = 257; sym < 257 + ARRAY_LEN(deflate_extra_length_bits); sym++) {
1740
+ u32 extra = deflate_extra_length_bits[sym - 257];
1741
+ dynamic_cost += c->freqs.litlen[sym] *
1742
+ (extra + c->codes.lens.litlen[sym]);
1743
+ static_cost += c->freqs.litlen[sym] *
1744
+ (extra + c->static_codes.lens.litlen[sym]);
1745
+ }
1746
+
1747
+ /* Account for the cost of encoding offsets. */
1748
+ for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) {
1749
+ u32 extra = deflate_extra_offset_bits[sym];
1750
+ dynamic_cost += c->freqs.offset[sym] *
1751
+ (extra + c->codes.lens.offset[sym]);
1752
+ static_cost += c->freqs.offset[sym] * (extra + 5);
1753
+ }
1754
+
1755
+ /* Compute the cost of using uncompressed blocks. */
1756
+ uncompressed_cost += (-(os->bitcount + 3) & 7) + 32 +
1757
+ (40 * (DIV_ROUND_UP(block_length,
1758
+ UINT16_MAX) - 1)) +
1759
+ (8 * block_length);
1760
+
1761
+ /* Choose the cheapest block type. */
1762
+ if (dynamic_cost < MIN(static_cost, uncompressed_cost)) {
1763
+ block_type = DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN;
1764
+ codes = &c->codes;
1765
+ } else if (static_cost < uncompressed_cost) {
1766
+ block_type = DEFLATE_BLOCKTYPE_STATIC_HUFFMAN;
1767
+ codes = &c->static_codes;
1768
+ } else {
1769
+ block_type = DEFLATE_BLOCKTYPE_UNCOMPRESSED;
1770
+ }
1771
+
1772
+ /* Now actually output the block. */
1773
+
1774
+ if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) {
1775
+ /* Note: the length being flushed may exceed the maximum length
1776
+ * of an uncompressed block (65535 bytes). Therefore, more than
1777
+ * one uncompressed block might be needed. */
1778
+ deflate_write_uncompressed_blocks(os, block_begin, block_length,
1779
+ is_final_block);
1780
+ } else {
1781
+ /* Output the block header. */
1782
+ deflate_write_block_header(os, is_final_block, block_type);
1783
+
1784
+ /* Output the Huffman codes (dynamic Huffman blocks only). */
1785
+ if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN)
1786
+ deflate_write_huffman_header(c, os);
1787
+
1788
+ /* Output the literals, matches, and end-of-block symbol. */
1789
+ #if SUPPORT_NEAR_OPTIMAL_PARSING
1790
+ if (use_item_list)
1791
+ deflate_write_item_list(os, codes, c, block_length);
1792
+ else
1793
+ #endif
1794
+ deflate_write_sequences(os, codes, c->p.g.sequences,
1795
+ block_begin);
1796
+ deflate_write_end_of_block(os, codes);
1797
+ }
1798
+ }
1799
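To make the uncompressed-cost estimate in deflate_flush_block() concrete, here is an editor's worked computation, assuming the output bit buffer happens to hold bitcount = 0 pending bits: a 100,000-byte block costs the bits needed to pad past the 3-bit block header, 32 bits of LEN/NLEN, 40 bits for the one extra stored block required, and 8 bits per byte.

#include <stdio.h>
#include <stdint.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	uint32_t bitcount = 0;		/* assumed bit-buffer state */
	uint32_t block_length = 100000;
	uint32_t cost = (-(bitcount + 3) & 7) + 32 +
			(40 * (DIV_ROUND_UP(block_length, UINT16_MAX) - 1)) +
			(8 * block_length);

	printf("%u\n", cost);	/* 5 + 32 + 40 + 800000 = 800077 bits */
	return 0;
}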
+
1800
+ static forceinline void
1801
+ deflate_choose_literal(struct libdeflate_compressor *c, unsigned literal,
1802
+ u32 *litrunlen_p)
1803
+ {
1804
+ c->freqs.litlen[literal]++;
1805
+ ++*litrunlen_p;
1806
+ }
1807
+
1808
+ static forceinline void
1809
+ deflate_choose_match(struct libdeflate_compressor *c,
1810
+ unsigned length, unsigned offset,
1811
+ u32 *litrunlen_p, struct deflate_sequence **next_seq_p)
1812
+ {
1813
+ struct deflate_sequence *seq = *next_seq_p;
1814
+ unsigned length_slot = deflate_length_slot[length];
1815
+ unsigned offset_slot = deflate_get_offset_slot(c, offset);
1816
+
1817
+ c->freqs.litlen[257 + length_slot]++;
1818
+ c->freqs.offset[offset_slot]++;
1819
+
1820
+ seq->litrunlen_and_length = ((u32)length << 23) | *litrunlen_p;
1821
+ seq->offset = offset;
1822
+ seq->length_slot = length_slot;
1823
+ seq->offset_symbol = offset_slot;
1824
+
1825
+ *litrunlen_p = 0;
1826
+ *next_seq_p = seq + 1;
1827
+ }
1828
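deflate_choose_match() packs the preceding literal-run length and the match length into a single 32-bit field, 23 bits for the run and the rest for the length; deflate_write_sequences() unpacks them with the mask 0x7FFFFF and a shift by 23. A round-trip sketch (editor's example with arbitrary values; 258 is DEFLATE's maximum match length):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t litrunlen = 1000;	/* literals preceding the match */
	uint32_t length = 258;		/* maximum DEFLATE match length */
	uint32_t packed = (length << 23) | litrunlen;

	/* Prints "1000 258": the fields unpack exactly as they were packed. */
	printf("%u %u\n", packed & 0x7FFFFF, packed >> 23);
	return 0;
}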
+
1829
+ static forceinline void
1830
+ deflate_finish_sequence(struct deflate_sequence *seq, u32 litrunlen)
1831
+ {
1832
+ seq->litrunlen_and_length = litrunlen; /* length = 0 */
1833
+ }
1834
+
1835
+ /******************************************************************************/
1836
+
1837
+ /*
1838
+ * Block splitting algorithm. The problem is to decide when it is worthwhile to
1839
+ * start a new block with new Huffman codes. There is a theoretically optimal
1840
+ * solution: recursively consider every possible block split, considering the
1841
+ * exact cost of each block, and choose the minimum cost approach. But this is
1842
+ * far too slow. Instead, as an approximation, we can count symbols and after
1843
+ * every N symbols, compare the expected distribution of symbols based on the
1844
+ * previous data with the actual distribution. If they differ "by enough", then
1845
+ * start a new block.
1846
+ *
1847
+ * As an optimization and heuristic, we don't distinguish between every symbol
1848
+ * but rather we combine many symbols into a single "observation type". For
1849
+ * literals we only look at the high bits and low bits, and for matches we only
1850
+ * look at whether the match is long or not. The assumption is that for typical
1851
+ * "real" data, places that are good block boundaries will tend to be noticeable
1852
+ * based only on changes in these aggregate frequencies, without looking for
1853
+ * subtle differences in individual symbols. For example, a change from ASCII
1854
+ * bytes to non-ASCII bytes, or from few matches (generally less compressible)
1855
+ * to many matches (generally more compressible), would be easily noticed based
1856
+ * on the aggregates.
1857
+ *
1858
+ * For determining whether the frequency distributions are "different enough" to
1859
+ * start a new block, the simple heuristic of splitting when the sum of absolute
1860
+ * differences exceeds a constant seems to be good enough. We also add a number
1861
+ * proportional to the block length so that the algorithm is more likely to end
1862
+ * long blocks than short blocks. This reflects the general expectation that it
1863
+ * will become increasingly beneficial to start a new block as the current
1864
+ * block grows longer.
1865
+ *
1866
+ * Finally, for an approximation, it is not strictly necessary that the exact
1867
+ * symbols being used are considered. With "near-optimal parsing", for example,
1868
+ * the actual symbols that will be used are unknown until after the block
1869
+ * boundary is chosen and the block has been optimized. Since the final choices
1870
+ * cannot be used, we can use preliminary "greedy" choices instead.
1871
+ */
1872
+
1873
+ /* Initialize the block split statistics when starting a new block. */
1874
+ static void
1875
+ init_block_split_stats(struct block_split_stats *stats)
1876
+ {
1877
+ int i;
1878
+
1879
+ for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
1880
+ stats->new_observations[i] = 0;
1881
+ stats->observations[i] = 0;
1882
+ }
1883
+ stats->num_new_observations = 0;
1884
+ stats->num_observations = 0;
1885
+ }
1886
+
1887
+ /* Literal observation. Heuristic: use the top 2 bits and low 1 bit of the
1888
+ * literal, for 8 possible literal observation types. */
1889
+ static forceinline void
1890
+ observe_literal(struct block_split_stats *stats, u8 lit)
1891
+ {
1892
+ stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++;
1893
+ stats->num_new_observations++;
1894
+ }
1895
+
1896
+ /* Match observation. Heuristic: use one observation type for "short match" and
1897
+ * one observation type for "long match". */
1898
+ static forceinline void
1899
+ observe_match(struct block_split_stats *stats, unsigned length)
1900
+ {
1901
+ stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + (length >= 9)]++;
1902
+ stats->num_new_observations++;
1903
+ }
1904
+
1905
+ static bool
1906
+ do_end_block_check(struct block_split_stats *stats, u32 block_length)
1907
+ {
1908
+ int i;
1909
+
1910
+ if (stats->num_observations > 0) {
1911
+
1912
+ /* Note: to avoid slow divisions, we do not divide by
1913
+ * 'num_observations', but rather do all math with the numbers
1914
+ * multiplied by 'num_observations'. */
1915
+ u32 total_delta = 0;
1916
+ for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
1917
+ u32 expected = stats->observations[i] * stats->num_new_observations;
1918
+ u32 actual = stats->new_observations[i] * stats->num_observations;
1919
+ u32 delta = (actual > expected) ? actual - expected :
1920
+ expected - actual;
1921
+ total_delta += delta;
1922
+ }
1923
+
1924
+ /* Ready to end the block? */
1925
+ if (total_delta + (block_length / 4096) * stats->num_observations >=
1926
+ NUM_OBSERVATIONS_PER_BLOCK_CHECK * 200 / 512 * stats->num_observations)
1927
+ return true;
1928
+ }
1929
+
1930
+ for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
1931
+ stats->num_observations += stats->new_observations[i];
1932
+ stats->observations[i] += stats->new_observations[i];
1933
+ stats->new_observations[i] = 0;
1934
+ }
1935
+ stats->num_new_observations = 0;
1936
+ return false;
1937
+ }
1938
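A numeric illustration of the check in do_end_block_check() (an editor's sketch, simplified to two observation types instead of the full set; NUM_OBSERVATIONS_PER_BLOCK_CHECK is assumed to be 512 here, the real constant is defined elsewhere in this file): when the recent batch's distribution shifts sharply away from the block's earlier distribution, the scaled sum of absolute differences exceeds the threshold and the block is ended.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint32_t check_interval = 512;		/* assumed value */
	uint32_t observations[2]     = {300, 212};	/* earlier part of the block */
	uint32_t new_observations[2] = {150, 362};	/* most recent batch */
	uint32_t num_obs = 512, num_new = 512, block_length = 8192;
	uint32_t total_delta = 0;
	int i;

	for (i = 0; i < 2; i++) {
		uint32_t expected = observations[i] * num_new;
		uint32_t actual = new_observations[i] * num_obs;
		total_delta += (actual > expected) ? actual - expected
						   : expected - actual;
	}

	/* 153600 + 1024 >= 102400, so the distribution shift ends the block. */
	printf("%s\n", total_delta + (block_length / 4096) * num_obs >=
		       check_interval * 200 / 512 * num_obs ? "split" : "keep going");
	return 0;
}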
+
1939
+ static forceinline bool
1940
+ should_end_block(struct block_split_stats *stats,
1941
+ const u8 *in_block_begin, const u8 *in_next, const u8 *in_end)
1942
+ {
1943
+ /* Ready to check block split statistics? */
1944
+ if (stats->num_new_observations < NUM_OBSERVATIONS_PER_BLOCK_CHECK ||
1945
+ in_next - in_block_begin < MIN_BLOCK_LENGTH ||
1946
+ in_end - in_next < MIN_BLOCK_LENGTH)
1947
+ return false;
1948
+
1949
+ return do_end_block_check(stats, in_next - in_block_begin);
1950
+ }
1951
+
1952
+ /******************************************************************************/
1953
+
1954
+ /*
1955
+ * This is the "greedy" DEFLATE compressor. It always chooses the longest match.
1956
+ */
1957
+ static size_t
1958
+ deflate_compress_greedy(struct libdeflate_compressor * restrict c,
1959
+ const u8 * restrict in, size_t in_nbytes,
1960
+ u8 * restrict out, size_t out_nbytes_avail)
1961
+ {
1962
+ const u8 *in_next = in;
1963
+ const u8 *in_end = in_next + in_nbytes;
1964
+ struct deflate_output_bitstream os;
1965
+ const u8 *in_cur_base = in_next;
1966
+ unsigned max_len = DEFLATE_MAX_MATCH_LEN;
1967
+ unsigned nice_len = MIN(c->nice_match_length, max_len);
1968
+ u32 next_hashes[2] = {0, 0};
1969
+
1970
+ deflate_init_output(&os, out, out_nbytes_avail);
1971
+ hc_matchfinder_init(&c->p.g.hc_mf);
1972
+
1973
+ do {
1974
+ /* Starting a new DEFLATE block. */
1975
+
1976
+ const u8 * const in_block_begin = in_next;
1977
+ const u8 * const in_max_block_end =
1978
+ in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
1979
+ u32 litrunlen = 0;
1980
+ struct deflate_sequence *next_seq = c->p.g.sequences;
1981
+
1982
+ init_block_split_stats(&c->split_stats);
1983
+ deflate_reset_symbol_frequencies(c);
1984
+
1985
+ do {
1986
+ u32 length;
1987
+ u32 offset;
1988
+
1989
+ /* Decrease the maximum and nice match lengths if we're
1990
+ * approaching the end of the input buffer. */
1991
+ if (unlikely(max_len > in_end - in_next)) {
1992
+ max_len = in_end - in_next;
1993
+ nice_len = MIN(nice_len, max_len);
1994
+ }
1995
+
1996
+ length = hc_matchfinder_longest_match(&c->p.g.hc_mf,
1997
+ &in_cur_base,
1998
+ in_next,
1999
+ DEFLATE_MIN_MATCH_LEN - 1,
2000
+ max_len,
2001
+ nice_len,
2002
+ c->max_search_depth,
2003
+ next_hashes,
2004
+ &offset);
2005
+
2006
+ if (length >= DEFLATE_MIN_MATCH_LEN) {
2007
+ /* Match found. */
2008
+ deflate_choose_match(c, length, offset,
2009
+ &litrunlen, &next_seq);
2010
+ observe_match(&c->split_stats, length);
2011
+ in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf,
2012
+ &in_cur_base,
2013
+ in_next + 1,
2014
+ in_end,
2015
+ length - 1,
2016
+ next_hashes);
2017
+ } else {
2018
+ /* No match found. */
2019
+ deflate_choose_literal(c, *in_next, &litrunlen);
2020
+ observe_literal(&c->split_stats, *in_next);
2021
+ in_next++;
2022
+ }
2023
+
2024
+ /* Check if it's time to output another block. */
2025
+ } while (in_next < in_max_block_end &&
2026
+ !should_end_block(&c->split_stats, in_block_begin, in_next, in_end));
2027
+
2028
+ deflate_finish_sequence(next_seq, litrunlen);
2029
+ deflate_flush_block(c, &os, in_block_begin,
2030
+ in_next - in_block_begin,
2031
+ in_next == in_end, false);
2032
+ } while (in_next != in_end);
2033
+
2034
+ return deflate_flush_output(&os);
2035
+ }
2036
+
2037
+ /*
2038
+ * This is the "lazy" DEFLATE compressor. Before choosing a match, it checks to
2039
+ * see if there's a longer match at the next position. If yes, it outputs a
2040
+ * literal and continues to the next position. If no, it outputs the match.
2041
+ */
2042
+ static size_t
2043
+ deflate_compress_lazy(struct libdeflate_compressor * restrict c,
2044
+ const u8 * restrict in, size_t in_nbytes,
2045
+ u8 * restrict out, size_t out_nbytes_avail)
2046
+ {
2047
+ const u8 *in_next = in;
2048
+ const u8 *in_end = in_next + in_nbytes;
2049
+ struct deflate_output_bitstream os;
2050
+ const u8 *in_cur_base = in_next;
2051
+ unsigned max_len = DEFLATE_MAX_MATCH_LEN;
2052
+ unsigned nice_len = MIN(c->nice_match_length, max_len);
2053
+ u32 next_hashes[2] = {0, 0};
2054
+
2055
+ deflate_init_output(&os, out, out_nbytes_avail);
2056
+ hc_matchfinder_init(&c->p.g.hc_mf);
2057
+
2058
+ do {
2059
+ /* Starting a new DEFLATE block. */
2060
+
2061
+ const u8 * const in_block_begin = in_next;
2062
+ const u8 * const in_max_block_end =
2063
+ in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
2064
+ u32 litrunlen = 0;
2065
+ struct deflate_sequence *next_seq = c->p.g.sequences;
2066
+
2067
+ init_block_split_stats(&c->split_stats);
2068
+ deflate_reset_symbol_frequencies(c);
2069
+
2070
+ do {
2071
+ unsigned cur_len;
2072
+ unsigned cur_offset;
2073
+ unsigned next_len;
2074
+ unsigned next_offset;
2075
+
2076
+ if (unlikely(in_end - in_next < DEFLATE_MAX_MATCH_LEN)) {
2077
+ max_len = in_end - in_next;
2078
+ nice_len = MIN(nice_len, max_len);
2079
+ }
2080
+
2081
+ /* Find the longest match at the current position. */
2082
+ cur_len = hc_matchfinder_longest_match(&c->p.g.hc_mf,
2083
+ &in_cur_base,
2084
+ in_next,
2085
+ DEFLATE_MIN_MATCH_LEN - 1,
2086
+ max_len,
2087
+ nice_len,
2088
+ c->max_search_depth,
2089
+ next_hashes,
2090
+ &cur_offset);
2091
+ in_next += 1;
2092
+
2093
+ if (cur_len < DEFLATE_MIN_MATCH_LEN) {
2094
+ /* No match found. Choose a literal. */
2095
+ deflate_choose_literal(c, *(in_next - 1), &litrunlen);
2096
+ observe_literal(&c->split_stats, *(in_next - 1));
2097
+ continue;
2098
+ }
2099
+
2100
+ have_cur_match:
2101
+ observe_match(&c->split_stats, cur_len);
2102
+
2103
+ /* We have a match at the current position. */
2104
+
2105
+ /* If the current match is very long, choose it
2106
+ * immediately. */
2107
+ if (cur_len >= nice_len) {
2108
+ deflate_choose_match(c, cur_len, cur_offset,
2109
+ &litrunlen, &next_seq);
2110
+ in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf,
2111
+ &in_cur_base,
2112
+ in_next,
2113
+ in_end,
2114
+ cur_len - 1,
2115
+ next_hashes);
2116
+ continue;
2117
+ }
2118
+
2119
+ /*
2120
+ * Try to find a match at the next position.
2121
+ *
2122
+ * Note: since we already have a match at the *current*
2123
+ * position, we use only half the 'max_search_depth'
2124
+ * when checking the *next* position. This is a useful
2125
+ * trade-off because it's more worthwhile to use a
2126
+ * greater search depth on the initial match.
2127
+ *
2128
+ * Note: it's possible to structure the code such that
2129
+ * there's only one call to longest_match(), which
2130
+ * handles both the "find the initial match" and "try to
2131
+ * find a longer match" cases. However, it is faster to
2132
+ * have two call sites, with longest_match() inlined at
2133
+ * each.
2134
+ */
2135
+ if (unlikely(in_end - in_next < DEFLATE_MAX_MATCH_LEN)) {
2136
+ max_len = in_end - in_next;
2137
+ nice_len = MIN(nice_len, max_len);
2138
+ }
2139
+ next_len = hc_matchfinder_longest_match(&c->p.g.hc_mf,
2140
+ &in_cur_base,
2141
+ in_next,
2142
+ cur_len,
2143
+ max_len,
2144
+ nice_len,
2145
+ c->max_search_depth / 2,
2146
+ next_hashes,
2147
+ &next_offset);
2148
+ in_next += 1;
2149
+
2150
+ if (next_len > cur_len) {
2151
+ /* Found a longer match at the next position.
2152
+ * Output a literal. Then the next match
2153
+ * becomes the current match. */
2154
+ deflate_choose_literal(c, *(in_next - 2), &litrunlen);
2155
+ cur_len = next_len;
2156
+ cur_offset = next_offset;
2157
+ goto have_cur_match;
2158
+ }
2159
+
2160
+ /* No longer match at the next position.
2161
+ * Output the current match. */
2162
+ deflate_choose_match(c, cur_len, cur_offset,
2163
+ &litrunlen, &next_seq);
2164
+ in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf,
2165
+ &in_cur_base,
2166
+ in_next,
2167
+ in_end,
2168
+ cur_len - 2,
2169
+ next_hashes);
2170
+
2171
+ /* Check if it's time to output another block. */
2172
+ } while (in_next < in_max_block_end &&
2173
+ !should_end_block(&c->split_stats, in_block_begin, in_next, in_end));
2174
+
2175
+ deflate_finish_sequence(next_seq, litrunlen);
2176
+ deflate_flush_block(c, &os, in_block_begin,
2177
+ in_next - in_block_begin,
2178
+ in_next == in_end, false);
2179
+ } while (in_next != in_end);
2180
+
2181
+ return deflate_flush_output(&os);
2182
+ }
2183
+
2184
+ #if SUPPORT_NEAR_OPTIMAL_PARSING
2185
+
2186
+ /*
2187
+ * Follow the minimum-cost path in the graph of possible match/literal choices
2188
+ * for the current block and compute the frequencies of the Huffman symbols that
2189
+ * would be needed to output those matches and literals.
2190
+ */
2191
+ static void
2192
+ deflate_tally_item_list(struct libdeflate_compressor *c, u32 block_length)
2193
+ {
2194
+ struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0];
2195
+ struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length];
2196
+ do {
2197
+ unsigned length = cur_node->item & OPTIMUM_LEN_MASK;
2198
+ unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
2199
+
2200
+ if (length == 1) {
2201
+ /* Literal */
2202
+ c->freqs.litlen[offset]++;
2203
+ } else {
2204
+ /* Match */
2205
+ c->freqs.litlen[257 + deflate_length_slot[length]]++;
2206
+ c->freqs.offset[deflate_get_offset_slot(c, offset)]++;
2207
+ }
2208
+ cur_node += length;
2209
+ } while (cur_node != end_node);
2210
+ }
2211
+
2212
+ /* Set the current cost model from the codeword lengths specified in @lens. */
2213
+ static void
2214
+ deflate_set_costs_from_codes(struct libdeflate_compressor *c,
2215
+ const struct deflate_lens *lens)
2216
+ {
2217
+ unsigned i;
2218
+
2219
+ /* Literals */
2220
+ for (i = 0; i < DEFLATE_NUM_LITERALS; i++) {
2221
+ u32 bits = (lens->litlen[i] ? lens->litlen[i] : LITERAL_NOSTAT_BITS);
2222
+ c->p.n.costs.literal[i] = bits << COST_SHIFT;
2223
+ }
2224
+
2225
+ /* Lengths */
2226
+ for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) {
2227
+ unsigned length_slot = deflate_length_slot[i];
2228
+ unsigned litlen_sym = 257 + length_slot;
2229
+ u32 bits = (lens->litlen[litlen_sym] ? lens->litlen[litlen_sym] : LENGTH_NOSTAT_BITS);
2230
+ bits += deflate_extra_length_bits[length_slot];
2231
+ c->p.n.costs.length[i] = bits << COST_SHIFT;
2232
+ }
2233
+
2234
+ /* Offset slots */
2235
+ for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) {
2236
+ u32 bits = (lens->offset[i] ? lens->offset[i] : OFFSET_NOSTAT_BITS);
2237
+ bits += deflate_extra_offset_bits[i];
2238
+ c->p.n.costs.offset_slot[i] = bits << COST_SHIFT;
2239
+ }
2240
+ }
2241
+
2242
+ static forceinline u32
2243
+ deflate_default_literal_cost(unsigned literal)
2244
+ {
2245
+ STATIC_ASSERT(COST_SHIFT == 3);
2246
+ /* 66 is 8.25 bits/symbol */
2247
+ return 66;
2248
+ }
2249
+
2250
+ static forceinline u32
2251
+ deflate_default_length_slot_cost(unsigned length_slot)
2252
+ {
2253
+ STATIC_ASSERT(COST_SHIFT == 3);
2254
+ /* 60 is 7.5 bits/symbol */
2255
+ return 60 + ((u32)deflate_extra_length_bits[length_slot] << COST_SHIFT);
2256
+ }
2257
+
2258
+ static forceinline u32
2259
+ deflate_default_offset_slot_cost(unsigned offset_slot)
2260
+ {
2261
+ STATIC_ASSERT(COST_SHIFT == 3);
2262
+ /* 39 is 4.875 bits/symbol */
2263
+ return 39 + ((u32)deflate_extra_offset_bits[offset_slot] << COST_SHIFT);
2264
+ }
2265
+
2266
+ /*
2267
+ * Set default symbol costs for the first block's first optimization pass.
2268
+ *
2269
+ * It works well to assume that each symbol is equally probable. This results
2270
+ * in each symbol being assigned a cost of (-log2(1.0/num_syms) * (1 <<
2271
+ * COST_SHIFT)) where 'num_syms' is the number of symbols in the corresponding
2272
+ * alphabet. However, we intentionally bias the parse towards matches rather
2273
+ * than literals by using a slightly lower default cost for length symbols than
2274
+ * for literals. This often improves the compression ratio slightly.
2275
+ */
2276
+ static void
2277
+ deflate_set_default_costs(struct libdeflate_compressor *c)
2278
+ {
2279
+ unsigned i;
2280
+
2281
+ /* Literals */
2282
+ for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
2283
+ c->p.n.costs.literal[i] = deflate_default_literal_cost(i);
2284
+
2285
+ /* Lengths */
2286
+ for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++)
2287
+ c->p.n.costs.length[i] = deflate_default_length_slot_cost(
2288
+ deflate_length_slot[i]);
2289
+
2290
+ /* Offset slots */
2291
+ for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++)
2292
+ c->p.n.costs.offset_slot[i] = deflate_default_offset_slot_cost(i);
2293
+ }
2294
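With COST_SHIFT == 3 (as the STATIC_ASSERTs above require), costs are stored in eighths of a bit, so the constants 66, 60, and 39 decode to 8.25, 7.5, and 4.875 bits per symbol. For comparison, a uniform distribution over the 288 litlen symbols would give about log2(288) ≈ 8.17 bits, so literals are priced slightly above that and length symbols slightly below, which is the bias toward matches described above. A quick arithmetic check (editor's sketch):

#include <stdio.h>
#include <math.h>

#define COST_SHIFT 3	/* matches the assertion in the code above */

int main(void)
{
	printf("literal: %.3f bits\n", 66.0 / (1 << COST_SHIFT));	/* 8.250 */
	printf("length : %.3f bits\n", 60.0 / (1 << COST_SHIFT));	/* 7.500 */
	printf("offset : %.3f bits\n", 39.0 / (1 << COST_SHIFT));	/* 4.875 */
	printf("uniform litlen: %.3f bits\n", log2(288.0));		/* ~8.170 */
	return 0;
}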
+
2295
+ static forceinline void
2296
+ deflate_adjust_cost(u32 *cost_p, u32 default_cost)
2297
+ {
2298
+ *cost_p += ((s32)default_cost - (s32)*cost_p) >> 1;
2299
+ }
2300
+
2301
+ /*
2302
+ * Adjust the costs when beginning a new block.
2303
+ *
2304
+ * Since the current costs have been optimized for the data, it's undesirable to
2305
+ * throw them away and start over with the default costs. At the same time, we
2306
+ * don't want to bias the parse by assuming that the next block will be similar
2307
+ * to the current block. As a compromise, make the costs closer to the
2308
+ * defaults, but don't simply set them to the defaults.
2309
+ */
2310
+ static void
2311
+ deflate_adjust_costs(struct libdeflate_compressor *c)
2312
+ {
2313
+ unsigned i;
2314
+
2315
+ /* Literals */
2316
+ for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
2317
+ deflate_adjust_cost(&c->p.n.costs.literal[i],
2318
+ deflate_default_literal_cost(i));
2319
+
2320
+ /* Lengths */
2321
+ for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++)
2322
+ deflate_adjust_cost(&c->p.n.costs.length[i],
2323
+ deflate_default_length_slot_cost(
2324
+ deflate_length_slot[i]));
2325
+
2326
+ /* Offset slots */
2327
+ for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++)
2328
+ deflate_adjust_cost(&c->p.n.costs.offset_slot[i],
2329
+ deflate_default_offset_slot_cost(i));
2330
+ }
2331
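deflate_adjust_cost() moves each cost halfway from its current value toward the default: new = cost + (default - cost) / 2, using an arithmetic shift. For example (editor's sketch with hypothetical values), a literal whose optimized cost was 80 (10 bits) and whose default is 66 (8.25 bits) ends up at 73 (9.125 bits):

#include <stdio.h>
#include <stdint.h>

/* Editorial copy of the update performed by deflate_adjust_cost(). */
static void adjust_cost(uint32_t *cost_p, uint32_t default_cost)
{
	*cost_p += ((int32_t)default_cost - (int32_t)*cost_p) >> 1;
}

int main(void)
{
	uint32_t cost = 80;		/* 10 bits, carried over from the previous block */

	adjust_cost(&cost, 66);		/* default literal cost, 8.25 bits */
	printf("%u\n", cost);		/* prints 73, i.e. 9.125 bits */
	return 0;
}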
+
2332
+ /*
2333
+ * Find the minimum-cost path through the graph of possible match/literal
2334
+ * choices for this block.
2335
+ *
2336
+ * We find the minimum cost path from 'c->p.n.optimum_nodes[0]', which
2337
+ * represents the node at the beginning of the block, to
2338
+ * 'c->p.n.optimum_nodes[block_length]', which represents the node at the end of
2339
+ * the block. Edge costs are evaluated using the cost model 'c->p.n.costs'.
2340
+ *
2341
+ * The algorithm works backwards, starting at the end node and proceeding
2342
+ * backwards one node at a time. At each node, the minimum cost to reach the
2343
+ * end node is computed and the match/literal choice that begins that path is
2344
+ * saved.
2345
+ */
2346
+ static void
2347
+ deflate_find_min_cost_path(struct libdeflate_compressor *c,
2348
+ const u32 block_length,
2349
+ const struct lz_match *cache_ptr)
2350
+ {
2351
+ struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length];
2352
+ struct deflate_optimum_node *cur_node = end_node;
2353
+
2354
+ cur_node->cost_to_end = 0;
2355
+ do {
2356
+ unsigned num_matches;
2357
+ unsigned literal;
2358
+ u32 best_cost_to_end;
2359
+
2360
+ cur_node--;
2361
+ cache_ptr--;
2362
+
2363
+ num_matches = cache_ptr->length;
2364
+ literal = cache_ptr->offset;
2365
+
2366
+ /* It's always possible to choose a literal. */
2367
+ best_cost_to_end = c->p.n.costs.literal[literal] +
2368
+ (cur_node + 1)->cost_to_end;
2369
+ cur_node->item = ((u32)literal << OPTIMUM_OFFSET_SHIFT) | 1;
2370
+
2371
+ /* Also consider matches if there are any. */
2372
+ if (num_matches) {
2373
+ const struct lz_match *match;
2374
+ unsigned len;
2375
+ unsigned offset;
2376
+ unsigned offset_slot;
2377
+ u32 offset_cost;
2378
+ u32 cost_to_end;
2379
+
2380
+ /*
2381
+ * Consider each length from the minimum
2382
+ * (DEFLATE_MIN_MATCH_LEN) to the length of the longest
2383
+ * match found at this position. For each length, we
2384
+ * consider only the smallest offset for which that
2385
+ * length is available. Although this is not guaranteed
2386
+ * to be optimal due to the possibility of a larger
2387
+ * offset costing less than a smaller offset to code,
2388
+ * this is a very useful heuristic.
2389
+ */
2390
+ match = cache_ptr - num_matches;
2391
+ len = DEFLATE_MIN_MATCH_LEN;
2392
+ do {
2393
+ offset = match->offset;
2394
+ offset_slot = deflate_get_offset_slot(c, offset);
2395
+ offset_cost = c->p.n.costs.offset_slot[offset_slot];
2396
+ do {
2397
+ cost_to_end = offset_cost +
2398
+ c->p.n.costs.length[len] +
2399
+ (cur_node + len)->cost_to_end;
2400
+ if (cost_to_end < best_cost_to_end) {
2401
+ best_cost_to_end = cost_to_end;
2402
+ cur_node->item = ((u32)offset << OPTIMUM_OFFSET_SHIFT) | len;
2403
+ }
2404
+ } while (++len <= match->length);
2405
+ } while (++match != cache_ptr);
2406
+ cache_ptr -= num_matches;
2407
+ }
2408
+ cur_node->cost_to_end = best_cost_to_end;
2409
+ } while (cur_node != &c->p.n.optimum_nodes[0]);
2410
+ }
2411
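The loop above evaluates, from the end of the block backwards, the recurrence cost_to_end[i] = min(literal_cost + cost_to_end[i+1], min over cached matches of offset_cost + length_cost + cost_to_end[i+len]). The toy sketch below (an editor's illustration with a made-up cost model, ignoring the match-cache layout) shows the same recurrence picking a 3-byte match over three literals:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	enum { N = 4 };			/* block length */
	uint32_t cost_to_end[N + 1];
	int i;

	cost_to_end[N] = 0;
	for (i = N - 1; i >= 0; i--) {
		/* It's always possible to code a literal (9 "bits" each here)... */
		uint32_t best = 9 + cost_to_end[i + 1];

		/* ...and at position 0 a length-3 match costing 15 is available. */
		if (i == 0 && 15 + cost_to_end[i + 3] < best)
			best = 15 + cost_to_end[i + 3];
		cost_to_end[i] = best;
	}

	/* Literals only would cost 36; taking the match gives 15 + 9 = 24. */
	printf("%u\n", cost_to_end[0]);
	return 0;
}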
+
2412
+ /*
2413
+ * Choose the literal/match sequence to use for the current block. The basic
2414
+ * algorithm finds a minimum-cost path through the block's graph of
2415
+ * literal/match choices, given a cost model. However, the cost of each symbol
2416
+ * is unknown until the Huffman codes have been built, but at the same time the
2417
+ * Huffman codes depend on the frequencies of chosen symbols. Consequently,
2418
+ * multiple passes must be used to try to approximate an optimal solution. The
2419
+ * first pass uses default costs, mixed with the costs from the previous block
2420
+ * if any. Later passes use the Huffman codeword lengths from the previous pass
2421
+ * as the costs.
2422
+ */
2423
+ static void
2424
+ deflate_optimize_block(struct libdeflate_compressor *c, u32 block_length,
2425
+ const struct lz_match *cache_ptr, bool is_first_block)
2426
+ {
2427
+ unsigned num_passes_remaining = c->p.n.num_optim_passes;
2428
+ u32 i;
2429
+
2430
+ /* Force the block to really end at the desired length, even if some
2431
+ * matches extend beyond it. */
2432
+ for (i = block_length; i <= MIN(block_length - 1 + DEFLATE_MAX_MATCH_LEN,
2433
+ ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++)
2434
+ c->p.n.optimum_nodes[i].cost_to_end = 0x80000000;
2435
+
2436
+ /* Set the initial costs. */
2437
+ if (is_first_block)
2438
+ deflate_set_default_costs(c);
2439
+ else
2440
+ deflate_adjust_costs(c);
2441
+
2442
+ for (;;) {
2443
+ /* Find the minimum cost path for this pass. */
2444
+ deflate_find_min_cost_path(c, block_length, cache_ptr);
2445
+
2446
+ /* Compute frequencies of the chosen symbols. */
2447
+ deflate_reset_symbol_frequencies(c);
2448
+ deflate_tally_item_list(c, block_length);
2449
+
2450
+ if (--num_passes_remaining == 0)
2451
+ break;
2452
+
2453
+ /* At least one optimization pass remains; update the costs. */
2454
+ deflate_make_huffman_codes(&c->freqs, &c->codes);
2455
+ deflate_set_costs_from_codes(c, &c->codes.lens);
2456
+ }
2457
+ }
2458
+
2459
+ /*
2460
+ * This is the "near-optimal" DEFLATE compressor. It computes the optimal
2461
+ * representation of each DEFLATE block using a minimum-cost path search over
2462
+ * the graph of possible match/literal choices for that block, assuming a
2463
+ * certain cost for each Huffman symbol.
2464
+ *
2465
+ * For several reasons, the end result is not guaranteed to be optimal:
2466
+ *
2467
+ * - Nonoptimal choice of blocks
2468
+ * - Heuristic limitations on which matches are actually considered
2469
+ * - Symbol costs are unknown until the symbols have already been chosen
2470
+ * (so iterative optimization must be used)
2471
+ */
2472
+ static size_t
+ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
+ const u8 * restrict in, size_t in_nbytes,
+ u8 * restrict out, size_t out_nbytes_avail)
+ {
+ const u8 *in_next = in;
+ const u8 *in_end = in_next + in_nbytes;
+ struct deflate_output_bitstream os;
+ const u8 *in_cur_base = in_next;
+ const u8 *in_next_slide = in_next + MIN(in_end - in_next, MATCHFINDER_WINDOW_SIZE);
+ unsigned max_len = DEFLATE_MAX_MATCH_LEN;
+ unsigned nice_len = MIN(c->nice_match_length, max_len);
+ u32 next_hashes[2] = {0, 0};
+
+ deflate_init_output(&os, out, out_nbytes_avail);
+ bt_matchfinder_init(&c->p.n.bt_mf);
+
+ do {
+ /* Starting a new DEFLATE block. */
+
+ struct lz_match *cache_ptr = c->p.n.match_cache;
+ const u8 * const in_block_begin = in_next;
+ const u8 * const in_max_block_end =
+ in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
+ const u8 *next_observation = in_next;
+
+ init_block_split_stats(&c->split_stats);
+
+ /*
+ * Find matches until we decide to end the block. We end the
+ * block if any of the following is true:
+ *
+ * (1) Maximum block length has been reached.
+ * (2) Match cache may overflow.
+ * (3) Block split heuristic says to split now.
+ */
+ do {
+ struct lz_match *matches;
+ unsigned best_len;
+
+ /* Slide the window forward if needed. */
+ if (in_next == in_next_slide) {
+ bt_matchfinder_slide_window(&c->p.n.bt_mf);
+ in_cur_base = in_next;
+ in_next_slide = in_next + MIN(in_end - in_next,
+ MATCHFINDER_WINDOW_SIZE);
+ }
+
+ /* Decrease the maximum and nice match lengths if we're
+ * approaching the end of the input buffer. */
+ if (unlikely(max_len > in_end - in_next)) {
+ max_len = in_end - in_next;
+ nice_len = MIN(nice_len, max_len);
+ }
+
+ /*
+ * Find matches at the current position using the
+ * binary tree matchfinder and save them in
+ * 'match_cache'.
+ *
+ * Note: the binary tree matchfinder is more suited for
+ * optimal parsing than the hash chain matchfinder. The
+ * reasons for this include:
+ *
+ * - The binary tree matchfinder can find more matches
+ * in the same number of steps.
+ * - One of the major advantages of hash chains is that
+ * skipping positions (not searching for matches at
+ * them) is faster; however, with optimal parsing we
+ * search for matches at almost all positions, so this
+ * advantage of hash chains is negated.
+ */
+ matches = cache_ptr;
+ best_len = 0;
+ if (likely(max_len >= BT_MATCHFINDER_REQUIRED_NBYTES)) {
+ cache_ptr = bt_matchfinder_get_matches(&c->p.n.bt_mf,
+ in_cur_base,
+ in_next - in_cur_base,
+ max_len,
+ nice_len,
+ c->max_search_depth,
+ next_hashes,
+ &best_len,
+ matches);
+ }
+
+ if (in_next >= next_observation) {
+ if (best_len >= 4) {
+ observe_match(&c->split_stats, best_len);
+ next_observation = in_next + best_len;
+ } else {
+ observe_literal(&c->split_stats, *in_next);
+ next_observation = in_next + 1;
+ }
+ }
+
+ cache_ptr->length = cache_ptr - matches;
+ cache_ptr->offset = *in_next;
+ in_next++;
+ cache_ptr++;
+
+ /*
+ * If there was a very long match found, don't cache any
+ * matches for the bytes covered by that match. This
+ * avoids degenerate behavior when compressing highly
+ * redundant data, where the number of matches can be
+ * very large.
+ *
+ * This heuristic doesn't actually hurt the compression
+ * ratio very much. If there's a long match, then the
+ * data must be highly compressible, so it doesn't
+ * matter much what we do.
+ */
+ if (best_len >= DEFLATE_MIN_MATCH_LEN && best_len >= nice_len) {
+ --best_len;
+ do {
+ if (in_next == in_next_slide) {
+ bt_matchfinder_slide_window(&c->p.n.bt_mf);
+ in_cur_base = in_next;
+ in_next_slide = in_next + MIN(in_end - in_next,
+ MATCHFINDER_WINDOW_SIZE);
+ }
+ if (unlikely(max_len > in_end - in_next)) {
+ max_len = in_end - in_next;
+ nice_len = MIN(nice_len, max_len);
+ }
+ if (max_len >= BT_MATCHFINDER_REQUIRED_NBYTES) {
+ bt_matchfinder_skip_position(&c->p.n.bt_mf,
+ in_cur_base,
+ in_next - in_cur_base,
+ nice_len,
+ c->max_search_depth,
+ next_hashes);
+ }
+ cache_ptr->length = 0;
+ cache_ptr->offset = *in_next;
+ in_next++;
+ cache_ptr++;
+ } while (--best_len);
+ }
+ } while (in_next < in_max_block_end &&
+ cache_ptr < &c->p.n.match_cache[CACHE_LENGTH] &&
+ !should_end_block(&c->split_stats, in_block_begin, in_next, in_end));
+
+ /* All the matches for this block have been cached. Now choose
+ * the sequence of items to output and flush the block. */
+ deflate_optimize_block(c, in_next - in_block_begin, cache_ptr,
+ in_block_begin == in);
+ deflate_flush_block(c, &os, in_block_begin, in_next - in_block_begin,
+ in_next == in_end, true);
+ } while (in_next != in_end);
+
+ return deflate_flush_output(&os);
+ }
+
+ #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
+
+ /* Initialize c->offset_slot_fast. */
+ static void
+ deflate_init_offset_slot_fast(struct libdeflate_compressor *c)
+ {
+ unsigned offset_slot;
+ unsigned offset;
+ unsigned offset_end;
+
+ for (offset_slot = 0;
+ offset_slot < ARRAY_LEN(deflate_offset_slot_base);
+ offset_slot++)
+ {
+ offset = deflate_offset_slot_base[offset_slot];
+ #if USE_FULL_OFFSET_SLOT_FAST
+ offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
+ do {
+ c->offset_slot_fast[offset] = offset_slot;
+ } while (++offset != offset_end);
+ #else
+ if (offset <= 256) {
+ offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
+ do {
+ c->offset_slot_fast[offset - 1] = offset_slot;
+ } while (++offset != offset_end);
+ } else {
+ offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
+ do {
+ c->offset_slot_fast[256 + ((offset - 1) >> 7)] = offset_slot;
+ } while ((offset += (1 << 7)) != offset_end);
+ }
+ #endif
+ }
+ }
+
+ LIBDEFLATEAPI struct libdeflate_compressor *
+ libdeflate_alloc_compressor(int compression_level)
+ {
+ struct libdeflate_compressor *c;
+ size_t size;
+
+ #if SUPPORT_NEAR_OPTIMAL_PARSING
+ if (compression_level >= 8)
+ size = offsetof(struct libdeflate_compressor, p) + sizeof(c->p.n);
+ else
+ #endif
+ size = offsetof(struct libdeflate_compressor, p) + sizeof(c->p.g);
+
+ c = aligned_malloc(MATCHFINDER_ALIGNMENT, size);
+ if (!c)
+ return NULL;
+
+ switch (compression_level) {
+ case 1:
+ c->impl = deflate_compress_greedy;
+ c->max_search_depth = 2;
+ c->nice_match_length = 8;
+ break;
+ case 2:
+ c->impl = deflate_compress_greedy;
+ c->max_search_depth = 6;
+ c->nice_match_length = 10;
+ break;
+ case 3:
+ c->impl = deflate_compress_greedy;
+ c->max_search_depth = 12;
+ c->nice_match_length = 14;
+ break;
+ case 4:
+ c->impl = deflate_compress_greedy;
+ c->max_search_depth = 24;
+ c->nice_match_length = 24;
+ break;
+ case 5:
+ c->impl = deflate_compress_lazy;
+ c->max_search_depth = 20;
+ c->nice_match_length = 30;
+ break;
+ case 6:
+ c->impl = deflate_compress_lazy;
+ c->max_search_depth = 40;
+ c->nice_match_length = 65;
+ break;
+ case 7:
+ c->impl = deflate_compress_lazy;
+ c->max_search_depth = 100;
+ c->nice_match_length = 130;
+ break;
+ #if SUPPORT_NEAR_OPTIMAL_PARSING
+ case 8:
+ c->impl = deflate_compress_near_optimal;
+ c->max_search_depth = 12;
+ c->nice_match_length = 20;
+ c->p.n.num_optim_passes = 1;
+ break;
+ case 9:
+ c->impl = deflate_compress_near_optimal;
+ c->max_search_depth = 16;
+ c->nice_match_length = 26;
+ c->p.n.num_optim_passes = 2;
+ break;
+ case 10:
+ c->impl = deflate_compress_near_optimal;
+ c->max_search_depth = 30;
+ c->nice_match_length = 50;
+ c->p.n.num_optim_passes = 2;
+ break;
+ case 11:
+ c->impl = deflate_compress_near_optimal;
+ c->max_search_depth = 60;
+ c->nice_match_length = 80;
+ c->p.n.num_optim_passes = 3;
+ break;
+ case 12:
+ c->impl = deflate_compress_near_optimal;
+ c->max_search_depth = 100;
+ c->nice_match_length = 133;
+ c->p.n.num_optim_passes = 4;
+ break;
+ #else
+ case 8:
+ c->impl = deflate_compress_lazy;
+ c->max_search_depth = 150;
+ c->nice_match_length = 200;
+ break;
+ case 9:
+ c->impl = deflate_compress_lazy;
+ c->max_search_depth = 200;
+ c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
+ break;
+ #endif
+ default:
+ aligned_free(c);
+ return NULL;
+ }
+
+ c->compression_level = compression_level;
+
+ deflate_init_offset_slot_fast(c);
+ deflate_init_static_codes(c);
+
+ return c;
+ }
+
+ LIBDEFLATEAPI size_t
+ libdeflate_deflate_compress(struct libdeflate_compressor *c,
+ const void *in, size_t in_nbytes,
+ void *out, size_t out_nbytes_avail)
+ {
+ if (unlikely(out_nbytes_avail < MIN_OUTPUT_SIZE))
+ return 0;
+
+ /* For extremely small inputs just use a single uncompressed block. */
+ if (unlikely(in_nbytes < 16)) {
+ struct deflate_output_bitstream os;
+ deflate_init_output(&os, out, out_nbytes_avail);
+ if (in_nbytes == 0)
+ in = &os; /* Avoid passing NULL to memcpy() */
+ deflate_write_uncompressed_block(&os, in, in_nbytes, true);
+ return deflate_flush_output(&os);
+ }
+
+ return (*c->impl)(c, in, in_nbytes, out, out_nbytes_avail);
+ }
+
+ LIBDEFLATEAPI void
+ libdeflate_free_compressor(struct libdeflate_compressor *c)
+ {
+ aligned_free(c);
+ }
+
+ unsigned int
+ deflate_get_compression_level(struct libdeflate_compressor *c)
+ {
+ return c->compression_level;
+ }
+
+ LIBDEFLATEAPI size_t
+ libdeflate_deflate_compress_bound(struct libdeflate_compressor *c,
+ size_t in_nbytes)
+ {
+ /*
+ * The worst case is all uncompressed blocks where one block has length
+ * <= MIN_BLOCK_LENGTH and the others have length MIN_BLOCK_LENGTH.
+ * Each uncompressed block has 5 bytes of overhead: 1 for BFINAL, BTYPE,
+ * and alignment to a byte boundary; 2 for LEN; and 2 for NLEN.
+ */
+ size_t max_num_blocks = MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1);
+ return (5 * max_num_blocks) + in_nbytes + 1 + MIN_OUTPUT_SIZE;
+ }
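Editor's note: the public functions above (libdeflate_alloc_compressor, libdeflate_deflate_compress_bound, libdeflate_deflate_compress, libdeflate_free_compressor) form the raw-DEFLATE compression API added by this diff. The following is a minimal, hedged usage sketch; the buffer handling and the choice of compression level 6 are illustrative, not prescribed by the library.

#include <stdio.h>
#include <stdlib.h>
#include "libdeflate.h"

/* Compress in_nbytes bytes from 'in'; returns 0 on success, -1 on failure. */
static int
compress_example(const void *in, size_t in_nbytes)
{
        struct libdeflate_compressor *c = libdeflate_alloc_compressor(6);
        size_t bound, out_nbytes;
        void *out;

        if (!c)
                return -1;

        /* Size the output buffer for the worst case. */
        bound = libdeflate_deflate_compress_bound(c, in_nbytes);
        out = malloc(bound);
        if (!out) {
                libdeflate_free_compressor(c);
                return -1;
        }

        /* A return value of 0 means the output buffer was too small. */
        out_nbytes = libdeflate_deflate_compress(c, in, in_nbytes, out, bound);
        if (out_nbytes != 0)
                printf("compressed %zu bytes to %zu bytes\n", in_nbytes, out_nbytes);

        free(out);
        libdeflate_free_compressor(c);
        return out_nbytes != 0 ? 0 : -1;
}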