deflate-ruby 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100)
  1. checksums.yaml +7 -0
  2. data/CLAUDE.md +138 -0
  3. data/LICENSE.txt +21 -0
  4. data/README.md +117 -0
  5. data/ext/deflate_ruby/deflate_ruby.c +301 -0
  6. data/ext/deflate_ruby/extconf.rb +34 -0
  7. data/ext/deflate_ruby/libdeflate/CMakeLists.txt +270 -0
  8. data/ext/deflate_ruby/libdeflate/COPYING +22 -0
  9. data/ext/deflate_ruby/libdeflate/NEWS.md +494 -0
  10. data/ext/deflate_ruby/libdeflate/README.md +228 -0
  11. data/ext/deflate_ruby/libdeflate/common_defs.h +747 -0
  12. data/ext/deflate_ruby/libdeflate/lib/adler32.c +162 -0
  13. data/ext/deflate_ruby/libdeflate/lib/arm/adler32_impl.h +358 -0
  14. data/ext/deflate_ruby/libdeflate/lib/arm/cpu_features.c +230 -0
  15. data/ext/deflate_ruby/libdeflate/lib/arm/cpu_features.h +214 -0
  16. data/ext/deflate_ruby/libdeflate/lib/arm/crc32_impl.h +600 -0
  17. data/ext/deflate_ruby/libdeflate/lib/arm/crc32_pmull_helpers.h +156 -0
  18. data/ext/deflate_ruby/libdeflate/lib/arm/crc32_pmull_wide.h +226 -0
  19. data/ext/deflate_ruby/libdeflate/lib/arm/matchfinder_impl.h +78 -0
  20. data/ext/deflate_ruby/libdeflate/lib/bt_matchfinder.h +342 -0
  21. data/ext/deflate_ruby/libdeflate/lib/cpu_features_common.h +93 -0
  22. data/ext/deflate_ruby/libdeflate/lib/crc32.c +262 -0
  23. data/ext/deflate_ruby/libdeflate/lib/crc32_multipliers.h +377 -0
  24. data/ext/deflate_ruby/libdeflate/lib/crc32_tables.h +587 -0
  25. data/ext/deflate_ruby/libdeflate/lib/decompress_template.h +777 -0
  26. data/ext/deflate_ruby/libdeflate/lib/deflate_compress.c +4129 -0
  27. data/ext/deflate_ruby/libdeflate/lib/deflate_compress.h +15 -0
  28. data/ext/deflate_ruby/libdeflate/lib/deflate_constants.h +56 -0
  29. data/ext/deflate_ruby/libdeflate/lib/deflate_decompress.c +1208 -0
  30. data/ext/deflate_ruby/libdeflate/lib/gzip_compress.c +90 -0
  31. data/ext/deflate_ruby/libdeflate/lib/gzip_constants.h +45 -0
  32. data/ext/deflate_ruby/libdeflate/lib/gzip_decompress.c +144 -0
  33. data/ext/deflate_ruby/libdeflate/lib/hc_matchfinder.h +401 -0
  34. data/ext/deflate_ruby/libdeflate/lib/ht_matchfinder.h +234 -0
  35. data/ext/deflate_ruby/libdeflate/lib/lib_common.h +106 -0
  36. data/ext/deflate_ruby/libdeflate/lib/matchfinder_common.h +224 -0
  37. data/ext/deflate_ruby/libdeflate/lib/riscv/matchfinder_impl.h +97 -0
  38. data/ext/deflate_ruby/libdeflate/lib/utils.c +141 -0
  39. data/ext/deflate_ruby/libdeflate/lib/x86/adler32_impl.h +134 -0
  40. data/ext/deflate_ruby/libdeflate/lib/x86/adler32_template.h +518 -0
  41. data/ext/deflate_ruby/libdeflate/lib/x86/cpu_features.c +183 -0
  42. data/ext/deflate_ruby/libdeflate/lib/x86/cpu_features.h +169 -0
  43. data/ext/deflate_ruby/libdeflate/lib/x86/crc32_impl.h +160 -0
  44. data/ext/deflate_ruby/libdeflate/lib/x86/crc32_pclmul_template.h +495 -0
  45. data/ext/deflate_ruby/libdeflate/lib/x86/decompress_impl.h +57 -0
  46. data/ext/deflate_ruby/libdeflate/lib/x86/matchfinder_impl.h +122 -0
  47. data/ext/deflate_ruby/libdeflate/lib/zlib_compress.c +82 -0
  48. data/ext/deflate_ruby/libdeflate/lib/zlib_constants.h +21 -0
  49. data/ext/deflate_ruby/libdeflate/lib/zlib_decompress.c +104 -0
  50. data/ext/deflate_ruby/libdeflate/libdeflate-config.cmake.in +3 -0
  51. data/ext/deflate_ruby/libdeflate/libdeflate.h +411 -0
  52. data/ext/deflate_ruby/libdeflate/libdeflate.pc.in +18 -0
  53. data/ext/deflate_ruby/libdeflate/programs/CMakeLists.txt +105 -0
  54. data/ext/deflate_ruby/libdeflate/programs/benchmark.c +696 -0
  55. data/ext/deflate_ruby/libdeflate/programs/checksum.c +218 -0
  56. data/ext/deflate_ruby/libdeflate/programs/config.h.in +19 -0
  57. data/ext/deflate_ruby/libdeflate/programs/gzip.c +688 -0
  58. data/ext/deflate_ruby/libdeflate/programs/prog_util.c +521 -0
  59. data/ext/deflate_ruby/libdeflate/programs/prog_util.h +225 -0
  60. data/ext/deflate_ruby/libdeflate/programs/test_checksums.c +200 -0
  61. data/ext/deflate_ruby/libdeflate/programs/test_custom_malloc.c +155 -0
  62. data/ext/deflate_ruby/libdeflate/programs/test_incomplete_codes.c +385 -0
  63. data/ext/deflate_ruby/libdeflate/programs/test_invalid_streams.c +130 -0
  64. data/ext/deflate_ruby/libdeflate/programs/test_litrunlen_overflow.c +72 -0
  65. data/ext/deflate_ruby/libdeflate/programs/test_overread.c +95 -0
  66. data/ext/deflate_ruby/libdeflate/programs/test_slow_decompression.c +472 -0
  67. data/ext/deflate_ruby/libdeflate/programs/test_trailing_bytes.c +151 -0
  68. data/ext/deflate_ruby/libdeflate/programs/test_util.c +237 -0
  69. data/ext/deflate_ruby/libdeflate/programs/test_util.h +61 -0
  70. data/ext/deflate_ruby/libdeflate/programs/tgetopt.c +118 -0
  71. data/ext/deflate_ruby/libdeflate/scripts/android_build.sh +118 -0
  72. data/ext/deflate_ruby/libdeflate/scripts/android_tests.sh +69 -0
  73. data/ext/deflate_ruby/libdeflate/scripts/benchmark.sh +10 -0
  74. data/ext/deflate_ruby/libdeflate/scripts/checksum.sh +10 -0
  75. data/ext/deflate_ruby/libdeflate/scripts/checksum_benchmarks.sh +253 -0
  76. data/ext/deflate_ruby/libdeflate/scripts/cmake-helper.sh +17 -0
  77. data/ext/deflate_ruby/libdeflate/scripts/deflate_benchmarks.sh +119 -0
  78. data/ext/deflate_ruby/libdeflate/scripts/exec_tests.sh +38 -0
  79. data/ext/deflate_ruby/libdeflate/scripts/gen-release-archives.sh +37 -0
  80. data/ext/deflate_ruby/libdeflate/scripts/gen_bitreverse_tab.py +19 -0
  81. data/ext/deflate_ruby/libdeflate/scripts/gen_crc32_multipliers.c +199 -0
  82. data/ext/deflate_ruby/libdeflate/scripts/gen_crc32_tables.c +105 -0
  83. data/ext/deflate_ruby/libdeflate/scripts/gen_default_litlen_costs.py +44 -0
  84. data/ext/deflate_ruby/libdeflate/scripts/gen_offset_slot_map.py +29 -0
  85. data/ext/deflate_ruby/libdeflate/scripts/gzip_tests.sh +523 -0
  86. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_compress/corpus/0 +0 -0
  87. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_compress/fuzz.c +95 -0
  88. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_decompress/corpus/0 +3 -0
  89. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_decompress/fuzz.c +62 -0
  90. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/fuzz.sh +108 -0
  91. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/gzip_decompress/corpus/0 +0 -0
  92. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/gzip_decompress/fuzz.c +19 -0
  93. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/zlib_decompress/corpus/0 +3 -0
  94. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/zlib_decompress/fuzz.c +19 -0
  95. data/ext/deflate_ruby/libdeflate/scripts/run_tests.sh +416 -0
  96. data/ext/deflate_ruby/libdeflate/scripts/toolchain-i686-w64-mingw32.cmake +8 -0
  97. data/ext/deflate_ruby/libdeflate/scripts/toolchain-x86_64-w64-mingw32.cmake +8 -0
  98. data/lib/deflate_ruby/version.rb +5 -0
  99. data/lib/deflate_ruby.rb +71 -0
  100. metadata +191 -0
data/ext/deflate_ruby/libdeflate/lib/deflate_decompress.c
@@ -0,0 +1,1208 @@
+ /*
+  * deflate_decompress.c - a decompressor for DEFLATE
+  *
+  * Copyright 2016 Eric Biggers
+  *
+  * Permission is hereby granted, free of charge, to any person
+  * obtaining a copy of this software and associated documentation
+  * files (the "Software"), to deal in the Software without
+  * restriction, including without limitation the rights to use,
+  * copy, modify, merge, publish, distribute, sublicense, and/or sell
+  * copies of the Software, and to permit persons to whom the
+  * Software is furnished to do so, subject to the following
+  * conditions:
+  *
+  * The above copyright notice and this permission notice shall be
+  * included in all copies or substantial portions of the Software.
+  *
+  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+  * OTHER DEALINGS IN THE SOFTWARE.
+  *
+  * ---------------------------------------------------------------------------
+  *
+  * This is a highly optimized DEFLATE decompressor. It is much faster than
+  * vanilla zlib, typically well over twice as fast, though results vary by CPU.
+  *
+  * Why this is faster than vanilla zlib:
+  *
+  * - Word accesses rather than byte accesses when reading input
+  * - Word accesses rather than byte accesses when copying matches
+  * - Faster Huffman decoding combined with various DEFLATE-specific tricks
+  * - Larger bitbuffer variable that doesn't need to be refilled as often
+  * - Other optimizations to remove unnecessary branches
+  * - Only full-buffer decompression is supported, so the code doesn't need to
+  *   support stopping and resuming decompression.
+  * - On x86_64, a version of the decompression routine is compiled with BMI2
+  *   instructions enabled and is used automatically at runtime when supported.
+  */
+
+ #include "lib_common.h"
+ #include "deflate_constants.h"
+
+ /*
+  * If the expression passed to SAFETY_CHECK() evaluates to false, then the
+  * decompression routine immediately returns LIBDEFLATE_BAD_DATA, indicating the
+  * compressed data is invalid.
+  *
+  * Theoretically, these checks could be disabled for specialized applications
+  * where all input to the decompressor will be trusted.
+  */
+ #if 0
+ # pragma message("UNSAFE DECOMPRESSION IS ENABLED. THIS MUST ONLY BE USED IF THE DECOMPRESSOR INPUT WILL ALWAYS BE TRUSTED!")
+ # define SAFETY_CHECK(expr)    (void)(expr)
+ #else
+ # define SAFETY_CHECK(expr)    if (unlikely(!(expr))) return LIBDEFLATE_BAD_DATA
+ #endif
+
+ /*****************************************************************************
+  *                             Input bitstream                               *
+  *****************************************************************************/
+
+ /*
+  * The state of the "input bitstream" consists of the following variables:
+  *
+  *   - in_next: a pointer to the next unread byte in the input buffer
+  *
+  *   - in_end: a pointer to just past the end of the input buffer
+  *
+  *   - bitbuf: a word-sized variable containing bits that have been read from
+  *     the input buffer or from the implicit appended zero bytes
+  *
+  *   - bitsleft: the number of bits in 'bitbuf' available to be consumed.
+  *     After REFILL_BITS_BRANCHLESS(), 'bitbuf' can actually
+  *     contain more bits than this. However, only the bits counted
+  *     by 'bitsleft' can actually be consumed; the rest can only be
+  *     used for preloading.
+  *
+  *     As a micro-optimization, we allow bits 8 and higher of
+  *     'bitsleft' to contain garbage. When consuming the bits
+  *     associated with a decode table entry, this allows us to do
+  *     'bitsleft -= entry' instead of 'bitsleft -= (u8)entry'.
+  *     On some CPUs, this helps reduce instruction dependencies.
+  *     This does have the disadvantage that 'bitsleft' sometimes
+  *     needs to be cast to 'u8', such as when it's used as a shift
+  *     amount in REFILL_BITS_BRANCHLESS(). But that one happens
+  *     for free since most CPUs ignore high bits in shift amounts.
+  *
+  *   - overread_count: the total number of implicit appended zero bytes that
+  *     have been loaded into the bitbuffer, including any
+  *     counted by 'bitsleft' and any already consumed
+  */
+
+ /*
+  * The type for the bitbuffer variable ('bitbuf' described above). For best
+  * performance, this should have size equal to a machine word.
+  *
+  * 64-bit platforms have a significant advantage: they get a bigger bitbuffer
+  * which they don't have to refill as often.
+  */
+ typedef machine_word_t bitbuf_t;
+ #define BITBUF_NBITS (8 * (int)sizeof(bitbuf_t))
+
+ /* BITMASK(n) returns a bitmask of length 'n'. */
+ #define BITMASK(n) (((bitbuf_t)1 << (n)) - 1)
+
+ /*
+  * MAX_BITSLEFT is the maximum number of consumable bits, i.e. the maximum value
+  * of '(u8)bitsleft'. This is the size of the bitbuffer variable, minus 1 if
+  * the branchless refill method is being used (see REFILL_BITS_BRANCHLESS()).
+  */
+ #define MAX_BITSLEFT \
+         (UNALIGNED_ACCESS_IS_FAST ? BITBUF_NBITS - 1 : BITBUF_NBITS)
+
+ /*
+  * CONSUMABLE_NBITS is the minimum number of bits that are guaranteed to be
+  * consumable (counted in 'bitsleft') immediately after refilling the bitbuffer.
+  * Since only whole bytes can be added to 'bitsleft', the worst case is
+  * 'MAX_BITSLEFT - 7': the smallest amount where another byte doesn't fit.
+  */
+ #define CONSUMABLE_NBITS (MAX_BITSLEFT - 7)
+
+ /*
+  * FASTLOOP_PRELOADABLE_NBITS is the minimum number of bits that are guaranteed
+  * to be preloadable immediately after REFILL_BITS_IN_FASTLOOP(). (It is *not*
+  * guaranteed after REFILL_BITS(), since REFILL_BITS() falls back to a
+  * byte-at-a-time refill method near the end of input.) This may exceed the
+  * number of consumable bits (counted by 'bitsleft'). Any bits not counted in
+  * 'bitsleft' can only be used for precomputation and cannot be consumed.
+  */
+ #define FASTLOOP_PRELOADABLE_NBITS \
+         (UNALIGNED_ACCESS_IS_FAST ? BITBUF_NBITS : CONSUMABLE_NBITS)
+
+ /*
+  * PRELOAD_SLACK is the minimum number of bits that are guaranteed to be
+  * preloadable but not consumable, following REFILL_BITS_IN_FASTLOOP() and any
+  * subsequent consumptions. This is 1 bit if the branchless refill method is
+  * being used, and 0 bits otherwise.
+  */
+ #define PRELOAD_SLACK MAX(0, FASTLOOP_PRELOADABLE_NBITS - MAX_BITSLEFT)
+
+ /*
+  * CAN_CONSUME(n) is true if it's guaranteed that if the bitbuffer has just been
+  * refilled, then it's always possible to consume 'n' bits from it. 'n' should
+  * be a compile-time constant, to enable compile-time evaluation.
+  */
+ #define CAN_CONSUME(n) (CONSUMABLE_NBITS >= (n))
+
+ /*
+  * CAN_CONSUME_AND_THEN_PRELOAD(consume_nbits, preload_nbits) is true if it's
+  * guaranteed that after REFILL_BITS_IN_FASTLOOP(), it's always possible to
+  * consume 'consume_nbits' bits, then preload 'preload_nbits' bits. The
+  * arguments should be compile-time constants to enable compile-time evaluation.
+  */
+ #define CAN_CONSUME_AND_THEN_PRELOAD(consume_nbits, preload_nbits) \
+         (CONSUMABLE_NBITS >= (consume_nbits) && \
+          FASTLOOP_PRELOADABLE_NBITS >= (consume_nbits) + (preload_nbits))
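/*
 * Illustrative note (not part of the original file): as a worked example of
 * the macros above, a typical 64-bit build with fast unaligned access gets
 *
 *     BITBUF_NBITS               = 64
 *     MAX_BITSLEFT               = 63
 *     CONSUMABLE_NBITS           = 56
 *     FASTLOOP_PRELOADABLE_NBITS = 64
 *     PRELOAD_SLACK              = 1
 *
 * so CAN_CONSUME(15) holds, and an arbitrary check such as
 * CAN_CONSUME_AND_THEN_PRELOAD(20, 33) requires 56 >= 20 and 64 >= 53,
 * both true.
 */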
+
+ /*
+  * REFILL_BITS_BRANCHLESS() branchlessly refills the bitbuffer variable by
+  * reading the next word from the input buffer and updating 'in_next' and
+  * 'bitsleft' based on how many bits were refilled -- counting whole bytes only.
+  * This is much faster than reading a byte at a time, at least if the CPU is
+  * little endian and supports fast unaligned memory accesses.
+  *
+  * The simplest way of branchlessly updating 'bitsleft' would be:
+  *
+  *     bitsleft += (MAX_BITSLEFT - bitsleft) & ~7;
+  *
+  * To make it faster, we define MAX_BITSLEFT to be 'WORDBITS - 1' rather than
+  * WORDBITS, so that in binary it looks like 111111 or 11111. Then, we update
+  * 'bitsleft' by just setting the bits above the low 3 bits:
+  *
+  *     bitsleft |= MAX_BITSLEFT & ~7;
+  *
+  * That compiles down to a single instruction like 'or $0x38, %rbp'. Using
+  * 'MAX_BITSLEFT == WORDBITS - 1' also has the advantage that refills can be
+  * done when 'bitsleft == MAX_BITSLEFT' without invoking undefined behavior.
+  *
+  * The simplest way of branchlessly updating 'in_next' would be:
+  *
+  *     in_next += (MAX_BITSLEFT - bitsleft) >> 3;
+  *
+  * With 'MAX_BITSLEFT == WORDBITS - 1' we could use an XOR instead, though this
+  * isn't really better:
+  *
+  *     in_next += (MAX_BITSLEFT ^ bitsleft) >> 3;
+  *
+  * An alternative which can be marginally better is the following:
+  *
+  *     in_next += sizeof(bitbuf_t) - 1;
+  *     in_next -= (bitsleft >> 3) & 0x7;
+  *
+  * It seems this would increase the number of CPU instructions from 3 (sub, shr,
+  * add) to 4 (add, shr, and, sub). However, if the CPU has a bitfield
+  * extraction instruction (e.g. arm's ubfx), it stays at 3, and is potentially
+  * more efficient because the length of the longest dependency chain decreases
+  * from 3 to 2. This alternative also has the advantage that it ignores the
+  * high bits in 'bitsleft', so it is compatible with the micro-optimization we
+  * use where we let the high bits of 'bitsleft' contain garbage.
+  */
+ #define REFILL_BITS_BRANCHLESS() \
+ do { \
+         bitbuf |= get_unaligned_leword(in_next) << (u8)bitsleft; \
+         in_next += sizeof(bitbuf_t) - 1; \
+         in_next -= (bitsleft >> 3) & 0x7; \
+         bitsleft |= MAX_BITSLEFT & ~7; \
+ } while (0)
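/*
 * Illustrative sketch (not part of the original file): the branchless refill
 * written out as a self-contained function for a 64-bit little-endian host,
 * mirroring the macro above with MAX_BITSLEFT == 63. All names here are
 * invented for illustration.
 */
#include <stdint.h>
#include <string.h>

static inline uint64_t demo_load_le64(const unsigned char *p)
{
        uint64_t v;

        memcpy(&v, p, sizeof(v));       /* assumes a little-endian host */
        return v;
}

static inline void demo_refill(uint64_t *bitbuf, unsigned *bitsleft,
                               const unsigned char **in_next)
{
        /* Append whole bytes above the bits already in the buffer. */
        *bitbuf |= demo_load_le64(*in_next) << (*bitsleft & 63);
        /* Net pointer advance is 7 - (bitsleft / 8): the bytes refilled. */
        *in_next += sizeof(uint64_t) - 1;
        *in_next -= (*bitsleft >> 3) & 0x7;
        /* 63 & ~7 == 56: set the bits above the low 3 bits. */
        *bitsleft |= 56;
}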
+
+ /*
+  * REFILL_BITS() loads bits from the input buffer until the bitbuffer variable
+  * contains at least CONSUMABLE_NBITS consumable bits.
+  *
+  * This checks for the end of input, and it doesn't guarantee
+  * FASTLOOP_PRELOADABLE_NBITS, so it can't be used in the fastloop.
+  *
+  * If we would overread the input buffer, we just don't read anything, leaving
+  * the bits zeroed but marking them filled. This simplifies the decompressor
+  * because it removes the need to always be able to distinguish between real
+  * overreads and overreads caused only by the decompressor's own lookahead.
+  *
+  * We do still keep track of the number of bytes that have been overread, for
+  * two reasons. First, it allows us to determine the exact number of bytes that
+  * were consumed once the stream ends or an uncompressed block is reached.
+  * Second, it allows us to stop early if the overread amount gets so large (more
+  * than sizeof bitbuf) that it can only be caused by a real overread. (The
+  * second part is arguably unneeded, since libdeflate is buffer-based; given
+  * infinite zeroes, it will eventually either completely fill the output buffer
+  * or return an error. However, we do it to be slightly more friendly to the
+  * not-recommended use case of decompressing with an unknown output size.)
+  */
+ #define REFILL_BITS() \
+ do { \
+         if (UNALIGNED_ACCESS_IS_FAST && \
+             likely(in_end - in_next >= sizeof(bitbuf_t))) { \
+                 REFILL_BITS_BRANCHLESS(); \
+         } else { \
+                 while ((u8)bitsleft < CONSUMABLE_NBITS) { \
+                         if (likely(in_next != in_end)) { \
+                                 bitbuf |= (bitbuf_t)*in_next++ << \
+                                           (u8)bitsleft; \
+                         } else { \
+                                 overread_count++; \
+                                 SAFETY_CHECK(overread_count <= \
+                                              sizeof(bitbuf_t)); \
+                         } \
+                         bitsleft += 8; \
+                 } \
+         } \
+ } while (0)
+
+ /*
+  * REFILL_BITS_IN_FASTLOOP() is like REFILL_BITS(), but it doesn't check for the
+  * end of the input. It can only be used in the fastloop.
+  */
+ #define REFILL_BITS_IN_FASTLOOP() \
+ do { \
+         STATIC_ASSERT(UNALIGNED_ACCESS_IS_FAST || \
+                       FASTLOOP_PRELOADABLE_NBITS == CONSUMABLE_NBITS); \
+         if (UNALIGNED_ACCESS_IS_FAST) { \
+                 REFILL_BITS_BRANCHLESS(); \
+         } else { \
+                 while ((u8)bitsleft < CONSUMABLE_NBITS) { \
+                         bitbuf |= (bitbuf_t)*in_next++ << (u8)bitsleft; \
+                         bitsleft += 8; \
+                 } \
+         } \
+ } while (0)
+
+ /*
+  * This is the worst-case maximum number of output bytes that are written to
+  * during each iteration of the fastloop. The worst case is 2 literals, then a
+  * match of length DEFLATE_MAX_MATCH_LEN. Additionally, some slack space must
+  * be included for the intentional overrun in the match copy implementation.
+  */
+ #define FASTLOOP_MAX_BYTES_WRITTEN \
+         (2 + DEFLATE_MAX_MATCH_LEN + (5 * WORDBYTES) - 1)
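/*
 * Illustrative note (not part of the original file): with
 * DEFLATE_MAX_MATCH_LEN == 258 and 8-byte words, this works out to
 * 2 + 258 + (5 * 8) - 1 == 299 bytes.
 */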
+
+ /*
+  * This is the worst-case maximum number of input bytes that are read during
+  * each iteration of the fastloop. To get this value, we first compute the
+  * greatest number of bits that can be refilled during a loop iteration. The
+  * refill at the beginning can add at most MAX_BITSLEFT, and the amount that can
+  * be refilled later is no more than the maximum amount that can be consumed by
+  * 2 literals that don't need a subtable, then a match. We convert this value
+  * to bytes, rounding up; this gives the maximum number of bytes that 'in_next'
+  * can be advanced. Finally, we add sizeof(bitbuf_t) to account for
+  * REFILL_BITS_BRANCHLESS() reading a word past 'in_next'.
+  */
+ #define FASTLOOP_MAX_BYTES_READ \
+         (DIV_ROUND_UP(MAX_BITSLEFT + (2 * LITLEN_TABLEBITS) + \
+                       LENGTH_MAXBITS + OFFSET_MAXBITS, 8) + \
+          sizeof(bitbuf_t))
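/*
 * Illustrative note (not part of the original file): on a typical 64-bit
 * build, MAX_BITSLEFT == 63, LITLEN_TABLEBITS == 11, LENGTH_MAXBITS ==
 * 15 + 5 == 20, and OFFSET_MAXBITS == 15 + 13 == 28, so this evaluates to
 * DIV_ROUND_UP(63 + 22 + 20 + 28, 8) + 8 == 17 + 8 == 25 bytes.
 */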
+
+ /*****************************************************************************
+  *                             Huffman decoding                              *
+  *****************************************************************************/
+
+ /*
+  * The fastest way to decode Huffman-encoded data is basically to use a decode
+  * table that maps the next TABLEBITS bits of data to their symbol. Each entry
+  * decode_table[i] maps to the symbol whose codeword is a prefix of 'i'. A
+  * symbol with codeword length 'n' has '2**(TABLEBITS-n)' entries in the table.
+  *
+  * Ideally, TABLEBITS and the maximum codeword length would be the same; some
+  * compression formats are designed with this goal in mind. Unfortunately, in
+  * DEFLATE, the maximum litlen and offset codeword lengths are 15 bits, which is
+  * too large for a practical TABLEBITS. It's not *that* much larger, though, so
+  * the workaround is to use a single level of subtables. In the main table,
+  * entries for prefixes of codewords longer than TABLEBITS contain a "pointer"
+  * to the appropriate subtable along with the number of bits it is indexed with.
+  *
+  * The most efficient way to allocate subtables is to allocate them dynamically
+  * after the main table. The worst-case number of table entries needed,
+  * including subtables, is precomputable; see the ENOUGH constants below.
+  *
+  * A useful optimization is to store the codeword lengths in the decode table so
+  * that they don't have to be looked up by indexing a separate table that maps
+  * symbols to their codeword lengths. We basically do this; however, for the
+  * litlen and offset codes we also implement some DEFLATE-specific optimizations
+  * that build in the consideration of the "extra bits" and the
+  * literal/length/end-of-block division. For the exact decode table entry
+  * format we use, see the definitions of the *_decode_results[] arrays below.
+  */
+
+
+ /*
+  * These are the TABLEBITS values we use for each of the DEFLATE Huffman codes,
+  * along with their corresponding ENOUGH values.
+  *
+  * For the precode, we use PRECODE_TABLEBITS == 7 since this is the maximum
+  * precode codeword length. This avoids ever needing subtables.
+  *
+  * For the litlen and offset codes, we cannot realistically avoid ever needing
+  * subtables, since litlen and offset codewords can be up to 15 bits. A higher
+  * TABLEBITS reduces the number of lookups that need a subtable, which increases
+  * performance; however, it increases memory usage and makes building the table
+  * take longer, which decreases performance. We choose values that work well in
+  * practice, making subtables rarely needed without making the tables too large.
+  *
+  * Our choice of OFFSET_TABLEBITS == 8 is a bit low; without any special
+  * considerations, 9 would fit the trade-off curve better. However, there is a
+  * performance benefit to using exactly 8 bits when it is a compile-time
+  * constant, as many CPUs can take the low byte more easily than the low 9 bits.
+  *
+  * zlib treats its equivalents of TABLEBITS as maximum values; whenever it
+  * builds a table, it caps the actual table_bits to the longest codeword. This
+  * makes sense in theory, as there's no need for the table to be any larger than
+  * needed to support the longest codeword. However, having the table bits be a
+  * compile-time constant is beneficial to the performance of the decode loop, so
+  * there is a trade-off. libdeflate currently uses the dynamic table_bits
+  * strategy for the litlen table only, due to its larger maximum size.
+  * PRECODE_TABLEBITS and OFFSET_TABLEBITS are smaller, so going dynamic there
+  * isn't as useful, and OFFSET_TABLEBITS=8 is useful as mentioned above.
+  *
+  * Each TABLEBITS value has a corresponding ENOUGH value that gives the
+  * worst-case maximum number of decode table entries, including the main table
+  * and all subtables. The ENOUGH value depends on three parameters:
+  *
+  *     (1) the maximum number of symbols in the code (DEFLATE_NUM_*_SYMS)
+  *     (2) the maximum number of main table bits (*_TABLEBITS)
+  *     (3) the maximum allowed codeword length (DEFLATE_MAX_*_CODEWORD_LEN)
+  *
+  * The ENOUGH values were computed using the utility program 'enough' from zlib.
+  */
+ #define PRECODE_TABLEBITS 7
+ #define PRECODE_ENOUGH    128   /* enough 19 7 7 */
+ #define LITLEN_TABLEBITS  11
+ #define LITLEN_ENOUGH     2342  /* enough 288 11 15 */
+ #define OFFSET_TABLEBITS  8
+ #define OFFSET_ENOUGH     402   /* enough 32 8 15 */
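/*
 * Illustrative note (not part of the original file): LITLEN_ENOUGH == 2342
 * breaks down as a 2^11 == 2048-entry main table plus up to 294 worst-case
 * subtable entries, per zlib's 'enough 288 11 15'.
 */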
+
+ /*
+  * make_decode_table_entry() creates a decode table entry for the given symbol
+  * by combining the static part 'decode_results[sym]' with the dynamic part
+  * 'len', which is the remaining codeword length (the codeword length for main
+  * table entries, or the codeword length minus TABLEBITS for subtable entries).
+  *
+  * In all cases, we add 'len' to each of the two low-order bytes to create the
+  * appropriately-formatted decode table entry. See the definitions of the
+  * *_decode_results[] arrays below, where the entry format is described.
+  */
+ static forceinline u32
+ make_decode_table_entry(const u32 decode_results[], u32 sym, u32 len)
+ {
+         return decode_results[sym] + (len << 8) + len;
+ }
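/*
 * Illustrative note (not part of the original file): e.g. for a hypothetical
 * literal 'A' (0x41) with a 9-bit codeword,
 *
 *     decode_results[0x41] == HUFFDEC_LITERAL | (0x41 << 16) == 0x80410000
 *     make_decode_table_entry(decode_results, 0x41, 9)
 *         == 0x80410000 + (9 << 8) + 9 == 0x80410909
 *
 * The low byte (9) is what 'bitsleft -= entry' consumes; the copy in bits
 * 11-8 holds the codeword length alone, which length entries need separately
 * because their low byte also counts the extra bits.
 */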
+
+ /*
+  * Here is the format of our precode decode table entries. Bits not explicitly
+  * described contain zeroes:
+  *
+  *     Bit 20-16:  presym
+  *     Bit 10-8:   codeword length [not used]
+  *     Bit 2-0:    codeword length
+  *
+  * The precode decode table never has subtables, since we use
+  * PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN.
+  *
+  * precode_decode_results[] contains the static part of the entry for each
+  * symbol. make_decode_table_entry() produces the final entries.
+  */
+ static const u32 precode_decode_results[] = {
+ #define ENTRY(presym) ((u32)presym << 16)
+         ENTRY(0) , ENTRY(1) , ENTRY(2) , ENTRY(3) ,
+         ENTRY(4) , ENTRY(5) , ENTRY(6) , ENTRY(7) ,
+         ENTRY(8) , ENTRY(9) , ENTRY(10) , ENTRY(11) ,
+         ENTRY(12) , ENTRY(13) , ENTRY(14) , ENTRY(15) ,
+         ENTRY(16) , ENTRY(17) , ENTRY(18) ,
+ #undef ENTRY
+ };
+
+ /* Litlen and offset decode table entry flags */
+
+ /* Indicates a literal entry in the litlen decode table */
+ #define HUFFDEC_LITERAL          0x80000000
+
+ /* Indicates that HUFFDEC_SUBTABLE_POINTER or HUFFDEC_END_OF_BLOCK is set */
+ #define HUFFDEC_EXCEPTIONAL      0x00008000
+
+ /* Indicates a subtable pointer entry in the litlen or offset decode table */
+ #define HUFFDEC_SUBTABLE_POINTER 0x00004000
+
+ /* Indicates an end-of-block entry in the litlen decode table */
+ #define HUFFDEC_END_OF_BLOCK     0x00002000
+
+ /* Maximum number of bits that can be consumed by decoding a match length */
+ #define LENGTH_MAXBITS     (DEFLATE_MAX_LITLEN_CODEWORD_LEN + \
+                             DEFLATE_MAX_EXTRA_LENGTH_BITS)
+ #define LENGTH_MAXFASTBITS (LITLEN_TABLEBITS /* no subtable needed */ + \
+                             DEFLATE_MAX_EXTRA_LENGTH_BITS)
+
+ /*
+  * Here is the format of our litlen decode table entries. Bits not explicitly
+  * described contain zeroes:
+  *
+  *     Literals:
+  *         Bit 31:     1 (HUFFDEC_LITERAL)
+  *         Bit 23-16:  literal value
+  *         Bit 15:     0 (!HUFFDEC_EXCEPTIONAL)
+  *         Bit 14:     0 (!HUFFDEC_SUBTABLE_POINTER)
+  *         Bit 13:     0 (!HUFFDEC_END_OF_BLOCK)
+  *         Bit 11-8:   remaining codeword length [not used]
+  *         Bit 3-0:    remaining codeword length
+  *     Lengths:
+  *         Bit 31:     0 (!HUFFDEC_LITERAL)
+  *         Bit 24-16:  length base value
+  *         Bit 15:     0 (!HUFFDEC_EXCEPTIONAL)
+  *         Bit 14:     0 (!HUFFDEC_SUBTABLE_POINTER)
+  *         Bit 13:     0 (!HUFFDEC_END_OF_BLOCK)
+  *         Bit 11-8:   remaining codeword length
+  *         Bit 4-0:    remaining codeword length + number of extra bits
+  *     End of block:
+  *         Bit 31:     0 (!HUFFDEC_LITERAL)
+  *         Bit 15:     1 (HUFFDEC_EXCEPTIONAL)
+  *         Bit 14:     0 (!HUFFDEC_SUBTABLE_POINTER)
+  *         Bit 13:     1 (HUFFDEC_END_OF_BLOCK)
+  *         Bit 11-8:   remaining codeword length [not used]
+  *         Bit 3-0:    remaining codeword length
+  *     Subtable pointer:
+  *         Bit 31:     0 (!HUFFDEC_LITERAL)
+  *         Bit 30-16:  index of start of subtable
+  *         Bit 15:     1 (HUFFDEC_EXCEPTIONAL)
+  *         Bit 14:     1 (HUFFDEC_SUBTABLE_POINTER)
+  *         Bit 13:     0 (!HUFFDEC_END_OF_BLOCK)
+  *         Bit 11-8:   number of subtable bits
+  *         Bit 3-0:    number of main table bits
+  *
+  * This format has several desirable properties:
+  *
+  *     - The codeword length, length slot base, and number of extra length bits
+  *       are all built in. This eliminates the need to separately look up this
+  *       information by indexing separate arrays by symbol or length slot.
+  *
+  *     - The HUFFDEC_* flags enable easily distinguishing between the different
+  *       types of entries. The HUFFDEC_LITERAL flag enables a fast path for
+  *       literals; the high bit is used for this, as some CPUs can test the
+  *       high bit more easily than other bits. The HUFFDEC_EXCEPTIONAL flag
+  *       makes it possible to detect the two unlikely cases (subtable pointer
+  *       and end of block) in a single bit flag test.
+  *
+  *     - The low byte is the number of bits that need to be removed from the
+  *       bitstream; this makes this value easily accessible, and it enables the
+  *       micro-optimization of doing 'bitsleft -= entry' instead of
+  *       'bitsleft -= (u8)entry'. It also includes the number of extra bits,
+  *       so they don't need to be removed separately.
+  *
+  *     - The flags in bits 15-13 are arranged to be 0 when the
+  *       "remaining codeword length" in bits 11-8 is needed, making this value
+  *       fairly easily accessible as well via a shift and downcast.
+  *
+  *     - Similarly, bits 13-12 are 0 when the "subtable bits" in bits 11-8 are
+  *       needed, making it possible to extract this value with '& 0x3F' rather
+  *       than '& 0xF'. This value is only used as a shift amount, so this can
+  *       save an 'and' instruction as the masking by 0x3F happens implicitly.
+  *
+  * litlen_decode_results[] contains the static part of the entry for each
+  * symbol. make_decode_table_entry() produces the final entries.
+  */
+ static const u32 litlen_decode_results[] = {
+
+         /* Literals */
+ #define ENTRY(literal) (HUFFDEC_LITERAL | ((u32)literal << 16))
+         ENTRY(0) , ENTRY(1) , ENTRY(2) , ENTRY(3) ,
+         ENTRY(4) , ENTRY(5) , ENTRY(6) , ENTRY(7) ,
+         ENTRY(8) , ENTRY(9) , ENTRY(10) , ENTRY(11) ,
+         ENTRY(12) , ENTRY(13) , ENTRY(14) , ENTRY(15) ,
+         ENTRY(16) , ENTRY(17) , ENTRY(18) , ENTRY(19) ,
+         ENTRY(20) , ENTRY(21) , ENTRY(22) , ENTRY(23) ,
+         ENTRY(24) , ENTRY(25) , ENTRY(26) , ENTRY(27) ,
+         ENTRY(28) , ENTRY(29) , ENTRY(30) , ENTRY(31) ,
+         ENTRY(32) , ENTRY(33) , ENTRY(34) , ENTRY(35) ,
+         ENTRY(36) , ENTRY(37) , ENTRY(38) , ENTRY(39) ,
+         ENTRY(40) , ENTRY(41) , ENTRY(42) , ENTRY(43) ,
+         ENTRY(44) , ENTRY(45) , ENTRY(46) , ENTRY(47) ,
+         ENTRY(48) , ENTRY(49) , ENTRY(50) , ENTRY(51) ,
+         ENTRY(52) , ENTRY(53) , ENTRY(54) , ENTRY(55) ,
+         ENTRY(56) , ENTRY(57) , ENTRY(58) , ENTRY(59) ,
+         ENTRY(60) , ENTRY(61) , ENTRY(62) , ENTRY(63) ,
+         ENTRY(64) , ENTRY(65) , ENTRY(66) , ENTRY(67) ,
+         ENTRY(68) , ENTRY(69) , ENTRY(70) , ENTRY(71) ,
+         ENTRY(72) , ENTRY(73) , ENTRY(74) , ENTRY(75) ,
+         ENTRY(76) , ENTRY(77) , ENTRY(78) , ENTRY(79) ,
+         ENTRY(80) , ENTRY(81) , ENTRY(82) , ENTRY(83) ,
+         ENTRY(84) , ENTRY(85) , ENTRY(86) , ENTRY(87) ,
+         ENTRY(88) , ENTRY(89) , ENTRY(90) , ENTRY(91) ,
+         ENTRY(92) , ENTRY(93) , ENTRY(94) , ENTRY(95) ,
+         ENTRY(96) , ENTRY(97) , ENTRY(98) , ENTRY(99) ,
+         ENTRY(100) , ENTRY(101) , ENTRY(102) , ENTRY(103) ,
+         ENTRY(104) , ENTRY(105) , ENTRY(106) , ENTRY(107) ,
+         ENTRY(108) , ENTRY(109) , ENTRY(110) , ENTRY(111) ,
+         ENTRY(112) , ENTRY(113) , ENTRY(114) , ENTRY(115) ,
+         ENTRY(116) , ENTRY(117) , ENTRY(118) , ENTRY(119) ,
+         ENTRY(120) , ENTRY(121) , ENTRY(122) , ENTRY(123) ,
+         ENTRY(124) , ENTRY(125) , ENTRY(126) , ENTRY(127) ,
+         ENTRY(128) , ENTRY(129) , ENTRY(130) , ENTRY(131) ,
+         ENTRY(132) , ENTRY(133) , ENTRY(134) , ENTRY(135) ,
+         ENTRY(136) , ENTRY(137) , ENTRY(138) , ENTRY(139) ,
+         ENTRY(140) , ENTRY(141) , ENTRY(142) , ENTRY(143) ,
+         ENTRY(144) , ENTRY(145) , ENTRY(146) , ENTRY(147) ,
+         ENTRY(148) , ENTRY(149) , ENTRY(150) , ENTRY(151) ,
+         ENTRY(152) , ENTRY(153) , ENTRY(154) , ENTRY(155) ,
+         ENTRY(156) , ENTRY(157) , ENTRY(158) , ENTRY(159) ,
+         ENTRY(160) , ENTRY(161) , ENTRY(162) , ENTRY(163) ,
+         ENTRY(164) , ENTRY(165) , ENTRY(166) , ENTRY(167) ,
+         ENTRY(168) , ENTRY(169) , ENTRY(170) , ENTRY(171) ,
+         ENTRY(172) , ENTRY(173) , ENTRY(174) , ENTRY(175) ,
+         ENTRY(176) , ENTRY(177) , ENTRY(178) , ENTRY(179) ,
+         ENTRY(180) , ENTRY(181) , ENTRY(182) , ENTRY(183) ,
+         ENTRY(184) , ENTRY(185) , ENTRY(186) , ENTRY(187) ,
+         ENTRY(188) , ENTRY(189) , ENTRY(190) , ENTRY(191) ,
+         ENTRY(192) , ENTRY(193) , ENTRY(194) , ENTRY(195) ,
+         ENTRY(196) , ENTRY(197) , ENTRY(198) , ENTRY(199) ,
+         ENTRY(200) , ENTRY(201) , ENTRY(202) , ENTRY(203) ,
+         ENTRY(204) , ENTRY(205) , ENTRY(206) , ENTRY(207) ,
+         ENTRY(208) , ENTRY(209) , ENTRY(210) , ENTRY(211) ,
+         ENTRY(212) , ENTRY(213) , ENTRY(214) , ENTRY(215) ,
+         ENTRY(216) , ENTRY(217) , ENTRY(218) , ENTRY(219) ,
+         ENTRY(220) , ENTRY(221) , ENTRY(222) , ENTRY(223) ,
+         ENTRY(224) , ENTRY(225) , ENTRY(226) , ENTRY(227) ,
+         ENTRY(228) , ENTRY(229) , ENTRY(230) , ENTRY(231) ,
+         ENTRY(232) , ENTRY(233) , ENTRY(234) , ENTRY(235) ,
+         ENTRY(236) , ENTRY(237) , ENTRY(238) , ENTRY(239) ,
+         ENTRY(240) , ENTRY(241) , ENTRY(242) , ENTRY(243) ,
+         ENTRY(244) , ENTRY(245) , ENTRY(246) , ENTRY(247) ,
+         ENTRY(248) , ENTRY(249) , ENTRY(250) , ENTRY(251) ,
+         ENTRY(252) , ENTRY(253) , ENTRY(254) , ENTRY(255) ,
+ #undef ENTRY
+
+         /* End of block */
+         HUFFDEC_EXCEPTIONAL | HUFFDEC_END_OF_BLOCK,
+
+         /* Lengths */
+ #define ENTRY(length_base, num_extra_bits) \
+         (((u32)(length_base) << 16) | (num_extra_bits))
+         ENTRY(3 , 0) , ENTRY(4 , 0) , ENTRY(5 , 0) , ENTRY(6 , 0),
+         ENTRY(7 , 0) , ENTRY(8 , 0) , ENTRY(9 , 0) , ENTRY(10 , 0),
+         ENTRY(11 , 1) , ENTRY(13 , 1) , ENTRY(15 , 1) , ENTRY(17 , 1),
+         ENTRY(19 , 2) , ENTRY(23 , 2) , ENTRY(27 , 2) , ENTRY(31 , 2),
+         ENTRY(35 , 3) , ENTRY(43 , 3) , ENTRY(51 , 3) , ENTRY(59 , 3),
+         ENTRY(67 , 4) , ENTRY(83 , 4) , ENTRY(99 , 4) , ENTRY(115, 4),
+         ENTRY(131, 5) , ENTRY(163, 5) , ENTRY(195, 5) , ENTRY(227, 5),
+         ENTRY(258, 0) , ENTRY(258, 0) , ENTRY(258, 0) ,
+ #undef ENTRY
+ };
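/*
 * Illustrative sketch (not part of the original file): the field extractions
 * implied by the litlen entry format above, written out as plain C. Names are
 * invented for illustration; 'entry' is a litlen decode table entry.
 */
#include <stdint.h>

static inline int demo_is_literal(uint32_t entry)
{
        return (int32_t)entry < 0;      /* HUFFDEC_LITERAL is the sign bit */
}

static inline unsigned demo_literal_value(uint32_t entry)
{
        return (entry >> 16) & 0xFF;    /* bits 23-16 */
}

static inline unsigned demo_length_base(uint32_t entry)
{
        return entry >> 16;             /* bits 24-16 of a length entry */
}

static inline int demo_is_exceptional(uint32_t entry)
{
        return entry & 0x00008000;      /* subtable pointer or end-of-block */
}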
+
+ /* Maximum number of bits that can be consumed by decoding a match offset */
+ #define OFFSET_MAXBITS     (DEFLATE_MAX_OFFSET_CODEWORD_LEN + \
+                             DEFLATE_MAX_EXTRA_OFFSET_BITS)
+ #define OFFSET_MAXFASTBITS (OFFSET_TABLEBITS /* no subtable needed */ + \
+                             DEFLATE_MAX_EXTRA_OFFSET_BITS)
+
+ /*
+  * Here is the format of our offset decode table entries. Bits not explicitly
+  * described contain zeroes:
+  *
+  *     Offsets:
+  *         Bit 31-16:  offset base value
+  *         Bit 15:     0 (!HUFFDEC_EXCEPTIONAL)
+  *         Bit 14:     0 (!HUFFDEC_SUBTABLE_POINTER)
+  *         Bit 11-8:   remaining codeword length
+  *         Bit 4-0:    remaining codeword length + number of extra bits
+  *     Subtable pointer:
+  *         Bit 31-16:  index of start of subtable
+  *         Bit 15:     1 (HUFFDEC_EXCEPTIONAL)
+  *         Bit 14:     1 (HUFFDEC_SUBTABLE_POINTER)
+  *         Bit 11-8:   number of subtable bits
+  *         Bit 3-0:    number of main table bits
+  *
+  * These work the same way as the length entries and subtable pointer entries in
+  * the litlen decode table; see litlen_decode_results[] above.
+  */
+ static const u32 offset_decode_results[] = {
+ #define ENTRY(offset_base, num_extra_bits) \
+         (((u32)(offset_base) << 16) | (num_extra_bits))
+         ENTRY(1 , 0) , ENTRY(2 , 0) , ENTRY(3 , 0) , ENTRY(4 , 0) ,
+         ENTRY(5 , 1) , ENTRY(7 , 1) , ENTRY(9 , 2) , ENTRY(13 , 2) ,
+         ENTRY(17 , 3) , ENTRY(25 , 3) , ENTRY(33 , 4) , ENTRY(49 , 4) ,
+         ENTRY(65 , 5) , ENTRY(97 , 5) , ENTRY(129 , 6) , ENTRY(193 , 6) ,
+         ENTRY(257 , 7) , ENTRY(385 , 7) , ENTRY(513 , 8) , ENTRY(769 , 8) ,
+         ENTRY(1025 , 9) , ENTRY(1537 , 9) , ENTRY(2049 , 10) , ENTRY(3073 , 10) ,
+         ENTRY(4097 , 11) , ENTRY(6145 , 11) , ENTRY(8193 , 12) , ENTRY(12289 , 12) ,
+         ENTRY(16385 , 13) , ENTRY(24577 , 13) , ENTRY(24577 , 13) , ENTRY(24577 , 13) ,
+ #undef ENTRY
+ };
+
+ /*
+  * The main DEFLATE decompressor structure. Since libdeflate only supports
+  * full-buffer decompression, this structure doesn't store the entire
+  * decompression state, most of which is in stack variables. Instead, this
+  * struct just contains the decode tables and some temporary arrays used for
+  * building them, as these are too large to comfortably allocate on the stack.
+  *
+  * Storing the decode tables in the decompressor struct also allows the decode
+  * tables for the static codes to be reused whenever two static Huffman blocks
+  * are decoded without an intervening dynamic block, even across streams.
+  */
+ struct libdeflate_decompressor {
+
+         /*
+          * The arrays aren't all needed at the same time. 'precode_lens' and
+          * 'precode_decode_table' are unneeded after 'lens' has been filled.
+          * Furthermore, 'lens' need not be retained after building the litlen
+          * and offset decode tables. In fact, 'lens' can be in union with
+          * 'litlen_decode_table' provided that 'offset_decode_table' is separate
+          * and is built first.
+          */
+
+         union {
+                 u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS];
+
+                 struct {
+                         u8 lens[DEFLATE_NUM_LITLEN_SYMS +
+                                 DEFLATE_NUM_OFFSET_SYMS +
+                                 DEFLATE_MAX_LENS_OVERRUN];
+
+                         u32 precode_decode_table[PRECODE_ENOUGH];
+                 } l;
+
+                 u32 litlen_decode_table[LITLEN_ENOUGH];
+         } u;
+
+         u32 offset_decode_table[OFFSET_ENOUGH];
+
+         /* used only during build_decode_table() */
+         u16 sorted_syms[DEFLATE_MAX_NUM_SYMS];
+
+         bool static_codes_loaded;
+         unsigned litlen_tablebits;
+
+         /* The free() function for this struct, chosen at allocation time */
+         free_func_t free_func;
+ };
+
+ /*
+  * Build a table for fast decoding of symbols from a Huffman code. As input,
+  * this function takes the codeword length of each symbol which may be used in
+  * the code. As output, it produces a decode table for the canonical Huffman
+  * code described by the codeword lengths. The decode table is built with the
+  * assumption that it will be indexed with "bit-reversed" codewords, where the
+  * low-order bit is the first bit of the codeword. This format is used for all
+  * Huffman codes in DEFLATE.
+  *
+  * @decode_table
+  *     The array in which the decode table will be generated. This array must
+  *     have sufficient length; see the definition of the ENOUGH numbers.
+  * @lens
+  *     An array which provides, for each symbol, the length of the
+  *     corresponding codeword in bits, or 0 if the symbol is unused. This may
+  *     alias @decode_table, since nothing is written to @decode_table until all
+  *     @lens have been consumed. All codeword lengths are assumed to be <=
+  *     @max_codeword_len but are otherwise considered untrusted. If they do
+  *     not form a valid Huffman code, then the decode table is not built and
+  *     %false is returned.
+  * @num_syms
+  *     The number of symbols in the code, including all unused symbols.
+  * @decode_results
+  *     An array which gives the incomplete decode result for each symbol. The
+  *     needed values in this array will be combined with codeword lengths to
+  *     make the final decode table entries using make_decode_table_entry().
+  * @table_bits
+  *     The log base-2 of the number of main table entries to use.
+  *     If @table_bits_ret != NULL, then @table_bits is treated as a maximum
+  *     value and it will be decreased if a smaller table would be sufficient.
+  * @max_codeword_len
+  *     The maximum allowed codeword length for this Huffman code.
+  *     Must be <= DEFLATE_MAX_CODEWORD_LEN.
+  * @sorted_syms
+  *     A temporary array of length @num_syms.
+  * @table_bits_ret
+  *     If non-NULL, then the dynamic table_bits is enabled, and the actual
+  *     table_bits value will be returned here.
+  *
+  * Returns %true if successful; %false if the codeword lengths do not form a
+  * valid Huffman code.
+  */
+ static bool
+ build_decode_table(u32 decode_table[],
+                    const u8 lens[],
+                    const unsigned num_syms,
+                    const u32 decode_results[],
+                    unsigned table_bits,
+                    unsigned max_codeword_len,
+                    u16 *sorted_syms,
+                    unsigned *table_bits_ret)
+ {
+         unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1];
+         unsigned offsets[DEFLATE_MAX_CODEWORD_LEN + 1];
+         unsigned sym;             /* current symbol */
+         unsigned codeword;        /* current codeword, bit-reversed */
+         unsigned len;             /* current codeword length in bits */
+         unsigned count;           /* num codewords remaining with this length */
+         u32 codespace_used;       /* codespace used out of '2^max_codeword_len' */
+         unsigned cur_table_end;   /* end index of current table */
+         unsigned subtable_prefix; /* codeword prefix of current subtable */
+         unsigned subtable_start;  /* start index of current subtable */
+         unsigned subtable_bits;   /* log2 of current subtable length */
+
+         /* Count how many codewords have each length, including 0. */
+         for (len = 0; len <= max_codeword_len; len++)
+                 len_counts[len] = 0;
+         for (sym = 0; sym < num_syms; sym++)
+                 len_counts[lens[sym]]++;
+
+         /*
+          * Determine the actual maximum codeword length that was used, and
+          * decrease table_bits to it if allowed.
+          */
+         while (max_codeword_len > 1 && len_counts[max_codeword_len] == 0)
+                 max_codeword_len--;
+         if (table_bits_ret != NULL) {
+                 table_bits = MIN(table_bits, max_codeword_len);
+                 *table_bits_ret = table_bits;
+         }
+
+         /*
+          * Sort the symbols primarily by increasing codeword length and
+          * secondarily by increasing symbol value; or equivalently by their
+          * codewords in lexicographic order, since a canonical code is assumed.
+          *
+          * For efficiency, also compute 'codespace_used' in the same pass over
+          * 'len_counts[]' used to build 'offsets[]' for sorting.
+          */
+
+         /* Ensure that 'codespace_used' cannot overflow. */
+         STATIC_ASSERT(sizeof(codespace_used) == 4);
+         STATIC_ASSERT(UINT32_MAX / (1U << (DEFLATE_MAX_CODEWORD_LEN - 1)) >=
+                       DEFLATE_MAX_NUM_SYMS);
+
+         offsets[0] = 0;
+         offsets[1] = len_counts[0];
+         codespace_used = 0;
+         for (len = 1; len < max_codeword_len; len++) {
+                 offsets[len + 1] = offsets[len] + len_counts[len];
+                 codespace_used = (codespace_used << 1) + len_counts[len];
+         }
+         codespace_used = (codespace_used << 1) + len_counts[len];
+
+         for (sym = 0; sym < num_syms; sym++)
+                 sorted_syms[offsets[lens[sym]]++] = sym;
+
+         sorted_syms += offsets[0]; /* Skip unused symbols */
+
+         /* lens[] is done being used, so we can write to decode_table[] now. */
+
+         /*
+          * Check whether the lengths form a complete code (exactly fills the
+          * codespace), an incomplete code (doesn't fill the codespace), or an
+          * overfull code (overflows the codespace). A codeword of length 'n'
+          * uses proportion '1/(2^n)' of the codespace. An overfull code is
+          * nonsensical, so is considered invalid. An incomplete code is
+          * considered valid only in two specific cases; see below.
+          */
+
+         /* overfull code? */
+         if (unlikely(codespace_used > (1U << max_codeword_len)))
+                 return false;
+
+         /* incomplete code? */
+         if (unlikely(codespace_used < (1U << max_codeword_len))) {
+                 u32 entry;
+                 unsigned i;
+
+                 /*
+                  * The DEFLATE RFC explicitly allows the offset code to be
+                  * incomplete in two cases: a code containing just 1 codeword,
+                  * if that codeword has length 1; and a code containing no
+                  * codewords. Note: the list of offset codeword lengths is
+                  * always nonempty, but lengths of 0 don't count as codewords.
+                  *
+                  * The RFC doesn't say whether the same cases are allowed for
+                  * the litlen and pre codes. It's actually impossible for no
+                  * symbols to be used from these codes; however, it's
+                  * technically possible for only one symbol to be used. zlib
+                  * allows 1 codeword for the litlen code, but not the precode.
+                  * The RFC also doesn't say whether, when there is 1 codeword,
+                  * that codeword is '0' or '1'. zlib uses '0'.
+                  *
+                  * We accept what zlib accepts, plus a bit more. First, we
+                  * don't treat the precode more strictly than the litlen and
+                  * offset codes. There's no convincing reason to add a special
+                  * case for the precode here.
+                  *
+                  * Second, we just map each allowed incomplete code to a complete
+                  * code with only real symbols. To do this, we choose a symbol,
+                  * either the used symbol (for codes with 1 codeword) or an
+                  * arbitrary symbol (for empty codes), and give it both
+                  * codewords '0' and '1'. zlib instead uses a special ERROR
+                  * symbol in the part of the codespace the code doesn't use.
+                  * However, having an ERROR symbol reduces the performance of
+                  * the Huffman decoder, for no real benefit. Our approach also
+                  * avoids having to decide whether '0' or '1' is correct.
+                  *
+                  * Like zlib, we still reject all incomplete codes that contain
+                  * more than 1 codeword or a codeword length greater than 1.
+                  */
+                 if (codespace_used == 0) {
+                         sym = 0; /* arbitrary */
+                 } else {
+                         if (codespace_used != (1U << (max_codeword_len - 1)) ||
+                             len_counts[1] != 1)
+                                 return false;
+                         sym = sorted_syms[0];
+                 }
+                 entry = make_decode_table_entry(decode_results, sym, 1);
+                 for (i = 0; i < (1U << table_bits); i++)
+                         decode_table[i] = entry;
+                 return true;
+         }
+
+         /*
+          * The lengths form a complete code. Now, enumerate the codewords in
+          * lexicographic order and fill the decode table entries for each one.
+          *
+          * First, process all codewords with len <= table_bits. Each one gets
+          * '2^(table_bits-len)' direct entries in the table.
+          *
+          * Since DEFLATE uses bit-reversed codewords, these entries aren't
+          * consecutive but rather are spaced '2^len' entries apart. This makes
+          * filling them naively somewhat awkward and inefficient, since strided
+          * stores are less cache-friendly and preclude the use of word or
+          * vector-at-a-time stores to fill multiple entries per instruction.
+          *
+          * To optimize this, we incrementally double the table size. When
+          * processing codewords with length 'len', the table is treated as
+          * having only '2^len' entries, so each codeword uses just one entry.
+          * Then, each time 'len' is incremented, the table size is doubled and
+          * the first half is copied to the second half. This significantly
+          * improves performance over naively doing strided stores.
+          *
+          * Note that some entries copied for each table doubling may not have
+          * been initialized yet, but it doesn't matter since they're guaranteed
+          * to be initialized later (because the Huffman code is complete).
+          */
+         codeword = 0;
+         len = 1;
+         while ((count = len_counts[len]) == 0)
+                 len++;
+         cur_table_end = 1U << len;
+         while (len <= table_bits) {
+                 /* Process all 'count' codewords with length 'len' bits. */
+                 do {
+                         unsigned bit;
+
+                         /* Fill the first entry for the current codeword. */
+                         decode_table[codeword] =
+                                 make_decode_table_entry(decode_results,
+                                                         *sorted_syms++, len);
+
+                         if (codeword == cur_table_end - 1) {
+                                 /* Last codeword (all 1's) */
+                                 for (; len < table_bits; len++) {
+                                         memcpy(&decode_table[cur_table_end],
+                                                decode_table,
+                                                cur_table_end *
+                                                sizeof(decode_table[0]));
+                                         cur_table_end <<= 1;
+                                 }
+                                 return true;
+                         }
+                         /*
+                          * To advance to the lexicographically next codeword in
+                          * the canonical code, the codeword must be incremented,
+                          * then 0's must be appended to the codeword as needed
+                          * to match the next codeword's length.
+                          *
+                          * Since the codeword is bit-reversed, appending 0's is
+                          * a no-op. However, incrementing it is nontrivial. To
+                          * do so efficiently, use the 'bsr' instruction to find
+                          * the last (highest order) 0 bit in the codeword, set
+                          * it, and clear any later (higher order) 1 bits. But
+                          * 'bsr' actually finds the highest order 1 bit, so to
+                          * use it first flip all bits in the codeword by XOR'ing
+                          * it with (1U << len) - 1 == cur_table_end - 1.
+                          */
+                         bit = 1U << bsr32(codeword ^ (cur_table_end - 1));
+                         codeword &= bit - 1;
+                         codeword |= bit;
+                 } while (--count);
+
+                 /* Advance to the next codeword length. */
+                 do {
+                         if (++len <= table_bits) {
+                                 memcpy(&decode_table[cur_table_end],
+                                        decode_table,
+                                        cur_table_end * sizeof(decode_table[0]));
+                                 cur_table_end <<= 1;
+                         }
+                 } while ((count = len_counts[len]) == 0);
+         }
+
+         /* Process codewords with len > table_bits. These require subtables. */
+         cur_table_end = 1U << table_bits;
+         subtable_prefix = -1;
+         subtable_start = 0;
+         for (;;) {
+                 u32 entry;
+                 unsigned i;
+                 unsigned stride;
+                 unsigned bit;
+
+                 /*
+                  * Start a new subtable if the first 'table_bits' bits of the
+                  * codeword don't match the prefix of the current subtable.
+                  */
+                 if ((codeword & ((1U << table_bits) - 1)) != subtable_prefix) {
+                         subtable_prefix = (codeword & ((1U << table_bits) - 1));
+                         subtable_start = cur_table_end;
+                         /*
+                          * Calculate the subtable length. If the codeword has
+                          * length 'table_bits + n', then the subtable needs
+                          * '2^n' entries. But it may need more; if fewer than
+                          * '2^n' codewords of length 'table_bits + n' remain,
+                          * then the length will need to be incremented to bring
+                          * in longer codewords until the subtable can be
+                          * completely filled. Note that because the Huffman
+                          * code is complete, it will always be possible to fill
+                          * the subtable eventually.
+                          */
+                         subtable_bits = len - table_bits;
+                         codespace_used = count;
+                         while (codespace_used < (1U << subtable_bits)) {
+                                 subtable_bits++;
+                                 codespace_used = (codespace_used << 1) +
+                                         len_counts[table_bits + subtable_bits];
+                         }
+                         cur_table_end = subtable_start + (1U << subtable_bits);
+
+                         /*
+                          * Create the entry that points from the main table to
+                          * the subtable.
+                          */
+                         decode_table[subtable_prefix] =
+                                 ((u32)subtable_start << 16) |
+                                 HUFFDEC_EXCEPTIONAL |
+                                 HUFFDEC_SUBTABLE_POINTER |
+                                 (subtable_bits << 8) | table_bits;
+                 }
+
+                 /* Fill the subtable entries for the current codeword. */
+                 entry = make_decode_table_entry(decode_results, *sorted_syms++,
+                                                 len - table_bits);
+                 i = subtable_start + (codeword >> table_bits);
+                 stride = 1U << (len - table_bits);
+                 do {
+                         decode_table[i] = entry;
+                         i += stride;
+                 } while (i < cur_table_end);
+
+                 /* Advance to the next codeword. */
+                 if (codeword == (1U << len) - 1) /* last codeword (all 1's)? */
+                         return true;
+                 bit = 1U << bsr32(codeword ^ ((1U << len) - 1));
+                 codeword &= bit - 1;
+                 codeword |= bit;
+                 count--;
+                 while (count == 0)
+                         count = len_counts[++len];
+         }
+ }
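/*
 * Illustrative sketch (not part of the original file): the bit-reversed
 * codeword increment used twice above, as a standalone function with invented
 * names. Incrementing a bit-reversed 'len'-bit codeword means finding the
 * highest-order 0 bit, setting it, and clearing the 1's above it.
 */
#include <stdint.h>

static unsigned demo_bsr32(uint32_t v)  /* index of highest set bit; v != 0 */
{
        unsigned i = 0;

        while (v >>= 1)
                i++;
        return i;
}

static unsigned demo_next_codeword(unsigned codeword, unsigned len)
{
        /* XOR with all-ones flips the codeword so 'bsr' finds the highest 0. */
        unsigned bit = 1U << demo_bsr32(codeword ^ ((1U << len) - 1));

        return (codeword & (bit - 1)) | bit;
}

/* e.g. with len == 3, the sequence is 0 -> 4 -> 2 -> 6 -> 1 -> 5 -> 3 -> 7 */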
+
+ /* Build the decode table for the precode. */
+ static bool
+ build_precode_decode_table(struct libdeflate_decompressor *d)
+ {
+         /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */
+         STATIC_ASSERT(PRECODE_TABLEBITS == 7 && PRECODE_ENOUGH == 128);
+
+         STATIC_ASSERT(ARRAY_LEN(precode_decode_results) ==
+                       DEFLATE_NUM_PRECODE_SYMS);
+
+         return build_decode_table(d->u.l.precode_decode_table,
+                                   d->u.precode_lens,
+                                   DEFLATE_NUM_PRECODE_SYMS,
+                                   precode_decode_results,
+                                   PRECODE_TABLEBITS,
+                                   DEFLATE_MAX_PRE_CODEWORD_LEN,
+                                   d->sorted_syms,
+                                   NULL);
+ }
+
+ /* Build the decode table for the literal/length code. */
+ static bool
+ build_litlen_decode_table(struct libdeflate_decompressor *d,
+                           unsigned num_litlen_syms, unsigned num_offset_syms)
+ {
+         /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */
+         STATIC_ASSERT(LITLEN_TABLEBITS == 11 && LITLEN_ENOUGH == 2342);
+
+         STATIC_ASSERT(ARRAY_LEN(litlen_decode_results) ==
+                       DEFLATE_NUM_LITLEN_SYMS);
+
+         return build_decode_table(d->u.litlen_decode_table,
+                                   d->u.l.lens,
+                                   num_litlen_syms,
+                                   litlen_decode_results,
+                                   LITLEN_TABLEBITS,
+                                   DEFLATE_MAX_LITLEN_CODEWORD_LEN,
+                                   d->sorted_syms,
+                                   &d->litlen_tablebits);
+ }
+
+ /* Build the decode table for the offset code. */
+ static bool
+ build_offset_decode_table(struct libdeflate_decompressor *d,
+                           unsigned num_litlen_syms, unsigned num_offset_syms)
+ {
+         /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */
+         STATIC_ASSERT(OFFSET_TABLEBITS == 8 && OFFSET_ENOUGH == 402);
+
+         STATIC_ASSERT(ARRAY_LEN(offset_decode_results) ==
+                       DEFLATE_NUM_OFFSET_SYMS);
+
+         return build_decode_table(d->offset_decode_table,
+                                   d->u.l.lens + num_litlen_syms,
+                                   num_offset_syms,
+                                   offset_decode_results,
+                                   OFFSET_TABLEBITS,
+                                   DEFLATE_MAX_OFFSET_CODEWORD_LEN,
+                                   d->sorted_syms,
+                                   NULL);
+ }
+
+ /*****************************************************************************
+  *                        Main decompression routine                         *
+  *****************************************************************************/
+
+ typedef enum libdeflate_result (*decompress_func_t)
+         (struct libdeflate_decompressor * restrict d,
+          const void * restrict in, size_t in_nbytes,
+          void * restrict out, size_t out_nbytes_avail,
+          size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret);
+
+ #define FUNCNAME deflate_decompress_default
+ #undef ATTRIBUTES
+ #undef EXTRACT_VARBITS
+ #undef EXTRACT_VARBITS8
+ #include "decompress_template.h"
+
+ /* Include architecture-specific implementation(s) if available. */
+ #undef DEFAULT_IMPL
+ #undef arch_select_decompress_func
+ #if defined(ARCH_X86_32) || defined(ARCH_X86_64)
+ # include "x86/decompress_impl.h"
+ #endif
+
+ #ifndef DEFAULT_IMPL
+ # define DEFAULT_IMPL deflate_decompress_default
+ #endif
+
+ #ifdef arch_select_decompress_func
+ static enum libdeflate_result
+ dispatch_decomp(struct libdeflate_decompressor *d,
+                 const void *in, size_t in_nbytes,
+                 void *out, size_t out_nbytes_avail,
+                 size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret);
+
+ static volatile decompress_func_t decompress_impl = dispatch_decomp;
+
+ /* Choose the best implementation at runtime. */
+ static enum libdeflate_result
+ dispatch_decomp(struct libdeflate_decompressor *d,
+                 const void *in, size_t in_nbytes,
+                 void *out, size_t out_nbytes_avail,
+                 size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret)
+ {
+         decompress_func_t f = arch_select_decompress_func();
+
+         if (f == NULL)
+                 f = DEFAULT_IMPL;
+
+         decompress_impl = f;
+         return f(d, in, in_nbytes, out, out_nbytes_avail,
+                  actual_in_nbytes_ret, actual_out_nbytes_ret);
+ }
+ #else
+ /* The best implementation is statically known, so call it directly. */
+ # define decompress_impl DEFAULT_IMPL
+ #endif
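/*
 * Illustrative sketch (not part of the original file): the self-replacing
 * function pointer pattern used above, reduced to a minimal generic form.
 * The first call lands in the dispatcher, which picks an implementation once,
 * overwrites the pointer, and calls through; later calls skip the probe.
 * All names are invented.
 */
typedef int (*demo_func_t)(int);

static int demo_dispatch(int x);
static volatile demo_func_t demo_impl = demo_dispatch;

static int demo_generic(int x) { return x + 1; }

static int demo_dispatch(int x)
{
        /* A real dispatcher would probe CPU features here. */
        demo_impl = demo_generic;
        return demo_generic(x);
}
/* Callers always go through demo_impl(x). */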
+
+ /*
+  * This is the main DEFLATE decompression routine. See libdeflate.h for the
+  * documentation.
+  *
+  * Note that the real code is in decompress_template.h. The part here just
+  * handles calling the appropriate implementation depending on the CPU features
+  * at runtime.
+  */
+ LIBDEFLATEAPI enum libdeflate_result
+ libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *d,
+                                  const void *in, size_t in_nbytes,
+                                  void *out, size_t out_nbytes_avail,
+                                  size_t *actual_in_nbytes_ret,
+                                  size_t *actual_out_nbytes_ret)
+ {
+         return decompress_impl(d, in, in_nbytes, out, out_nbytes_avail,
+                                actual_in_nbytes_ret, actual_out_nbytes_ret);
+ }
+
+ LIBDEFLATEAPI enum libdeflate_result
+ libdeflate_deflate_decompress(struct libdeflate_decompressor *d,
+                               const void *in, size_t in_nbytes,
+                               void *out, size_t out_nbytes_avail,
+                               size_t *actual_out_nbytes_ret)
+ {
+         return libdeflate_deflate_decompress_ex(d, in, in_nbytes,
+                                                 out, out_nbytes_avail,
+                                                 NULL, actual_out_nbytes_ret);
+ }
+
+ LIBDEFLATEAPI struct libdeflate_decompressor *
+ libdeflate_alloc_decompressor_ex(const struct libdeflate_options *options)
+ {
+         struct libdeflate_decompressor *d;
+
+         /*
+          * Note: if more fields are added to libdeflate_options, this code will
+          * need to be updated to support both the old and new structs.
+          */
+         if (options->sizeof_options != sizeof(*options))
+                 return NULL;
+
+         d = (options->malloc_func ? options->malloc_func :
+              libdeflate_default_malloc_func)(sizeof(*d));
+         if (d == NULL)
+                 return NULL;
+         /*
+          * Note that only certain parts of the decompressor actually must be
+          * initialized here:
+          *
+          * - 'static_codes_loaded' must be initialized to false.
+          *
+          * - The first half of the main portion of each decode table must be
+          *   initialized to any value, to avoid reading from uninitialized
+          *   memory during table expansion in build_decode_table(). (Although,
+          *   this is really just to avoid warnings with dynamic tools like
+          *   valgrind, since build_decode_table() is guaranteed to initialize
+          *   all entries eventually anyway.)
+          *
+          * - 'free_func' must be set.
+          *
+          * But for simplicity, we currently just zero the whole decompressor.
+          */
+         memset(d, 0, sizeof(*d));
+         d->free_func = options->free_func ?
+                 options->free_func : libdeflate_default_free_func;
+         return d;
+ }
+
+ LIBDEFLATEAPI struct libdeflate_decompressor *
+ libdeflate_alloc_decompressor(void)
+ {
+         static const struct libdeflate_options defaults = {
+                 .sizeof_options = sizeof(defaults),
+         };
+         return libdeflate_alloc_decompressor_ex(&defaults);
+ }
+
+ LIBDEFLATEAPI void
+ libdeflate_free_decompressor(struct libdeflate_decompressor *d)
+ {
+         if (d)
+                 d->free_func(d);
+ }
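/*
 * Illustrative sketch (not part of this file): typical use of the public API
 * defined above, for a caller that already knows an upper bound on the
 * decompressed size. 'demo_inflate' is an invented helper name.
 */
#include <stddef.h>
#include "libdeflate.h"

static int demo_inflate(const void *in, size_t in_size,
                        void *out, size_t out_capacity, size_t *out_size)
{
        struct libdeflate_decompressor *d = libdeflate_alloc_decompressor();
        enum libdeflate_result res;

        if (d == NULL)
                return -1;
        res = libdeflate_deflate_decompress(d, in, in_size,
                                            out, out_capacity, out_size);
        libdeflate_free_decompressor(d);
        return res == LIBDEFLATE_SUCCESS ? 0 : -1;
}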