chd 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +30 -0
- data/chd.gemspec +29 -0
- data/ext/chd.c +1008 -0
- data/ext/extconf.rb +60 -0
- data/lib/chd/cd.rb +272 -0
- data/lib/chd/metadata.rb +196 -0
- data/lib/chd/version.rb +4 -0
- data/lib/chd.rb +21 -0
- data/libchdr/CMakeLists.txt +104 -0
- data/libchdr/LICENSE.txt +24 -0
- data/libchdr/README.md +7 -0
- data/libchdr/deps/lzma-19.00/CMakeLists.txt +33 -0
- data/libchdr/deps/lzma-19.00/LICENSE +3 -0
- data/libchdr/deps/lzma-19.00/include/7zTypes.h +375 -0
- data/libchdr/deps/lzma-19.00/include/Alloc.h +51 -0
- data/libchdr/deps/lzma-19.00/include/Bra.h +64 -0
- data/libchdr/deps/lzma-19.00/include/Compiler.h +33 -0
- data/libchdr/deps/lzma-19.00/include/CpuArch.h +336 -0
- data/libchdr/deps/lzma-19.00/include/Delta.h +19 -0
- data/libchdr/deps/lzma-19.00/include/LzFind.h +121 -0
- data/libchdr/deps/lzma-19.00/include/LzHash.h +57 -0
- data/libchdr/deps/lzma-19.00/include/Lzma86.h +111 -0
- data/libchdr/deps/lzma-19.00/include/LzmaDec.h +234 -0
- data/libchdr/deps/lzma-19.00/include/LzmaEnc.h +76 -0
- data/libchdr/deps/lzma-19.00/include/LzmaLib.h +131 -0
- data/libchdr/deps/lzma-19.00/include/Precomp.h +10 -0
- data/libchdr/deps/lzma-19.00/include/Sort.h +18 -0
- data/libchdr/deps/lzma-19.00/lzma-history.txt +446 -0
- data/libchdr/deps/lzma-19.00/lzma.txt +328 -0
- data/libchdr/deps/lzma-19.00/lzma.vcxproj +543 -0
- data/libchdr/deps/lzma-19.00/lzma.vcxproj.filters +17 -0
- data/libchdr/deps/lzma-19.00/src/Alloc.c +455 -0
- data/libchdr/deps/lzma-19.00/src/Bra86.c +82 -0
- data/libchdr/deps/lzma-19.00/src/BraIA64.c +53 -0
- data/libchdr/deps/lzma-19.00/src/CpuArch.c +218 -0
- data/libchdr/deps/lzma-19.00/src/Delta.c +64 -0
- data/libchdr/deps/lzma-19.00/src/LzFind.c +1127 -0
- data/libchdr/deps/lzma-19.00/src/Lzma86Dec.c +54 -0
- data/libchdr/deps/lzma-19.00/src/LzmaDec.c +1185 -0
- data/libchdr/deps/lzma-19.00/src/LzmaEnc.c +1330 -0
- data/libchdr/deps/lzma-19.00/src/Sort.c +141 -0
- data/libchdr/deps/zlib-1.2.11/CMakeLists.txt +29 -0
- data/libchdr/deps/zlib-1.2.11/ChangeLog +1515 -0
- data/libchdr/deps/zlib-1.2.11/FAQ +368 -0
- data/libchdr/deps/zlib-1.2.11/INDEX +68 -0
- data/libchdr/deps/zlib-1.2.11/Makefile +5 -0
- data/libchdr/deps/zlib-1.2.11/Makefile.in +410 -0
- data/libchdr/deps/zlib-1.2.11/README +115 -0
- data/libchdr/deps/zlib-1.2.11/adler32.c +186 -0
- data/libchdr/deps/zlib-1.2.11/compress.c +86 -0
- data/libchdr/deps/zlib-1.2.11/configure +921 -0
- data/libchdr/deps/zlib-1.2.11/crc32.c +442 -0
- data/libchdr/deps/zlib-1.2.11/crc32.h +441 -0
- data/libchdr/deps/zlib-1.2.11/deflate.c +2163 -0
- data/libchdr/deps/zlib-1.2.11/deflate.h +349 -0
- data/libchdr/deps/zlib-1.2.11/doc/algorithm.txt +209 -0
- data/libchdr/deps/zlib-1.2.11/doc/rfc1950.txt +619 -0
- data/libchdr/deps/zlib-1.2.11/doc/rfc1951.txt +955 -0
- data/libchdr/deps/zlib-1.2.11/doc/rfc1952.txt +675 -0
- data/libchdr/deps/zlib-1.2.11/doc/txtvsbin.txt +107 -0
- data/libchdr/deps/zlib-1.2.11/gzclose.c +25 -0
- data/libchdr/deps/zlib-1.2.11/gzguts.h +218 -0
- data/libchdr/deps/zlib-1.2.11/gzlib.c +637 -0
- data/libchdr/deps/zlib-1.2.11/gzread.c +654 -0
- data/libchdr/deps/zlib-1.2.11/gzwrite.c +665 -0
- data/libchdr/deps/zlib-1.2.11/infback.c +640 -0
- data/libchdr/deps/zlib-1.2.11/inffast.c +323 -0
- data/libchdr/deps/zlib-1.2.11/inffast.h +11 -0
- data/libchdr/deps/zlib-1.2.11/inffixed.h +94 -0
- data/libchdr/deps/zlib-1.2.11/inflate.c +1561 -0
- data/libchdr/deps/zlib-1.2.11/inflate.h +125 -0
- data/libchdr/deps/zlib-1.2.11/inftrees.c +304 -0
- data/libchdr/deps/zlib-1.2.11/inftrees.h +62 -0
- data/libchdr/deps/zlib-1.2.11/make_vms.com +867 -0
- data/libchdr/deps/zlib-1.2.11/treebuild.xml +116 -0
- data/libchdr/deps/zlib-1.2.11/trees.c +1203 -0
- data/libchdr/deps/zlib-1.2.11/trees.h +128 -0
- data/libchdr/deps/zlib-1.2.11/uncompr.c +93 -0
- data/libchdr/deps/zlib-1.2.11/zconf.h +534 -0
- data/libchdr/deps/zlib-1.2.11/zconf.h.cmakein +536 -0
- data/libchdr/deps/zlib-1.2.11/zconf.h.in +534 -0
- data/libchdr/deps/zlib-1.2.11/zlib.3 +149 -0
- data/libchdr/deps/zlib-1.2.11/zlib.3.pdf +0 -0
- data/libchdr/deps/zlib-1.2.11/zlib.h +1912 -0
- data/libchdr/deps/zlib-1.2.11/zlib.map +94 -0
- data/libchdr/deps/zlib-1.2.11/zlib.pc.cmakein +13 -0
- data/libchdr/deps/zlib-1.2.11/zlib.pc.in +13 -0
- data/libchdr/deps/zlib-1.2.11/zlib2ansi +152 -0
- data/libchdr/deps/zlib-1.2.11/zutil.c +325 -0
- data/libchdr/deps/zlib-1.2.11/zutil.h +271 -0
- data/libchdr/include/dr_libs/dr_flac.h +12280 -0
- data/libchdr/include/libchdr/bitstream.h +43 -0
- data/libchdr/include/libchdr/cdrom.h +110 -0
- data/libchdr/include/libchdr/chd.h +427 -0
- data/libchdr/include/libchdr/chdconfig.h +10 -0
- data/libchdr/include/libchdr/coretypes.h +60 -0
- data/libchdr/include/libchdr/flac.h +50 -0
- data/libchdr/include/libchdr/huffman.h +90 -0
- data/libchdr/pkg-config.pc.in +10 -0
- data/libchdr/src/libchdr_bitstream.c +125 -0
- data/libchdr/src/libchdr_cdrom.c +415 -0
- data/libchdr/src/libchdr_chd.c +2744 -0
- data/libchdr/src/libchdr_flac.c +302 -0
- data/libchdr/src/libchdr_huffman.c +545 -0
- data/libchdr/src/link.T +5 -0
- data/libchdr/tests/CMakeLists.txt +2 -0
- data/libchdr/tests/benchmark.c +52 -0
- metadata +183 -0
@@ -0,0 +1,349 @@
|
|
1
|
+
/* deflate.h -- internal compression state
|
2
|
+
* Copyright (C) 1995-2016 Jean-loup Gailly
|
3
|
+
* For conditions of distribution and use, see copyright notice in zlib.h
|
4
|
+
*/
|
5
|
+
|
6
|
+
/* WARNING: this file should *not* be used by applications. It is
|
7
|
+
part of the implementation of the compression library and is
|
8
|
+
subject to change. Applications should only use zlib.h.
|
9
|
+
*/
|
10
|
+
|
11
|
+
/* @(#) $Id$ */
|
12
|
+
|
13
|
+
#ifndef DEFLATE_H
|
14
|
+
#define DEFLATE_H
|
15
|
+
|
16
|
+
#include "zutil.h"
|
17
|
+
|
18
|
+
/* define NO_GZIP when compiling if you want to disable gzip header and
|
19
|
+
trailer creation by deflate(). NO_GZIP would be used to avoid linking in
|
20
|
+
the crc code when it is not needed. For shared libraries, gzip encoding
|
21
|
+
should be left enabled. */
|
22
|
+
#ifndef NO_GZIP
|
23
|
+
# define GZIP
|
24
|
+
#endif
|
25
|
+
|
26
|
+
/* ===========================================================================
|
27
|
+
* Internal compression state.
|
28
|
+
*/
|
29
|
+
|
30
|
+
#define LENGTH_CODES 29
|
31
|
+
/* number of length codes, not counting the special END_BLOCK code */
|
32
|
+
|
33
|
+
#define LITERALS 256
|
34
|
+
/* number of literal bytes 0..255 */
|
35
|
+
|
36
|
+
#define L_CODES (LITERALS+1+LENGTH_CODES)
|
37
|
+
/* number of Literal or Length codes, including the END_BLOCK code */
|
38
|
+
|
39
|
+
#define D_CODES 30
|
40
|
+
/* number of distance codes */
|
41
|
+
|
42
|
+
#define BL_CODES 19
|
43
|
+
/* number of codes used to transfer the bit lengths */
|
44
|
+
|
45
|
+
#define HEAP_SIZE (2*L_CODES+1)
|
46
|
+
/* maximum heap size */
|
47
|
+
|
48
|
+
#define MAX_BITS 15
|
49
|
+
/* All codes must not exceed MAX_BITS bits */
|
50
|
+
|
51
|
+
#define Buf_size 16
|
52
|
+
/* size of bit buffer in bi_buf */
|
53
|
+
|
54
|
+
#define INIT_STATE 42 /* zlib header -> BUSY_STATE */
|
55
|
+
#ifdef GZIP
|
56
|
+
# define GZIP_STATE 57 /* gzip header -> BUSY_STATE | EXTRA_STATE */
|
57
|
+
#endif
|
58
|
+
#define EXTRA_STATE 69 /* gzip extra block -> NAME_STATE */
|
59
|
+
#define NAME_STATE 73 /* gzip file name -> COMMENT_STATE */
|
60
|
+
#define COMMENT_STATE 91 /* gzip comment -> HCRC_STATE */
|
61
|
+
#define HCRC_STATE 103 /* gzip header CRC -> BUSY_STATE */
|
62
|
+
#define BUSY_STATE 113 /* deflate -> FINISH_STATE */
|
63
|
+
#define FINISH_STATE 666 /* stream complete */
|
64
|
+
/* Stream status */
|
65
|
+
|
66
|
+
|
67
|
+
/* Data structure describing a single value and its code string. */
|
68
|
+
typedef struct ct_data_s {
|
69
|
+
union {
|
70
|
+
ush freq; /* frequency count */
|
71
|
+
ush code; /* bit string */
|
72
|
+
} fc;
|
73
|
+
union {
|
74
|
+
ush dad; /* father node in Huffman tree */
|
75
|
+
ush len; /* length of bit string */
|
76
|
+
} dl;
|
77
|
+
} FAR ct_data;
|
78
|
+
|
79
|
+
#define Freq fc.freq
|
80
|
+
#define Code fc.code
|
81
|
+
#define Dad dl.dad
|
82
|
+
#define Len dl.len
|
83
|
+
|
84
|
+
typedef struct static_tree_desc_s static_tree_desc;
|
85
|
+
|
86
|
+
typedef struct tree_desc_s {
|
87
|
+
ct_data *dyn_tree; /* the dynamic tree */
|
88
|
+
int max_code; /* largest code with non zero frequency */
|
89
|
+
const static_tree_desc *stat_desc; /* the corresponding static tree */
|
90
|
+
} FAR tree_desc;
|
91
|
+
|
92
|
+
typedef ush Pos;
|
93
|
+
typedef Pos FAR Posf;
|
94
|
+
typedef unsigned IPos;
|
95
|
+
|
96
|
+
/* A Pos is an index in the character window. We use short instead of int to
|
97
|
+
* save space in the various tables. IPos is used only for parameter passing.
|
98
|
+
*/
|
99
|
+
|
100
|
+
typedef struct internal_state {
|
101
|
+
z_streamp strm; /* pointer back to this zlib stream */
|
102
|
+
int status; /* as the name implies */
|
103
|
+
Bytef *pending_buf; /* output still pending */
|
104
|
+
ulg pending_buf_size; /* size of pending_buf */
|
105
|
+
Bytef *pending_out; /* next pending byte to output to the stream */
|
106
|
+
ulg pending; /* nb of bytes in the pending buffer */
|
107
|
+
int wrap; /* bit 0 true for zlib, bit 1 true for gzip */
|
108
|
+
gz_headerp gzhead; /* gzip header information to write */
|
109
|
+
ulg gzindex; /* where in extra, name, or comment */
|
110
|
+
Byte method; /* can only be DEFLATED */
|
111
|
+
int last_flush; /* value of flush param for previous deflate call */
|
112
|
+
|
113
|
+
/* used by deflate.c: */
|
114
|
+
|
115
|
+
uInt w_size; /* LZ77 window size (32K by default) */
|
116
|
+
uInt w_bits; /* log2(w_size) (8..16) */
|
117
|
+
uInt w_mask; /* w_size - 1 */
|
118
|
+
|
119
|
+
Bytef *window;
|
120
|
+
/* Sliding window. Input bytes are read into the second half of the window,
|
121
|
+
* and move to the first half later to keep a dictionary of at least wSize
|
122
|
+
* bytes. With this organization, matches are limited to a distance of
|
123
|
+
* wSize-MAX_MATCH bytes, but this ensures that IO is always
|
124
|
+
* performed with a length multiple of the block size. Also, it limits
|
125
|
+
* the window size to 64K, which is quite useful on MSDOS.
|
126
|
+
* To do: use the user input buffer as sliding window.
|
127
|
+
*/
|
128
|
+
|
129
|
+
ulg window_size;
|
130
|
+
/* Actual size of window: 2*wSize, except when the user input buffer
|
131
|
+
* is directly used as sliding window.
|
132
|
+
*/
|
133
|
+
|
134
|
+
Posf *prev;
|
135
|
+
/* Link to older string with same hash index. To limit the size of this
|
136
|
+
* array to 64K, this link is maintained only for the last 32K strings.
|
137
|
+
* An index in this array is thus a window index modulo 32K.
|
138
|
+
*/
|
139
|
+
|
140
|
+
Posf *head; /* Heads of the hash chains or NIL. */
|
141
|
+
|
142
|
+
uInt ins_h; /* hash index of string to be inserted */
|
143
|
+
uInt hash_size; /* number of elements in hash table */
|
144
|
+
uInt hash_bits; /* log2(hash_size) */
|
145
|
+
uInt hash_mask; /* hash_size-1 */
|
146
|
+
|
147
|
+
uInt hash_shift;
|
148
|
+
/* Number of bits by which ins_h must be shifted at each input
|
149
|
+
* step. It must be such that after MIN_MATCH steps, the oldest
|
150
|
+
* byte no longer takes part in the hash key, that is:
|
151
|
+
* hash_shift * MIN_MATCH >= hash_bits
|
152
|
+
*/
|
153
|
+
|
154
|
+
long block_start;
|
155
|
+
/* Window position at the beginning of the current output block. Gets
|
156
|
+
* negative when the window is moved backwards.
|
157
|
+
*/
|
158
|
+
|
159
|
+
uInt match_length; /* length of best match */
|
160
|
+
IPos prev_match; /* previous match */
|
161
|
+
int match_available; /* set if previous match exists */
|
162
|
+
uInt strstart; /* start of string to insert */
|
163
|
+
uInt match_start; /* start of matching string */
|
164
|
+
uInt lookahead; /* number of valid bytes ahead in window */
|
165
|
+
|
166
|
+
uInt prev_length;
|
167
|
+
/* Length of the best match at previous step. Matches not greater than this
|
168
|
+
* are discarded. This is used in the lazy match evaluation.
|
169
|
+
*/
|
170
|
+
|
171
|
+
uInt max_chain_length;
|
172
|
+
/* To speed up deflation, hash chains are never searched beyond this
|
173
|
+
* length. A higher limit improves compression ratio but degrades the
|
174
|
+
* speed.
|
175
|
+
*/
|
176
|
+
|
177
|
+
uInt max_lazy_match;
|
178
|
+
/* Attempt to find a better match only when the current match is strictly
|
179
|
+
* smaller than this value. This mechanism is used only for compression
|
180
|
+
* levels >= 4.
|
181
|
+
*/
|
182
|
+
# define max_insert_length max_lazy_match
|
183
|
+
/* Insert new strings in the hash table only if the match length is not
|
184
|
+
* greater than this length. This saves time but degrades compression.
|
185
|
+
* max_insert_length is used only for compression levels <= 3.
|
186
|
+
*/
|
187
|
+
|
188
|
+
int level; /* compression level (1..9) */
|
189
|
+
int strategy; /* favor or force Huffman coding*/
|
190
|
+
|
191
|
+
uInt good_match;
|
192
|
+
/* Use a faster search when the previous match is longer than this */
|
193
|
+
|
194
|
+
int nice_match; /* Stop searching when current match exceeds this */
|
195
|
+
|
196
|
+
/* used by trees.c: */
|
197
|
+
/* Didn't use ct_data typedef below to suppress compiler warning */
|
198
|
+
struct ct_data_s dyn_ltree[HEAP_SIZE]; /* literal and length tree */
|
199
|
+
struct ct_data_s dyn_dtree[2*D_CODES+1]; /* distance tree */
|
200
|
+
struct ct_data_s bl_tree[2*BL_CODES+1]; /* Huffman tree for bit lengths */
|
201
|
+
|
202
|
+
struct tree_desc_s l_desc; /* desc. for literal tree */
|
203
|
+
struct tree_desc_s d_desc; /* desc. for distance tree */
|
204
|
+
struct tree_desc_s bl_desc; /* desc. for bit length tree */
|
205
|
+
|
206
|
+
ush bl_count[MAX_BITS+1];
|
207
|
+
/* number of codes at each bit length for an optimal tree */
|
208
|
+
|
209
|
+
int heap[2*L_CODES+1]; /* heap used to build the Huffman trees */
|
210
|
+
int heap_len; /* number of elements in the heap */
|
211
|
+
int heap_max; /* element of largest frequency */
|
212
|
+
/* The sons of heap[n] are heap[2*n] and heap[2*n+1]. heap[0] is not used.
|
213
|
+
* The same heap array is used to build all trees.
|
214
|
+
*/
|
215
|
+
|
216
|
+
uch depth[2*L_CODES+1];
|
217
|
+
/* Depth of each subtree used as tie breaker for trees of equal frequency
|
218
|
+
*/
|
219
|
+
|
220
|
+
uchf *l_buf; /* buffer for literals or lengths */
|
221
|
+
|
222
|
+
uInt lit_bufsize;
|
223
|
+
/* Size of match buffer for literals/lengths. There are 4 reasons for
|
224
|
+
* limiting lit_bufsize to 64K:
|
225
|
+
* - frequencies can be kept in 16 bit counters
|
226
|
+
* - if compression is not successful for the first block, all input
|
227
|
+
* data is still in the window so we can still emit a stored block even
|
228
|
+
* when input comes from standard input. (This can also be done for
|
229
|
+
* all blocks if lit_bufsize is not greater than 32K.)
|
230
|
+
* - if compression is not successful for a file smaller than 64K, we can
|
231
|
+
* even emit a stored file instead of a stored block (saving 5 bytes).
|
232
|
+
* This is applicable only for zip (not gzip or zlib).
|
233
|
+
* - creating new Huffman trees less frequently may not provide fast
|
234
|
+
* adaptation to changes in the input data statistics. (Take for
|
235
|
+
* example a binary file with poorly compressible code followed by
|
236
|
+
* a highly compressible string table.) Smaller buffer sizes give
|
237
|
+
* fast adaptation but have of course the overhead of transmitting
|
238
|
+
* trees more frequently.
|
239
|
+
* - I can't count above 4
|
240
|
+
*/
|
241
|
+
|
242
|
+
uInt last_lit; /* running index in l_buf */
|
243
|
+
|
244
|
+
ushf *d_buf;
|
245
|
+
/* Buffer for distances. To simplify the code, d_buf and l_buf have
|
246
|
+
* the same number of elements. To use different lengths, an extra flag
|
247
|
+
* array would be necessary.
|
248
|
+
*/
|
249
|
+
|
250
|
+
ulg opt_len; /* bit length of current block with optimal trees */
|
251
|
+
ulg static_len; /* bit length of current block with static trees */
|
252
|
+
uInt matches; /* number of string matches in current block */
|
253
|
+
uInt insert; /* bytes at end of window left to insert */
|
254
|
+
|
255
|
+
#ifdef ZLIB_DEBUG
|
256
|
+
ulg compressed_len; /* total bit length of compressed file mod 2^32 */
|
257
|
+
ulg bits_sent; /* bit length of compressed data sent mod 2^32 */
|
258
|
+
#endif
|
259
|
+
|
260
|
+
ush bi_buf;
|
261
|
+
/* Output buffer. bits are inserted starting at the bottom (least
|
262
|
+
* significant bits).
|
263
|
+
*/
|
264
|
+
int bi_valid;
|
265
|
+
/* Number of valid bits in bi_buf. All bits above the last valid bit
|
266
|
+
* are always zero.
|
267
|
+
*/
|
268
|
+
|
269
|
+
ulg high_water;
|
270
|
+
/* High water mark offset in window for initialized bytes -- bytes above
|
271
|
+
* this are set to zero in order to avoid memory check warnings when
|
272
|
+
* longest match routines access bytes past the input. This is then
|
273
|
+
* updated to the new high water mark.
|
274
|
+
*/
|
275
|
+
|
276
|
+
} FAR deflate_state;
|
277
|
+
|
278
|
+
/* Output a byte on the stream.
|
279
|
+
* IN assertion: there is enough room in pending_buf.
|
280
|
+
*/
|
281
|
+
#define put_byte(s, c) {s->pending_buf[s->pending++] = (Bytef)(c);}
|
282
|
+
|
283
|
+
|
284
|
+
#define MIN_LOOKAHEAD (MAX_MATCH+MIN_MATCH+1)
|
285
|
+
/* Minimum amount of lookahead, except at the end of the input file.
|
286
|
+
* See deflate.c for comments about the MIN_MATCH+1.
|
287
|
+
*/
|
288
|
+
|
289
|
+
#define MAX_DIST(s) ((s)->w_size-MIN_LOOKAHEAD)
|
290
|
+
/* In order to simplify the code, particularly on 16 bit machines, match
|
291
|
+
* distances are limited to MAX_DIST instead of WSIZE.
|
292
|
+
*/
|
293
|
+
|
294
|
+
#define WIN_INIT MAX_MATCH
|
295
|
+
/* Number of bytes after end of data in window to initialize in order to avoid
|
296
|
+
memory checker errors from longest match routines */
|
297
|
+
|
298
|
+
/* in trees.c */
|
299
|
+
void ZLIB_INTERNAL _tr_init OF((deflate_state *s));
|
300
|
+
int ZLIB_INTERNAL _tr_tally OF((deflate_state *s, unsigned dist, unsigned lc));
|
301
|
+
void ZLIB_INTERNAL _tr_flush_block OF((deflate_state *s, charf *buf,
|
302
|
+
ulg stored_len, int last));
|
303
|
+
void ZLIB_INTERNAL _tr_flush_bits OF((deflate_state *s));
|
304
|
+
void ZLIB_INTERNAL _tr_align OF((deflate_state *s));
|
305
|
+
void ZLIB_INTERNAL _tr_stored_block OF((deflate_state *s, charf *buf,
|
306
|
+
ulg stored_len, int last));
|
307
|
+
|
308
|
+
#define d_code(dist) \
|
309
|
+
((dist) < 256 ? _dist_code[dist] : _dist_code[256+((dist)>>7)])
|
310
|
+
/* Mapping from a distance to a distance code. dist is the distance - 1 and
|
311
|
+
* must not have side effects. _dist_code[256] and _dist_code[257] are never
|
312
|
+
* used.
|
313
|
+
*/
|
314
|
+
|
315
|
+
#ifndef ZLIB_DEBUG
|
316
|
+
/* Inline versions of _tr_tally for speed: */
|
317
|
+
|
318
|
+
#if defined(GEN_TREES_H) || !defined(STDC)
|
319
|
+
extern uch ZLIB_INTERNAL _length_code[];
|
320
|
+
extern uch ZLIB_INTERNAL _dist_code[];
|
321
|
+
#else
|
322
|
+
extern const uch ZLIB_INTERNAL _length_code[];
|
323
|
+
extern const uch ZLIB_INTERNAL _dist_code[];
|
324
|
+
#endif
|
325
|
+
|
326
|
+
# define _tr_tally_lit(s, c, flush) \
|
327
|
+
{ uch cc = (c); \
|
328
|
+
s->d_buf[s->last_lit] = 0; \
|
329
|
+
s->l_buf[s->last_lit++] = cc; \
|
330
|
+
s->dyn_ltree[cc].Freq++; \
|
331
|
+
flush = (s->last_lit == s->lit_bufsize-1); \
|
332
|
+
}
|
333
|
+
# define _tr_tally_dist(s, distance, length, flush) \
|
334
|
+
{ uch len = (uch)(length); \
|
335
|
+
ush dist = (ush)(distance); \
|
336
|
+
s->d_buf[s->last_lit] = dist; \
|
337
|
+
s->l_buf[s->last_lit++] = len; \
|
338
|
+
dist--; \
|
339
|
+
s->dyn_ltree[_length_code[len]+LITERALS+1].Freq++; \
|
340
|
+
s->dyn_dtree[d_code(dist)].Freq++; \
|
341
|
+
flush = (s->last_lit == s->lit_bufsize-1); \
|
342
|
+
}
|
343
|
+
#else
|
344
|
+
# define _tr_tally_lit(s, c, flush) flush = _tr_tally(s, 0, c)
|
345
|
+
# define _tr_tally_dist(s, distance, length, flush) \
|
346
|
+
flush = _tr_tally(s, distance, length)
|
347
|
+
#endif
|
348
|
+
|
349
|
+
#endif /* DEFLATE_H */
|
@@ -0,0 +1,209 @@
|
|
1
|
+
1. Compression algorithm (deflate)
|
2
|
+
|
3
|
+
The deflation algorithm used by gzip (also zip and zlib) is a variation of
|
4
|
+
LZ77 (Lempel-Ziv 1977, see reference below). It finds duplicated strings in
|
5
|
+
the input data. The second occurrence of a string is replaced by a
|
6
|
+
pointer to the previous string, in the form of a pair (distance,
|
7
|
+
length). Distances are limited to 32K bytes, and lengths are limited
|
8
|
+
to 258 bytes. When a string does not occur anywhere in the previous
|
9
|
+
32K bytes, it is emitted as a sequence of literal bytes. (In this
|
10
|
+
description, `string' must be taken as an arbitrary sequence of bytes,
|
11
|
+
and is not restricted to printable characters.)
|
12
|
+
|
13
|
+
Literals or match lengths are compressed with one Huffman tree, and
|
14
|
+
match distances are compressed with another tree. The trees are stored
|
15
|
+
in a compact form at the start of each block. The blocks can have any
|
16
|
+
size (except that the compressed data for one block must fit in
|
17
|
+
available memory). A block is terminated when deflate() determines that
|
18
|
+
it would be useful to start another block with fresh trees. (This is
|
19
|
+
somewhat similar to the behavior of LZW-based _compress_.)
|
20
|
+
|
21
|
+
Duplicated strings are found using a hash table. All input strings of
|
22
|
+
length 3 are inserted in the hash table. A hash index is computed for
|
23
|
+
the next 3 bytes. If the hash chain for this index is not empty, all
|
24
|
+
strings in the chain are compared with the current input string, and
|
25
|
+
the longest match is selected.
|
26
|
+
|
27
|
+
The hash chains are searched starting with the most recent strings, to
|
28
|
+
favor small distances and thus take advantage of the Huffman encoding.
|
29
|
+
The hash chains are singly linked. There are no deletions from the
|
30
|
+
hash chains, the algorithm simply discards matches that are too old.
|
31
|
+
|
32
|
+
To avoid a worst-case situation, very long hash chains are arbitrarily
|
33
|
+
truncated at a certain length, determined by a runtime option (level
|
34
|
+
parameter of deflateInit). So deflate() does not always find the longest
|
35
|
+
possible match but generally finds a match which is long enough.
|
36
|
+
|
37
|
+
deflate() also defers the selection of matches with a lazy evaluation
|
38
|
+
mechanism. After a match of length N has been found, deflate() searches for
|
39
|
+
a longer match at the next input byte. If a longer match is found, the
|
40
|
+
previous match is truncated to a length of one (thus producing a single
|
41
|
+
literal byte) and the process of lazy evaluation begins again. Otherwise,
|
42
|
+
the original match is kept, and the next match search is attempted only N
|
43
|
+
steps later.
|
44
|
+
|
45
|
+
The lazy match evaluation is also subject to a runtime parameter. If
|
46
|
+
the current match is long enough, deflate() reduces the search for a longer
|
47
|
+
match, thus speeding up the whole process. If compression ratio is more
|
48
|
+
important than speed, deflate() attempts a complete second search even if
|
49
|
+
the first match is already long enough.
|
50
|
+
|
51
|
+
The lazy match evaluation is not performed for the fastest compression
|
52
|
+
modes (level parameter 1 to 3). For these fast modes, new strings
|
53
|
+
are inserted in the hash table only when no match was found, or
|
54
|
+
when the match is not too long. This degrades the compression ratio
|
55
|
+
but saves time since there are both fewer insertions and fewer searches.
|
56
|
+
|
57
|
+
|
58
|
+
2. Decompression algorithm (inflate)
|
59
|
+
|
60
|
+
2.1 Introduction
|
61
|
+
|
62
|
+
The key question is how to represent a Huffman code (or any prefix code) so
|
63
|
+
that you can decode fast. The most important characteristic is that shorter
|
64
|
+
codes are much more common than longer codes, so pay attention to decoding the
|
65
|
+
short codes fast, and let the long codes take longer to decode.
|
66
|
+
|
67
|
+
inflate() sets up a first level table that covers some number of bits of
|
68
|
+
input less than the length of the longest code. It gets that many bits from the
|
69
|
+
stream, and looks it up in the table. The table will tell if the next
|
70
|
+
code is that many bits or less and how many, and if it is, it will tell
|
71
|
+
the value, else it will point to the next level table for which inflate()
|
72
|
+
grabs more bits and tries to decode a longer code.
|
73
|
+
|
74
|
+
How many bits to make the first lookup is a tradeoff between the time it
|
75
|
+
takes to decode and the time it takes to build the table. If building the
|
76
|
+
table took no time (and if you had infinite memory), then there would only
|
77
|
+
be a first level table to cover all the way to the longest code. However,
|
78
|
+
building the table ends up taking a lot longer for more bits since short
|
79
|
+
codes are replicated many times in such a table. What inflate() does is
|
80
|
+
simply to make the number of bits in the first table a variable, and then
|
81
|
+
to set that variable for the maximum speed.
|
82
|
+
|
83
|
+
For inflate, which has 286 possible codes for the literal/length tree, the size
|
84
|
+
of the first table is nine bits. Also the distance trees have 30 possible
|
85
|
+
values, and the size of the first table is six bits. Note that for each of
|
86
|
+
those cases, the table ended up one bit longer than the ``average'' code
|
87
|
+
length, i.e. the code length of an approximately flat code which would be a
|
88
|
+
little more than eight bits for 286 symbols and a little less than five bits
|
89
|
+
for 30 symbols.
|
90
|
+
|
91
|
+
|
92
|
+
2.2 More details on the inflate table lookup
|
93
|
+
|
94
|
+
Ok, you want to know what this cleverly obfuscated inflate tree actually
|
95
|
+
looks like. You are correct that it's not a Huffman tree. It is simply a
|
96
|
+
lookup table for the first, let's say, nine bits of a Huffman symbol. The
|
97
|
+
symbol could be as short as one bit or as long as 15 bits. If a particular
|
98
|
+
symbol is shorter than nine bits, then that symbol's translation is duplicated
|
99
|
+
in all those entries that start with that symbol's bits. For example, if the
|
100
|
+
symbol is four bits, then it's duplicated 32 times in a nine-bit table. If a
|
101
|
+
symbol is nine bits long, it appears in the table once.
|
102
|
+
|
103
|
+
If the symbol is longer than nine bits, then that entry in the table points
|
104
|
+
to another similar table for the remaining bits. Again, there are duplicated
|
105
|
+
entries as needed. The idea is that most of the time the symbol will be short
|
106
|
+
and there will only be one table look up. (That's the whole idea behind data
|
107
|
+
compression in the first place.) For the less frequent long symbols, there
|
108
|
+
will be two lookups. If you had a compression method with really long
|
109
|
+
symbols, you could have as many levels of lookups as is efficient. For
|
110
|
+
inflate, two is enough.
|
111
|
+
|
112
|
+
So a table entry either points to another table (in which case nine bits in
|
113
|
+
the above example are gobbled), or it contains the translation for the symbol
|
114
|
+
and the number of bits to gobble. Then you start again with the next
|
115
|
+
ungobbled bit.
|
116
|
+
|
117
|
+
You may wonder: why not just have one lookup table for however many bits the
|
118
|
+
longest symbol is? The reason is that if you do that, you end up spending
|
119
|
+
more time filling in duplicate symbol entries than you do actually decoding.
|
120
|
+
At least for deflate's output, which generates new trees every several tens of
|
121
|
+
kbytes. You can imagine that filling in a 2^15 entry table for a 15-bit code
|
122
|
+
would take too long if you're only decoding several thousand symbols. At the
|
123
|
+
other extreme, you could make a new table for every bit in the code. In fact,
|
124
|
+
that's essentially a Huffman tree. But then you spend too much time
|
125
|
+
traversing the tree while decoding, even for short symbols.
|
126
|
+
|
127
|
+
So the number of bits for the first lookup table is a trade-off of the time to
|
128
|
+
fill out the table vs. the time spent looking at the second level and above of
|
129
|
+
the table.
|
130
|
+
|
131
|
+
Here is an example, scaled down:
|
132
|
+
|
133
|
+
The code being decoded, with 10 symbols, from 1 to 6 bits long:
|
134
|
+
|
135
|
+
A: 0
|
136
|
+
B: 10
|
137
|
+
C: 1100
|
138
|
+
D: 11010
|
139
|
+
E: 11011
|
140
|
+
F: 11100
|
141
|
+
G: 11101
|
142
|
+
H: 11110
|
143
|
+
I: 111110
|
144
|
+
J: 111111
|
145
|
+
|
146
|
+
Let's make the first table three bits long (eight entries):
|
147
|
+
|
148
|
+
000: A,1
|
149
|
+
001: A,1
|
150
|
+
010: A,1
|
151
|
+
011: A,1
|
152
|
+
100: B,2
|
153
|
+
101: B,2
|
154
|
+
110: -> table X (gobble 3 bits)
|
155
|
+
111: -> table Y (gobble 3 bits)
|
156
|
+
|
157
|
+
Each entry is what the bits decode as and how many bits that is, i.e. how
|
158
|
+
many bits to gobble. Or the entry points to another table, with the number of
|
159
|
+
bits to gobble implicit in the size of the table.
|
160
|
+
|
161
|
+
Table X is two bits long since the longest code starting with 110 is five bits
|
162
|
+
long:
|
163
|
+
|
164
|
+
00: C,1
|
165
|
+
01: C,1
|
166
|
+
10: D,2
|
167
|
+
11: E,2
|
168
|
+
|
169
|
+
Table Y is three bits long since the longest code starting with 111 is six
|
170
|
+
bits long:
|
171
|
+
|
172
|
+
000: F,2
|
173
|
+
001: F,2
|
174
|
+
010: G,2
|
175
|
+
011: G,2
|
176
|
+
100: H,2
|
177
|
+
101: H,2
|
178
|
+
110: I,3
|
179
|
+
111: J,3
|
180
|
+
|
181
|
+
So what we have here are three tables with a total of 20 entries that had to
|
182
|
+
be constructed. That's compared to 64 entries for a single table. Or
|
183
|
+
compared to 16 entries for a Huffman tree (six two entry tables and one four
|
184
|
+
entry table). Assuming that the code ideally represents the probability of
|
185
|
+
the symbols, it takes on the average 1.25 lookups per symbol. That's compared
|
186
|
+
to one lookup for the single table, or 1.66 lookups per symbol for the
|
187
|
+
Huffman tree.
|
188
|
+
|
189
|
+
There, I think that gives you a picture of what's going on. For inflate, the
|
190
|
+
meaning of a particular symbol is often more than just a letter. It can be a
|
191
|
+
byte (a "literal"), or it can be either a length or a distance which
|
192
|
+
indicates a base value and a number of bits to fetch after the code that is
|
193
|
+
added to the base value. Or it might be the special end-of-block code. The
|
194
|
+
data structures created in inftrees.c try to encode all that information
|
195
|
+
compactly in the tables.
|
196
|
+
|
197
|
+
|
198
|
+
Jean-loup Gailly Mark Adler
|
199
|
+
jloup@gzip.org madler@alumni.caltech.edu
|
200
|
+
|
201
|
+
|
202
|
+
References:
|
203
|
+
|
204
|
+
[LZ77] Ziv J., Lempel A., ``A Universal Algorithm for Sequential Data
|
205
|
+
Compression,'' IEEE Transactions on Information Theory, Vol. 23, No. 3,
|
206
|
+
pp. 337-343.
|
207
|
+
|
208
|
+
``DEFLATE Compressed Data Format Specification'' available at
|
209
|
+
http://tools.ietf.org/html/rfc1951
|