chd 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (109) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +30 -0
  3. data/chd.gemspec +29 -0
  4. data/ext/chd.c +1008 -0
  5. data/ext/extconf.rb +60 -0
  6. data/lib/chd/cd.rb +272 -0
  7. data/lib/chd/metadata.rb +196 -0
  8. data/lib/chd/version.rb +4 -0
  9. data/lib/chd.rb +21 -0
  10. data/libchdr/CMakeLists.txt +104 -0
  11. data/libchdr/LICENSE.txt +24 -0
  12. data/libchdr/README.md +7 -0
  13. data/libchdr/deps/lzma-19.00/CMakeLists.txt +33 -0
  14. data/libchdr/deps/lzma-19.00/LICENSE +3 -0
  15. data/libchdr/deps/lzma-19.00/include/7zTypes.h +375 -0
  16. data/libchdr/deps/lzma-19.00/include/Alloc.h +51 -0
  17. data/libchdr/deps/lzma-19.00/include/Bra.h +64 -0
  18. data/libchdr/deps/lzma-19.00/include/Compiler.h +33 -0
  19. data/libchdr/deps/lzma-19.00/include/CpuArch.h +336 -0
  20. data/libchdr/deps/lzma-19.00/include/Delta.h +19 -0
  21. data/libchdr/deps/lzma-19.00/include/LzFind.h +121 -0
  22. data/libchdr/deps/lzma-19.00/include/LzHash.h +57 -0
  23. data/libchdr/deps/lzma-19.00/include/Lzma86.h +111 -0
  24. data/libchdr/deps/lzma-19.00/include/LzmaDec.h +234 -0
  25. data/libchdr/deps/lzma-19.00/include/LzmaEnc.h +76 -0
  26. data/libchdr/deps/lzma-19.00/include/LzmaLib.h +131 -0
  27. data/libchdr/deps/lzma-19.00/include/Precomp.h +10 -0
  28. data/libchdr/deps/lzma-19.00/include/Sort.h +18 -0
  29. data/libchdr/deps/lzma-19.00/lzma-history.txt +446 -0
  30. data/libchdr/deps/lzma-19.00/lzma.txt +328 -0
  31. data/libchdr/deps/lzma-19.00/lzma.vcxproj +543 -0
  32. data/libchdr/deps/lzma-19.00/lzma.vcxproj.filters +17 -0
  33. data/libchdr/deps/lzma-19.00/src/Alloc.c +455 -0
  34. data/libchdr/deps/lzma-19.00/src/Bra86.c +82 -0
  35. data/libchdr/deps/lzma-19.00/src/BraIA64.c +53 -0
  36. data/libchdr/deps/lzma-19.00/src/CpuArch.c +218 -0
  37. data/libchdr/deps/lzma-19.00/src/Delta.c +64 -0
  38. data/libchdr/deps/lzma-19.00/src/LzFind.c +1127 -0
  39. data/libchdr/deps/lzma-19.00/src/Lzma86Dec.c +54 -0
  40. data/libchdr/deps/lzma-19.00/src/LzmaDec.c +1185 -0
  41. data/libchdr/deps/lzma-19.00/src/LzmaEnc.c +1330 -0
  42. data/libchdr/deps/lzma-19.00/src/Sort.c +141 -0
  43. data/libchdr/deps/zlib-1.2.11/CMakeLists.txt +29 -0
  44. data/libchdr/deps/zlib-1.2.11/ChangeLog +1515 -0
  45. data/libchdr/deps/zlib-1.2.11/FAQ +368 -0
  46. data/libchdr/deps/zlib-1.2.11/INDEX +68 -0
  47. data/libchdr/deps/zlib-1.2.11/Makefile +5 -0
  48. data/libchdr/deps/zlib-1.2.11/Makefile.in +410 -0
  49. data/libchdr/deps/zlib-1.2.11/README +115 -0
  50. data/libchdr/deps/zlib-1.2.11/adler32.c +186 -0
  51. data/libchdr/deps/zlib-1.2.11/compress.c +86 -0
  52. data/libchdr/deps/zlib-1.2.11/configure +921 -0
  53. data/libchdr/deps/zlib-1.2.11/crc32.c +442 -0
  54. data/libchdr/deps/zlib-1.2.11/crc32.h +441 -0
  55. data/libchdr/deps/zlib-1.2.11/deflate.c +2163 -0
  56. data/libchdr/deps/zlib-1.2.11/deflate.h +349 -0
  57. data/libchdr/deps/zlib-1.2.11/doc/algorithm.txt +209 -0
  58. data/libchdr/deps/zlib-1.2.11/doc/rfc1950.txt +619 -0
  59. data/libchdr/deps/zlib-1.2.11/doc/rfc1951.txt +955 -0
  60. data/libchdr/deps/zlib-1.2.11/doc/rfc1952.txt +675 -0
  61. data/libchdr/deps/zlib-1.2.11/doc/txtvsbin.txt +107 -0
  62. data/libchdr/deps/zlib-1.2.11/gzclose.c +25 -0
  63. data/libchdr/deps/zlib-1.2.11/gzguts.h +218 -0
  64. data/libchdr/deps/zlib-1.2.11/gzlib.c +637 -0
  65. data/libchdr/deps/zlib-1.2.11/gzread.c +654 -0
  66. data/libchdr/deps/zlib-1.2.11/gzwrite.c +665 -0
  67. data/libchdr/deps/zlib-1.2.11/infback.c +640 -0
  68. data/libchdr/deps/zlib-1.2.11/inffast.c +323 -0
  69. data/libchdr/deps/zlib-1.2.11/inffast.h +11 -0
  70. data/libchdr/deps/zlib-1.2.11/inffixed.h +94 -0
  71. data/libchdr/deps/zlib-1.2.11/inflate.c +1561 -0
  72. data/libchdr/deps/zlib-1.2.11/inflate.h +125 -0
  73. data/libchdr/deps/zlib-1.2.11/inftrees.c +304 -0
  74. data/libchdr/deps/zlib-1.2.11/inftrees.h +62 -0
  75. data/libchdr/deps/zlib-1.2.11/make_vms.com +867 -0
  76. data/libchdr/deps/zlib-1.2.11/treebuild.xml +116 -0
  77. data/libchdr/deps/zlib-1.2.11/trees.c +1203 -0
  78. data/libchdr/deps/zlib-1.2.11/trees.h +128 -0
  79. data/libchdr/deps/zlib-1.2.11/uncompr.c +93 -0
  80. data/libchdr/deps/zlib-1.2.11/zconf.h +534 -0
  81. data/libchdr/deps/zlib-1.2.11/zconf.h.cmakein +536 -0
  82. data/libchdr/deps/zlib-1.2.11/zconf.h.in +534 -0
  83. data/libchdr/deps/zlib-1.2.11/zlib.3 +149 -0
  84. data/libchdr/deps/zlib-1.2.11/zlib.3.pdf +0 -0
  85. data/libchdr/deps/zlib-1.2.11/zlib.h +1912 -0
  86. data/libchdr/deps/zlib-1.2.11/zlib.map +94 -0
  87. data/libchdr/deps/zlib-1.2.11/zlib.pc.cmakein +13 -0
  88. data/libchdr/deps/zlib-1.2.11/zlib.pc.in +13 -0
  89. data/libchdr/deps/zlib-1.2.11/zlib2ansi +152 -0
  90. data/libchdr/deps/zlib-1.2.11/zutil.c +325 -0
  91. data/libchdr/deps/zlib-1.2.11/zutil.h +271 -0
  92. data/libchdr/include/dr_libs/dr_flac.h +12280 -0
  93. data/libchdr/include/libchdr/bitstream.h +43 -0
  94. data/libchdr/include/libchdr/cdrom.h +110 -0
  95. data/libchdr/include/libchdr/chd.h +427 -0
  96. data/libchdr/include/libchdr/chdconfig.h +10 -0
  97. data/libchdr/include/libchdr/coretypes.h +60 -0
  98. data/libchdr/include/libchdr/flac.h +50 -0
  99. data/libchdr/include/libchdr/huffman.h +90 -0
  100. data/libchdr/pkg-config.pc.in +10 -0
  101. data/libchdr/src/libchdr_bitstream.c +125 -0
  102. data/libchdr/src/libchdr_cdrom.c +415 -0
  103. data/libchdr/src/libchdr_chd.c +2744 -0
  104. data/libchdr/src/libchdr_flac.c +302 -0
  105. data/libchdr/src/libchdr_huffman.c +545 -0
  106. data/libchdr/src/link.T +5 -0
  107. data/libchdr/tests/CMakeLists.txt +2 -0
  108. data/libchdr/tests/benchmark.c +52 -0
  109. metadata +183 -0
@@ -0,0 +1,349 @@
1
+ /* deflate.h -- internal compression state
2
+ * Copyright (C) 1995-2016 Jean-loup Gailly
3
+ * For conditions of distribution and use, see copyright notice in zlib.h
4
+ */
5
+
6
+ /* WARNING: this file should *not* be used by applications. It is
7
+ part of the implementation of the compression library and is
8
+ subject to change. Applications should only use zlib.h.
9
+ */
10
+
11
+ /* @(#) $Id$ */
12
+
13
+ #ifndef DEFLATE_H
14
+ #define DEFLATE_H
15
+
16
+ #include "zutil.h"
17
+
18
+ /* define NO_GZIP when compiling if you want to disable gzip header and
19
+ trailer creation by deflate(). NO_GZIP would be used to avoid linking in
20
+ the crc code when it is not needed. For shared libraries, gzip encoding
21
+ should be left enabled. */
22
+ #ifndef NO_GZIP
23
+ # define GZIP
24
+ #endif
25
+
26
+ /* ===========================================================================
27
+ * Internal compression state.
28
+ */
29
+
30
+ #define LENGTH_CODES 29
31
+ /* number of length codes, not counting the special END_BLOCK code */
32
+
33
+ #define LITERALS 256
34
+ /* number of literal bytes 0..255 */
35
+
36
+ #define L_CODES (LITERALS+1+LENGTH_CODES)
37
+ /* number of Literal or Length codes, including the END_BLOCK code (256+1+29 = 286) */
38
+
39
+ #define D_CODES 30
40
+ /* number of distance codes */
41
+
42
+ #define BL_CODES 19
43
+ /* number of codes used to transfer the bit lengths */
44
+
45
+ #define HEAP_SIZE (2*L_CODES+1)
46
+ /* maximum heap size (2*286+1 = 573 entries) */
47
+
48
+ #define MAX_BITS 15
49
+ /* No code may be longer than MAX_BITS bits */
50
+
51
+ #define Buf_size 16
52
+ /* size of bit buffer in bi_buf */
53
+
54
+ #define INIT_STATE 42 /* zlib header -> BUSY_STATE */
55
+ #ifdef GZIP
56
+ # define GZIP_STATE 57 /* gzip header -> BUSY_STATE | EXTRA_STATE */
57
+ #endif
58
+ #define EXTRA_STATE 69 /* gzip extra block -> NAME_STATE */
59
+ #define NAME_STATE 73 /* gzip file name -> COMMENT_STATE */
60
+ #define COMMENT_STATE 91 /* gzip comment -> HCRC_STATE */
61
+ #define HCRC_STATE 103 /* gzip header CRC -> BUSY_STATE */
62
+ #define BUSY_STATE 113 /* deflate -> FINISH_STATE */
63
+ #define FINISH_STATE 666 /* stream complete */
64
+ /* Stream status values. Each comment above gives the phase and, after "->", the state that follows it (the "|" on GZIP_STATE presumably separates two alternative successors). GZIP_STATE is only defined when GZIP is defined. */
65
+
66
+
67
+ /* Data structure describing a single value and its code string. Each of the two unions overlays two ush fields in the same storage: fc holds either a frequency count or a code bit string, dl holds either a parent-node index or a code length. */
68
+ typedef struct ct_data_s {
69
+ union {
70
+ ush freq; /* frequency count */
71
+ ush code; /* bit string */
72
+ } fc;
73
+ union {
74
+ ush dad; /* father node in Huffman tree */
75
+ ush len; /* length of bit string */
76
+ } dl;
77
+ } FAR ct_data;
78
+
79
+ #define Freq fc.freq /* shorthand member accessors: x.Freq expands to x.fc.freq, etc. */
80
+ #define Code fc.code
81
+ #define Dad dl.dad
82
+ #define Len dl.len
83
+
84
+ typedef struct static_tree_desc_s static_tree_desc; /* forward declaration only: the struct body is not defined in this header */
85
+
86
+ typedef struct tree_desc_s { /* descriptor tying a dynamic tree to its static counterpart */
87
+ ct_data *dyn_tree; /* the dynamic tree */
88
+ int max_code; /* largest code with non zero frequency */
89
+ const static_tree_desc *stat_desc; /* the corresponding static tree */
90
+ } FAR tree_desc;
91
+
92
+ typedef ush Pos;
92
+ typedef Pos FAR Posf;
93
+ typedef unsigned IPos;
94
+
95
+ /* A Pos is an index in the character window. We use short instead of int to
96
+ * save space in the various tables (the prev and head arrays in deflate_state are declared as Posf). IPos is used for parameter passing and for the prev_match field of deflate_state.
97
+ */
99
+
100
+ typedef struct internal_state { /* complete internal compression state; typedef'd below as deflate_state */
101
+ z_streamp strm; /* pointer back to this zlib stream */
102
+ int status; /* current stream status (see the *_STATE constants defined earlier in this header) */
103
+ Bytef *pending_buf; /* output still pending */
104
+ ulg pending_buf_size; /* size of pending_buf */
105
+ Bytef *pending_out; /* next pending byte to output to the stream */
106
+ ulg pending; /* number of bytes in the pending buffer */
107
+ int wrap; /* bit 0 true for zlib, bit 1 true for gzip */
108
+ gz_headerp gzhead; /* gzip header information to write */
109
+ ulg gzindex; /* where in extra, name, or comment */
110
+ Byte method; /* can only be DEFLATED */
111
+ int last_flush; /* value of flush param for previous deflate call */
112
+
113
+ /* used by deflate.c: */
114
+
115
+ uInt w_size; /* LZ77 window size (32K by default) */
116
+ uInt w_bits; /* log2(w_size) (8..16) */
117
+ uInt w_mask; /* w_size - 1 */
118
+
119
+ Bytef *window;
120
+ /* Sliding window. Input bytes are read into the second half of the window,
121
+ * and move to the first half later to keep a dictionary of at least w_size
122
+ * bytes. With this organization, matches are limited to a distance of
123
+ * w_size-MAX_MATCH bytes, but this ensures that IO is always
124
+ * performed with a length multiple of the block size. Also, it limits
125
+ * the window size to 64K, which is quite useful on MSDOS.
126
+ * To do: use the user input buffer as sliding window.
127
+ */
128
+
129
+ ulg window_size;
130
+ /* Actual size of window: 2*w_size, except when the user input buffer
131
+ * is directly used as sliding window.
132
+ */
133
+
134
+ Posf *prev;
135
+ /* Link to older string with same hash index. To limit the size of this
136
+ * array to 64K, this link is maintained only for the last 32K strings.
137
+ * An index in this array is thus a window index modulo 32K.
138
+ */
139
+
140
+ Posf *head; /* Heads of the hash chains or NIL. */
141
+
142
+ uInt ins_h; /* hash index of string to be inserted */
143
+ uInt hash_size; /* number of elements in hash table */
144
+ uInt hash_bits; /* log2(hash_size) */
145
+ uInt hash_mask; /* hash_size-1 */
146
+
147
+ uInt hash_shift;
148
+ /* Number of bits by which ins_h must be shifted at each input
149
+ * step. It must be such that after MIN_MATCH steps, the oldest
150
+ * byte no longer takes part in the hash key, that is:
151
+ * hash_shift * MIN_MATCH >= hash_bits
152
+ */
153
+
154
+ long block_start;
155
+ /* Window position at the beginning of the current output block. Gets
156
+ * negative when the window is moved backwards.
157
+ */
158
+
159
+ uInt match_length; /* length of best match */
160
+ IPos prev_match; /* previous match */
161
+ int match_available; /* set if previous match exists */
162
+ uInt strstart; /* start of string to insert */
163
+ uInt match_start; /* start of matching string */
164
+ uInt lookahead; /* number of valid bytes ahead in window */
165
+
166
+ uInt prev_length;
167
+ /* Length of the best match at previous step. Matches not greater than this
168
+ * are discarded. This is used in the lazy match evaluation.
169
+ */
170
+
171
+ uInt max_chain_length;
172
+ /* To speed up deflation, hash chains are never searched beyond this
173
+ * length. A higher limit improves compression ratio but degrades the
174
+ * speed.
175
+ */
176
+
177
+ uInt max_lazy_match;
178
+ /* Attempt to find a better match only when the current match is strictly
179
+ * smaller than this value. This mechanism is used only for compression
180
+ * levels >= 4.
181
+ */
182
+ # define max_insert_length max_lazy_match
183
+ /* max_insert_length is a second name for the max_lazy_match field. Insert new strings in the hash table only if the match length is not
184
+ * greater than this length. This saves time but degrades compression.
185
+ * max_insert_length is used only for compression levels <= 3.
186
+ */
187
+
188
+ int level; /* compression level (1..9) */
189
+ int strategy; /* favor or force Huffman coding*/
190
+
191
+ uInt good_match;
192
+ /* Use a faster search when the previous match is longer than this */
193
+
194
+ int nice_match; /* Stop searching when current match exceeds this */
195
+
196
+ /* used by trees.c: */
197
+ /* Didn't use ct_data typedef below to suppress compiler warning */
198
+ struct ct_data_s dyn_ltree[HEAP_SIZE]; /* literal and length tree */
199
+ struct ct_data_s dyn_dtree[2*D_CODES+1]; /* distance tree */
200
+ struct ct_data_s bl_tree[2*BL_CODES+1]; /* Huffman tree for bit lengths */
201
+
202
+ struct tree_desc_s l_desc; /* desc. for literal tree */
203
+ struct tree_desc_s d_desc; /* desc. for distance tree */
204
+ struct tree_desc_s bl_desc; /* desc. for bit length tree */
205
+
206
+ ush bl_count[MAX_BITS+1];
207
+ /* number of codes at each bit length for an optimal tree */
208
+
209
+ int heap[2*L_CODES+1]; /* heap used to build the Huffman trees (2*L_CODES+1 is the same value as HEAP_SIZE) */
210
+ int heap_len; /* number of elements in the heap */
211
+ int heap_max; /* element of largest frequency */
212
+ /* The sons of heap[n] are heap[2*n] and heap[2*n+1]. heap[0] is not used.
213
+ * The same heap array is used to build all trees.
214
+ */
215
+
216
+ uch depth[2*L_CODES+1];
217
+ /* Depth of each subtree used as tie breaker for trees of equal frequency
218
+ */
219
+
220
+ uchf *l_buf; /* buffer for literals or lengths */
221
+
222
+ uInt lit_bufsize;
223
+ /* Size of match buffer for literals/lengths. There are 4 reasons for
224
+ * limiting lit_bufsize to 64K:
225
+ * - frequencies can be kept in 16 bit counters
226
+ * - if compression is not successful for the first block, all input
227
+ * data is still in the window so we can still emit a stored block even
228
+ * when input comes from standard input. (This can also be done for
229
+ * all blocks if lit_bufsize is not greater than 32K.)
230
+ * - if compression is not successful for a file smaller than 64K, we can
231
+ * even emit a stored file instead of a stored block (saving 5 bytes).
232
+ * This is applicable only for zip (not gzip or zlib).
233
+ * - creating new Huffman trees less frequently may not provide fast
234
+ * adaptation to changes in the input data statistics. (Take for
235
+ * example a binary file with poorly compressible code followed by
236
+ * a highly compressible string table.) Smaller buffer sizes give
237
+ * fast adaptation but have of course the overhead of transmitting
238
+ * trees more frequently.
239
+ * - I can't count above 4
240
+ */
241
+
242
+ uInt last_lit; /* running index in l_buf */
243
+
244
+ ushf *d_buf;
245
+ /* Buffer for distances. To simplify the code, d_buf and l_buf have
246
+ * the same number of elements. To use different lengths, an extra flag
247
+ * array would be necessary.
248
+ */
249
+
250
+ ulg opt_len; /* bit length of current block with optimal trees */
251
+ ulg static_len; /* bit length of current block with static trees */
252
+ uInt matches; /* number of string matches in current block */
253
+ uInt insert; /* bytes at end of window left to insert */
254
+
255
+ #ifdef ZLIB_DEBUG
256
+ ulg compressed_len; /* total bit length of compressed file mod 2^32 */
257
+ ulg bits_sent; /* bit length of compressed data sent mod 2^32 */
258
+ #endif
259
+
260
+ ush bi_buf;
261
+ /* Output buffer. bits are inserted starting at the bottom (least
262
+ * significant bits).
263
+ */
264
+ int bi_valid;
265
+ /* Number of valid bits in bi_buf. All bits above the last valid bit
266
+ * are always zero.
267
+ */
268
+
269
+ ulg high_water;
270
+ /* High water mark offset in window for initialized bytes -- bytes above
271
+ * this are set to zero in order to avoid memory check warnings when
272
+ * longest match routines access bytes past the input. This is then
273
+ * updated to the new high water mark.
274
+ */
275
+
276
+ } FAR deflate_state;
277
+
278
+ /* Output a byte on the stream: store (Bytef)(c) at pending_buf[pending] and post-increment pending.
279
+ * IN assertion: there is enough room in pending_buf. The macro expands to a braced block and uses s unparenthesized and twice, so pass a plain pointer expression without side effects.
280
+ */
281
+ #define put_byte(s, c) {s->pending_buf[s->pending++] = (Bytef)(c);}
282
+
283
+
284
+ #define MIN_LOOKAHEAD (MAX_MATCH+MIN_MATCH+1)
285
+ /* Minimum amount of lookahead, except at the end of the input file.
286
+ * See deflate.c for comments about the MIN_MATCH+1. MAX_MATCH and MIN_MATCH are not defined in this header.
287
+ */
288
+
289
+ #define MAX_DIST(s) ((s)->w_size-MIN_LOOKAHEAD)
290
+ /* In order to simplify the code, particularly on 16 bit machines, match
291
+ * distances are limited to MAX_DIST (the window size s->w_size minus MIN_LOOKAHEAD) instead of WSIZE.
292
+ */
293
+
294
+ #define WIN_INIT MAX_MATCH
295
+ /* Number of bytes after end of data in window to initialize in order to avoid
296
+ memory checker errors from longest match routines */
297
+
298
+ /* Declared here, implemented in trees.c. _tr_tally returns an int; when ZLIB_DEBUG is defined the _tr_tally_lit and _tr_tally_dist macros in this header assign that result to their flush argument. */
299
+ void ZLIB_INTERNAL _tr_init OF((deflate_state *s));
300
+ int ZLIB_INTERNAL _tr_tally OF((deflate_state *s, unsigned dist, unsigned lc));
301
+ void ZLIB_INTERNAL _tr_flush_block OF((deflate_state *s, charf *buf,
302
+ ulg stored_len, int last));
303
+ void ZLIB_INTERNAL _tr_flush_bits OF((deflate_state *s));
304
+ void ZLIB_INTERNAL _tr_align OF((deflate_state *s));
305
+ void ZLIB_INTERNAL _tr_stored_block OF((deflate_state *s, charf *buf,
306
+ ulg stored_len, int last));
307
+
308
+ #define d_code(dist) \
309
+ ((dist) < 256 ? _dist_code[dist] : _dist_code[256+((dist)>>7)])
310
+ /* Mapping from a distance to a distance code. dist is the distance - 1 and
311
+ * must not have side effects, because the macro evaluates it more than once. _dist_code[256] and _dist_code[257] are never
312
+ * used: the second branch is taken only for dist >= 256, where 256+(dist>>7) is at least 258.
313
+ */
314
+
315
+ #ifndef ZLIB_DEBUG
316
+ /* Inline versions of _tr_tally for speed. Each stores the distance (0 for a literal) in d_buf and the literal or length byte in l_buf at index last_lit, increments last_lit, bumps the matching Freq counters in dyn_ltree (and dyn_dtree for a match), and sets flush to whether last_lit has reached lit_bufsize-1. */
317
+
318
+ #if defined(GEN_TREES_H) || !defined(STDC)
319
+ extern uch ZLIB_INTERNAL _length_code[];
320
+ extern uch ZLIB_INTERNAL _dist_code[];
321
+ #else
322
+ extern const uch ZLIB_INTERNAL _length_code[];
323
+ extern const uch ZLIB_INTERNAL _dist_code[];
324
+ #endif
325
+
326
+ # define _tr_tally_lit(s, c, flush) \
327
+ { uch cc = (c); \
328
+ s->d_buf[s->last_lit] = 0; \
329
+ s->l_buf[s->last_lit++] = cc; \
330
+ s->dyn_ltree[cc].Freq++; \
331
+ flush = (s->last_lit == s->lit_bufsize-1); \
332
+ }
333
+ # define _tr_tally_dist(s, distance, length, flush) \
334
+ { uch len = (uch)(length); \
335
+ ush dist = (ush)(distance); \
336
+ s->d_buf[s->last_lit] = dist; \
337
+ s->l_buf[s->last_lit++] = len; \
338
+ dist--; \
339
+ s->dyn_ltree[_length_code[len]+LITERALS+1].Freq++; \
340
+ s->dyn_dtree[d_code(dist)].Freq++; \
341
+ flush = (s->last_lit == s->lit_bufsize-1); \
342
+ }
343
+ #else /* ZLIB_DEBUG defined: call the out-of-line _tr_tally instead */
344
+ # define _tr_tally_lit(s, c, flush) flush = _tr_tally(s, 0, c)
345
+ # define _tr_tally_dist(s, distance, length, flush) \
346
+ flush = _tr_tally(s, distance, length)
347
+ #endif /* !ZLIB_DEBUG */
348
+
349
+ #endif /* DEFLATE_H */
@@ -0,0 +1,209 @@
1
+ 1. Compression algorithm (deflate)
2
+
3
+ The deflation algorithm used by gzip (also zip and zlib) is a variation of
4
+ LZ77 (Lempel-Ziv 1977, see reference below). It finds duplicated strings in
5
+ the input data. The second occurrence of a string is replaced by a
6
+ pointer to the previous string, in the form of a pair (distance,
7
+ length). Distances are limited to 32K bytes, and lengths are limited
8
+ to 258 bytes. When a string does not occur anywhere in the previous
9
+ 32K bytes, it is emitted as a sequence of literal bytes. (In this
10
+ description, `string' must be taken as an arbitrary sequence of bytes,
11
+ and is not restricted to printable characters.)
12
+
13
+ Literals or match lengths are compressed with one Huffman tree, and
14
+ match distances are compressed with another tree. The trees are stored
15
+ in a compact form at the start of each block. The blocks can have any
16
+ size (except that the compressed data for one block must fit in
17
+ available memory). A block is terminated when deflate() determines that
18
+ it would be useful to start another block with fresh trees. (This is
19
+ somewhat similar to the behavior of LZW-based _compress_.)
20
+
21
+ Duplicated strings are found using a hash table. All input strings of
22
+ length 3 are inserted in the hash table. A hash index is computed for
23
+ the next 3 bytes. If the hash chain for this index is not empty, all
24
+ strings in the chain are compared with the current input string, and
25
+ the longest match is selected.
26
+
27
+ The hash chains are searched starting with the most recent strings, to
28
+ favor small distances and thus take advantage of the Huffman encoding.
29
+ The hash chains are singly linked. There are no deletions from the
30
+ hash chains; the algorithm simply discards matches that are too old.
31
+
32
+ To avoid a worst-case situation, very long hash chains are arbitrarily
33
+ truncated at a certain length, determined by a runtime option (level
34
+ parameter of deflateInit). So deflate() does not always find the longest
35
+ possible match but generally finds a match which is long enough.
36
+
37
+ deflate() also defers the selection of matches with a lazy evaluation
38
+ mechanism. After a match of length N has been found, deflate() searches for
39
+ a longer match at the next input byte. If a longer match is found, the
40
+ previous match is truncated to a length of one (thus producing a single
41
+ literal byte) and the process of lazy evaluation begins again. Otherwise,
42
+ the original match is kept, and the next match search is attempted only N
43
+ steps later.
44
+
45
+ The lazy match evaluation is also subject to a runtime parameter. If
46
+ the current match is long enough, deflate() reduces the search for a longer
47
+ match, thus speeding up the whole process. If compression ratio is more
48
+ important than speed, deflate() attempts a complete second search even if
49
+ the first match is already long enough.
50
+
51
+ The lazy match evaluation is not performed for the fastest compression
52
+ modes (level parameter 1 to 3). For these fast modes, new strings
53
+ are inserted in the hash table only when no match was found, or
54
+ when the match is not too long. This degrades the compression ratio
55
+ but saves time since there are both fewer insertions and fewer searches.
56
+
57
+
58
+ 2. Decompression algorithm (inflate)
59
+
60
+ 2.1 Introduction
61
+
62
+ The key question is how to represent a Huffman code (or any prefix code) so
63
+ that you can decode fast. The most important characteristic is that shorter
64
+ codes are much more common than longer codes, so pay attention to decoding the
65
+ short codes fast, and let the long codes take longer to decode.
66
+
67
+ inflate() sets up a first level table that covers some number of bits of
68
+ input less than the length of the longest code. It gets that many bits from the
69
+ stream, and looks it up in the table. The table will tell if the next
70
+ code is that many bits or less and how many, and if it is, it will tell
71
+ the value, else it will point to the next level table for which inflate()
72
+ grabs more bits and tries to decode a longer code.
73
+
74
+ How many bits to make the first lookup is a tradeoff between the time it
75
+ takes to decode and the time it takes to build the table. If building the
76
+ table took no time (and if you had infinite memory), then there would only
77
+ be a first level table to cover all the way to the longest code. However,
78
+ building the table ends up taking a lot longer for more bits since short
79
+ codes are replicated many times in such a table. What inflate() does is
80
+ simply to make the number of bits in the first table a variable, and then
81
+ to set that variable for the maximum speed.
82
+
83
+ For inflate, which has 286 possible codes for the literal/length tree, the size
84
+ of the first table is nine bits. Also the distance trees have 30 possible
85
+ values, and the size of the first table is six bits. Note that for each of
86
+ those cases, the table ended up one bit longer than the ``average'' code
87
+ length, i.e. the code length of an approximately flat code which would be a
88
+ little more than eight bits for 286 symbols and a little less than five bits
89
+ for 30 symbols.
90
+
91
+
92
+ 2.2 More details on the inflate table lookup
93
+
94
+ Ok, you want to know what this cleverly obfuscated inflate tree actually
95
+ looks like. You are correct that it's not a Huffman tree. It is simply a
96
+ lookup table for the first, let's say, nine bits of a Huffman symbol. The
97
+ symbol could be as short as one bit or as long as 15 bits. If a particular
98
+ symbol is shorter than nine bits, then that symbol's translation is duplicated
99
+ in all those entries that start with that symbol's bits. For example, if the
100
+ symbol is four bits, then it's duplicated 32 times in a nine-bit table. If a
101
+ symbol is nine bits long, it appears in the table once.
102
+
103
+ If the symbol is longer than nine bits, then that entry in the table points
104
+ to another similar table for the remaining bits. Again, there are duplicated
105
+ entries as needed. The idea is that most of the time the symbol will be short
106
+ and there will only be one table lookup. (That's the whole idea behind data
107
+ compression in the first place.) For the less frequent long symbols, there
108
+ will be two lookups. If you had a compression method with really long
109
+ symbols, you could have as many levels of lookups as is efficient. For
110
+ inflate, two is enough.
111
+
112
+ So a table entry either points to another table (in which case nine bits in
113
+ the above example are gobbled), or it contains the translation for the symbol
114
+ and the number of bits to gobble. Then you start again with the next
115
+ ungobbled bit.
116
+
117
+ You may wonder: why not just have one lookup table for how ever many bits the
118
+ longest symbol is? The reason is that if you do that, you end up spending
119
+ more time filling in duplicate symbol entries than you do actually decoding,
120
+ at least for deflate's output, which generates new trees every several tens of
121
+ kbytes. You can imagine that filling in a 2^15 entry table for a 15-bit code
122
+ would take too long if you're only decoding several thousand symbols. At the
123
+ other extreme, you could make a new table for every bit in the code. In fact,
124
+ that's essentially a Huffman tree. But then you spend too much time
125
+ traversing the tree while decoding, even for short symbols.
126
+
127
+ So the number of bits for the first lookup table is a tradeoff of the time to
128
+ fill out the table vs. the time spent looking at the second level and above of
129
+ the table.
130
+
131
+ Here is an example, scaled down:
132
+
133
+ The code being decoded, with 10 symbols, from 1 to 6 bits long:
134
+
135
+ A: 0
136
+ B: 10
137
+ C: 1100
138
+ D: 11010
139
+ E: 11011
140
+ F: 11100
141
+ G: 11101
142
+ H: 11110
143
+ I: 111110
144
+ J: 111111
145
+
146
+ Let's make the first table three bits long (eight entries):
147
+
148
+ 000: A,1
149
+ 001: A,1
150
+ 010: A,1
151
+ 011: A,1
152
+ 100: B,2
153
+ 101: B,2
154
+ 110: -> table X (gobble 3 bits)
155
+ 111: -> table Y (gobble 3 bits)
156
+
157
+ Each entry is what the bits decode as and how many bits that is, i.e. how
158
+ many bits to gobble. Or the entry points to another table, with the number of
159
+ bits to gobble implicit in the size of the table.
160
+
161
+ Table X is two bits long since the longest code starting with 110 is five bits
162
+ long:
163
+
164
+ 00: C,1
165
+ 01: C,1
166
+ 10: D,2
167
+ 11: E,2
168
+
169
+ Table Y is three bits long since the longest code starting with 111 is six
170
+ bits long:
171
+
172
+ 000: F,2
173
+ 001: F,2
174
+ 010: G,2
175
+ 011: G,2
176
+ 100: H,2
177
+ 101: H,2
178
+ 110: I,3
179
+ 111: J,3
180
+
181
+ So what we have here are three tables with a total of 20 entries that had to
182
+ be constructed. That's compared to 64 entries for a single table. Or
183
+ compared to 16 entries for a Huffman tree (six two entry tables and one four
184
+ entry table). Assuming that the code ideally represents the probability of
185
+ the symbols, it takes on the average 1.25 lookups per symbol. That's compared
186
+ to one lookup for the single table, or 1.66 lookups per symbol for the
187
+ Huffman tree.
188
+
189
+ There, I think that gives you a picture of what's going on. For inflate, the
190
+ meaning of a particular symbol is often more than just a letter. It can be a
191
+ byte (a "literal"), or it can be either a length or a distance which
192
+ indicates a base value and a number of bits to fetch after the code that is
193
+ added to the base value. Or it might be the special end-of-block code. The
194
+ data structures created in inftrees.c try to encode all that information
195
+ compactly in the tables.
196
+
197
+
198
+ Jean-loup Gailly Mark Adler
199
+ jloup@gzip.org madler@alumni.caltech.edu
200
+
201
+
202
+ References:
203
+
204
+ [LZ77] Ziv J., Lempel A., ``A Universal Algorithm for Sequential Data
205
+ Compression,'' IEEE Transactions on Information Theory, Vol. 23, No. 3,
206
+ pp. 337-343.
207
+
208
+ ``DEFLATE Compressed Data Format Specification'' available at
209
+ http://tools.ietf.org/html/rfc1951