chd 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (109) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +30 -0
  3. data/chd.gemspec +29 -0
  4. data/ext/chd.c +1008 -0
  5. data/ext/extconf.rb +60 -0
  6. data/lib/chd/cd.rb +272 -0
  7. data/lib/chd/metadata.rb +196 -0
  8. data/lib/chd/version.rb +4 -0
  9. data/lib/chd.rb +21 -0
  10. data/libchdr/CMakeLists.txt +104 -0
  11. data/libchdr/LICENSE.txt +24 -0
  12. data/libchdr/README.md +7 -0
  13. data/libchdr/deps/lzma-19.00/CMakeLists.txt +33 -0
  14. data/libchdr/deps/lzma-19.00/LICENSE +3 -0
  15. data/libchdr/deps/lzma-19.00/include/7zTypes.h +375 -0
  16. data/libchdr/deps/lzma-19.00/include/Alloc.h +51 -0
  17. data/libchdr/deps/lzma-19.00/include/Bra.h +64 -0
  18. data/libchdr/deps/lzma-19.00/include/Compiler.h +33 -0
  19. data/libchdr/deps/lzma-19.00/include/CpuArch.h +336 -0
  20. data/libchdr/deps/lzma-19.00/include/Delta.h +19 -0
  21. data/libchdr/deps/lzma-19.00/include/LzFind.h +121 -0
  22. data/libchdr/deps/lzma-19.00/include/LzHash.h +57 -0
  23. data/libchdr/deps/lzma-19.00/include/Lzma86.h +111 -0
  24. data/libchdr/deps/lzma-19.00/include/LzmaDec.h +234 -0
  25. data/libchdr/deps/lzma-19.00/include/LzmaEnc.h +76 -0
  26. data/libchdr/deps/lzma-19.00/include/LzmaLib.h +131 -0
  27. data/libchdr/deps/lzma-19.00/include/Precomp.h +10 -0
  28. data/libchdr/deps/lzma-19.00/include/Sort.h +18 -0
  29. data/libchdr/deps/lzma-19.00/lzma-history.txt +446 -0
  30. data/libchdr/deps/lzma-19.00/lzma.txt +328 -0
  31. data/libchdr/deps/lzma-19.00/lzma.vcxproj +543 -0
  32. data/libchdr/deps/lzma-19.00/lzma.vcxproj.filters +17 -0
  33. data/libchdr/deps/lzma-19.00/src/Alloc.c +455 -0
  34. data/libchdr/deps/lzma-19.00/src/Bra86.c +82 -0
  35. data/libchdr/deps/lzma-19.00/src/BraIA64.c +53 -0
  36. data/libchdr/deps/lzma-19.00/src/CpuArch.c +218 -0
  37. data/libchdr/deps/lzma-19.00/src/Delta.c +64 -0
  38. data/libchdr/deps/lzma-19.00/src/LzFind.c +1127 -0
  39. data/libchdr/deps/lzma-19.00/src/Lzma86Dec.c +54 -0
  40. data/libchdr/deps/lzma-19.00/src/LzmaDec.c +1185 -0
  41. data/libchdr/deps/lzma-19.00/src/LzmaEnc.c +1330 -0
  42. data/libchdr/deps/lzma-19.00/src/Sort.c +141 -0
  43. data/libchdr/deps/zlib-1.2.11/CMakeLists.txt +29 -0
  44. data/libchdr/deps/zlib-1.2.11/ChangeLog +1515 -0
  45. data/libchdr/deps/zlib-1.2.11/FAQ +368 -0
  46. data/libchdr/deps/zlib-1.2.11/INDEX +68 -0
  47. data/libchdr/deps/zlib-1.2.11/Makefile +5 -0
  48. data/libchdr/deps/zlib-1.2.11/Makefile.in +410 -0
  49. data/libchdr/deps/zlib-1.2.11/README +115 -0
  50. data/libchdr/deps/zlib-1.2.11/adler32.c +186 -0
  51. data/libchdr/deps/zlib-1.2.11/compress.c +86 -0
  52. data/libchdr/deps/zlib-1.2.11/configure +921 -0
  53. data/libchdr/deps/zlib-1.2.11/crc32.c +442 -0
  54. data/libchdr/deps/zlib-1.2.11/crc32.h +441 -0
  55. data/libchdr/deps/zlib-1.2.11/deflate.c +2163 -0
  56. data/libchdr/deps/zlib-1.2.11/deflate.h +349 -0
  57. data/libchdr/deps/zlib-1.2.11/doc/algorithm.txt +209 -0
  58. data/libchdr/deps/zlib-1.2.11/doc/rfc1950.txt +619 -0
  59. data/libchdr/deps/zlib-1.2.11/doc/rfc1951.txt +955 -0
  60. data/libchdr/deps/zlib-1.2.11/doc/rfc1952.txt +675 -0
  61. data/libchdr/deps/zlib-1.2.11/doc/txtvsbin.txt +107 -0
  62. data/libchdr/deps/zlib-1.2.11/gzclose.c +25 -0
  63. data/libchdr/deps/zlib-1.2.11/gzguts.h +218 -0
  64. data/libchdr/deps/zlib-1.2.11/gzlib.c +637 -0
  65. data/libchdr/deps/zlib-1.2.11/gzread.c +654 -0
  66. data/libchdr/deps/zlib-1.2.11/gzwrite.c +665 -0
  67. data/libchdr/deps/zlib-1.2.11/infback.c +640 -0
  68. data/libchdr/deps/zlib-1.2.11/inffast.c +323 -0
  69. data/libchdr/deps/zlib-1.2.11/inffast.h +11 -0
  70. data/libchdr/deps/zlib-1.2.11/inffixed.h +94 -0
  71. data/libchdr/deps/zlib-1.2.11/inflate.c +1561 -0
  72. data/libchdr/deps/zlib-1.2.11/inflate.h +125 -0
  73. data/libchdr/deps/zlib-1.2.11/inftrees.c +304 -0
  74. data/libchdr/deps/zlib-1.2.11/inftrees.h +62 -0
  75. data/libchdr/deps/zlib-1.2.11/make_vms.com +867 -0
  76. data/libchdr/deps/zlib-1.2.11/treebuild.xml +116 -0
  77. data/libchdr/deps/zlib-1.2.11/trees.c +1203 -0
  78. data/libchdr/deps/zlib-1.2.11/trees.h +128 -0
  79. data/libchdr/deps/zlib-1.2.11/uncompr.c +93 -0
  80. data/libchdr/deps/zlib-1.2.11/zconf.h +534 -0
  81. data/libchdr/deps/zlib-1.2.11/zconf.h.cmakein +536 -0
  82. data/libchdr/deps/zlib-1.2.11/zconf.h.in +534 -0
  83. data/libchdr/deps/zlib-1.2.11/zlib.3 +149 -0
  84. data/libchdr/deps/zlib-1.2.11/zlib.3.pdf +0 -0
  85. data/libchdr/deps/zlib-1.2.11/zlib.h +1912 -0
  86. data/libchdr/deps/zlib-1.2.11/zlib.map +94 -0
  87. data/libchdr/deps/zlib-1.2.11/zlib.pc.cmakein +13 -0
  88. data/libchdr/deps/zlib-1.2.11/zlib.pc.in +13 -0
  89. data/libchdr/deps/zlib-1.2.11/zlib2ansi +152 -0
  90. data/libchdr/deps/zlib-1.2.11/zutil.c +325 -0
  91. data/libchdr/deps/zlib-1.2.11/zutil.h +271 -0
  92. data/libchdr/include/dr_libs/dr_flac.h +12280 -0
  93. data/libchdr/include/libchdr/bitstream.h +43 -0
  94. data/libchdr/include/libchdr/cdrom.h +110 -0
  95. data/libchdr/include/libchdr/chd.h +427 -0
  96. data/libchdr/include/libchdr/chdconfig.h +10 -0
  97. data/libchdr/include/libchdr/coretypes.h +60 -0
  98. data/libchdr/include/libchdr/flac.h +50 -0
  99. data/libchdr/include/libchdr/huffman.h +90 -0
  100. data/libchdr/pkg-config.pc.in +10 -0
  101. data/libchdr/src/libchdr_bitstream.c +125 -0
  102. data/libchdr/src/libchdr_cdrom.c +415 -0
  103. data/libchdr/src/libchdr_chd.c +2744 -0
  104. data/libchdr/src/libchdr_flac.c +302 -0
  105. data/libchdr/src/libchdr_huffman.c +545 -0
  106. data/libchdr/src/link.T +5 -0
  107. data/libchdr/tests/CMakeLists.txt +2 -0
  108. data/libchdr/tests/benchmark.c +52 -0
  109. metadata +183 -0
@@ -0,0 +1,955 @@
1
+
2
+
3
+
4
+
5
+
6
+
7
+ Network Working Group P. Deutsch
8
+ Request for Comments: 1951 Aladdin Enterprises
9
+ Category: Informational May 1996
10
+
11
+
12
+ DEFLATE Compressed Data Format Specification version 1.3
13
+
14
+ Status of This Memo
15
+
16
+ This memo provides information for the Internet community. This memo
17
+ does not specify an Internet standard of any kind. Distribution of
18
+ this memo is unlimited.
19
+
20
+ IESG Note:
21
+
22
+ The IESG takes no position on the validity of any Intellectual
23
+ Property Rights statements contained in this document.
24
+
25
+ Notices
26
+
27
+ Copyright (c) 1996 L. Peter Deutsch
28
+
29
+ Permission is granted to copy and distribute this document for any
30
+ purpose and without charge, including translations into other
31
+ languages and incorporation into compilations, provided that the
32
+ copyright notice and this notice are preserved, and that any
33
+ substantive changes or deletions from the original are clearly
34
+ marked.
35
+
36
+ A pointer to the latest version of this and related documentation in
37
+ HTML format can be found at the URL
38
+ <ftp://ftp.uu.net/graphics/png/documents/zlib/zdoc-index.html>.
39
+
40
+ Abstract
41
+
42
+ This specification defines a lossless compressed data format that
43
+ compresses data using a combination of the LZ77 algorithm and Huffman
44
+ coding, with efficiency comparable to the best currently available
45
+ general-purpose compression methods. The data can be produced or
46
+ consumed, even for an arbitrarily long sequentially presented input
47
+ data stream, using only an a priori bounded amount of intermediate
48
+ storage. The format can be implemented readily in a manner not
49
+ covered by patents.
50
+
51
+
52
+
53
+
54
+
55
+
56
+
57
+
58
+ Deutsch Informational [Page 1]
59
+
60
+ RFC 1951 DEFLATE Compressed Data Format Specification May 1996
61
+
62
+
63
+ Table of Contents
64
+
65
+ 1. Introduction ................................................... 2
66
+ 1.1. Purpose ................................................... 2
67
+ 1.2. Intended audience ......................................... 3
68
+ 1.3. Scope ..................................................... 3
69
+ 1.4. Compliance ................................................ 3
70
+ 1.5. Definitions of terms and conventions used ................ 3
71
+ 1.6. Changes from previous versions ............................ 4
72
+ 2. Compressed representation overview ............................. 4
73
+ 3. Detailed specification ......................................... 5
74
+ 3.1. Overall conventions ....................................... 5
75
+ 3.1.1. Packing into bytes .................................. 5
76
+ 3.2. Compressed block format ................................... 6
77
+ 3.2.1. Synopsis of prefix and Huffman coding ............... 6
78
+ 3.2.2. Use of Huffman coding in the "deflate" format ....... 7
79
+ 3.2.3. Details of block format ............................. 9
80
+ 3.2.4. Non-compressed blocks (BTYPE=00) ................... 11
81
+ 3.2.5. Compressed blocks (length and distance codes) ...... 11
82
+ 3.2.6. Compression with fixed Huffman codes (BTYPE=01) .... 12
83
+ 3.2.7. Compression with dynamic Huffman codes (BTYPE=10) .. 13
84
+ 3.3. Compliance ............................................... 14
85
+ 4. Compression algorithm details ................................. 14
86
+ 5. References .................................................... 16
87
+ 6. Security Considerations ....................................... 16
88
+ 7. Source code ................................................... 16
89
+ 8. Acknowledgements .............................................. 16
90
+ 9. Author's Address .............................................. 17
91
+
92
+ 1. Introduction
93
+
94
+ 1.1. Purpose
95
+
96
+ The purpose of this specification is to define a lossless
97
+ compressed data format that:
98
+ * Is independent of CPU type, operating system, file system,
99
+ and character set, and hence can be used for interchange;
100
+ * Can be produced or consumed, even for an arbitrarily long
101
+ sequentially presented input data stream, using only an a
102
+ priori bounded amount of intermediate storage, and hence
103
+ can be used in data communications or similar structures
104
+ such as Unix filters;
105
+ * Compresses data with efficiency comparable to the best
106
+ currently available general-purpose compression methods,
107
+ and in particular considerably better than the "compress"
108
+ program;
109
+ * Can be implemented readily in a manner not covered by
110
+ patents, and hence can be practiced freely;
111
+
112
+
113
+
114
+ Deutsch Informational [Page 2]
115
+
116
+ RFC 1951 DEFLATE Compressed Data Format Specification May 1996
117
+
118
+
119
+ * Is compatible with the file format produced by the current
120
+ widely used gzip utility, in that conforming decompressors
121
+ will be able to read data produced by the existing gzip
122
+ compressor.
123
+
124
+ The data format defined by this specification does not attempt to:
125
+
126
+ * Allow random access to compressed data;
127
+ * Compress specialized data (e.g., raster graphics) as well
128
+ as the best currently available specialized algorithms.
129
+
130
+ A simple counting argument shows that no lossless compression
131
+ algorithm can compress every possible input data set. For the
132
+ format defined here, the worst case expansion is 5 bytes per 32K-
133
+ byte block, i.e., a size increase of 0.015% for large data sets.
134
+ English text usually compresses by a factor of 2.5 to 3;
135
+ executable files usually compress somewhat less; graphical data
136
+ such as raster images may compress much more.
137
+
138
+ 1.2. Intended audience
139
+
140
+ This specification is intended for use by implementors of software
141
+ to compress data into "deflate" format and/or decompress data from
142
+ "deflate" format.
143
+
144
+ The text of the specification assumes a basic background in
145
+ programming at the level of bits and other primitive data
146
+ representations. Familiarity with the technique of Huffman coding
147
+ is helpful but not required.
148
+
149
+ 1.3. Scope
150
+
151
+ The specification specifies a method for representing a sequence
152
+ of bytes as a (usually shorter) sequence of bits, and a method for
153
+ packing the latter bit sequence into bytes.
154
+
155
+ 1.4. Compliance
156
+
157
+ Unless otherwise indicated below, a compliant decompressor must be
158
+ able to accept and decompress any data set that conforms to all
159
+ the specifications presented here; a compliant compressor must
160
+ produce data sets that conform to all the specifications presented
161
+ here.
162
+
163
+ 1.5. Definitions of terms and conventions used
164
+
165
+ Byte: 8 bits stored or transmitted as a unit (same as an octet).
166
+ For this specification, a byte is exactly 8 bits, even on machines
167
+
168
+
169
+
170
+ Deutsch Informational [Page 3]
171
+
172
+ RFC 1951 DEFLATE Compressed Data Format Specification May 1996
173
+
174
+
175
+ which store a character on a number of bits different from eight.
176
+ See below, for the numbering of bits within a byte.
177
+
178
+ String: a sequence of arbitrary bytes.
179
+
180
+ 1.6. Changes from previous versions
181
+
182
+ There have been no technical changes to the deflate format since
183
+ version 1.1 of this specification. In version 1.2, some
184
+ terminology was changed. Version 1.3 is a conversion of the
185
+ specification to RFC style.
186
+
187
+ 2. Compressed representation overview
188
+
189
+ A compressed data set consists of a series of blocks, corresponding
190
+ to successive blocks of input data. The block sizes are arbitrary,
191
+ except that non-compressible blocks are limited to 65,535 bytes.
192
+
193
+ Each block is compressed using a combination of the LZ77 algorithm
194
+ and Huffman coding. The Huffman trees for each block are independent
195
+ of those for previous or subsequent blocks; the LZ77 algorithm may
196
+ use a reference to a duplicated string occurring in a previous block,
197
+ up to 32K input bytes before.
198
+
199
+ Each block consists of two parts: a pair of Huffman code trees that
200
+ describe the representation of the compressed data part, and a
201
+ compressed data part. (The Huffman trees themselves are compressed
202
+ using Huffman encoding.) The compressed data consists of a series of
203
+ elements of two types: literal bytes (of strings that have not been
204
+ detected as duplicated within the previous 32K input bytes), and
205
+ pointers to duplicated strings, where a pointer is represented as a
206
+ pair <length, backward distance>. The representation used in the
207
+ "deflate" format limits distances to 32K bytes and lengths to 258
208
+ bytes, but does not limit the size of a block, except for
209
+ uncompressible blocks, which are limited as noted above.
210
+
211
+ Each type of value (literals, distances, and lengths) in the
212
+ compressed data is represented using a Huffman code, using one code
213
+ tree for literals and lengths and a separate code tree for distances.
214
+ The code trees for each block appear in a compact form just before
215
+ the compressed data for that block.
216
+
217
+
218
+
219
+
220
+
221
+
222
+
223
+
224
+
225
+
226
+ Deutsch Informational [Page 4]
227
+
228
+ RFC 1951 DEFLATE Compressed Data Format Specification May 1996
229
+
230
+
231
+ 3. Detailed specification
232
+
233
+ 3.1. Overall conventions In the diagrams below, a box like this:
234
+
235
+ +---+
236
+ | | <-- the vertical bars might be missing
237
+ +---+
238
+
239
+ represents one byte; a box like this:
240
+
241
+ +==============+
242
+ | |
243
+ +==============+
244
+
245
+ represents a variable number of bytes.
246
+
247
+ Bytes stored within a computer do not have a "bit order", since
248
+ they are always treated as a unit. However, a byte considered as
249
+ an integer between 0 and 255 does have a most- and least-
250
+ significant bit, and since we write numbers with the most-
251
+ significant digit on the left, we also write bytes with the most-
252
+ significant bit on the left. In the diagrams below, we number the
253
+ bits of a byte so that bit 0 is the least-significant bit, i.e.,
254
+ the bits are numbered:
255
+
256
+ +--------+
257
+ |76543210|
258
+ +--------+
259
+
260
+ Within a computer, a number may occupy multiple bytes. All
261
+ multi-byte numbers in the format described here are stored with
262
+ the least-significant byte first (at the lower memory address).
263
+ For example, the decimal number 520 is stored as:
264
+
265
+ 0 1
266
+ +--------+--------+
267
+ |00001000|00000010|
268
+ +--------+--------+
269
+ ^ ^
270
+ | |
271
+ | + more significant byte = 2 x 256
272
+ + less significant byte = 8
273
+
274
+ 3.1.1. Packing into bytes
275
+
276
+ This document does not address the issue of the order in which
277
+ bits of a byte are transmitted on a bit-sequential medium,
278
+ since the final data format described here is byte- rather than
279
+
280
+
281
+
282
+ Deutsch Informational [Page 5]
283
+
284
+ RFC 1951 DEFLATE Compressed Data Format Specification May 1996
285
+
286
+
287
+ bit-oriented. However, we describe the compressed block format
288
+ in below, as a sequence of data elements of various bit
289
+ lengths, not a sequence of bytes. We must therefore specify
290
+ how to pack these data elements into bytes to form the final
291
+ compressed byte sequence:
292
+
293
+ * Data elements are packed into bytes in order of
294
+ increasing bit number within the byte, i.e., starting
295
+ with the least-significant bit of the byte.
296
+ * Data elements other than Huffman codes are packed
297
+ starting with the least-significant bit of the data
298
+ element.
299
+ * Huffman codes are packed starting with the most-
300
+ significant bit of the code.
301
+
302
+ In other words, if one were to print out the compressed data as
303
+ a sequence of bytes, starting with the first byte at the
304
+ *right* margin and proceeding to the *left*, with the most-
305
+ significant bit of each byte on the left as usual, one would be
306
+ able to parse the result from right to left, with fixed-width
307
+ elements in the correct MSB-to-LSB order and Huffman codes in
308
+ bit-reversed order (i.e., with the first bit of the code in the
309
+ relative LSB position).
310
+
311
+ 3.2. Compressed block format
312
+
313
+ 3.2.1. Synopsis of prefix and Huffman coding
314
+
315
+ Prefix coding represents symbols from an a priori known
316
+ alphabet by bit sequences (codes), one code for each symbol, in
317
+ a manner such that different symbols may be represented by bit
318
+ sequences of different lengths, but a parser can always parse
319
+ an encoded string unambiguously symbol-by-symbol.
320
+
321
+ We define a prefix code in terms of a binary tree in which the
322
+ two edges descending from each non-leaf node are labeled 0 and
323
+ 1 and in which the leaf nodes correspond one-for-one with (are
324
+ labeled with) the symbols of the alphabet; then the code for a
325
+ symbol is the sequence of 0's and 1's on the edges leading from
326
+ the root to the leaf labeled with that symbol. For example:
327
+
328
+
329
+
330
+
331
+
332
+
333
+
334
+
335
+
336
+
337
+
338
+ Deutsch Informational [Page 6]
339
+
340
+ RFC 1951 DEFLATE Compressed Data Format Specification May 1996
341
+
342
+
343
+ /\ Symbol Code
344
+ 0 1 ------ ----
345
+ / \ A 00
346
+ /\ B B 1
347
+ 0 1 C 011
348
+ / \ D 010
349
+ A /\
350
+ 0 1
351
+ / \
352
+ D C
353
+
354
+ A parser can decode the next symbol from an encoded input
355
+ stream by walking down the tree from the root, at each step
356
+ choosing the edge corresponding to the next input bit.
357
+
358
+ Given an alphabet with known symbol frequencies, the Huffman
359
+ algorithm allows the construction of an optimal prefix code
360
+ (one which represents strings with those symbol frequencies
361
+ using the fewest bits of any possible prefix codes for that
362
+ alphabet). Such a code is called a Huffman code. (See
363
+ reference [1] in Chapter 5, references for additional
364
+ information on Huffman codes.)
365
+
366
+ Note that in the "deflate" format, the Huffman codes for the
367
+ various alphabets must not exceed certain maximum code lengths.
368
+ This constraint complicates the algorithm for computing code
369
+ lengths from symbol frequencies. Again, see Chapter 5,
370
+ references for details.
371
+
372
+ 3.2.2. Use of Huffman coding in the "deflate" format
373
+
374
+ The Huffman codes used for each alphabet in the "deflate"
375
+ format have two additional rules:
376
+
377
+ * All codes of a given bit length have lexicographically
378
+ consecutive values, in the same order as the symbols
379
+ they represent;
380
+
381
+ * Shorter codes lexicographically precede longer codes.
382
+
383
+
384
+
385
+
386
+
387
+
388
+
389
+
390
+
391
+
392
+
393
+
394
+ Deutsch Informational [Page 7]
395
+
396
+ RFC 1951 DEFLATE Compressed Data Format Specification May 1996
397
+
398
+
399
+ We could recode the example above to follow this rule as
400
+ follows, assuming that the order of the alphabet is ABCD:
401
+
402
+ Symbol Code
403
+ ------ ----
404
+ A 10
405
+ B 0
406
+ C 110
407
+ D 111
408
+
409
+ I.e., 0 precedes 10 which precedes 11x, and 110 and 111 are
410
+ lexicographically consecutive.
411
+
412
+ Given this rule, we can define the Huffman code for an alphabet
413
+ just by giving the bit lengths of the codes for each symbol of
414
+ the alphabet in order; this is sufficient to determine the
415
+ actual codes. In our example, the code is completely defined
416
+ by the sequence of bit lengths (2, 1, 3, 3). The following
417
+ algorithm generates the codes as integers, intended to be read
418
+ from most- to least-significant bit. The code lengths are
419
+ initially in tree[I].Len; the codes are produced in
420
+ tree[I].Code.
421
+
422
+ 1) Count the number of codes for each code length. Let
423
+ bl_count[N] be the number of codes of length N, N >= 1.
424
+
425
+ 2) Find the numerical value of the smallest code for each
426
+ code length:
427
+
428
+ code = 0;
429
+ bl_count[0] = 0;
430
+ for (bits = 1; bits <= MAX_BITS; bits++) {
431
+ code = (code + bl_count[bits-1]) << 1;
432
+ next_code[bits] = code;
433
+ }
434
+
435
+ 3) Assign numerical values to all codes, using consecutive
436
+ values for all codes of the same length with the base
437
+ values determined at step 2. Codes that are never used
438
+ (which have a bit length of zero) must not be assigned a
439
+ value.
440
+
441
+ for (n = 0; n <= max_code; n++) {
442
+ len = tree[n].Len;
443
+ if (len != 0) {
444
+ tree[n].Code = next_code[len];
445
+ next_code[len]++;
446
+ }
447
+
448
+
449
+
450
+ Deutsch Informational [Page 8]
451
+
452
+ RFC 1951 DEFLATE Compressed Data Format Specification May 1996
453
+
454
+
455
+ }
456
+
457
+ Example:
458
+
459
+ Consider the alphabet ABCDEFGH, with bit lengths (3, 3, 3, 3,
460
+ 3, 2, 4, 4). After step 1, we have:
461
+
462
+ N bl_count[N]
463
+ - -----------
464
+ 2 1
465
+ 3 5
466
+ 4 2
467
+
468
+ Step 2 computes the following next_code values:
469
+
470
+ N next_code[N]
471
+ - ------------
472
+ 1 0
473
+ 2 0
474
+ 3 2
475
+ 4 14
476
+
477
+ Step 3 produces the following code values:
478
+
479
+ Symbol Length Code
480
+ ------ ------ ----
481
+ A 3 010
482
+ B 3 011
483
+ C 3 100
484
+ D 3 101
485
+ E 3 110
486
+ F 2 00
487
+ G 4 1110
488
+ H 4 1111
489
+
490
+ 3.2.3. Details of block format
491
+
492
+ Each block of compressed data begins with 3 header bits
493
+ containing the following data:
494
+
495
+ first bit BFINAL
496
+ next 2 bits BTYPE
497
+
498
+ Note that the header bits do not necessarily begin on a byte
499
+ boundary, since a block does not necessarily occupy an integral
500
+ number of bytes.
501
+
502
+
503
+
504
+
505
+
506
+ Deutsch Informational [Page 9]
507
+
508
+ RFC 1951 DEFLATE Compressed Data Format Specification May 1996
509
+
510
+
511
+ BFINAL is set if and only if this is the last block of the data
512
+ set.
513
+
514
+ BTYPE specifies how the data are compressed, as follows:
515
+
516
+ 00 - no compression
517
+ 01 - compressed with fixed Huffman codes
518
+ 10 - compressed with dynamic Huffman codes
519
+ 11 - reserved (error)
520
+
521
+ The only difference between the two compressed cases is how the
522
+ Huffman codes for the literal/length and distance alphabets are
523
+ defined.
524
+
525
+ In all cases, the decoding algorithm for the actual data is as
526
+ follows:
527
+
528
+ do
529
+ read block header from input stream.
530
+ if stored with no compression
531
+ skip any remaining bits in current partially
532
+ processed byte
533
+ read LEN and NLEN (see next section)
534
+ copy LEN bytes of data to output
535
+ otherwise
536
+ if compressed with dynamic Huffman codes
537
+ read representation of code trees (see
538
+ subsection below)
539
+ loop (until end of block code recognized)
540
+ decode literal/length value from input stream
541
+ if value < 256
542
+ copy value (literal byte) to output stream
543
+ otherwise
544
+ if value = end of block (256)
545
+ break from loop
546
+ otherwise (value = 257..285)
547
+ decode distance from input stream
548
+
549
+ move backwards distance bytes in the output
550
+ stream, and copy length bytes from this
551
+ position to the output stream.
552
+ end loop
553
+ while not last block
554
+
555
+ Note that a duplicated string reference may refer to a string
556
+ in a previous block; i.e., the backward distance may cross one
557
+ or more block boundaries. However a distance cannot refer past
558
+ the beginning of the output stream. (An application using a
559
+
560
+
561
+
562
+ Deutsch Informational [Page 10]
563
+
564
+ RFC 1951 DEFLATE Compressed Data Format Specification May 1996
565
+
566
+
567
+ preset dictionary might discard part of the output stream; a
568
+ distance can refer to that part of the output stream anyway)
569
+ Note also that the referenced string may overlap the current
570
+ position; for example, if the last 2 bytes decoded have values
571
+ X and Y, a string reference with <length = 5, distance = 2>
572
+ adds X,Y,X,Y,X to the output stream.
573
+
574
+ We now specify each compression method in turn.
575
+
576
+ 3.2.4. Non-compressed blocks (BTYPE=00)
577
+
578
+ Any bits of input up to the next byte boundary are ignored.
579
+ The rest of the block consists of the following information:
580
+
581
+ 0 1 2 3 4...
582
+ +---+---+---+---+================================+
583
+ | LEN | NLEN |... LEN bytes of literal data...|
584
+ +---+---+---+---+================================+
585
+
586
+ LEN is the number of data bytes in the block. NLEN is the
587
+ one's complement of LEN.
588
+
589
+ 3.2.5. Compressed blocks (length and distance codes)
590
+
591
+ As noted above, encoded data blocks in the "deflate" format
592
+ consist of sequences of symbols drawn from three conceptually
593
+ distinct alphabets: either literal bytes, from the alphabet of
594
+ byte values (0..255), or <length, backward distance> pairs,
595
+ where the length is drawn from (3..258) and the distance is
596
+ drawn from (1..32,768). In fact, the literal and length
597
+ alphabets are merged into a single alphabet (0..285), where
598
+ values 0..255 represent literal bytes, the value 256 indicates
599
+ end-of-block, and values 257..285 represent length codes
600
+ (possibly in conjunction with extra bits following the symbol
601
+ code) as follows:
602
+
603
+
604
+
605
+
606
+
607
+
608
+
609
+
610
+
611
+
612
+
613
+
614
+
615
+
616
+
617
+
618
+ Deutsch Informational [Page 11]
619
+
620
+ RFC 1951 DEFLATE Compressed Data Format Specification May 1996
621
+
622
+
623
+ Extra Extra Extra
624
+ Code Bits Length(s) Code Bits Lengths Code Bits Length(s)
625
+ ---- ---- ------ ---- ---- ------- ---- ---- -------
626
+ 257 0 3 267 1 15,16 277 4 67-82
627
+ 258 0 4 268 1 17,18 278 4 83-98
628
+ 259 0 5 269 2 19-22 279 4 99-114
629
+ 260 0 6 270 2 23-26 280 4 115-130
630
+ 261 0 7 271 2 27-30 281 5 131-162
631
+ 262 0 8 272 2 31-34 282 5 163-194
632
+ 263 0 9 273 3 35-42 283 5 195-226
633
+ 264 0 10 274 3 43-50 284 5 227-257
634
+ 265 1 11,12 275 3 51-58 285 0 258
635
+ 266 1 13,14 276 3 59-66
636
+
637
+ The extra bits should be interpreted as a machine integer
638
+ stored with the most-significant bit first, e.g., bits 1110
639
+ represent the value 14.
640
+
641
+ Extra Extra Extra
642
+ Code Bits Dist Code Bits Dist Code Bits Distance
643
+ ---- ---- ---- ---- ---- ------ ---- ---- --------
644
+ 0 0 1 10 4 33-48 20 9 1025-1536
645
+ 1 0 2 11 4 49-64 21 9 1537-2048
646
+ 2 0 3 12 5 65-96 22 10 2049-3072
647
+ 3 0 4 13 5 97-128 23 10 3073-4096
648
+ 4 1 5,6 14 6 129-192 24 11 4097-6144
649
+ 5 1 7,8 15 6 193-256 25 11 6145-8192
650
+ 6 2 9-12 16 7 257-384 26 12 8193-12288
651
+ 7 2 13-16 17 7 385-512 27 12 12289-16384
652
+ 8 3 17-24 18 8 513-768 28 13 16385-24576
653
+ 9 3 25-32 19 8 769-1024 29 13 24577-32768
654
+
655
+ 3.2.6. Compression with fixed Huffman codes (BTYPE=01)
656
+
657
+ The Huffman codes for the two alphabets are fixed, and are not
658
+ represented explicitly in the data. The Huffman code lengths
659
+ for the literal/length alphabet are:
660
+
661
+ Lit Value Bits Codes
662
+ --------- ---- -----
663
+ 0 - 143 8 00110000 through
664
+ 10111111
665
+ 144 - 255 9 110010000 through
666
+ 111111111
667
+ 256 - 279 7 0000000 through
668
+ 0010111
669
+ 280 - 287 8 11000000 through
670
+ 11000111
671
+
672
+
673
+
674
+ Deutsch Informational [Page 12]
675
+
676
+ RFC 1951 DEFLATE Compressed Data Format Specification May 1996
677
+
678
+
679
+ The code lengths are sufficient to generate the actual codes,
680
+ as described above; we show the codes in the table for added
681
+ clarity. Literal/length values 286-287 will never actually
682
+ occur in the compressed data, but participate in the code
683
+ construction.
684
+
685
+ Distance codes 0-31 are represented by (fixed-length) 5-bit
686
+ codes, with possible additional bits as shown in the table
687
+ shown in Paragraph 3.2.5, above. Note that distance codes 30-
688
+ 31 will never actually occur in the compressed data.
689
+
690
+ 3.2.7. Compression with dynamic Huffman codes (BTYPE=10)
691
+
692
+ The Huffman codes for the two alphabets appear in the block
693
+ immediately after the header bits and before the actual
694
+ compressed data, first the literal/length code and then the
695
+ distance code. Each code is defined by a sequence of code
696
+ lengths, as discussed in Paragraph 3.2.2, above. For even
697
+ greater compactness, the code length sequences themselves are
698
+ compressed using a Huffman code. The alphabet for code lengths
699
+ is as follows:
700
+
701
+ 0 - 15: Represent code lengths of 0 - 15
702
+ 16: Copy the previous code length 3 - 6 times.
703
+ The next 2 bits indicate repeat length
704
+ (0 = 3, ... , 3 = 6)
705
+ Example: Codes 8, 16 (+2 bits 11),
706
+ 16 (+2 bits 10) will expand to
707
+ 12 code lengths of 8 (1 + 6 + 5)
708
+ 17: Repeat a code length of 0 for 3 - 10 times.
709
+ (3 bits of length)
710
+ 18: Repeat a code length of 0 for 11 - 138 times
711
+ (7 bits of length)
712
+
713
+ A code length of 0 indicates that the corresponding symbol in
714
+ the literal/length or distance alphabet will not occur in the
715
+ block, and should not participate in the Huffman code
716
+ construction algorithm given earlier. If only one distance
717
+ code is used, it is encoded using one bit, not zero bits; in
718
+ this case there is a single code length of one, with one unused
719
+ code. One distance code of zero bits means that there are no
720
+ distance codes used at all (the data is all literals).
721
+
722
+ We can now define the format of the block:
723
+
724
+ 5 Bits: HLIT, # of Literal/Length codes - 257 (257 - 286)
725
+ 5 Bits: HDIST, # of Distance codes - 1 (1 - 32)
726
+ 4 Bits: HCLEN, # of Code Length codes - 4 (4 - 19)
727
+
728
+
729
+
730
+ Deutsch Informational [Page 13]
731
+
732
+ RFC 1951 DEFLATE Compressed Data Format Specification May 1996
733
+
734
+
735
+ (HCLEN + 4) x 3 bits: code lengths for the code length
736
+ alphabet given just above, in the order: 16, 17, 18,
737
+ 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
738
+
739
+ These code lengths are interpreted as 3-bit integers
740
+ (0-7); as above, a code length of 0 means the
741
+ corresponding symbol (literal/length or distance code
742
+ length) is not used.
743
+
744
+ HLIT + 257 code lengths for the literal/length alphabet,
745
+ encoded using the code length Huffman code
746
+
747
+ HDIST + 1 code lengths for the distance alphabet,
748
+ encoded using the code length Huffman code
749
+
750
+ The actual compressed data of the block,
751
+ encoded using the literal/length and distance Huffman
752
+ codes
753
+
754
+ The literal/length symbol 256 (end of data),
755
+ encoded using the literal/length Huffman code
756
+
757
+ The code length repeat codes can cross from HLIT + 257 to the
758
+ HDIST + 1 code lengths. In other words, all code lengths form
759
+ a single sequence of HLIT + HDIST + 258 values.
760
+
761
+ 3.3. Compliance
762
+
763
+ A compressor may limit further the ranges of values specified in
764
+ the previous section and still be compliant; for example, it may
765
+ limit the range of backward pointers to some value smaller than
766
+ 32K. Similarly, a compressor may limit the size of blocks so that
767
+ a compressible block fits in memory.
768
+
769
+ A compliant decompressor must accept the full range of possible
770
+ values defined in the previous section, and must accept blocks of
771
+ arbitrary size.
772
+
773
+ 4. Compression algorithm details
774
+
775
+ While it is the intent of this document to define the "deflate"
776
+ compressed data format without reference to any particular
777
+ compression algorithm, the format is related to the compressed
778
+ formats produced by LZ77 (Lempel-Ziv 1977, see reference [2] below);
779
+ since many variations of LZ77 are patented, it is strongly
780
+ recommended that the implementor of a compressor follow the general
781
+ algorithm presented here, which is known not to be patented per se.
782
+ The material in this section is not part of the definition of the
783
+
784
+
785
+
786
+ Deutsch Informational [Page 14]
787
+
788
+ RFC 1951 DEFLATE Compressed Data Format Specification May 1996
789
+
790
+
791
+ specification per se, and a compressor need not follow it in order to
792
+ be compliant.
793
+
794
+ The compressor terminates a block when it determines that starting a
795
+ new block with fresh trees would be useful, or when the block size
796
+ fills up the compressor's block buffer.
797
+
798
+ The compressor uses a chained hash table to find duplicated strings,
799
+ using a hash function that operates on 3-byte sequences. At any
800
+ given point during compression, let XYZ be the next 3 input bytes to
801
+ be examined (not necessarily all different, of course). First, the
802
+ compressor examines the hash chain for XYZ. If the chain is empty,
803
+ the compressor simply writes out X as a literal byte and advances one
804
+ byte in the input. If the hash chain is not empty, indicating that
805
+ the sequence XYZ (or, if we are unlucky, some other 3 bytes with the
806
+ same hash function value) has occurred recently, the compressor
807
+ compares all strings on the XYZ hash chain with the actual input data
808
+ sequence starting at the current point, and selects the longest
809
+ match.
810
+
811
+ The compressor searches the hash chains starting with the most recent
812
+ strings, to favor small distances and thus take advantage of the
813
+ Huffman encoding. The hash chains are singly linked. There are no
814
+ deletions from the hash chains; the algorithm simply discards matches
815
+ that are too old. To avoid a worst-case situation, very long hash
816
+ chains are arbitrarily truncated at a certain length, determined by a
817
+ run-time parameter.
818
+
819
+ To improve overall compression, the compressor optionally defers the
820
+ selection of matches ("lazy matching"): after a match of length N has
821
+ been found, the compressor searches for a longer match starting at
822
+ the next input byte. If it finds a longer match, it truncates the
823
+ previous match to a length of one (thus producing a single literal
824
+ byte) and then emits the longer match. Otherwise, it emits the
825
+ original match, and, as described above, advances N bytes before
826
+ continuing.
827
+
828
+ Run-time parameters also control this "lazy match" procedure. If
829
+ compression ratio is most important, the compressor attempts a
830
+ complete second search regardless of the length of the first match.
831
+ In the normal case, if the current match is "long enough", the
832
+ compressor reduces the search for a longer match, thus speeding up
833
+ the process. If speed is most important, the compressor inserts new
834
+ strings in the hash table only when no match was found, or when the
835
+ match is not "too long". This degrades the compression ratio but
836
+ saves time since there are both fewer insertions and fewer searches.
837
+
838
+
839
+
840
+
841
+
842
+ Deutsch Informational [Page 15]
843
+
844
+ RFC 1951 DEFLATE Compressed Data Format Specification May 1996
845
+
846
+
847
+ 5. References
848
+
849
+ [1] Huffman, D. A., "A Method for the Construction of Minimum
850
+ Redundancy Codes", Proceedings of the Institute of Radio
851
+ Engineers, September 1952, Volume 40, Number 9, pp. 1098-1101.
852
+
853
+ [2] Ziv J., Lempel A., "A Universal Algorithm for Sequential Data
854
+ Compression", IEEE Transactions on Information Theory, Vol. 23,
855
+ No. 3, pp. 337-343.
856
+
857
+ [3] Gailly, J.-L., and Adler, M., ZLIB documentation and sources,
858
+ available in ftp://ftp.uu.net/pub/archiving/zip/doc/
859
+
860
+ [4] Gailly, J.-L., and Adler, M., GZIP documentation and sources,
861
+ available as gzip-*.tar in ftp://prep.ai.mit.edu/pub/gnu/
862
+
863
+ [5] Schwartz, E. S., and Kallick, B. "Generating a canonical prefix
864
+ encoding." Comm. ACM, 7,3 (Mar. 1964), pp. 166-169.
865
+
866
+ [6] Hirschberg and Lelewer, "Efficient decoding of prefix codes,"
867
+ Comm. ACM, 33,4, April 1990, pp. 449-459.
868
+
869
+ 6. Security Considerations
870
+
871
+ Any data compression method involves the reduction of redundancy in
872
+ the data. Consequently, any corruption of the data is likely to have
873
+ severe effects and be difficult to correct. Uncompressed text, on
874
+ the other hand, will probably still be readable despite the presence
875
+ of some corrupted bytes.
876
+
877
+ It is recommended that systems using this data format provide some
878
+ means of validating the integrity of the compressed data. See
879
+ reference [3], for example.
880
+
881
+ 7. Source code
882
+
883
+ Source code for a C language implementation of a "deflate" compliant
884
+ compressor and decompressor is available within the zlib package at
885
+ ftp://ftp.uu.net/pub/archiving/zip/zlib/.
886
+
887
+ 8. Acknowledgements
888
+
889
+ Trademarks cited in this document are the property of their
890
+ respective owners.
891
+
892
+ Phil Katz designed the deflate format. Jean-Loup Gailly and Mark
893
+ Adler wrote the related software described in this specification.
894
+ Glenn Randers-Pehrson converted this document to RFC and HTML format.
895
+
896
+
897
+
898
+ Deutsch Informational [Page 16]
899
+
900
+ RFC 1951 DEFLATE Compressed Data Format Specification May 1996
901
+
902
+
903
+ 9. Author's Address
904
+
905
+ L. Peter Deutsch
906
+ Aladdin Enterprises
907
+ 203 Santa Margarita Ave.
908
+ Menlo Park, CA 94025
909
+
910
+ Phone: (415) 322-0103 (AM only)
911
+ FAX: (415) 322-1734
912
+ EMail: <ghost@aladdin.com>
913
+
914
+ Questions about the technical content of this specification can be
915
+ sent by email to:
916
+
917
+ Jean-Loup Gailly <gzip@prep.ai.mit.edu> and
918
+ Mark Adler <madler@alumni.caltech.edu>
919
+
920
+ Editorial comments on this specification can be sent by email to:
921
+
922
+ L. Peter Deutsch <ghost@aladdin.com> and
923
+ Glenn Randers-Pehrson <randeg@alumni.rpi.edu>
924
+
925
+
926
+
927
+
928
+
929
+
930
+
931
+
932
+
933
+
934
+
935
+
936
+
937
+
938
+
939
+
940
+
941
+
942
+
943
+
944
+
945
+
946
+
947
+
948
+
949
+
950
+
951
+
952
+
953
+
954
+ Deutsch Informational [Page 17]
955
+