cabriolet 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. checksums.yaml +7 -0
  2. data/ARCHITECTURE.md +799 -0
  3. data/CHANGELOG.md +44 -0
  4. data/LICENSE +29 -0
  5. data/README.adoc +1207 -0
  6. data/exe/cabriolet +6 -0
  7. data/lib/cabriolet/auto.rb +173 -0
  8. data/lib/cabriolet/binary/bitstream.rb +148 -0
  9. data/lib/cabriolet/binary/bitstream_writer.rb +180 -0
  10. data/lib/cabriolet/binary/chm_structures.rb +213 -0
  11. data/lib/cabriolet/binary/hlp_structures.rb +66 -0
  12. data/lib/cabriolet/binary/kwaj_structures.rb +74 -0
  13. data/lib/cabriolet/binary/lit_structures.rb +107 -0
  14. data/lib/cabriolet/binary/oab_structures.rb +112 -0
  15. data/lib/cabriolet/binary/structures.rb +56 -0
  16. data/lib/cabriolet/binary/szdd_structures.rb +60 -0
  17. data/lib/cabriolet/cab/compressor.rb +382 -0
  18. data/lib/cabriolet/cab/decompressor.rb +510 -0
  19. data/lib/cabriolet/cab/extractor.rb +357 -0
  20. data/lib/cabriolet/cab/parser.rb +264 -0
  21. data/lib/cabriolet/chm/compressor.rb +513 -0
  22. data/lib/cabriolet/chm/decompressor.rb +436 -0
  23. data/lib/cabriolet/chm/parser.rb +254 -0
  24. data/lib/cabriolet/cli.rb +776 -0
  25. data/lib/cabriolet/compressors/base.rb +34 -0
  26. data/lib/cabriolet/compressors/lzss.rb +250 -0
  27. data/lib/cabriolet/compressors/lzx.rb +581 -0
  28. data/lib/cabriolet/compressors/mszip.rb +315 -0
  29. data/lib/cabriolet/compressors/quantum.rb +446 -0
  30. data/lib/cabriolet/constants.rb +75 -0
  31. data/lib/cabriolet/decompressors/base.rb +39 -0
  32. data/lib/cabriolet/decompressors/lzss.rb +138 -0
  33. data/lib/cabriolet/decompressors/lzx.rb +726 -0
  34. data/lib/cabriolet/decompressors/mszip.rb +390 -0
  35. data/lib/cabriolet/decompressors/none.rb +27 -0
  36. data/lib/cabriolet/decompressors/quantum.rb +456 -0
  37. data/lib/cabriolet/errors.rb +39 -0
  38. data/lib/cabriolet/format_detector.rb +156 -0
  39. data/lib/cabriolet/hlp/compressor.rb +272 -0
  40. data/lib/cabriolet/hlp/decompressor.rb +198 -0
  41. data/lib/cabriolet/hlp/parser.rb +131 -0
  42. data/lib/cabriolet/huffman/decoder.rb +79 -0
  43. data/lib/cabriolet/huffman/encoder.rb +108 -0
  44. data/lib/cabriolet/huffman/tree.rb +138 -0
  45. data/lib/cabriolet/kwaj/compressor.rb +479 -0
  46. data/lib/cabriolet/kwaj/decompressor.rb +237 -0
  47. data/lib/cabriolet/kwaj/parser.rb +183 -0
  48. data/lib/cabriolet/lit/compressor.rb +255 -0
  49. data/lib/cabriolet/lit/decompressor.rb +250 -0
  50. data/lib/cabriolet/models/cabinet.rb +81 -0
  51. data/lib/cabriolet/models/chm_file.rb +28 -0
  52. data/lib/cabriolet/models/chm_header.rb +67 -0
  53. data/lib/cabriolet/models/chm_section.rb +38 -0
  54. data/lib/cabriolet/models/file.rb +119 -0
  55. data/lib/cabriolet/models/folder.rb +102 -0
  56. data/lib/cabriolet/models/folder_data.rb +21 -0
  57. data/lib/cabriolet/models/hlp_file.rb +45 -0
  58. data/lib/cabriolet/models/hlp_header.rb +37 -0
  59. data/lib/cabriolet/models/kwaj_header.rb +98 -0
  60. data/lib/cabriolet/models/lit_header.rb +55 -0
  61. data/lib/cabriolet/models/oab_header.rb +95 -0
  62. data/lib/cabriolet/models/szdd_header.rb +72 -0
  63. data/lib/cabriolet/modifier.rb +326 -0
  64. data/lib/cabriolet/oab/compressor.rb +353 -0
  65. data/lib/cabriolet/oab/decompressor.rb +315 -0
  66. data/lib/cabriolet/parallel.rb +333 -0
  67. data/lib/cabriolet/repairer.rb +288 -0
  68. data/lib/cabriolet/streaming.rb +221 -0
  69. data/lib/cabriolet/system/file_handle.rb +107 -0
  70. data/lib/cabriolet/system/io_system.rb +87 -0
  71. data/lib/cabriolet/system/memory_handle.rb +105 -0
  72. data/lib/cabriolet/szdd/compressor.rb +217 -0
  73. data/lib/cabriolet/szdd/decompressor.rb +184 -0
  74. data/lib/cabriolet/szdd/parser.rb +127 -0
  75. data/lib/cabriolet/validator.rb +332 -0
  76. data/lib/cabriolet/version.rb +5 -0
  77. data/lib/cabriolet.rb +104 -0
  78. metadata +157 -0
@@ -0,0 +1,315 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "../binary/bitstream_writer"
4
+ require_relative "../huffman/encoder"
5
+
6
+ module Cabriolet
7
+ module Compressors
8
+ # MSZIP handles MSZIP (DEFLATE) compression
9
+ # Based on RFC 1951 and libmspack implementation
10
+ class MSZIP < Base
11
# Largest number of input bytes placed in a single MSZIP frame (32 KiB)
FRAME_SIZE = 32 * 1024

# Marker bytes 'C' and 'K' that precede every frame
SIGNATURE = [0x43, 0x4B].freeze

# DEFLATE block type identifiers
STORED_BLOCK = 0
FIXED_HUFFMAN_BLOCK = 1
DYNAMIC_HUFFMAN_BLOCK = 2

# Shortest and longest match that can be encoded
MIN_MATCH = 3
MAX_MATCH = 258

# Capacity of the LZ77 history window (32 KiB)
WINDOW_SIZE = 32 * 1024

# Base match length for each of the length codes 257-285
LIT_LENGTHS = [
  3, 4, 5, 6, 7, 8, 9, 10,
  11, 13, 15, 17,
  19, 23, 27, 31,
  35, 43, 51, 59,
  67, 83, 99, 115,
  131, 163, 195, 227,
  258
].freeze

# Base match distance for each of the distance codes 0-29
DIST_OFFSETS = [
  1, 2, 3, 4, 5, 7, 9, 13, 17, 25,
  33, 49, 65, 97, 129, 193, 257, 385, 513, 769,
  1025, 1537, 2049, 3073, 4097, 6145, 8193, 12_289, 16_385, 24_577
].freeze

# Number of extra bits following each of the length codes 257-285
LIT_EXTRABITS = [
  0, 0, 0, 0, 0, 0, 0, 0,
  1, 1, 1, 1,
  2, 2, 2, 2,
  3, 3, 3, 3,
  4, 4, 4, 4,
  5, 5, 5, 5,
  0
].freeze

# Number of extra bits following each of the distance codes 0-29
DIST_EXTRABITS = [
  0, 0, 0, 0, 1, 1, 2, 2, 3, 3,
  4, 4, 5, 5, 6, 6, 7, 7, 8, 8,
  9, 9, 10, 10, 11, 11, 12, 12, 13, 13
].freeze
52
+
53
# Set up an MSZIP compressor.
#
# @param io_system [System::IOSystem] I/O system for reading/writing
# @param input [System::FileHandle, System::MemoryHandle] Input handle
# @param output [System::FileHandle, System::MemoryHandle] Output handle
# @param buffer_size [Integer] Buffer size for I/O operations
def initialize(io_system, input, output, buffer_size)
  super(io_system, input, output, buffer_size)

  # Bit-level writer that targets the output handle
  @bitstream = Binary::BitstreamWriter.new(io_system, output, buffer_size)

  # Code tables used by fixed Huffman blocks
  @fixed_codes = Huffman::Encoder.build_fixed_codes

  # Zero-filled LZ77 history buffer with its write cursor at the start
  @window_pos = 0
  @window = "\0" * WINDOW_SIZE
end
72
+
73
# Compress all input data using the MSZIP (DEFLATE) algorithm.
#
# The input is cut into frames of at most FRAME_SIZE bytes. Each frame is
# written as the 'CK' signature followed by one block that is flagged as
# the last block of that frame. Empty input still produces one signature
# and one empty block so the output is never zero-length.
#
# @return [Integer] Number of uncompressed input bytes consumed
#   (0 for empty input); this is NOT the size of the compressed output
def compress
  input_data = read_all_input

  # Handle empty input - still need to write a block
  if input_data.empty?
    write_signature
    compress_block("", true)
    @bitstream.flush
    return 0
  end

  total_bytes = input_data.bytesize
  pos = 0

  while pos < total_bytes
    chunk_size = [FRAME_SIZE, total_bytes - pos].min

    # Offsets are byte offsets, so slice by bytes. String#[] indexes by
    # character and would mis-size frames (or return nil) whenever the
    # input string carries a multibyte encoding.
    chunk = input_data.byteslice(pos, chunk_size)

    write_signature
    compress_block(chunk, true)

    pos += chunk_size
  end

  # Flush any remaining bits
  @bitstream.flush

  pos
end
111
+
112
+ private
113
+
114
# Read the whole input handle into one in-memory string.
#
# Reads @buffer_size bytes at a time through @io_system until the reader
# signals end of input. Both an empty string and nil are accepted as the
# end-of-input marker, since readers that follow the IO#read(length)
# convention return nil at EOF.
#
# @return [String] All input data
def read_all_input
  data = +""
  loop do
    chunk = @io_system.read(@input, @buffer_size)
    break if chunk.nil? || chunk.empty?

    data << chunk
  end
  data
end
127
+
128
# Emit the two-byte 'CK' frame marker.
#
# The bitstream is aligned to a byte boundary first, then each byte of
# SIGNATURE is written unencoded.
#
# @return [void]
def write_signature
  @bitstream.byte_align

  SIGNATURE.each do |marker_byte|
    @bitstream.write_raw_byte(marker_byte)
  end
end
135
+
136
# Emit one fixed-Huffman block containing the given data.
#
# Writes the 1-bit last-block flag and the 2-bit block type, rewinds the
# window cursor, encodes the payload, and terminates the block with the
# end-of-block symbol.
#
# @param data [String] Data to compress
# @param is_last [Boolean] Whether this is the last block
# @return [void]
def compress_block(data, is_last)
  final_flag = is_last ? 1 : 0

  # Block header: last-block flag, then block type
  @bitstream.write_bits(final_flag, 1)
  @bitstream.write_bits(FIXED_HUFFMAN_BLOCK, 2)

  # Every block starts with the window cursor at zero
  @window_pos = 0

  encode_data(data)

  # Symbol 256 marks end-of-block
  encode_literal(256)
end
155
+
156
# Walk the data once, emitting either a length/distance pair or a literal
# at each position, and feed every consumed byte into the window.
#
# @param data [String] Data to encode
# @return [void]
def encode_data(data)
  cursor = 0

  until cursor >= data.bytesize
    found = find_match(data, cursor)

    if !found || found[:length] < MIN_MATCH
      # Nothing usable in the window: emit the byte itself
      literal = data.getbyte(cursor)
      encode_literal(literal)
      add_to_window(literal)
      cursor += 1
      next
    end

    # Emit the back-reference, then record the bytes it covers
    run_length = found[:length]
    encode_match(run_length, found[:distance])

    run_length.times do |step|
      add_to_window(data.getbyte(cursor + step))
    end
    cursor += run_length
  end
end
185
+
186
# Find the longest match for data[pos..] among the bytes already stored in
# the sliding window.
#
# This is a brute-force scan over every window position, nearest first, so
# that among equally long matches the smallest distance wins.
#
# A match may be longer than its distance (an overlapping copy). The bytes
# past the window cursor are then a repetition of the last `distance`
# bytes, so the source index wraps modulo the distance instead of reading
# window slots that have not been written yet.
#
# @param data [String] Input data
# @param pos [Integer] Current position in data
# @return [Hash, nil] Match info with :length and :distance, or nil
def find_match(data, pos)
  return nil if pos >= data.bytesize

  max_length = [MAX_MATCH, data.bytesize - pos].min

  # Don't search if we can't get MIN_MATCH
  return nil if max_length < MIN_MATCH

  best_match = nil
  search_start = [@window_pos - WINDOW_SIZE, 0].max

  (@window_pos - 1).downto(search_start) do |win_pos|
    distance = @window_pos - win_pos
    length = 0

    # Count matching bytes; only slots strictly before the cursor are read
    while length < max_length &&
          data.getbyte(pos + length) ==
          @window.getbyte((win_pos + (length % distance)) % WINDOW_SIZE)
      length += 1
    end

    # Keep only strictly longer matches so the nearest candidate wins ties
    next unless length >= MIN_MATCH && (best_match.nil? || length > best_match[:length])

    best_match = { length: length, distance: distance }

    # Nothing can beat a match of the maximum usable length
    break if length == max_length
  end

  best_match
end
226
+
227
# Store one byte at the window cursor (wrapping at WINDOW_SIZE) and
# advance the cursor.
#
# @param byte [Integer] Byte to add
# @return [void]
def add_to_window(byte)
  slot = @window_pos % WINDOW_SIZE
  @window.setbyte(slot, byte)
  @window_pos += 1
end
235
+
236
# Write one literal/end-of-block symbol with the fixed literal code table.
#
# @param byte [Integer] Byte value (0-255) or end-of-block (256)
# @return [void]
def encode_literal(byte)
  literal_table = @fixed_codes[:literal]
  Huffman::Encoder.encode_symbol(byte, literal_table, @bitstream)
end
243
+
244
# Write a back-reference: the length symbol (from the literal table) with
# its extra bits, followed by the distance symbol (from the distance
# table) with its extra bits.
#
# @param length [Integer] Match length (3-258)
# @param distance [Integer] Match distance (1-32768)
# @return [void]
def encode_match(length, distance)
  # Emits one symbol and, when it has any, the extra bits that follow it
  emit = lambda do |(symbol, bit_count, bit_value), table|
    Huffman::Encoder.encode_symbol(symbol, table, @bitstream)
    @bitstream.write_bits(bit_value, bit_count) if bit_count.positive?
  end

  # Length is fully written before the distance is even looked up
  emit.call(encode_length(length), @fixed_codes[:literal])
  emit.call(encode_distance(distance), @fixed_codes[:distance])
end
262
+
263
# Translate a match length into its length code plus extra bits.
#
# @param length [Integer] Match length (3-258)
# @return [Array<Integer>] [code, extra_bits, extra_value]
# @raise [Errors::CompressionError] if no length code covers the value
def encode_length(length)
  # 258 has a dedicated code with no extra bits. It must be handled first
  # because it also falls inside the span of the preceding code
  # (base 227 with 5 extra bits reaches 258).
  return [285, 0, 0] if length == 258

  LIT_LENGTHS.each_with_index do |base_length, index|
    extra_bits = LIT_EXTRABITS[index]
    extra_value = length - base_length

    # A code covers base_length .. base_length + 2**extra_bits - 1
    next unless extra_value.between?(0, (1 << extra_bits) - 1)

    return [257 + index, extra_bits, extra_value]
  end

  # Only reached for lengths outside 3-258
  raise Errors::CompressionError, "Invalid length: #{length}"
end
292
+
293
# Translate a match distance into its distance code plus extra bits.
#
# @param distance [Integer] Match distance (1-32768)
# @return [Array<Integer>] [code, extra_bits, extra_value]
def encode_distance(distance)
  # First of the 30 distance codes whose span contains the distance
  hit = (0...30).find do |candidate|
    lowest = DIST_OFFSETS[candidate]
    highest = lowest + (1 << DIST_EXTRABITS[candidate]) - 1
    distance.between?(lowest, highest)
  end

  # No code covers the value
  raise Errors::CompressionError, "Invalid distance: #{distance}" if hit.nil?

  [hit, DIST_EXTRABITS[hit], distance - DIST_OFFSETS[hit]]
end
313
+ end
314
+ end
315
+ end