cabriolet 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. checksums.yaml +7 -0
  2. data/ARCHITECTURE.md +799 -0
  3. data/CHANGELOG.md +44 -0
  4. data/LICENSE +29 -0
  5. data/README.adoc +1207 -0
  6. data/exe/cabriolet +6 -0
  7. data/lib/cabriolet/auto.rb +173 -0
  8. data/lib/cabriolet/binary/bitstream.rb +148 -0
  9. data/lib/cabriolet/binary/bitstream_writer.rb +180 -0
  10. data/lib/cabriolet/binary/chm_structures.rb +213 -0
  11. data/lib/cabriolet/binary/hlp_structures.rb +66 -0
  12. data/lib/cabriolet/binary/kwaj_structures.rb +74 -0
  13. data/lib/cabriolet/binary/lit_structures.rb +107 -0
  14. data/lib/cabriolet/binary/oab_structures.rb +112 -0
  15. data/lib/cabriolet/binary/structures.rb +56 -0
  16. data/lib/cabriolet/binary/szdd_structures.rb +60 -0
  17. data/lib/cabriolet/cab/compressor.rb +382 -0
  18. data/lib/cabriolet/cab/decompressor.rb +510 -0
  19. data/lib/cabriolet/cab/extractor.rb +357 -0
  20. data/lib/cabriolet/cab/parser.rb +264 -0
  21. data/lib/cabriolet/chm/compressor.rb +513 -0
  22. data/lib/cabriolet/chm/decompressor.rb +436 -0
  23. data/lib/cabriolet/chm/parser.rb +254 -0
  24. data/lib/cabriolet/cli.rb +776 -0
  25. data/lib/cabriolet/compressors/base.rb +34 -0
  26. data/lib/cabriolet/compressors/lzss.rb +250 -0
  27. data/lib/cabriolet/compressors/lzx.rb +581 -0
  28. data/lib/cabriolet/compressors/mszip.rb +315 -0
  29. data/lib/cabriolet/compressors/quantum.rb +446 -0
  30. data/lib/cabriolet/constants.rb +75 -0
  31. data/lib/cabriolet/decompressors/base.rb +39 -0
  32. data/lib/cabriolet/decompressors/lzss.rb +138 -0
  33. data/lib/cabriolet/decompressors/lzx.rb +726 -0
  34. data/lib/cabriolet/decompressors/mszip.rb +390 -0
  35. data/lib/cabriolet/decompressors/none.rb +27 -0
  36. data/lib/cabriolet/decompressors/quantum.rb +456 -0
  37. data/lib/cabriolet/errors.rb +39 -0
  38. data/lib/cabriolet/format_detector.rb +156 -0
  39. data/lib/cabriolet/hlp/compressor.rb +272 -0
  40. data/lib/cabriolet/hlp/decompressor.rb +198 -0
  41. data/lib/cabriolet/hlp/parser.rb +131 -0
  42. data/lib/cabriolet/huffman/decoder.rb +79 -0
  43. data/lib/cabriolet/huffman/encoder.rb +108 -0
  44. data/lib/cabriolet/huffman/tree.rb +138 -0
  45. data/lib/cabriolet/kwaj/compressor.rb +479 -0
  46. data/lib/cabriolet/kwaj/decompressor.rb +237 -0
  47. data/lib/cabriolet/kwaj/parser.rb +183 -0
  48. data/lib/cabriolet/lit/compressor.rb +255 -0
  49. data/lib/cabriolet/lit/decompressor.rb +250 -0
  50. data/lib/cabriolet/models/cabinet.rb +81 -0
  51. data/lib/cabriolet/models/chm_file.rb +28 -0
  52. data/lib/cabriolet/models/chm_header.rb +67 -0
  53. data/lib/cabriolet/models/chm_section.rb +38 -0
  54. data/lib/cabriolet/models/file.rb +119 -0
  55. data/lib/cabriolet/models/folder.rb +102 -0
  56. data/lib/cabriolet/models/folder_data.rb +21 -0
  57. data/lib/cabriolet/models/hlp_file.rb +45 -0
  58. data/lib/cabriolet/models/hlp_header.rb +37 -0
  59. data/lib/cabriolet/models/kwaj_header.rb +98 -0
  60. data/lib/cabriolet/models/lit_header.rb +55 -0
  61. data/lib/cabriolet/models/oab_header.rb +95 -0
  62. data/lib/cabriolet/models/szdd_header.rb +72 -0
  63. data/lib/cabriolet/modifier.rb +326 -0
  64. data/lib/cabriolet/oab/compressor.rb +353 -0
  65. data/lib/cabriolet/oab/decompressor.rb +315 -0
  66. data/lib/cabriolet/parallel.rb +333 -0
  67. data/lib/cabriolet/repairer.rb +288 -0
  68. data/lib/cabriolet/streaming.rb +221 -0
  69. data/lib/cabriolet/system/file_handle.rb +107 -0
  70. data/lib/cabriolet/system/io_system.rb +87 -0
  71. data/lib/cabriolet/system/memory_handle.rb +105 -0
  72. data/lib/cabriolet/szdd/compressor.rb +217 -0
  73. data/lib/cabriolet/szdd/decompressor.rb +184 -0
  74. data/lib/cabriolet/szdd/parser.rb +127 -0
  75. data/lib/cabriolet/validator.rb +332 -0
  76. data/lib/cabriolet/version.rb +5 -0
  77. data/lib/cabriolet.rb +104 -0
  78. metadata +157 -0
@@ -0,0 +1,456 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Cabriolet
4
+ module Decompressors
5
+ # Quantum handles Quantum-compressed data using arithmetic coding
6
+ # Based on libmspack qtmd.c implementation
7
+ #
8
+ # The Quantum method was created by David Stafford, adapted by Microsoft
9
+ # Corporation.
10
+ class Quantum < Base
11
+ # Frame size (32KB per frame)
12
+ FRAME_SIZE = 32_768
13
+
14
+ # Match constants
15
+ MAX_MATCH = 1028
16
+
17
+ # Position slot tables (same as in qtmd.c)
18
+ POSITION_BASE = [
19
+ 0, 1, 2, 3, 4, 6, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 384,
20
+ 512, 768, 1024, 1536, 2048, 3072, 4096, 6144, 8192, 12_288, 16_384,
21
+ 24_576, 32_768, 49_152, 65_536, 98_304, 131_072, 196_608, 262_144,
22
+ 393_216, 524_288, 786_432, 1_048_576, 1_572_864
23
+ ].freeze
24
+
25
+ EXTRA_BITS = [
26
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8,
27
+ 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16,
28
+ 17, 17, 18, 18, 19, 19
29
+ ].freeze
30
+
31
+ LENGTH_BASE = [
32
+ 0, 1, 2, 3, 4, 5, 6, 8, 10, 12, 14, 18, 22, 26,
33
+ 30, 38, 46, 54, 62, 78, 94, 110, 126, 158, 190, 222, 254
34
+ ].freeze
35
+
36
+ LENGTH_EXTRA = [
37
+ 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,
38
+ 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0
39
+ ].freeze
40
+
41
+ attr_reader :window_bits, :window_size
42
+
43
# One entry of an arithmetic-coding model: a symbol value together with
# its cumulative frequency. Both fields are writable because the decoder
# bumps frequencies and swaps entries while it adapts the model.
class ModelSymbol
  attr_accessor :sym
  attr_accessor :cumfreq

  # @param sym [Integer] Symbol value this entry stands for
  # @param cumfreq [Integer] Cumulative frequency of this entry
  def initialize(sym, cumfreq)
    @sym, @cumfreq = sym, cumfreq
  end
end
52
+
53
# Adaptive arithmetic-coding model: a symbol table plus bookkeeping.
#
# * syms       - array of ModelSymbol entries
# * entries    - number of real symbols in the table
# * shiftsleft - countdown used by update_model; starts at 4
class Model
  attr_accessor :syms, :entries, :shiftsleft

  # @param syms [Array<ModelSymbol>] Symbol table
  # @param entries [Integer] Number of symbols in the table
  def initialize(syms, entries)
    @syms = syms
    @entries = entries
    @shiftsleft = 4
  end
end
63
+
64
# Initialize Quantum decompressor
#
# Sets up the sliding window, the arithmetic-coder registers, the
# MSB-first bit reader and the adaptive models.
#
# @param io_system [System::IOSystem] I/O system for reading/writing
# @param input [System::FileHandle, System::MemoryHandle] Input handle
# @param output [System::FileHandle, System::MemoryHandle] Output handle
# @param buffer_size [Integer] Buffer size for I/O operations
# @param window_bits [Integer] Window size parameter (10-21)
# @raise [ArgumentError] if window_bits is outside 10..21
def initialize(io_system, input, output, buffer_size, window_bits: 10)
  super(io_system, input, output, buffer_size)

  unless (10..21).cover?(window_bits)
    raise ArgumentError,
          "Quantum window_bits must be 10-21, got #{window_bits}"
  end

  @window_bits = window_bits
  @window_size = 1 << window_bits

  # The window is a raw byte buffer, so it must be a binary string: with a
  # text encoding, character-indexed slicing would mis-count as soon as
  # bytes >= 0x80 are stored via setbyte.
  @window = "\0".b * @window_size
  @window_posn = 0
  @frame_todo = FRAME_SIZE

  # Arithmetic coding state (high, low and code registers)
  @h = 0xFFFF
  @l = 0
  @c = 0
  @header_read = false

  # Bit reader that consumes the input as MSB-first 16-bit words
  @bitstream = MSBBitstream.new(io_system, input, buffer_size)

  initialize_models
end
100
+
101
# Decompress Quantum data
#
# Decodes symbols into the sliding window until `bytes` bytes are
# available, then writes them to the output handle in one call. Decoded
# bytes are handed out from where the previous pass/call stopped, so
# repeated calls yield a continuous stream. A match may decode a few
# bytes more than requested; those stay in the window and are returned
# first on the next call.
#
# @param bytes [Integer] Number of bytes to decompress
# @return [Integer] Number of bytes decompressed
# @raise [DecompressionError] if a match runs past the window end or
#   past the end of the current frame
def decompress(bytes)
  return 0 if bytes <= 0

  # Window index of the first decoded byte not yet handed to the caller.
  # Initialised lazily so this method does not rely on the constructor.
  @output_posn ||= 0

  output_data = String.new(capacity: bytes)
  # Serve bytes left over from an earlier call before decoding more
  bytes_todo = bytes - flush_decoded(output_data, bytes)

  while bytes_todo.positive?
    # Read header if needed (initializes C register)
    read_frame_header unless @header_read

    # Decode no further than the request, the frame end or the window end
    frame_end = @window_posn + [bytes_todo, @frame_todo,
                                @window_size - @window_posn].min

    while @window_posn < frame_end
      selector = decode_symbol(@model7)

      if selector < 4
        # Literal byte from one of 4 models
        model = case selector
                when 0 then @model0
                when 1 then @model1
                when 2 then @model2
                else @model3
                end

        sym = decode_symbol(model)
        @window.setbyte(@window_posn, sym)
        @window_posn += 1
        @frame_todo -= 1
      else
        match_offset, match_length = decode_match(selector)

        if @window_posn + match_length > @window_size
          raise DecompressionError,
                "Match exceeds window boundary"
        end

        @frame_todo -= match_length
        copy_match(match_offset, match_length)
      end
    end

    # A match that runs past the frame end would leave the counter
    # negative; the frame would then never complete and decoding would
    # stall, so treat it as corrupt input.
    if @frame_todo.negative?
      raise DecompressionError, "Match crosses frame boundary"
    end

    bytes_todo -= flush_decoded(output_data, bytes_todo)

    finish_frame if @frame_todo.zero?

    # Wrap only once everything decoded so far has been handed out;
    # otherwise the pending bytes are served by the next call first.
    if @window_posn == @window_size && @output_posn == @window_posn
      @window_posn = 0
      @output_posn = 0
    end
  end

  io_system.write(output, output_data)
  bytes
end

# Append up to `limit` decoded-but-unwritten window bytes to `buffer`.
#
# @return [Integer] Number of bytes appended
def flush_decoded(buffer, limit)
  count = [@window_posn - @output_posn, limit].min
  return 0 unless count.positive?

  # byteslice + binary tag: the window holds raw bytes, not characters
  chunk = @window.byteslice(@output_posn, count)
  buffer << chunk.force_encoding(Encoding::BINARY)
  @output_posn += count
  count
end

# Finish a frame: realign to a byte boundary, skip trailer bytes up to
# and including 0xFF, and arm the header read for the next frame.
# NOTE(review): the bit reader pads with zero bytes at end of input, so
# this scan does not terminate on a stream that ends without 0xFF.
def finish_frame
  @bitstream.byte_align
  loop { break if @bitstream.read_bits(8) == 0xFF }
  @header_read = false
  @frame_todo = FRAME_SIZE
end

private :flush_decoded, :finish_frame
181
+
182
+ private
183
+
184
# MSB-first bitstream for Quantum (reads 16-bit words MSB first)
#
# Input is pulled from the I/O system in chunks of `buffer_size` bytes.
# Once the input is exhausted, missing bytes read as zero.
class MSBBitstream
  # @return [Integer] Number of unread bits currently buffered
  attr_reader :bits_left

  # @param io_system [#read] Object whose read(handle, size) returns bytes
  # @param handle [Object] Input handle passed through to io_system.read
  # @param buffer_size [Integer] Bytes requested per refill
  def initialize(io_system, handle, buffer_size)
    @io_system = io_system
    @handle = handle
    @buffer_size = buffer_size
    @buffer = ""
    @buffer_pos = 0
    @bit_buffer = 0
    @bits_left = 0
  end

  # Read bits MSB first (matching Quantum's READ_BITS macro)
  #
  # @param num_bits [Integer] Number of bits to read
  # @return [Integer] The bits as an unsigned integer
  def read_bits(num_bits)
    while @bits_left < num_bits
      # Refill with one 16-bit word, high byte first
      b0 = read_byte
      b1 = read_byte
      @bit_buffer = (@bit_buffer << 16) | (b0 << 8) | b1
      @bits_left += 16
    end

    @bits_left -= num_bits
    value = (@bit_buffer >> @bits_left) & ((1 << num_bits) - 1)
    # Drop consumed bits; otherwise the accumulator grows by 16 bits per
    # refill for the whole stream and every shift gets slower.
    discard_consumed_bits
    value
  end

  # Next input byte, or 0 once the input is exhausted
  #
  # @return [Integer] Byte value (0-255)
  def read_byte
    if @buffer_pos >= @buffer.bytesize
      # Tolerate nil at end of input in case the I/O layer mirrors IO#read
      @buffer = @io_system.read(@handle, @buffer_size) || ""
      @buffer_pos = 0
      return 0 if @buffer.empty?
    end

    byte = @buffer.getbyte(@buffer_pos)
    @buffer_pos += 1
    byte
  end

  # Discard buffered bits up to the next byte boundary
  def byte_align
    @bits_left -= (@bits_left % 8)
    discard_consumed_bits
  end

  private

  # Keep only the low `bits_left` bits in the accumulator
  def discard_consumed_bits
    @bit_buffer &= (1 << @bits_left) - 1
  end
end
230
+
231
# Build every arithmetic-coding model used by the decoder: four literal
# models, three match-offset models, the match-length model and the
# selector model. The offset models are sized from the window bits.
def initialize_models
  slot_count = @window_bits * 2
  short_slots = [slot_count, 24].min
  medium_slots = [slot_count, 36].min

  # Literal models: 64 symbols each, covering byte values 0-255 in quarters
  @m0sym = init_model_syms(0, 64)
  @m1sym = init_model_syms(64, 64)
  @m2sym = init_model_syms(128, 64)
  @m3sym = init_model_syms(192, 64)
  @model0 = Model.new(@m0sym, 64)
  @model1 = Model.new(@m1sym, 64)
  @model2 = Model.new(@m2sym, 64)
  @model3 = Model.new(@m3sym, 64)

  # Match-offset models for selectors 4, 5 and 6
  @m4sym = init_model_syms(0, short_slots)
  @m5sym = init_model_syms(0, medium_slots)
  @m6sym = init_model_syms(0, slot_count)
  @model4 = Model.new(@m4sym, short_slots)
  @model5 = Model.new(@m5sym, medium_slots)
  @model6 = Model.new(@m6sym, slot_count)

  # Match-length model (27 length slots)
  @m6lsym = init_model_syms(0, 27)
  @model6len = Model.new(@m6lsym, 27)

  # Selector model (7 symbols: 0-3 literals, 4-6 matches)
  @m7sym = init_model_syms(0, 7)
  @model7 = Model.new(@m7sym, 7)
end
267
+
268
# Build the initial symbol table for a model.
#
# Produces len + 1 entries: symbols start..start + len with cumulative
# frequencies counting down from len to 0.
#
# @param start [Integer] Value of the first symbol
# @param len [Integer] Number of real symbols
# @return [Array<ModelSymbol>]
def init_model_syms(start, len)
  (0..len).map do |offset|
    ModelSymbol.new(start + offset, len - offset)
  end
end
274
+
275
# Start a new frame: reset the coder's high/low registers to the full
# 16-bit range and load the code register from the next 16 input bits.
def read_frame_header
  @h, @l = 0xFFFF, 0
  @c = @bitstream.read_bits(16)
  @header_read = true
end
282
+
283
# Decode one symbol from the arithmetic-coded stream using `model`
# (the GET_SYMBOL macro from qtmd.c).
#
# Narrows the high/low registers to the chosen symbol's slice, adds 8 to
# the cumulative frequency of every entry up to that symbol, rescales
# the model when its total passes 3800, then renormalizes the range.
#
# @param model [Model] Model to decode with (updated in place)
# @return [Integer] Decoded symbol value
def decode_symbol(model)
  table = model.syms
  total = table[0].cumfreq

  # Scale the code register into the model's frequency space
  span = ((@h - @l) & 0xFFFF) + 1
  target = ((((@c - @l + 1) * total) - 1) / span) & 0xFFFF

  # Advance to the first entry whose cumulative frequency is <= target
  pick = 1
  pick += 1 while pick < model.entries && table[pick].cumfreq > target

  chosen = table[pick - 1]
  decoded = chosen.sym

  # Narrow [low, high] to the chosen symbol's slice (high uses old low)
  width = (@h - @l) + 1
  @h = @l + ((chosen.cumfreq * width) / total) - 1
  @l += ((table[pick].cumfreq * width) / total)

  # Adapt: every entry at or before the chosen one gains 8
  (pick - 1).downto(0) { |idx| table[idx].cumfreq += 8 }

  update_model(model) if table[0].cumfreq > 3800

  normalize_range

  decoded
end
321
+
322
# Renormalize the arithmetic coder after a symbol has been decoded.
#
# While the top bits of low and high agree, shift them out and pull one
# new input bit into the code register. When the top bits differ, keep
# shifting only in the underflow case (low has bit 14 set, high has it
# clear), after flipping that bit away.
def normalize_range
  loop do
    if ((@l ^ @h) & 0x8000).nonzero?
      underflow = (@l & 0x4000).nonzero? && (@h & 0x4000).zero?
      break unless underflow

      @c ^= 0x4000
      @l &= 0x3FFF
      @h |= 0x4000
    end

    # Shift all three registers left by one, keeping them 16 bits wide
    @l = (@l << 1) & 0xFFFF
    @h = ((@h << 1) | 1) & 0xFFFF
    incoming = @bitstream.read_bits(1)
    @c = ((@c << 1) | incoming) & 0xFFFF
  end
end
341
+
342
# Rescale a model's statistics (from qtmd_update_model).
#
# Each call decrements model.shiftsleft. While it stays positive, the
# cumulative frequencies are halved, keeping each entry strictly above
# the one after it. When it reaches zero it is reset to 50 and the table
# is rebuilt: cumulative values become halved per-symbol frequencies,
# entries are reordered by descending frequency, and the values are
# accumulated again.
#
# @param model [Model] Model to rescale (modified in place)
def update_model(model)
  table = model.syms
  count = model.entries
  model.shiftsleft -= 1

  if model.shiftsleft.positive?
    (count - 1).downto(0) do |idx|
      floor = table[idx + 1].cumfreq
      halved = table[idx].cumfreq >> 1
      table[idx].cumfreq = halved <= floor ? floor + 1 : halved
    end
  else
    model.shiftsleft = 50

    # Cumulative -> per-symbol frequency, rounded up and halved
    count.times do |idx|
      table[idx].cumfreq = (table[idx].cumfreq - table[idx + 1].cumfreq + 1) >> 1
    end

    # Exchange sort, descending; the exact swap order is kept as-is
    0.upto(count - 2) do |a|
      (a + 1).upto(count - 1) do |b|
        table[a], table[b] = table[b], table[a] if table[a].cumfreq < table[b].cumfreq
      end
    end

    # Per-symbol frequency -> cumulative
    (count - 1).downto(0) do |idx|
      table[idx].cumfreq += table[idx + 1].cumfreq
    end
  end
end
378
+
379
# Decode match offset and length
#
# Selector 4 is a 3-byte match, selector 5 a 4-byte match and selector 6
# a variable-length match whose length is decoded before its offset.
#
# @param selector [Integer] Match selector (4, 5 or 6)
# @return [Array(Integer, Integer)] [match_offset, match_length]
# @raise [DecompressionError] for any other selector
def decode_match(selector)
  case selector
  when 4
    match_length = 3
    match_offset = decode_match_offset(@model4)
  when 5
    match_length = 4
    match_offset = decode_match_offset(@model5)
  when 6
    slot = decode_symbol(@model6len)
    width = LENGTH_EXTRA[slot]
    extra = width.positive? ? @bitstream.read_bits(width) : 0
    match_length = LENGTH_BASE[slot] + extra + 5

    # The offset gets its own extra-bits value: a slot with no extra
    # bits must contribute 0, not the length's extra bits read above.
    match_offset = decode_match_offset(@model6)
  else
    raise DecompressionError, "Invalid selector: #{selector}"
  end

  [match_offset, match_length]
end

# Decode a position slot from `model`, read its extra bits (if any) and
# return the resulting 1-based match offset.
def decode_match_offset(model)
  slot = decode_symbol(model)
  width = EXTRA_BITS[slot]
  extra = width.positive? ? @bitstream.read_bits(width) : 0
  POSITION_BASE[slot] + extra + 1
end
409
+
410
# Copy `length` bytes of an earlier match to the current window
# position, one byte at a time so overlapping matches repeat data.
#
# When the source lies before the window start (offset greater than the
# current position) it is read from the tail of the window and, if the
# tail is shorter than the match, continues from the window start.
#
# @param offset [Integer] Distance back from the current position
# @param length [Integer] Number of bytes to copy
# @raise [DecompressionError] if the offset is larger than the window
def copy_match(offset, length)
  # Copies `count` bytes starting at `from`, advancing the write position
  copy_run = lambda do |from, count|
    count.times do |step|
      @window.setbyte(@window_posn, @window.getbyte(from + step))
      @window_posn += 1
    end
  end

  if offset <= @window_posn
    # Source lies entirely behind the write position
    copy_run.call(@window_posn - offset, length)
  else
    raise DecompressionError, "Match offset beyond window" if offset > @window_size

    tail_len = offset - @window_posn
    tail_start = @window_size - tail_len

    if tail_len < length
      # Tail of the window first, then continue from its start
      copy_run.call(tail_start, tail_len)
      copy_run.call(0, length - tail_len)
    else
      copy_run.call(tail_start, length)
    end
  end
end
454
+ end
455
+ end
456
+ end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
module Cabriolet
  # Root of the Cabriolet error hierarchy
  class Error < StandardError
  end

  # --- I/O failures ---------------------------------------------------

  # Raised when there's an I/O error
  class IOError < Error
  end

  # Raised when read operation fails
  class ReadError < IOError
  end

  # Raised when seek operation fails
  class SeekError < IOError
  end

  # --- Archive structure problems -------------------------------------

  # Raised when parsing a CAB file fails
  class ParseError < Error
  end

  # Raised when file signature doesn't match expected format
  class SignatureError < Error
  end

  # Raised when file format is invalid or corrupted
  class FormatError < Error
  end

  # Raised when an unsupported format is encountered
  class UnsupportedFormatError < Error
  end

  # --- Codec and integrity failures -----------------------------------

  # Raised during decompression
  class DecompressionError < Error
  end

  # Raised during compression
  class CompressionError < Error
  end

  # Raised when a checksum doesn't match
  class ChecksumError < Error
  end

  # --- Caller mistakes ------------------------------------------------

  # Raised when invalid arguments are provided. Derives from Ruby's own
  # ArgumentError rather than from Cabriolet::Error.
  class ArgumentError < ::ArgumentError
  end
end
@@ -0,0 +1,156 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Cabriolet
4
# Detects archive format based on magic bytes and file structure
#
# Detection samples the first MAGIC_READ_SIZE bytes and compares them,
# byte for byte, against the known signatures; path-based detection
# falls back to the file extension when no signature matches.
class FormatDetector
  # Magic byte signatures for supported formats
  MAGIC_SIGNATURES = {
    "MSCF" => :cab,
    "ITSF" => :chm,
    "\x3F\x5F" => :hlp, # ?_
    "\x4C\x4E" => :hlp, # LN (alternative HLP signature)
    "KWAJ" => :kwaj,
    "SZDD" => :szdd,
    "\x88\xF0\x27\x00" => :szdd, # Alternative SZDD signature
    "ITOLITLS" => :lit,
    # NOTE(review): four null bytes is a very weak signature and is
    # accepted without further validation - confirm against real OAB files
    "\x00\x00\x00\x00" => :oab, # OAB has null header start
  }.freeze

  # File extension to format mapping (fallback)
  EXTENSION_MAP = {
    ".cab" => :cab,
    ".chm" => :chm,
    ".hlp" => :hlp,
    ".kwj" => :kwaj,
    ".kwaj" => :kwaj,
    ".lit" => :lit,
    ".oab" => :oab,
    ".szdd" => :szdd,
  }.freeze

  # Bytes sampled from the start of a file or stream. Must cover the
  # largest minimum below, otherwise that format can never validate.
  MAGIC_READ_SIZE = 36

  # Minimum number of header bytes required per format. Formats without
  # an entry are accepted on their signature alone.
  MIN_HEADER_SIZES = {
    cab: 36,
    chm: 8,
    hlp: 2,
    kwaj: 4,
    szdd: 4,
    lit: 8,
  }.freeze

  # Signatures as binary strings. File data is read in binary mode, and
  # comparing it with text-encoded literals containing bytes >= 0x80
  # either raises Encoding::CompatibilityError or never matches.
  BINARY_SIGNATURES = MAGIC_SIGNATURES.map do |signature, format|
    [signature.b.freeze, format]
  end.freeze

  private_constant :MIN_HEADER_SIZES, :BINARY_SIGNATURES

  class << self
    # Detect format from file path
    #
    # @param path [String] Path to the archive file
    # @return [Symbol, nil] Detected format or nil if unknown
    def detect(path)
      return nil unless File.exist?(path)

      # Magic bytes first, extension as the fallback
      detect_by_magic_bytes(path) || detect_by_extension(path)
    end

    # Detect format from IO stream
    #
    # The stream position is restored afterwards, even if reading fails.
    #
    # @param io [IO] IO object to read from (must support pos/seek)
    # @return [Symbol, nil] Detected format or nil if unknown
    def detect_from_io(io)
      original_pos = io.pos

      magic_bytes = begin
        io.read(MAGIC_READ_SIZE)
      ensure
        io.seek(original_pos) if original_pos
      end

      detect_magic_bytes(magic_bytes)
    end

    # Detect format and return appropriate parser class
    #
    # @param path [String] Path to the archive file
    # @return [Class, nil] Parser class or nil if unknown format
    def parser_for(path)
      format = detect(path)
      format_to_parser(format) if format
    end

    # Convert format symbol to parser class
    #
    # @param format [Symbol] Format symbol
    # @return [Class, nil] Parser class, or nil when none is available
    def format_to_parser(format)
      case format
      when :cab
        Cabriolet::CAB::Parser
      when :chm
        Cabriolet::CHM::Parser
      when :hlp
        Cabriolet::HLP::Parser
      when :kwaj
        Cabriolet::KWAJ::Parser
      when :szdd
        Cabriolet::SZDD::Parser
      when :lit
        # LIT parser to be implemented
        nil
      when :oab
        # OAB parser to be implemented
        nil
      end
    end

    private

    # Sample the start of the file at `path` and match it against the
    # signatures. Deliberately best-effort: any failure yields nil so the
    # caller can fall back to the extension.
    def detect_by_magic_bytes(path)
      File.open(path, "rb") do |file|
        detect_magic_bytes(file.read(MAGIC_READ_SIZE))
      end
    rescue StandardError
      nil
    end

    # Match raw header bytes against the known signatures.
    #
    # @param bytes [String, nil] Leading bytes of the file
    # @return [Symbol, nil] Format of the first signature that matches
    #   and validates, in MAGIC_SIGNATURES order
    def detect_magic_bytes(bytes)
      return nil unless bytes && bytes.bytesize >= 4

      header = bytes.b
      match = BINARY_SIGNATURES.find do |signature, format|
        header.start_with?(signature) && validate_format(header, format)
      end
      match&.last
    end

    # Look the format up by (case-insensitive) file extension
    def detect_by_extension(path)
      EXTENSION_MAP[File.extname(path).downcase]
    end

    # Check that `bytes` is long enough for `format` and starts with one
    # of that format's signatures. Formats without a size requirement
    # (currently :oab and anything unknown) are accepted as-is.
    def validate_format(bytes, format)
      minimum = MIN_HEADER_SIZES[format]
      return true unless minimum

      header = bytes.b
      return false if header.bytesize < minimum

      BINARY_SIGNATURES.any? do |signature, candidate|
        candidate == format && header.start_with?(signature)
      end
    end
  end
end
156
+ end