cabriolet 0.1.2 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. checksums.yaml +4 -4
  2. data/README.adoc +703 -38
  3. data/lib/cabriolet/algorithm_factory.rb +250 -0
  4. data/lib/cabriolet/base_compressor.rb +206 -0
  5. data/lib/cabriolet/binary/bitstream.rb +167 -16
  6. data/lib/cabriolet/binary/bitstream_writer.rb +150 -21
  7. data/lib/cabriolet/binary/chm_structures.rb +2 -2
  8. data/lib/cabriolet/binary/hlp_structures.rb +258 -37
  9. data/lib/cabriolet/binary/lit_structures.rb +231 -65
  10. data/lib/cabriolet/binary/oab_structures.rb +17 -1
  11. data/lib/cabriolet/cab/command_handler.rb +226 -0
  12. data/lib/cabriolet/cab/compressor.rb +108 -84
  13. data/lib/cabriolet/cab/decompressor.rb +16 -20
  14. data/lib/cabriolet/cab/extractor.rb +142 -66
  15. data/lib/cabriolet/cab/file_compression_work.rb +52 -0
  16. data/lib/cabriolet/cab/file_compression_worker.rb +89 -0
  17. data/lib/cabriolet/checksum.rb +49 -0
  18. data/lib/cabriolet/chm/command_handler.rb +227 -0
  19. data/lib/cabriolet/chm/compressor.rb +7 -3
  20. data/lib/cabriolet/chm/decompressor.rb +39 -21
  21. data/lib/cabriolet/chm/parser.rb +5 -2
  22. data/lib/cabriolet/cli/base_command_handler.rb +127 -0
  23. data/lib/cabriolet/cli/command_dispatcher.rb +140 -0
  24. data/lib/cabriolet/cli/command_registry.rb +83 -0
  25. data/lib/cabriolet/cli.rb +356 -607
  26. data/lib/cabriolet/collections/file_collection.rb +175 -0
  27. data/lib/cabriolet/compressors/base.rb +1 -1
  28. data/lib/cabriolet/compressors/lzx.rb +241 -54
  29. data/lib/cabriolet/compressors/mszip.rb +35 -3
  30. data/lib/cabriolet/compressors/quantum.rb +36 -95
  31. data/lib/cabriolet/decompressors/base.rb +1 -1
  32. data/lib/cabriolet/decompressors/lzss.rb +13 -3
  33. data/lib/cabriolet/decompressors/lzx.rb +70 -33
  34. data/lib/cabriolet/decompressors/mszip.rb +126 -39
  35. data/lib/cabriolet/decompressors/quantum.rb +83 -53
  36. data/lib/cabriolet/errors.rb +3 -0
  37. data/lib/cabriolet/extraction/base_extractor.rb +88 -0
  38. data/lib/cabriolet/extraction/extractor.rb +171 -0
  39. data/lib/cabriolet/extraction/file_extraction_work.rb +60 -0
  40. data/lib/cabriolet/extraction/file_extraction_worker.rb +106 -0
  41. data/lib/cabriolet/file_entry.rb +156 -0
  42. data/lib/cabriolet/file_manager.rb +144 -0
  43. data/lib/cabriolet/format_base.rb +79 -0
  44. data/lib/cabriolet/hlp/command_handler.rb +282 -0
  45. data/lib/cabriolet/hlp/compressor.rb +28 -238
  46. data/lib/cabriolet/hlp/decompressor.rb +107 -147
  47. data/lib/cabriolet/hlp/parser.rb +52 -101
  48. data/lib/cabriolet/hlp/quickhelp/compression_stream.rb +138 -0
  49. data/lib/cabriolet/hlp/quickhelp/compressor.rb +151 -0
  50. data/lib/cabriolet/hlp/quickhelp/decompressor.rb +558 -0
  51. data/lib/cabriolet/hlp/quickhelp/file_writer.rb +125 -0
  52. data/lib/cabriolet/hlp/quickhelp/huffman_stream.rb +74 -0
  53. data/lib/cabriolet/hlp/quickhelp/huffman_tree.rb +167 -0
  54. data/lib/cabriolet/hlp/quickhelp/offset_calculator.rb +61 -0
  55. data/lib/cabriolet/hlp/quickhelp/parser.rb +274 -0
  56. data/lib/cabriolet/hlp/quickhelp/structure_builder.rb +93 -0
  57. data/lib/cabriolet/hlp/quickhelp/topic_builder.rb +52 -0
  58. data/lib/cabriolet/hlp/quickhelp/topic_compressor.rb +83 -0
  59. data/lib/cabriolet/hlp/winhelp/btree_builder.rb +289 -0
  60. data/lib/cabriolet/hlp/winhelp/compressor.rb +400 -0
  61. data/lib/cabriolet/hlp/winhelp/decompressor.rb +192 -0
  62. data/lib/cabriolet/hlp/winhelp/parser.rb +484 -0
  63. data/lib/cabriolet/hlp/winhelp/zeck_lz77.rb +271 -0
  64. data/lib/cabriolet/huffman/encoder.rb +15 -12
  65. data/lib/cabriolet/huffman/tree.rb +85 -1
  66. data/lib/cabriolet/kwaj/command_handler.rb +213 -0
  67. data/lib/cabriolet/kwaj/compressor.rb +7 -3
  68. data/lib/cabriolet/kwaj/decompressor.rb +18 -12
  69. data/lib/cabriolet/lit/command_handler.rb +221 -0
  70. data/lib/cabriolet/lit/compressor.rb +119 -168
  71. data/lib/cabriolet/lit/content_encoder.rb +76 -0
  72. data/lib/cabriolet/lit/content_type_detector.rb +50 -0
  73. data/lib/cabriolet/lit/decompressor.rb +518 -152
  74. data/lib/cabriolet/lit/directory_builder.rb +153 -0
  75. data/lib/cabriolet/lit/guid_generator.rb +16 -0
  76. data/lib/cabriolet/lit/header_writer.rb +124 -0
  77. data/lib/cabriolet/lit/parser.rb +670 -0
  78. data/lib/cabriolet/lit/piece_builder.rb +74 -0
  79. data/lib/cabriolet/lit/structure_builder.rb +252 -0
  80. data/lib/cabriolet/models/hlp_file.rb +130 -29
  81. data/lib/cabriolet/models/hlp_header.rb +105 -17
  82. data/lib/cabriolet/models/lit_header.rb +212 -25
  83. data/lib/cabriolet/models/szdd_header.rb +10 -2
  84. data/lib/cabriolet/models/winhelp_header.rb +127 -0
  85. data/lib/cabriolet/oab/command_handler.rb +257 -0
  86. data/lib/cabriolet/oab/compressor.rb +17 -8
  87. data/lib/cabriolet/oab/decompressor.rb +41 -10
  88. data/lib/cabriolet/offset_calculator.rb +81 -0
  89. data/lib/cabriolet/plugin.rb +233 -0
  90. data/lib/cabriolet/plugin_manager.rb +453 -0
  91. data/lib/cabriolet/plugin_validator.rb +422 -0
  92. data/lib/cabriolet/quantum_shared.rb +105 -0
  93. data/lib/cabriolet/system/io_system.rb +3 -0
  94. data/lib/cabriolet/system/memory_handle.rb +17 -4
  95. data/lib/cabriolet/szdd/command_handler.rb +217 -0
  96. data/lib/cabriolet/szdd/compressor.rb +15 -11
  97. data/lib/cabriolet/szdd/decompressor.rb +18 -9
  98. data/lib/cabriolet/version.rb +1 -1
  99. data/lib/cabriolet.rb +181 -20
  100. metadata +69 -4
  101. data/lib/cabriolet/auto.rb +0 -173
  102. data/lib/cabriolet/parallel.rb +0 -333
@@ -14,6 +14,13 @@ module Cabriolet
14
14
  DISTANCE_MAXSYMBOLS = 32
15
15
  DISTANCE_TABLEBITS = 6
16
16
 
17
+ # MSZIP signature bytes
18
+ SIGNATURE_BYTE_C = 0x43 # ASCII 'C'
19
+ SIGNATURE_BYTE_K = 0x4B # ASCII 'K'
20
+
21
+ # Maximum bytes to search for CK signature (prevents infinite loops)
22
+ MAX_SIGNATURE_SEARCH = 10_000
23
+
17
24
  # Match lengths for literal codes 257-285
18
25
  LIT_LENGTHS = [
19
26
  3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27,
@@ -50,7 +57,8 @@ module Cabriolet
50
57
  # @param output [System::FileHandle, System::MemoryHandle] Output handle
51
58
  # @param buffer_size [Integer] Buffer size for I/O operations
52
59
  # @param fix_mszip [Boolean] Enable repair mode for corrupted data
53
- def initialize(io_system, input, output, buffer_size, fix_mszip: false)
60
+ def initialize(io_system, input, output, buffer_size, fix_mszip: false,
61
+ salvage: false, **_kwargs)
54
62
  super(io_system, input, output, buffer_size)
55
63
  @fix_mszip = fix_mszip
56
64
 
@@ -58,9 +66,11 @@ module Cabriolet
58
66
  @window = "\0" * FRAME_SIZE
59
67
  @window_posn = 0
60
68
  @bytes_output = 0
69
+ @window_offset = 0 # Offset into window for unconsumed data (for multi-file CFDATA blocks)
61
70
 
62
71
  # Initialize bitstream
63
- @bitstream = Binary::Bitstream.new(io_system, input, buffer_size)
72
+ @bitstream = Binary::Bitstream.new(io_system, input, buffer_size,
73
+ salvage: salvage)
64
74
 
65
75
  # Initialize Huffman trees
66
76
  @literal_lengths = Array.new(LITERAL_MAXSYMBOLS, 0)
@@ -76,15 +86,50 @@ module Cabriolet
76
86
  def decompress(bytes)
77
87
  total_written = 0
78
88
 
89
+ if ENV["DEBUG_MSZIP"]
90
+ warn "DEBUG MSZIP.decompress(#{bytes}): ENTRY bytes_output=#{@bytes_output} window_offset=#{@window_offset} window_posn=#{@window_posn}"
91
+ end
92
+
79
93
  while bytes.positive?
80
- # Read 'CK' signature
81
- read_signature
94
+ # Check if we have buffered data from previous inflate
95
+ if @bytes_output.positive?
96
+ if ENV["DEBUG_MSZIP"]
97
+ warn "DEBUG MSZIP: Using buffered data: bytes_output=#{@bytes_output} window_offset=#{@window_offset}"
98
+ end
82
99
 
83
- # Reset window state for new block
100
+ # Write from buffer
101
+ write_amount = [bytes, @bytes_output].min
102
+ io_system.write(output, @window[@window_offset, write_amount])
103
+ total_written += write_amount
104
+ bytes -= write_amount
105
+ @bytes_output -= write_amount
106
+ @window_offset += write_amount
107
+
108
+ if ENV["DEBUG_MSZIP"]
109
+ warn "DEBUG MSZIP: After buffer write: total_written=#{total_written} bytes_remaining=#{bytes} bytes_output=#{@bytes_output}"
110
+ end
111
+
112
+ # Continue loop to check if we need more data
113
+ next
114
+ end
115
+
116
+ # No buffered data - need to inflate a new MSZIP frame
117
+ # Reset window for new frame
118
+ @window_offset = 0
84
119
  @window_posn = 0
85
- @bytes_output = 0
86
120
 
87
- # Inflate the block
121
+ # Read 'CK' signature (marks start of MSZIP frame)
122
+ # Every MSZIP frame starts with a CK signature
123
+ if ENV["DEBUG_MSZIP"]
124
+ warn "DEBUG MSZIP: Reading CK signature (new MSZIP frame)"
125
+ end
126
+ read_signature
127
+
128
+ # Inflate the MSZIP frame (processes deflate blocks until last_block or window full)
129
+ if ENV["DEBUG_MSZIP"]
130
+ warn "DEBUG MSZIP: Calling inflate_block"
131
+ end
132
+
88
133
  begin
89
134
  inflate_block
90
135
  rescue DecompressionError
@@ -97,11 +142,15 @@ module Cabriolet
97
142
  @bytes_output = FRAME_SIZE
98
143
  end
99
144
 
100
- # Write output
101
- write_amount = [bytes, @bytes_output].min
102
- io_system.write(output, @window[0, write_amount])
103
- total_written += write_amount
104
- bytes -= write_amount
145
+ if ENV["DEBUG_MSZIP"]
146
+ warn "DEBUG MSZIP: After inflate_block: bytes_output=#{@bytes_output} window_posn=#{@window_posn}"
147
+ end
148
+
149
+ # Now we have data in the window buffer - loop back to write from it
150
+ end
151
+
152
+ if ENV["DEBUG_MSZIP"]
153
+ warn "DEBUG MSZIP.decompress: EXIT total_written=#{total_written}"
105
154
  end
106
155
 
107
156
  total_written
@@ -111,49 +160,63 @@ module Cabriolet
111
160
 
112
161
  # Read and verify 'CK' signature
113
162
  def read_signature
163
+ if ENV["DEBUG_MSZIP"]
164
+ warn "DEBUG read_signature: Before byte_align"
165
+ end
166
+
114
167
  # Align to byte boundary
115
168
  @bitstream.byte_align
116
169
 
117
- # Read bytes until we find 'CK'
118
- state = 0
119
- bytes_read = 0
120
- max_search = 10_000 # Prevent infinite loops
170
+ # Read first 2 bytes
171
+ c = @bitstream.read_bits(8)
172
+ k = @bitstream.read_bits(8)
121
173
 
122
- loop do
123
- byte = @bitstream.read_bits(8)
124
- bytes_read += 1
174
+ if ENV["DEBUG_MSZIP"]
175
+ warn "DEBUG read_signature: Read 0x#{c.to_s(16)} 0x#{k.to_s(16)} (expected 'C'=0x43 'K'=0x4B)"
176
+ end
125
177
 
126
- # Check for EOF (bitstream returns 0)
127
- if bytes_read > 2 && byte.zero?
128
- raise DecompressionError,
129
- "Unexpected EOF while searching for CK signature"
178
+ # If not CK, search for it (similar to libmspack's tolerant behavior)
179
+ unless c == SIGNATURE_BYTE_C && k == SIGNATURE_BYTE_K
180
+ # Search for CK signature in the stream (up to a reasonable limit)
181
+ max_search = 256
182
+ found = false
183
+
184
+ max_search.times do
185
+ # Shift: c becomes k, read new k
186
+ c = k
187
+ k = @bitstream.read_bits(8)
188
+
189
+ if c == SIGNATURE_BYTE_C && k == SIGNATURE_BYTE_K
190
+ found = true
191
+ if ENV["DEBUG_MSZIP"]
192
+ warn "DEBUG read_signature: Found CK signature after searching"
193
+ end
194
+ break
195
+ end
130
196
  end
131
197
 
132
- # Prevent infinite loops
133
- if bytes_read > max_search
198
+ unless found
134
199
  raise DecompressionError,
135
- "CK signature not found in stream"
136
- end
137
-
138
- if byte == 0x43 # 'C'
139
- state = 1
140
- elsif state == 1 && byte == 0x4B # 'K'
141
- break
142
- else
143
- state = 0
200
+ "Invalid MSZIP signature: could not find CK in stream"
144
201
  end
145
202
  end
146
203
  end
147
204
 
148
205
  # Inflate a single block
206
+ #
207
+ # Processes deflate blocks until the last_block flag is set or window is full.
208
+ # Always decodes complete blocks - does not stop mid-block.
149
209
  def inflate_block
150
- loop do
151
- # Read last block flag
152
- last_block = @bitstream.read_bits(1)
210
+ # Read first block header
211
+ last_block = @bitstream.read_bits(1)
212
+ block_type = @bitstream.read_bits(2)
153
213
 
154
- # Read block type
155
- block_type = @bitstream.read_bits(2)
214
+ if ENV["DEBUG_MSZIP"]
215
+ warn "DEBUG inflate_block: First block: last_block=#{last_block} block_type=#{block_type}"
216
+ end
156
217
 
218
+ loop do
219
+ # Process current block
157
220
  case block_type
158
221
  when 0
159
222
  inflate_stored_block
@@ -167,7 +230,16 @@ module Cabriolet
167
230
  raise DecompressionError, "Invalid block type: #{block_type}"
168
231
  end
169
232
 
233
+ if ENV["DEBUG_MSZIP"]
234
+ warn "DEBUG inflate_block: After block: last_block=#{last_block} window_posn=#{@window_posn}"
235
+ end
236
+
237
+ # Stop if this was the last block
170
238
  break if last_block == 1
239
+
240
+ # Read next block header (only if we need to continue)
241
+ last_block = @bitstream.read_bits(1)
242
+ block_type = @bitstream.read_bits(2)
171
243
  end
172
244
 
173
245
  # Flush remaining window data
@@ -306,13 +378,25 @@ module Cabriolet
306
378
  end
307
379
 
308
380
  # Inflate a Huffman-compressed block
381
+ #
382
+ # Always decodes until code 256 (END OF BLOCK)
309
383
  def inflate_huffman_block
384
+ symbol_count = 0
310
385
  loop do
386
+ if ENV["DEBUG_MSZIP_SYMBOLS"]
387
+ warn "DEBUG inflate_huffman_block: window_posn=#{@window_posn} bytes_output=#{@bytes_output}"
388
+ end
389
+
311
390
  # Decode symbol from literal tree
312
391
  code = Huffman::Decoder.decode_symbol(
313
392
  @bitstream, @literal_tree.table, LITERAL_TABLEBITS,
314
393
  @literal_lengths, LITERAL_MAXSYMBOLS
315
394
  )
395
+ symbol_count += 1
396
+
397
+ if ENV["DEBUG_MSZIP_SYMBOLS"] || ENV["DEBUG_MSZIP"]
398
+ warn "DEBUG inflate_huffman_block[#{symbol_count}]: decoded code=#{code} (#{'0x%02x' % code if code < 256})"
399
+ end
316
400
 
317
401
  if code < 256
318
402
  # Literal byte
@@ -321,6 +405,9 @@ module Cabriolet
321
405
  flush_window if @window_posn == FRAME_SIZE
322
406
  elsif code == 256
323
407
  # End of block
408
+ if ENV["DEBUG_MSZIP"] || ENV["DEBUG_MSZIP_SYMBOLS"]
409
+ warn "DEBUG inflate_huffman_block: END OF BLOCK (window_posn=#{@window_posn})"
410
+ end
324
411
  break
325
412
  else
326
413
  # Length/distance pair (LZ77 match)
@@ -1,5 +1,33 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require_relative "../quantum_shared"
4
+
5
+ # Compatibility shim for String#bytesplice (added in Ruby 3.2)
6
+ unless String.method_defined?(:bytesplice)
7
+ module StringBytespliceCompat
8
+ # Compatibility implementation of bytesplice for Ruby < 3.2
9
+ # Uses clear/append which is slower but works with mutable strings
10
+ def bytesplice(index, length, other_string, other_index = 0,
11
+ other_length = nil)
12
+ other_length ||= other_string.bytesize
13
+
14
+ # Build new string content
15
+ prefix = byteslice(0, index)
16
+ middle = other_string.byteslice(other_index, other_length)
17
+ suffix = byteslice((index + length)..-1)
18
+ new_content = prefix + middle + suffix
19
+
20
+ # Modify receiver in place
21
+ clear
22
+ self << new_content
23
+
24
+ self
25
+ end
26
+ end
27
+
28
+ String.prepend(StringBytespliceCompat)
29
+ end
30
+
3
31
  module Cabriolet
4
32
  module Decompressors
5
33
  # Quantum handles Quantum-compressed data using arithmetic coding
@@ -8,59 +36,10 @@ module Cabriolet
8
36
  # The Quantum method was created by David Stafford, adapted by Microsoft
9
37
  # Corporation.
10
38
  class Quantum < Base
11
- # Frame size (32KB per frame)
12
- FRAME_SIZE = 32_768
13
-
14
- # Match constants
15
- MAX_MATCH = 1028
16
-
17
- # Position slot tables (same as in qtmd.c)
18
- POSITION_BASE = [
19
- 0, 1, 2, 3, 4, 6, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 384,
20
- 512, 768, 1024, 1536, 2048, 3072, 4096, 6144, 8192, 12_288, 16_384,
21
- 24_576, 32_768, 49_152, 65_536, 98_304, 131_072, 196_608, 262_144,
22
- 393_216, 524_288, 786_432, 1_048_576, 1_572_864
23
- ].freeze
24
-
25
- EXTRA_BITS = [
26
- 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8,
27
- 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16,
28
- 17, 17, 18, 18, 19, 19
29
- ].freeze
30
-
31
- LENGTH_BASE = [
32
- 0, 1, 2, 3, 4, 5, 6, 8, 10, 12, 14, 18, 22, 26,
33
- 30, 38, 46, 54, 62, 78, 94, 110, 126, 158, 190, 222, 254
34
- ].freeze
35
-
36
- LENGTH_EXTRA = [
37
- 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,
38
- 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0
39
- ].freeze
39
+ include QuantumShared
40
40
 
41
41
  attr_reader :window_bits, :window_size
42
42
 
43
- # Represents a symbol in an arithmetic coding model
44
- class ModelSymbol
45
- attr_accessor :sym, :cumfreq
46
-
47
- def initialize(sym, cumfreq)
48
- @sym = sym
49
- @cumfreq = cumfreq
50
- end
51
- end
52
-
53
- # Represents an arithmetic coding model
54
- class Model
55
- attr_accessor :shiftsleft, :entries, :syms
56
-
57
- def initialize(syms, entries)
58
- @syms = syms
59
- @entries = entries
60
- @shiftsleft = 4
61
- end
62
- end
63
-
64
43
  # Initialize Quantum decompressor
65
44
  #
66
45
  # @param io_system [System::IOSystem] I/O system for reading/writing
@@ -68,7 +47,8 @@ module Cabriolet
68
47
  # @param output [System::FileHandle, System::MemoryHandle] Output handle
69
48
  # @param buffer_size [Integer] Buffer size for I/O operations
70
49
  # @param window_bits [Integer] Window size parameter (10-21)
71
- def initialize(io_system, input, output, buffer_size, window_bits: 10)
50
+ def initialize(io_system, input, output, buffer_size, window_bits: 10,
51
+ **_kwargs)
72
52
  super(io_system, input, output, buffer_size)
73
53
 
74
54
  # Validate window_bits
@@ -80,8 +60,13 @@ module Cabriolet
80
60
  @window_bits = window_bits
81
61
  @window_size = 1 << window_bits
82
62
 
83
- # Initialize window
84
- @window = "\0" * @window_size
63
+ # Initialize window (mutable for Ruby < 3.2 bytesplice compatibility)
64
+ @window = if String.method_defined?(:bytesplice)
65
+ "\0" * @window_size
66
+ else
67
+ # In Ruby < 3.2, create mutable window using String.new
68
+ String.new("\0" * @window_size)
69
+ end
85
70
  @window_posn = 0
86
71
  @frame_todo = FRAME_SIZE
87
72
 
@@ -408,7 +393,52 @@ module Cabriolet
408
393
  end
409
394
 
410
395
  # Copy match from window
396
+ # Optimized to use bulk byte operations for better performance
411
397
  def copy_match(offset, length)
398
+ # Use bulk copy for matches longer than 32 bytes
399
+ if length > 32
400
+ copy_match_bulk(offset, length)
401
+ else
402
+ copy_match_byte_by_byte(offset, length)
403
+ end
404
+ end
405
+
406
+ # Bulk copy using bytesplice for better performance on longer matches
407
+ def copy_match_bulk(offset, length)
408
+ if offset > @window_posn
409
+ # Match wraps around window
410
+ if offset > @window_size
411
+ raise DecompressionError,
412
+ "Match offset beyond window"
413
+ end
414
+
415
+ # Copy from end of window
416
+ src_pos = @window_size - (offset - @window_posn)
417
+ copy_len = offset - @window_posn
418
+
419
+ if copy_len < length
420
+ # Copy from end, then from beginning
421
+ @window.bytesplice(@window_posn, copy_len, @window, src_pos,
422
+ copy_len)
423
+ @window_posn += copy_len
424
+ remaining = length - copy_len
425
+ @window.bytesplice(@window_posn, remaining, @window, 0, remaining)
426
+ @window_posn += remaining
427
+ else
428
+ # Copy entirely from end
429
+ @window.bytesplice(@window_posn, length, @window, src_pos, length)
430
+ @window_posn += length
431
+ end
432
+ else
433
+ # Normal copy - use bytesplice for bulk operation
434
+ src_pos = @window_posn - offset
435
+ @window.bytesplice(@window_posn, length, @window, src_pos, length)
436
+ @window_posn += length
437
+ end
438
+ end
439
+
440
+ # Byte-by-byte copy for short matches (fallback)
441
+ def copy_match_byte_by_byte(offset, length)
412
442
  if offset > @window_posn
413
443
  # Match wraps around window
414
444
  if offset > @window_size
@@ -36,4 +36,7 @@ module Cabriolet
36
36
 
37
37
  # Raised when seek operation fails
38
38
  class SeekError < IOError; end
39
+
40
+ # Raised when plugin operations fail
41
+ class PluginError < Error; end
39
42
  end
@@ -0,0 +1,88 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "fileutils"
4
+
5
+ module Cabriolet
6
+ module Extraction
7
+ # BaseExtractor provides common extraction functionality for all extractors
8
+ # Reduces code duplication between SimpleExtractor and Parallel::Extractor
9
+ class BaseExtractor
10
+ # Initialize the base extractor
11
+ #
12
+ # @param output_dir [String] Directory to extract files to
13
+ # @param preserve_paths [Boolean] Whether to preserve directory structure
14
+ # @param overwrite [Boolean] Whether to overwrite existing files
15
+ def initialize(output_dir, preserve_paths: true, overwrite: false)
16
+ @output_dir = output_dir
17
+ @preserve_paths = preserve_paths
18
+ @overwrite = overwrite
19
+ end
20
+
21
+ protected
22
+
23
+ # Build the output path for a file, handling path preservation and cleaning
24
+ #
25
+ # @param filename [String] Original filename from archive (may have backslashes)
26
+ # @return [String] Full output path for the file
27
+ def build_output_path(filename)
28
+ # Normalize path separators (Windows archives use backslashes)
29
+ clean_name = filename.gsub("\\", "/")
30
+
31
+ if @preserve_paths
32
+ # Keep directory structure
33
+ ::File.join(@output_dir, clean_name)
34
+ else
35
+ # Flatten to output directory (just basename)
36
+ ::File.join(@output_dir, ::File.basename(clean_name))
37
+ end
38
+ end
39
+
40
+ # Extract a single file to disk
41
+ #
42
+ # @param file [Object] File object from archive (must respond to :name and :data)
43
+ # @yield [path, data] Optional block for custom handling instead of default write
44
+ # @return [String, nil] Output path if successful, nil if skipped or failed
45
+ def extract_file(file)
46
+ output_path = build_output_path(file.name)
47
+
48
+ # Check if file exists and skip if not overwriting
49
+ if ::File.exist?(output_path) && !@overwrite
50
+ return nil
51
+ end
52
+
53
+ # Create parent directory
54
+ dir = ::File.dirname(output_path)
55
+ FileUtils.mkdir_p(dir) unless ::File.directory?(dir)
56
+
57
+ # Get file data
58
+ data = file.data
59
+ return nil unless data
60
+
61
+ # Write file data
62
+ ::File.binwrite(output_path, data)
63
+
64
+ # Preserve file attributes if available
65
+ preserve_file_attributes(output_path, file)
66
+
67
+ output_path
68
+ rescue StandardError => e
69
+ warn "Failed to extract #{file.name}: #{e.message}"
70
+ nil
71
+ end
72
+
73
+ # Preserve file attributes (timestamps, etc.) if available on the file object
74
+ #
75
+ # @param path [String] Path to extracted file
76
+ # @param file [Object] File object from archive
77
+ def preserve_file_attributes(path, file)
78
+ # Try various timestamp attributes that different formats use
79
+ if file.respond_to?(:datetime) && file.datetime
80
+ ::File.utime(::File.atime(path), file.datetime, path)
81
+ elsif file.respond_to?(:mtime) && file.mtime
82
+ atime = file.respond_to?(:atime) ? file.atime : ::File.atime(path)
83
+ ::File.utime(atime, file.mtime, path)
84
+ end
85
+ end
86
+ end
87
+ end
88
+ end
@@ -0,0 +1,171 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "fractor"
4
+ require_relative "file_extraction_work"
5
+ require_relative "file_extraction_worker"
6
+
7
+ module Cabriolet
8
+ module Extraction
9
+ # Unified extractor using Fractor for parallel file extraction
10
+ # Worker count: 1 = sequential, N = parallel
11
+ class Extractor
12
+ DEFAULT_WORKERS = 4
13
+
14
+ attr_reader :archive, :output_dir, :workers, :stats
15
+
16
+ def initialize(archive, output_dir, workers: DEFAULT_WORKERS, **options)
17
+ @archive = archive
18
+ @output_dir = output_dir
19
+ @workers = [workers, 1].max # At least 1 worker
20
+ @preserve_paths = options.fetch(:preserve_paths, true)
21
+ @overwrite = options.fetch(:overwrite, false)
22
+ @stats = { extracted: 0, skipped: 0, failed: 0, bytes: 0 }
23
+ end
24
+
25
+ # Extract all files from archive
26
+ #
27
+ # @return [Hash] Extraction statistics
28
+ def extract_all
29
+ FileUtils.mkdir_p(@output_dir)
30
+
31
+ # Create work items for all files
32
+ work_items = @archive.files.map do |file|
33
+ FileExtractionWork.new(
34
+ file,
35
+ output_dir: @output_dir,
36
+ preserve_paths: @preserve_paths,
37
+ overwrite: @overwrite,
38
+ )
39
+ end
40
+
41
+ # Create supervisor with workers
42
+ supervisor = Fractor::Supervisor.new(
43
+ worker_pools: [
44
+ {
45
+ worker_class: FileExtractionWorker,
46
+ num_workers: @workers,
47
+ },
48
+ ],
49
+ )
50
+
51
+ # Add all work items
52
+ supervisor.add_work_items(work_items)
53
+
54
+ # Run extraction
55
+ supervisor.run
56
+
57
+ # Collect results
58
+ collect_stats(supervisor.results)
59
+
60
+ @stats
61
+ end
62
+
63
+ # Extract files with progress callback
64
+ #
65
+ # @yield [current, total, file] Progress callback
66
+ # @return [Hash] Extraction statistics
67
+ def extract_with_progress(&block)
68
+ return extract_all unless block
69
+
70
+ FileUtils.mkdir_p(@output_dir)
71
+
72
+ # For progress tracking, we need to process in batches
73
+ # or use a custom approach since Fractor doesn't have built-in callbacks
74
+ total = @archive.files.count
75
+ current = 0
76
+
77
+ # Sequential mode uses simple iteration with progress
78
+ if @workers == 1
79
+ @archive.files.each do |file|
80
+ extract_single_file(file)
81
+ current += 1
82
+ yield(current, total, file)
83
+ end
84
+ return @stats
85
+ end
86
+
87
+ # Parallel mode: batch files for progress updates
88
+ batch_size = [@archive.files.count / @workers, 1].max
89
+ batches = @archive.files.each_slice(batch_size).to_a
90
+
91
+ batches.each do |batch|
92
+ work_items = batch.map do |file|
93
+ FileExtractionWork.new(
94
+ file,
95
+ output_dir: @output_dir,
96
+ preserve_paths: @preserve_paths,
97
+ overwrite: @overwrite,
98
+ )
99
+ end
100
+
101
+ supervisor = Fractor::Supervisor.new(
102
+ worker_pools: [
103
+ {
104
+ worker_class: FileExtractionWorker,
105
+ num_workers: @workers,
106
+ },
107
+ ],
108
+ )
109
+
110
+ supervisor.add_work_items(work_items)
111
+ supervisor.run
112
+
113
+ batch.each do |file|
114
+ current += 1
115
+ yield(current, total, file)
116
+ end
117
+ end
118
+
119
+ @stats
120
+ end
121
+
122
+ private
123
+
124
+ # Extract a single file (for sequential mode with progress)
125
+ #
126
+ # @param file [Object] File to extract
127
+ # @return [Object] Result from worker
128
+ def extract_single_file(file)
129
+ work = FileExtractionWork.new(
130
+ file,
131
+ output_dir: @output_dir,
132
+ preserve_paths: @preserve_paths,
133
+ overwrite: @overwrite,
134
+ )
135
+
136
+ worker = FileExtractionWorker.new
137
+ result = worker.process(work)
138
+
139
+ update_stats_from_result(result)
140
+
141
+ result
142
+ end
143
+
144
+ # Collect statistics from Fractor results
145
+ #
146
+ # @param results [Fractor::Results] Results from supervisor
147
+ def collect_stats(results)
148
+ results.results.each do |result|
149
+ update_stats_from_result(result)
150
+ end
151
+ end
152
+
153
+ # Update stats from a single work result
154
+ #
155
+ # @param result [Fractor::WorkResult] Result from worker
156
+ def update_stats_from_result(result)
157
+ if result.success?
158
+ data = result.result
159
+ if data.is_a?(Hash) && data[:status] == :skipped
160
+ @stats[:skipped] += 1
161
+ else
162
+ @stats[:extracted] += 1
163
+ @stats[:bytes] += data[:size] if data.is_a?(Hash) && data[:size]
164
+ end
165
+ else
166
+ @stats[:failed] += 1
167
+ end
168
+ end
169
+ end
170
+ end
171
+ end