cabriolet 0.1.2 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.adoc +703 -38
- data/lib/cabriolet/algorithm_factory.rb +250 -0
- data/lib/cabriolet/base_compressor.rb +206 -0
- data/lib/cabriolet/binary/bitstream.rb +167 -16
- data/lib/cabriolet/binary/bitstream_writer.rb +150 -21
- data/lib/cabriolet/binary/chm_structures.rb +2 -2
- data/lib/cabriolet/binary/hlp_structures.rb +258 -37
- data/lib/cabriolet/binary/lit_structures.rb +231 -65
- data/lib/cabriolet/binary/oab_structures.rb +17 -1
- data/lib/cabriolet/cab/command_handler.rb +226 -0
- data/lib/cabriolet/cab/compressor.rb +108 -84
- data/lib/cabriolet/cab/decompressor.rb +16 -20
- data/lib/cabriolet/cab/extractor.rb +142 -66
- data/lib/cabriolet/cab/file_compression_work.rb +52 -0
- data/lib/cabriolet/cab/file_compression_worker.rb +89 -0
- data/lib/cabriolet/checksum.rb +49 -0
- data/lib/cabriolet/chm/command_handler.rb +227 -0
- data/lib/cabriolet/chm/compressor.rb +7 -3
- data/lib/cabriolet/chm/decompressor.rb +39 -21
- data/lib/cabriolet/chm/parser.rb +5 -2
- data/lib/cabriolet/cli/base_command_handler.rb +127 -0
- data/lib/cabriolet/cli/command_dispatcher.rb +140 -0
- data/lib/cabriolet/cli/command_registry.rb +83 -0
- data/lib/cabriolet/cli.rb +356 -607
- data/lib/cabriolet/collections/file_collection.rb +175 -0
- data/lib/cabriolet/compressors/base.rb +1 -1
- data/lib/cabriolet/compressors/lzx.rb +241 -54
- data/lib/cabriolet/compressors/mszip.rb +35 -3
- data/lib/cabriolet/compressors/quantum.rb +36 -95
- data/lib/cabriolet/decompressors/base.rb +1 -1
- data/lib/cabriolet/decompressors/lzss.rb +13 -3
- data/lib/cabriolet/decompressors/lzx.rb +70 -33
- data/lib/cabriolet/decompressors/mszip.rb +126 -39
- data/lib/cabriolet/decompressors/quantum.rb +83 -53
- data/lib/cabriolet/errors.rb +3 -0
- data/lib/cabriolet/extraction/base_extractor.rb +88 -0
- data/lib/cabriolet/extraction/extractor.rb +171 -0
- data/lib/cabriolet/extraction/file_extraction_work.rb +60 -0
- data/lib/cabriolet/extraction/file_extraction_worker.rb +106 -0
- data/lib/cabriolet/file_entry.rb +156 -0
- data/lib/cabriolet/file_manager.rb +144 -0
- data/lib/cabriolet/format_base.rb +79 -0
- data/lib/cabriolet/hlp/command_handler.rb +282 -0
- data/lib/cabriolet/hlp/compressor.rb +28 -238
- data/lib/cabriolet/hlp/decompressor.rb +107 -147
- data/lib/cabriolet/hlp/parser.rb +52 -101
- data/lib/cabriolet/hlp/quickhelp/compression_stream.rb +138 -0
- data/lib/cabriolet/hlp/quickhelp/compressor.rb +151 -0
- data/lib/cabriolet/hlp/quickhelp/decompressor.rb +558 -0
- data/lib/cabriolet/hlp/quickhelp/file_writer.rb +125 -0
- data/lib/cabriolet/hlp/quickhelp/huffman_stream.rb +74 -0
- data/lib/cabriolet/hlp/quickhelp/huffman_tree.rb +167 -0
- data/lib/cabriolet/hlp/quickhelp/offset_calculator.rb +61 -0
- data/lib/cabriolet/hlp/quickhelp/parser.rb +274 -0
- data/lib/cabriolet/hlp/quickhelp/structure_builder.rb +93 -0
- data/lib/cabriolet/hlp/quickhelp/topic_builder.rb +52 -0
- data/lib/cabriolet/hlp/quickhelp/topic_compressor.rb +83 -0
- data/lib/cabriolet/hlp/winhelp/btree_builder.rb +289 -0
- data/lib/cabriolet/hlp/winhelp/compressor.rb +400 -0
- data/lib/cabriolet/hlp/winhelp/decompressor.rb +192 -0
- data/lib/cabriolet/hlp/winhelp/parser.rb +484 -0
- data/lib/cabriolet/hlp/winhelp/zeck_lz77.rb +271 -0
- data/lib/cabriolet/huffman/encoder.rb +15 -12
- data/lib/cabriolet/huffman/tree.rb +85 -1
- data/lib/cabriolet/kwaj/command_handler.rb +213 -0
- data/lib/cabriolet/kwaj/compressor.rb +7 -3
- data/lib/cabriolet/kwaj/decompressor.rb +18 -12
- data/lib/cabriolet/lit/command_handler.rb +221 -0
- data/lib/cabriolet/lit/compressor.rb +119 -168
- data/lib/cabriolet/lit/content_encoder.rb +76 -0
- data/lib/cabriolet/lit/content_type_detector.rb +50 -0
- data/lib/cabriolet/lit/decompressor.rb +518 -152
- data/lib/cabriolet/lit/directory_builder.rb +153 -0
- data/lib/cabriolet/lit/guid_generator.rb +16 -0
- data/lib/cabriolet/lit/header_writer.rb +124 -0
- data/lib/cabriolet/lit/parser.rb +670 -0
- data/lib/cabriolet/lit/piece_builder.rb +74 -0
- data/lib/cabriolet/lit/structure_builder.rb +252 -0
- data/lib/cabriolet/models/hlp_file.rb +130 -29
- data/lib/cabriolet/models/hlp_header.rb +105 -17
- data/lib/cabriolet/models/lit_header.rb +212 -25
- data/lib/cabriolet/models/szdd_header.rb +10 -2
- data/lib/cabriolet/models/winhelp_header.rb +127 -0
- data/lib/cabriolet/oab/command_handler.rb +257 -0
- data/lib/cabriolet/oab/compressor.rb +17 -8
- data/lib/cabriolet/oab/decompressor.rb +41 -10
- data/lib/cabriolet/offset_calculator.rb +81 -0
- data/lib/cabriolet/plugin.rb +233 -0
- data/lib/cabriolet/plugin_manager.rb +453 -0
- data/lib/cabriolet/plugin_validator.rb +422 -0
- data/lib/cabriolet/quantum_shared.rb +105 -0
- data/lib/cabriolet/system/io_system.rb +3 -0
- data/lib/cabriolet/system/memory_handle.rb +17 -4
- data/lib/cabriolet/szdd/command_handler.rb +217 -0
- data/lib/cabriolet/szdd/compressor.rb +15 -11
- data/lib/cabriolet/szdd/decompressor.rb +18 -9
- data/lib/cabriolet/version.rb +1 -1
- data/lib/cabriolet.rb +181 -20
- metadata +69 -4
- data/lib/cabriolet/auto.rb +0 -173
- data/lib/cabriolet/parallel.rb +0 -333
|
@@ -14,6 +14,13 @@ module Cabriolet
|
|
|
14
14
|
DISTANCE_MAXSYMBOLS = 32
|
|
15
15
|
DISTANCE_TABLEBITS = 6
|
|
16
16
|
|
|
17
|
+
# MSZIP signature bytes
|
|
18
|
+
SIGNATURE_BYTE_C = 0x43 # ASCII 'C'
|
|
19
|
+
SIGNATURE_BYTE_K = 0x4B # ASCII 'K'
|
|
20
|
+
|
|
21
|
+
# Maximum bytes to search for CK signature (prevents infinite loops)
|
|
22
|
+
MAX_SIGNATURE_SEARCH = 10_000
|
|
23
|
+
|
|
17
24
|
# Match lengths for literal codes 257-285
|
|
18
25
|
LIT_LENGTHS = [
|
|
19
26
|
3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27,
|
|
@@ -50,7 +57,8 @@ module Cabriolet
|
|
|
50
57
|
# @param output [System::FileHandle, System::MemoryHandle] Output handle
|
|
51
58
|
# @param buffer_size [Integer] Buffer size for I/O operations
|
|
52
59
|
# @param fix_mszip [Boolean] Enable repair mode for corrupted data
|
|
53
|
-
def initialize(io_system, input, output, buffer_size, fix_mszip: false
|
|
60
|
+
def initialize(io_system, input, output, buffer_size, fix_mszip: false,
|
|
61
|
+
salvage: false, **_kwargs)
|
|
54
62
|
super(io_system, input, output, buffer_size)
|
|
55
63
|
@fix_mszip = fix_mszip
|
|
56
64
|
|
|
@@ -58,9 +66,11 @@ module Cabriolet
|
|
|
58
66
|
@window = "\0" * FRAME_SIZE
|
|
59
67
|
@window_posn = 0
|
|
60
68
|
@bytes_output = 0
|
|
69
|
+
@window_offset = 0 # Offset into window for unconsumed data (for multi-file CFDATA blocks)
|
|
61
70
|
|
|
62
71
|
# Initialize bitstream
|
|
63
|
-
@bitstream = Binary::Bitstream.new(io_system, input, buffer_size
|
|
72
|
+
@bitstream = Binary::Bitstream.new(io_system, input, buffer_size,
|
|
73
|
+
salvage: salvage)
|
|
64
74
|
|
|
65
75
|
# Initialize Huffman trees
|
|
66
76
|
@literal_lengths = Array.new(LITERAL_MAXSYMBOLS, 0)
|
|
@@ -76,15 +86,50 @@ module Cabriolet
|
|
|
76
86
|
def decompress(bytes)
|
|
77
87
|
total_written = 0
|
|
78
88
|
|
|
89
|
+
if ENV["DEBUG_MSZIP"]
|
|
90
|
+
warn "DEBUG MSZIP.decompress(#{bytes}): ENTRY bytes_output=#{@bytes_output} window_offset=#{@window_offset} window_posn=#{@window_posn}"
|
|
91
|
+
end
|
|
92
|
+
|
|
79
93
|
while bytes.positive?
|
|
80
|
-
#
|
|
81
|
-
|
|
94
|
+
# Check if we have buffered data from previous inflate
|
|
95
|
+
if @bytes_output.positive?
|
|
96
|
+
if ENV["DEBUG_MSZIP"]
|
|
97
|
+
warn "DEBUG MSZIP: Using buffered data: bytes_output=#{@bytes_output} window_offset=#{@window_offset}"
|
|
98
|
+
end
|
|
82
99
|
|
|
83
|
-
|
|
100
|
+
# Write from buffer
|
|
101
|
+
write_amount = [bytes, @bytes_output].min
|
|
102
|
+
io_system.write(output, @window[@window_offset, write_amount])
|
|
103
|
+
total_written += write_amount
|
|
104
|
+
bytes -= write_amount
|
|
105
|
+
@bytes_output -= write_amount
|
|
106
|
+
@window_offset += write_amount
|
|
107
|
+
|
|
108
|
+
if ENV["DEBUG_MSZIP"]
|
|
109
|
+
warn "DEBUG MSZIP: After buffer write: total_written=#{total_written} bytes_remaining=#{bytes} bytes_output=#{@bytes_output}"
|
|
110
|
+
end
|
|
111
|
+
|
|
112
|
+
# Continue loop to check if we need more data
|
|
113
|
+
next
|
|
114
|
+
end
|
|
115
|
+
|
|
116
|
+
# No buffered data - need to inflate a new MSZIP frame
|
|
117
|
+
# Reset window for new frame
|
|
118
|
+
@window_offset = 0
|
|
84
119
|
@window_posn = 0
|
|
85
|
-
@bytes_output = 0
|
|
86
120
|
|
|
87
|
-
#
|
|
121
|
+
# Read 'CK' signature (marks start of MSZIP frame)
|
|
122
|
+
# Every MSZIP frame starts with a CK signature
|
|
123
|
+
if ENV["DEBUG_MSZIP"]
|
|
124
|
+
warn "DEBUG MSZIP: Reading CK signature (new MSZIP frame)"
|
|
125
|
+
end
|
|
126
|
+
read_signature
|
|
127
|
+
|
|
128
|
+
# Inflate the MSZIP frame (processes deflate blocks until last_block or window full)
|
|
129
|
+
if ENV["DEBUG_MSZIP"]
|
|
130
|
+
warn "DEBUG MSZIP: Calling inflate_block"
|
|
131
|
+
end
|
|
132
|
+
|
|
88
133
|
begin
|
|
89
134
|
inflate_block
|
|
90
135
|
rescue DecompressionError
|
|
@@ -97,11 +142,15 @@ module Cabriolet
|
|
|
97
142
|
@bytes_output = FRAME_SIZE
|
|
98
143
|
end
|
|
99
144
|
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
145
|
+
if ENV["DEBUG_MSZIP"]
|
|
146
|
+
warn "DEBUG MSZIP: After inflate_block: bytes_output=#{@bytes_output} window_posn=#{@window_posn}"
|
|
147
|
+
end
|
|
148
|
+
|
|
149
|
+
# Now we have data in the window buffer - loop back to write from it
|
|
150
|
+
end
|
|
151
|
+
|
|
152
|
+
if ENV["DEBUG_MSZIP"]
|
|
153
|
+
warn "DEBUG MSZIP.decompress: EXIT total_written=#{total_written}"
|
|
105
154
|
end
|
|
106
155
|
|
|
107
156
|
total_written
|
|
@@ -111,49 +160,63 @@ module Cabriolet
|
|
|
111
160
|
|
|
112
161
|
# Read and verify the 'CK' signature that starts an MSZIP frame.
#
# Aligns the bitstream to a byte boundary and reads two bytes. When they are
# not 'C' 'K', slides a two-byte window over the stream, reading at most
# MAX_SIGNATURE_SEARCH further bytes, until the signature is found.
#
# @return [nil]
# @raise [DecompressionError] if no CK signature is found within the limit
def read_signature
  debug = ENV["DEBUG_MSZIP"]
  warn "DEBUG read_signature: Before byte_align" if debug

  # Align to byte boundary
  @bitstream.byte_align

  # Read first 2 bytes
  c = @bitstream.read_bits(8)
  k = @bitstream.read_bits(8)

  if debug
    warn "DEBUG read_signature: Read 0x#{c.to_s(16)} 0x#{k.to_s(16)} (expected 'C'=0x43 'K'=0x4B)"
  end

  return if c == SIGNATURE_BYTE_C && k == SIGNATURE_BYTE_K

  # Not CK: search forward for it. The bound is the class-level
  # MAX_SIGNATURE_SEARCH constant, which exists to prevent unbounded scans.
  MAX_SIGNATURE_SEARCH.times do
    # Shift: c becomes k, read new k
    c = k
    k = @bitstream.read_bits(8)
    next unless c == SIGNATURE_BYTE_C && k == SIGNATURE_BYTE_K

    warn "DEBUG read_signature: Found CK signature after searching" if debug
    return
  end

  raise DecompressionError,
        "Invalid MSZIP signature: could not find CK in stream"
end
|
|
147
204
|
|
|
148
205
|
# Inflate a single block
|
|
206
|
+
#
|
|
207
|
+
# Processes deflate blocks until the last_block flag is set or window is full.
|
|
208
|
+
# Always decodes complete blocks - does not stop mid-block.
|
|
149
209
|
def inflate_block
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
210
|
+
# Read first block header
|
|
211
|
+
last_block = @bitstream.read_bits(1)
|
|
212
|
+
block_type = @bitstream.read_bits(2)
|
|
153
213
|
|
|
154
|
-
|
|
155
|
-
|
|
214
|
+
if ENV["DEBUG_MSZIP"]
|
|
215
|
+
warn "DEBUG inflate_block: First block: last_block=#{last_block} block_type=#{block_type}"
|
|
216
|
+
end
|
|
156
217
|
|
|
218
|
+
loop do
|
|
219
|
+
# Process current block
|
|
157
220
|
case block_type
|
|
158
221
|
when 0
|
|
159
222
|
inflate_stored_block
|
|
@@ -167,7 +230,16 @@ module Cabriolet
|
|
|
167
230
|
raise DecompressionError, "Invalid block type: #{block_type}"
|
|
168
231
|
end
|
|
169
232
|
|
|
233
|
+
if ENV["DEBUG_MSZIP"]
|
|
234
|
+
warn "DEBUG inflate_block: After block: last_block=#{last_block} window_posn=#{@window_posn}"
|
|
235
|
+
end
|
|
236
|
+
|
|
237
|
+
# Stop if this was the last block
|
|
170
238
|
break if last_block == 1
|
|
239
|
+
|
|
240
|
+
# Read next block header (only if we need to continue)
|
|
241
|
+
last_block = @bitstream.read_bits(1)
|
|
242
|
+
block_type = @bitstream.read_bits(2)
|
|
171
243
|
end
|
|
172
244
|
|
|
173
245
|
# Flush remaining window data
|
|
@@ -306,13 +378,25 @@ module Cabriolet
|
|
|
306
378
|
end
|
|
307
379
|
|
|
308
380
|
# Inflate a Huffman-compressed block
|
|
381
|
+
#
|
|
382
|
+
# Always decodes until code 256 (END OF BLOCK)
|
|
309
383
|
def inflate_huffman_block
|
|
384
|
+
symbol_count = 0
|
|
310
385
|
loop do
|
|
386
|
+
if ENV["DEBUG_MSZIP_SYMBOLS"]
|
|
387
|
+
warn "DEBUG inflate_huffman_block: window_posn=#{@window_posn} bytes_output=#{@bytes_output}"
|
|
388
|
+
end
|
|
389
|
+
|
|
311
390
|
# Decode symbol from literal tree
|
|
312
391
|
code = Huffman::Decoder.decode_symbol(
|
|
313
392
|
@bitstream, @literal_tree.table, LITERAL_TABLEBITS,
|
|
314
393
|
@literal_lengths, LITERAL_MAXSYMBOLS
|
|
315
394
|
)
|
|
395
|
+
symbol_count += 1
|
|
396
|
+
|
|
397
|
+
if ENV["DEBUG_MSZIP_SYMBOLS"] || ENV["DEBUG_MSZIP"]
|
|
398
|
+
warn "DEBUG inflate_huffman_block[#{symbol_count}]: decoded code=#{code} (#{'0x%02x' % code if code < 256})"
|
|
399
|
+
end
|
|
316
400
|
|
|
317
401
|
if code < 256
|
|
318
402
|
# Literal byte
|
|
@@ -321,6 +405,9 @@ module Cabriolet
|
|
|
321
405
|
flush_window if @window_posn == FRAME_SIZE
|
|
322
406
|
elsif code == 256
|
|
323
407
|
# End of block
|
|
408
|
+
if ENV["DEBUG_MSZIP"] || ENV["DEBUG_MSZIP_SYMBOLS"]
|
|
409
|
+
warn "DEBUG inflate_huffman_block: END OF BLOCK (window_posn=#{@window_posn})"
|
|
410
|
+
end
|
|
324
411
|
break
|
|
325
412
|
else
|
|
326
413
|
# Length/distance pair (LZ77 match)
|
|
@@ -1,5 +1,33 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require_relative "../quantum_shared"
|
|
4
|
+
|
|
5
|
+
# Compatibility shim for the five-argument form of String#bytesplice.
#
# String#bytesplice exists since Ruby 3.2, but copying a sub-range of the
# source string (index, length, str, str_index, str_length) is only accepted
# from Ruby 3.3 on. Checking that the method merely exists would skip the shim
# on Ruby 3.2 and leave five-argument calls failing with ArgumentError, so the
# capability itself is probed instead.
bytesplice_partial_copy_supported =
  begin
    String.new("ab").bytesplice(0, 1, "cd", 1, 1)
    true
  rescue NoMethodError, ArgumentError
    false
  end

unless bytesplice_partial_copy_supported
  module StringBytespliceCompat
    # Replace +length+ bytes of the receiver starting at byte +index+ with
    # bytes taken from +other_string+. Rebuilds the whole string, which is
    # slower than the native method but works on any mutable string.
    #
    # @param index [Integer] byte offset in the receiver where replacement starts
    # @param length [Integer] number of receiver bytes to replace (clamped to
    #   the end of the receiver)
    # @param other_string [String] source of the replacement bytes (may be the
    #   receiver itself)
    # @param other_index [Integer] byte offset into +other_string+
    # @param other_length [Integer, nil] number of bytes to take from
    #   +other_string+; nil takes everything from +other_index+ on
    # @return [String] the receiver
    # @raise [IndexError] if +index+ or +other_index+ is outside its string
    def bytesplice(index, length, other_string, other_index = 0,
                   other_length = nil)
      if index.negative? || index > bytesize
        raise IndexError, "index #{index} out of string"
      end

      # Take all three pieces before mutating, because other_string may be
      # the receiver itself.
      middle = other_string.byteslice(other_index,
                                      other_length || other_string.bytesize)
      raise IndexError, "index #{other_index} out of string" if middle.nil?

      prefix = byteslice(0, index)
      tail_start = [index + length, bytesize].min
      suffix = byteslice(tail_start, bytesize - tail_start)

      # Modify receiver in place
      replace(prefix + middle + suffix)
    end
  end

  String.prepend(StringBytespliceCompat)
end
|
|
30
|
+
|
|
3
31
|
module Cabriolet
|
|
4
32
|
module Decompressors
|
|
5
33
|
# Quantum handles Quantum-compressed data using arithmetic coding
|
|
@@ -8,59 +36,10 @@ module Cabriolet
|
|
|
8
36
|
# The Quantum method was created by David Stafford, adapted by Microsoft
|
|
9
37
|
# Corporation.
|
|
10
38
|
class Quantum < Base
|
|
11
|
-
|
|
12
|
-
FRAME_SIZE = 32_768
|
|
13
|
-
|
|
14
|
-
# Match constants
|
|
15
|
-
MAX_MATCH = 1028
|
|
16
|
-
|
|
17
|
-
# Position slot tables (same as in qtmd.c)
|
|
18
|
-
POSITION_BASE = [
|
|
19
|
-
0, 1, 2, 3, 4, 6, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 384,
|
|
20
|
-
512, 768, 1024, 1536, 2048, 3072, 4096, 6144, 8192, 12_288, 16_384,
|
|
21
|
-
24_576, 32_768, 49_152, 65_536, 98_304, 131_072, 196_608, 262_144,
|
|
22
|
-
393_216, 524_288, 786_432, 1_048_576, 1_572_864
|
|
23
|
-
].freeze
|
|
24
|
-
|
|
25
|
-
EXTRA_BITS = [
|
|
26
|
-
0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8,
|
|
27
|
-
9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16,
|
|
28
|
-
17, 17, 18, 18, 19, 19
|
|
29
|
-
].freeze
|
|
30
|
-
|
|
31
|
-
LENGTH_BASE = [
|
|
32
|
-
0, 1, 2, 3, 4, 5, 6, 8, 10, 12, 14, 18, 22, 26,
|
|
33
|
-
30, 38, 46, 54, 62, 78, 94, 110, 126, 158, 190, 222, 254
|
|
34
|
-
].freeze
|
|
35
|
-
|
|
36
|
-
LENGTH_EXTRA = [
|
|
37
|
-
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,
|
|
38
|
-
3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0
|
|
39
|
-
].freeze
|
|
39
|
+
include QuantumShared
|
|
40
40
|
|
|
41
41
|
attr_reader :window_bits, :window_size
|
|
42
42
|
|
|
43
|
-
# Represents a symbol in an arithmetic coding model
|
|
44
|
-
class ModelSymbol
|
|
45
|
-
attr_accessor :sym, :cumfreq
|
|
46
|
-
|
|
47
|
-
def initialize(sym, cumfreq)
|
|
48
|
-
@sym = sym
|
|
49
|
-
@cumfreq = cumfreq
|
|
50
|
-
end
|
|
51
|
-
end
|
|
52
|
-
|
|
53
|
-
# Represents an arithmetic coding model
|
|
54
|
-
class Model
|
|
55
|
-
attr_accessor :shiftsleft, :entries, :syms
|
|
56
|
-
|
|
57
|
-
def initialize(syms, entries)
|
|
58
|
-
@syms = syms
|
|
59
|
-
@entries = entries
|
|
60
|
-
@shiftsleft = 4
|
|
61
|
-
end
|
|
62
|
-
end
|
|
63
|
-
|
|
64
43
|
# Initialize Quantum decompressor
|
|
65
44
|
#
|
|
66
45
|
# @param io_system [System::IOSystem] I/O system for reading/writing
|
|
@@ -68,7 +47,8 @@ module Cabriolet
|
|
|
68
47
|
# @param output [System::FileHandle, System::MemoryHandle] Output handle
|
|
69
48
|
# @param buffer_size [Integer] Buffer size for I/O operations
|
|
70
49
|
# @param window_bits [Integer] Window size parameter (10-21)
|
|
71
|
-
def initialize(io_system, input, output, buffer_size, window_bits: 10
|
|
50
|
+
def initialize(io_system, input, output, buffer_size, window_bits: 10,
|
|
51
|
+
**_kwargs)
|
|
72
52
|
super(io_system, input, output, buffer_size)
|
|
73
53
|
|
|
74
54
|
# Validate window_bits
|
|
@@ -80,8 +60,13 @@ module Cabriolet
|
|
|
80
60
|
@window_bits = window_bits
|
|
81
61
|
@window_size = 1 << window_bits
|
|
82
62
|
|
|
83
|
-
# Initialize window
|
|
84
|
-
@window =
|
|
63
|
+
# Initialize window (mutable for Ruby < 3.2 bytesplice compatibility)
|
|
64
|
+
@window = if String.method_defined?(:bytesplice)
|
|
65
|
+
"\0" * @window_size
|
|
66
|
+
else
|
|
67
|
+
# In Ruby < 3.2, create mutable window using String.new
|
|
68
|
+
String.new("\0" * @window_size)
|
|
69
|
+
end
|
|
85
70
|
@window_posn = 0
|
|
86
71
|
@frame_todo = FRAME_SIZE
|
|
87
72
|
|
|
@@ -408,7 +393,52 @@ module Cabriolet
|
|
|
408
393
|
end
|
|
409
394
|
|
|
410
395
|
# Copy a match from the window to the current window position.
#
# Dispatches on match length: matches longer than 32 bytes go through the
# bulk copier, everything else through the byte-by-byte copier.
#
# @param offset [Integer] match offset, passed through to the copier
# @param length [Integer] match length in bytes
def copy_match(offset, length)
  return copy_match_bulk(offset, length) if length > 32

  copy_match_byte_by_byte(offset, length)
end
|
|
405
|
+
|
|
406
|
+
# Bulk copy of a match using bytesplice, for better performance on longer
# matches.
#
# The source starts +offset+ bytes behind @window_posn, wrapping around the
# end of the window when offset exceeds @window_posn. A match may overlap its
# own output (offset < length), in which case bytes written earlier in the
# same match must be read back as source. A single self-to-self splice cannot
# do that, so the copy runs in chunks no longer than the match distance and
# never across the window end.
#
# NOTE(review): bytesplice requires offsets on character boundaries; this
# presumably relies on @window being binary (or otherwise single-byte) data -
# confirm where @window is created.
#
# @param offset [Integer] distance back from @window_posn to the match source
# @param length [Integer] number of bytes to copy
# @return [Integer] the updated @window_posn
# @raise [DecompressionError] if offset is not within 1..@window_size
def copy_match_bulk(offset, length)
  raise DecompressionError, "Match offset beyond window" if offset > @window_size
  raise DecompressionError, "Invalid match offset" unless offset.positive?

  src_pos = (@window_posn - offset) % @window_size
  remaining = length

  while remaining.positive?
    # Limit each chunk so the source never runs into bytes this chunk is
    # writing (offset) and never past the window end (wrap point).
    chunk = [remaining, offset, @window_size - src_pos].min

    # Three-argument form: available on every Ruby that has bytesplice.
    @window.bytesplice(@window_posn, chunk, @window.byteslice(src_pos, chunk))

    @window_posn += chunk
    src_pos = (src_pos + chunk) % @window_size
    remaining -= chunk
  end

  @window_posn
end
|
|
439
|
+
|
|
440
|
+
# Byte-by-byte copy for short matches (fallback)
|
|
441
|
+
def copy_match_byte_by_byte(offset, length)
|
|
412
442
|
if offset > @window_posn
|
|
413
443
|
# Match wraps around window
|
|
414
444
|
if offset > @window_size
|
data/lib/cabriolet/errors.rb
CHANGED
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "fileutils"
|
|
4
|
+
|
|
5
|
+
module Cabriolet
  module Extraction
    # BaseExtractor provides common extraction functionality for all extractors:
    # mapping archive entry names to output paths, writing entry data to disk
    # and restoring timestamps.
    class BaseExtractor
      # Initialize the base extractor
      #
      # @param output_dir [String] Directory to extract files to
      # @param preserve_paths [Boolean] Whether to preserve directory structure
      # @param overwrite [Boolean] Whether to overwrite existing files
      def initialize(output_dir, preserve_paths: true, overwrite: false)
        @output_dir = output_dir
        @preserve_paths = preserve_paths
        @overwrite = overwrite
      end

      protected

      # Build the output path for a file, handling path preservation and cleaning
      #
      # Archive names are untrusted input: empty, "." and ".." components are
      # dropped so the result always stays inside the output directory.
      #
      # @param filename [String] Original filename from archive (may have backslashes)
      # @return [String] Full output path for the file
      # @raise [ArgumentError] if the name has no usable path component
      def build_output_path(filename)
        components = safe_path_components(filename)
        if components.empty?
          raise ArgumentError,
                "Unsafe or empty archive filename: #{filename.inspect}"
        end

        if @preserve_paths
          # Keep directory structure
          ::File.join(@output_dir, *components)
        else
          # Flatten to output directory (just the final component)
          ::File.join(@output_dir, components.last)
        end
      end

      # Extract a single file to disk
      #
      # Any StandardError raised along the way is reported with +warn+ and
      # turned into a nil return.
      #
      # @param file [Object] File object from archive (must respond to :name and :data)
      # @return [String, nil] Output path if successful, nil if skipped or failed
      def extract_file(file)
        output_path = build_output_path(file.name)

        # Check if file exists and skip if not overwriting
        return nil if ::File.exist?(output_path) && !@overwrite

        # Create parent directory
        dir = ::File.dirname(output_path)
        FileUtils.mkdir_p(dir) unless ::File.directory?(dir)

        # Get file data
        data = file.data
        return nil unless data

        # Write file data
        ::File.binwrite(output_path, data)

        # Preserve file attributes if available
        preserve_file_attributes(output_path, file)

        output_path
      rescue StandardError => e
        warn "Failed to extract #{file.name}: #{e.message}"
        nil
      end

      # Preserve file attributes (timestamps, etc.) if available on the file object
      #
      # Uses +datetime+ when the file object provides it, otherwise +mtime+
      # (with the object's +atime+ when it responds to that).
      #
      # @param path [String] Path to extracted file
      # @param file [Object] File object from archive
      def preserve_file_attributes(path, file)
        if file.respond_to?(:datetime) && file.datetime
          ::File.utime(::File.atime(path), file.datetime, path)
        elsif file.respond_to?(:mtime) && file.mtime
          atime = file.respond_to?(:atime) ? file.atime : ::File.atime(path)
          ::File.utime(atime, file.mtime, path)
        end
      end

      private

      # Split an archive name into path components that are safe to join
      # under the output directory.
      #
      # @param filename [String] Name from the archive, with "\" or "/" separators
      # @return [Array<String>] Components without "", "." or ".." entries
      def safe_path_components(filename)
        # Normalize path separators (Windows archives use backslashes)
        filename.to_s.tr("\\", "/").split("/").reject do |part|
          part.empty? || part == "." || part == ".."
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "fileutils"
require "fractor"
require_relative "file_extraction_work"
require_relative "file_extraction_worker"
|
|
6
|
+
|
|
7
|
+
module Cabriolet
  module Extraction
    # Unified extractor using Fractor for parallel file extraction.
    # The worker count selects the mode: 1 = sequential, N = parallel.
    class Extractor
      DEFAULT_WORKERS = 4

      attr_reader :archive, :output_dir, :workers, :stats

      # @param archive [Object] Archive object; must respond to #files
      # @param output_dir [String] Directory to extract files to
      # @param workers [Integer] Number of workers (values below 1 become 1)
      # @param options [Hash] :preserve_paths (default true) and
      #   :overwrite (default false)
      def initialize(archive, output_dir, workers: DEFAULT_WORKERS, **options)
        @archive = archive
        @output_dir = output_dir
        @workers = [workers, 1].max # At least 1 worker
        @preserve_paths = options.fetch(:preserve_paths, true)
        @overwrite = options.fetch(:overwrite, false)
        @stats = { extracted: 0, skipped: 0, failed: 0, bytes: 0 }
      end

      # Extract all files from archive
      #
      # @return [Hash] Extraction statistics
      def extract_all
        FileUtils.mkdir_p(@output_dir)

        collect_stats(run_supervisor(@archive.files))

        @stats
      end

      # Extract files with progress callback
      #
      # Without a block this is the same as #extract_all. With one worker the
      # files are processed one at a time and the block is called after each
      # file. With more workers the files are processed in batches and the
      # block is called for every file of a batch once that batch has run.
      #
      # @yield [current, total, file] Progress callback
      # @return [Hash] Extraction statistics
      def extract_with_progress(&block)
        return extract_all unless block

        FileUtils.mkdir_p(@output_dir)

        files = @archive.files
        total = files.count
        current = 0

        # Sequential mode uses simple iteration with progress
        if @workers == 1
          files.each do |file|
            extract_single_file(file)
            current += 1
            yield(current, total, file)
          end
          return @stats
        end

        # Parallel mode: batch files for progress updates, since the
        # supervisor is only observed after a whole batch has run
        batch_size = [total / @workers, 1].max

        files.each_slice(batch_size) do |batch|
          # Fold every batch into the statistics; without this the parallel
          # path would return counters that never moved.
          collect_stats(run_supervisor(batch))

          batch.each do |file|
            current += 1
            yield(current, total, file)
          end
        end

        @stats
      end

      private

      # Build the work item describing the extraction of one file
      #
      # @param file [Object] File to extract
      # @return [FileExtractionWork]
      def build_work_item(file)
        FileExtractionWork.new(
          file,
          output_dir: @output_dir,
          preserve_paths: @preserve_paths,
          overwrite: @overwrite,
        )
      end

      # Run a fresh supervisor over the given files
      #
      # @param files [Enumerable] Files to extract
      # @return [Object] Results object of the supervisor
      def run_supervisor(files)
        supervisor = Fractor::Supervisor.new(
          worker_pools: [
            {
              worker_class: FileExtractionWorker,
              num_workers: @workers,
            },
          ],
        )

        supervisor.add_work_items(files.map { |file| build_work_item(file) })
        supervisor.run
        supervisor.results
      end

      # Extract a single file (for sequential mode with progress)
      #
      # @param file [Object] File to extract
      # @return [Object] Result from worker
      def extract_single_file(file)
        result = FileExtractionWorker.new.process(build_work_item(file))

        update_stats_from_result(result)

        result
      end

      # Collect statistics from Fractor results
      #
      # @param results [Fractor::Results] Results from supervisor
      def collect_stats(results)
        results.results.each { |result| update_stats_from_result(result) }

        # NOTE(review): Fractor's aggregator appears to keep failed work in a
        # separate #errors list; count those too when it is available so
        # failures show up in the statistics - confirm against Fractor.
        return unless results.respond_to?(:errors)

        Array(results.errors).each { |result| update_stats_from_result(result) }
      end

      # Update stats from a single work result
      #
      # A successful result counts as skipped when its payload is a Hash with
      # status :skipped; otherwise as extracted, adding the payload's :size
      # (when present) to the byte total. Unsuccessful results count as failed.
      #
      # @param result [Fractor::WorkResult] Result from worker
      def update_stats_from_result(result)
        unless result.success?
          @stats[:failed] += 1
          return
        end

        data = result.result
        details = data.is_a?(Hash) ? data : {}

        if details[:status] == :skipped
          @stats[:skipped] += 1
        else
          @stats[:extracted] += 1
          @stats[:bytes] += details[:size] if details[:size]
        end
      end
    end
  end
end
|