cabriolet 0.1.2 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.adoc +700 -38
- data/lib/cabriolet/algorithm_factory.rb +250 -0
- data/lib/cabriolet/base_compressor.rb +206 -0
- data/lib/cabriolet/binary/bitstream.rb +154 -14
- data/lib/cabriolet/binary/bitstream_writer.rb +129 -17
- data/lib/cabriolet/binary/chm_structures.rb +2 -2
- data/lib/cabriolet/binary/hlp_structures.rb +258 -37
- data/lib/cabriolet/binary/lit_structures.rb +231 -65
- data/lib/cabriolet/binary/oab_structures.rb +17 -1
- data/lib/cabriolet/cab/command_handler.rb +226 -0
- data/lib/cabriolet/cab/compressor.rb +35 -43
- data/lib/cabriolet/cab/decompressor.rb +14 -19
- data/lib/cabriolet/cab/extractor.rb +140 -31
- data/lib/cabriolet/chm/command_handler.rb +227 -0
- data/lib/cabriolet/chm/compressor.rb +7 -3
- data/lib/cabriolet/chm/decompressor.rb +39 -21
- data/lib/cabriolet/chm/parser.rb +5 -2
- data/lib/cabriolet/cli/base_command_handler.rb +127 -0
- data/lib/cabriolet/cli/command_dispatcher.rb +140 -0
- data/lib/cabriolet/cli/command_registry.rb +83 -0
- data/lib/cabriolet/cli.rb +356 -607
- data/lib/cabriolet/compressors/base.rb +1 -1
- data/lib/cabriolet/compressors/lzx.rb +241 -54
- data/lib/cabriolet/compressors/mszip.rb +35 -3
- data/lib/cabriolet/compressors/quantum.rb +34 -45
- data/lib/cabriolet/decompressors/base.rb +1 -1
- data/lib/cabriolet/decompressors/lzss.rb +13 -3
- data/lib/cabriolet/decompressors/lzx.rb +70 -33
- data/lib/cabriolet/decompressors/mszip.rb +126 -39
- data/lib/cabriolet/decompressors/quantum.rb +3 -2
- data/lib/cabriolet/errors.rb +3 -0
- data/lib/cabriolet/file_entry.rb +156 -0
- data/lib/cabriolet/file_manager.rb +144 -0
- data/lib/cabriolet/hlp/command_handler.rb +282 -0
- data/lib/cabriolet/hlp/compressor.rb +28 -238
- data/lib/cabriolet/hlp/decompressor.rb +107 -147
- data/lib/cabriolet/hlp/parser.rb +52 -101
- data/lib/cabriolet/hlp/quickhelp/compression_stream.rb +138 -0
- data/lib/cabriolet/hlp/quickhelp/compressor.rb +626 -0
- data/lib/cabriolet/hlp/quickhelp/decompressor.rb +558 -0
- data/lib/cabriolet/hlp/quickhelp/huffman_stream.rb +74 -0
- data/lib/cabriolet/hlp/quickhelp/huffman_tree.rb +167 -0
- data/lib/cabriolet/hlp/quickhelp/parser.rb +274 -0
- data/lib/cabriolet/hlp/winhelp/btree_builder.rb +289 -0
- data/lib/cabriolet/hlp/winhelp/compressor.rb +400 -0
- data/lib/cabriolet/hlp/winhelp/decompressor.rb +192 -0
- data/lib/cabriolet/hlp/winhelp/parser.rb +484 -0
- data/lib/cabriolet/hlp/winhelp/zeck_lz77.rb +271 -0
- data/lib/cabriolet/huffman/tree.rb +85 -1
- data/lib/cabriolet/kwaj/command_handler.rb +213 -0
- data/lib/cabriolet/kwaj/compressor.rb +7 -3
- data/lib/cabriolet/kwaj/decompressor.rb +18 -12
- data/lib/cabriolet/lit/command_handler.rb +221 -0
- data/lib/cabriolet/lit/compressor.rb +633 -38
- data/lib/cabriolet/lit/decompressor.rb +518 -152
- data/lib/cabriolet/lit/parser.rb +670 -0
- data/lib/cabriolet/models/hlp_file.rb +130 -29
- data/lib/cabriolet/models/hlp_header.rb +105 -17
- data/lib/cabriolet/models/lit_header.rb +212 -25
- data/lib/cabriolet/models/szdd_header.rb +10 -2
- data/lib/cabriolet/models/winhelp_header.rb +127 -0
- data/lib/cabriolet/oab/command_handler.rb +257 -0
- data/lib/cabriolet/oab/compressor.rb +17 -8
- data/lib/cabriolet/oab/decompressor.rb +41 -10
- data/lib/cabriolet/offset_calculator.rb +81 -0
- data/lib/cabriolet/plugin.rb +233 -0
- data/lib/cabriolet/plugin_manager.rb +453 -0
- data/lib/cabriolet/plugin_validator.rb +422 -0
- data/lib/cabriolet/system/io_system.rb +3 -0
- data/lib/cabriolet/system/memory_handle.rb +17 -4
- data/lib/cabriolet/szdd/command_handler.rb +217 -0
- data/lib/cabriolet/szdd/compressor.rb +15 -11
- data/lib/cabriolet/szdd/decompressor.rb +18 -9
- data/lib/cabriolet/version.rb +1 -1
- data/lib/cabriolet.rb +67 -17
- metadata +33 -2
|
@@ -15,7 +15,7 @@ module Cabriolet
|
|
|
15
15
|
# @param input [System::FileHandle, System::MemoryHandle] Input handle
|
|
16
16
|
# @param output [System::FileHandle, System::MemoryHandle] Output handle
|
|
17
17
|
# @param buffer_size [Integer] Buffer size for I/O operations
|
|
18
|
-
def initialize(io_system, input, output, buffer_size)
|
|
18
|
+
def initialize(io_system, input, output, buffer_size, **_kwargs)
|
|
19
19
|
@io_system = io_system
|
|
20
20
|
@input = input
|
|
21
21
|
@output = output
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require_relative "base"
|
|
3
4
|
require_relative "../binary/bitstream_writer"
|
|
4
5
|
require_relative "../huffman/encoder"
|
|
5
6
|
|
|
@@ -66,7 +67,8 @@ module Cabriolet
|
|
|
66
67
|
# @param output [System::FileHandle, System::MemoryHandle] Output handle
|
|
67
68
|
# @param buffer_size [Integer] Buffer size for I/O operations
|
|
68
69
|
# @param window_bits [Integer] Window size (15-21 for regular LZX)
|
|
69
|
-
def initialize(io_system, input, output, buffer_size, window_bits: 15
|
|
70
|
+
def initialize(io_system, input, output, buffer_size, window_bits: 15,
|
|
71
|
+
**_kwargs)
|
|
70
72
|
super(io_system, input, output, buffer_size)
|
|
71
73
|
|
|
72
74
|
# Validate window_bits
|
|
@@ -82,8 +84,9 @@ module Cabriolet
|
|
|
82
84
|
@num_offsets = POSITION_SLOTS[window_bits - 15] << 3
|
|
83
85
|
@maintree_maxsymbols = NUM_CHARS + @num_offsets
|
|
84
86
|
|
|
85
|
-
# Initialize bitstream writer
|
|
86
|
-
@bitstream = Binary::BitstreamWriter.new(io_system, output,
|
|
87
|
+
# Initialize bitstream writer (LZX uses MSB-first bit ordering per libmspack lzxd.c)
|
|
88
|
+
@bitstream = Binary::BitstreamWriter.new(io_system, output,
|
|
89
|
+
buffer_size, bit_order: :msb)
|
|
87
90
|
|
|
88
91
|
# Initialize sliding window for LZ77
|
|
89
92
|
@window = "\0" * @window_size
|
|
@@ -119,6 +122,7 @@ module Cabriolet
|
|
|
119
122
|
frame_data = input_data[pos, frame_size]
|
|
120
123
|
|
|
121
124
|
# Compress this frame
|
|
125
|
+
# TODO: Use compress_frame_verbatim once tree encoding is fixed
|
|
122
126
|
compress_frame(frame_data)
|
|
123
127
|
|
|
124
128
|
pos += frame_size
|
|
@@ -152,19 +156,46 @@ module Cabriolet
|
|
|
152
156
|
# @param data [String] Frame data to compress
|
|
153
157
|
# @return [void]
|
|
154
158
|
def compress_frame(data)
|
|
155
|
-
#
|
|
156
|
-
|
|
159
|
+
# For uncompressed blocks, block length is just the frame data size
|
|
160
|
+
# (offset registers are NOT included in the block length field)
|
|
161
|
+
block_length = data.bytesize
|
|
157
162
|
|
|
158
|
-
# Write
|
|
163
|
+
# Write UNCOMPRESSED block header
|
|
164
|
+
write_block_header(BLOCKTYPE_UNCOMPRESSED, block_length)
|
|
165
|
+
|
|
166
|
+
# Write offset registers (R0, R1, R2)
|
|
159
167
|
write_offset_registers
|
|
160
168
|
|
|
161
|
-
# Write raw data
|
|
169
|
+
# Write raw uncompressed data
|
|
162
170
|
data.each_byte do |byte|
|
|
163
171
|
@bitstream.write_bits(byte, 8)
|
|
164
172
|
end
|
|
173
|
+
end
|
|
174
|
+
|
|
175
|
+
# Compress a single frame (32KB) - VERBATIM version (currently disabled)
|
|
176
|
+
#
|
|
177
|
+
# @param data [String] Frame data to compress
|
|
178
|
+
# @return [void]
|
|
179
|
+
def compress_frame_verbatim(data)
|
|
180
|
+
# Reset frequency statistics for each frame
|
|
181
|
+
@literal_freq.fill(0)
|
|
182
|
+
@match_freq.fill(0)
|
|
183
|
+
@length_freq.fill(0)
|
|
184
|
+
|
|
185
|
+
# Analyze frame to generate LZ77 tokens
|
|
186
|
+
tokens = analyze_frame(data)
|
|
165
187
|
|
|
166
|
-
#
|
|
167
|
-
|
|
188
|
+
# Build Huffman trees from statistics
|
|
189
|
+
build_trees
|
|
190
|
+
|
|
191
|
+
# Write VERBATIM block header
|
|
192
|
+
write_block_header(BLOCKTYPE_VERBATIM, data.bytesize)
|
|
193
|
+
|
|
194
|
+
# Write Huffman tree definitions
|
|
195
|
+
write_trees
|
|
196
|
+
|
|
197
|
+
# Encode all tokens using the Huffman codes
|
|
198
|
+
encode_tokens(tokens)
|
|
168
199
|
end
|
|
169
200
|
|
|
170
201
|
# Analyze frame and generate LZ77 tokens
|
|
@@ -301,68 +332,224 @@ module Cabriolet
|
|
|
301
332
|
slot
|
|
302
333
|
end
|
|
303
334
|
|
|
335
|
+
# Build Huffman code lengths from frequencies
|
|
336
|
+
#
|
|
337
|
+
# Uses a simplified approach: assign equal lengths to all symbols.
|
|
338
|
+
# This guarantees valid Huffman trees that satisfy Kraft inequality.
|
|
339
|
+
#
|
|
340
|
+
# @param freqs [Array<Integer>] Symbol frequencies
|
|
341
|
+
# @param num_symbols [Integer] Number of symbols
|
|
342
|
+
# @return [Array<Integer>] Code lengths
|
|
343
|
+
def build_tree_lengths(freqs, num_symbols)
|
|
344
|
+
lengths = Array.new(num_symbols, 0)
|
|
345
|
+
|
|
346
|
+
# Get symbols with non-zero frequencies
|
|
347
|
+
non_zero_symbols = freqs.each_with_index.select do |freq, _|
|
|
348
|
+
freq.positive?
|
|
349
|
+
end.map { |_, sym| sym }
|
|
350
|
+
|
|
351
|
+
# Handle edge cases
|
|
352
|
+
if non_zero_symbols.empty?
|
|
353
|
+
# Empty tree: create minimal valid tree with 2 symbols
|
|
354
|
+
lengths[0] = 1
|
|
355
|
+
lengths[1] = 1
|
|
356
|
+
return lengths
|
|
357
|
+
elsif non_zero_symbols.size == 1
|
|
358
|
+
# Single symbol: need at least 2 symbols for valid Huffman tree
|
|
359
|
+
symbol = non_zero_symbols[0]
|
|
360
|
+
lengths[symbol] = 1
|
|
361
|
+
dummy = symbol.zero? ? 1 : 0
|
|
362
|
+
lengths[dummy] = 1
|
|
363
|
+
return lengths
|
|
364
|
+
end
|
|
365
|
+
|
|
366
|
+
# Calculate required length: ceil(log2(count))
|
|
367
|
+
count = non_zero_symbols.size
|
|
368
|
+
bit_length = 1
|
|
369
|
+
while (1 << bit_length) < count
|
|
370
|
+
bit_length += 1
|
|
371
|
+
end
|
|
372
|
+
|
|
373
|
+
# Assign same length to all non-zero symbols
|
|
374
|
+
non_zero_symbols.each do |symbol|
|
|
375
|
+
lengths[symbol] = bit_length
|
|
376
|
+
end
|
|
377
|
+
|
|
378
|
+
# Pad with dummy symbols to make tree complete (2^bit_length total symbols)
|
|
379
|
+
# This ensures Kraft inequality sum equals exactly 1.0
|
|
380
|
+
total_needed = 1 << bit_length
|
|
381
|
+
dummy_count = total_needed - count
|
|
382
|
+
|
|
383
|
+
if dummy_count.positive?
|
|
384
|
+
dummy_index = 0
|
|
385
|
+
while dummy_count.positive? && dummy_index < num_symbols
|
|
386
|
+
if lengths[dummy_index].zero?
|
|
387
|
+
lengths[dummy_index] = bit_length
|
|
388
|
+
dummy_count -= 1
|
|
389
|
+
end
|
|
390
|
+
dummy_index += 1
|
|
391
|
+
end
|
|
392
|
+
end
|
|
393
|
+
|
|
394
|
+
lengths
|
|
395
|
+
end
|
|
396
|
+
|
|
304
397
|
# Build Huffman trees from frequency statistics
|
|
305
398
|
#
|
|
399
|
+
# This creates three trees for LZX compression:
|
|
400
|
+
# 1. Main tree: literals (0-255) + match position/length combinations
|
|
401
|
+
# 2. Length tree: additional length symbols for long matches
|
|
402
|
+
# 3. Pretree: encodes the code lengths of main/length trees
|
|
403
|
+
#
|
|
306
404
|
# @return [void]
|
|
307
405
|
def build_trees
|
|
308
|
-
#
|
|
309
|
-
|
|
310
|
-
|
|
406
|
+
# Step 1: Combine literal and match frequencies for main tree
|
|
407
|
+
maintree_freq = @literal_freq + @match_freq
|
|
408
|
+
|
|
409
|
+
# Step 2: Build main tree code lengths
|
|
410
|
+
@maintree_lengths = build_tree_lengths(maintree_freq,
|
|
311
411
|
@maintree_maxsymbols)
|
|
312
|
-
@maintree_codes = Huffman::Encoder.build_codes(@maintree_lengths,
|
|
313
|
-
@maintree_maxsymbols)
|
|
314
412
|
|
|
315
|
-
# Build length tree
|
|
413
|
+
# Step 3: Build length tree code lengths
|
|
316
414
|
@length_lengths = build_tree_lengths(@length_freq, LENGTH_MAXSYMBOLS)
|
|
415
|
+
|
|
416
|
+
# Step 4: Calculate pretree frequencies by simulating tree encoding
|
|
417
|
+
pretree_freq = calculate_pretree_frequencies
|
|
418
|
+
|
|
419
|
+
# Step 5: Build pretree code lengths
|
|
420
|
+
@pretree_lengths = build_tree_lengths(pretree_freq, PRETREE_MAXSYMBOLS)
|
|
421
|
+
|
|
422
|
+
# Step 6: Generate code tables from lengths
|
|
423
|
+
@maintree_codes = Huffman::Encoder.build_codes(@maintree_lengths,
|
|
424
|
+
@maintree_maxsymbols)
|
|
317
425
|
@length_codes = Huffman::Encoder.build_codes(@length_lengths,
|
|
318
426
|
LENGTH_MAXSYMBOLS)
|
|
319
|
-
|
|
320
|
-
# Build pretree (used to encode the other trees)
|
|
321
|
-
# Create a valid Huffman tree that satisfies Kraft inequality
|
|
322
|
-
# For 20 symbols, use: 2@3bits + 6@4bits + 12@5bits = 1.0
|
|
323
|
-
@pretree_lengths = Array.new(PRETREE_MAXSYMBOLS, 0)
|
|
324
|
-
# Most common symbols (0-1): 3 bits
|
|
325
|
-
(0..1).each { |i| @pretree_lengths[i] = 3 }
|
|
326
|
-
# Common symbols (2-7): 4 bits
|
|
327
|
-
(2..7).each { |i| @pretree_lengths[i] = 4 }
|
|
328
|
-
# Less common symbols (8-19): 5 bits
|
|
329
|
-
(8..19).each { |i| @pretree_lengths[i] = 5 }
|
|
330
427
|
@pretree_codes = Huffman::Encoder.build_codes(@pretree_lengths,
|
|
331
428
|
PRETREE_MAXSYMBOLS)
|
|
332
429
|
end
|
|
333
430
|
|
|
334
|
-
#
|
|
431
|
+
# Calculate pretree symbol frequencies
|
|
335
432
|
#
|
|
336
|
-
#
|
|
337
|
-
#
|
|
338
|
-
#
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
433
|
+
# The pretree encodes the code lengths of the main and length trees.
|
|
434
|
+
# This method simulates the tree encoding process to determine which
|
|
435
|
+
# pretree symbols will be needed.
|
|
436
|
+
#
|
|
437
|
+
# @return [Array<Integer>] Frequency array for pretree symbols (0-19)
|
|
438
|
+
def calculate_pretree_frequencies
|
|
439
|
+
pretree_freq = Array.new(PRETREE_MAXSYMBOLS, 0)
|
|
343
440
|
|
|
344
|
-
#
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
441
|
+
# Count symbols needed to encode main tree (two parts)
|
|
442
|
+
count_pretree_symbols(@maintree_lengths, 0, NUM_CHARS, pretree_freq)
|
|
443
|
+
count_pretree_symbols(@maintree_lengths, NUM_CHARS,
|
|
444
|
+
@maintree_maxsymbols, pretree_freq)
|
|
445
|
+
|
|
446
|
+
# Count symbols needed to encode length tree
|
|
447
|
+
count_pretree_symbols(@length_lengths, 0, NUM_SECONDARY_LENGTHS,
|
|
448
|
+
pretree_freq)
|
|
449
|
+
|
|
450
|
+
pretree_freq
|
|
451
|
+
end
|
|
452
|
+
|
|
453
|
+
# Count pretree symbols needed to encode a tree
|
|
454
|
+
#
|
|
455
|
+
# This simulates the write_tree_with_pretree encoding process to count
|
|
456
|
+
# which pretree symbols will be used, allowing us to build an optimal
|
|
457
|
+
# pretree.
|
|
458
|
+
#
|
|
459
|
+
# @param lengths [Array<Integer>] Tree lengths to encode
|
|
460
|
+
# @param start [Integer] Start index
|
|
461
|
+
# @param end_idx [Integer] End index (exclusive)
|
|
462
|
+
# @param freq [Array<Integer>] Frequency array to update
|
|
463
|
+
# @return [void]
|
|
464
|
+
def count_pretree_symbols(lengths, start, end_idx, freq)
|
|
465
|
+
i = start
|
|
466
|
+
prev_length = 0
|
|
467
|
+
|
|
468
|
+
while i < end_idx
|
|
469
|
+
length = lengths[i]
|
|
470
|
+
|
|
471
|
+
if length.zero?
|
|
472
|
+
# Count run of zeros
|
|
473
|
+
zero_count = 0
|
|
474
|
+
while i < end_idx && lengths[i].zero? && zero_count < 138
|
|
475
|
+
zero_count += 1
|
|
476
|
+
i += 1
|
|
477
|
+
end
|
|
478
|
+
|
|
479
|
+
# Encode long runs with symbol 18
|
|
480
|
+
if zero_count >= 20
|
|
481
|
+
while zero_count >= 20
|
|
482
|
+
run = [zero_count, 51].min
|
|
483
|
+
freq[18] += 1
|
|
484
|
+
zero_count -= run
|
|
485
|
+
end
|
|
486
|
+
end
|
|
487
|
+
|
|
488
|
+
# Encode medium runs with symbol 17
|
|
489
|
+
if zero_count >= 4
|
|
490
|
+
run = [zero_count, 19].min
|
|
491
|
+
freq[17] += 1
|
|
492
|
+
zero_count -= run
|
|
493
|
+
end
|
|
494
|
+
|
|
495
|
+
# Encode remaining short runs as deltas
|
|
496
|
+
if zero_count.positive?
|
|
497
|
+
zero_count.times do
|
|
498
|
+
delta = (17 - prev_length) % 17
|
|
499
|
+
freq[delta] += 1
|
|
500
|
+
prev_length = 0
|
|
501
|
+
end
|
|
502
|
+
end
|
|
503
|
+
else
|
|
504
|
+
# Encode as delta from previous length
|
|
505
|
+
delta = (length - prev_length) % 17
|
|
506
|
+
freq[delta] += 1
|
|
507
|
+
prev_length = length
|
|
508
|
+
i += 1
|
|
509
|
+
end
|
|
363
510
|
end
|
|
511
|
+
end
|
|
364
512
|
|
|
365
|
-
|
|
513
|
+
# Calculate code lengths by traversing Huffman tree
|
|
514
|
+
#
|
|
515
|
+
# @param node [Array] Tree node [freq, symbol, left, right, depth]
|
|
516
|
+
# @param depth [Integer] Current depth
|
|
517
|
+
# @param lengths [Array<Integer>] Output array for lengths
|
|
518
|
+
# @return [void]
|
|
519
|
+
def calculate_depths(node, depth, lengths)
|
|
520
|
+
return unless node
|
|
521
|
+
|
|
522
|
+
_, symbol, left, right, = node
|
|
523
|
+
|
|
524
|
+
if symbol.nil?
|
|
525
|
+
# Internal node: recurse to children
|
|
526
|
+
calculate_depths(left, depth + 1, lengths)
|
|
527
|
+
calculate_depths(right, depth + 1, lengths)
|
|
528
|
+
else
|
|
529
|
+
# Leaf node: record length
|
|
530
|
+
lengths[symbol] = depth
|
|
531
|
+
end
|
|
532
|
+
end
|
|
533
|
+
|
|
534
|
+
# Calculate code lengths by traversing Huffman tree
|
|
535
|
+
#
|
|
536
|
+
# @param node [Array] Tree node [freq, symbol, left, right]
|
|
537
|
+
# @param depth [Integer] Current depth
|
|
538
|
+
# @param lengths [Array<Integer>] Output array for lengths
|
|
539
|
+
# @return [void]
|
|
540
|
+
def calculate_code_lengths(node, depth, lengths)
|
|
541
|
+
return unless node
|
|
542
|
+
|
|
543
|
+
_, symbol, left, right = node
|
|
544
|
+
|
|
545
|
+
if symbol.nil?
|
|
546
|
+
# Internal node: recurse to children
|
|
547
|
+
calculate_code_lengths(left, depth + 1, lengths)
|
|
548
|
+
calculate_code_lengths(right, depth + 1, lengths)
|
|
549
|
+
else
|
|
550
|
+
# Leaf node: record length
|
|
551
|
+
lengths[symbol] = depth
|
|
552
|
+
end
|
|
366
553
|
end
|
|
367
554
|
|
|
368
555
|
# Write block header
|
|
@@ -56,7 +56,7 @@ module Cabriolet
|
|
|
56
56
|
# @param input [System::FileHandle, System::MemoryHandle] Input handle
|
|
57
57
|
# @param output [System::FileHandle, System::MemoryHandle] Output handle
|
|
58
58
|
# @param buffer_size [Integer] Buffer size for I/O operations
|
|
59
|
-
def initialize(io_system, input, output, buffer_size)
|
|
59
|
+
def initialize(io_system, input, output, buffer_size, **_kwargs)
|
|
60
60
|
super
|
|
61
61
|
|
|
62
62
|
# Initialize bitstream writer
|
|
@@ -88,10 +88,15 @@ module Cabriolet
|
|
|
88
88
|
|
|
89
89
|
# Process data in FRAME_SIZE chunks
|
|
90
90
|
# Each frame is independent and contains blocks ending with last_block=1
|
|
91
|
+
frame_num = 0
|
|
91
92
|
while pos < input_data.bytesize
|
|
92
93
|
chunk_size = [FRAME_SIZE, input_data.bytesize - pos].min
|
|
93
94
|
chunk = input_data[pos, chunk_size]
|
|
94
95
|
|
|
96
|
+
if ENV["DEBUG_MSZIP_COMPRESS"]
|
|
97
|
+
warn "DEBUG compress: Frame #{frame_num}: pos=#{pos}, chunk_size=#{chunk_size}"
|
|
98
|
+
end
|
|
99
|
+
|
|
95
100
|
# Write CK signature
|
|
96
101
|
write_signature
|
|
97
102
|
|
|
@@ -99,11 +104,19 @@ module Cabriolet
|
|
|
99
104
|
# Each frame's block is always marked as last within that frame
|
|
100
105
|
compress_block(chunk, true)
|
|
101
106
|
|
|
107
|
+
# Flush bitstream after each frame to ensure data is written
|
|
108
|
+
@bitstream.flush
|
|
109
|
+
|
|
110
|
+
if ENV["DEBUG_MSZIP_COMPRESS"]
|
|
111
|
+
warn "DEBUG compress: Frame #{frame_num} complete, flushed"
|
|
112
|
+
end
|
|
113
|
+
|
|
102
114
|
pos += chunk_size
|
|
103
115
|
total_written += chunk_size
|
|
116
|
+
frame_num += 1
|
|
104
117
|
end
|
|
105
118
|
|
|
106
|
-
#
|
|
119
|
+
# Final flush (may not be needed now but keep for safety)
|
|
107
120
|
@bitstream.flush
|
|
108
121
|
|
|
109
122
|
total_written
|
|
@@ -129,8 +142,19 @@ module Cabriolet
|
|
|
129
142
|
#
|
|
130
143
|
# @return [void]
|
|
131
144
|
def write_signature
|
|
145
|
+
if ENV["DEBUG_MSZIP_COMPRESS"]
|
|
146
|
+
warn "DEBUG write_signature: ENTRY"
|
|
147
|
+
end
|
|
132
148
|
@bitstream.byte_align
|
|
133
|
-
SIGNATURE.each
|
|
149
|
+
SIGNATURE.each do |byte|
|
|
150
|
+
if ENV["DEBUG_MSZIP_COMPRESS"]
|
|
151
|
+
warn "DEBUG write_signature: Writing byte 0x#{byte.to_s(16)}"
|
|
152
|
+
end
|
|
153
|
+
@bitstream.write_raw_byte(byte)
|
|
154
|
+
end
|
|
155
|
+
if ENV["DEBUG_MSZIP_COMPRESS"]
|
|
156
|
+
warn "DEBUG write_signature: EXIT"
|
|
157
|
+
end
|
|
134
158
|
end
|
|
135
159
|
|
|
136
160
|
# Compress a single block using fixed Huffman encoding
|
|
@@ -139,6 +163,10 @@ module Cabriolet
|
|
|
139
163
|
# @param is_last [Boolean] Whether this is the last block
|
|
140
164
|
# @return [void]
|
|
141
165
|
def compress_block(data, is_last)
|
|
166
|
+
if ENV["DEBUG_MSZIP_COMPRESS"]
|
|
167
|
+
warn "DEBUG compress_block: ENTRY data_size=#{data.bytesize} is_last=#{is_last}"
|
|
168
|
+
end
|
|
169
|
+
|
|
142
170
|
# Write block header
|
|
143
171
|
@bitstream.write_bits(is_last ? 1 : 0, 1) # Last block flag
|
|
144
172
|
@bitstream.write_bits(FIXED_HUFFMAN_BLOCK, 2) # Block type
|
|
@@ -151,6 +179,10 @@ module Cabriolet
|
|
|
151
179
|
|
|
152
180
|
# Write end-of-block symbol (256)
|
|
153
181
|
encode_literal(256)
|
|
182
|
+
|
|
183
|
+
if ENV["DEBUG_MSZIP_COMPRESS"]
|
|
184
|
+
warn "DEBUG compress_block: EXIT"
|
|
185
|
+
end
|
|
154
186
|
end
|
|
155
187
|
|
|
156
188
|
# Encode data using LZ77 matching and Huffman encoding
|
|
@@ -5,15 +5,12 @@ module Cabriolet
|
|
|
5
5
|
# Quantum compresses data using arithmetic coding and LZ77-based matching
|
|
6
6
|
# Based on the Quantum decompressor and libmspack qtmd.c implementation
|
|
7
7
|
#
|
|
8
|
-
# STATUS: Functional with known limitations
|
|
9
|
-
# - Literals: WORKING ✓
|
|
10
|
-
# - Short matches (3-13 bytes): WORKING ✓
|
|
11
|
-
# - Longer matches (14+ bytes): Limited support (known issue)
|
|
12
|
-
# - Simple data round-trips successfully
|
|
13
|
-
# - Complex repeated patterns may have issues
|
|
14
|
-
#
|
|
15
8
|
# The Quantum method was created by David Stafford, adapted by Microsoft
|
|
16
9
|
# Corporation.
|
|
10
|
+
#
|
|
11
|
+
# NOTE: This compressor is a work-in-progress. The arithmetic coding
|
|
12
|
+
# implementation needs refinement to match the decoder exactly.
|
|
13
|
+
# For now, this implementation focuses on correct structure.
|
|
17
14
|
# rubocop:disable Metrics/ClassLength
|
|
18
15
|
class Quantum < Base
|
|
19
16
|
# Frame size (32KB per frame)
|
|
@@ -21,7 +18,7 @@ module Cabriolet
|
|
|
21
18
|
|
|
22
19
|
# Match constants
|
|
23
20
|
MIN_MATCH = 3
|
|
24
|
-
MAX_MATCH =
|
|
21
|
+
MAX_MATCH = 259
|
|
25
22
|
|
|
26
23
|
# Position slot tables (same as decompressor)
|
|
27
24
|
POSITION_BASE = [
|
|
@@ -77,7 +74,8 @@ module Cabriolet
|
|
|
77
74
|
# @param output [System::FileHandle, System::MemoryHandle] Output handle
|
|
78
75
|
# @param buffer_size [Integer] Buffer size for I/O operations
|
|
79
76
|
# @param window_bits [Integer] Window size parameter (10-21)
|
|
80
|
-
def initialize(io_system, input, output, buffer_size, window_bits: 10
|
|
77
|
+
def initialize(io_system, input, output, buffer_size, window_bits: 10,
|
|
78
|
+
**_kwargs)
|
|
81
79
|
super(io_system, input, output, buffer_size)
|
|
82
80
|
|
|
83
81
|
# Validate window_bits
|
|
@@ -179,7 +177,6 @@ module Cabriolet
|
|
|
179
177
|
|
|
180
178
|
# Compress a single frame
|
|
181
179
|
def compress_frame(data)
|
|
182
|
-
# No header needed - the first 16 bits of encoded data will be read as C
|
|
183
180
|
pos = 0
|
|
184
181
|
|
|
185
182
|
while pos < data.bytesize
|
|
@@ -198,27 +195,25 @@ module Cabriolet
|
|
|
198
195
|
end
|
|
199
196
|
end
|
|
200
197
|
|
|
201
|
-
# Finish arithmetic coding
|
|
202
|
-
# We need to output enough bits to disambiguate the final range
|
|
198
|
+
# Finish arithmetic coding
|
|
203
199
|
finish_arithmetic_coding
|
|
204
200
|
end
|
|
205
201
|
|
|
206
|
-
# Finish arithmetic coding
|
|
202
|
+
# Finish arithmetic coding
|
|
207
203
|
def finish_arithmetic_coding
|
|
208
|
-
# Output
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
@
|
|
204
|
+
# Output pending underflow bits
|
|
205
|
+
if @underflow_bits.positive?
|
|
206
|
+
bit = if @l.anybits?(0x4000)
|
|
207
|
+
1
|
|
208
|
+
else
|
|
209
|
+
0
|
|
210
|
+
end
|
|
211
|
+
@bitstream.write_bits_msb(bit, 1)
|
|
212
|
+
@underflow_bits.times do
|
|
213
|
+
@bitstream.write_bits_msb(bit ^ 1, 1)
|
|
214
|
+
end
|
|
215
|
+
@underflow_bits = 0
|
|
220
216
|
end
|
|
221
|
-
@underflow_bits = 0
|
|
222
217
|
end
|
|
223
218
|
|
|
224
219
|
# Find best match in the sliding window
|
|
@@ -335,7 +330,6 @@ module Cabriolet
|
|
|
335
330
|
end
|
|
336
331
|
|
|
337
332
|
# Encode a symbol using arithmetic coding
|
|
338
|
-
# This is the inverse of GET_SYMBOL macro in qtmd.c
|
|
339
333
|
def encode_symbol(model, sym)
|
|
340
334
|
# Find symbol index in model
|
|
341
335
|
i = 0
|
|
@@ -346,33 +340,29 @@ module Cabriolet
|
|
|
346
340
|
"Symbol #{sym} not found in model"
|
|
347
341
|
end
|
|
348
342
|
|
|
349
|
-
# Calculate range
|
|
350
|
-
range = (@h - @l) + 1
|
|
343
|
+
# Calculate range - use decoder's formula
|
|
344
|
+
range = ((@h - @l) & 0xFFFF) + 1
|
|
351
345
|
symf = model.syms[0].cumfreq
|
|
352
346
|
|
|
353
|
-
# Update H and L
|
|
354
|
-
# Decoder uses syms[i-1] and syms[i], so encoder at index j
|
|
355
|
-
# should use syms[j] and syms[j+1] to make decoder land at i=j+1
|
|
356
|
-
# But decoder returns syms[i-1].sym, so it will return syms[j].sym ✓
|
|
347
|
+
# Update H and L
|
|
357
348
|
@h = @l + ((model.syms[i].cumfreq * range) / symf) - 1
|
|
358
349
|
@l += ((model.syms[i + 1].cumfreq * range) / symf)
|
|
359
350
|
|
|
360
|
-
# Update model frequencies
|
|
351
|
+
# Update model frequencies
|
|
361
352
|
j = i
|
|
362
353
|
while j >= 0
|
|
363
354
|
model.syms[j].cumfreq += 8
|
|
364
355
|
j -= 1
|
|
365
356
|
end
|
|
366
357
|
|
|
367
|
-
# Check if model needs updating
|
|
358
|
+
# Check if model needs updating
|
|
368
359
|
update_model(model) if model.syms[0].cumfreq > 3800
|
|
369
360
|
|
|
370
|
-
# Normalize range
|
|
361
|
+
# Normalize range
|
|
371
362
|
normalize_range
|
|
372
363
|
end
|
|
373
364
|
|
|
374
365
|
# Normalize arithmetic coding range and output bits
|
|
375
|
-
# This implements the encoder equivalent of the decoder's normalization (lines 109-121)
|
|
376
366
|
def normalize_range
|
|
377
367
|
loop do
|
|
378
368
|
if (@l & 0x8000) == (@h & 0x8000)
|
|
@@ -395,37 +385,36 @@ module Cabriolet
|
|
|
395
385
|
@h |= 0x4000
|
|
396
386
|
|
|
397
387
|
# Can't normalize further
|
|
398
|
-
|
|
399
388
|
end
|
|
400
389
|
|
|
401
|
-
# Shift range
|
|
390
|
+
# Shift range
|
|
402
391
|
@l = (@l << 1) & 0xFFFF
|
|
403
392
|
@h = ((@h << 1) | 1) & 0xFFFF
|
|
404
393
|
end
|
|
405
394
|
end
|
|
406
395
|
|
|
407
|
-
# Update model statistics
|
|
396
|
+
# Update model statistics
|
|
408
397
|
def update_model(model)
|
|
409
398
|
model.shiftsleft -= 1
|
|
410
399
|
|
|
411
400
|
if model.shiftsleft.positive?
|
|
412
|
-
# Simple shift
|
|
401
|
+
# Simple shift
|
|
413
402
|
(model.entries - 1).downto(0) do |i|
|
|
414
403
|
model.syms[i].cumfreq >>= 1
|
|
415
404
|
model.syms[i].cumfreq = model.syms[i + 1].cumfreq + 1 if model.syms[i].cumfreq <= model.syms[i + 1].cumfreq
|
|
416
405
|
end
|
|
417
406
|
else
|
|
418
|
-
# Full rebuild
|
|
407
|
+
# Full rebuild
|
|
419
408
|
model.shiftsleft = 50
|
|
420
409
|
|
|
421
|
-
# Convert cumfreq to frequencies
|
|
410
|
+
# Convert cumfreq to frequencies
|
|
422
411
|
(0...model.entries).each do |i|
|
|
423
412
|
model.syms[i].cumfreq -= model.syms[i + 1].cumfreq
|
|
424
413
|
model.syms[i].cumfreq += 1
|
|
425
414
|
model.syms[i].cumfreq >>= 1
|
|
426
415
|
end
|
|
427
416
|
|
|
428
|
-
# Sort by frequency
|
|
417
|
+
# Sort by frequency
|
|
429
418
|
(0...(model.entries - 1)).each do |i|
|
|
430
419
|
((i + 1)...model.entries).each do |j|
|
|
431
420
|
if model.syms[i].cumfreq < model.syms[j].cumfreq
|
|
@@ -434,7 +423,7 @@ module Cabriolet
|
|
|
434
423
|
end
|
|
435
424
|
end
|
|
436
425
|
|
|
437
|
-
# Convert back to cumulative frequencies
|
|
426
|
+
# Convert back to cumulative frequencies
|
|
438
427
|
(model.entries - 1).downto(0) do |i|
|
|
439
428
|
model.syms[i].cumfreq += model.syms[i + 1].cumfreq
|
|
440
429
|
end
|
|
@@ -12,7 +12,7 @@ module Cabriolet
|
|
|
12
12
|
# @param input [System::FileHandle, System::MemoryHandle] Input handle
|
|
13
13
|
# @param output [System::FileHandle, System::MemoryHandle] Output handle
|
|
14
14
|
# @param buffer_size [Integer] Buffer size for I/O operations
|
|
15
|
-
def initialize(io_system, input, output, buffer_size)
|
|
15
|
+
def initialize(io_system, input, output, buffer_size, **_kwargs)
|
|
16
16
|
@io_system = io_system
|
|
17
17
|
@input = input
|
|
18
18
|
@output = output
|