cabriolet 0.1.2 → 0.2.0

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.
Files changed (77):
  1. checksums.yaml +4 -4
  2. data/README.adoc +700 -38
  3. data/lib/cabriolet/algorithm_factory.rb +250 -0
  4. data/lib/cabriolet/base_compressor.rb +206 -0
  5. data/lib/cabriolet/binary/bitstream.rb +154 -14
  6. data/lib/cabriolet/binary/bitstream_writer.rb +129 -17
  7. data/lib/cabriolet/binary/chm_structures.rb +2 -2
  8. data/lib/cabriolet/binary/hlp_structures.rb +258 -37
  9. data/lib/cabriolet/binary/lit_structures.rb +231 -65
  10. data/lib/cabriolet/binary/oab_structures.rb +17 -1
  11. data/lib/cabriolet/cab/command_handler.rb +226 -0
  12. data/lib/cabriolet/cab/compressor.rb +35 -43
  13. data/lib/cabriolet/cab/decompressor.rb +14 -19
  14. data/lib/cabriolet/cab/extractor.rb +140 -31
  15. data/lib/cabriolet/chm/command_handler.rb +227 -0
  16. data/lib/cabriolet/chm/compressor.rb +7 -3
  17. data/lib/cabriolet/chm/decompressor.rb +39 -21
  18. data/lib/cabriolet/chm/parser.rb +5 -2
  19. data/lib/cabriolet/cli/base_command_handler.rb +127 -0
  20. data/lib/cabriolet/cli/command_dispatcher.rb +140 -0
  21. data/lib/cabriolet/cli/command_registry.rb +83 -0
  22. data/lib/cabriolet/cli.rb +356 -607
  23. data/lib/cabriolet/compressors/base.rb +1 -1
  24. data/lib/cabriolet/compressors/lzx.rb +241 -54
  25. data/lib/cabriolet/compressors/mszip.rb +35 -3
  26. data/lib/cabriolet/compressors/quantum.rb +34 -45
  27. data/lib/cabriolet/decompressors/base.rb +1 -1
  28. data/lib/cabriolet/decompressors/lzss.rb +13 -3
  29. data/lib/cabriolet/decompressors/lzx.rb +70 -33
  30. data/lib/cabriolet/decompressors/mszip.rb +126 -39
  31. data/lib/cabriolet/decompressors/quantum.rb +3 -2
  32. data/lib/cabriolet/errors.rb +3 -0
  33. data/lib/cabriolet/file_entry.rb +156 -0
  34. data/lib/cabriolet/file_manager.rb +144 -0
  35. data/lib/cabriolet/hlp/command_handler.rb +282 -0
  36. data/lib/cabriolet/hlp/compressor.rb +28 -238
  37. data/lib/cabriolet/hlp/decompressor.rb +107 -147
  38. data/lib/cabriolet/hlp/parser.rb +52 -101
  39. data/lib/cabriolet/hlp/quickhelp/compression_stream.rb +138 -0
  40. data/lib/cabriolet/hlp/quickhelp/compressor.rb +626 -0
  41. data/lib/cabriolet/hlp/quickhelp/decompressor.rb +558 -0
  42. data/lib/cabriolet/hlp/quickhelp/huffman_stream.rb +74 -0
  43. data/lib/cabriolet/hlp/quickhelp/huffman_tree.rb +167 -0
  44. data/lib/cabriolet/hlp/quickhelp/parser.rb +274 -0
  45. data/lib/cabriolet/hlp/winhelp/btree_builder.rb +289 -0
  46. data/lib/cabriolet/hlp/winhelp/compressor.rb +400 -0
  47. data/lib/cabriolet/hlp/winhelp/decompressor.rb +192 -0
  48. data/lib/cabriolet/hlp/winhelp/parser.rb +484 -0
  49. data/lib/cabriolet/hlp/winhelp/zeck_lz77.rb +271 -0
  50. data/lib/cabriolet/huffman/tree.rb +85 -1
  51. data/lib/cabriolet/kwaj/command_handler.rb +213 -0
  52. data/lib/cabriolet/kwaj/compressor.rb +7 -3
  53. data/lib/cabriolet/kwaj/decompressor.rb +18 -12
  54. data/lib/cabriolet/lit/command_handler.rb +221 -0
  55. data/lib/cabriolet/lit/compressor.rb +633 -38
  56. data/lib/cabriolet/lit/decompressor.rb +518 -152
  57. data/lib/cabriolet/lit/parser.rb +670 -0
  58. data/lib/cabriolet/models/hlp_file.rb +130 -29
  59. data/lib/cabriolet/models/hlp_header.rb +105 -17
  60. data/lib/cabriolet/models/lit_header.rb +212 -25
  61. data/lib/cabriolet/models/szdd_header.rb +10 -2
  62. data/lib/cabriolet/models/winhelp_header.rb +127 -0
  63. data/lib/cabriolet/oab/command_handler.rb +257 -0
  64. data/lib/cabriolet/oab/compressor.rb +17 -8
  65. data/lib/cabriolet/oab/decompressor.rb +41 -10
  66. data/lib/cabriolet/offset_calculator.rb +81 -0
  67. data/lib/cabriolet/plugin.rb +233 -0
  68. data/lib/cabriolet/plugin_manager.rb +453 -0
  69. data/lib/cabriolet/plugin_validator.rb +422 -0
  70. data/lib/cabriolet/system/io_system.rb +3 -0
  71. data/lib/cabriolet/system/memory_handle.rb +17 -4
  72. data/lib/cabriolet/szdd/command_handler.rb +217 -0
  73. data/lib/cabriolet/szdd/compressor.rb +15 -11
  74. data/lib/cabriolet/szdd/decompressor.rb +18 -9
  75. data/lib/cabriolet/version.rb +1 -1
  76. data/lib/cabriolet.rb +67 -17
  77. metadata +33 -2
@@ -15,7 +15,7 @@ module Cabriolet
15
15
  # @param input [System::FileHandle, System::MemoryHandle] Input handle
16
16
  # @param output [System::FileHandle, System::MemoryHandle] Output handle
17
17
  # @param buffer_size [Integer] Buffer size for I/O operations
18
- def initialize(io_system, input, output, buffer_size)
18
+ def initialize(io_system, input, output, buffer_size, **_kwargs)
19
19
  @io_system = io_system
20
20
  @input = input
21
21
  @output = output
@@ -1,5 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require_relative "base"
3
4
  require_relative "../binary/bitstream_writer"
4
5
  require_relative "../huffman/encoder"
5
6
 
@@ -66,7 +67,8 @@ module Cabriolet
66
67
  # @param output [System::FileHandle, System::MemoryHandle] Output handle
67
68
  # @param buffer_size [Integer] Buffer size for I/O operations
68
69
  # @param window_bits [Integer] Window size (15-21 for regular LZX)
69
- def initialize(io_system, input, output, buffer_size, window_bits: 15)
70
+ def initialize(io_system, input, output, buffer_size, window_bits: 15,
71
+ **_kwargs)
70
72
  super(io_system, input, output, buffer_size)
71
73
 
72
74
  # Validate window_bits
@@ -82,8 +84,9 @@ module Cabriolet
82
84
  @num_offsets = POSITION_SLOTS[window_bits - 15] << 3
83
85
  @maintree_maxsymbols = NUM_CHARS + @num_offsets
84
86
 
85
- # Initialize bitstream writer
86
- @bitstream = Binary::BitstreamWriter.new(io_system, output, buffer_size)
87
+ # Initialize bitstream writer (LZX uses MSB-first bit ordering per libmspack lzxd.c)
88
+ @bitstream = Binary::BitstreamWriter.new(io_system, output,
89
+ buffer_size, bit_order: :msb)
87
90
 
88
91
  # Initialize sliding window for LZ77
89
92
  @window = "\0" * @window_size
@@ -119,6 +122,7 @@ module Cabriolet
119
122
  frame_data = input_data[pos, frame_size]
120
123
 
121
124
  # Compress this frame
125
+ # TODO: Use compress_frame_verbatim once tree encoding is fixed
122
126
  compress_frame(frame_data)
123
127
 
124
128
  pos += frame_size
@@ -152,19 +156,46 @@ module Cabriolet
152
156
  # @param data [String] Frame data to compress
153
157
  # @return [void]
154
158
  def compress_frame(data)
155
- # Use UNCOMPRESSED blocks for now (simplest approach)
156
- write_block_header(BLOCKTYPE_UNCOMPRESSED, data.bytesize)
159
+ # For uncompressed blocks, block length is just the frame data size
160
+ # (offset registers are NOT included in the block length field)
161
+ block_length = data.bytesize
157
162
 
158
- # Write R0, R1, R2 (required for uncompressed blocks)
163
+ # Write UNCOMPRESSED block header
164
+ write_block_header(BLOCKTYPE_UNCOMPRESSED, block_length)
165
+
166
+ # Write offset registers (R0, R1, R2)
159
167
  write_offset_registers
160
168
 
161
- # Write raw data
169
+ # Write raw uncompressed data
162
170
  data.each_byte do |byte|
163
171
  @bitstream.write_bits(byte, 8)
164
172
  end
173
+ end
174
+
175
+ # Compress a single frame (32KB) - VERBATIM version (currently disabled)
176
+ #
177
+ # @param data [String] Frame data to compress
178
+ # @return [void]
179
+ def compress_frame_verbatim(data)
180
+ # Reset frequency statistics for each frame
181
+ @literal_freq.fill(0)
182
+ @match_freq.fill(0)
183
+ @length_freq.fill(0)
184
+
185
+ # Analyze frame to generate LZ77 tokens
186
+ tokens = analyze_frame(data)
165
187
 
166
- # Ensure byte alignment at end of frame for multi-frame support
167
- @bitstream.byte_align
188
+ # Build Huffman trees from statistics
189
+ build_trees
190
+
191
+ # Write VERBATIM block header
192
+ write_block_header(BLOCKTYPE_VERBATIM, data.bytesize)
193
+
194
+ # Write Huffman tree definitions
195
+ write_trees
196
+
197
+ # Encode all tokens using the Huffman codes
198
+ encode_tokens(tokens)
168
199
  end
169
200
 
170
201
  # Analyze frame and generate LZ77 tokens
@@ -301,68 +332,224 @@ module Cabriolet
301
332
  slot
302
333
  end
303
334
 
335
+ # Build Huffman code lengths from frequencies
336
+ #
337
+ # Uses a simplified approach: assign equal lengths to all symbols.
338
+ # This guarantees valid Huffman trees that satisfy Kraft inequality.
339
+ #
340
+ # @param freqs [Array<Integer>] Symbol frequencies
341
+ # @param num_symbols [Integer] Number of symbols
342
+ # @return [Array<Integer>] Code lengths
343
+ def build_tree_lengths(freqs, num_symbols)
344
+ lengths = Array.new(num_symbols, 0)
345
+
346
+ # Get symbols with non-zero frequencies
347
+ non_zero_symbols = freqs.each_with_index.select do |freq, _|
348
+ freq.positive?
349
+ end.map { |_, sym| sym }
350
+
351
+ # Handle edge cases
352
+ if non_zero_symbols.empty?
353
+ # Empty tree: create minimal valid tree with 2 symbols
354
+ lengths[0] = 1
355
+ lengths[1] = 1
356
+ return lengths
357
+ elsif non_zero_symbols.size == 1
358
+ # Single symbol: need at least 2 symbols for valid Huffman tree
359
+ symbol = non_zero_symbols[0]
360
+ lengths[symbol] = 1
361
+ dummy = symbol.zero? ? 1 : 0
362
+ lengths[dummy] = 1
363
+ return lengths
364
+ end
365
+
366
+ # Calculate required length: ceil(log2(count))
367
+ count = non_zero_symbols.size
368
+ bit_length = 1
369
+ while (1 << bit_length) < count
370
+ bit_length += 1
371
+ end
372
+
373
+ # Assign same length to all non-zero symbols
374
+ non_zero_symbols.each do |symbol|
375
+ lengths[symbol] = bit_length
376
+ end
377
+
378
+ # Pad with dummy symbols to make tree complete (2^bit_length total symbols)
379
+ # This ensures Kraft inequality sum equals exactly 1.0
380
+ total_needed = 1 << bit_length
381
+ dummy_count = total_needed - count
382
+
383
+ if dummy_count.positive?
384
+ dummy_index = 0
385
+ while dummy_count.positive? && dummy_index < num_symbols
386
+ if lengths[dummy_index].zero?
387
+ lengths[dummy_index] = bit_length
388
+ dummy_count -= 1
389
+ end
390
+ dummy_index += 1
391
+ end
392
+ end
393
+
394
+ lengths
395
+ end
396
+
304
397
  # Build Huffman trees from frequency statistics
305
398
  #
399
+ # This creates three trees for LZX compression:
400
+ # 1. Main tree: literals (0-255) + match position/length combinations
401
+ # 2. Length tree: additional length symbols for long matches
402
+ # 3. Pretree: encodes the code lengths of main/length trees
403
+ #
306
404
  # @return [void]
307
405
  def build_trees
308
- # Build main tree (literals + matches)
309
- maintree_freqs = @literal_freq + @match_freq
310
- @maintree_lengths = build_tree_lengths(maintree_freqs,
406
+ # Step 1: Combine literal and match frequencies for main tree
407
+ maintree_freq = @literal_freq + @match_freq
408
+
409
+ # Step 2: Build main tree code lengths
410
+ @maintree_lengths = build_tree_lengths(maintree_freq,
311
411
  @maintree_maxsymbols)
312
- @maintree_codes = Huffman::Encoder.build_codes(@maintree_lengths,
313
- @maintree_maxsymbols)
314
412
 
315
- # Build length tree
413
+ # Step 3: Build length tree code lengths
316
414
  @length_lengths = build_tree_lengths(@length_freq, LENGTH_MAXSYMBOLS)
415
+
416
+ # Step 4: Calculate pretree frequencies by simulating tree encoding
417
+ pretree_freq = calculate_pretree_frequencies
418
+
419
+ # Step 5: Build pretree code lengths
420
+ @pretree_lengths = build_tree_lengths(pretree_freq, PRETREE_MAXSYMBOLS)
421
+
422
+ # Step 6: Generate code tables from lengths
423
+ @maintree_codes = Huffman::Encoder.build_codes(@maintree_lengths,
424
+ @maintree_maxsymbols)
317
425
  @length_codes = Huffman::Encoder.build_codes(@length_lengths,
318
426
  LENGTH_MAXSYMBOLS)
319
-
320
- # Build pretree (used to encode the other trees)
321
- # Create a valid Huffman tree that satisfies Kraft inequality
322
- # For 20 symbols, use: 2@3bits + 6@4bits + 12@5bits = 1.0
323
- @pretree_lengths = Array.new(PRETREE_MAXSYMBOLS, 0)
324
- # Most common symbols (0-1): 3 bits
325
- (0..1).each { |i| @pretree_lengths[i] = 3 }
326
- # Common symbols (2-7): 4 bits
327
- (2..7).each { |i| @pretree_lengths[i] = 4 }
328
- # Less common symbols (8-19): 5 bits
329
- (8..19).each { |i| @pretree_lengths[i] = 5 }
330
427
  @pretree_codes = Huffman::Encoder.build_codes(@pretree_lengths,
331
428
  PRETREE_MAXSYMBOLS)
332
429
  end
333
430
 
334
- # Build Huffman code lengths from frequencies
431
+ # Calculate pretree symbol frequencies
335
432
  #
336
- # @param freqs [Array<Integer>] Symbol frequencies
337
- # @param num_symbols [Integer] Number of symbols
338
- # @return [Array<Integer>] Code lengths
339
- def build_tree_lengths(freqs, num_symbols)
340
- # Simple implementation: assign lengths based on frequency
341
- # Higher frequency = shorter code
342
- lengths = Array.new(num_symbols, 0)
433
+ # The pretree encodes the code lengths of the main and length trees.
434
+ # This method simulates the tree encoding process to determine which
435
+ # pretree symbols will be needed.
436
+ #
437
+ # @return [Array<Integer>] Frequency array for pretree symbols (0-19)
438
+ def calculate_pretree_frequencies
439
+ pretree_freq = Array.new(PRETREE_MAXSYMBOLS, 0)
343
440
 
344
- # Get non-zero frequencies
345
- non_zero = freqs.each_with_index.select { |freq, _| freq.positive? }
346
- return lengths if non_zero.empty?
347
-
348
- # Sort by frequency (descending)
349
- sorted = non_zero.sort_by { |freq, _| -freq }
350
-
351
- # Assign lengths using simple strategy
352
- sorted.each_with_index do |(_, symbol), index|
353
- # Assign shorter codes to more frequent symbols
354
- lengths[symbol] = if index < num_symbols / 8
355
- 4
356
- elsif index < num_symbols / 4
357
- 6
358
- elsif index < num_symbols / 2
359
- 8
360
- else
361
- 10
362
- end
441
+ # Count symbols needed to encode main tree (two parts)
442
+ count_pretree_symbols(@maintree_lengths, 0, NUM_CHARS, pretree_freq)
443
+ count_pretree_symbols(@maintree_lengths, NUM_CHARS,
444
+ @maintree_maxsymbols, pretree_freq)
445
+
446
+ # Count symbols needed to encode length tree
447
+ count_pretree_symbols(@length_lengths, 0, NUM_SECONDARY_LENGTHS,
448
+ pretree_freq)
449
+
450
+ pretree_freq
451
+ end
452
+
453
+ # Count pretree symbols needed to encode a tree
454
+ #
455
+ # This simulates the write_tree_with_pretree encoding process to count
456
+ # which pretree symbols will be used, allowing us to build an optimal
457
+ # pretree.
458
+ #
459
+ # @param lengths [Array<Integer>] Tree lengths to encode
460
+ # @param start [Integer] Start index
461
+ # @param end_idx [Integer] End index (exclusive)
462
+ # @param freq [Array<Integer>] Frequency array to update
463
+ # @return [void]
464
+ def count_pretree_symbols(lengths, start, end_idx, freq)
465
+ i = start
466
+ prev_length = 0
467
+
468
+ while i < end_idx
469
+ length = lengths[i]
470
+
471
+ if length.zero?
472
+ # Count run of zeros
473
+ zero_count = 0
474
+ while i < end_idx && lengths[i].zero? && zero_count < 138
475
+ zero_count += 1
476
+ i += 1
477
+ end
478
+
479
+ # Encode long runs with symbol 18
480
+ if zero_count >= 20
481
+ while zero_count >= 20
482
+ run = [zero_count, 51].min
483
+ freq[18] += 1
484
+ zero_count -= run
485
+ end
486
+ end
487
+
488
+ # Encode medium runs with symbol 17
489
+ if zero_count >= 4
490
+ run = [zero_count, 19].min
491
+ freq[17] += 1
492
+ zero_count -= run
493
+ end
494
+
495
+ # Encode remaining short runs as deltas
496
+ if zero_count.positive?
497
+ zero_count.times do
498
+ delta = (17 - prev_length) % 17
499
+ freq[delta] += 1
500
+ prev_length = 0
501
+ end
502
+ end
503
+ else
504
+ # Encode as delta from previous length
505
+ delta = (length - prev_length) % 17
506
+ freq[delta] += 1
507
+ prev_length = length
508
+ i += 1
509
+ end
363
510
  end
511
+ end
364
512
 
365
- lengths
513
+ # Calculate code lengths by traversing Huffman tree
514
+ #
515
+ # @param node [Array] Tree node [freq, symbol, left, right, depth]
516
+ # @param depth [Integer] Current depth
517
+ # @param lengths [Array<Integer>] Output array for lengths
518
+ # @return [void]
519
+ def calculate_depths(node, depth, lengths)
520
+ return unless node
521
+
522
+ _, symbol, left, right, = node
523
+
524
+ if symbol.nil?
525
+ # Internal node: recurse to children
526
+ calculate_depths(left, depth + 1, lengths)
527
+ calculate_depths(right, depth + 1, lengths)
528
+ else
529
+ # Leaf node: record length
530
+ lengths[symbol] = depth
531
+ end
532
+ end
533
+
534
+ # Calculate code lengths by traversing Huffman tree
535
+ #
536
+ # @param node [Array] Tree node [freq, symbol, left, right]
537
+ # @param depth [Integer] Current depth
538
+ # @param lengths [Array<Integer>] Output array for lengths
539
+ # @return [void]
540
+ def calculate_code_lengths(node, depth, lengths)
541
+ return unless node
542
+
543
+ _, symbol, left, right = node
544
+
545
+ if symbol.nil?
546
+ # Internal node: recurse to children
547
+ calculate_code_lengths(left, depth + 1, lengths)
548
+ calculate_code_lengths(right, depth + 1, lengths)
549
+ else
550
+ # Leaf node: record length
551
+ lengths[symbol] = depth
552
+ end
366
553
  end
367
554
 
368
555
  # Write block header
@@ -56,7 +56,7 @@ module Cabriolet
56
56
  # @param input [System::FileHandle, System::MemoryHandle] Input handle
57
57
  # @param output [System::FileHandle, System::MemoryHandle] Output handle
58
58
  # @param buffer_size [Integer] Buffer size for I/O operations
59
- def initialize(io_system, input, output, buffer_size)
59
+ def initialize(io_system, input, output, buffer_size, **_kwargs)
60
60
  super
61
61
 
62
62
  # Initialize bitstream writer
@@ -88,10 +88,15 @@ module Cabriolet
88
88
 
89
89
  # Process data in FRAME_SIZE chunks
90
90
  # Each frame is independent and contains blocks ending with last_block=1
91
+ frame_num = 0
91
92
  while pos < input_data.bytesize
92
93
  chunk_size = [FRAME_SIZE, input_data.bytesize - pos].min
93
94
  chunk = input_data[pos, chunk_size]
94
95
 
96
+ if ENV["DEBUG_MSZIP_COMPRESS"]
97
+ warn "DEBUG compress: Frame #{frame_num}: pos=#{pos}, chunk_size=#{chunk_size}"
98
+ end
99
+
95
100
  # Write CK signature
96
101
  write_signature
97
102
 
@@ -99,11 +104,19 @@ module Cabriolet
99
104
  # Each frame's block is always marked as last within that frame
100
105
  compress_block(chunk, true)
101
106
 
107
+ # Flush bitstream after each frame to ensure data is written
108
+ @bitstream.flush
109
+
110
+ if ENV["DEBUG_MSZIP_COMPRESS"]
111
+ warn "DEBUG compress: Frame #{frame_num} complete, flushed"
112
+ end
113
+
102
114
  pos += chunk_size
103
115
  total_written += chunk_size
116
+ frame_num += 1
104
117
  end
105
118
 
106
- # Flush any remaining bits
119
+ # Final flush (may not be needed now but keep for safety)
107
120
  @bitstream.flush
108
121
 
109
122
  total_written
@@ -129,8 +142,19 @@ module Cabriolet
129
142
  #
130
143
  # @return [void]
131
144
  def write_signature
145
+ if ENV["DEBUG_MSZIP_COMPRESS"]
146
+ warn "DEBUG write_signature: ENTRY"
147
+ end
132
148
  @bitstream.byte_align
133
- SIGNATURE.each { |byte| @bitstream.write_raw_byte(byte) }
149
+ SIGNATURE.each do |byte|
150
+ if ENV["DEBUG_MSZIP_COMPRESS"]
151
+ warn "DEBUG write_signature: Writing byte 0x#{byte.to_s(16)}"
152
+ end
153
+ @bitstream.write_raw_byte(byte)
154
+ end
155
+ if ENV["DEBUG_MSZIP_COMPRESS"]
156
+ warn "DEBUG write_signature: EXIT"
157
+ end
134
158
  end
135
159
 
136
160
  # Compress a single block using fixed Huffman encoding
@@ -139,6 +163,10 @@ module Cabriolet
139
163
  # @param is_last [Boolean] Whether this is the last block
140
164
  # @return [void]
141
165
  def compress_block(data, is_last)
166
+ if ENV["DEBUG_MSZIP_COMPRESS"]
167
+ warn "DEBUG compress_block: ENTRY data_size=#{data.bytesize} is_last=#{is_last}"
168
+ end
169
+
142
170
  # Write block header
143
171
  @bitstream.write_bits(is_last ? 1 : 0, 1) # Last block flag
144
172
  @bitstream.write_bits(FIXED_HUFFMAN_BLOCK, 2) # Block type
@@ -151,6 +179,10 @@ module Cabriolet
151
179
 
152
180
  # Write end-of-block symbol (256)
153
181
  encode_literal(256)
182
+
183
+ if ENV["DEBUG_MSZIP_COMPRESS"]
184
+ warn "DEBUG compress_block: EXIT"
185
+ end
154
186
  end
155
187
 
156
188
  # Encode data using LZ77 matching and Huffman encoding
@@ -5,15 +5,12 @@ module Cabriolet
5
5
  # Quantum compresses data using arithmetic coding and LZ77-based matching
6
6
  # Based on the Quantum decompressor and libmspack qtmd.c implementation
7
7
  #
8
- # STATUS: Functional with known limitations
9
- # - Literals: WORKING ✓
10
- # - Short matches (3-13 bytes): WORKING ✓
11
- # - Longer matches (14+ bytes): Limited support (known issue)
12
- # - Simple data round-trips successfully
13
- # - Complex repeated patterns may have issues
14
- #
15
8
  # The Quantum method was created by David Stafford, adapted by Microsoft
16
9
  # Corporation.
10
+ #
11
+ # NOTE: This compressor is a work-in-progress. The arithmetic coding
12
+ # implementation needs refinement to match the decoder exactly.
13
+ # For now, this implementation focuses on correct structure.
17
14
  # rubocop:disable Metrics/ClassLength
18
15
  class Quantum < Base
19
16
  # Frame size (32KB per frame)
@@ -21,7 +18,7 @@ module Cabriolet
21
18
 
22
19
  # Match constants
23
20
  MIN_MATCH = 3
24
- MAX_MATCH = 1028
21
+ MAX_MATCH = 259
25
22
 
26
23
  # Position slot tables (same as decompressor)
27
24
  POSITION_BASE = [
@@ -77,7 +74,8 @@ module Cabriolet
77
74
  # @param output [System::FileHandle, System::MemoryHandle] Output handle
78
75
  # @param buffer_size [Integer] Buffer size for I/O operations
79
76
  # @param window_bits [Integer] Window size parameter (10-21)
80
- def initialize(io_system, input, output, buffer_size, window_bits: 10)
77
+ def initialize(io_system, input, output, buffer_size, window_bits: 10,
78
+ **_kwargs)
81
79
  super(io_system, input, output, buffer_size)
82
80
 
83
81
  # Validate window_bits
@@ -179,7 +177,6 @@ module Cabriolet
179
177
 
180
178
  # Compress a single frame
181
179
  def compress_frame(data)
182
- # No header needed - the first 16 bits of encoded data will be read as C
183
180
  pos = 0
184
181
 
185
182
  while pos < data.bytesize
@@ -198,27 +195,25 @@ module Cabriolet
198
195
  end
199
196
  end
200
197
 
201
- # Finish arithmetic coding - output final range
202
- # We need to output enough bits to disambiguate the final range
198
+ # Finish arithmetic coding
203
199
  finish_arithmetic_coding
204
200
  end
205
201
 
206
- # Finish arithmetic coding by outputting the final state
202
+ # Finish arithmetic coding
207
203
  def finish_arithmetic_coding
208
- # Output enough bits to ensure decoder can decode correctly
209
- # We need to output a value that falls within [L, H)
210
- # A common approach is to output L plus half the range
211
- @underflow_bits += 1
212
- bit = if @l.anybits?(0x4000)
213
- 1
214
- else
215
- 0
216
- end
217
- @bitstream.write_bits_msb(bit, 1)
218
- @underflow_bits.times do
219
- @bitstream.write_bits_msb(bit ^ 1, 1)
204
+ # Output pending underflow bits
205
+ if @underflow_bits.positive?
206
+ bit = if @l.anybits?(0x4000)
207
+ 1
208
+ else
209
+ 0
210
+ end
211
+ @bitstream.write_bits_msb(bit, 1)
212
+ @underflow_bits.times do
213
+ @bitstream.write_bits_msb(bit ^ 1, 1)
214
+ end
215
+ @underflow_bits = 0
220
216
  end
221
- @underflow_bits = 0
222
217
  end
223
218
 
224
219
  # Find best match in the sliding window
@@ -335,7 +330,6 @@ module Cabriolet
335
330
  end
336
331
 
337
332
  # Encode a symbol using arithmetic coding
338
- # This is the inverse of GET_SYMBOL macro in qtmd.c
339
333
  def encode_symbol(model, sym)
340
334
  # Find symbol index in model
341
335
  i = 0
@@ -346,33 +340,29 @@ module Cabriolet
346
340
  "Symbol #{sym} not found in model"
347
341
  end
348
342
 
349
- # Calculate range (matching decoder line 93, 101-102)
350
- range = (@h - @l) + 1
343
+ # Calculate range - use decoder's formula
344
+ range = ((@h - @l) & 0xFFFF) + 1
351
345
  symf = model.syms[0].cumfreq
352
346
 
353
- # Update H and L (matching decoder lines 103-104)
354
- # Decoder uses syms[i-1] and syms[i], so encoder at index j
355
- # should use syms[j] and syms[j+1] to make decoder land at i=j+1
356
- # But decoder returns syms[i-1].sym, so it will return syms[j].sym ✓
347
+ # Update H and L
357
348
  @h = @l + ((model.syms[i].cumfreq * range) / symf) - 1
358
349
  @l += ((model.syms[i + 1].cumfreq * range) / symf)
359
350
 
360
- # Update model frequencies (matching decoder line 106)
351
+ # Update model frequencies
361
352
  j = i
362
353
  while j >= 0
363
354
  model.syms[j].cumfreq += 8
364
355
  j -= 1
365
356
  end
366
357
 
367
- # Check if model needs updating (matching decoder line 107)
358
+ # Check if model needs updating
368
359
  update_model(model) if model.syms[0].cumfreq > 3800
369
360
 
370
- # Normalize range (matching decoder lines 109-121)
361
+ # Normalize range
371
362
  normalize_range
372
363
  end
373
364
 
374
365
  # Normalize arithmetic coding range and output bits
375
- # This implements the encoder equivalent of the decoder's normalization (lines 109-121)
376
366
  def normalize_range
377
367
  loop do
378
368
  if (@l & 0x8000) == (@h & 0x8000)
@@ -395,37 +385,36 @@ module Cabriolet
395
385
  @h |= 0x4000
396
386
 
397
387
  # Can't normalize further
398
-
399
388
  end
400
389
 
401
- # Shift range (both for underflow and MSB match cases)
390
+ # Shift range
402
391
  @l = (@l << 1) & 0xFFFF
403
392
  @h = ((@h << 1) | 1) & 0xFFFF
404
393
  end
405
394
  end
406
395
 
407
- # Update model statistics (matching qtmd_update_model exactly)
396
+ # Update model statistics
408
397
  def update_model(model)
409
398
  model.shiftsleft -= 1
410
399
 
411
400
  if model.shiftsleft.positive?
412
- # Simple shift (matching decoder lines 129-135)
401
+ # Simple shift
413
402
  (model.entries - 1).downto(0) do |i|
414
403
  model.syms[i].cumfreq >>= 1
415
404
  model.syms[i].cumfreq = model.syms[i + 1].cumfreq + 1 if model.syms[i].cumfreq <= model.syms[i + 1].cumfreq
416
405
  end
417
406
  else
418
- # Full rebuild (matching decoder lines 137-163)
407
+ # Full rebuild
419
408
  model.shiftsleft = 50
420
409
 
421
- # Convert cumfreq to frequencies (lines 139-145)
410
+ # Convert cumfreq to frequencies
422
411
  (0...model.entries).each do |i|
423
412
  model.syms[i].cumfreq -= model.syms[i + 1].cumfreq
424
413
  model.syms[i].cumfreq += 1
425
414
  model.syms[i].cumfreq >>= 1
426
415
  end
427
416
 
428
- # Sort by frequency (selection sort for stability, lines 150-158)
417
+ # Sort by frequency
429
418
  (0...(model.entries - 1)).each do |i|
430
419
  ((i + 1)...model.entries).each do |j|
431
420
  if model.syms[i].cumfreq < model.syms[j].cumfreq
@@ -434,7 +423,7 @@ module Cabriolet
434
423
  end
435
424
  end
436
425
 
437
- # Convert back to cumulative frequencies (lines 161-163)
426
+ # Convert back to cumulative frequencies
438
427
  (model.entries - 1).downto(0) do |i|
439
428
  model.syms[i].cumfreq += model.syms[i + 1].cumfreq
440
429
  end
@@ -12,7 +12,7 @@ module Cabriolet
12
12
  # @param input [System::FileHandle, System::MemoryHandle] Input handle
13
13
  # @param output [System::FileHandle, System::MemoryHandle] Output handle
14
14
  # @param buffer_size [Integer] Buffer size for I/O operations
15
- def initialize(io_system, input, output, buffer_size)
15
+ def initialize(io_system, input, output, buffer_size, **_kwargs)
16
16
  @io_system = io_system
17
17
  @input = input
18
18
  @output = output