cabriolet 0.1.2 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. checksums.yaml +4 -4
  2. data/README.adoc +703 -38
  3. data/lib/cabriolet/algorithm_factory.rb +250 -0
  4. data/lib/cabriolet/base_compressor.rb +206 -0
  5. data/lib/cabriolet/binary/bitstream.rb +167 -16
  6. data/lib/cabriolet/binary/bitstream_writer.rb +150 -21
  7. data/lib/cabriolet/binary/chm_structures.rb +2 -2
  8. data/lib/cabriolet/binary/hlp_structures.rb +258 -37
  9. data/lib/cabriolet/binary/lit_structures.rb +231 -65
  10. data/lib/cabriolet/binary/oab_structures.rb +17 -1
  11. data/lib/cabriolet/cab/command_handler.rb +226 -0
  12. data/lib/cabriolet/cab/compressor.rb +108 -84
  13. data/lib/cabriolet/cab/decompressor.rb +16 -20
  14. data/lib/cabriolet/cab/extractor.rb +142 -66
  15. data/lib/cabriolet/cab/file_compression_work.rb +52 -0
  16. data/lib/cabriolet/cab/file_compression_worker.rb +89 -0
  17. data/lib/cabriolet/checksum.rb +49 -0
  18. data/lib/cabriolet/chm/command_handler.rb +227 -0
  19. data/lib/cabriolet/chm/compressor.rb +7 -3
  20. data/lib/cabriolet/chm/decompressor.rb +39 -21
  21. data/lib/cabriolet/chm/parser.rb +5 -2
  22. data/lib/cabriolet/cli/base_command_handler.rb +127 -0
  23. data/lib/cabriolet/cli/command_dispatcher.rb +140 -0
  24. data/lib/cabriolet/cli/command_registry.rb +83 -0
  25. data/lib/cabriolet/cli.rb +356 -607
  26. data/lib/cabriolet/collections/file_collection.rb +175 -0
  27. data/lib/cabriolet/compressors/base.rb +1 -1
  28. data/lib/cabriolet/compressors/lzx.rb +241 -54
  29. data/lib/cabriolet/compressors/mszip.rb +35 -3
  30. data/lib/cabriolet/compressors/quantum.rb +36 -95
  31. data/lib/cabriolet/decompressors/base.rb +1 -1
  32. data/lib/cabriolet/decompressors/lzss.rb +13 -3
  33. data/lib/cabriolet/decompressors/lzx.rb +70 -33
  34. data/lib/cabriolet/decompressors/mszip.rb +126 -39
  35. data/lib/cabriolet/decompressors/quantum.rb +83 -53
  36. data/lib/cabriolet/errors.rb +3 -0
  37. data/lib/cabriolet/extraction/base_extractor.rb +88 -0
  38. data/lib/cabriolet/extraction/extractor.rb +171 -0
  39. data/lib/cabriolet/extraction/file_extraction_work.rb +60 -0
  40. data/lib/cabriolet/extraction/file_extraction_worker.rb +106 -0
  41. data/lib/cabriolet/file_entry.rb +156 -0
  42. data/lib/cabriolet/file_manager.rb +144 -0
  43. data/lib/cabriolet/format_base.rb +79 -0
  44. data/lib/cabriolet/hlp/command_handler.rb +282 -0
  45. data/lib/cabriolet/hlp/compressor.rb +28 -238
  46. data/lib/cabriolet/hlp/decompressor.rb +107 -147
  47. data/lib/cabriolet/hlp/parser.rb +52 -101
  48. data/lib/cabriolet/hlp/quickhelp/compression_stream.rb +138 -0
  49. data/lib/cabriolet/hlp/quickhelp/compressor.rb +151 -0
  50. data/lib/cabriolet/hlp/quickhelp/decompressor.rb +558 -0
  51. data/lib/cabriolet/hlp/quickhelp/file_writer.rb +125 -0
  52. data/lib/cabriolet/hlp/quickhelp/huffman_stream.rb +74 -0
  53. data/lib/cabriolet/hlp/quickhelp/huffman_tree.rb +167 -0
  54. data/lib/cabriolet/hlp/quickhelp/offset_calculator.rb +61 -0
  55. data/lib/cabriolet/hlp/quickhelp/parser.rb +274 -0
  56. data/lib/cabriolet/hlp/quickhelp/structure_builder.rb +93 -0
  57. data/lib/cabriolet/hlp/quickhelp/topic_builder.rb +52 -0
  58. data/lib/cabriolet/hlp/quickhelp/topic_compressor.rb +83 -0
  59. data/lib/cabriolet/hlp/winhelp/btree_builder.rb +289 -0
  60. data/lib/cabriolet/hlp/winhelp/compressor.rb +400 -0
  61. data/lib/cabriolet/hlp/winhelp/decompressor.rb +192 -0
  62. data/lib/cabriolet/hlp/winhelp/parser.rb +484 -0
  63. data/lib/cabriolet/hlp/winhelp/zeck_lz77.rb +271 -0
  64. data/lib/cabriolet/huffman/encoder.rb +15 -12
  65. data/lib/cabriolet/huffman/tree.rb +85 -1
  66. data/lib/cabriolet/kwaj/command_handler.rb +213 -0
  67. data/lib/cabriolet/kwaj/compressor.rb +7 -3
  68. data/lib/cabriolet/kwaj/decompressor.rb +18 -12
  69. data/lib/cabriolet/lit/command_handler.rb +221 -0
  70. data/lib/cabriolet/lit/compressor.rb +119 -168
  71. data/lib/cabriolet/lit/content_encoder.rb +76 -0
  72. data/lib/cabriolet/lit/content_type_detector.rb +50 -0
  73. data/lib/cabriolet/lit/decompressor.rb +518 -152
  74. data/lib/cabriolet/lit/directory_builder.rb +153 -0
  75. data/lib/cabriolet/lit/guid_generator.rb +16 -0
  76. data/lib/cabriolet/lit/header_writer.rb +124 -0
  77. data/lib/cabriolet/lit/parser.rb +670 -0
  78. data/lib/cabriolet/lit/piece_builder.rb +74 -0
  79. data/lib/cabriolet/lit/structure_builder.rb +252 -0
  80. data/lib/cabriolet/models/hlp_file.rb +130 -29
  81. data/lib/cabriolet/models/hlp_header.rb +105 -17
  82. data/lib/cabriolet/models/lit_header.rb +212 -25
  83. data/lib/cabriolet/models/szdd_header.rb +10 -2
  84. data/lib/cabriolet/models/winhelp_header.rb +127 -0
  85. data/lib/cabriolet/oab/command_handler.rb +257 -0
  86. data/lib/cabriolet/oab/compressor.rb +17 -8
  87. data/lib/cabriolet/oab/decompressor.rb +41 -10
  88. data/lib/cabriolet/offset_calculator.rb +81 -0
  89. data/lib/cabriolet/plugin.rb +233 -0
  90. data/lib/cabriolet/plugin_manager.rb +453 -0
  91. data/lib/cabriolet/plugin_validator.rb +422 -0
  92. data/lib/cabriolet/quantum_shared.rb +105 -0
  93. data/lib/cabriolet/system/io_system.rb +3 -0
  94. data/lib/cabriolet/system/memory_handle.rb +17 -4
  95. data/lib/cabriolet/szdd/command_handler.rb +217 -0
  96. data/lib/cabriolet/szdd/compressor.rb +15 -11
  97. data/lib/cabriolet/szdd/decompressor.rb +18 -9
  98. data/lib/cabriolet/version.rb +1 -1
  99. data/lib/cabriolet.rb +181 -20
  100. metadata +69 -4
  101. data/lib/cabriolet/auto.rb +0 -173
  102. data/lib/cabriolet/parallel.rb +0 -333
@@ -0,0 +1,175 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Cabriolet
4
+ module Collections
5
# FileCollection manages an ordered list of files queued for compression.
#
# Each entry is a Hash with the keys +:source+ (path on disk), +:archive+
# (path inside the archive) and +:options+ (per-file options). The class
# mixes in Enumerable, so every Enumerable method iterates these hashes.
class FileCollection
  include Enumerable

  # Initialize a new, empty file collection
  #
  # @param format_options [Hash] Options specific to the archive format
  def initialize(format_options = {})
    @files = []
    @format_options = format_options
  end

  # Add a file to the collection
  #
  # @param source_path [String] Path to the source file
  # @param archive_path [String, nil] Path within the archive (defaults
  #   to the basename of +source_path+)
  # @param options [Hash] Additional options for this file
  # @return [self] Returns self for chaining
  # @raise [ArgumentError] if the source is missing or not a regular file
  #
  # @example
  #   collection.add("README.md", "docs/README.md")
  #   collection.add("data.txt") # Uses basename
  def add(source_path, archive_path = nil, **options)
    validate_source(source_path)

    @files << {
      source: source_path,
      archive: archive_path || ::File.basename(source_path),
      options: options,
    }

    self
  end

  # Add multiple files at once
  #
  # @param files [Array<Hash>] Array of file hashes with :source,
  #   :archive, :options keys (:archive and :options may be absent or nil)
  # @return [self] Returns self for chaining
  # @raise [ArgumentError] if any source is missing or not a regular file
  def add_all(files)
    files.each do |file|
      # `|| {}` also covers an explicit `options: nil`, which cannot be
      # double-splatted on Ruby versions before 3.4.
      add(file[:source], file[:archive], **(file[:options] || {}))
    end
    self
  end

  # Iterate over files in the collection
  #
  # @yield [file_entry] Yields each file entry hash
  # @return [Enumerator] If no block given
  def each(&block)
    @files.each(&block)
  end

  # Get the number of files in the collection
  #
  # @return [Integer] Number of files
  def size
    @files.size
  end

  # Check if collection is empty
  #
  # @return [Boolean] True if no files
  def empty?
    @files.empty?
  end

  # Clear all files from the collection
  #
  # @return [self] Returns self for chaining
  def clear
    @files.clear
    self
  end

  # Prepare files for compression by reading metadata from disk
  #
  # @return [Array<Hash>] One info hash per file, in insertion order, with
  #   the keys :source_path, :archive_path, :size, :mtime, :atime,
  #   :attributes and :options
  def prepare_for_compression
    @files.map { |file_entry| prepare_file_info(file_entry) }
  end

  # Get total uncompressed size of all files (sizes are read from disk at
  # call time, not cached when the file was added)
  #
  # @return [Integer] Total size in bytes
  def total_size
    @files.sum { |file| ::File.size(file[:source]) }
  end

  # Group files by directory for archive organization
  #
  # @return [Hash] Hash with archive directory paths as keys and arrays of
  #   file entries as values
  def by_directory
    @files.group_by { |file| ::File.dirname(file[:archive]) }
  end

  # Find files by pattern in archive path
  #
  # @param pattern [String, Regexp] Regexp to match against, or a
  #   substring that the archive path must contain
  # @return [Array<Hash>] Matching file entries
  def find_by_pattern(pattern)
    @files.select do |file|
      if pattern.is_a?(Regexp)
        pattern.match?(file[:archive])
      else
        file[:archive].include?(pattern)
      end
    end
  end

  private

  # Validate that source file exists and is a regular file
  #
  # @param path [String] Path to validate
  # @raise [ArgumentError] if file doesn't exist or isn't a regular file
  def validate_source(path)
    raise ArgumentError, "File does not exist: #{path}" unless ::File.exist?(path)
    raise ArgumentError, "Not a regular file: #{path}" unless ::File.file?(path)
  end

  # Prepare file information for compression
  #
  # @param file_entry [Hash] Original file entry
  # @return [Hash] Prepared file info with metadata
  def prepare_file_info(file_entry)
    source = file_entry[:source]
    stat = ::File.stat(source)

    {
      source_path: source,
      archive_path: file_entry[:archive],
      size: stat.size,
      mtime: stat.mtime,
      atime: stat.atime,
      # Pass the entry's own path so the hidden flag reflects this file
      # and not whichever file happens to be first in the collection.
      attributes: calculate_attributes(stat, source),
      options: file_entry[:options],
    }
  end

  # Calculate file attributes for archive format
  #
  # @param stat [File::Stat] File stat object
  # @param source_path [String, nil] Path of the file being described;
  #   when nil the hidden-file check is skipped
  # @return [Integer] Attribute flags (bitwise OR of Constants::ATTRIB_*)
  def calculate_attributes(stat, source_path = nil)
    attribs = Constants::ATTRIB_ARCH

    # Set read-only flag if not writable
    attribs |= Constants::ATTRIB_READONLY unless stat.writable?

    # Set hidden flag if hidden (Unix dotfiles)
    if source_path && ::File.basename(source_path).start_with?(".")
      attribs |= Constants::ATTRIB_HIDDEN
    end

    # Set system flag for system files.
    # NOTE(review): File.stat follows symbolic links, so symlink? is always
    # false for the stat passed in by prepare_file_info; File.lstat would be
    # needed if symlinks are really meant to be flagged - confirm intent.
    attribs |= Constants::ATTRIB_SYSTEM if stat.socket? || stat.symlink?

    attribs
  end
end
174
+ end
175
+ end
@@ -15,7 +15,7 @@ module Cabriolet
15
15
  # @param input [System::FileHandle, System::MemoryHandle] Input handle
16
16
  # @param output [System::FileHandle, System::MemoryHandle] Output handle
17
17
  # @param buffer_size [Integer] Buffer size for I/O operations
18
- def initialize(io_system, input, output, buffer_size)
18
+ def initialize(io_system, input, output, buffer_size, **_kwargs)
19
19
  @io_system = io_system
20
20
  @input = input
21
21
  @output = output
@@ -1,5 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require_relative "base"
3
4
  require_relative "../binary/bitstream_writer"
4
5
  require_relative "../huffman/encoder"
5
6
 
@@ -66,7 +67,8 @@ module Cabriolet
66
67
  # @param output [System::FileHandle, System::MemoryHandle] Output handle
67
68
  # @param buffer_size [Integer] Buffer size for I/O operations
68
69
  # @param window_bits [Integer] Window size (15-21 for regular LZX)
69
- def initialize(io_system, input, output, buffer_size, window_bits: 15)
70
+ def initialize(io_system, input, output, buffer_size, window_bits: 15,
71
+ **_kwargs)
70
72
  super(io_system, input, output, buffer_size)
71
73
 
72
74
  # Validate window_bits
@@ -82,8 +84,9 @@ module Cabriolet
82
84
  @num_offsets = POSITION_SLOTS[window_bits - 15] << 3
83
85
  @maintree_maxsymbols = NUM_CHARS + @num_offsets
84
86
 
85
- # Initialize bitstream writer
86
- @bitstream = Binary::BitstreamWriter.new(io_system, output, buffer_size)
87
+ # Initialize bitstream writer (LZX uses MSB-first bit ordering per libmspack lzxd.c)
88
+ @bitstream = Binary::BitstreamWriter.new(io_system, output,
89
+ buffer_size, bit_order: :msb)
87
90
 
88
91
  # Initialize sliding window for LZ77
89
92
  @window = "\0" * @window_size
@@ -119,6 +122,7 @@ module Cabriolet
119
122
  frame_data = input_data[pos, frame_size]
120
123
 
121
124
  # Compress this frame
125
+ # TODO: Use compress_frame_verbatim once tree encoding is fixed
122
126
  compress_frame(frame_data)
123
127
 
124
128
  pos += frame_size
@@ -152,19 +156,46 @@ module Cabriolet
152
156
  # @param data [String] Frame data to compress
153
157
  # @return [void]
154
158
# Emit one frame as a single UNCOMPRESSED block.
#
# The block length written in the header is the byte size of the frame
# data only; the offset registers written after the header are not
# counted in it.
#
# @param data [String] Frame data to compress
# @return [void]
def compress_frame(data)
  # Header first: block type plus the raw payload size
  write_block_header(BLOCKTYPE_UNCOMPRESSED, data.bytesize)

  # Offset registers (R0, R1, R2) follow the header
  write_offset_registers

  # Payload: every byte goes out as an 8-bit field
  data.each_byte { |raw_byte| @bitstream.write_bits(raw_byte, 8) }
end
174
+
175
# Compress a single frame as a VERBATIM block (not called by the frame
# loop yet; see the TODO next to the compress_frame call).
#
# Steps: reset the per-frame statistics, tokenize the frame, build the
# Huffman trees, then write header, tree definitions and encoded tokens.
#
# @param data [String] Frame data to compress
# @return [void]
def compress_frame_verbatim(data)
  # Statistics are per frame, so start every table from zero
  [@literal_freq, @match_freq, @length_freq].each { |table| table.fill(0) }

  # Tokenizing must happen before build_trees: it produces the LZ77
  # tokens that are encoded at the end of this method
  lz_tokens = analyze_frame(data)
  build_trees

  write_block_header(BLOCKTYPE_VERBATIM, data.bytesize)
  write_trees
  encode_tokens(lz_tokens)
end
169
200
 
170
201
  # Analyze frame and generate LZ77 tokens
@@ -301,68 +332,224 @@ module Cabriolet
301
332
  slot
302
333
  end
303
334
 
335
# Build Huffman code lengths from frequencies
#
# Simplified scheme: every symbol with a non-zero frequency receives the
# same code length, ceil(log2(count)) with a minimum of 1. Unused symbols
# (lowest index first) are then given that length too until 2**length
# symbols carry a code. If fewer unused slots exist below +num_symbols+
# than are needed, the padding simply stops early.
#
# With zero or one used symbol, two symbols of length 1 are produced so
# the result still describes a two-leaf tree.
#
# @param freqs [Array<Integer>] Symbol frequencies
# @param num_symbols [Integer] Number of symbols
# @return [Array<Integer>] Code lengths
def build_tree_lengths(freqs, num_symbols)
  code_lengths = Array.new(num_symbols, 0)
  used = freqs.each_index.select { |sym| freqs[sym].positive? }

  case used.size
  when 0
    # Nothing to encode: minimal two-symbol tree
    code_lengths[0] = 1
    code_lengths[1] = 1
    return code_lengths
  when 1
    # One real symbol plus a dummy partner (symbol 1 if the real one is
    # symbol 0, otherwise symbol 0)
    only = used.first
    code_lengths[only] = 1
    code_lengths[only.zero? ? 1 : 0] = 1
    return code_lengths
  end

  # Smallest width whose code space holds every used symbol
  width = 1
  width += 1 until (1 << width) >= used.size

  used.each { |sym| code_lengths[sym] = width }

  # Fill the remaining code space with dummy symbols
  missing = (1 << width) - used.size
  (0...num_symbols).each do |slot|
    break unless missing.positive?
    next unless code_lengths[slot].zero?

    code_lengths[slot] = width
    missing -= 1
  end

  code_lengths
end
396
+
304
397
# Build Huffman trees from frequency statistics
#
# Three trees are produced:
# 1. Main tree: literals followed by the match symbols
# 2. Length tree: additional length symbols
# 3. Pretree: encodes the code lengths of the main and length trees
#
# All code lengths are computed first (the pretree statistics depend on
# the main and length tree lengths); the code tables are generated
# afterwards.
#
# @return [void]
def build_trees
  # Code lengths: main tree statistics are literal counts followed by
  # match counts
  @maintree_lengths = build_tree_lengths(@literal_freq + @match_freq,
                                         @maintree_maxsymbols)
  @length_lengths = build_tree_lengths(@length_freq, LENGTH_MAXSYMBOLS)

  # Pretree statistics come from simulating the encoding of the two
  # length tables computed just above
  @pretree_lengths = build_tree_lengths(calculate_pretree_frequencies,
                                        PRETREE_MAXSYMBOLS)

  # Code tables
  encoder = Huffman::Encoder
  @maintree_codes = encoder.build_codes(@maintree_lengths,
                                        @maintree_maxsymbols)
  @length_codes = encoder.build_codes(@length_lengths, LENGTH_MAXSYMBOLS)
  @pretree_codes = encoder.build_codes(@pretree_lengths,
                                       PRETREE_MAXSYMBOLS)
end
333
430
 
334
- # Build Huffman code lengths from frequencies
431
# Calculate pretree symbol frequencies
#
# Counts, without writing anything, which pretree symbols are needed to
# encode the current code lengths. Three ranges are counted into one
# table: the main tree up to NUM_CHARS, the main tree from NUM_CHARS up to
# @maintree_maxsymbols, and the first NUM_SECONDARY_LENGTHS entries of the
# length tree.
#
# @return [Array<Integer>] Frequency array with PRETREE_MAXSYMBOLS entries
def calculate_pretree_frequencies
  sections = [
    [@maintree_lengths, 0, NUM_CHARS],
    [@maintree_lengths, NUM_CHARS, @maintree_maxsymbols],
    [@length_lengths, 0, NUM_SECONDARY_LENGTHS],
  ]

  Array.new(PRETREE_MAXSYMBOLS, 0).tap do |counts|
    sections.each do |table, from, upto|
      count_pretree_symbols(table, from, upto, counts)
    end
  end
end
452
+
453
# Count pretree symbols needed to encode a tree
#
# Walks lengths[start...end_idx] and increments +freq+ for every pretree
# symbol the walk would emit:
# - a non-zero length counts symbol (length - previous) mod 17
# - a run of zeros (measured in chunks of at most 138) counts symbol 18
#   once per step while 20 or more zeros remain (each step consumes up to
#   51), then symbol 17 once if 4 or more remain, then one delta symbol
#   per leftover zero
#
# NOTE(review): intended to mirror write_tree_with_pretree, which is not
# visible from here - keep both in sync.
#
# @param lengths [Array<Integer>] Tree lengths to encode
# @param start [Integer] Start index
# @param end_idx [Integer] End index (exclusive)
# @param freq [Array<Integer>] Frequency array to update (mutated in place)
# @return [void]
def count_pretree_symbols(lengths, start, end_idx, freq)
  pos = start
  last_len = 0

  while pos < end_idx
    current = lengths[pos]

    unless current.zero?
      # Plain length: counted as a delta against the previous length
      freq[(current - last_len) % 17] += 1
      last_len = current
      pos += 1
      next
    end

    # Measure the zero run starting here (capped at 138 per pass)
    run_len = 0
    while pos < end_idx && lengths[pos].zero? && run_len < 138
      run_len += 1
      pos += 1
    end

    # Long runs: symbol 18, up to 51 zeros per symbol
    while run_len >= 20
      freq[18] += 1
      run_len -= [run_len, 51].min
    end

    # Medium run: symbol 17, up to 19 zeros
    if run_len >= 4
      freq[17] += 1
      run_len -= [run_len, 19].min
    end

    # Leftover zeros: one delta symbol each
    run_len.times do
      freq[(17 - last_len) % 17] += 1
      last_len = 0
    end
  end
end
364
512
 
365
- lengths
513
# Record the depth of every leaf of a Huffman tree as its code length
#
# Nodes are arrays whose second element is the symbol (nil for internal
# nodes) and whose third and fourth elements are the left and right
# children; any further elements are ignored.
#
# @param node [Array] Tree node [freq, symbol, left, right, depth]
# @param depth [Integer] Current depth
# @param lengths [Array<Integer>] Output array for lengths
# @return [void]
def calculate_depths(node, depth, lengths)
  return unless node

  _weight, leaf_symbol, lhs, rhs = node

  # Leaf: its depth is its code length
  return lengths[leaf_symbol] = depth unless leaf_symbol.nil?

  # Internal node: children sit one level deeper
  below = depth + 1
  calculate_depths(lhs, below, lengths)
  calculate_depths(rhs, below, lengths)
end
533
+
534
# Derive code lengths from a Huffman tree by walking it depth-first
#
# A node whose symbol is nil is treated as internal and both children are
# visited one level deeper; any other node is a leaf whose depth is stored
# in +lengths+ under its symbol.
#
# @param node [Array] Tree node [freq, symbol, left, right]
# @param depth [Integer] Current depth
# @param lengths [Array<Integer>] Output array for lengths
# @return [void]
def calculate_code_lengths(node, depth, lengths)
  return unless node

  _weight, leaf_symbol, lhs, rhs = node

  unless leaf_symbol.nil?
    # Leaf reached: the current depth is this symbol's code length
    return lengths[leaf_symbol] = depth
  end

  # Internal node: descend left first, then right
  deeper = depth + 1
  calculate_code_lengths(lhs, deeper, lengths)
  calculate_code_lengths(rhs, deeper, lengths)
end
367
554
 
368
555
  # Write block header
@@ -56,7 +56,7 @@ module Cabriolet
56
56
  # @param input [System::FileHandle, System::MemoryHandle] Input handle
57
57
  # @param output [System::FileHandle, System::MemoryHandle] Output handle
58
58
  # @param buffer_size [Integer] Buffer size for I/O operations
59
- def initialize(io_system, input, output, buffer_size)
59
+ def initialize(io_system, input, output, buffer_size, **_kwargs)
60
60
  super
61
61
 
62
62
  # Initialize bitstream writer
@@ -88,10 +88,15 @@ module Cabriolet
88
88
 
89
89
  # Process data in FRAME_SIZE chunks
90
90
  # Each frame is independent and contains blocks ending with last_block=1
91
+ frame_num = 0
91
92
  while pos < input_data.bytesize
92
93
  chunk_size = [FRAME_SIZE, input_data.bytesize - pos].min
93
94
  chunk = input_data[pos, chunk_size]
94
95
 
96
+ if ENV["DEBUG_MSZIP_COMPRESS"]
97
+ warn "DEBUG compress: Frame #{frame_num}: pos=#{pos}, chunk_size=#{chunk_size}"
98
+ end
99
+
95
100
  # Write CK signature
96
101
  write_signature
97
102
 
@@ -99,11 +104,19 @@ module Cabriolet
99
104
  # Each frame's block is always marked as last within that frame
100
105
  compress_block(chunk, true)
101
106
 
107
+ # Flush bitstream after each frame to ensure data is written
108
+ @bitstream.flush
109
+
110
+ if ENV["DEBUG_MSZIP_COMPRESS"]
111
+ warn "DEBUG compress: Frame #{frame_num} complete, flushed"
112
+ end
113
+
102
114
  pos += chunk_size
103
115
  total_written += chunk_size
116
+ frame_num += 1
104
117
  end
105
118
 
106
- # Flush any remaining bits
119
+ # Final flush (may not be needed now but keep for safety)
107
120
  @bitstream.flush
108
121
 
109
122
  total_written
@@ -129,8 +142,19 @@ module Cabriolet
129
142
  #
130
143
  # @return [void]
131
144
# Write the frame signature: byte-align the bitstream, then emit each
# SIGNATURE byte as a raw byte.
#
# When the DEBUG_MSZIP_COMPRESS environment variable is set, progress is
# traced with warn.
#
# @return [void]
def write_signature
  tracing = ENV["DEBUG_MSZIP_COMPRESS"]

  warn "DEBUG write_signature: ENTRY" if tracing

  # Align to a byte boundary before the raw signature bytes are written
  @bitstream.byte_align

  SIGNATURE.each do |sig_byte|
    warn "DEBUG write_signature: Writing byte 0x#{sig_byte.to_s(16)}" if tracing
    @bitstream.write_raw_byte(sig_byte)
  end

  warn "DEBUG write_signature: EXIT" if tracing
end
135
159
 
136
160
  # Compress a single block using fixed Huffman encoding
@@ -139,6 +163,10 @@ module Cabriolet
139
163
  # @param is_last [Boolean] Whether this is the last block
140
164
  # @return [void]
141
165
  def compress_block(data, is_last)
166
+ if ENV["DEBUG_MSZIP_COMPRESS"]
167
+ warn "DEBUG compress_block: ENTRY data_size=#{data.bytesize} is_last=#{is_last}"
168
+ end
169
+
142
170
  # Write block header
143
171
  @bitstream.write_bits(is_last ? 1 : 0, 1) # Last block flag
144
172
  @bitstream.write_bits(FIXED_HUFFMAN_BLOCK, 2) # Block type
@@ -151,6 +179,10 @@ module Cabriolet
151
179
 
152
180
  # Write end-of-block symbol (256)
153
181
  encode_literal(256)
182
+
183
+ if ENV["DEBUG_MSZIP_COMPRESS"]
184
+ warn "DEBUG compress_block: EXIT"
185
+ end
154
186
  end
155
187
 
156
188
  # Encode data using LZ77 matching and Huffman encoding