cabriolet 0.1.2 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.adoc +703 -38
- data/lib/cabriolet/algorithm_factory.rb +250 -0
- data/lib/cabriolet/base_compressor.rb +206 -0
- data/lib/cabriolet/binary/bitstream.rb +167 -16
- data/lib/cabriolet/binary/bitstream_writer.rb +150 -21
- data/lib/cabriolet/binary/chm_structures.rb +2 -2
- data/lib/cabriolet/binary/hlp_structures.rb +258 -37
- data/lib/cabriolet/binary/lit_structures.rb +231 -65
- data/lib/cabriolet/binary/oab_structures.rb +17 -1
- data/lib/cabriolet/cab/command_handler.rb +226 -0
- data/lib/cabriolet/cab/compressor.rb +108 -84
- data/lib/cabriolet/cab/decompressor.rb +16 -20
- data/lib/cabriolet/cab/extractor.rb +142 -66
- data/lib/cabriolet/cab/file_compression_work.rb +52 -0
- data/lib/cabriolet/cab/file_compression_worker.rb +89 -0
- data/lib/cabriolet/checksum.rb +49 -0
- data/lib/cabriolet/chm/command_handler.rb +227 -0
- data/lib/cabriolet/chm/compressor.rb +7 -3
- data/lib/cabriolet/chm/decompressor.rb +39 -21
- data/lib/cabriolet/chm/parser.rb +5 -2
- data/lib/cabriolet/cli/base_command_handler.rb +127 -0
- data/lib/cabriolet/cli/command_dispatcher.rb +140 -0
- data/lib/cabriolet/cli/command_registry.rb +83 -0
- data/lib/cabriolet/cli.rb +356 -607
- data/lib/cabriolet/collections/file_collection.rb +175 -0
- data/lib/cabriolet/compressors/base.rb +1 -1
- data/lib/cabriolet/compressors/lzx.rb +241 -54
- data/lib/cabriolet/compressors/mszip.rb +35 -3
- data/lib/cabriolet/compressors/quantum.rb +36 -95
- data/lib/cabriolet/decompressors/base.rb +1 -1
- data/lib/cabriolet/decompressors/lzss.rb +13 -3
- data/lib/cabriolet/decompressors/lzx.rb +70 -33
- data/lib/cabriolet/decompressors/mszip.rb +126 -39
- data/lib/cabriolet/decompressors/quantum.rb +83 -53
- data/lib/cabriolet/errors.rb +3 -0
- data/lib/cabriolet/extraction/base_extractor.rb +88 -0
- data/lib/cabriolet/extraction/extractor.rb +171 -0
- data/lib/cabriolet/extraction/file_extraction_work.rb +60 -0
- data/lib/cabriolet/extraction/file_extraction_worker.rb +106 -0
- data/lib/cabriolet/file_entry.rb +156 -0
- data/lib/cabriolet/file_manager.rb +144 -0
- data/lib/cabriolet/format_base.rb +79 -0
- data/lib/cabriolet/hlp/command_handler.rb +282 -0
- data/lib/cabriolet/hlp/compressor.rb +28 -238
- data/lib/cabriolet/hlp/decompressor.rb +107 -147
- data/lib/cabriolet/hlp/parser.rb +52 -101
- data/lib/cabriolet/hlp/quickhelp/compression_stream.rb +138 -0
- data/lib/cabriolet/hlp/quickhelp/compressor.rb +151 -0
- data/lib/cabriolet/hlp/quickhelp/decompressor.rb +558 -0
- data/lib/cabriolet/hlp/quickhelp/file_writer.rb +125 -0
- data/lib/cabriolet/hlp/quickhelp/huffman_stream.rb +74 -0
- data/lib/cabriolet/hlp/quickhelp/huffman_tree.rb +167 -0
- data/lib/cabriolet/hlp/quickhelp/offset_calculator.rb +61 -0
- data/lib/cabriolet/hlp/quickhelp/parser.rb +274 -0
- data/lib/cabriolet/hlp/quickhelp/structure_builder.rb +93 -0
- data/lib/cabriolet/hlp/quickhelp/topic_builder.rb +52 -0
- data/lib/cabriolet/hlp/quickhelp/topic_compressor.rb +83 -0
- data/lib/cabriolet/hlp/winhelp/btree_builder.rb +289 -0
- data/lib/cabriolet/hlp/winhelp/compressor.rb +400 -0
- data/lib/cabriolet/hlp/winhelp/decompressor.rb +192 -0
- data/lib/cabriolet/hlp/winhelp/parser.rb +484 -0
- data/lib/cabriolet/hlp/winhelp/zeck_lz77.rb +271 -0
- data/lib/cabriolet/huffman/encoder.rb +15 -12
- data/lib/cabriolet/huffman/tree.rb +85 -1
- data/lib/cabriolet/kwaj/command_handler.rb +213 -0
- data/lib/cabriolet/kwaj/compressor.rb +7 -3
- data/lib/cabriolet/kwaj/decompressor.rb +18 -12
- data/lib/cabriolet/lit/command_handler.rb +221 -0
- data/lib/cabriolet/lit/compressor.rb +119 -168
- data/lib/cabriolet/lit/content_encoder.rb +76 -0
- data/lib/cabriolet/lit/content_type_detector.rb +50 -0
- data/lib/cabriolet/lit/decompressor.rb +518 -152
- data/lib/cabriolet/lit/directory_builder.rb +153 -0
- data/lib/cabriolet/lit/guid_generator.rb +16 -0
- data/lib/cabriolet/lit/header_writer.rb +124 -0
- data/lib/cabriolet/lit/parser.rb +670 -0
- data/lib/cabriolet/lit/piece_builder.rb +74 -0
- data/lib/cabriolet/lit/structure_builder.rb +252 -0
- data/lib/cabriolet/models/hlp_file.rb +130 -29
- data/lib/cabriolet/models/hlp_header.rb +105 -17
- data/lib/cabriolet/models/lit_header.rb +212 -25
- data/lib/cabriolet/models/szdd_header.rb +10 -2
- data/lib/cabriolet/models/winhelp_header.rb +127 -0
- data/lib/cabriolet/oab/command_handler.rb +257 -0
- data/lib/cabriolet/oab/compressor.rb +17 -8
- data/lib/cabriolet/oab/decompressor.rb +41 -10
- data/lib/cabriolet/offset_calculator.rb +81 -0
- data/lib/cabriolet/plugin.rb +233 -0
- data/lib/cabriolet/plugin_manager.rb +453 -0
- data/lib/cabriolet/plugin_validator.rb +422 -0
- data/lib/cabriolet/quantum_shared.rb +105 -0
- data/lib/cabriolet/system/io_system.rb +3 -0
- data/lib/cabriolet/system/memory_handle.rb +17 -4
- data/lib/cabriolet/szdd/command_handler.rb +217 -0
- data/lib/cabriolet/szdd/compressor.rb +15 -11
- data/lib/cabriolet/szdd/decompressor.rb +18 -9
- data/lib/cabriolet/version.rb +1 -1
- data/lib/cabriolet.rb +181 -20
- metadata +69 -4
- data/lib/cabriolet/auto.rb +0 -173
- data/lib/cabriolet/parallel.rb +0 -333
@@ -0,0 +1,175 @@
+# frozen_string_literal: true
+
+module Cabriolet
+  module Collections
+    # FileCollection manages a collection of files for compression
+    # Provides unified interface for adding files and preparing them for compression
+    class FileCollection
+      include Enumerable
+
+      # Initialize a new file collection
+      #
+      # @param format_options [Hash] Options specific to the archive format
+      def initialize(format_options = {})
+        @files = []
+        @format_options = format_options
+      end
+
+      # Add a file to the collection
+      #
+      # @param source_path [String] Path to the source file
+      # @param archive_path [String, nil] Path within the archive (defaults to basename)
+      # @param options [Hash] Additional options for this file
+      # @return [self] Returns self for chaining
+      #
+      # @example
+      #   collection.add("README.md", "docs/README.md")
+      #   collection.add("data.txt") # Uses basename
+      def add(source_path, archive_path = nil, **options)
+        validate_source(source_path)
+
+        @files << {
+          source: source_path,
+          archive: archive_path || ::File.basename(source_path),
+          options: options,
+        }
+
+        self
+      end
+
+      # Add multiple files at once
+      #
+      # @param files [Array<Hash>] Array of file hashes with :source, :archive, :options keys
+      # @return [self] Returns self for chaining
+      def add_all(files)
+        files.each do |file|
+          add(file[:source], file[:archive], **file.fetch(:options, {}))
+        end
+        self
+      end
+
+      # Iterate over files in the collection
+      #
+      # @yield [file_entry] Yields each file entry hash
+      # @return [Enumerator] If no block given
+      def each(&)
+        @files.each(&)
+      end
+
+      # Get the number of files in the collection
+      #
+      # @return [Integer] Number of files
+      def size
+        @files.size
+      end
+
+      # Check if collection is empty
+      #
+      # @return [Boolean] True if no files
+      def empty?
+        @files.empty?
+      end
+
+      # Clear all files from the collection
+      #
+      # @return [self] Returns self for chaining
+      def clear
+        @files.clear
+        self
+      end
+
+      # Prepare files for compression by reading metadata
+      #
+      # @return [Array<Hash>] Array of prepared file info hashes
+      def prepare_for_compression
+        @files.map do |file_entry|
+          prepare_file_info(file_entry)
+        end
+      end
+
+      # Get total uncompressed size of all files
+      #
+      # @return [Integer] Total size in bytes
+      def total_size
+        @files.sum { |f| ::File.size(f[:source]) }
+      end
+
+      # Group files by directory for archive organization
+      #
+      # @return [Hash] Hash with directory paths as keys and file arrays as values
+      def by_directory
+        @files.group_by do |file|
+          ::File.dirname(file[:archive])
+        end
+      end
+
+      # Find files by pattern in archive path
+      #
+      # @param pattern [String, Regexp] Pattern to match
+      # @return [Array<Hash>] Matching file entries
+      def find_by_pattern(pattern)
+        @files.select do |file|
+          if pattern.is_a?(Regexp)
+            file[:archive] =~ pattern
+          else
+            file[:archive].include?(pattern)
+          end
+        end
+      end
+
+      private
+
+      # Validate that source file exists and is accessible
+      #
+      # @param path [String] Path to validate
+      # @raise [ArgumentError] if file doesn't exist or isn't a regular file
+      def validate_source(path)
+        unless ::File.exist?(path)
+          raise ArgumentError, "File does not exist: #{path}"
+        end
+
+        unless ::File.file?(path)
+          raise ArgumentError, "Not a regular file: #{path}"
+        end
+      end
+
+      # Prepare file information for compression
+      #
+      # @param file_entry [Hash] Original file entry
+      # @return [Hash] Prepared file info with metadata
+      def prepare_file_info(file_entry)
+        stat = ::File.stat(file_entry[:source])
+
+        {
+          source_path: file_entry[:source],
+          archive_path: file_entry[:archive],
+          size: stat.size,
+          mtime: stat.mtime,
+          atime: stat.atime,
+          attributes: calculate_attributes(stat),
+          options: file_entry[:options],
+        }
+      end
+
+      # Calculate file attributes for archive format
+      #
+      # @param stat [File::Stat] File stat object
+      # @return [Integer] Attribute flags
+      def calculate_attributes(stat)
+        attribs = Constants::ATTRIB_ARCH
+
+        # Set read-only flag if not writable
+        attribs |= Constants::ATTRIB_READONLY unless stat.writable?
+
+        # Set hidden flag if hidden (Unix dotfiles)
+        basename = ::File.basename(@files.first[:source])
+        attribs |= Constants::ATTRIB_HIDDEN if basename.start_with?(".")
+
+        # Set system flag for system files
+        attribs |= Constants::ATTRIB_SYSTEM if stat.socket? || stat.symlink?
+
+        attribs
+      end
+    end
+  end
+end
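For orientation, a minimal usage sketch of the FileCollection API added above; the top-level require and the :compression format option are assumptions for illustration, not taken from the gem's documentation.

    require "cabriolet"  # assumed entry point; adjust to the gem's actual require

    # Build a collection, then hand the prepared entries to a compressor.
    collection = Cabriolet::Collections::FileCollection.new(compression: :mszip)  # option key assumed
    collection.add("README.md", "docs/README.md")  # explicit archive path
    collection.add("data.txt")                     # archive path defaults to the basename

    collection.size          # => 2
    collection.total_size    # combined size of both source files in bytes
    collection.by_directory  # => { "docs" => [...], "." => [...] }

    # Each prepared entry carries size, mtime, atime and attribute flags.
    collection.prepare_for_compression.each do |info|
      puts "#{info[:archive_path]} (#{info[:size]} bytes)"
    end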
@@ -15,7 +15,7 @@ module Cabriolet
       # @param input [System::FileHandle, System::MemoryHandle] Input handle
       # @param output [System::FileHandle, System::MemoryHandle] Output handle
       # @param buffer_size [Integer] Buffer size for I/O operations
-      def initialize(io_system, input, output, buffer_size)
+      def initialize(io_system, input, output, buffer_size, **_kwargs)
        @io_system = io_system
        @input = input
        @output = output
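The only substantive change in this hunk is the trailing `**_kwargs`. The same pattern is applied to the other constructors below, presumably so that a single caller (such as the new algorithm factory) can forward one options hash to every algorithm and let each constructor ignore the keywords it does not use. A generic Ruby illustration of that behaviour, not gem code:

    def strict(io_system, input, output, buffer_size); end
    def lenient(io_system, input, output, buffer_size, **_kwargs); end

    opts = { window_bits: 17 }
    lenient(nil, nil, nil, 4096, **opts)  # extra keyword is silently ignored
    strict(nil, nil, nil, 4096, **opts)   # raises ArgumentError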
@@ -1,5 +1,6 @@
 # frozen_string_literal: true
 
+require_relative "base"
 require_relative "../binary/bitstream_writer"
 require_relative "../huffman/encoder"
 
@@ -66,7 +67,8 @@ module Cabriolet
       # @param output [System::FileHandle, System::MemoryHandle] Output handle
       # @param buffer_size [Integer] Buffer size for I/O operations
       # @param window_bits [Integer] Window size (15-21 for regular LZX)
-      def initialize(io_system, input, output, buffer_size, window_bits: 15)
+      def initialize(io_system, input, output, buffer_size, window_bits: 15,
+                     **_kwargs)
        super(io_system, input, output, buffer_size)
 
        # Validate window_bits
@@ -82,8 +84,9 @@ module Cabriolet
        @num_offsets = POSITION_SLOTS[window_bits - 15] << 3
        @maintree_maxsymbols = NUM_CHARS + @num_offsets
 
-        # Initialize bitstream writer
-        @bitstream = Binary::BitstreamWriter.new(io_system, output,
+        # Initialize bitstream writer (LZX uses MSB-first bit ordering per libmspack lzxd.c)
+        @bitstream = Binary::BitstreamWriter.new(io_system, output,
+                                                 buffer_size, bit_order: :msb)
 
        # Initialize sliding window for LZ77
        @window = "\0" * @window_size
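The notable change here is `bit_order: :msb`: LZX packs bits most-significant-first, whereas DEFLATE-style streams (MSZIP) pack least-significant-first. A standalone illustration of the difference, independent of the gem's BitstreamWriter:

    # Pack (value, width) pairs MSB-first: each value lands in the high bits.
    def pack_msb(pairs)
      acc = 0
      bits = 0
      pairs.each do |value, width|
        acc = (acc << width) | value
        bits += width
      end
      acc << ((8 - bits % 8) % 8)  # right-pad the last partial byte with zeros
    end

    # Pack (value, width) pairs LSB-first: each value lands in the low bits.
    def pack_lsb(pairs)
      acc = 0
      bits = 0
      pairs.each do |value, width|
        acc |= value << bits
        bits += width
      end
      acc
    end

    pairs = [[0b101, 3], [0b01, 2]]
    pack_msb(pairs).to_s(2)  # => "10101000"
    pack_lsb(pairs).to_s(2)  # => "1101"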
@@ -119,6 +122,7 @@ module Cabriolet
          frame_data = input_data[pos, frame_size]
 
          # Compress this frame
+          # TODO: Use compress_frame_verbatim once tree encoding is fixed
          compress_frame(frame_data)
 
          pos += frame_size
@@ -152,19 +156,46 @@ module Cabriolet
       # @param data [String] Frame data to compress
       # @return [void]
       def compress_frame(data)
-        #
-
+        # For uncompressed blocks, block length is just the frame data size
+        # (offset registers are NOT included in the block length field)
+        block_length = data.bytesize
 
-        # Write
+        # Write UNCOMPRESSED block header
+        write_block_header(BLOCKTYPE_UNCOMPRESSED, block_length)
+
+        # Write offset registers (R0, R1, R2)
        write_offset_registers
 
-        # Write raw data
+        # Write raw uncompressed data
        data.each_byte do |byte|
          @bitstream.write_bits(byte, 8)
        end
+      end
+
+      # Compress a single frame (32KB) - VERBATIM version (currently disabled)
+      #
+      # @param data [String] Frame data to compress
+      # @return [void]
+      def compress_frame_verbatim(data)
+        # Reset frequency statistics for each frame
+        @literal_freq.fill(0)
+        @match_freq.fill(0)
+        @length_freq.fill(0)
+
+        # Analyze frame to generate LZ77 tokens
+        tokens = analyze_frame(data)
 
-        #
-
+        # Build Huffman trees from statistics
+        build_trees
+
+        # Write VERBATIM block header
+        write_block_header(BLOCKTYPE_VERBATIM, data.bytesize)
+
+        # Write Huffman tree definitions
+        write_trees
+
+        # Encode all tokens using the Huffman codes
+        encode_tokens(tokens)
      end
 
      # Analyze frame and generate LZ77 tokens
@@ -301,68 +332,224 @@ module Cabriolet
        slot
      end
 
+      # Build Huffman code lengths from frequencies
+      #
+      # Uses a simplified approach: assign equal lengths to all symbols.
+      # This guarantees valid Huffman trees that satisfy Kraft inequality.
+      #
+      # @param freqs [Array<Integer>] Symbol frequencies
+      # @param num_symbols [Integer] Number of symbols
+      # @return [Array<Integer>] Code lengths
+      def build_tree_lengths(freqs, num_symbols)
+        lengths = Array.new(num_symbols, 0)
+
+        # Get symbols with non-zero frequencies
+        non_zero_symbols = freqs.each_with_index.select do |freq, _|
+          freq.positive?
+        end.map { |_, sym| sym }
+
+        # Handle edge cases
+        if non_zero_symbols.empty?
+          # Empty tree: create minimal valid tree with 2 symbols
+          lengths[0] = 1
+          lengths[1] = 1
+          return lengths
+        elsif non_zero_symbols.size == 1
+          # Single symbol: need at least 2 symbols for valid Huffman tree
+          symbol = non_zero_symbols[0]
+          lengths[symbol] = 1
+          dummy = symbol.zero? ? 1 : 0
+          lengths[dummy] = 1
+          return lengths
+        end
+
+        # Calculate required length: ceil(log2(count))
+        count = non_zero_symbols.size
+        bit_length = 1
+        while (1 << bit_length) < count
+          bit_length += 1
+        end
+
+        # Assign same length to all non-zero symbols
+        non_zero_symbols.each do |symbol|
+          lengths[symbol] = bit_length
+        end
+
+        # Pad with dummy symbols to make tree complete (2^bit_length total symbols)
+        # This ensures Kraft inequality sum equals exactly 1.0
+        total_needed = 1 << bit_length
+        dummy_count = total_needed - count
+
+        if dummy_count.positive?
+          dummy_index = 0
+          while dummy_count.positive? && dummy_index < num_symbols
+            if lengths[dummy_index].zero?
+              lengths[dummy_index] = bit_length
+              dummy_count -= 1
+            end
+            dummy_index += 1
+          end
+        end
+
+        lengths
+      end
+
      # Build Huffman trees from frequency statistics
      #
+      # This creates three trees for LZX compression:
+      # 1. Main tree: literals (0-255) + match position/length combinations
+      # 2. Length tree: additional length symbols for long matches
+      # 3. Pretree: encodes the code lengths of main/length trees
+      #
      # @return [void]
      def build_trees
-        #
-
-
+        # Step 1: Combine literal and match frequencies for main tree
+        maintree_freq = @literal_freq + @match_freq
+
+        # Step 2: Build main tree code lengths
+        @maintree_lengths = build_tree_lengths(maintree_freq,
                                               @maintree_maxsymbols)
-        @maintree_codes = Huffman::Encoder.build_codes(@maintree_lengths,
-                                                       @maintree_maxsymbols)
 
-        # Build length tree
+        # Step 3: Build length tree code lengths
        @length_lengths = build_tree_lengths(@length_freq, LENGTH_MAXSYMBOLS)
+
+        # Step 4: Calculate pretree frequencies by simulating tree encoding
+        pretree_freq = calculate_pretree_frequencies
+
+        # Step 5: Build pretree code lengths
+        @pretree_lengths = build_tree_lengths(pretree_freq, PRETREE_MAXSYMBOLS)
+
+        # Step 6: Generate code tables from lengths
+        @maintree_codes = Huffman::Encoder.build_codes(@maintree_lengths,
+                                                       @maintree_maxsymbols)
        @length_codes = Huffman::Encoder.build_codes(@length_lengths,
                                                     LENGTH_MAXSYMBOLS)
-
-        # Build pretree (used to encode the other trees)
-        # Create a valid Huffman tree that satisfies Kraft inequality
-        # For 20 symbols, use: 2@3bits + 6@4bits + 12@5bits = 1.0
-        @pretree_lengths = Array.new(PRETREE_MAXSYMBOLS, 0)
-        # Most common symbols (0-1): 3 bits
-        (0..1).each { |i| @pretree_lengths[i] = 3 }
-        # Common symbols (2-7): 4 bits
-        (2..7).each { |i| @pretree_lengths[i] = 4 }
-        # Less common symbols (8-19): 5 bits
-        (8..19).each { |i| @pretree_lengths[i] = 5 }
        @pretree_codes = Huffman::Encoder.build_codes(@pretree_lengths,
                                                      PRETREE_MAXSYMBOLS)
      end
 
-      #
+      # Calculate pretree symbol frequencies
      #
-      #
-      #
-      #
-
-
-
-
+      # The pretree encodes the code lengths of the main and length trees.
+      # This method simulates the tree encoding process to determine which
+      # pretree symbols will be needed.
+      #
+      # @return [Array<Integer>] Frequency array for pretree symbols (0-19)
+      def calculate_pretree_frequencies
+        pretree_freq = Array.new(PRETREE_MAXSYMBOLS, 0)
 
-        #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Count symbols needed to encode main tree (two parts)
+        count_pretree_symbols(@maintree_lengths, 0, NUM_CHARS, pretree_freq)
+        count_pretree_symbols(@maintree_lengths, NUM_CHARS,
+                              @maintree_maxsymbols, pretree_freq)
+
+        # Count symbols needed to encode length tree
+        count_pretree_symbols(@length_lengths, 0, NUM_SECONDARY_LENGTHS,
+                              pretree_freq)
+
+        pretree_freq
+      end
+
+      # Count pretree symbols needed to encode a tree
+      #
+      # This simulates the write_tree_with_pretree encoding process to count
+      # which pretree symbols will be used, allowing us to build an optimal
+      # pretree.
+      #
+      # @param lengths [Array<Integer>] Tree lengths to encode
+      # @param start [Integer] Start index
+      # @param end_idx [Integer] End index (exclusive)
+      # @param freq [Array<Integer>] Frequency array to update
+      # @return [void]
+      def count_pretree_symbols(lengths, start, end_idx, freq)
+        i = start
+        prev_length = 0
+
+        while i < end_idx
+          length = lengths[i]
+
+          if length.zero?
+            # Count run of zeros
+            zero_count = 0
+            while i < end_idx && lengths[i].zero? && zero_count < 138
+              zero_count += 1
+              i += 1
+            end
+
+            # Encode long runs with symbol 18
+            if zero_count >= 20
+              while zero_count >= 20
+                run = [zero_count, 51].min
+                freq[18] += 1
+                zero_count -= run
+              end
+            end
+
+            # Encode medium runs with symbol 17
+            if zero_count >= 4
+              run = [zero_count, 19].min
+              freq[17] += 1
+              zero_count -= run
+            end
+
+            # Encode remaining short runs as deltas
+            if zero_count.positive?
+              zero_count.times do
+                delta = (17 - prev_length) % 17
+                freq[delta] += 1
+                prev_length = 0
+              end
+            end
+          else
+            # Encode as delta from previous length
+            delta = (length - prev_length) % 17
+            freq[delta] += 1
+            prev_length = length
+            i += 1
+          end
        end
+      end
 
-
+      # Calculate code lengths by traversing Huffman tree
+      #
+      # @param node [Array] Tree node [freq, symbol, left, right, depth]
+      # @param depth [Integer] Current depth
+      # @param lengths [Array<Integer>] Output array for lengths
+      # @return [void]
+      def calculate_depths(node, depth, lengths)
+        return unless node
+
+        _, symbol, left, right, = node
+
+        if symbol.nil?
+          # Internal node: recurse to children
+          calculate_depths(left, depth + 1, lengths)
+          calculate_depths(right, depth + 1, lengths)
+        else
+          # Leaf node: record length
+          lengths[symbol] = depth
+        end
+      end
+
+      # Calculate code lengths by traversing Huffman tree
+      #
+      # @param node [Array] Tree node [freq, symbol, left, right]
+      # @param depth [Integer] Current depth
+      # @param lengths [Array<Integer>] Output array for lengths
+      # @return [void]
+      def calculate_code_lengths(node, depth, lengths)
+        return unless node
+
+        _, symbol, left, right = node
+
+        if symbol.nil?
+          # Internal node: recurse to children
+          calculate_code_lengths(left, depth + 1, lengths)
+          calculate_code_lengths(right, depth + 1, lengths)
+        else
+          # Leaf node: record length
+          lengths[symbol] = depth
+        end
      end
 
      # Write block header
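The Kraft inequality remark in `build_tree_lengths` above is easy to verify independently: giving every used symbol the length ceil(log2(count)) and padding with dummy symbols up to 2**bit_length makes the Kraft sum exactly 1.0, so the lengths always describe a decodable (if suboptimal) prefix code. A small standalone check, not gem code:

    # Mirror of the equal-length assignment used by build_tree_lengths above.
    def equal_lengths(freqs)
      used = freqs.each_index.select { |i| freqs[i].positive? }
      bit_length = 1
      bit_length += 1 while (1 << bit_length) < used.size
      lengths = Array.new(freqs.size, 0)
      used.each { |sym| lengths[sym] = bit_length }
      need = (1 << bit_length) - used.size   # dummy symbols to complete the tree
      freqs.each_index do |i|
        break if need.zero?
        next unless lengths[i].zero?
        lengths[i] = bit_length
        need -= 1
      end
      lengths
    end

    lengths = equal_lengths([5, 0, 9, 1, 0, 3, 0, 2])  # 5 used symbols -> 3-bit codes
    lengths                                      # => [3, 3, 3, 3, 3, 3, 3, 3]
    lengths.reject(&:zero?).sum { |l| 2.0**-l }  # => 1.0 (Kraft sum)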
@@ -56,7 +56,7 @@ module Cabriolet
       # @param input [System::FileHandle, System::MemoryHandle] Input handle
       # @param output [System::FileHandle, System::MemoryHandle] Output handle
       # @param buffer_size [Integer] Buffer size for I/O operations
-      def initialize(io_system, input, output, buffer_size)
+      def initialize(io_system, input, output, buffer_size, **_kwargs)
        super
 
        # Initialize bitstream writer
@@ -88,10 +88,15 @@ module Cabriolet
 
        # Process data in FRAME_SIZE chunks
        # Each frame is independent and contains blocks ending with last_block=1
+        frame_num = 0
        while pos < input_data.bytesize
          chunk_size = [FRAME_SIZE, input_data.bytesize - pos].min
          chunk = input_data[pos, chunk_size]
 
+          if ENV["DEBUG_MSZIP_COMPRESS"]
+            warn "DEBUG compress: Frame #{frame_num}: pos=#{pos}, chunk_size=#{chunk_size}"
+          end
+
          # Write CK signature
          write_signature
 
@@ -99,11 +104,19 @@ module Cabriolet
          # Each frame's block is always marked as last within that frame
          compress_block(chunk, true)
 
+          # Flush bitstream after each frame to ensure data is written
+          @bitstream.flush
+
+          if ENV["DEBUG_MSZIP_COMPRESS"]
+            warn "DEBUG compress: Frame #{frame_num} complete, flushed"
+          end
+
          pos += chunk_size
          total_written += chunk_size
+          frame_num += 1
        end
 
-        #
+        # Final flush (may not be needed now but keep for safety)
        @bitstream.flush
 
        total_written
@@ -129,8 +142,19 @@ module Cabriolet
      #
      # @return [void]
      def write_signature
+        if ENV["DEBUG_MSZIP_COMPRESS"]
+          warn "DEBUG write_signature: ENTRY"
+        end
        @bitstream.byte_align
-        SIGNATURE.each
+        SIGNATURE.each do |byte|
+          if ENV["DEBUG_MSZIP_COMPRESS"]
+            warn "DEBUG write_signature: Writing byte 0x#{byte.to_s(16)}"
+          end
+          @bitstream.write_raw_byte(byte)
+        end
+        if ENV["DEBUG_MSZIP_COMPRESS"]
+          warn "DEBUG write_signature: EXIT"
+        end
      end
 
      # Compress a single block using fixed Huffman encoding
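For context on the frame loop and signature writer above: MSZIP frames cover at most 32 KB of input each, and every frame begins with the two-byte "CK" signature followed by a DEFLATE block marked as final. A small illustrative sketch; the concrete values of SIGNATURE and FRAME_SIZE are assumptions consistent with the MSZIP format, not read from the gem:

    SIGNATURE = [0x43, 0x4B]  # "CK", assumed value of the constant used above
    FRAME_SIZE = 32_768       # assumed 32 KB frame size, as described in the comments

    # Each frame holds at most FRAME_SIZE input bytes, so a 100 KB input
    # produces four "CK"-prefixed frames, each ending a fixed-Huffman block.
    def frame_count(input_size)
      (input_size + FRAME_SIZE - 1) / FRAME_SIZE
    end

    frame_count(100_000)  # => 4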
@@ -139,6 +163,10 @@ module Cabriolet
      # @param is_last [Boolean] Whether this is the last block
      # @return [void]
      def compress_block(data, is_last)
+        if ENV["DEBUG_MSZIP_COMPRESS"]
+          warn "DEBUG compress_block: ENTRY data_size=#{data.bytesize} is_last=#{is_last}"
+        end
+
        # Write block header
        @bitstream.write_bits(is_last ? 1 : 0, 1) # Last block flag
        @bitstream.write_bits(FIXED_HUFFMAN_BLOCK, 2) # Block type
@@ -151,6 +179,10 @@ module Cabriolet
 
        # Write end-of-block symbol (256)
        encode_literal(256)
+
+        if ENV["DEBUG_MSZIP_COMPRESS"]
+          warn "DEBUG compress_block: EXIT"
+        end
      end
 
      # Encode data using LZ77 matching and Huffman encoding