omnizip 0.3.2 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +243 -368
- data/README.adoc +101 -5
- data/docs/guides/archive-formats/index.adoc +31 -1
- data/docs/guides/archive-formats/ole-format.adoc +316 -0
- data/docs/guides/archive-formats/rpm-format.adoc +249 -0
- data/docs/index.adoc +12 -2
- data/lib/omnizip/algorithms/lzma/distance_coder.rb +29 -18
- data/lib/omnizip/algorithms/lzma/encoder.rb +2 -1
- data/lib/omnizip/algorithms/lzma/length_coder.rb +6 -3
- data/lib/omnizip/algorithms/lzma/literal_decoder.rb +2 -1
- data/lib/omnizip/algorithms/lzma/lzip_decoder.rb +40 -13
- data/lib/omnizip/algorithms/lzma/range_decoder.rb +36 -2
- data/lib/omnizip/algorithms/lzma/range_encoder.rb +19 -0
- data/lib/omnizip/algorithms/lzma/xz_encoder_fast.rb +2 -1
- data/lib/omnizip/algorithms/lzma/xz_utils_decoder.rb +148 -112
- data/lib/omnizip/algorithms/lzma.rb +20 -5
- data/lib/omnizip/algorithms/ppmd7/decoder.rb +25 -21
- data/lib/omnizip/algorithms/ppmd7/encoder.rb +4 -11
- data/lib/omnizip/algorithms/sevenzip_lzma2.rb +2 -1
- data/lib/omnizip/algorithms/xz_lzma2.rb +2 -1
- data/lib/omnizip/algorithms/zstandard/constants.rb +125 -9
- data/lib/omnizip/algorithms/zstandard/decoder.rb +202 -17
- data/lib/omnizip/algorithms/zstandard/encoder.rb +197 -17
- data/lib/omnizip/algorithms/zstandard/frame/block.rb +128 -0
- data/lib/omnizip/algorithms/zstandard/frame/header.rb +224 -0
- data/lib/omnizip/algorithms/zstandard/fse/bitstream.rb +186 -0
- data/lib/omnizip/algorithms/zstandard/fse/encoder.rb +325 -0
- data/lib/omnizip/algorithms/zstandard/fse/table.rb +269 -0
- data/lib/omnizip/algorithms/zstandard/huffman.rb +272 -0
- data/lib/omnizip/algorithms/zstandard/huffman_encoder.rb +339 -0
- data/lib/omnizip/algorithms/zstandard/literals.rb +178 -0
- data/lib/omnizip/algorithms/zstandard/literals_encoder.rb +251 -0
- data/lib/omnizip/algorithms/zstandard/sequences.rb +346 -0
- data/lib/omnizip/buffer/memory_extractor.rb +3 -3
- data/lib/omnizip/buffer.rb +2 -2
- data/lib/omnizip/filters/delta.rb +2 -1
- data/lib/omnizip/filters/registry.rb +6 -6
- data/lib/omnizip/formats/cpio/bounded_io.rb +66 -0
- data/lib/omnizip/formats/lzip.rb +2 -1
- data/lib/omnizip/formats/lzma_alone.rb +2 -1
- data/lib/omnizip/formats/ole/allocation_table.rb +244 -0
- data/lib/omnizip/formats/ole/constants.rb +61 -0
- data/lib/omnizip/formats/ole/dirent.rb +380 -0
- data/lib/omnizip/formats/ole/header.rb +198 -0
- data/lib/omnizip/formats/ole/ranges_io.rb +264 -0
- data/lib/omnizip/formats/ole/storage.rb +305 -0
- data/lib/omnizip/formats/ole/types/variant.rb +328 -0
- data/lib/omnizip/formats/ole.rb +145 -0
- data/lib/omnizip/formats/rar/compression/ppmd/decoder.rb +92 -49
- data/lib/omnizip/formats/rar/compression/ppmd/encoder.rb +13 -20
- data/lib/omnizip/formats/rar/rar5/compression/lzss.rb +6 -2
- data/lib/omnizip/formats/rar3/reader.rb +6 -2
- data/lib/omnizip/formats/rar5/reader.rb +4 -1
- data/lib/omnizip/formats/rpm/constants.rb +58 -0
- data/lib/omnizip/formats/rpm/entry.rb +102 -0
- data/lib/omnizip/formats/rpm/header.rb +113 -0
- data/lib/omnizip/formats/rpm/lead.rb +122 -0
- data/lib/omnizip/formats/rpm/tag.rb +230 -0
- data/lib/omnizip/formats/rpm.rb +434 -0
- data/lib/omnizip/formats/seven_zip/bcj2_stream_decompressor.rb +239 -0
- data/lib/omnizip/formats/seven_zip/coder_chain.rb +32 -8
- data/lib/omnizip/formats/seven_zip/constants.rb +1 -1
- data/lib/omnizip/formats/seven_zip/reader.rb +84 -8
- data/lib/omnizip/formats/seven_zip/stream_compressor.rb +2 -1
- data/lib/omnizip/formats/seven_zip/stream_decompressor.rb +6 -0
- data/lib/omnizip/formats/seven_zip/writer.rb +21 -9
- data/lib/omnizip/formats/seven_zip.rb +10 -0
- data/lib/omnizip/formats/xar/entry.rb +18 -5
- data/lib/omnizip/formats/xar/header.rb +34 -6
- data/lib/omnizip/formats/xar/reader.rb +43 -10
- data/lib/omnizip/formats/xar/toc.rb +34 -21
- data/lib/omnizip/formats/xar/writer.rb +15 -5
- data/lib/omnizip/formats/xz_impl/block_decoder.rb +45 -33
- data/lib/omnizip/formats/xz_impl/block_encoder.rb +2 -1
- data/lib/omnizip/formats/xz_impl/index_decoder.rb +3 -1
- data/lib/omnizip/formats/xz_impl/stream_header_parser.rb +2 -1
- data/lib/omnizip/formats/zip/end_of_central_directory.rb +4 -3
- data/lib/omnizip/implementations/seven_zip/lzma/decoder.rb +14 -6
- data/lib/omnizip/implementations/seven_zip/lzma/encoder.rb +2 -1
- data/lib/omnizip/implementations/seven_zip/lzma2/encoder.rb +28 -13
- data/lib/omnizip/implementations/xz_utils/lzma2/encoder.rb +13 -6
- data/lib/omnizip/pipe/stream_compressor.rb +1 -1
- data/lib/omnizip/version.rb +1 -1
- data/readme-docs/compression-algorithms.adoc +6 -2
- metadata +30 -2
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Copyright (C) 2025 Ribose Inc.
|
|
4
|
+
#
|
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining a
|
|
6
|
+
# copy of this software and associated documentation files (the "Software"),
|
|
7
|
+
# to deal in the Software without restriction, including without limitation
|
|
8
|
+
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
9
|
+
# and/or sell copies of the Software, and to permit persons to whom the
|
|
10
|
+
# Software is furnished to do so, subject to the following conditions:
|
|
11
|
+
#
|
|
12
|
+
# The above copyright notice and this permission notice shall be included in
|
|
13
|
+
# all copies or substantial portions of the Software.
|
|
14
|
+
#
|
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
20
|
+
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
21
|
+
# DEALINGS IN THE SOFTWARE.
|
|
22
|
+
|
|
23
|
+
require_relative "constants"
|
|
24
|
+
require_relative "fse/bitstream"
|
|
25
|
+
|
|
26
|
+
module Omnizip
|
|
27
|
+
module Algorithms
|
|
28
|
+
class Zstandard
|
|
29
|
+
# Huffman decoding for Zstandard (RFC 8878 Section 4.2)
|
|
30
|
+
#
|
|
31
|
+
# Zstandard uses FSE-compressed Huffman weights followed by
|
|
32
|
+
# canonical Huffman decoding.
|
|
33
|
+
class Huffman
|
|
34
|
+
include Constants
|
|
35
|
+
|
|
36
|
+
# @return [Hash<Integer, Array<Integer>>] Code to symbol mapping
|
|
37
|
+
attr_reader :decode_table
|
|
38
|
+
|
|
39
|
+
# @return [Integer] Maximum code length
|
|
40
|
+
attr_reader :max_bits
|
|
41
|
+
|
|
42
|
+
# Build Huffman table from weights
|
|
43
|
+
#
|
|
44
|
+
# @param weights [Array<Integer>] Symbol weights (0 means not present)
|
|
45
|
+
# @param max_bits [Integer] Maximum code length
|
|
46
|
+
# @return [Huffman] Built Huffman decoder
|
|
47
|
+
def self.build_from_weights(weights, max_bits = HUFFMAN_MAX_BITS)
  # Pipeline: weights -> per-symbol bit lengths -> canonical code values.
  lengths = calculate_code_lengths(weights, max_bits)
  symbol_codes = build_canonical_codes(lengths)

  # Invert symbol -> code into code -> [symbol, bit length]; symbols whose
  # length is nil or zero are absent from the alphabet and are skipped.
  table = {}
  lengths.each_with_index do |bit_length, symbol|
    table[symbol_codes[symbol]] = [symbol, bit_length] unless bit_length.nil? || bit_length.zero?
  end

  new(table, max_bits)
end
|
|
65
|
+
|
|
66
|
+
# Calculate code lengths from weights
|
|
67
|
+
#
|
|
68
|
+
# Weight 0 means symbol is not present.
|
|
69
|
+
# Higher weights mean shorter codes.
|
|
70
|
+
#
|
|
71
|
+
# @param weights [Array<Integer>] Symbol weights
|
|
72
|
+
# @param max_bits [Integer] Maximum code length
|
|
73
|
+
# @return [Array<Integer>] Code lengths
|
|
74
|
+
def self.calculate_code_lengths(weights, max_bits)
  # Converts Zstandard Huffman weights into code lengths.
  #
  # Per RFC 8878 section 4.2.1 each weight w > 0 contributes 2^(w - 1) to a
  # total that completes to a power of two; the exponent of that power is
  # the tree depth, and a symbol's length is depth + 1 - w. Using the
  # largest weight as the depth is only right when some symbol has a 1-bit
  # code: four symbols of weight 1 need 2-bit codes, not 1-bit codes.
  #
  # The list may omit the implicit final weight: a total that is not a
  # power of two is rounded up. A total that is already a power of two is
  # treated as complete.
  return [] if weights.nil? || weights.empty?

  present = weights.compact.select(&:positive?)
  return Array.new(weights.length, 0) if present.empty?

  total = present.sum { |weight| 1 << (weight - 1) }
  # ceil(log2(total)); never below the largest weight, so a lone symbol
  # still gets a 1-bit code instead of a 0-bit one.
  depth = [(total - 1).bit_length, present.max].max

  weights.map do |weight|
    next 0 if weight.nil? || !weight.positive?

    [depth + 1 - weight, max_bits].min
  end
end
|
|
90
|
+
|
|
91
|
+
# Build canonical Huffman codes from lengths
|
|
92
|
+
#
|
|
93
|
+
# @param code_lengths [Array<Integer>] Code lengths for each symbol
|
|
94
|
+
# @return [Hash<Integer, Integer>] Symbol to code mapping
|
|
95
|
+
def self.build_canonical_codes(code_lengths)
  return {} if code_lengths.nil? || code_lengths.empty?

  longest = code_lengths.compact.max || 0

  # How many symbols use each bit length (index = length).
  per_length = Array.new(longest + 1, 0)
  code_lengths.each { |len| per_length[len] += 1 if len&.positive? }

  # First code value of each length: previous first code plus the number of
  # codes at the previous length, shifted left by one bit.
  first_code = Array.new(longest + 1, 0)
  running = 0
  1.upto(longest) do |len|
    running = (running + per_length[len - 1]) << 1
    first_code[len] = running
  end

  # Hand out consecutive values within each length, in symbol order.
  code_lengths.each_with_index.with_object({}) do |(len, symbol), assigned|
    next if len.nil? || len.zero?

    assigned[symbol] = first_code[len]
    first_code[len] += 1
  end
end
|
|
125
|
+
|
|
126
|
+
# Initialize Huffman decoder
|
|
127
|
+
#
|
|
128
|
+
# @param decode_table [Hash] Code to [symbol, length] mapping
|
|
129
|
+
# @param max_bits [Integer] Maximum code length
|
|
130
|
+
def initialize(decode_table, max_bits)
  @decode_table, @max_bits = decode_table, max_bits

  # Derive @lookup_table (codes padded out to max_bits) from the table
  # just stored; #decode consults it as its fallback.
  build_lookup_table
end
|
|
137
|
+
|
|
138
|
+
# Decode a symbol from bitstream
|
|
139
|
+
#
|
|
140
|
+
# @param bitstream [FSE::ForwardBitStream] The bitstream to read from
|
|
141
|
+
# @return [Integer] Decoded symbol
|
|
142
|
+
def decode(bitstream)
  # An empty table cannot decode anything; symbol 0 is the placeholder.
  return 0 if @lookup_table.nil? || @lookup_table.empty?

  prefix = 0
  1.upto(@max_bits || 1) do |consumed|
    prefix = (prefix << 1) | read_single_bit_forward(bitstream)

    # A hit only counts when the stored length equals the number of bits
    # read so far; otherwise the value merely coincides with a code of a
    # different length.
    entry = @decode_table[prefix]
    return entry[0] if entry && entry[1] == consumed
  end

  # No exact-length match after max_bits bits: fall back to the padded
  # lookup, then to symbol 0.
  @lookup_table[prefix] || 0
end
|
|
169
|
+
|
|
170
|
+
private
|
|
171
|
+
|
|
172
|
+
# Build lookup table for fast decoding
|
|
173
|
+
def build_lookup_table
  @lookup_table = {}
  return if @decode_table.nil? || @decode_table.empty?

  width = @max_bits || 1
  @decode_table.each do |code, (symbol, length)|
    # Codes longer than the table width cannot be represented; skip them.
    spare = width - length
    next if spare.negative?

    # Every width-bit value that starts with this code maps to its symbol.
    base = code << spare
    (0...(1 << spare)).each { |suffix| @lookup_table[base | suffix] = symbol }
  end
end
|
|
189
|
+
|
|
190
|
+
# Read a single bit in forward order (MSB first)
|
|
191
|
+
def read_single_bit_forward(bitstream)
  # Thin indirection over the stream: request exactly one bit and pass the
  # result through unchanged.
  width = 1
  bitstream.read_bits(width)
end
|
|
194
|
+
end
|
|
195
|
+
|
|
196
|
+
# Huffman table reader (RFC 8878 Section 4.2.1)
|
|
197
|
+
#
|
|
198
|
+
# Reads compressed Huffman table description from input.
|
|
199
|
+
class HuffmanTableReader
|
|
200
|
+
include Constants
|
|
201
|
+
|
|
202
|
+
# Read Huffman table from input
|
|
203
|
+
#
|
|
204
|
+
# @param input [IO] Input stream positioned at Huffman description
|
|
205
|
+
# @return [Huffman] Huffman decoder
|
|
206
|
+
def self.read(input)
  # Convenience wrapper: a throwaway reader bound to +input+ parses one
  # table description and its result is returned directly.
  new(input).read_table
end
|
|
210
|
+
|
|
211
|
+
def initialize(input)
  # Source stream; read_table and read_raw_weights pull single bytes from
  # it with #read(1).
  @input = input
end
|
|
214
|
+
|
|
215
|
+
# Read and build Huffman table
|
|
216
|
+
#
|
|
217
|
+
# @return [Huffman] Huffman decoder
|
|
218
|
+
def read_table
  # The first byte selects the weight representation. Fail with a clear
  # error when the stream is already exhausted instead of calling #ord on
  # nil (or on an empty string).
  header_byte = @input.read(1)
  if header_byte.nil? || header_byte.empty?
    raise EOFError, "Huffman table description is missing its header byte"
  end

  header = header_byte.ord

  # NOTE(review): RFC 8878 section 4.2.1.1 assigns header bytes below 128
  # to FSE-compressed weights and 128 and above to direct weights, which is
  # the reverse of this bit-7 test. Confirm against the matching encoder
  # before changing the dispatch.
  if header.anybits?(0x80)
    read_fse_compressed_weights(header)
  else
    read_raw_weights(header)
  end
end
|
|
231
|
+
|
|
232
|
+
private
|
|
233
|
+
|
|
234
|
+
# Read FSE-compressed weights
|
|
235
|
+
def read_fse_compressed_weights(header)
  # FIXME: FSE decoding of the weight stream is not implemented. +header+
  # is currently ignored and nothing is read from the input.
  #
  # Placeholder result: 256 symbols, all with weight 0 (absent). The
  # resulting decoder has an empty table, so it decodes every symbol as 0.
  weights = Array.new(256, 0)

  Huffman.build_from_weights(weights, HUFFMAN_MAX_BITS)
end
|
|
250
|
+
|
|
251
|
+
# Read raw (uncompressed) weights
|
|
252
|
+
def read_raw_weights(header)
  # The low six bits of the header give the weight count; 0 stands for 256.
  count = header & 0x3F
  count = 256 if count.zero?

  # One byte per weight. Reads past the end of the input yield weight 0
  # rather than an error.
  weights = Array.new(count) { @input.read(1)&.ord || 0 }

  Huffman.build_from_weights(weights, HUFFMAN_MAX_BITS)
end
|
|
269
|
+
end
|
|
270
|
+
end
|
|
271
|
+
end
|
|
272
|
+
end
|
|
@@ -0,0 +1,339 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Copyright (C) 2025 Ribose Inc.
|
|
4
|
+
#
|
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining a
|
|
6
|
+
# copy of this software and associated documentation files (the "Software"),
|
|
7
|
+
# to deal in the Software without restriction, including without limitation
|
|
8
|
+
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
9
|
+
# and/or sell copies of the Software, and to permit persons to whom the
|
|
10
|
+
# Software is furnished to do so, subject to the following conditions:
|
|
11
|
+
#
|
|
12
|
+
# The above copyright notice and this permission notice shall be included in
|
|
13
|
+
# all copies or substantial portions of the Software.
|
|
14
|
+
#
|
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
20
|
+
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
21
|
+
# DEALINGS IN THE SOFTWARE.
|
|
22
|
+
|
|
23
|
+
require_relative "constants"
|
|
24
|
+
require_relative "fse/encoder"
|
|
25
|
+
|
|
26
|
+
module Omnizip
|
|
27
|
+
module Algorithms
|
|
28
|
+
class Zstandard
|
|
29
|
+
# Huffman Encoder for Zstandard (RFC 8878 Section 4.2)
|
|
30
|
+
#
|
|
31
|
+
# Encodes literals using Huffman coding with FSE-compressed weights.
|
|
32
|
+
class HuffmanEncoder
|
|
33
|
+
include Constants
|
|
34
|
+
|
|
35
|
+
# @return [Array<Integer>] Code lengths for each symbol
|
|
36
|
+
attr_reader :code_lengths
|
|
37
|
+
|
|
38
|
+
# @return [Hash<Integer, Integer>] Symbol to code mapping
|
|
39
|
+
attr_reader :codes
|
|
40
|
+
|
|
41
|
+
# @return [Integer] Maximum code length
|
|
42
|
+
attr_reader :max_bits
|
|
43
|
+
|
|
44
|
+
# Build Huffman encoder from symbol frequencies
|
|
45
|
+
#
|
|
46
|
+
# @param frequencies [Array<Integer>] Symbol frequencies
|
|
47
|
+
# @param max_bits [Integer] Maximum code length (default 11)
|
|
48
|
+
# @return [HuffmanEncoder] Huffman encoder
|
|
49
|
+
def self.build_from_frequencies(frequencies,
                                max_bits = HUFFMAN_MAX_BITS)
  return if frequencies.nil? || frequencies.empty?

  # Tree-derived lengths, then the max_bits limiter, then canonical codes.
  lengths = limit_code_lengths(build_huffman_lengths(frequencies, max_bits), max_bits)

  new(lengths, build_canonical_codes(lengths), max_bits)
end
|
|
64
|
+
|
|
65
|
+
# Build Huffman code lengths using package-merge algorithm
|
|
66
|
+
#
|
|
67
|
+
# @param frequencies [Array<Integer>] Symbol frequencies
|
|
68
|
+
# @param max_bits [Integer] Maximum code length
|
|
69
|
+
# @return [Array<Integer>] Code lengths
|
|
70
|
+
def self.build_huffman_lengths(frequencies, max_bits)
  return [] if frequencies.nil? || frequencies.empty?

  # One slot per symbol; symbols that never occur keep length 0.
  lengths = Array.new(frequencies.length, 0)

  # [frequency, symbol] pairs for the symbols that actually occur.
  occurring = frequencies.each_with_index.select { |count, _symbol| count&.positive? }
  return lengths if occurring.empty?

  # build_tree_lengths fills +lengths+ in place.
  build_tree_lengths(occurring.sort_by(&:first), lengths, max_bits)
  lengths
end
|
|
92
|
+
|
|
93
|
+
# Build code lengths using tree approach
|
|
94
|
+
def self.build_tree_lengths(symbols_with_freq, code_lengths, max_bits)
  # Builds a Huffman tree from [frequency, symbol] pairs and writes each
  # symbol's depth into +code_lengths+ (mutated in place).
  return if symbols_with_freq.empty?

  nodes = symbols_with_freq.map do |freq, sym|
    { freq: freq, symbol: sym, left: nil, right: nil }
  end

  # Repeatedly merge the two least frequent nodes until one root remains.
  while nodes.length > 1
    nodes.sort_by! { |n| n[:freq] }

    left = nodes.shift
    right = nodes.shift

    nodes << { freq: left[:freq] + right[:freq], symbol: nil, left: left, right: right }
  end

  # Exactly one node is left here, so no separate single-symbol branch is
  # needed: a lone leaf reaches assign_lengths at depth 0, which gives it
  # length 1.
  assign_lengths(nodes.first, 0, code_lengths, max_bits)
end
|
|
130
|
+
|
|
131
|
+
# Recursively assign code lengths to symbols
|
|
132
|
+
def self.assign_lengths(node, depth, code_lengths, max_bits)
  return unless node

  # Depth is clamped to max_bits on the way down, so no recorded length
  # ever exceeds it.
  level = [max_bits, depth].min
  leaf_symbol = node[:symbol]

  if leaf_symbol
    # A leaf at depth 0 (single-symbol tree) still needs a 1-bit code.
    code_lengths[leaf_symbol] = [level, 1].max
  else
    assign_lengths(node[:left], level + 1, code_lengths, max_bits)
    assign_lengths(node[:right], level + 1, code_lengths, max_bits)
  end
end
|
|
146
|
+
|
|
147
|
+
# Limit code lengths to maximum
|
|
148
|
+
#
|
|
149
|
+
# Uses the package-merge algorithm concept to limit lengths.
|
|
150
|
+
#
|
|
151
|
+
# @param code_lengths [Array<Integer>] Original code lengths
|
|
152
|
+
# @param max_bits [Integer] Maximum code length
|
|
153
|
+
# @return [Array<Integer>] Limited code lengths
|
|
154
|
+
def self.limit_code_lengths(code_lengths, max_bits)
  # Returns lengths that are all <= max_bits AND satisfy the Kraft
  # inequality (sum of 2^-length <= 1), so a prefix-free code exists.
  #
  # Checking only the maximum length is not enough: lengths that were
  # clamped to max_bits upstream arrive here already "short enough" yet
  # over-subscribed.
  return code_lengths if code_lengths.nil? || code_lengths.empty?

  capped = code_lengths.map { |len| [len.to_i, max_bits].min }

  # Kraft sum scaled by 2^max_bits so it stays in integers.
  kraft_sum = capped.sum { |len| len.positive? ? 1 << (max_bits - len) : 0 }
  return redistribute_lengths(capped, max_bits) if kraft_sum > (1 << max_bits)

  # Nothing to repair: hand back the caller's array when it was untouched.
  capped == code_lengths ? code_lengths : capped
end

# Redistribute lengths to satisfy Kraft inequality
def self.redistribute_lengths(lengths, max_bits)
  # Heuristic repair (not an optimal length-limited code):
  #   1. lengthen the longest codes still below max_bits until the scaled
  #      Kraft sum fits the budget;
  #   2. spend any slack that step 1 overshot by shortening codes again,
  #      preferring symbols whose input length was shortest.
  # Input that already fits is only clamped to max_bits.
  #
  # Raises ArgumentError when even max_bits-long codes for every symbol do
  # not fit, i.e. there are more than 2^max_bits symbols.
  budget = 1 << max_bits
  fixed = lengths.map { |len| [len.to_i, max_bits].min }
  used = fixed.sum { |len| len.positive? ? 1 << (max_bits - len) : 0 }
  return fixed if used <= budget

  while used > budget
    victim = fixed.each_index
                  .select { |i| fixed[i].positive? && fixed[i] < max_bits }
                  .max_by { |i| fixed[i] }
    unless victim
      raise ArgumentError,
            "cannot build a prefix code for #{fixed.count(&:positive?)} symbols within #{max_bits} bits"
    end

    # Going from length l to l + 1 halves that symbol's share.
    used -= 1 << (max_bits - fixed[victim] - 1)
    fixed[victim] += 1
  end

  loop do
    slack = budget - used
    break if slack.zero?

    # Going from length l to l - 1 adds another 2^(max_bits - l).
    winner = fixed.each_index
                  .select { |i| fixed[i] > 1 && (1 << (max_bits - fixed[i])) <= slack }
                  .min_by { |i| [lengths[i].to_i, i] }
    break unless winner

    used += 1 << (max_bits - fixed[winner])
    fixed[winner] -= 1
  end

  fixed
end
|
|
184
|
+
|
|
185
|
+
# Build canonical Huffman codes from lengths
|
|
186
|
+
#
|
|
187
|
+
# @param code_lengths [Array<Integer>] Code lengths for each symbol
|
|
188
|
+
# @return [Hash<Integer, Integer>] Symbol to code mapping
|
|
189
|
+
def self.build_canonical_codes(code_lengths)
  codes = {}
  return codes if code_lengths.nil? || code_lengths.empty?

  deepest = code_lengths.compact.max || 0
  return codes if deepest.zero?

  # population[len] = number of symbols coded with +len+ bits.
  population = Array.new(deepest + 1, 0)
  code_lengths.each { |len| population[len] += 1 if len&.positive? }

  # cursor[len] = next unused code value of that length. Each length starts
  # at (previous start + previous population) shifted left by one bit.
  cursor = [0]
  (1..deepest).each do |len|
    cursor << ((cursor[len - 1] + population[len - 1]) << 1)
  end

  # Consecutive values within a length, handed out in symbol order.
  code_lengths.each_with_index do |len, symbol|
    next if len.nil? || len.zero?

    codes[symbol] = cursor[len]
    cursor[len] += 1
  end

  codes
end
|
|
220
|
+
|
|
221
|
+
# Initialize Huffman encoder
|
|
222
|
+
#
|
|
223
|
+
# @param code_lengths [Array<Integer>] Code lengths
|
|
224
|
+
# @param codes [Hash<Integer, Integer>] Symbol to code mapping
|
|
225
|
+
# @param max_bits [Integer] Maximum code length
|
|
226
|
+
def initialize(code_lengths, codes, max_bits)
  @code_lengths = code_lengths
  @codes = codes
  @max_bits = max_bits

  # Per-symbol lookups used by #encode: a private copy of symbol -> code
  # and the matching symbol -> bit length.
  @symbol_code = codes.each_with_object({}) { |(symbol, code), memo| memo[symbol] = code }
  @symbol_length = codes.each_with_object({}) do |(symbol, _code), memo|
    memo[symbol] = code_lengths[symbol]
  end
end
|
|
240
|
+
|
|
241
|
+
# Encode data using Huffman codes
|
|
242
|
+
#
|
|
243
|
+
# @param data [String] Data to encode
|
|
244
|
+
# @return [String] Encoded bitstream
|
|
245
|
+
def encode(data)
  return "" if data.nil? || data.empty?

  bit_queue = data.each_byte.flat_map do |byte|
    code = @symbol_code[byte]
    width = @symbol_length[byte]

    # NOTE(review): bytes without a code are dropped silently, so the
    # output cannot reproduce them - confirm callers never pass such bytes.
    next [] unless code && width

    # Most significant bit of the code first.
    (width - 1).downto(0).map { |shift| (code >> shift) & 1 }
  end

  bits_to_bytes(bit_queue)
end
|
|
266
|
+
|
|
267
|
+
# Encode Huffman table description for Zstandard
|
|
268
|
+
#
|
|
269
|
+
# Zstandard compresses Huffman weights using FSE.
|
|
270
|
+
#
|
|
271
|
+
# @return [String] Encoded Huffman table description
|
|
272
|
+
def encode_table_description
  # Absent symbols (nil or zero length) get weight 0; every other symbol
  # gets @max_bits - length + 1, so shorter codes carry larger weights.
  weights = @code_lengths.map do |length|
    if length.nil? || length.zero?
      0
    else
      @max_bits - length + 1
    end
  end

  encode_weights_fse(weights)
end
|
|
283
|
+
|
|
284
|
+
private
|
|
285
|
+
|
|
286
|
+
# Encode weights using FSE compression
|
|
287
|
+
def encode_weights_fse(weights)
  # Despite the name, no FSE compression happens here: the non-zero
  # weights are written as one raw byte each after a small header.
  # NOTE(review): zero weights are dropped, so symbol positions are not
  # recoverable from the output - confirm the intended reader.
  present = weights.select(&:positive?)
  count = present.length

  # No symbols at all: a single zero byte.
  return "\x00" if count.zero?

  header_bytes =
    if count <= 127
      # Bit 7 set, low seven bits carry the count.
      [0x80 | count].pack("C")
    else
      # Marker byte 0xFF followed by the count in one byte.
      # NOTE(review): a count of 256 wraps to 0 in that byte.
      [0x80 | 127, count].pack("CC")
    end

  header_bytes + present.pack("C*")
end
|
|
316
|
+
|
|
317
|
+
# Convert bit array to bytes
|
|
318
|
+
def bits_to_bytes(bits)
  # Zero-pad on the right up to a whole number of bytes; the caller's
  # array is left untouched.
  padded = bits + Array.new((-bits.length) % 8, 0)

  # Within each group of eight, the first bit lands in bit 7 (MSB first).
  octets = padded.each_slice(8).map do |group|
    group.each_with_index.inject(0) { |byte, (bit, index)| byte | (bit << (7 - index)) }
  end

  octets.pack("C*")
end
|
|
336
|
+
end
|
|
337
|
+
end
|
|
338
|
+
end
|
|
339
|
+
end
|