omnizip 0.3.2 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +243 -368
- data/README.adoc +101 -5
- data/docs/guides/archive-formats/index.adoc +31 -1
- data/docs/guides/archive-formats/ole-format.adoc +316 -0
- data/docs/guides/archive-formats/rpm-format.adoc +249 -0
- data/docs/index.adoc +12 -2
- data/lib/omnizip/algorithms/lzma/distance_coder.rb +29 -18
- data/lib/omnizip/algorithms/lzma/encoder.rb +2 -1
- data/lib/omnizip/algorithms/lzma/length_coder.rb +6 -3
- data/lib/omnizip/algorithms/lzma/literal_decoder.rb +2 -1
- data/lib/omnizip/algorithms/lzma/lzip_decoder.rb +40 -13
- data/lib/omnizip/algorithms/lzma/range_decoder.rb +36 -2
- data/lib/omnizip/algorithms/lzma/range_encoder.rb +19 -0
- data/lib/omnizip/algorithms/lzma/xz_encoder_fast.rb +2 -1
- data/lib/omnizip/algorithms/lzma/xz_utils_decoder.rb +148 -112
- data/lib/omnizip/algorithms/lzma.rb +20 -5
- data/lib/omnizip/algorithms/ppmd7/decoder.rb +25 -21
- data/lib/omnizip/algorithms/ppmd7/encoder.rb +4 -11
- data/lib/omnizip/algorithms/sevenzip_lzma2.rb +2 -1
- data/lib/omnizip/algorithms/xz_lzma2.rb +2 -1
- data/lib/omnizip/algorithms/zstandard/constants.rb +125 -9
- data/lib/omnizip/algorithms/zstandard/decoder.rb +202 -17
- data/lib/omnizip/algorithms/zstandard/encoder.rb +197 -17
- data/lib/omnizip/algorithms/zstandard/frame/block.rb +128 -0
- data/lib/omnizip/algorithms/zstandard/frame/header.rb +224 -0
- data/lib/omnizip/algorithms/zstandard/fse/bitstream.rb +186 -0
- data/lib/omnizip/algorithms/zstandard/fse/encoder.rb +325 -0
- data/lib/omnizip/algorithms/zstandard/fse/table.rb +269 -0
- data/lib/omnizip/algorithms/zstandard/huffman.rb +272 -0
- data/lib/omnizip/algorithms/zstandard/huffman_encoder.rb +339 -0
- data/lib/omnizip/algorithms/zstandard/literals.rb +178 -0
- data/lib/omnizip/algorithms/zstandard/literals_encoder.rb +251 -0
- data/lib/omnizip/algorithms/zstandard/sequences.rb +346 -0
- data/lib/omnizip/buffer/memory_extractor.rb +3 -3
- data/lib/omnizip/buffer.rb +2 -2
- data/lib/omnizip/filters/delta.rb +2 -1
- data/lib/omnizip/filters/registry.rb +6 -6
- data/lib/omnizip/formats/cpio/bounded_io.rb +66 -0
- data/lib/omnizip/formats/lzip.rb +2 -1
- data/lib/omnizip/formats/lzma_alone.rb +2 -1
- data/lib/omnizip/formats/ole/allocation_table.rb +244 -0
- data/lib/omnizip/formats/ole/constants.rb +61 -0
- data/lib/omnizip/formats/ole/dirent.rb +380 -0
- data/lib/omnizip/formats/ole/header.rb +198 -0
- data/lib/omnizip/formats/ole/ranges_io.rb +264 -0
- data/lib/omnizip/formats/ole/storage.rb +305 -0
- data/lib/omnizip/formats/ole/types/variant.rb +328 -0
- data/lib/omnizip/formats/ole.rb +145 -0
- data/lib/omnizip/formats/rar/compression/ppmd/decoder.rb +92 -49
- data/lib/omnizip/formats/rar/compression/ppmd/encoder.rb +13 -20
- data/lib/omnizip/formats/rar/rar5/compression/lzss.rb +6 -2
- data/lib/omnizip/formats/rar3/reader.rb +6 -2
- data/lib/omnizip/formats/rar5/reader.rb +4 -1
- data/lib/omnizip/formats/rpm/constants.rb +58 -0
- data/lib/omnizip/formats/rpm/entry.rb +102 -0
- data/lib/omnizip/formats/rpm/header.rb +113 -0
- data/lib/omnizip/formats/rpm/lead.rb +122 -0
- data/lib/omnizip/formats/rpm/tag.rb +230 -0
- data/lib/omnizip/formats/rpm.rb +434 -0
- data/lib/omnizip/formats/seven_zip/bcj2_stream_decompressor.rb +239 -0
- data/lib/omnizip/formats/seven_zip/coder_chain.rb +32 -8
- data/lib/omnizip/formats/seven_zip/constants.rb +1 -1
- data/lib/omnizip/formats/seven_zip/reader.rb +84 -8
- data/lib/omnizip/formats/seven_zip/stream_compressor.rb +2 -1
- data/lib/omnizip/formats/seven_zip/stream_decompressor.rb +6 -0
- data/lib/omnizip/formats/seven_zip/writer.rb +21 -9
- data/lib/omnizip/formats/seven_zip.rb +10 -0
- data/lib/omnizip/formats/xar/entry.rb +18 -5
- data/lib/omnizip/formats/xar/header.rb +34 -6
- data/lib/omnizip/formats/xar/reader.rb +43 -10
- data/lib/omnizip/formats/xar/toc.rb +34 -21
- data/lib/omnizip/formats/xar/writer.rb +15 -5
- data/lib/omnizip/formats/xz_impl/block_decoder.rb +45 -33
- data/lib/omnizip/formats/xz_impl/block_encoder.rb +2 -1
- data/lib/omnizip/formats/xz_impl/index_decoder.rb +3 -1
- data/lib/omnizip/formats/xz_impl/stream_header_parser.rb +2 -1
- data/lib/omnizip/formats/zip/end_of_central_directory.rb +4 -3
- data/lib/omnizip/implementations/seven_zip/lzma/decoder.rb +14 -6
- data/lib/omnizip/implementations/seven_zip/lzma/encoder.rb +2 -1
- data/lib/omnizip/implementations/seven_zip/lzma2/encoder.rb +28 -13
- data/lib/omnizip/implementations/xz_utils/lzma2/encoder.rb +13 -6
- data/lib/omnizip/pipe/stream_compressor.rb +1 -1
- data/lib/omnizip/version.rb +1 -1
- data/readme-docs/compression-algorithms.adoc +6 -2
- metadata +30 -2
|
@@ -0,0 +1,325 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Copyright (C) 2025 Ribose Inc.
|
|
4
|
+
#
|
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining a
|
|
6
|
+
# copy of this software and associated documentation files (the "Software"),
|
|
7
|
+
# to deal in the Software without restriction, including without limitation
|
|
8
|
+
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
9
|
+
# and/or sell copies of the Software, and to permit persons to whom the
|
|
10
|
+
# Software is furnished to do so, subject to the following conditions:
|
|
11
|
+
#
|
|
12
|
+
# The above copyright notice and this permission notice shall be included in
|
|
13
|
+
# all copies or substantial portions of the Software.
|
|
14
|
+
#
|
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
20
|
+
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
21
|
+
# DEALINGS IN THE SOFTWARE.
|
|
22
|
+
|
|
23
|
+
require_relative "../constants"
|
|
24
|
+
require_relative "bitstream"
|
|
25
|
+
|
|
26
|
+
module Omnizip
|
|
27
|
+
module Algorithms
|
|
28
|
+
class Zstandard
|
|
29
|
+
module FSE
|
|
30
|
+
# FSE Encoder (RFC 8878 Section 4.1)
|
|
31
|
+
#
|
|
32
|
+
# Encodes symbols using Finite State Entropy coding.
|
|
33
|
+
# FSE is a variant of arithmetic coding that uses table-based state transitions.
|
|
34
|
+
class Encoder
|
|
35
|
+
include Constants
|
|
36
|
+
|
|
37
|
+
# @return [Array<Integer>] Symbol distribution (normalized frequencies)
|
|
38
|
+
attr_reader :distribution
|
|
39
|
+
|
|
40
|
+
# @return [Integer] Accuracy log (table size = 2^accuracy_log)
|
|
41
|
+
attr_reader :accuracy_log
|
|
42
|
+
|
|
43
|
+
# @return [Integer] Table size
|
|
44
|
+
attr_reader :table_size
|
|
45
|
+
|
|
46
|
+
# Build FSE encoder from symbol frequencies
|
|
47
|
+
#
|
|
48
|
+
# @param frequencies [Array<Integer>] Raw symbol frequencies
|
|
49
|
+
# @param max_accuracy_log [Integer] Maximum accuracy log (default 9)
|
|
50
|
+
# @return [Encoder] FSE encoder
|
|
51
|
+
def self.build_from_frequencies(frequencies,
                                max_accuracy_log = FSE_MAX_ACCURACY_LOG)
  # Without at least one frequency slot there is nothing to build from.
  return if frequencies.nil? || frequencies.empty?

  # normalize_distribution returns [distribution, accuracy_log], which is
  # exactly the constructor's argument list.
  new(*normalize_distribution(frequencies, max_accuracy_log))
end
|
|
61
|
+
|
|
62
|
+
# Normalize frequency distribution
|
|
63
|
+
#
|
|
64
|
+
# Converts raw frequencies to normalized distribution that sums to 2^accuracy_log.
|
|
65
|
+
#
|
|
66
|
+
# @param frequencies [Array<Integer>] Raw frequencies
|
|
67
|
+
# @param max_accuracy_log [Integer] Maximum accuracy log
|
|
68
|
+
# @return [Array<Array<Integer>, Integer>] Normalized distribution and accuracy log
|
|
69
|
+
def self.normalize_distribution(frequencies, max_accuracy_log)
  # Missing (nil) slots mean "symbol never seen". Replace them with 0 up
  # front so that summing, counting and scaling all see plain integers;
  # slot positions (symbol values) are preserved.
  counts = frequencies.map { |freq| freq.nil? ? 0 : freq }
  return [[], 0] if counts.sum.zero?

  # Pick the smallest accuracy log able to index every used symbol, raised
  # to the format minimum and then capped by the caller's maximum.
  num_symbols = counts.count(&:positive?)
  accuracy_log = [calculate_min_accuracy_log(num_symbols),
                  FSE_MIN_ACCURACY_LOG].max
  accuracy_log = [accuracy_log, max_accuracy_log].min

  table_size = 1 << accuracy_log

  # Scale the raw counts, then nudge the result so it sums to table_size.
  distribution = normalize_frequencies(counts, table_size)
  shortfall = table_size - distribution.sum
  adjust_distribution(distribution, shortfall) unless shortfall.zero?

  [distribution, accuracy_log]
end
|
|
94
|
+
|
|
95
|
+
# Calculate minimum accuracy log for given number of symbols
|
|
96
|
+
def self.calculate_min_accuracy_log(num_symbols)
  # Zero or one symbol needs no bits at all.
  return 0 unless num_symbols > 1

  # Number of bits needed to write the largest index, num_symbols - 1.
  (num_symbols - 1).bit_length
end
|
|
107
|
+
|
|
108
|
+
# Normalize frequencies to fit table size
|
|
109
|
+
def self.normalize_frequencies(frequencies, table_size)
  # Only strictly positive counts take part in the scaling. nil and
  # non-positive slots are mapped to 0 below, so they must not contribute
  # to (or crash) the total either.
  total = frequencies.sum { |freq| (freq.nil? || freq <= 0) ? 0 : freq }
  return Array.new(frequencies.length, 0) if total.zero?

  half = total / 2 # rounding term: makes the integer division round to nearest

  frequencies.map do |freq|
    next 0 if freq.nil? || freq <= 0

    scaled = ((freq * table_size) + half) / total
    [scaled, 1].max # a symbol that occurs must keep at least one cell
  end
end
|
|
121
|
+
|
|
122
|
+
# Adjust distribution to sum to exactly table_size
|
|
123
|
+
def self.adjust_distribution(distribution, delta)
  return if delta.zero?

  if delta.negative?
    # Surplus: repeatedly take one unit from the smallest entry that is
    # still above 1 (first such entry on ties); skip when none qualifies.
    delta.abs.times do
      shrinkable = distribution.each_index.select { |i| distribution[i] > 1 }
      target = shrinkable.min_by { |i| distribution[i] }
      distribution[target] -= 1 if target
    end
  else
    # Deficit: repeatedly give one unit to the largest entry
    # (first such entry on ties).
    delta.times do
      peak = distribution.index(distribution.max)
      distribution[peak] += 1 if peak
    end
  end
end
|
|
142
|
+
|
|
143
|
+
# Initialize FSE encoder
|
|
144
|
+
#
|
|
145
|
+
# @param distribution [Array<Integer>] Normalized symbol distribution
|
|
146
|
+
# @param accuracy_log [Integer] Accuracy log
|
|
147
|
+
def initialize(distribution, accuracy_log)
  @accuracy_log = accuracy_log
  @table_size = 1 << accuracy_log
  @distribution = distribution

  # The lookup tables are derived from the three fields above, so build
  # them last.
  build_encoding_tables
end
|
|
155
|
+
|
|
156
|
+
# Encode symbols to bitstream
|
|
157
|
+
#
|
|
158
|
+
# @param symbols [Array<Integer>] Symbols to encode
|
|
159
|
+
# @return [String] Encoded bitstream
|
|
160
|
+
def encode(symbols)
  return "" if symbols.nil? || symbols.empty?

  bits = []
  current = @table_size - 1 # starting state

  # Symbols are consumed last-to-first.
  symbols.reverse_each do |sym|
    info = @symbol_to_state[sym]
    next unless info # symbols without a table entry are skipped

    current = find_state_for_symbol(sym, current)

    width = info[:num_bits]
    next unless width.positive?

    # Emit the low `width` bits of the state, then drop them from it.
    write_bits(bits, current & ((1 << width) - 1), width)
    current >>= width
  end

  # The final state is appended using accuracy_log bits.
  write_bits(bits, current, @accuracy_log)

  # The collected bit list is reversed before being packed into bytes.
  bits_to_bytes(bits.reverse)
end
|
|
193
|
+
|
|
194
|
+
# Get number of symbols in distribution
|
|
195
|
+
#
|
|
196
|
+
# @return [Integer]
|
|
197
|
+
def symbol_count
  # One slot per symbol value, including slots whose probability is zero.
  @distribution.size
end
|
|
200
|
+
|
|
201
|
+
private
|
|
202
|
+
|
|
203
|
+
# Build encoding tables from distribution
|
|
204
|
+
def build_encoding_tables
  # Fills @state_to_symbol (one hash per table cell) and @symbol_to_state
  # (one hash per used symbol) from @distribution, then assigns baselines.
  #
  # Raises ArgumentError when the distribution needs more cells than the
  # table has, and RuntimeError when probing cannot find a free cell.
  # Without these guards the probing loop below would spin forever.
  @symbol_to_state = {}
  @state_to_symbol = Array.new(@table_size)

  # Only strictly positive probabilities receive cells, so only they count
  # towards the capacity check.
  required = @distribution.sum { |prob| (prob.nil? || prob <= 0) ? 0 : prob }
  if required > @table_size
    raise ArgumentError,
          "Distribution sum (#{required}) exceeds table size (#{@table_size})"
  end

  # Walk the table with a fixed stride so each symbol's cells are spread out.
  position = 0
  step = (@table_size >> 1) + (@table_size >> 3) + 3
  mask = @table_size - 1

  @distribution.each_with_index do |prob, symbol|
    next if prob.nil? || prob <= 0

    # accuracy_log - floor(log2(prob)); bit_length - 1 is floor(log2) for
    # any prob >= 1.
    num_bits = [@accuracy_log - (prob.bit_length - 1), 0].max

    prob.times do
      # Probe for a free cell. For some small table sizes (e.g. 2 and 8) the
      # stride is a multiple of the size and never leaves the current cell,
      # so the number of probes is bounded.
      attempts = 0
      while @state_to_symbol[position]
        position = (position + step) & mask
        attempts += 1
        if attempts > @table_size
          raise "FSE table allocation failed: no empty cell found"
        end
      end

      @state_to_symbol[position] = {
        symbol: symbol,
        num_bits: num_bits,
        baseline: 0, # filled in by calculate_baselines
      }

      position = (position + step) & mask
    end

    @symbol_to_state[symbol] = {
      num_bits: num_bits,
      baseline: 0,
    }
  end

  calculate_baselines
end
|
|
244
|
+
|
|
245
|
+
# Calculate baseline values for each state
|
|
246
|
+
def calculate_baselines
  # Collect each symbol's cells. The table is walked in ascending state
  # order, so every per-symbol list is already sorted by state.
  per_symbol = Hash.new { |groups, sym| groups[sym] = [] }
  @state_to_symbol.each do |cell|
    per_symbol[cell[:symbol]] << cell if cell
  end

  # A cell's baseline is its rank among the cells of the same symbol.
  per_symbol.each_value do |cells|
    cells.each_with_index { |cell, rank| cell[:baseline] = rank }
  end
end
|
|
265
|
+
|
|
266
|
+
# Find state for encoding a symbol
|
|
267
|
+
def find_state_for_symbol(symbol, current_state)
  info = @symbol_to_state[symbol]
  return 0 unless info # unknown symbol

  width = info[:num_bits]
  return info[:baseline] unless width.positive?

  # The low `width` bits of the current state go to the top of the new
  # state; the baseline, shifted down by `width`, fills the rest.
  low_bits = current_state & ((1 << width) - 1)
  (low_bits << (@accuracy_log - width)) | (info[:baseline] >> width)
end
|
|
281
|
+
|
|
282
|
+
# Write bits to bitstream array
|
|
283
|
+
def write_bits(bitstream, value, count)
  # Integer#[] reads a single bit, so this appends bits 0...count of
  # `value`, least significant first.
  count.times { |offset| bitstream << value[offset] }
end
|
|
288
|
+
|
|
289
|
+
# Convert bit array to bytes
|
|
290
|
+
def bits_to_bytes(bits)
  # Zero-pad a copy up to the next byte boundary; the caller's array is
  # left untouched.
  padding = (-bits.length) % 8
  padded = bits + Array.new(padding, 0)

  # Within each group of eight, the bit at index i lands at bit position i.
  octets = padded.each_slice(8).map do |group|
    group.each_with_index.reduce(0) { |acc, (bit, shift)| acc | (bit << shift) }
  end

  octets.pack("C*")
end
|
|
308
|
+
|
|
309
|
+
# Integer log2
|
|
310
|
+
def log2_int(value)
  # floor(log2(value)); anything at or below 1 maps to 0.
  value > 1 ? value.bit_length - 1 : 0
end
|
|
321
|
+
end
|
|
322
|
+
end
|
|
323
|
+
end
|
|
324
|
+
end
|
|
325
|
+
end
|
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Copyright (C) 2025 Ribose Inc.
|
|
4
|
+
#
|
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining a
|
|
6
|
+
# copy of this software and associated documentation files (the "Software"),
|
|
7
|
+
# to deal in the Software without restriction, including without limitation
|
|
8
|
+
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
9
|
+
# and/or sell copies of the Software, and to permit persons to whom the
|
|
10
|
+
# Software is furnished to do so, subject to the following conditions:
|
|
11
|
+
#
|
|
12
|
+
# The above copyright notice and this permission notice shall be included in
|
|
13
|
+
# all copies or substantial portions of the Software.
|
|
14
|
+
#
|
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
20
|
+
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
21
|
+
# DEALINGS IN THE SOFTWARE.
|
|
22
|
+
|
|
23
|
+
require_relative "bitstream"
|
|
24
|
+
require_relative "../constants"
|
|
25
|
+
|
|
26
|
+
module Omnizip
|
|
27
|
+
module Algorithms
|
|
28
|
+
class Zstandard
|
|
29
|
+
module FSE
|
|
30
|
+
# FSE state entry for decoding table
|
|
31
|
+
#
|
|
32
|
+
# Each entry contains:
|
|
33
|
+
# - symbol: The symbol this state decodes to
|
|
34
|
+
# - num_bits: Number of bits to read for next state
|
|
35
|
+
# - baseline: Value to add to next state's value
|
|
36
|
+
# Plain Struct: fields are given positionally, in the order
# (symbol, num_bits, baseline), and stay mutable after construction.
State = Struct.new(:symbol, :num_bits, :baseline)
|
|
37
|
+
|
|
38
|
+
# FSE decoding table (RFC 8878 Section 4.1)
|
|
39
|
+
#
|
|
40
|
+
# Builds a decoding table from a probability distribution
|
|
41
|
+
# according to RFC 8878.
|
|
42
|
+
class Table
|
|
43
|
+
include Constants
|
|
44
|
+
|
|
45
|
+
# @return [Array<State>] Decoding table entries
|
|
46
|
+
attr_reader :states
|
|
47
|
+
|
|
48
|
+
# @return [Integer] Accuracy log (table size = 2^accuracy_log)
|
|
49
|
+
attr_reader :accuracy_log
|
|
50
|
+
|
|
51
|
+
# @return [Integer] Number of symbols in the table
|
|
52
|
+
attr_reader :symbol_count
|
|
53
|
+
|
|
54
|
+
# Build FSE table from normalized distribution
|
|
55
|
+
#
|
|
56
|
+
# @param distribution [Array<Integer>] Normalized symbol frequencies
|
|
57
|
+
# @param accuracy_log [Integer] Log2 of table size
|
|
58
|
+
# @return [Table] Built FSE table
|
|
59
|
+
def self.build(distribution, accuracy_log)
  size = 1 << accuracy_log

  # First decide which symbol owns each cell, then derive the per-cell
  # transition data from that layout.
  layout = allocate_cells(distribution, size)

  new(calculate_state_values(layout, distribution, size),
      accuracy_log,
      distribution.length)
end
|
|
70
|
+
|
|
71
|
+
# Build from predefined distribution
|
|
72
|
+
#
|
|
73
|
+
# @param distribution [Array<Integer>] Predefined distribution
|
|
74
|
+
# @param accuracy_log [Integer] Accuracy log
|
|
75
|
+
# @return [Table]
|
|
76
|
+
def self.build_predefined(distribution, accuracy_log)
  # Predefined distributions take the same path as any other: this simply
  # forwards to build.
  build(distribution, accuracy_log)
end
|
|
79
|
+
|
|
80
|
+
# Initialize with pre-built table
|
|
81
|
+
#
|
|
82
|
+
# @param states [Array<State>] Decoding states
|
|
83
|
+
# @param accuracy_log [Integer]
|
|
84
|
+
# @param symbol_count [Integer]
|
|
85
|
+
def initialize(states, accuracy_log, symbol_count)
  # Pure value holder: the three arguments are stored as given.
  @states, @accuracy_log, @symbol_count = states, accuracy_log, symbol_count
end
|
|
90
|
+
|
|
91
|
+
# Get state at index
|
|
92
|
+
#
|
|
93
|
+
# @param index [Integer] State index
|
|
94
|
+
# @return [State] State at index
|
|
95
|
+
def [](index)
  # Plain array lookup, so an index outside the table yields nil.
  @states[index]
end
|
|
98
|
+
|
|
99
|
+
# Get table size
|
|
100
|
+
#
|
|
101
|
+
# @return [Integer] Number of entries in table
|
|
102
|
+
def size
  # Number of cells in the table (one entry per state).
  @states.size
end
|
|
105
|
+
|
|
106
|
+
# Allocate cells using FSE spread pattern
|
|
107
|
+
#
|
|
108
|
+
# The spread pattern distributes symbols across the table
|
|
109
|
+
# using a step that ensures good distribution.
|
|
110
|
+
def self.allocate_cells(distribution, table_size)
  # Returns an array of table_size cells, each holding the symbol that owns
  # it (nil when unassigned).
  #
  # Raises ArgumentError when the distribution needs more cells than the
  # table has, and RuntimeError when probing cannot find a free cell.
  cells = Array.new(table_size, nil)

  # A positive probability needs that many cells. A "less than 1"
  # probability (-1, RFC 8878 section 4.1.1) needs exactly one, so it must
  # count as +1 here rather than -1.
  total = distribution.sum do |prob|
    next 0 if prob.nil?

    prob == -1 ? 1 : [prob, 0].max
  end
  if total > table_size
    raise ArgumentError,
          "Distribution sum (#{total}) exceeds table size (#{table_size})"
  end

  # "Less than 1" symbols get one cell each, handed out from the end of the
  # table backwards, in symbol order.
  high = table_size
  distribution.each_with_index do |prob, symbol|
    next unless prob == -1

    high -= 1
    cells[high] = symbol
  end

  # Step = (table_size >> 1) + (table_size >> 3) + 3
  step = (table_size >> 1) + (table_size >> 3) + 3
  mask = table_size - 1

  position = 0

  distribution.each_with_index do |prob, symbol|
    next if prob.nil? || prob <= 0

    prob.times do
      # Skip occupied cells, including those reserved above. Probing is
      # bounded because the stride cannot reach every cell for every
      # table size.
      attempts = 0
      while cells[position]
        position = (position + step) & mask
        attempts += 1
        if attempts > table_size
          raise "FSE table allocation failed: no empty cell found"
        end
      end

      cells[position] = symbol
      position = (position + step) & mask
    end
  end

  cells
end
|
|
147
|
+
|
|
148
|
+
# Calculate num_bits and baseline for each state
|
|
149
|
+
def self.calculate_state_values(cells, distribution, table_size)
  # Builds one State per assigned cell, following RFC 8878 section 4.1.1.
  # Each symbol keeps a counter that starts at its probability and goes up
  # by one for every cell it owns, visited in ascending state order.
  # For a counter value c:
  #   num_bits = accuracy_log - floor(log2(c))
  #   baseline = (c << num_bits) - table_size
  # With this rule the next-state ranges of a symbol's cells tile the whole
  # table exactly once.
  states = Array.new(table_size)
  accuracy_log = table_size > 1 ? table_size.bit_length - 1 : 0

  counters = {}
  cells.each_with_index do |symbol, pos|
    next if symbol.nil?

    counter = counters[symbol]
    if counter.nil?
      prob = distribution[symbol]
      # A symbol holding a cell without a positive probability is a
      # "less than 1" symbol: it behaves like probability 1, i.e. it reads
      # accuracy_log bits with baseline 0.
      counter = prob.nil? || prob < 1 ? 1 : prob
    end
    counters[symbol] = counter + 1

    # bit_length - 1 is floor(log2) for counter >= 1. The clamp only matters
    # for inconsistent input where a symbol owns more cells than its
    # probability says.
    num_bits = [accuracy_log - (counter.bit_length - 1), 0].max
    baseline = (counter << num_bits) - table_size

    states[pos] = State.new(symbol, num_bits, baseline)
  end

  states
end
|
|
179
|
+
|
|
180
|
+
# Calculate number of bits needed for a symbol with given probability
|
|
181
|
+
def self.calculate_num_bits(prob, table_size)
  return 0 unless prob.positive?

  # floor(log2) of each operand; a table size at or below 1 counts as 0.
  log_prob = prob.bit_length - 1
  log_table = table_size > 1 ? table_size.bit_length - 1 : 0

  # Never negative, even when prob exceeds table_size.
  [log_table - log_prob, 0].max
end
|
|
202
|
+
end
|
|
203
|
+
|
|
204
|
+
# FSE Decoder (RFC 8878 Section 4.1)
|
|
205
|
+
#
|
|
206
|
+
# Decodes symbols from FSE-encoded bitstreams.
|
|
207
|
+
class Decoder
|
|
208
|
+
# @return [Table] FSE decoding table
|
|
209
|
+
attr_reader :table
|
|
210
|
+
|
|
211
|
+
# @return [Integer] Current state
|
|
212
|
+
attr_reader :state
|
|
213
|
+
|
|
214
|
+
# Initialize decoder with FSE table
|
|
215
|
+
#
|
|
216
|
+
# @param table [Table] FSE decoding table
|
|
217
|
+
def initialize(table)
  # The state starts at 0; init_state replaces it with a value read from
  # the bitstream.
  @table, @state = table, 0
end
|
|
221
|
+
|
|
222
|
+
# Initialize state from bitstream
|
|
223
|
+
#
|
|
224
|
+
# @param bitstream [BitStream] The bitstream to read from
|
|
225
|
+
def init_state(bitstream)
  # The starting state is read using accuracy_log bits.
  width = @table.accuracy_log
  @state = bitstream.read_bits(width)
end
|
|
228
|
+
|
|
229
|
+
# Decode next symbol from bitstream
|
|
230
|
+
#
|
|
231
|
+
# @param bitstream [BitStream] The bitstream to read from
|
|
232
|
+
# @return [Integer] Decoded symbol
|
|
233
|
+
def decode(bitstream)
  current = @table[@state]
  return 0 if current.nil? # no entry for this state: report symbol 0, keep the state

  # Next state = baseline plus however many extra bits this entry asks for.
  following = current.baseline
  following += bitstream.read_bits(current.num_bits) if current.num_bits.positive?

  # Keep the state inside the table.
  @state = following & (@table.size - 1)

  current.symbol
end
|
|
252
|
+
|
|
253
|
+
# Decode multiple symbols
|
|
254
|
+
#
|
|
255
|
+
# @param bitstream [BitStream] The bitstream to read from
|
|
256
|
+
# @param count [Integer] Number of symbols to decode
|
|
257
|
+
# @return [Array<Integer>] Decoded symbols
|
|
258
|
+
def decode_symbols(bitstream, count)
  # One decode call per requested symbol, collected in decoding order.
  count.times.map { decode(bitstream) }
end
|
|
265
|
+
end
|
|
266
|
+
end
|
|
267
|
+
end
|
|
268
|
+
end
|
|
269
|
+
end
|