omnizip 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +243 -368
  3. data/README.adoc +101 -5
  4. data/docs/guides/archive-formats/index.adoc +31 -1
  5. data/docs/guides/archive-formats/ole-format.adoc +316 -0
  6. data/docs/guides/archive-formats/rpm-format.adoc +249 -0
  7. data/docs/index.adoc +12 -2
  8. data/lib/omnizip/algorithms/lzma/distance_coder.rb +29 -18
  9. data/lib/omnizip/algorithms/lzma/encoder.rb +2 -1
  10. data/lib/omnizip/algorithms/lzma/length_coder.rb +6 -3
  11. data/lib/omnizip/algorithms/lzma/literal_decoder.rb +2 -1
  12. data/lib/omnizip/algorithms/lzma/lzip_decoder.rb +40 -13
  13. data/lib/omnizip/algorithms/lzma/range_decoder.rb +36 -2
  14. data/lib/omnizip/algorithms/lzma/range_encoder.rb +19 -0
  15. data/lib/omnizip/algorithms/lzma/xz_encoder_fast.rb +2 -1
  16. data/lib/omnizip/algorithms/lzma/xz_utils_decoder.rb +148 -112
  17. data/lib/omnizip/algorithms/lzma.rb +20 -5
  18. data/lib/omnizip/algorithms/ppmd7/decoder.rb +25 -21
  19. data/lib/omnizip/algorithms/ppmd7/encoder.rb +4 -11
  20. data/lib/omnizip/algorithms/sevenzip_lzma2.rb +2 -1
  21. data/lib/omnizip/algorithms/xz_lzma2.rb +2 -1
  22. data/lib/omnizip/algorithms/zstandard/constants.rb +125 -9
  23. data/lib/omnizip/algorithms/zstandard/decoder.rb +202 -17
  24. data/lib/omnizip/algorithms/zstandard/encoder.rb +197 -17
  25. data/lib/omnizip/algorithms/zstandard/frame/block.rb +128 -0
  26. data/lib/omnizip/algorithms/zstandard/frame/header.rb +224 -0
  27. data/lib/omnizip/algorithms/zstandard/fse/bitstream.rb +186 -0
  28. data/lib/omnizip/algorithms/zstandard/fse/encoder.rb +325 -0
  29. data/lib/omnizip/algorithms/zstandard/fse/table.rb +269 -0
  30. data/lib/omnizip/algorithms/zstandard/huffman.rb +272 -0
  31. data/lib/omnizip/algorithms/zstandard/huffman_encoder.rb +339 -0
  32. data/lib/omnizip/algorithms/zstandard/literals.rb +178 -0
  33. data/lib/omnizip/algorithms/zstandard/literals_encoder.rb +251 -0
  34. data/lib/omnizip/algorithms/zstandard/sequences.rb +346 -0
  35. data/lib/omnizip/buffer/memory_extractor.rb +3 -3
  36. data/lib/omnizip/buffer.rb +2 -2
  37. data/lib/omnizip/filters/delta.rb +2 -1
  38. data/lib/omnizip/filters/registry.rb +6 -6
  39. data/lib/omnizip/formats/cpio/bounded_io.rb +66 -0
  40. data/lib/omnizip/formats/lzip.rb +2 -1
  41. data/lib/omnizip/formats/lzma_alone.rb +2 -1
  42. data/lib/omnizip/formats/ole/allocation_table.rb +244 -0
  43. data/lib/omnizip/formats/ole/constants.rb +61 -0
  44. data/lib/omnizip/formats/ole/dirent.rb +380 -0
  45. data/lib/omnizip/formats/ole/header.rb +198 -0
  46. data/lib/omnizip/formats/ole/ranges_io.rb +264 -0
  47. data/lib/omnizip/formats/ole/storage.rb +305 -0
  48. data/lib/omnizip/formats/ole/types/variant.rb +328 -0
  49. data/lib/omnizip/formats/ole.rb +145 -0
  50. data/lib/omnizip/formats/rar/compression/ppmd/decoder.rb +92 -49
  51. data/lib/omnizip/formats/rar/compression/ppmd/encoder.rb +13 -20
  52. data/lib/omnizip/formats/rar/rar5/compression/lzss.rb +6 -2
  53. data/lib/omnizip/formats/rar3/reader.rb +6 -2
  54. data/lib/omnizip/formats/rar5/reader.rb +4 -1
  55. data/lib/omnizip/formats/rpm/constants.rb +58 -0
  56. data/lib/omnizip/formats/rpm/entry.rb +102 -0
  57. data/lib/omnizip/formats/rpm/header.rb +113 -0
  58. data/lib/omnizip/formats/rpm/lead.rb +122 -0
  59. data/lib/omnizip/formats/rpm/tag.rb +230 -0
  60. data/lib/omnizip/formats/rpm.rb +434 -0
  61. data/lib/omnizip/formats/seven_zip/bcj2_stream_decompressor.rb +239 -0
  62. data/lib/omnizip/formats/seven_zip/coder_chain.rb +32 -8
  63. data/lib/omnizip/formats/seven_zip/constants.rb +1 -1
  64. data/lib/omnizip/formats/seven_zip/reader.rb +84 -8
  65. data/lib/omnizip/formats/seven_zip/stream_compressor.rb +2 -1
  66. data/lib/omnizip/formats/seven_zip/stream_decompressor.rb +6 -0
  67. data/lib/omnizip/formats/seven_zip/writer.rb +21 -9
  68. data/lib/omnizip/formats/seven_zip.rb +10 -0
  69. data/lib/omnizip/formats/xar/entry.rb +18 -5
  70. data/lib/omnizip/formats/xar/header.rb +34 -6
  71. data/lib/omnizip/formats/xar/reader.rb +43 -10
  72. data/lib/omnizip/formats/xar/toc.rb +34 -21
  73. data/lib/omnizip/formats/xar/writer.rb +15 -5
  74. data/lib/omnizip/formats/xz_impl/block_decoder.rb +45 -33
  75. data/lib/omnizip/formats/xz_impl/block_encoder.rb +2 -1
  76. data/lib/omnizip/formats/xz_impl/index_decoder.rb +3 -1
  77. data/lib/omnizip/formats/xz_impl/stream_header_parser.rb +2 -1
  78. data/lib/omnizip/formats/zip/end_of_central_directory.rb +4 -3
  79. data/lib/omnizip/implementations/seven_zip/lzma/decoder.rb +14 -6
  80. data/lib/omnizip/implementations/seven_zip/lzma/encoder.rb +2 -1
  81. data/lib/omnizip/implementations/seven_zip/lzma2/encoder.rb +28 -13
  82. data/lib/omnizip/implementations/xz_utils/lzma2/encoder.rb +13 -6
  83. data/lib/omnizip/pipe/stream_compressor.rb +1 -1
  84. data/lib/omnizip/version.rb +1 -1
  85. data/readme-docs/compression-algorithms.adoc +6 -2
  86. metadata +30 -2
@@ -0,0 +1,325 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Copyright (C) 2025 Ribose Inc.
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a
6
+ # copy of this software and associated documentation files (the "Software"),
7
+ # to deal in the Software without restriction, including without limitation
8
+ # the rights to use, copy, modify, merge, publish, distribute, sublicense,
9
+ # and/or sell copies of the Software, and to permit persons to whom the
10
+ # Software is furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in
13
+ # all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
+ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21
+ # DEALINGS IN THE SOFTWARE.
22
+
23
+ require_relative "../constants"
24
+ require_relative "bitstream"
25
+
26
+ module Omnizip
27
+ module Algorithms
28
+ class Zstandard
29
+ module FSE
30
+ # FSE Encoder (RFC 8878 Section 4.1)
31
+ #
32
+ # Encodes symbols using Finite State Entropy coding.
33
+ # FSE is a variant of arithmetic coding that uses table-based state transitions.
34
+ class Encoder
35
+ include Constants
36
+
37
+ # @return [Array<Integer>] Symbol distribution (normalized frequencies)
38
+ attr_reader :distribution
39
+
40
+ # @return [Integer] Accuracy log (table size = 2^accuracy_log)
41
+ attr_reader :accuracy_log
42
+
43
+ # @return [Integer] Table size
44
+ attr_reader :table_size
45
+
46
# Construct an encoder from raw (un-normalized) symbol counts.
#
# The counts are first normalized via normalize_distribution and the
# resulting distribution / accuracy log pair is handed to the constructor.
#
# @param frequencies [Array<Integer>, nil] Raw count per symbol (index = symbol)
# @param max_accuracy_log [Integer] Upper bound for the accuracy log
#   (defaults to FSE_MAX_ACCURACY_LOG)
# @return [Encoder, nil] The encoder, or nil when frequencies is nil or empty
def self.build_from_frequencies(frequencies,
                                max_accuracy_log = FSE_MAX_ACCURACY_LOG)
  return if frequencies.nil? || frequencies.empty?

  normalized, log = normalize_distribution(frequencies, max_accuracy_log)
  new(normalized, log)
end
61
+
62
# Normalize a raw frequency table so that it sums to 2^accuracy_log.
#
# nil and non-positive entries are treated as "symbol absent". The accuracy
# log is the smallest one able to hold every present symbol, raised to
# FSE_MIN_ACCURACY_LOG and then capped at max_accuracy_log.
#
# @param frequencies [Array<Integer, nil>] Raw frequencies (index = symbol)
# @param max_accuracy_log [Integer] Maximum accuracy log
# @return [Array<Array<Integer>, Integer>] Normalized distribution and
#   accuracy log; [[], 0] when no symbol has a positive frequency
# @raise [ArgumentError] if there are more present symbols than table cells,
#   in which case no distribution summing to the table size exists
def self.normalize_distribution(frequencies, max_accuracy_log)
  # Sanitize once so every later step sees plain non-negative integers
  # (a bare `sum` would raise on nil and be skewed by negative counts).
  counts = frequencies.map { |freq| freq&.positive? ? freq : 0 }
  num_symbols = counts.count(&:positive?)
  return [[], 0] if num_symbols.zero?

  accuracy_log = [calculate_min_accuracy_log(num_symbols),
                  FSE_MIN_ACCURACY_LOG].max
  accuracy_log = [accuracy_log, max_accuracy_log].min
  table_size = 1 << accuracy_log

  # Every present symbol needs at least one cell; if the cap above made the
  # table too small the adjustment below could never reach table_size.
  if num_symbols > table_size
    raise ArgumentError,
          "Cannot fit #{num_symbols} symbols into an FSE table of size #{table_size}"
  end

  distribution = normalize_frequencies(counts, table_size)

  # Rounding (and the minimum of 1 per symbol) can leave the sum off target.
  delta = table_size - distribution.sum
  adjust_distribution(distribution, delta) unless delta.zero?

  [distribution, accuracy_log]
end
94
+
95
# Smallest accuracy log whose table (2^log cells) has room for
# num_symbols distinct symbols, i.e. ceil(log2(num_symbols)).
#
# @param num_symbols [Integer] Number of symbols with non-zero frequency
# @return [Integer] 0 for zero or one symbol, otherwise ceil(log2(num_symbols))
def self.calculate_min_accuracy_log(num_symbols)
  # bit_length of (n - 1) is the number of halvings needed to reach zero.
  num_symbols <= 1 ? 0 : (num_symbols - 1).bit_length
end
107
+
108
# Scale raw frequencies proportionally to a table of table_size cells.
#
# nil and non-positive entries are treated as absent: they map to 0 and do
# not contribute to the total used for scaling. Every present symbol gets at
# least 1, so the result may not sum exactly to table_size.
#
# @param frequencies [Array<Integer, nil>] Raw frequencies (index = symbol)
# @param table_size [Integer] Target table size
# @return [Array<Integer>] Scaled value per symbol, same length as the input
def self.normalize_frequencies(frequencies, table_size)
  # Sum only what is actually scaled below; a plain `sum` raises on nil and
  # lets negative entries distort the proportions.
  total = frequencies.sum { |freq| freq&.positive? ? freq : 0 }
  return Array.new(frequencies.length, 0) if total.zero?

  half = total / 2 # added before dividing to round to nearest
  frequencies.map do |freq|
    next 0 unless freq&.positive?

    scaled = ((freq * table_size) + half) / total
    [scaled, 1].max # a present symbol must keep at least one cell
  end
end
121
+
122
# Nudge a distribution in place so that its sum changes by delta.
#
# A positive delta is added one unit at a time to the currently largest
# entry. A negative delta is removed one unit at a time from the smallest
# entry that is still greater than 1 (so no symbol drops below 1); when no
# such entry exists that step changes nothing.
#
# @param distribution [Array<Integer>] Distribution to modify in place
# @param delta [Integer] Required change of the sum
def self.adjust_distribution(distribution, delta)
  return if delta.zero?

  if delta.negative?
    (-delta).times do
      shrinkable = distribution.each_with_index.select { |value, _| value > 1 }
      _, target = shrinkable.min_by { |value, _| value }
      distribution[target] -= 1 if target
    end
  else
    delta.times do
      _, target = distribution.each_with_index.max_by { |value, _| value }
      distribution[target] += 1 if target
    end
  end
end
142
+
143
# Create an encoder for an already normalized distribution and build its
# encoding tables immediately.
#
# @param distribution [Array<Integer>] Normalized symbol distribution
# @param accuracy_log [Integer] Accuracy log (table size is 2^accuracy_log)
def initialize(distribution, accuracy_log)
  @distribution, @accuracy_log = distribution, accuracy_log
  @table_size = 1 << @accuracy_log

  build_encoding_tables
end
155
+
156
# Encode a symbol sequence into a packed bit string.
#
# Symbols are processed last-to-first, starting from state table_size - 1.
# For each symbol the low num_bits bits of the selected state are emitted
# and shifted out; finally accuracy_log bits of the remaining state are
# emitted. The collected bits are reversed before being packed into bytes.
# Symbols without an entry in the encoding table are skipped.
#
# @param symbols [Array<Integer>, nil] Symbols to encode
# @return [String] Encoded bytes ("" for nil or empty input)
def encode(symbols)
  return "" if symbols.nil? || symbols.empty?

  bits = []
  state = @table_size - 1

  symbols.reverse_each do |symbol|
    entry = @symbol_to_state[symbol]
    next unless entry

    state = find_state_for_symbol(symbol, state)

    width = entry[:num_bits]
    next unless width.positive?

    # Emit the low `width` bits of the state, then drop them.
    write_bits(bits, state & ((1 << width) - 1), width)
    state >>= width
  end

  # Flush the final state, then pack in reverse bit order.
  write_bits(bits, state, @accuracy_log)
  bits_to_bytes(bits.reverse)
end
193
+
194
# Number of entries in the distribution (including entries that are zero).
#
# @return [Integer]
def symbol_count
  @distribution.size
end
200
+
201
+ private
202
+
203
# Build the encoding tables from @distribution.
#
# Fills @state_to_symbol (one hash per table cell with :symbol, :num_bits
# and :baseline) by spreading each symbol's cells over the table with the
# FSE step, records per-symbol :num_bits in @symbol_to_state, and finally
# lets calculate_baselines assign the per-cell baselines.
#
# NOTE(review): @symbol_to_state[:baseline] is initialised to 0 here and is
# not updated in this method — confirm that is what the encoder intends.
#
# @raise [ArgumentError] if the positive probabilities need more cells than
#   the table has
# @raise [RuntimeError] if no free cell can be found while spreading
def build_encoding_tables
  @symbol_to_state = {}
  @state_to_symbol = Array.new(@table_size)

  # Refuse oversubscribed distributions up front: once the table is full
  # the probing loop below could never find a free cell.
  cells_needed = @distribution.sum { |prob| prob&.positive? ? prob : 0 }
  if cells_needed > @table_size
    raise ArgumentError,
          "Distribution sum (#{cells_needed}) exceeds table size (#{@table_size})"
  end

  step = (@table_size >> 1) + (@table_size >> 3) + 3
  mask = @table_size - 1
  position = 0

  @distribution.each_with_index do |prob, symbol|
    next if prob.nil? || prob <= 0

    num_bits = [@accuracy_log - log2_int(prob), 0].max

    prob.times do
      # Bounded probing: for tiny tables the step can be 0 modulo the table
      # size, so the position would never move off an occupied cell.
      probes = 0
      while @state_to_symbol[position]
        position = (position + step) & mask
        probes += 1
        if probes > @table_size
          raise "FSE table allocation failed: no empty cell found"
        end
      end

      @state_to_symbol[position] = {
        symbol: symbol,
        num_bits: num_bits,
        baseline: 0, # assigned by calculate_baselines
      }

      position = (position + step) & mask
    end

    @symbol_to_state[symbol] = {
      num_bits: num_bits,
      baseline: 0,
    }
  end

  calculate_baselines
end
244
+
245
# Assign each occupied cell its baseline: the rank of that cell among all
# cells holding the same symbol, counted in ascending state order.
# Empty (nil) cells are left untouched.
def calculate_baselines
  seen = Hash.new(0)

  # Walking the table front to back visits each symbol's cells in ascending
  # state order, so a running per-symbol counter yields the rank directly.
  @state_to_symbol.each do |entry|
    next unless entry

    symbol = entry[:symbol]
    entry[:baseline] = seen[symbol]
    seen[symbol] += 1
  end
end
265
+
266
# Pick the state used to encode symbol, given the current state.
#
# With num_bits > 0 the low num_bits bits of current_state are moved to the
# top of an accuracy_log-wide value and OR-ed with the symbol's baseline
# shifted right by num_bits; with num_bits == 0 the baseline is returned.
#
# @param symbol [Integer] Symbol being encoded
# @param current_state [Integer] State before encoding the symbol
# @return [Integer] Selected state, or 0 when the symbol has no table entry
def find_state_for_symbol(symbol, current_state)
  entry = @symbol_to_state[symbol]
  return 0 unless entry

  width = entry[:num_bits]
  return entry[:baseline] unless width.positive?

  low_bits = current_state & ((1 << width) - 1)
  (low_bits << (@accuracy_log - width)) | (entry[:baseline] >> width)
end
281
+
282
# Append the low count bits of value to the bit array, least significant
# bit first.
#
# @param bitstream [Array<Integer>] Bit array to append to (modified in place)
# @param value [Integer] Value whose bits are written
# @param count [Integer] Number of bits to write
def write_bits(bitstream, value, count)
  count.times { |shift| bitstream.push((value >> shift) & 1) }
end
288
+
289
# Pack an array of bits into a byte string.
#
# The array is zero-padded at the end to a multiple of eight; within each
# group of eight the first bit becomes the least significant bit of the byte.
# The input array itself is not modified.
#
# @param bits [Array<Integer>] Bits (0 or 1)
# @return [String] Packed bytes
def bits_to_bytes(bits)
  padding = (8 - (bits.length % 8)) % 8
  padded = bits + Array.new(padding, 0)

  octets = padded.each_slice(8).map do |group|
    group.each_with_index.reduce(0) { |acc, (bit, shift)| acc | (bit << shift) }
  end

  octets.pack("C*")
end
308
+
309
# Integer base-2 logarithm, rounded down.
#
# @param value [Integer]
# @return [Integer] floor(log2(value)); 0 for any value <= 1
def log2_int(value)
  value <= 1 ? 0 : value.bit_length - 1
end
321
+ end
322
+ end
323
+ end
324
+ end
325
+ end
@@ -0,0 +1,269 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Copyright (C) 2025 Ribose Inc.
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a
6
+ # copy of this software and associated documentation files (the "Software"),
7
+ # to deal in the Software without restriction, including without limitation
8
+ # the rights to use, copy, modify, merge, publish, distribute, sublicense,
9
+ # and/or sell copies of the Software, and to permit persons to whom the
10
+ # Software is furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in
13
+ # all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
+ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21
+ # DEALINGS IN THE SOFTWARE.
22
+
23
+ require_relative "bitstream"
24
+ require_relative "../constants"
25
+
26
+ module Omnizip
27
+ module Algorithms
28
+ class Zstandard
29
+ module FSE
30
# One row of an FSE decoding table.
#
# Members:
# - symbol:   the symbol emitted when the decoder is in this state
# - num_bits: how many bits are read from the stream to pick the next state
# - baseline: base value the freshly read bits are added to, giving the
#             next state
State = Struct.new(*%i[symbol num_bits baseline])
37
+
38
+ # FSE decoding table (RFC 8878 Section 4.1)
39
+ #
40
+ # Builds a decoding table from a probability distribution
41
+ # according to RFC 8878.
42
+ class Table
43
+ include Constants
44
+
45
+ # @return [Array<State>] Decoding table entries
46
+ attr_reader :states
47
+
48
+ # @return [Integer] Accuracy log (table size = 2^accuracy_log)
49
+ attr_reader :accuracy_log
50
+
51
+ # @return [Integer] Number of symbols in the table
52
+ attr_reader :symbol_count
53
+
54
# Build a decoding table from a normalized distribution.
#
# Cells are first assigned to symbols (allocate_cells), then every cell
# gets its num_bits / baseline (calculate_state_values).
#
# @param distribution [Array<Integer>] Normalized symbol frequencies
# @param accuracy_log [Integer] Log2 of the table size
# @return [Table] Table with 2^accuracy_log states
def self.build(distribution, accuracy_log)
  size = 1 << accuracy_log
  layout = allocate_cells(distribution, size)

  new(calculate_state_values(layout, distribution, size),
      accuracy_log,
      distribution.length)
end
70
+
71
# Build a table for one of the predefined distributions.
#
# Plain delegation to build; it exists so call sites can state their intent.
#
# @param distribution [Array<Integer>] Predefined distribution
# @param accuracy_log [Integer] Accuracy log belonging to that distribution
# @return [Table]
def self.build_predefined(distribution, accuracy_log)
  build(distribution, accuracy_log)
end
79
+
80
# Wrap an already computed list of decoding states.
#
# @param states [Array<State>] Decoding states, indexed by state number
# @param accuracy_log [Integer] Log2 of the table size
# @param symbol_count [Integer] Number of symbols in the distribution
def initialize(states, accuracy_log, symbol_count)
  @states, @accuracy_log, @symbol_count = states, accuracy_log, symbol_count
end
90
+
91
# Look up a decoding state by its state number.
#
# @param index [Integer] State number
# @return [State, nil] The state stored at that position, or nil when the
#   position holds no state
def [](index)
  @states[index]
end
98
+
99
# Number of entries (states) in the table.
#
# @return [Integer]
def size
  @states.size
end
105
+
106
# Assign every table cell to a symbol (RFC 8878 Section 4.1.1).
#
# Symbols with a "less than 1" probability (encoded as -1) each take one
# cell, handed out from the end of the table backwards. The remaining
# symbols then receive `probability` cells each, placed by repeatedly
# stepping (table_size >> 1) + (table_size >> 3) + 3 cells forward and
# skipping cells that are already taken. nil, zero and other negative
# entries receive no cell.
#
# NOTE(review): assumes callers encode "less than 1" probabilities as -1,
# as RFC 8878 does — confirm against the distributions in Constants.
#
# @param distribution [Array<Integer, nil>] Normalized probabilities
# @param table_size [Integer] Number of cells (a power of two)
# @return [Array<Integer, nil>] Symbol per cell (nil for unassigned cells)
# @raise [ArgumentError] if the distribution needs more cells than available
# @raise [RuntimeError] if no free cell can be found while spreading
def self.allocate_cells(distribution, table_size)
  cells = Array.new(table_size, nil)

  # Cells actually needed: one per -1 entry plus the positive probabilities.
  low_prob_symbols = distribution.each_index.select { |sym| distribution[sym] == -1 }
  required = low_prob_symbols.length +
             distribution.sum { |prob| prob&.positive? ? prob : 0 }
  if required > table_size
    raise ArgumentError,
          "Distribution sum (#{required}) exceeds table size (#{table_size})"
  end

  # "Less than 1" symbols occupy the top of the table, retreating.
  low_prob_symbols.each_with_index do |symbol, offset|
    cells[table_size - 1 - offset] = symbol
  end

  step = (table_size >> 1) + (table_size >> 3) + 3
  mask = table_size - 1
  position = 0

  distribution.each_with_index do |prob, symbol|
    next if prob.nil? || prob <= 0

    prob.times do
      # Skip occupied cells; bounded so a degenerate step (0 modulo the
      # table size for very small tables) cannot loop forever.
      attempts = 0
      while cells[position]
        position = (position + step) & mask
        attempts += 1
        if attempts > table_size
          raise "FSE table allocation failed: no empty cell found"
        end
      end

      cells[position] = symbol
      position = (position + step) & mask
    end
  end

  cells
end
147
+
148
# Compute num_bits and baseline for every allocated cell
# (RFC 8878 Section 4.1.1).
#
# Cells are visited in ascending state order. Each symbol carries a running
# descriptor that starts at its probability (1 for a "less than 1" symbol,
# encoded as -1) and grows by one per visited cell. For a cell with
# descriptor d:
#
#   num_bits = accuracy_log - floor(log2(d))
#   baseline = (d << num_bits) - table_size
#
# so the destination ranges [baseline, baseline + 2^num_bits) of a symbol's
# cells do not overlap and every baseline lies inside the table.
#
# @param cells [Array<Integer, nil>] Symbol per cell (nil = unassigned)
# @param distribution [Array<Integer, nil>] Normalized probabilities
# @param table_size [Integer] Number of cells (a power of two)
# @return [Array<State, nil>] State per cell; nil where the cell is
#   unassigned or its symbol has no usable probability
def self.calculate_state_values(cells, distribution, table_size)
  states = Array.new(table_size)
  accuracy_log = table_size > 1 ? table_size.bit_length - 1 : 0

  descriptors = {} # symbol => descriptor for that symbol's next cell

  cells.each_with_index do |symbol, pos|
    next if symbol.nil?

    prob = distribution[symbol]
    next if prob.nil? || (prob <= 0 && prob != -1)

    descriptor = descriptors.fetch(symbol) { prob == -1 ? 1 : prob }
    descriptors[symbol] = descriptor + 1

    num_bits = [accuracy_log - (descriptor.bit_length - 1), 0].max
    baseline = (descriptor << num_bits) - table_size

    states[pos] = State.new(symbol, num_bits, baseline)
  end

  states
end
179
+
180
# Number of extra bits for a symbol of the given probability:
# floor(log2(table_size)) - floor(log2(prob)), never below zero.
#
# @param prob [Integer] Normalized probability of the symbol
# @param table_size [Integer] Number of cells in the table
# @return [Integer] Bit count; 0 when prob is not positive
def self.calculate_num_bits(prob, table_size)
  return 0 if prob <= 0

  floor_log2 = ->(value) { value > 1 ? value.bit_length - 1 : 0 }
  [floor_log2.call(table_size) - floor_log2.call(prob), 0].max
end
202
+ end
203
+
204
+ # FSE Decoder (RFC 8878 Section 4.1)
205
+ #
206
+ # Decodes symbols from FSE-encoded bitstreams.
207
+ class Decoder
208
+ # @return [Table] FSE decoding table
209
+ attr_reader :table
210
+
211
+ # @return [Integer] Current state
212
+ attr_reader :state
213
+
214
# Create a decoder bound to a decoding table; the state starts at 0 until
# init_state reads the real initial state.
#
# @param table [Table] FSE decoding table
def initialize(table)
  @table, @state = table, 0
end
221
+
222
# Load the initial state by reading accuracy_log bits from the bitstream.
#
# @param bitstream [BitStream] The bitstream to read from
# @return [Integer] The state that was read
def init_state(bitstream)
  width = @table.accuracy_log
  @state = bitstream.read_bits(width)
end
228
+
229
# Emit the symbol of the current state and advance to the next state.
#
# The next state is the entry's baseline plus num_bits freshly read bits
# (nothing is read when num_bits is 0), masked with table size - 1.
# When the current state has no table entry, 0 is returned and the state
# is left unchanged.
#
# @param bitstream [BitStream] The bitstream to read from
# @return [Integer] Decoded symbol
def decode(bitstream)
  entry = @table[@state]
  return 0 if entry.nil?

  decoded = entry.symbol

  following = entry.baseline
  following += bitstream.read_bits(entry.num_bits) if entry.num_bits.positive?

  # Keep the state inside the table.
  @state = following & (@table.size - 1)

  decoded
end
252
+
253
# Decode count symbols in a row by calling decode repeatedly.
#
# @param bitstream [BitStream] The bitstream to read from
# @param count [Integer] Number of symbols to decode
# @return [Array<Integer>] Decoded symbols, in decoding order
def decode_symbols(bitstream, count)
  count.times.map { decode(bitstream) }
end
265
+ end
266
+ end
267
+ end
268
+ end
269
+ end