omnizip 0.3.2 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +243 -368
- data/README.adoc +101 -5
- data/docs/guides/archive-formats/index.adoc +31 -1
- data/docs/guides/archive-formats/ole-format.adoc +316 -0
- data/docs/guides/archive-formats/rpm-format.adoc +249 -0
- data/docs/index.adoc +12 -2
- data/lib/omnizip/algorithms/lzma/distance_coder.rb +29 -18
- data/lib/omnizip/algorithms/lzma/encoder.rb +2 -1
- data/lib/omnizip/algorithms/lzma/length_coder.rb +6 -3
- data/lib/omnizip/algorithms/lzma/literal_decoder.rb +2 -1
- data/lib/omnizip/algorithms/lzma/lzip_decoder.rb +40 -13
- data/lib/omnizip/algorithms/lzma/range_decoder.rb +36 -2
- data/lib/omnizip/algorithms/lzma/range_encoder.rb +19 -0
- data/lib/omnizip/algorithms/lzma/xz_encoder_fast.rb +2 -1
- data/lib/omnizip/algorithms/lzma/xz_utils_decoder.rb +148 -112
- data/lib/omnizip/algorithms/lzma.rb +20 -5
- data/lib/omnizip/algorithms/ppmd7/decoder.rb +25 -21
- data/lib/omnizip/algorithms/ppmd7/encoder.rb +4 -11
- data/lib/omnizip/algorithms/sevenzip_lzma2.rb +2 -1
- data/lib/omnizip/algorithms/xz_lzma2.rb +2 -1
- data/lib/omnizip/algorithms/zstandard/constants.rb +125 -9
- data/lib/omnizip/algorithms/zstandard/decoder.rb +202 -17
- data/lib/omnizip/algorithms/zstandard/encoder.rb +197 -17
- data/lib/omnizip/algorithms/zstandard/frame/block.rb +128 -0
- data/lib/omnizip/algorithms/zstandard/frame/header.rb +224 -0
- data/lib/omnizip/algorithms/zstandard/fse/bitstream.rb +186 -0
- data/lib/omnizip/algorithms/zstandard/fse/encoder.rb +325 -0
- data/lib/omnizip/algorithms/zstandard/fse/table.rb +269 -0
- data/lib/omnizip/algorithms/zstandard/huffman.rb +272 -0
- data/lib/omnizip/algorithms/zstandard/huffman_encoder.rb +339 -0
- data/lib/omnizip/algorithms/zstandard/literals.rb +178 -0
- data/lib/omnizip/algorithms/zstandard/literals_encoder.rb +251 -0
- data/lib/omnizip/algorithms/zstandard/sequences.rb +346 -0
- data/lib/omnizip/buffer/memory_extractor.rb +3 -3
- data/lib/omnizip/buffer.rb +2 -2
- data/lib/omnizip/filters/delta.rb +2 -1
- data/lib/omnizip/filters/registry.rb +6 -6
- data/lib/omnizip/formats/cpio/bounded_io.rb +66 -0
- data/lib/omnizip/formats/lzip.rb +2 -1
- data/lib/omnizip/formats/lzma_alone.rb +2 -1
- data/lib/omnizip/formats/ole/allocation_table.rb +244 -0
- data/lib/omnizip/formats/ole/constants.rb +61 -0
- data/lib/omnizip/formats/ole/dirent.rb +380 -0
- data/lib/omnizip/formats/ole/header.rb +198 -0
- data/lib/omnizip/formats/ole/ranges_io.rb +264 -0
- data/lib/omnizip/formats/ole/storage.rb +305 -0
- data/lib/omnizip/formats/ole/types/variant.rb +328 -0
- data/lib/omnizip/formats/ole.rb +145 -0
- data/lib/omnizip/formats/rar/compression/ppmd/decoder.rb +92 -49
- data/lib/omnizip/formats/rar/compression/ppmd/encoder.rb +13 -20
- data/lib/omnizip/formats/rar/rar5/compression/lzss.rb +6 -2
- data/lib/omnizip/formats/rar3/reader.rb +6 -2
- data/lib/omnizip/formats/rar5/reader.rb +4 -1
- data/lib/omnizip/formats/rpm/constants.rb +58 -0
- data/lib/omnizip/formats/rpm/entry.rb +102 -0
- data/lib/omnizip/formats/rpm/header.rb +113 -0
- data/lib/omnizip/formats/rpm/lead.rb +122 -0
- data/lib/omnizip/formats/rpm/tag.rb +230 -0
- data/lib/omnizip/formats/rpm.rb +434 -0
- data/lib/omnizip/formats/seven_zip/bcj2_stream_decompressor.rb +239 -0
- data/lib/omnizip/formats/seven_zip/coder_chain.rb +32 -8
- data/lib/omnizip/formats/seven_zip/constants.rb +1 -1
- data/lib/omnizip/formats/seven_zip/reader.rb +84 -8
- data/lib/omnizip/formats/seven_zip/stream_compressor.rb +2 -1
- data/lib/omnizip/formats/seven_zip/stream_decompressor.rb +6 -0
- data/lib/omnizip/formats/seven_zip/writer.rb +21 -9
- data/lib/omnizip/formats/seven_zip.rb +10 -0
- data/lib/omnizip/formats/xar/entry.rb +18 -5
- data/lib/omnizip/formats/xar/header.rb +34 -6
- data/lib/omnizip/formats/xar/reader.rb +43 -10
- data/lib/omnizip/formats/xar/toc.rb +34 -21
- data/lib/omnizip/formats/xar/writer.rb +15 -5
- data/lib/omnizip/formats/xz_impl/block_decoder.rb +45 -33
- data/lib/omnizip/formats/xz_impl/block_encoder.rb +2 -1
- data/lib/omnizip/formats/xz_impl/index_decoder.rb +3 -1
- data/lib/omnizip/formats/xz_impl/stream_header_parser.rb +2 -1
- data/lib/omnizip/formats/zip/end_of_central_directory.rb +4 -3
- data/lib/omnizip/implementations/seven_zip/lzma/decoder.rb +14 -6
- data/lib/omnizip/implementations/seven_zip/lzma/encoder.rb +2 -1
- data/lib/omnizip/implementations/seven_zip/lzma2/encoder.rb +28 -13
- data/lib/omnizip/implementations/xz_utils/lzma2/encoder.rb +13 -6
- data/lib/omnizip/pipe/stream_compressor.rb +1 -1
- data/lib/omnizip/version.rb +1 -1
- data/readme-docs/compression-algorithms.adoc +6 -2
- metadata +30 -2
|
@@ -109,6 +109,25 @@ module Omnizip
|
|
|
109
109
|
end
|
|
110
110
|
end
|
|
111
111
|
|
|
112
|
+
# Encode a symbol using cumulative frequency range
|
|
113
|
+
#
|
|
114
|
+
# This is used by PPMd for encoding symbols based on their
|
|
115
|
+
# frequency distribution in the current context.
|
|
116
|
+
#
|
|
117
|
+
# @param cum_freq [Integer] Cumulative frequency up to this symbol
|
|
118
|
+
# @param freq [Integer] Frequency of this symbol
|
|
119
|
+
# @param total_freq [Integer] Total frequency of all symbols in context
|
|
120
|
+
# @return [void]
|
|
121
|
+
def encode_freq(cum_freq, freq, total_freq)
|
|
122
|
+
normalize
|
|
123
|
+
range_freq = @range / total_freq
|
|
124
|
+
low_bound = range_freq * cum_freq
|
|
125
|
+
high_bound = range_freq * (cum_freq + freq)
|
|
126
|
+
|
|
127
|
+
@low = (@low + low_bound) & 0xFFFFFFFFFFFFFFFF
|
|
128
|
+
@range = (high_bound - low_bound) & 0xFFFFFFFF
|
|
129
|
+
end
|
|
130
|
+
|
|
112
131
|
# Flush remaining bytes to output stream
|
|
113
132
|
#
|
|
114
133
|
# Ported from XZ Utils rc_flush().
|
|
@@ -487,7 +487,8 @@ module Omnizip
|
|
|
487
487
|
subcoder_index = offset + match_bit + (symbol >> 8)
|
|
488
488
|
bit = (symbol >> 7) & 1
|
|
489
489
|
|
|
490
|
-
@encoder.queue_bit(@models.literal[literal_base + subcoder_index],
|
|
490
|
+
@encoder.queue_bit(@models.literal[literal_base + subcoder_index],
|
|
491
|
+
bit)
|
|
491
492
|
|
|
492
493
|
symbol <<= 1
|
|
493
494
|
offset &= ~(match_byte ^ symbol)
|
|
@@ -167,12 +167,15 @@ module Omnizip
|
|
|
167
167
|
# @param preserve_dict [Boolean] Whether to preserve dictionary from previous decode
|
|
168
168
|
# @param check_rc_finished [Boolean] Whether to check if range decoder is finished
|
|
169
169
|
# @return [String, Integer] Decompressed data or bytes written
|
|
170
|
-
def decode_stream(output = nil, preserve_dict: false,
|
|
170
|
+
def decode_stream(output = nil, preserve_dict: false,
|
|
171
|
+
check_rc_finished: true)
|
|
171
172
|
@decode_stream_call_count ||= 0
|
|
172
173
|
@decode_stream_call_count += 1
|
|
173
174
|
call_num = @decode_stream_call_count
|
|
174
175
|
|
|
175
|
-
puts "DEBUG decode_stream START (call ##{call_num}): @pos=#{@pos.inspect}, @dict_full=#{@dict_full.inspect}, preserve_dict=#{preserve_dict}, @uncompressed_size=#{@uncompressed_size.inspect}" if ENV.fetch(
|
|
176
|
+
puts "DEBUG decode_stream START (call ##{call_num}): @pos=#{@pos.inspect}, @dict_full=#{@dict_full.inspect}, preserve_dict=#{preserve_dict}, @uncompressed_size=#{@uncompressed_size.inspect}" if ENV.fetch(
|
|
177
|
+
"LZMA_DEBUG", nil
|
|
178
|
+
) && @dict_full && @dict_full >= 200 && @dict_full <= 230
|
|
176
179
|
if ENV["LZMA_DEBUG_DECODE_STREAM"]
|
|
177
180
|
warn "DEBUG decode_stream[#{@decoder_id}] START: preserve_dict=#{preserve_dict}, @pos=#{@pos.inspect}, @dict_full=#{@dict_full.inspect}, @dict_buf.object_id=#{@dict_buf&.object_id || 'nil'}, @dict_buf.size=#{@dict_buf&.size || 'nil'}"
|
|
178
181
|
end
|
|
@@ -183,10 +186,10 @@ module Omnizip
|
|
|
183
186
|
# See: /Users/mulgogi/src/external/xz/src/liblzma/lzma/lzma2_decoder.c:140-141
|
|
184
187
|
if ENV["LZMA_DEBUG"]
|
|
185
188
|
warn "DEBUG: decode_stream - reusing range decoder @input.pos=#{begin
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
189
|
+
@input.pos
|
|
190
|
+
rescue StandardError
|
|
191
|
+
'N/A'
|
|
192
|
+
end}, @range_decoder.class=#{@range_decoder.class}"
|
|
190
193
|
end
|
|
191
194
|
|
|
192
195
|
# Create range decoder if it doesn't exist (first chunk)
|
|
@@ -200,7 +203,7 @@ module Omnizip
|
|
|
200
203
|
|
|
201
204
|
# Special case: empty input (uncompressed_size == 0)
|
|
202
205
|
# Return immediately without trying to decode anything
|
|
203
|
-
if @uncompressed_size != 0xFFFFFFFFFFFFFFFF && @uncompressed_size
|
|
206
|
+
if @uncompressed_size != 0xFFFFFFFFFFFFFFFF && @uncompressed_size.zero?
|
|
204
207
|
if ENV["LZMA_DEBUG"]
|
|
205
208
|
warn "DEBUG: decode_stream - empty input (uncompressed_size=0), returning immediately"
|
|
206
209
|
end
|
|
@@ -216,7 +219,9 @@ module Omnizip
|
|
|
216
219
|
@chunk_bytes_decoded = 0
|
|
217
220
|
|
|
218
221
|
# DEBUG: Show chunk_bytes_decoded initialization
|
|
219
|
-
if @dict_full && @dict_full >= 220 && @dict_full <= 240 && ENV.fetch(
|
|
222
|
+
if @dict_full && @dict_full >= 220 && @dict_full <= 240 && ENV.fetch(
|
|
223
|
+
"LZMA_DEBUG", nil
|
|
224
|
+
)
|
|
220
225
|
puts "DEBUG: chunk_bytes_decoded reset to 0 for chunk (call_num=#{call_num}, dict_full=#{@dict_full})"
|
|
221
226
|
end
|
|
222
227
|
|
|
@@ -274,9 +279,13 @@ module Omnizip
|
|
|
274
279
|
# properly reflected in start_pos, so we only return NEW bytes.
|
|
275
280
|
# For LZMA2, we need to return only the NEW bytes, not all bytes from LZ_DICT_INIT_POS
|
|
276
281
|
start_pos = @pos || LZ_DICT_INIT_POS
|
|
277
|
-
puts "DEBUG: start_pos=#{start_pos}, @pos=#{@pos.inspect}, @dict_full=#{@dict_full.inspect}, preserve_dict=#{preserve_dict}, @decoder_id=#{@decoder_id}" if ENV.fetch(
|
|
282
|
+
puts "DEBUG: start_pos=#{start_pos}, @pos=#{@pos.inspect}, @dict_full=#{@dict_full.inspect}, preserve_dict=#{preserve_dict}, @decoder_id=#{@decoder_id}" if ENV.fetch(
|
|
283
|
+
"LZMA_DEBUG", nil
|
|
284
|
+
) && @dict_full && @dict_full >= 200 && @dict_full <= 230
|
|
278
285
|
# Also show for chunk #1 start (dict_full around 227)
|
|
279
|
-
puts "DEBUG: start_pos=#{start_pos}, @pos=#{@pos.inspect}, @dict_full=#{@dict_full.inspect}, @uncompressed_size=#{@uncompressed_size}, @decoder_id=#{@decoder_id}" if ENV.fetch(
|
|
286
|
+
puts "DEBUG: start_pos=#{start_pos}, @pos=#{@pos.inspect}, @dict_full=#{@dict_full.inspect}, @uncompressed_size=#{@uncompressed_size}, @decoder_id=#{@decoder_id}" if ENV.fetch(
|
|
287
|
+
"LZMA_DEBUG", nil
|
|
288
|
+
) && @dict_full && @dict_full >= 225 && @dict_full <= 230
|
|
280
289
|
|
|
281
290
|
# Initialize rep distances (XZ Utils initializes to 0)
|
|
282
291
|
# See: /Users/mulgogi/src/external/xz/src/liblzma/lzma/lzma_decoder.c:1054-1055
|
|
@@ -285,7 +294,9 @@ module Omnizip
|
|
|
285
294
|
#
|
|
286
295
|
# IMPORTANT: Initialize rep distances if they're nil OR not preserving dict
|
|
287
296
|
if @rep0.nil? || @rep1.nil? || @rep2.nil? || @rep3.nil? || !preserve_dict
|
|
288
|
-
puts "DEBUG: Resetting rep distances to 0 (rep0.nil?=#{@rep0.nil?}, preserve_dict=#{preserve_dict})" if ENV.fetch(
|
|
297
|
+
puts "DEBUG: Resetting rep distances to 0 (rep0.nil?=#{@rep0.nil?}, preserve_dict=#{preserve_dict})" if ENV.fetch(
|
|
298
|
+
"LZMA_DEBUG", nil
|
|
299
|
+
) && @dict_full && @dict_full >= 200 && @dict_full <= 230
|
|
289
300
|
@rep0 = 0
|
|
290
301
|
@rep1 = 0
|
|
291
302
|
@rep2 = 0
|
|
@@ -301,17 +312,19 @@ module Omnizip
|
|
|
301
312
|
# XZ Utils uses dict->pos (current position) + uncompressed_size
|
|
302
313
|
# We use start_pos (current position) + @uncompressed_size
|
|
303
314
|
limit = if @uncompressed_size == 0xFFFFFFFFFFFFFFFF
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
315
|
+
nil # No limit for unknown size
|
|
316
|
+
else
|
|
317
|
+
start_pos + @uncompressed_size
|
|
318
|
+
end
|
|
308
319
|
|
|
309
320
|
# DEBUG: Show limit calculation for chunk #1
|
|
310
|
-
if ENV.fetch("LZMA_DEBUG_LIMIT",
|
|
321
|
+
if ENV.fetch("LZMA_DEBUG_LIMIT",
|
|
322
|
+
nil) && @dict_full && @dict_full >= 220 && @dict_full <= 240
|
|
311
323
|
puts "DEBUG LIMIT CALCULATION: start_pos=#{start_pos}, @uncompressed_size=#{@uncompressed_size}, limit=#{limit.inspect}"
|
|
312
324
|
end
|
|
313
325
|
# DEBUG: Also show for dict_full around 293 (where the error occurs)
|
|
314
|
-
if ENV.fetch("LZMA_DEBUG_LIMIT",
|
|
326
|
+
if ENV.fetch("LZMA_DEBUG_LIMIT",
|
|
327
|
+
nil) && @dict_full && @dict_full >= 290 && @dict_full <= 300
|
|
315
328
|
puts "DEBUG LIMIT CALCULATION at dict_full=#{@dict_full}: start_pos=#{start_pos}, @uncompressed_size=#{@uncompressed_size}, limit=#{limit.inspect}, @decoder_id=#{@decoder_id}"
|
|
316
329
|
end
|
|
317
330
|
|
|
@@ -319,17 +332,18 @@ module Omnizip
|
|
|
319
332
|
loop do
|
|
320
333
|
iteration += 1
|
|
321
334
|
# DEBUG: Show every iteration after position 200
|
|
322
|
-
if ENV.fetch("LZMA_DEBUG_ITER",
|
|
335
|
+
if ENV.fetch("LZMA_DEBUG_ITER",
|
|
336
|
+
nil) && @dict_full && @dict_full >= 200 && @dict_full <= 500
|
|
323
337
|
puts "DEBUG ITERATION ##{iteration}: pos=#{@pos}, dict_full=#{@dict_full}, limit=#{limit.inspect}"
|
|
324
338
|
end
|
|
325
339
|
# Check if we've reached the expected size (if known)
|
|
326
340
|
# XZ Utils: checks dict.pos < dict.limit
|
|
327
341
|
if ENV["LZMA_DEBUG_LIMIT"]
|
|
328
342
|
compare_result = begin
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
343
|
+
limit && @pos >= limit
|
|
344
|
+
rescue StandardError
|
|
345
|
+
"ERROR"
|
|
346
|
+
end
|
|
333
347
|
XzUtilsDecoderDebug.debug_puts "DEBUG LIMIT: iter=#{iteration}, pos=#{@pos.inspect}, dict_full=#{@dict_full}, limit=#{limit.inspect}, pos >= limit: #{compare_result}"
|
|
334
348
|
end
|
|
335
349
|
|
|
@@ -504,16 +518,16 @@ module Omnizip
|
|
|
504
518
|
# If explicitly set to false, it allows EOPM even when uncompressed size is known
|
|
505
519
|
# Reference: alone_decoder.c:127 (LZMA_LZMA1EXT_ALLOW_EOPM)
|
|
506
520
|
should_check = if @allow_eopm == true
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
521
|
+
# EOPM is explicitly allowed, skip the check
|
|
522
|
+
false
|
|
523
|
+
elsif @allow_eopm == false
|
|
524
|
+
# LZMA2 mode: always check (EOPM is not allowed)
|
|
525
|
+
true
|
|
526
|
+
else
|
|
527
|
+
# @allow_eopm is nil (not set, first chunk or legacy mode)
|
|
528
|
+
# Use check_rc_finished parameter as default
|
|
529
|
+
check_rc_finished
|
|
530
|
+
end
|
|
517
531
|
|
|
518
532
|
if should_check
|
|
519
533
|
# If EOPM is not allowed, range decoder MUST be finished
|
|
@@ -522,12 +536,10 @@ module Omnizip
|
|
|
522
536
|
"LZMA stream finished with leftover compressed data (range_decoder.code=#{@range_decoder.code}, expected 0). This indicates corruption in the compressed stream or an invalid EOPM for LZMA2."
|
|
523
537
|
end
|
|
524
538
|
break
|
|
525
|
-
|
|
539
|
+
elsif @range_decoder.code.zero?
|
|
526
540
|
# EOPM is allowed (e.g., LZMA_Alone format)
|
|
527
541
|
# If range decoder is finished, we're done
|
|
528
|
-
|
|
529
|
-
break
|
|
530
|
-
end
|
|
542
|
+
break
|
|
531
543
|
# Otherwise, continue decoding to find EOPM marker
|
|
532
544
|
# XZ Utils sets eopm_is_valid = true and continues
|
|
533
545
|
# Reference: lzma_decoder.c:704
|
|
@@ -535,12 +547,14 @@ module Omnizip
|
|
|
535
547
|
end
|
|
536
548
|
|
|
537
549
|
# DEBUG: Show when approaching limit for chunk #1
|
|
538
|
-
if ENV.fetch("LZMA_DEBUG_LIMIT",
|
|
550
|
+
if ENV.fetch("LZMA_DEBUG_LIMIT",
|
|
551
|
+
nil) && limit && @pos >= limit - 10 && @pos < limit + 10
|
|
539
552
|
puts "DEBUG NEAR LIMIT (call #{call_num}): pos=#{@pos}, limit=#{limit}, dict_full=#{@dict_full}, chunk_bytes_decoded=#{@chunk_bytes_decoded}, remaining=#{@uncompressed_size ? @uncompressed_size - @chunk_bytes_decoded : 'N/A'}"
|
|
540
553
|
end
|
|
541
554
|
|
|
542
555
|
# DEBUG: Show when we've passed the expected limit
|
|
543
|
-
if ENV.fetch("LZMA_DEBUG_LIMIT",
|
|
556
|
+
if ENV.fetch("LZMA_DEBUG_LIMIT",
|
|
557
|
+
nil) && limit && @pos >= limit && @pos < limit + 10
|
|
544
558
|
puts "DEBUG PASSED LIMIT: pos=#{@pos}, limit=#{limit}, dict_full=#{@dict_full}, delta=#{@pos - limit}"
|
|
545
559
|
end
|
|
546
560
|
|
|
@@ -614,16 +628,18 @@ module Omnizip
|
|
|
614
628
|
end
|
|
615
629
|
valid_bytes = @dict_buf[start_pos...@pos]
|
|
616
630
|
# DEBUG: Show return value calculation
|
|
617
|
-
puts "DEBUG RETURN CALCULATION: call_num=#{call_num}, start_pos=#{start_pos}, @pos=#{@pos}, valid_bytes.size=#{@dict_buf[start_pos...@pos].size}, dict_full=#{@dict_full}, chunk_bytes_decoded=#{@chunk_bytes_decoded}" if ENV.fetch(
|
|
631
|
+
puts "DEBUG RETURN CALCULATION: call_num=#{call_num}, start_pos=#{start_pos}, @pos=#{@pos}, valid_bytes.size=#{@dict_buf[start_pos...@pos].size}, dict_full=#{@dict_full}, chunk_bytes_decoded=#{@chunk_bytes_decoded}" if ENV.fetch(
|
|
632
|
+
"LZMA_DEBUG", nil
|
|
633
|
+
) && @dict_full && @dict_full >= 220 && @dict_full <= 240 && call_num == 2
|
|
618
634
|
puts "DEBUG RETURN CALCULATION: call_num=#{call_num}, start_pos=#{start_pos}, @pos=#{@pos}, valid_bytes.size=#{@dict_buf[start_pos...@pos].size}, dict_full=#{@dict_full}, chunk_bytes_decoded=#{@chunk_bytes_decoded}" if ENV["LZMA_DEBUG"] && call_num == 2
|
|
619
635
|
# Filter out nil values (can happen during dictionary reset transitions)
|
|
620
636
|
valid_bytes = valid_bytes.map { |b| b.nil? ? 0 : b }
|
|
621
637
|
if ENV["DEBUG_DICT_BUF"]
|
|
622
638
|
XzUtilsDecoderDebug.debug_puts "DEBUG: valid_bytes=#{begin
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
639
|
+
valid_bytes.size
|
|
640
|
+
rescue StandardError
|
|
641
|
+
valid_bytes.inspect
|
|
642
|
+
end}"
|
|
627
643
|
end
|
|
628
644
|
valid_data = valid_bytes.pack("C*")
|
|
629
645
|
if output
|
|
@@ -863,24 +879,24 @@ module Omnizip
|
|
|
863
879
|
if @range_decoder
|
|
864
880
|
if ENV["LZMA_DEBUG"]
|
|
865
881
|
input_pos = begin
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
|
|
882
|
+
@input.pos
|
|
883
|
+
rescue StandardError
|
|
884
|
+
"N/A"
|
|
885
|
+
end
|
|
870
886
|
input_size = begin
|
|
871
|
-
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
|
|
887
|
+
@input.size
|
|
888
|
+
rescue StandardError
|
|
889
|
+
"N/A"
|
|
890
|
+
end
|
|
875
891
|
XzUtilsDecoderDebug.debug_puts "=== finish_state_reset: resetting range_decoder, input pos=#{input_pos}, size=#{input_size}"
|
|
876
892
|
end
|
|
877
893
|
@range_decoder.reset
|
|
878
894
|
if ENV["LZMA_DEBUG"]
|
|
879
895
|
input_pos_after = begin
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
|
|
896
|
+
@input.pos
|
|
897
|
+
rescue StandardError
|
|
898
|
+
"N/A"
|
|
899
|
+
end
|
|
884
900
|
XzUtilsDecoderDebug.debug_puts "=== finish_state_reset: after reset, input pos=#{input_pos_after}, range_decoder.code=0x#{@range_decoder.code.to_s(16)}"
|
|
885
901
|
end
|
|
886
902
|
end
|
|
@@ -920,7 +936,8 @@ module Omnizip
|
|
|
920
936
|
@input = new_input
|
|
921
937
|
|
|
922
938
|
# DEBUG: Trace input stream contents
|
|
923
|
-
if ENV.fetch("LZMA_DEBUG",
|
|
939
|
+
if ENV.fetch("LZMA_DEBUG",
|
|
940
|
+
nil) && @dict_full && @dict_full >= 220 && @dict_full <= 230
|
|
924
941
|
puts "\n=== set_input at dict_full=#{@dict_full} ==="
|
|
925
942
|
puts " new_input.size=#{new_input.size}"
|
|
926
943
|
puts " new_input.pos=#{new_input.pos}"
|
|
@@ -934,7 +951,9 @@ module Omnizip
|
|
|
934
951
|
|
|
935
952
|
first_bytes << byte
|
|
936
953
|
end
|
|
937
|
-
puts " First 10 bytes: #{first_bytes.map
|
|
954
|
+
puts " First 10 bytes: #{first_bytes.map do |b|
|
|
955
|
+
"0x#{b.to_s(16).upcase}"
|
|
956
|
+
end.join(' ')}"
|
|
938
957
|
|
|
939
958
|
new_input.rewind
|
|
940
959
|
test_byte = new_input.getbyte
|
|
@@ -1268,22 +1287,22 @@ module Omnizip
|
|
|
1268
1287
|
if @dict_full == 233
|
|
1269
1288
|
XzUtilsDecoderDebug.debug_puts " DETAILED TRACE at dict_full=233 (pos=#{@pos}):"
|
|
1270
1289
|
XzUtilsDecoderDebug.debug_puts " byte=0x#{byte.to_s(16)} ('#{begin
|
|
1271
|
-
|
|
1272
|
-
|
|
1273
|
-
|
|
1274
|
-
|
|
1290
|
+
byte.chr
|
|
1291
|
+
rescue StandardError
|
|
1292
|
+
'?'
|
|
1293
|
+
end}')"
|
|
1275
1294
|
XzUtilsDecoderDebug.debug_puts " state.value=#{@state.value}, lit_state=#{lit_state}"
|
|
1276
1295
|
XzUtilsDecoderDebug.debug_puts " use_matched_literal?=#{@state.use_matched_literal?}"
|
|
1277
1296
|
prev_byte_val = @dict_full.positive? ? @dict_buf[@pos - 1] : "N/A"
|
|
1278
1297
|
XzUtilsDecoderDebug.debug_puts " prev_byte=#{prev_byte_val.inspect} (#{if prev_byte_val.is_a?(Integer)
|
|
1279
|
-
|
|
1280
|
-
|
|
1281
|
-
|
|
1282
|
-
|
|
1283
|
-
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
|
|
1298
|
+
"0x#{prev_byte_val.to_s(16)} ('#{begin
|
|
1299
|
+
prev_byte_val.chr
|
|
1300
|
+
rescue StandardError
|
|
1301
|
+
'?'
|
|
1302
|
+
end}')"
|
|
1303
|
+
else
|
|
1304
|
+
'N/A'
|
|
1305
|
+
end})"
|
|
1287
1306
|
XzUtilsDecoderDebug.debug_puts " range_decoder.range=0x#{@range_decoder.range.to_s(16)}, range_decoder.code=0x#{@range_decoder.code.to_s(16)}"
|
|
1288
1307
|
XzUtilsDecoderDebug.debug_puts " input.pos=#{@input.pos}, input.size=#{@input.size}"
|
|
1289
1308
|
end
|
|
@@ -1335,14 +1354,17 @@ module Omnizip
|
|
|
1335
1354
|
if ENV["TRACE_ARM64_BYTES"]
|
|
1336
1355
|
@arm64_trace ||= []
|
|
1337
1356
|
if @arm64_trace.size < 20
|
|
1338
|
-
@arm64_trace << [@dict_full, @pos, byte.class,
|
|
1357
|
+
@arm64_trace << [@dict_full, @pos, byte.class,
|
|
1358
|
+
byte.is_a?(Integer) ? byte : byte.ord, @dict_buf[@pos]]
|
|
1339
1359
|
if @arm64_trace.size == 20
|
|
1340
1360
|
# Dump the trace
|
|
1341
1361
|
puts "\n=== ARM64 BYTE TRACE (first 20 bytes) ==="
|
|
1342
1362
|
puts "Decoder ID: #{@decoder_id}"
|
|
1343
1363
|
@arm64_trace.each_with_index do |entry, i|
|
|
1344
1364
|
df, p, _, val, stored = entry
|
|
1345
|
-
puts " [#{i + 1}] dict_full=#{df.to_s.rjust(6)}, pos=#{p.to_s.rjust(6)}, byte=#{val.to_s.rjust(3)} (0x#{val.to_s(16).upcase.rjust(
|
|
1365
|
+
puts " [#{i + 1}] dict_full=#{df.to_s.rjust(6)}, pos=#{p.to_s.rjust(6)}, byte=#{val.to_s.rjust(3)} (0x#{val.to_s(16).upcase.rjust(
|
|
1366
|
+
2, '0'
|
|
1367
|
+
)}) stored=#{stored.inspect}"
|
|
1346
1368
|
end
|
|
1347
1369
|
puts "=========================================\n"
|
|
1348
1370
|
$stderr.flush
|
|
@@ -1482,24 +1504,36 @@ module Omnizip
|
|
|
1482
1504
|
# ((len) < DIST_STATES + MATCH_LEN_MIN ? (len) - MATCH_LEN_MIN : DIST_STATES - 1)
|
|
1483
1505
|
# This gives: len=2→0, len=3→1, len=4→2, len=5→3, len=6+→3
|
|
1484
1506
|
len_state = if length < NUM_LEN_TO_POS_STATES + MATCH_LEN_MIN
|
|
1485
|
-
|
|
1486
|
-
|
|
1487
|
-
|
|
1488
|
-
|
|
1507
|
+
length - MATCH_LEN_MIN
|
|
1508
|
+
else
|
|
1509
|
+
NUM_LEN_TO_POS_STATES - 1
|
|
1510
|
+
end
|
|
1489
1511
|
|
|
1490
1512
|
# DEBUG: Show bytes being copied
|
|
1491
1513
|
if old_dict_full.between?(210, 230) || ENV["LZMA_DEBUG_DISTANCE"]
|
|
1492
|
-
XzUtilsDecoderDebug.debug_puts "\n=== decode_regular_match at dict_full=#{old_dict_full} ===" if old_dict_full.between?(
|
|
1514
|
+
XzUtilsDecoderDebug.debug_puts "\n=== decode_regular_match at dict_full=#{old_dict_full} ===" if old_dict_full.between?(
|
|
1515
|
+
210, 230
|
|
1516
|
+
)
|
|
1493
1517
|
puts "[DISTANCE_DECODER] decode_regular_match at dict_full=#{old_dict_full}" if ENV["LZMA_DEBUG_DISTANCE"]
|
|
1494
|
-
XzUtilsDecoderDebug.debug_puts " pos_state=#{pos_state}" if old_dict_full.between?(
|
|
1518
|
+
XzUtilsDecoderDebug.debug_puts " pos_state=#{pos_state}" if old_dict_full.between?(
|
|
1519
|
+
210, 230
|
|
1520
|
+
)
|
|
1495
1521
|
puts "[DISTANCE_DECODER] pos_state=#{pos_state}" if ENV["LZMA_DEBUG_DISTANCE"]
|
|
1496
|
-
XzUtilsDecoderDebug.debug_puts " state=#{old_state}" if old_dict_full.between?(
|
|
1522
|
+
XzUtilsDecoderDebug.debug_puts " state=#{old_state}" if old_dict_full.between?(
|
|
1523
|
+
210, 230
|
|
1524
|
+
)
|
|
1497
1525
|
puts "[DISTANCE_DECODER] state=#{old_state}" if ENV["LZMA_DEBUG_DISTANCE"]
|
|
1498
|
-
XzUtilsDecoderDebug.debug_puts " length_encoded=#{length_encoded} length=#{length}" if old_dict_full.between?(
|
|
1526
|
+
XzUtilsDecoderDebug.debug_puts " length_encoded=#{length_encoded} length=#{length}" if old_dict_full.between?(
|
|
1527
|
+
210, 230
|
|
1528
|
+
)
|
|
1499
1529
|
puts "[DISTANCE_DECODER] length_encoded=#{length_encoded} length=#{length}" if ENV["LZMA_DEBUG_DISTANCE"]
|
|
1500
|
-
XzUtilsDecoderDebug.debug_puts " len_state=#{len_state}" if old_dict_full.between?(
|
|
1530
|
+
XzUtilsDecoderDebug.debug_puts " len_state=#{len_state}" if old_dict_full.between?(
|
|
1531
|
+
210, 230
|
|
1532
|
+
)
|
|
1501
1533
|
puts "[DISTANCE_DECODER] len_state=#{len_state}" if ENV["LZMA_DEBUG_DISTANCE"]
|
|
1502
|
-
XzUtilsDecoderDebug.debug_puts " rep0_before=#{old_rep0}" if old_dict_full.between?(
|
|
1534
|
+
XzUtilsDecoderDebug.debug_puts " rep0_before=#{old_rep0}" if old_dict_full.between?(
|
|
1535
|
+
210, 230
|
|
1536
|
+
)
|
|
1503
1537
|
puts "[DISTANCE_DECODER] rep0_before=#{old_rep0}" if ENV["LZMA_DEBUG_DISTANCE"]
|
|
1504
1538
|
end
|
|
1505
1539
|
|
|
@@ -1513,7 +1547,9 @@ module Omnizip
|
|
|
1513
1547
|
rep0 = @distance_coder.decode(@range_decoder, len_state)
|
|
1514
1548
|
|
|
1515
1549
|
# DEBUG
|
|
1516
|
-
if (ENV.fetch("LZMA_DEBUG",
|
|
1550
|
+
if (ENV.fetch("LZMA_DEBUG",
|
|
1551
|
+
nil) && old_dict_full.between?(210,
|
|
1552
|
+
230)) || old_dict_full == 293
|
|
1517
1553
|
puts " rep0_decoded=#{rep0} (distance = #{rep0})"
|
|
1518
1554
|
puts " buffer_back calculation: back=#{@dict_full - rep0 - 1}"
|
|
1519
1555
|
end
|
|
@@ -1609,14 +1645,14 @@ module Omnizip
|
|
|
1609
1645
|
XzUtilsDecoderDebug.debug_puts " buffer_back=#{buffer_back}, back=#{back}"
|
|
1610
1646
|
bytes_at_back = @dict_buf[buffer_back, 3]
|
|
1611
1647
|
bytes_hex = if bytes_at_back.is_a?(String)
|
|
1612
|
-
|
|
1613
|
-
|
|
1614
|
-
|
|
1615
|
-
|
|
1616
|
-
|
|
1617
|
-
|
|
1618
|
-
|
|
1619
|
-
|
|
1648
|
+
bytes_at_back.bytes.map do |b|
|
|
1649
|
+
"%02x" % b
|
|
1650
|
+
end.join(" ")
|
|
1651
|
+
else
|
|
1652
|
+
bytes_at_back.map do |b|
|
|
1653
|
+
"%02x" % b
|
|
1654
|
+
end.join(" ")
|
|
1655
|
+
end
|
|
1620
1656
|
XzUtilsDecoderDebug.debug_puts " First 3 bytes at buffer_back: #{bytes_hex} (#{bytes_at_back.inspect})"
|
|
1621
1657
|
end
|
|
1622
1658
|
|
|
@@ -1916,14 +1952,14 @@ module Omnizip
|
|
|
1916
1952
|
XzUtilsDecoderDebug.debug_puts " back=#{old_back}, wrapped_back=#{back}, buffer_back=#{buffer_back}"
|
|
1917
1953
|
bytes_at_back = @dict_buf[buffer_back, 3]
|
|
1918
1954
|
bytes_hex = if bytes_at_back.is_a?(String)
|
|
1919
|
-
|
|
1920
|
-
|
|
1921
|
-
|
|
1922
|
-
|
|
1923
|
-
|
|
1924
|
-
|
|
1925
|
-
|
|
1926
|
-
|
|
1955
|
+
bytes_at_back.bytes.map do |b|
|
|
1956
|
+
"%02x" % b
|
|
1957
|
+
end.join(" ")
|
|
1958
|
+
else
|
|
1959
|
+
[bytes_at_back].flatten.map do |b|
|
|
1960
|
+
"%02x" % b
|
|
1961
|
+
end.join(" ")
|
|
1962
|
+
end
|
|
1927
1963
|
XzUtilsDecoderDebug.debug_puts " First 3 bytes at buffer_back: #{bytes_hex} (#{bytes_at_back.inspect})"
|
|
1928
1964
|
end
|
|
1929
1965
|
|
|
@@ -1933,10 +1969,10 @@ module Omnizip
|
|
|
1933
1969
|
source_val = @dict_buf[@pos - 1]
|
|
1934
1970
|
puts " Rep match copy at dict_full=#{@dict_full}: length=#{length}, distance=#{distance}, @pos=#{@pos} (will write to #{@pos}...#{@pos + length - 1})"
|
|
1935
1971
|
puts " Reading from @pos-1=#{@pos - 1}, source byte = #{source_val} (0x#{source_val.to_s(16)} '#{begin
|
|
1936
|
-
|
|
1937
|
-
|
|
1938
|
-
|
|
1939
|
-
|
|
1972
|
+
source_val.chr
|
|
1973
|
+
rescue StandardError
|
|
1974
|
+
'?'
|
|
1975
|
+
end}')"
|
|
1940
1976
|
puts " Before copy: @dict_buf[#{@pos}...#{@pos + length - 1}] = #{@dict_buf[@pos,
|
|
1941
1977
|
length].inspect}"
|
|
1942
1978
|
end
|
|
@@ -1950,17 +1986,17 @@ module Omnizip
|
|
|
1950
1986
|
10].inspect}"
|
|
1951
1987
|
# DEBUG: Check if buffer_back+1 has the correct byte
|
|
1952
1988
|
puts " dict_buf[buffer_back+1=#{buffer_back + 1}] = #{@dict_buf[buffer_back + 1].inspect} ('#{begin
|
|
1953
|
-
|
|
1954
|
-
|
|
1955
|
-
|
|
1956
|
-
|
|
1989
|
+
@dict_buf[buffer_back + 1].chr
|
|
1990
|
+
rescue StandardError
|
|
1991
|
+
'?'
|
|
1992
|
+
end}')"
|
|
1957
1993
|
prev_5 = if buffer_back > 4
|
|
1958
|
-
|
|
1959
|
-
|
|
1960
|
-
|
|
1961
|
-
|
|
1962
|
-
|
|
1963
|
-
|
|
1994
|
+
@dict_buf[(buffer_back - 5)..(buffer_back - 1)].map do |b|
|
|
1995
|
+
"0x#{b.to_s(16).upcase} (#{b.chr})"
|
|
1996
|
+
end.join(", ")
|
|
1997
|
+
else
|
|
1998
|
+
"N/A"
|
|
1999
|
+
end
|
|
1964
2000
|
puts " Previous 5 bytes: [#{prev_5}]"
|
|
1965
2001
|
puts " Current dict_full=#{@dict_full}, @pos=#{@pos}"
|
|
1966
2002
|
end
|
|
@@ -97,12 +97,21 @@ module Omnizip
|
|
|
97
97
|
|
|
98
98
|
# Build decoder options, merging with instance variables as fallbacks
|
|
99
99
|
decoder_opts = build_decoder_options(options)
|
|
100
|
-
|
|
100
|
+
if @lzma2_mode && !decoder_opts.key?(:lzma2_mode)
|
|
101
|
+
decoder_opts[:lzma2_mode] =
|
|
102
|
+
@lzma2_mode
|
|
103
|
+
end
|
|
101
104
|
decoder_opts[:lc] = @lc if @lc && !decoder_opts.key?(:lc)
|
|
102
105
|
decoder_opts[:lp] = @lp if @lp && !decoder_opts.key?(:lp)
|
|
103
106
|
decoder_opts[:pb] = @pb if @pb && !decoder_opts.key?(:pb)
|
|
104
|
-
|
|
105
|
-
|
|
107
|
+
if @dict_size && !decoder_opts.key?(:dict_size)
|
|
108
|
+
decoder_opts[:dict_size] =
|
|
109
|
+
@dict_size
|
|
110
|
+
end
|
|
111
|
+
if @uncompressed_size && !decoder_opts.key?(:uncompressed_size)
|
|
112
|
+
decoder_opts[:uncompressed_size] =
|
|
113
|
+
@uncompressed_size
|
|
114
|
+
end
|
|
106
115
|
decoder_opts[:uncompressed_size] ||= options[:size] if options.respond_to?(:key?) && options.key?(:size)
|
|
107
116
|
|
|
108
117
|
decoder = Decoder.new(input_stream, decoder_opts)
|
|
@@ -167,13 +176,19 @@ module Omnizip
|
|
|
167
176
|
|
|
168
177
|
# Handle Hash-like options - pass through all decoder-relevant options
|
|
169
178
|
if options.respond_to?(:key?)
|
|
170
|
-
|
|
179
|
+
if options.key?(:sdk_compatible)
|
|
180
|
+
opts[:sdk_compatible] =
|
|
181
|
+
options[:sdk_compatible]
|
|
182
|
+
end
|
|
171
183
|
opts[:lzma2_mode] = options[:lzma2_mode] if options.key?(:lzma2_mode)
|
|
172
184
|
opts[:lc] = options[:lc] if options.key?(:lc)
|
|
173
185
|
opts[:lp] = options[:lp] if options.key?(:lp)
|
|
174
186
|
opts[:pb] = options[:pb] if options.key?(:pb)
|
|
175
187
|
opts[:dict_size] = options[:dict_size] if options.key?(:dict_size)
|
|
176
|
-
|
|
188
|
+
if options.key?(:uncompressed_size)
|
|
189
|
+
opts[:uncompressed_size] =
|
|
190
|
+
options[:uncompressed_size]
|
|
191
|
+
end
|
|
177
192
|
opts[:size] = options[:size] if options.key?(:size)
|
|
178
193
|
end
|
|
179
194
|
|
|
@@ -74,46 +74,50 @@ module Omnizip
|
|
|
74
74
|
# Decode a single symbol using the model
|
|
75
75
|
#
|
|
76
76
|
# Uses the model and range decoder to extract the
|
|
77
|
-
# original symbol value.
|
|
77
|
+
# original symbol value using proper range decoding.
|
|
78
78
|
#
|
|
79
79
|
# @return [Integer, nil] Decoded byte or nil if end
|
|
80
80
|
def decode_symbol
|
|
81
|
-
#
|
|
82
|
-
|
|
81
|
+
# Get context for decoding
|
|
82
|
+
context = @model.root_context
|
|
83
|
+
total_freq = context.total_freq
|
|
84
|
+
|
|
85
|
+
return nil if total_freq.zero?
|
|
86
|
+
|
|
87
|
+
# Decode cumulative frequency using proper range decoding
|
|
88
|
+
cum_freq_value = @range_decoder.decode_freq(total_freq)
|
|
83
89
|
|
|
84
|
-
# Find symbol from
|
|
85
|
-
symbol =
|
|
90
|
+
# Find symbol from cumulative frequency value
|
|
91
|
+
symbol, cum_freq, freq = find_symbol_from_cum_freq(context,
|
|
92
|
+
cum_freq_value)
|
|
86
93
|
return nil if symbol.nil?
|
|
87
94
|
|
|
95
|
+
# Normalize the range decoder state
|
|
96
|
+
@range_decoder.normalize_freq(cum_freq, freq, total_freq)
|
|
97
|
+
|
|
88
98
|
# Update model to stay in sync with encoder
|
|
89
99
|
@model.update(symbol)
|
|
90
100
|
|
|
91
101
|
symbol
|
|
92
102
|
end
|
|
93
103
|
|
|
94
|
-
# Find symbol from
|
|
104
|
+
# Find symbol from cumulative frequency value
|
|
95
105
|
#
|
|
96
|
-
#
|
|
97
|
-
# the
|
|
106
|
+
# Maps the decoded cumulative frequency back to a symbol
|
|
107
|
+
# using the context's probability distribution.
|
|
98
108
|
#
|
|
99
|
-
# @param
|
|
100
|
-
# @
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
# properly decode using context probabilities
|
|
104
|
-
context = @model.root_context
|
|
105
|
-
|
|
106
|
-
# Find symbol whose cumulative range contains value
|
|
107
|
-
scale = 0x10000
|
|
109
|
+
# @param context [Context] The current context
|
|
110
|
+
# @param cum_freq_value [Integer] Decoded cumulative frequency
|
|
111
|
+
# @return [Array<Integer, Integer, Integer>] symbol, cum_freq, freq
|
|
112
|
+
def find_symbol_from_cum_freq(context, cum_freq_value)
|
|
108
113
|
cum_freq = 0
|
|
109
114
|
|
|
110
115
|
context.states.keys.sort.each do |symbol|
|
|
111
116
|
state = context.states[symbol]
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
sym_high = (next_cum * scale) / context.total_freq
|
|
117
|
+
freq = state.freq
|
|
118
|
+
next_cum = cum_freq + freq
|
|
115
119
|
|
|
116
|
-
return symbol
|
|
120
|
+
return [symbol, cum_freq, freq] if cum_freq_value < next_cum
|
|
117
121
|
|
|
118
122
|
cum_freq = next_cum
|
|
119
123
|
end
|