omnizip 0.3.2 → 0.3.4

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (86)
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +243 -368
  3. data/README.adoc +101 -5
  4. data/docs/guides/archive-formats/index.adoc +31 -1
  5. data/docs/guides/archive-formats/ole-format.adoc +316 -0
  6. data/docs/guides/archive-formats/rpm-format.adoc +249 -0
  7. data/docs/index.adoc +12 -2
  8. data/lib/omnizip/algorithms/lzma/distance_coder.rb +29 -18
  9. data/lib/omnizip/algorithms/lzma/encoder.rb +2 -1
  10. data/lib/omnizip/algorithms/lzma/length_coder.rb +6 -3
  11. data/lib/omnizip/algorithms/lzma/literal_decoder.rb +2 -1
  12. data/lib/omnizip/algorithms/lzma/lzip_decoder.rb +40 -13
  13. data/lib/omnizip/algorithms/lzma/range_decoder.rb +36 -2
  14. data/lib/omnizip/algorithms/lzma/range_encoder.rb +19 -0
  15. data/lib/omnizip/algorithms/lzma/xz_encoder_fast.rb +2 -1
  16. data/lib/omnizip/algorithms/lzma/xz_utils_decoder.rb +148 -112
  17. data/lib/omnizip/algorithms/lzma.rb +20 -5
  18. data/lib/omnizip/algorithms/ppmd7/decoder.rb +25 -21
  19. data/lib/omnizip/algorithms/ppmd7/encoder.rb +4 -11
  20. data/lib/omnizip/algorithms/sevenzip_lzma2.rb +2 -1
  21. data/lib/omnizip/algorithms/xz_lzma2.rb +2 -1
  22. data/lib/omnizip/algorithms/zstandard/constants.rb +125 -9
  23. data/lib/omnizip/algorithms/zstandard/decoder.rb +202 -17
  24. data/lib/omnizip/algorithms/zstandard/encoder.rb +197 -17
  25. data/lib/omnizip/algorithms/zstandard/frame/block.rb +128 -0
  26. data/lib/omnizip/algorithms/zstandard/frame/header.rb +224 -0
  27. data/lib/omnizip/algorithms/zstandard/fse/bitstream.rb +186 -0
  28. data/lib/omnizip/algorithms/zstandard/fse/encoder.rb +325 -0
  29. data/lib/omnizip/algorithms/zstandard/fse/table.rb +269 -0
  30. data/lib/omnizip/algorithms/zstandard/huffman.rb +272 -0
  31. data/lib/omnizip/algorithms/zstandard/huffman_encoder.rb +339 -0
  32. data/lib/omnizip/algorithms/zstandard/literals.rb +178 -0
  33. data/lib/omnizip/algorithms/zstandard/literals_encoder.rb +251 -0
  34. data/lib/omnizip/algorithms/zstandard/sequences.rb +346 -0
  35. data/lib/omnizip/buffer/memory_extractor.rb +3 -3
  36. data/lib/omnizip/buffer.rb +2 -2
  37. data/lib/omnizip/filters/delta.rb +2 -1
  38. data/lib/omnizip/filters/registry.rb +6 -6
  39. data/lib/omnizip/formats/cpio/bounded_io.rb +66 -0
  40. data/lib/omnizip/formats/lzip.rb +2 -1
  41. data/lib/omnizip/formats/lzma_alone.rb +2 -1
  42. data/lib/omnizip/formats/ole/allocation_table.rb +244 -0
  43. data/lib/omnizip/formats/ole/constants.rb +61 -0
  44. data/lib/omnizip/formats/ole/dirent.rb +380 -0
  45. data/lib/omnizip/formats/ole/header.rb +198 -0
  46. data/lib/omnizip/formats/ole/ranges_io.rb +264 -0
  47. data/lib/omnizip/formats/ole/storage.rb +305 -0
  48. data/lib/omnizip/formats/ole/types/variant.rb +328 -0
  49. data/lib/omnizip/formats/ole.rb +145 -0
  50. data/lib/omnizip/formats/rar/compression/ppmd/decoder.rb +92 -49
  51. data/lib/omnizip/formats/rar/compression/ppmd/encoder.rb +13 -20
  52. data/lib/omnizip/formats/rar/rar5/compression/lzss.rb +6 -2
  53. data/lib/omnizip/formats/rar3/reader.rb +6 -2
  54. data/lib/omnizip/formats/rar5/reader.rb +4 -1
  55. data/lib/omnizip/formats/rpm/constants.rb +58 -0
  56. data/lib/omnizip/formats/rpm/entry.rb +102 -0
  57. data/lib/omnizip/formats/rpm/header.rb +113 -0
  58. data/lib/omnizip/formats/rpm/lead.rb +122 -0
  59. data/lib/omnizip/formats/rpm/tag.rb +230 -0
  60. data/lib/omnizip/formats/rpm.rb +434 -0
  61. data/lib/omnizip/formats/seven_zip/bcj2_stream_decompressor.rb +239 -0
  62. data/lib/omnizip/formats/seven_zip/coder_chain.rb +32 -8
  63. data/lib/omnizip/formats/seven_zip/constants.rb +1 -1
  64. data/lib/omnizip/formats/seven_zip/reader.rb +84 -8
  65. data/lib/omnizip/formats/seven_zip/stream_compressor.rb +2 -1
  66. data/lib/omnizip/formats/seven_zip/stream_decompressor.rb +6 -0
  67. data/lib/omnizip/formats/seven_zip/writer.rb +21 -9
  68. data/lib/omnizip/formats/seven_zip.rb +10 -0
  69. data/lib/omnizip/formats/xar/entry.rb +18 -5
  70. data/lib/omnizip/formats/xar/header.rb +34 -6
  71. data/lib/omnizip/formats/xar/reader.rb +43 -10
  72. data/lib/omnizip/formats/xar/toc.rb +34 -21
  73. data/lib/omnizip/formats/xar/writer.rb +15 -5
  74. data/lib/omnizip/formats/xz_impl/block_decoder.rb +45 -33
  75. data/lib/omnizip/formats/xz_impl/block_encoder.rb +2 -1
  76. data/lib/omnizip/formats/xz_impl/index_decoder.rb +3 -1
  77. data/lib/omnizip/formats/xz_impl/stream_header_parser.rb +2 -1
  78. data/lib/omnizip/formats/zip/end_of_central_directory.rb +4 -3
  79. data/lib/omnizip/implementations/seven_zip/lzma/decoder.rb +14 -6
  80. data/lib/omnizip/implementations/seven_zip/lzma/encoder.rb +2 -1
  81. data/lib/omnizip/implementations/seven_zip/lzma2/encoder.rb +28 -13
  82. data/lib/omnizip/implementations/xz_utils/lzma2/encoder.rb +13 -6
  83. data/lib/omnizip/pipe/stream_compressor.rb +1 -1
  84. data/lib/omnizip/version.rb +1 -1
  85. data/readme-docs/compression-algorithms.adoc +6 -2
  86. metadata +30 -2
@@ -109,6 +109,25 @@ module Omnizip
109
109
  end
110
110
  end
111
111
 
112
+ # Encode a symbol using cumulative frequency range
113
+ #
114
+ # This is used by PPMd for encoding symbols based on their
115
+ # frequency distribution in the current context.
116
+ #
117
+ # @param cum_freq [Integer] Cumulative frequency up to this symbol
118
+ # @param freq [Integer] Frequency of this symbol
119
+ # @param total_freq [Integer] Total frequency of all symbols in context
120
+ # @return [void]
121
+ def encode_freq(cum_freq, freq, total_freq)
122
+ normalize
123
+ range_freq = @range / total_freq
124
+ low_bound = range_freq * cum_freq
125
+ high_bound = range_freq * (cum_freq + freq)
126
+
127
+ @low = (@low + low_bound) & 0xFFFFFFFFFFFFFFFF
128
+ @range = (high_bound - low_bound) & 0xFFFFFFFF
129
+ end
130
+
112
131
  # Flush remaining bytes to output stream
113
132
  #
114
133
  # Ported from XZ Utils rc_flush().
@@ -487,7 +487,8 @@ module Omnizip
487
487
  subcoder_index = offset + match_bit + (symbol >> 8)
488
488
  bit = (symbol >> 7) & 1
489
489
 
490
- @encoder.queue_bit(@models.literal[literal_base + subcoder_index], bit)
490
+ @encoder.queue_bit(@models.literal[literal_base + subcoder_index],
491
+ bit)
491
492
 
492
493
  symbol <<= 1
493
494
  offset &= ~(match_byte ^ symbol)
@@ -167,12 +167,15 @@ module Omnizip
167
167
  # @param preserve_dict [Boolean] Whether to preserve dictionary from previous decode
168
168
  # @param check_rc_finished [Boolean] Whether to check if range decoder is finished
169
169
  # @return [String, Integer] Decompressed data or bytes written
170
- def decode_stream(output = nil, preserve_dict: false, check_rc_finished: true)
170
+ def decode_stream(output = nil, preserve_dict: false,
171
+ check_rc_finished: true)
171
172
  @decode_stream_call_count ||= 0
172
173
  @decode_stream_call_count += 1
173
174
  call_num = @decode_stream_call_count
174
175
 
175
- puts "DEBUG decode_stream START (call ##{call_num}): @pos=#{@pos.inspect}, @dict_full=#{@dict_full.inspect}, preserve_dict=#{preserve_dict}, @uncompressed_size=#{@uncompressed_size.inspect}" if ENV.fetch("LZMA_DEBUG", nil) && @dict_full && @dict_full >= 200 && @dict_full <= 230
176
+ puts "DEBUG decode_stream START (call ##{call_num}): @pos=#{@pos.inspect}, @dict_full=#{@dict_full.inspect}, preserve_dict=#{preserve_dict}, @uncompressed_size=#{@uncompressed_size.inspect}" if ENV.fetch(
177
+ "LZMA_DEBUG", nil
178
+ ) && @dict_full && @dict_full >= 200 && @dict_full <= 230
176
179
  if ENV["LZMA_DEBUG_DECODE_STREAM"]
177
180
  warn "DEBUG decode_stream[#{@decoder_id}] START: preserve_dict=#{preserve_dict}, @pos=#{@pos.inspect}, @dict_full=#{@dict_full.inspect}, @dict_buf.object_id=#{@dict_buf&.object_id || 'nil'}, @dict_buf.size=#{@dict_buf&.size || 'nil'}"
178
181
  end
@@ -183,10 +186,10 @@ module Omnizip
183
186
  # See: /Users/mulgogi/src/external/xz/src/liblzma/lzma/lzma2_decoder.c:140-141
184
187
  if ENV["LZMA_DEBUG"]
185
188
  warn "DEBUG: decode_stream - reusing range decoder @input.pos=#{begin
186
- @input.pos
187
- rescue StandardError
188
- 'N/A'
189
- end}, @range_decoder.class=#{@range_decoder.class}"
189
+ @input.pos
190
+ rescue StandardError
191
+ 'N/A'
192
+ end}, @range_decoder.class=#{@range_decoder.class}"
190
193
  end
191
194
 
192
195
  # Create range decoder if it doesn't exist (first chunk)
@@ -200,7 +203,7 @@ module Omnizip
200
203
 
201
204
  # Special case: empty input (uncompressed_size == 0)
202
205
  # Return immediately without trying to decode anything
203
- if @uncompressed_size != 0xFFFFFFFFFFFFFFFF && @uncompressed_size == 0
206
+ if @uncompressed_size != 0xFFFFFFFFFFFFFFFF && @uncompressed_size.zero?
204
207
  if ENV["LZMA_DEBUG"]
205
208
  warn "DEBUG: decode_stream - empty input (uncompressed_size=0), returning immediately"
206
209
  end
@@ -216,7 +219,9 @@ module Omnizip
216
219
  @chunk_bytes_decoded = 0
217
220
 
218
221
  # DEBUG: Show chunk_bytes_decoded initialization
219
- if @dict_full && @dict_full >= 220 && @dict_full <= 240 && ENV.fetch("LZMA_DEBUG", nil)
222
+ if @dict_full && @dict_full >= 220 && @dict_full <= 240 && ENV.fetch(
223
+ "LZMA_DEBUG", nil
224
+ )
220
225
  puts "DEBUG: chunk_bytes_decoded reset to 0 for chunk (call_num=#{call_num}, dict_full=#{@dict_full})"
221
226
  end
222
227
 
@@ -274,9 +279,13 @@ module Omnizip
274
279
  # properly reflected in start_pos, so we only return NEW bytes.
275
280
  # For LZMA2, we need to return only the NEW bytes, not all bytes from LZ_DICT_INIT_POS
276
281
  start_pos = @pos || LZ_DICT_INIT_POS
277
- puts "DEBUG: start_pos=#{start_pos}, @pos=#{@pos.inspect}, @dict_full=#{@dict_full.inspect}, preserve_dict=#{preserve_dict}, @decoder_id=#{@decoder_id}" if ENV.fetch("LZMA_DEBUG", nil) && @dict_full && @dict_full >= 200 && @dict_full <= 230
282
+ puts "DEBUG: start_pos=#{start_pos}, @pos=#{@pos.inspect}, @dict_full=#{@dict_full.inspect}, preserve_dict=#{preserve_dict}, @decoder_id=#{@decoder_id}" if ENV.fetch(
283
+ "LZMA_DEBUG", nil
284
+ ) && @dict_full && @dict_full >= 200 && @dict_full <= 230
278
285
  # Also show for chunk #1 start (dict_full around 227)
279
- puts "DEBUG: start_pos=#{start_pos}, @pos=#{@pos.inspect}, @dict_full=#{@dict_full.inspect}, @uncompressed_size=#{@uncompressed_size}, @decoder_id=#{@decoder_id}" if ENV.fetch("LZMA_DEBUG", nil) && @dict_full && @dict_full >= 225 && @dict_full <= 230
286
+ puts "DEBUG: start_pos=#{start_pos}, @pos=#{@pos.inspect}, @dict_full=#{@dict_full.inspect}, @uncompressed_size=#{@uncompressed_size}, @decoder_id=#{@decoder_id}" if ENV.fetch(
287
+ "LZMA_DEBUG", nil
288
+ ) && @dict_full && @dict_full >= 225 && @dict_full <= 230
280
289
 
281
290
  # Initialize rep distances (XZ Utils initializes to 0)
282
291
  # See: /Users/mulgogi/src/external/xz/src/liblzma/lzma/lzma_decoder.c:1054-1055
@@ -285,7 +294,9 @@ module Omnizip
285
294
  #
286
295
  # IMPORTANT: Initialize rep distances if they're nil OR not preserving dict
287
296
  if @rep0.nil? || @rep1.nil? || @rep2.nil? || @rep3.nil? || !preserve_dict
288
- puts "DEBUG: Resetting rep distances to 0 (rep0.nil?=#{@rep0.nil?}, preserve_dict=#{preserve_dict})" if ENV.fetch("LZMA_DEBUG", nil) && @dict_full && @dict_full >= 200 && @dict_full <= 230
297
+ puts "DEBUG: Resetting rep distances to 0 (rep0.nil?=#{@rep0.nil?}, preserve_dict=#{preserve_dict})" if ENV.fetch(
298
+ "LZMA_DEBUG", nil
299
+ ) && @dict_full && @dict_full >= 200 && @dict_full <= 230
289
300
  @rep0 = 0
290
301
  @rep1 = 0
291
302
  @rep2 = 0
@@ -301,17 +312,19 @@ module Omnizip
301
312
  # XZ Utils uses dict->pos (current position) + uncompressed_size
302
313
  # We use start_pos (current position) + @uncompressed_size
303
314
  limit = if @uncompressed_size == 0xFFFFFFFFFFFFFFFF
304
- nil # No limit for unknown size
305
- else
306
- start_pos + @uncompressed_size
307
- end
315
+ nil # No limit for unknown size
316
+ else
317
+ start_pos + @uncompressed_size
318
+ end
308
319
 
309
320
  # DEBUG: Show limit calculation for chunk #1
310
- if ENV.fetch("LZMA_DEBUG_LIMIT", nil) && @dict_full && @dict_full >= 220 && @dict_full <= 240
321
+ if ENV.fetch("LZMA_DEBUG_LIMIT",
322
+ nil) && @dict_full && @dict_full >= 220 && @dict_full <= 240
311
323
  puts "DEBUG LIMIT CALCULATION: start_pos=#{start_pos}, @uncompressed_size=#{@uncompressed_size}, limit=#{limit.inspect}"
312
324
  end
313
325
  # DEBUG: Also show for dict_full around 293 (where the error occurs)
314
- if ENV.fetch("LZMA_DEBUG_LIMIT", nil) && @dict_full && @dict_full >= 290 && @dict_full <= 300
326
+ if ENV.fetch("LZMA_DEBUG_LIMIT",
327
+ nil) && @dict_full && @dict_full >= 290 && @dict_full <= 300
315
328
  puts "DEBUG LIMIT CALCULATION at dict_full=#{@dict_full}: start_pos=#{start_pos}, @uncompressed_size=#{@uncompressed_size}, limit=#{limit.inspect}, @decoder_id=#{@decoder_id}"
316
329
  end
317
330
 
@@ -319,17 +332,18 @@ module Omnizip
319
332
  loop do
320
333
  iteration += 1
321
334
  # DEBUG: Show every iteration after position 200
322
- if ENV.fetch("LZMA_DEBUG_ITER", nil) && @dict_full && @dict_full >= 200 && @dict_full <= 500
335
+ if ENV.fetch("LZMA_DEBUG_ITER",
336
+ nil) && @dict_full && @dict_full >= 200 && @dict_full <= 500
323
337
  puts "DEBUG ITERATION ##{iteration}: pos=#{@pos}, dict_full=#{@dict_full}, limit=#{limit.inspect}"
324
338
  end
325
339
  # Check if we've reached the expected size (if known)
326
340
  # XZ Utils: checks dict.pos < dict.limit
327
341
  if ENV["LZMA_DEBUG_LIMIT"]
328
342
  compare_result = begin
329
- limit && @pos >= limit
330
- rescue StandardError
331
- "ERROR"
332
- end
343
+ limit && @pos >= limit
344
+ rescue StandardError
345
+ "ERROR"
346
+ end
333
347
  XzUtilsDecoderDebug.debug_puts "DEBUG LIMIT: iter=#{iteration}, pos=#{@pos.inspect}, dict_full=#{@dict_full}, limit=#{limit.inspect}, pos >= limit: #{compare_result}"
334
348
  end
335
349
 
@@ -504,16 +518,16 @@ module Omnizip
504
518
  # If explicitly set to false, it allows EOPM even when uncompressed size is known
505
519
  # Reference: alone_decoder.c:127 (LZMA_LZMA1EXT_ALLOW_EOPM)
506
520
  should_check = if @allow_eopm == true
507
- # EOPM is explicitly allowed, skip the check
508
- false
509
- elsif @allow_eopm == false
510
- # LZMA2 mode: always check (EOPM is not allowed)
511
- true
512
- else
513
- # @allow_eopm is nil (not set, first chunk or legacy mode)
514
- # Use check_rc_finished parameter as default
515
- check_rc_finished
516
- end
521
+ # EOPM is explicitly allowed, skip the check
522
+ false
523
+ elsif @allow_eopm == false
524
+ # LZMA2 mode: always check (EOPM is not allowed)
525
+ true
526
+ else
527
+ # @allow_eopm is nil (not set, first chunk or legacy mode)
528
+ # Use check_rc_finished parameter as default
529
+ check_rc_finished
530
+ end
517
531
 
518
532
  if should_check
519
533
  # If EOPM is not allowed, range decoder MUST be finished
@@ -522,12 +536,10 @@ module Omnizip
522
536
  "LZMA stream finished with leftover compressed data (range_decoder.code=#{@range_decoder.code}, expected 0). This indicates corruption in the compressed stream or an invalid EOPM for LZMA2."
523
537
  end
524
538
  break
525
- else
539
+ elsif @range_decoder.code.zero?
526
540
  # EOPM is allowed (e.g., LZMA_Alone format)
527
541
  # If range decoder is finished, we're done
528
- if @range_decoder.code.zero?
529
- break
530
- end
542
+ break
531
543
  # Otherwise, continue decoding to find EOPM marker
532
544
  # XZ Utils sets eopm_is_valid = true and continues
533
545
  # Reference: lzma_decoder.c:704
@@ -535,12 +547,14 @@ module Omnizip
535
547
  end
536
548
 
537
549
  # DEBUG: Show when approaching limit for chunk #1
538
- if ENV.fetch("LZMA_DEBUG_LIMIT", nil) && limit && @pos >= limit - 10 && @pos < limit + 10
550
+ if ENV.fetch("LZMA_DEBUG_LIMIT",
551
+ nil) && limit && @pos >= limit - 10 && @pos < limit + 10
539
552
  puts "DEBUG NEAR LIMIT (call #{call_num}): pos=#{@pos}, limit=#{limit}, dict_full=#{@dict_full}, chunk_bytes_decoded=#{@chunk_bytes_decoded}, remaining=#{@uncompressed_size ? @uncompressed_size - @chunk_bytes_decoded : 'N/A'}"
540
553
  end
541
554
 
542
555
  # DEBUG: Show when we've passed the expected limit
543
- if ENV.fetch("LZMA_DEBUG_LIMIT", nil) && limit && @pos >= limit && @pos < limit + 10
556
+ if ENV.fetch("LZMA_DEBUG_LIMIT",
557
+ nil) && limit && @pos >= limit && @pos < limit + 10
544
558
  puts "DEBUG PASSED LIMIT: pos=#{@pos}, limit=#{limit}, dict_full=#{@dict_full}, delta=#{@pos - limit}"
545
559
  end
546
560
 
@@ -614,16 +628,18 @@ module Omnizip
614
628
  end
615
629
  valid_bytes = @dict_buf[start_pos...@pos]
616
630
  # DEBUG: Show return value calculation
617
- puts "DEBUG RETURN CALCULATION: call_num=#{call_num}, start_pos=#{start_pos}, @pos=#{@pos}, valid_bytes.size=#{@dict_buf[start_pos...@pos].size}, dict_full=#{@dict_full}, chunk_bytes_decoded=#{@chunk_bytes_decoded}" if ENV.fetch("LZMA_DEBUG", nil) && @dict_full && @dict_full >= 220 && @dict_full <= 240 && call_num == 2
631
+ puts "DEBUG RETURN CALCULATION: call_num=#{call_num}, start_pos=#{start_pos}, @pos=#{@pos}, valid_bytes.size=#{@dict_buf[start_pos...@pos].size}, dict_full=#{@dict_full}, chunk_bytes_decoded=#{@chunk_bytes_decoded}" if ENV.fetch(
632
+ "LZMA_DEBUG", nil
633
+ ) && @dict_full && @dict_full >= 220 && @dict_full <= 240 && call_num == 2
618
634
  puts "DEBUG RETURN CALCULATION: call_num=#{call_num}, start_pos=#{start_pos}, @pos=#{@pos}, valid_bytes.size=#{@dict_buf[start_pos...@pos].size}, dict_full=#{@dict_full}, chunk_bytes_decoded=#{@chunk_bytes_decoded}" if ENV["LZMA_DEBUG"] && call_num == 2
619
635
  # Filter out nil values (can happen during dictionary reset transitions)
620
636
  valid_bytes = valid_bytes.map { |b| b.nil? ? 0 : b }
621
637
  if ENV["DEBUG_DICT_BUF"]
622
638
  XzUtilsDecoderDebug.debug_puts "DEBUG: valid_bytes=#{begin
623
- valid_bytes.size
624
- rescue StandardError
625
- valid_bytes.inspect
626
- end}"
639
+ valid_bytes.size
640
+ rescue StandardError
641
+ valid_bytes.inspect
642
+ end}"
627
643
  end
628
644
  valid_data = valid_bytes.pack("C*")
629
645
  if output
@@ -863,24 +879,24 @@ module Omnizip
863
879
  if @range_decoder
864
880
  if ENV["LZMA_DEBUG"]
865
881
  input_pos = begin
866
- @input.pos
867
- rescue StandardError
868
- "N/A"
869
- end
882
+ @input.pos
883
+ rescue StandardError
884
+ "N/A"
885
+ end
870
886
  input_size = begin
871
- @input.size
872
- rescue StandardError
873
- "N/A"
874
- end
887
+ @input.size
888
+ rescue StandardError
889
+ "N/A"
890
+ end
875
891
  XzUtilsDecoderDebug.debug_puts "=== finish_state_reset: resetting range_decoder, input pos=#{input_pos}, size=#{input_size}"
876
892
  end
877
893
  @range_decoder.reset
878
894
  if ENV["LZMA_DEBUG"]
879
895
  input_pos_after = begin
880
- @input.pos
881
- rescue StandardError
882
- "N/A"
883
- end
896
+ @input.pos
897
+ rescue StandardError
898
+ "N/A"
899
+ end
884
900
  XzUtilsDecoderDebug.debug_puts "=== finish_state_reset: after reset, input pos=#{input_pos_after}, range_decoder.code=0x#{@range_decoder.code.to_s(16)}"
885
901
  end
886
902
  end
@@ -920,7 +936,8 @@ module Omnizip
920
936
  @input = new_input
921
937
 
922
938
  # DEBUG: Trace input stream contents
923
- if ENV.fetch("LZMA_DEBUG", nil) && @dict_full && @dict_full >= 220 && @dict_full <= 230
939
+ if ENV.fetch("LZMA_DEBUG",
940
+ nil) && @dict_full && @dict_full >= 220 && @dict_full <= 230
924
941
  puts "\n=== set_input at dict_full=#{@dict_full} ==="
925
942
  puts " new_input.size=#{new_input.size}"
926
943
  puts " new_input.pos=#{new_input.pos}"
@@ -934,7 +951,9 @@ module Omnizip
934
951
 
935
952
  first_bytes << byte
936
953
  end
937
- puts " First 10 bytes: #{first_bytes.map { |b| "0x#{b.to_s(16).upcase}" }.join(' ')}"
954
+ puts " First 10 bytes: #{first_bytes.map do |b|
955
+ "0x#{b.to_s(16).upcase}"
956
+ end.join(' ')}"
938
957
 
939
958
  new_input.rewind
940
959
  test_byte = new_input.getbyte
@@ -1268,22 +1287,22 @@ module Omnizip
1268
1287
  if @dict_full == 233
1269
1288
  XzUtilsDecoderDebug.debug_puts " DETAILED TRACE at dict_full=233 (pos=#{@pos}):"
1270
1289
  XzUtilsDecoderDebug.debug_puts " byte=0x#{byte.to_s(16)} ('#{begin
1271
- byte.chr
1272
- rescue StandardError
1273
- '?'
1274
- end}')"
1290
+ byte.chr
1291
+ rescue StandardError
1292
+ '?'
1293
+ end}')"
1275
1294
  XzUtilsDecoderDebug.debug_puts " state.value=#{@state.value}, lit_state=#{lit_state}"
1276
1295
  XzUtilsDecoderDebug.debug_puts " use_matched_literal?=#{@state.use_matched_literal?}"
1277
1296
  prev_byte_val = @dict_full.positive? ? @dict_buf[@pos - 1] : "N/A"
1278
1297
  XzUtilsDecoderDebug.debug_puts " prev_byte=#{prev_byte_val.inspect} (#{if prev_byte_val.is_a?(Integer)
1279
- "0x#{prev_byte_val.to_s(16)} ('#{begin
1280
- prev_byte_val.chr
1281
- rescue StandardError
1282
- '?'
1283
- end}')"
1284
- else
1285
- 'N/A'
1286
- end})"
1298
+ "0x#{prev_byte_val.to_s(16)} ('#{begin
1299
+ prev_byte_val.chr
1300
+ rescue StandardError
1301
+ '?'
1302
+ end}')"
1303
+ else
1304
+ 'N/A'
1305
+ end})"
1287
1306
  XzUtilsDecoderDebug.debug_puts " range_decoder.range=0x#{@range_decoder.range.to_s(16)}, range_decoder.code=0x#{@range_decoder.code.to_s(16)}"
1288
1307
  XzUtilsDecoderDebug.debug_puts " input.pos=#{@input.pos}, input.size=#{@input.size}"
1289
1308
  end
@@ -1335,14 +1354,17 @@ module Omnizip
1335
1354
  if ENV["TRACE_ARM64_BYTES"]
1336
1355
  @arm64_trace ||= []
1337
1356
  if @arm64_trace.size < 20
1338
- @arm64_trace << [@dict_full, @pos, byte.class, byte.is_a?(Integer) ? byte : byte.ord, @dict_buf[@pos]]
1357
+ @arm64_trace << [@dict_full, @pos, byte.class,
1358
+ byte.is_a?(Integer) ? byte : byte.ord, @dict_buf[@pos]]
1339
1359
  if @arm64_trace.size == 20
1340
1360
  # Dump the trace
1341
1361
  puts "\n=== ARM64 BYTE TRACE (first 20 bytes) ==="
1342
1362
  puts "Decoder ID: #{@decoder_id}"
1343
1363
  @arm64_trace.each_with_index do |entry, i|
1344
1364
  df, p, _, val, stored = entry
1345
- puts " [#{i + 1}] dict_full=#{df.to_s.rjust(6)}, pos=#{p.to_s.rjust(6)}, byte=#{val.to_s.rjust(3)} (0x#{val.to_s(16).upcase.rjust(2, '0')}) stored=#{stored.inspect}"
1365
+ puts " [#{i + 1}] dict_full=#{df.to_s.rjust(6)}, pos=#{p.to_s.rjust(6)}, byte=#{val.to_s.rjust(3)} (0x#{val.to_s(16).upcase.rjust(
1366
+ 2, '0'
1367
+ )}) stored=#{stored.inspect}"
1346
1368
  end
1347
1369
  puts "=========================================\n"
1348
1370
  $stderr.flush
@@ -1482,24 +1504,36 @@ module Omnizip
1482
1504
  # ((len) < DIST_STATES + MATCH_LEN_MIN ? (len) - MATCH_LEN_MIN : DIST_STATES - 1)
1483
1505
  # This gives: len=2→0, len=3→1, len=4→2, len=5→3, len=6+→3
1484
1506
  len_state = if length < NUM_LEN_TO_POS_STATES + MATCH_LEN_MIN
1485
- length - MATCH_LEN_MIN
1486
- else
1487
- NUM_LEN_TO_POS_STATES - 1
1488
- end
1507
+ length - MATCH_LEN_MIN
1508
+ else
1509
+ NUM_LEN_TO_POS_STATES - 1
1510
+ end
1489
1511
 
1490
1512
  # DEBUG: Show bytes being copied
1491
1513
  if old_dict_full.between?(210, 230) || ENV["LZMA_DEBUG_DISTANCE"]
1492
- XzUtilsDecoderDebug.debug_puts "\n=== decode_regular_match at dict_full=#{old_dict_full} ===" if old_dict_full.between?(210, 230)
1514
+ XzUtilsDecoderDebug.debug_puts "\n=== decode_regular_match at dict_full=#{old_dict_full} ===" if old_dict_full.between?(
1515
+ 210, 230
1516
+ )
1493
1517
  puts "[DISTANCE_DECODER] decode_regular_match at dict_full=#{old_dict_full}" if ENV["LZMA_DEBUG_DISTANCE"]
1494
- XzUtilsDecoderDebug.debug_puts " pos_state=#{pos_state}" if old_dict_full.between?(210, 230)
1518
+ XzUtilsDecoderDebug.debug_puts " pos_state=#{pos_state}" if old_dict_full.between?(
1519
+ 210, 230
1520
+ )
1495
1521
  puts "[DISTANCE_DECODER] pos_state=#{pos_state}" if ENV["LZMA_DEBUG_DISTANCE"]
1496
- XzUtilsDecoderDebug.debug_puts " state=#{old_state}" if old_dict_full.between?(210, 230)
1522
+ XzUtilsDecoderDebug.debug_puts " state=#{old_state}" if old_dict_full.between?(
1523
+ 210, 230
1524
+ )
1497
1525
  puts "[DISTANCE_DECODER] state=#{old_state}" if ENV["LZMA_DEBUG_DISTANCE"]
1498
- XzUtilsDecoderDebug.debug_puts " length_encoded=#{length_encoded} length=#{length}" if old_dict_full.between?(210, 230)
1526
+ XzUtilsDecoderDebug.debug_puts " length_encoded=#{length_encoded} length=#{length}" if old_dict_full.between?(
1527
+ 210, 230
1528
+ )
1499
1529
  puts "[DISTANCE_DECODER] length_encoded=#{length_encoded} length=#{length}" if ENV["LZMA_DEBUG_DISTANCE"]
1500
- XzUtilsDecoderDebug.debug_puts " len_state=#{len_state}" if old_dict_full.between?(210, 230)
1530
+ XzUtilsDecoderDebug.debug_puts " len_state=#{len_state}" if old_dict_full.between?(
1531
+ 210, 230
1532
+ )
1501
1533
  puts "[DISTANCE_DECODER] len_state=#{len_state}" if ENV["LZMA_DEBUG_DISTANCE"]
1502
- XzUtilsDecoderDebug.debug_puts " rep0_before=#{old_rep0}" if old_dict_full.between?(210, 230)
1534
+ XzUtilsDecoderDebug.debug_puts " rep0_before=#{old_rep0}" if old_dict_full.between?(
1535
+ 210, 230
1536
+ )
1503
1537
  puts "[DISTANCE_DECODER] rep0_before=#{old_rep0}" if ENV["LZMA_DEBUG_DISTANCE"]
1504
1538
  end
1505
1539
 
@@ -1513,7 +1547,9 @@ module Omnizip
1513
1547
  rep0 = @distance_coder.decode(@range_decoder, len_state)
1514
1548
 
1515
1549
  # DEBUG
1516
- if (ENV.fetch("LZMA_DEBUG", nil) && old_dict_full.between?(210, 230)) || old_dict_full == 293
1550
+ if (ENV.fetch("LZMA_DEBUG",
1551
+ nil) && old_dict_full.between?(210,
1552
+ 230)) || old_dict_full == 293
1517
1553
  puts " rep0_decoded=#{rep0} (distance = #{rep0})"
1518
1554
  puts " buffer_back calculation: back=#{@dict_full - rep0 - 1}"
1519
1555
  end
@@ -1609,14 +1645,14 @@ module Omnizip
1609
1645
  XzUtilsDecoderDebug.debug_puts " buffer_back=#{buffer_back}, back=#{back}"
1610
1646
  bytes_at_back = @dict_buf[buffer_back, 3]
1611
1647
  bytes_hex = if bytes_at_back.is_a?(String)
1612
- bytes_at_back.bytes.map do |b|
1613
- "%02x" % b
1614
- end.join(" ")
1615
- else
1616
- bytes_at_back.map do |b|
1617
- "%02x" % b
1618
- end.join(" ")
1619
- end
1648
+ bytes_at_back.bytes.map do |b|
1649
+ "%02x" % b
1650
+ end.join(" ")
1651
+ else
1652
+ bytes_at_back.map do |b|
1653
+ "%02x" % b
1654
+ end.join(" ")
1655
+ end
1620
1656
  XzUtilsDecoderDebug.debug_puts " First 3 bytes at buffer_back: #{bytes_hex} (#{bytes_at_back.inspect})"
1621
1657
  end
1622
1658
 
@@ -1916,14 +1952,14 @@ module Omnizip
1916
1952
  XzUtilsDecoderDebug.debug_puts " back=#{old_back}, wrapped_back=#{back}, buffer_back=#{buffer_back}"
1917
1953
  bytes_at_back = @dict_buf[buffer_back, 3]
1918
1954
  bytes_hex = if bytes_at_back.is_a?(String)
1919
- bytes_at_back.bytes.map do |b|
1920
- "%02x" % b
1921
- end.join(" ")
1922
- else
1923
- [bytes_at_back].flatten.map do |b|
1924
- "%02x" % b
1925
- end.join(" ")
1926
- end
1955
+ bytes_at_back.bytes.map do |b|
1956
+ "%02x" % b
1957
+ end.join(" ")
1958
+ else
1959
+ [bytes_at_back].flatten.map do |b|
1960
+ "%02x" % b
1961
+ end.join(" ")
1962
+ end
1927
1963
  XzUtilsDecoderDebug.debug_puts " First 3 bytes at buffer_back: #{bytes_hex} (#{bytes_at_back.inspect})"
1928
1964
  end
1929
1965
 
@@ -1933,10 +1969,10 @@ module Omnizip
1933
1969
  source_val = @dict_buf[@pos - 1]
1934
1970
  puts " Rep match copy at dict_full=#{@dict_full}: length=#{length}, distance=#{distance}, @pos=#{@pos} (will write to #{@pos}...#{@pos + length - 1})"
1935
1971
  puts " Reading from @pos-1=#{@pos - 1}, source byte = #{source_val} (0x#{source_val.to_s(16)} '#{begin
1936
- source_val.chr
1937
- rescue StandardError
1938
- '?'
1939
- end}')"
1972
+ source_val.chr
1973
+ rescue StandardError
1974
+ '?'
1975
+ end}')"
1940
1976
  puts " Before copy: @dict_buf[#{@pos}...#{@pos + length - 1}] = #{@dict_buf[@pos,
1941
1977
  length].inspect}"
1942
1978
  end
@@ -1950,17 +1986,17 @@ module Omnizip
1950
1986
  10].inspect}"
1951
1987
  # DEBUG: Check if buffer_back+1 has the correct byte
1952
1988
  puts " dict_buf[buffer_back+1=#{buffer_back + 1}] = #{@dict_buf[buffer_back + 1].inspect} ('#{begin
1953
- @dict_buf[buffer_back + 1].chr
1954
- rescue StandardError
1955
- '?'
1956
- end}')"
1989
+ @dict_buf[buffer_back + 1].chr
1990
+ rescue StandardError
1991
+ '?'
1992
+ end}')"
1957
1993
  prev_5 = if buffer_back > 4
1958
- @dict_buf[(buffer_back - 5)..(buffer_back - 1)].map do |b|
1959
- "0x#{b.to_s(16).upcase} (#{b.chr})"
1960
- end.join(", ")
1961
- else
1962
- "N/A"
1963
- end
1994
+ @dict_buf[(buffer_back - 5)..(buffer_back - 1)].map do |b|
1995
+ "0x#{b.to_s(16).upcase} (#{b.chr})"
1996
+ end.join(", ")
1997
+ else
1998
+ "N/A"
1999
+ end
1964
2000
  puts " Previous 5 bytes: [#{prev_5}]"
1965
2001
  puts " Current dict_full=#{@dict_full}, @pos=#{@pos}"
1966
2002
  end
@@ -97,12 +97,21 @@ module Omnizip
97
97
 
98
98
  # Build decoder options, merging with instance variables as fallbacks
99
99
  decoder_opts = build_decoder_options(options)
100
- decoder_opts[:lzma2_mode] = @lzma2_mode if @lzma2_mode && !decoder_opts.key?(:lzma2_mode)
100
+ if @lzma2_mode && !decoder_opts.key?(:lzma2_mode)
101
+ decoder_opts[:lzma2_mode] =
102
+ @lzma2_mode
103
+ end
101
104
  decoder_opts[:lc] = @lc if @lc && !decoder_opts.key?(:lc)
102
105
  decoder_opts[:lp] = @lp if @lp && !decoder_opts.key?(:lp)
103
106
  decoder_opts[:pb] = @pb if @pb && !decoder_opts.key?(:pb)
104
- decoder_opts[:dict_size] = @dict_size if @dict_size && !decoder_opts.key?(:dict_size)
105
- decoder_opts[:uncompressed_size] = @uncompressed_size if @uncompressed_size && !decoder_opts.key?(:uncompressed_size)
107
+ if @dict_size && !decoder_opts.key?(:dict_size)
108
+ decoder_opts[:dict_size] =
109
+ @dict_size
110
+ end
111
+ if @uncompressed_size && !decoder_opts.key?(:uncompressed_size)
112
+ decoder_opts[:uncompressed_size] =
113
+ @uncompressed_size
114
+ end
106
115
  decoder_opts[:uncompressed_size] ||= options[:size] if options.respond_to?(:key?) && options.key?(:size)
107
116
 
108
117
  decoder = Decoder.new(input_stream, decoder_opts)
@@ -167,13 +176,19 @@ module Omnizip
167
176
 
168
177
  # Handle Hash-like options - pass through all decoder-relevant options
169
178
  if options.respond_to?(:key?)
170
- opts[:sdk_compatible] = options[:sdk_compatible] if options.key?(:sdk_compatible)
179
+ if options.key?(:sdk_compatible)
180
+ opts[:sdk_compatible] =
181
+ options[:sdk_compatible]
182
+ end
171
183
  opts[:lzma2_mode] = options[:lzma2_mode] if options.key?(:lzma2_mode)
172
184
  opts[:lc] = options[:lc] if options.key?(:lc)
173
185
  opts[:lp] = options[:lp] if options.key?(:lp)
174
186
  opts[:pb] = options[:pb] if options.key?(:pb)
175
187
  opts[:dict_size] = options[:dict_size] if options.key?(:dict_size)
176
- opts[:uncompressed_size] = options[:uncompressed_size] if options.key?(:uncompressed_size)
188
+ if options.key?(:uncompressed_size)
189
+ opts[:uncompressed_size] =
190
+ options[:uncompressed_size]
191
+ end
177
192
  opts[:size] = options[:size] if options.key?(:size)
178
193
  end
179
194
 
@@ -74,46 +74,50 @@ module Omnizip
74
74
  # Decode a single symbol using the model
75
75
  #
76
76
  # Uses the model and range decoder to extract the
77
- # original symbol value.
77
+ # original symbol value using proper range decoding.
78
78
  #
79
79
  # @return [Integer, nil] Decoded byte or nil if end
80
80
  def decode_symbol
81
- # Decode range value
82
- value = @range_decoder.decode_direct_bits(16)
81
+ # Get context for decoding
82
+ context = @model.root_context
83
+ total_freq = context.total_freq
84
+
85
+ return nil if total_freq.zero?
86
+
87
+ # Decode cumulative frequency using proper range decoding
88
+ cum_freq_value = @range_decoder.decode_freq(total_freq)
83
89
 
84
- # Find symbol from range
85
- symbol = find_symbol_from_range(value)
90
+ # Find symbol from cumulative frequency value
91
+ symbol, cum_freq, freq = find_symbol_from_cum_freq(context,
92
+ cum_freq_value)
86
93
  return nil if symbol.nil?
87
94
 
95
+ # Normalize the range decoder state
96
+ @range_decoder.normalize_freq(cum_freq, freq, total_freq)
97
+
88
98
  # Update model to stay in sync with encoder
89
99
  @model.update(symbol)
90
100
 
91
101
  symbol
92
102
  end
93
103
 
94
- # Find symbol from decoded range value
104
+ # Find symbol from cumulative frequency value
95
105
  #
96
- # Converts the range value back to a symbol using
97
- # the current context's probability distribution.
106
+ # Maps the decoded cumulative frequency back to a symbol
107
+ # using the context's probability distribution.
98
108
  #
99
- # @param value [Integer] Decoded range value
100
- # @return [Integer, nil] The symbol
101
- def find_symbol_from_range(value)
102
- # This is simplified - real implementation would
103
- # properly decode using context probabilities
104
- context = @model.root_context
105
-
106
- # Find symbol whose cumulative range contains value
107
- scale = 0x10000
109
+ # @param context [Context] The current context
110
+ # @param cum_freq_value [Integer] Decoded cumulative frequency
111
+ # @return [Array<Integer, Integer, Integer>] symbol, cum_freq, freq
112
+ def find_symbol_from_cum_freq(context, cum_freq_value)
108
113
  cum_freq = 0
109
114
 
110
115
  context.states.keys.sort.each do |symbol|
111
116
  state = context.states[symbol]
112
- next_cum = cum_freq + state.freq
113
- sym_low = (cum_freq * scale) / context.total_freq
114
- sym_high = (next_cum * scale) / context.total_freq
117
+ freq = state.freq
118
+ next_cum = cum_freq + freq
115
119
 
116
- return symbol if value >= sym_low && value < sym_high
120
+ return [symbol, cum_freq, freq] if cum_freq_value < next_cum
117
121
 
118
122
  cum_freq = next_cum
119
123
  end