omnizip 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +32 -0
- data/.rubocop_todo.yml +754 -0
- data/COPYING +502 -0
- data/Gemfile +17 -0
- data/LICENSE +12 -0
- data/README.adoc +1045 -0
- data/Rakefile +12 -0
- data/benchmark/README.md +260 -0
- data/benchmark/benchmark_suite.rb +125 -0
- data/benchmark/compression_bench.rb +181 -0
- data/benchmark/filter_bench.rb +180 -0
- data/benchmark/models/benchmark_result.rb +59 -0
- data/benchmark/models/comparison_result.rb +69 -0
- data/benchmark/profile_suite.rb +167 -0
- data/benchmark/reporter.rb +150 -0
- data/benchmark/run_benchmarks.rb +66 -0
- data/benchmark/test_data.rb +137 -0
- data/config/formats/rar3_spec.yml +91 -0
- data/config/formats/rar5_spec.yml +102 -0
- data/docs/.github/workflows/docs.yml +142 -0
- data/docs/.gitignore +21 -0
- data/docs/.lychee.toml +67 -0
- data/docs/Gemfile +13 -0
- data/docs/RAR_WRITE_SUPPORT.md +26 -0
- data/docs/README.md +101 -0
- data/docs/_config.yml +112 -0
- data/docs/assets/logo.svg +1 -0
- data/docs/assets/omnizip-logo.pdf +1540 -11
- data/docs/comparison/feature-matrix.adoc +694 -0
- data/docs/comparison/index.adoc +113 -0
- data/docs/comparison/vs-7zip.adoc +309 -0
- data/docs/comparison/vs-peazip.adoc +77 -0
- data/docs/comparison/vs-rubyzip.adoc +342 -0
- data/docs/comparison/vs-winrar.adoc +100 -0
- data/docs/compatibility.adoc +579 -0
- data/docs/concepts/index.adoc +129 -0
- data/docs/developer/architecture.adoc +256 -0
- data/docs/developer/contributing.adoc +158 -0
- data/docs/developer/index.adoc +25 -0
- data/docs/developer/testing.adoc +212 -0
- data/docs/getting-started/basic-usage.adoc +271 -0
- data/docs/getting-started/index.adoc +42 -0
- data/docs/getting-started/installation.adoc +138 -0
- data/docs/getting-started/quick-start.adoc +185 -0
- data/docs/getting-started/your-first-archive.adoc +218 -0
- data/docs/guides/advanced-features/encryption.adoc +300 -0
- data/docs/guides/advanced-features/index.adoc +49 -0
- data/docs/guides/advanced-features/parallel-processing.adoc +246 -0
- data/docs/guides/advanced-features/progress-tracking.adoc +320 -0
- data/docs/guides/advanced-features/streaming.adoc +212 -0
- data/docs/guides/archive-formats/gzip-format.adoc +107 -0
- data/docs/guides/archive-formats/index.adoc +130 -0
- data/docs/guides/archive-formats/rar-format.adoc +104 -0
- data/docs/guides/archive-formats/rar5.adoc +521 -0
- data/docs/guides/archive-formats/seven-zip-format.adoc +35 -0
- data/docs/guides/archive-formats/tar-format.adoc +106 -0
- data/docs/guides/archive-formats/xz-format.adoc +118 -0
- data/docs/guides/archive-formats/zip-format.adoc +35 -0
- data/docs/guides/compression-algorithms/bzip2.adoc +113 -0
- data/docs/guides/compression-algorithms/deflate.adoc +319 -0
- data/docs/guides/compression-algorithms/index.adoc +190 -0
- data/docs/guides/compression-algorithms/lzma.adoc +398 -0
- data/docs/guides/compression-algorithms/lzma2.adoc +327 -0
- data/docs/guides/compression-algorithms/ppmd.adoc +316 -0
- data/docs/guides/compression-algorithms/zstandard.adoc +361 -0
- data/docs/guides/creating-archives.adoc +354 -0
- data/docs/guides/extracting-archives.adoc +53 -0
- data/docs/guides/format-conversion.adoc +64 -0
- data/docs/guides/index.adoc +49 -0
- data/docs/guides/migration-rubyzip.adoc +217 -0
- data/docs/guides/parity-archives.adoc +605 -0
- data/docs/guides/performance-tuning.adoc +88 -0
- data/docs/index.adoc +218 -0
- data/docs/lychee.toml +67 -0
- data/docs/reference/api/overview.adoc +188 -0
- data/docs/reference/cli/compress-command.adoc +114 -0
- data/docs/reference/cli/overview.adoc +140 -0
- data/docs/reference/index.adoc +26 -0
- data/docs/resources/faq.adoc +185 -0
- data/docs/resources/quick-reference.adoc +222 -0
- data/docs/troubleshooting/index.adoc +208 -0
- data/examples/api_comparison.rb +205 -0
- data/examples/deflate64_example.rb +96 -0
- data/examples/par2_demo.rb +121 -0
- data/examples/quick_start_native.rb +150 -0
- data/examples/quick_start_rubyzip.rb +115 -0
- data/examples/rubyzip_compatibility_demo.rb +194 -0
- data/exe/omnizip +27 -0
- data/lib/omnizip/algorithm.rb +130 -0
- data/lib/omnizip/algorithm_registry.rb +86 -0
- data/lib/omnizip/algorithms/.keep +0 -0
- data/lib/omnizip/algorithms/bzip2/bwt.rb +225 -0
- data/lib/omnizip/algorithms/bzip2/decoder.rb +193 -0
- data/lib/omnizip/algorithms/bzip2/encoder.rb +237 -0
- data/lib/omnizip/algorithms/bzip2/huffman.rb +206 -0
- data/lib/omnizip/algorithms/bzip2/mtf.rb +101 -0
- data/lib/omnizip/algorithms/bzip2/rle.rb +151 -0
- data/lib/omnizip/algorithms/bzip2.rb +130 -0
- data/lib/omnizip/algorithms/deflate/constants.rb +28 -0
- data/lib/omnizip/algorithms/deflate/decoder.rb +38 -0
- data/lib/omnizip/algorithms/deflate/encoder.rb +46 -0
- data/lib/omnizip/algorithms/deflate.rb +128 -0
- data/lib/omnizip/algorithms/deflate64/constants.rb +45 -0
- data/lib/omnizip/algorithms/deflate64/decoder.rb +153 -0
- data/lib/omnizip/algorithms/deflate64/encoder.rb +98 -0
- data/lib/omnizip/algorithms/deflate64/huffman_coder.rb +354 -0
- data/lib/omnizip/algorithms/deflate64/lz77_encoder.rb +142 -0
- data/lib/omnizip/algorithms/deflate64.rb +109 -0
- data/lib/omnizip/algorithms/lzma/bit_model.rb +120 -0
- data/lib/omnizip/algorithms/lzma/constants.rb +112 -0
- data/lib/omnizip/algorithms/lzma/decoder.rb +148 -0
- data/lib/omnizip/algorithms/lzma/dictionary.rb +69 -0
- data/lib/omnizip/algorithms/lzma/distance_coder.rb +415 -0
- data/lib/omnizip/algorithms/lzma/encoder.rb +142 -0
- data/lib/omnizip/algorithms/lzma/length_coder.rb +260 -0
- data/lib/omnizip/algorithms/lzma/literal_decoder.rb +320 -0
- data/lib/omnizip/algorithms/lzma/literal_encoder.rb +210 -0
- data/lib/omnizip/algorithms/lzma/lzip_decoder.rb +341 -0
- data/lib/omnizip/algorithms/lzma/lzma_alone_decoder.rb +192 -0
- data/lib/omnizip/algorithms/lzma/lzma_state.rb +128 -0
- data/lib/omnizip/algorithms/lzma/match.rb +32 -0
- data/lib/omnizip/algorithms/lzma/match_finder.rb +205 -0
- data/lib/omnizip/algorithms/lzma/match_finder_config.rb +142 -0
- data/lib/omnizip/algorithms/lzma/match_finder_factory.rb +88 -0
- data/lib/omnizip/algorithms/lzma/optimal_encoder.rb +130 -0
- data/lib/omnizip/algorithms/lzma/probability_models.rb +72 -0
- data/lib/omnizip/algorithms/lzma/range_coder.rb +85 -0
- data/lib/omnizip/algorithms/lzma/range_decoder.rb +434 -0
- data/lib/omnizip/algorithms/lzma/range_encoder.rb +194 -0
- data/lib/omnizip/algorithms/lzma/state.rb +127 -0
- data/lib/omnizip/algorithms/lzma/xz_buffered_range_encoder.rb +325 -0
- data/lib/omnizip/algorithms/lzma/xz_encoder.rb +426 -0
- data/lib/omnizip/algorithms/lzma/xz_encoder_fast.rb +645 -0
- data/lib/omnizip/algorithms/lzma/xz_match_finder_adapter.rb +227 -0
- data/lib/omnizip/algorithms/lzma/xz_price_calculator.rb +169 -0
- data/lib/omnizip/algorithms/lzma/xz_probability_models.rb +261 -0
- data/lib/omnizip/algorithms/lzma/xz_range_encoder.rb +223 -0
- data/lib/omnizip/algorithms/lzma/xz_range_encoder_exact.rb +331 -0
- data/lib/omnizip/algorithms/lzma/xz_state.rb +116 -0
- data/lib/omnizip/algorithms/lzma/xz_utils_decoder.rb +2055 -0
- data/lib/omnizip/algorithms/lzma.rb +238 -0
- data/lib/omnizip/algorithms/lzma2/chunk_manager.rb +182 -0
- data/lib/omnizip/algorithms/lzma2/constants.rb +41 -0
- data/lib/omnizip/algorithms/lzma2/encoder.rb +147 -0
- data/lib/omnizip/algorithms/lzma2/lzma2_chunk.rb +161 -0
- data/lib/omnizip/algorithms/lzma2/properties.rb +179 -0
- data/lib/omnizip/algorithms/lzma2/simple_lzma2_encoder.rb +127 -0
- data/lib/omnizip/algorithms/lzma2/xz_encoder_adapter.rb +85 -0
- data/lib/omnizip/algorithms/lzma2.rb +141 -0
- data/lib/omnizip/algorithms/ppmd7/constants.rb +74 -0
- data/lib/omnizip/algorithms/ppmd7/context.rb +154 -0
- data/lib/omnizip/algorithms/ppmd7/decoder.rb +126 -0
- data/lib/omnizip/algorithms/ppmd7/encoder.rb +163 -0
- data/lib/omnizip/algorithms/ppmd7/model.rb +248 -0
- data/lib/omnizip/algorithms/ppmd7/symbol_state.rb +57 -0
- data/lib/omnizip/algorithms/ppmd7.rb +116 -0
- data/lib/omnizip/algorithms/ppmd8/constants.rb +61 -0
- data/lib/omnizip/algorithms/ppmd8/context.rb +34 -0
- data/lib/omnizip/algorithms/ppmd8/decoder.rb +107 -0
- data/lib/omnizip/algorithms/ppmd8/encoder.rb +138 -0
- data/lib/omnizip/algorithms/ppmd8/model.rb +250 -0
- data/lib/omnizip/algorithms/ppmd8/restoration_method.rb +78 -0
- data/lib/omnizip/algorithms/ppmd8.rb +82 -0
- data/lib/omnizip/algorithms/ppmd_base.rb +138 -0
- data/lib/omnizip/algorithms/sevenzip_lzma2.rb +123 -0
- data/lib/omnizip/algorithms/xz_lzma2.rb +118 -0
- data/lib/omnizip/algorithms/zstandard/constants.rb +25 -0
- data/lib/omnizip/algorithms/zstandard/decoder.rb +46 -0
- data/lib/omnizip/algorithms/zstandard/encoder.rb +51 -0
- data/lib/omnizip/algorithms/zstandard.rb +138 -0
- data/lib/omnizip/buffer/memory_archive.rb +251 -0
- data/lib/omnizip/buffer/memory_extractor.rb +224 -0
- data/lib/omnizip/buffer.rb +176 -0
- data/lib/omnizip/checksum_registry.rb +114 -0
- data/lib/omnizip/checksums/crc32.rb +100 -0
- data/lib/omnizip/checksums/crc64.rb +101 -0
- data/lib/omnizip/checksums/crc_base.rb +158 -0
- data/lib/omnizip/checksums/verifier.rb +131 -0
- data/lib/omnizip/chunked/memory_manager.rb +194 -0
- data/lib/omnizip/chunked/reader.rb +78 -0
- data/lib/omnizip/chunked/writer.rb +120 -0
- data/lib/omnizip/chunked.rb +129 -0
- data/lib/omnizip/cli/output_formatter.rb +104 -0
- data/lib/omnizip/cli.rb +572 -0
- data/lib/omnizip/commands/.keep +0 -0
- data/lib/omnizip/commands/archive_create_command.rb +427 -0
- data/lib/omnizip/commands/archive_extract_command.rb +272 -0
- data/lib/omnizip/commands/archive_list_command.rb +218 -0
- data/lib/omnizip/commands/archive_repair_command.rb +131 -0
- data/lib/omnizip/commands/archive_verify_command.rb +117 -0
- data/lib/omnizip/commands/compress_command.rb +117 -0
- data/lib/omnizip/commands/decompress_command.rb +120 -0
- data/lib/omnizip/commands/list_command.rb +53 -0
- data/lib/omnizip/commands/metadata_command.rb +153 -0
- data/lib/omnizip/commands/parity_create_command.rb +122 -0
- data/lib/omnizip/commands/parity_repair_command.rb +122 -0
- data/lib/omnizip/commands/parity_verify_command.rb +124 -0
- data/lib/omnizip/commands/profile_list_command.rb +56 -0
- data/lib/omnizip/commands/profile_show_command.rb +44 -0
- data/lib/omnizip/convenience.rb +359 -0
- data/lib/omnizip/converter/conversion_registry.rb +49 -0
- data/lib/omnizip/converter/conversion_strategy.rb +121 -0
- data/lib/omnizip/converter/seven_zip_to_zip_strategy.rb +97 -0
- data/lib/omnizip/converter/zip_to_seven_zip_strategy.rb +112 -0
- data/lib/omnizip/converter.rb +105 -0
- data/lib/omnizip/crypto/aes256/cipher.rb +100 -0
- data/lib/omnizip/crypto/aes256/constants.rb +28 -0
- data/lib/omnizip/crypto/aes256/key_derivation.rb +101 -0
- data/lib/omnizip/crypto/aes256.rb +102 -0
- data/lib/omnizip/error.rb +106 -0
- data/lib/omnizip/eta/exponential_smoothing_estimator.rb +98 -0
- data/lib/omnizip/eta/moving_average_estimator.rb +99 -0
- data/lib/omnizip/eta/rate_calculator.rb +104 -0
- data/lib/omnizip/eta/sample_history.rb +143 -0
- data/lib/omnizip/eta/time_estimator.rb +106 -0
- data/lib/omnizip/eta.rb +63 -0
- data/lib/omnizip/extraction/filter_chain.rb +177 -0
- data/lib/omnizip/extraction/glob_pattern.rb +140 -0
- data/lib/omnizip/extraction/pattern_matcher.rb +70 -0
- data/lib/omnizip/extraction/predicate_pattern.rb +52 -0
- data/lib/omnizip/extraction/regex_pattern.rb +50 -0
- data/lib/omnizip/extraction/selective_extractor.rb +240 -0
- data/lib/omnizip/extraction.rb +111 -0
- data/lib/omnizip/file_type/mime_classifier.rb +144 -0
- data/lib/omnizip/file_type.rb +113 -0
- data/lib/omnizip/filter.rb +139 -0
- data/lib/omnizip/filter_pipeline.rb +108 -0
- data/lib/omnizip/filter_registry.rb +166 -0
- data/lib/omnizip/filters/bcj.rb +279 -0
- data/lib/omnizip/filters/bcj2/constants.rb +53 -0
- data/lib/omnizip/filters/bcj2/decoder.rb +200 -0
- data/lib/omnizip/filters/bcj2/encoder.rb +61 -0
- data/lib/omnizip/filters/bcj2/stream_data.rb +93 -0
- data/lib/omnizip/filters/bcj2.rb +99 -0
- data/lib/omnizip/filters/bcj_arm.rb +176 -0
- data/lib/omnizip/filters/bcj_arm64.rb +244 -0
- data/lib/omnizip/filters/bcj_ia64.rb +196 -0
- data/lib/omnizip/filters/bcj_ppc.rb +190 -0
- data/lib/omnizip/filters/bcj_sparc.rb +176 -0
- data/lib/omnizip/filters/bcj_x86.rb +193 -0
- data/lib/omnizip/filters/delta.rb +196 -0
- data/lib/omnizip/filters/filter_base.rb +72 -0
- data/lib/omnizip/filters/registry.rb +123 -0
- data/lib/omnizip/filters/xz_delta.rb +258 -0
- data/lib/omnizip/format_detector.rb +162 -0
- data/lib/omnizip/format_registry.rb +59 -0
- data/lib/omnizip/formats/.keep +0 -0
- data/lib/omnizip/formats/bzip2_file.rb +172 -0
- data/lib/omnizip/formats/cpio/constants.rb +55 -0
- data/lib/omnizip/formats/cpio/entry.rb +385 -0
- data/lib/omnizip/formats/cpio/reader.rb +196 -0
- data/lib/omnizip/formats/cpio/writer.rb +234 -0
- data/lib/omnizip/formats/cpio.rb +140 -0
- data/lib/omnizip/formats/format_spec_loader.rb +230 -0
- data/lib/omnizip/formats/gzip.rb +238 -0
- data/lib/omnizip/formats/iso/directory_builder.rb +297 -0
- data/lib/omnizip/formats/iso/directory_record.rb +152 -0
- data/lib/omnizip/formats/iso/joliet.rb +204 -0
- data/lib/omnizip/formats/iso/path_table.rb +125 -0
- data/lib/omnizip/formats/iso/reader.rb +197 -0
- data/lib/omnizip/formats/iso/rock_ridge.rb +349 -0
- data/lib/omnizip/formats/iso/volume_builder.rb +320 -0
- data/lib/omnizip/formats/iso/volume_descriptor.rb +168 -0
- data/lib/omnizip/formats/iso/writer.rb +530 -0
- data/lib/omnizip/formats/iso.rb +140 -0
- data/lib/omnizip/formats/lzip.rb +175 -0
- data/lib/omnizip/formats/lzma_alone.rb +171 -0
- data/lib/omnizip/formats/rar/archive_repairer.rb +243 -0
- data/lib/omnizip/formats/rar/archive_verifier.rb +195 -0
- data/lib/omnizip/formats/rar/block_parser.rb +243 -0
- data/lib/omnizip/formats/rar/compression/bit_stream.rb +180 -0
- data/lib/omnizip/formats/rar/compression/dispatcher.rb +217 -0
- data/lib/omnizip/formats/rar/compression/lz77_huffman/decoder.rb +216 -0
- data/lib/omnizip/formats/rar/compression/lz77_huffman/encoder.rb +158 -0
- data/lib/omnizip/formats/rar/compression/lz77_huffman/huffman_builder.rb +217 -0
- data/lib/omnizip/formats/rar/compression/lz77_huffman/huffman_coder.rb +189 -0
- data/lib/omnizip/formats/rar/compression/lz77_huffman/match_finder.rb +135 -0
- data/lib/omnizip/formats/rar/compression/lz77_huffman/sliding_window.rb +165 -0
- data/lib/omnizip/formats/rar/compression/ppmd/context.rb +105 -0
- data/lib/omnizip/formats/rar/compression/ppmd/decoder.rb +219 -0
- data/lib/omnizip/formats/rar/compression/ppmd/encoder.rb +262 -0
- data/lib/omnizip/formats/rar/compression_method_registry.rb +106 -0
- data/lib/omnizip/formats/rar/constants.rb +82 -0
- data/lib/omnizip/formats/rar/decompressor.rb +238 -0
- data/lib/omnizip/formats/rar/external_writer.rb +312 -0
- data/lib/omnizip/formats/rar/header.rb +192 -0
- data/lib/omnizip/formats/rar/license_validator.rb +109 -0
- data/lib/omnizip/formats/rar/models/rar_archive.rb +77 -0
- data/lib/omnizip/formats/rar/models/rar_entry.rb +65 -0
- data/lib/omnizip/formats/rar/models/rar_volume.rb +56 -0
- data/lib/omnizip/formats/rar/parity_handler.rb +292 -0
- data/lib/omnizip/formats/rar/rar5/compression/lzma.rb +202 -0
- data/lib/omnizip/formats/rar/rar5/compression/lzss.rb +578 -0
- data/lib/omnizip/formats/rar/rar5/compression/store.rb +60 -0
- data/lib/omnizip/formats/rar/rar5/crc32.rb +39 -0
- data/lib/omnizip/formats/rar/rar5/encryption/aes256_cbc.rb +97 -0
- data/lib/omnizip/formats/rar/rar5/encryption/encryption_header.rb +114 -0
- data/lib/omnizip/formats/rar/rar5/encryption/encryption_manager.rb +166 -0
- data/lib/omnizip/formats/rar/rar5/encryption/key_derivation.rb +97 -0
- data/lib/omnizip/formats/rar/rar5/header.rb +187 -0
- data/lib/omnizip/formats/rar/rar5/models/encryption_options.rb +74 -0
- data/lib/omnizip/formats/rar/rar5/models/recovery_options.rb +63 -0
- data/lib/omnizip/formats/rar/rar5/models/solid_options.rb +63 -0
- data/lib/omnizip/formats/rar/rar5/models/volume_options.rb +74 -0
- data/lib/omnizip/formats/rar/rar5/multi_volume/ARCHITECTURE.md +290 -0
- data/lib/omnizip/formats/rar/rar5/multi_volume/volume_manager.rb +264 -0
- data/lib/omnizip/formats/rar/rar5/multi_volume/volume_splitter.rb +155 -0
- data/lib/omnizip/formats/rar/rar5/multi_volume/volume_writer.rb +194 -0
- data/lib/omnizip/formats/rar/rar5/solid/solid_encoder.rb +109 -0
- data/lib/omnizip/formats/rar/rar5/solid/solid_manager.rb +142 -0
- data/lib/omnizip/formats/rar/rar5/solid/solid_stream.rb +121 -0
- data/lib/omnizip/formats/rar/rar5/vint.rb +65 -0
- data/lib/omnizip/formats/rar/rar5/writer.rb +466 -0
- data/lib/omnizip/formats/rar/rar_format_base.rb +241 -0
- data/lib/omnizip/formats/rar/reader.rb +366 -0
- data/lib/omnizip/formats/rar/recovery_record.rb +245 -0
- data/lib/omnizip/formats/rar/volume_manager.rb +168 -0
- data/lib/omnizip/formats/rar/writer.rb +431 -0
- data/lib/omnizip/formats/rar.rb +205 -0
- data/lib/omnizip/formats/rar3/compressor.rb +73 -0
- data/lib/omnizip/formats/rar3/decompressor.rb +66 -0
- data/lib/omnizip/formats/rar3/reader.rb +386 -0
- data/lib/omnizip/formats/rar3/writer.rb +219 -0
- data/lib/omnizip/formats/rar5/compressor.rb +73 -0
- data/lib/omnizip/formats/rar5/decompressor.rb +66 -0
- data/lib/omnizip/formats/rar5/reader.rb +342 -0
- data/lib/omnizip/formats/rar5/writer.rb +214 -0
- data/lib/omnizip/formats/seven_zip/coder_chain.rb +150 -0
- data/lib/omnizip/formats/seven_zip/constants.rb +126 -0
- data/lib/omnizip/formats/seven_zip/encoded_header.rb +114 -0
- data/lib/omnizip/formats/seven_zip/encrypted_header.rb +142 -0
- data/lib/omnizip/formats/seven_zip/file_collector.rb +144 -0
- data/lib/omnizip/formats/seven_zip/header.rb +106 -0
- data/lib/omnizip/formats/seven_zip/header_encryptor.rb +134 -0
- data/lib/omnizip/formats/seven_zip/header_writer.rb +466 -0
- data/lib/omnizip/formats/seven_zip/models/coder_info.rb +30 -0
- data/lib/omnizip/formats/seven_zip/models/file_entry.rb +58 -0
- data/lib/omnizip/formats/seven_zip/models/folder.rb +69 -0
- data/lib/omnizip/formats/seven_zip/models/stream_info.rb +42 -0
- data/lib/omnizip/formats/seven_zip/parser.rb +660 -0
- data/lib/omnizip/formats/seven_zip/reader.rb +458 -0
- data/lib/omnizip/formats/seven_zip/split_archive_reader.rb +632 -0
- data/lib/omnizip/formats/seven_zip/split_archive_writer.rb +315 -0
- data/lib/omnizip/formats/seven_zip/stream_compressor.rb +151 -0
- data/lib/omnizip/formats/seven_zip/stream_decompressor.rb +162 -0
- data/lib/omnizip/formats/seven_zip/writer.rb +740 -0
- data/lib/omnizip/formats/seven_zip.rb +93 -0
- data/lib/omnizip/formats/tar/constants.rb +73 -0
- data/lib/omnizip/formats/tar/entry.rb +94 -0
- data/lib/omnizip/formats/tar/header.rb +168 -0
- data/lib/omnizip/formats/tar/reader.rb +121 -0
- data/lib/omnizip/formats/tar/writer.rb +216 -0
- data/lib/omnizip/formats/tar.rb +84 -0
- data/lib/omnizip/formats/xz/reader.rb +116 -0
- data/lib/omnizip/formats/xz.rb +237 -0
- data/lib/omnizip/formats/xz_impl/block_decoder.rb +754 -0
- data/lib/omnizip/formats/xz_impl/block_encoder.rb +306 -0
- data/lib/omnizip/formats/xz_impl/block_header.rb +210 -0
- data/lib/omnizip/formats/xz_impl/block_header_parser.rb +186 -0
- data/lib/omnizip/formats/xz_impl/constants.rb +49 -0
- data/lib/omnizip/formats/xz_impl/index_decoder.rb +174 -0
- data/lib/omnizip/formats/xz_impl/index_encoder.rb +122 -0
- data/lib/omnizip/formats/xz_impl/stream_decoder.rb +468 -0
- data/lib/omnizip/formats/xz_impl/stream_encoder.rb +99 -0
- data/lib/omnizip/formats/xz_impl/stream_footer.rb +81 -0
- data/lib/omnizip/formats/xz_impl/stream_footer_parser.rb +117 -0
- data/lib/omnizip/formats/xz_impl/stream_header.rb +55 -0
- data/lib/omnizip/formats/xz_impl/stream_header_parser.rb +108 -0
- data/lib/omnizip/formats/xz_impl/vli.rb +128 -0
- data/lib/omnizip/formats/xz_impl/writer.rb +421 -0
- data/lib/omnizip/formats/zip/central_directory_header.rb +195 -0
- data/lib/omnizip/formats/zip/constants.rb +69 -0
- data/lib/omnizip/formats/zip/end_of_central_directory.rb +133 -0
- data/lib/omnizip/formats/zip/local_file_header.rb +138 -0
- data/lib/omnizip/formats/zip/reader.rb +250 -0
- data/lib/omnizip/formats/zip/unix_extra_field.rb +153 -0
- data/lib/omnizip/formats/zip/writer.rb +375 -0
- data/lib/omnizip/formats/zip/zip64_end_of_central_directory.rb +104 -0
- data/lib/omnizip/formats/zip/zip64_end_of_central_directory_locator.rb +66 -0
- data/lib/omnizip/formats/zip/zip64_extra_field.rb +114 -0
- data/lib/omnizip/formats/zip.rb +50 -0
- data/lib/omnizip/implementations/base/lzma2_decoder_base.rb +75 -0
- data/lib/omnizip/implementations/base/lzma2_encoder_base.rb +128 -0
- data/lib/omnizip/implementations/base/lzma_decoder_base.rb +83 -0
- data/lib/omnizip/implementations/base/lzma_encoder_base.rb +108 -0
- data/lib/omnizip/implementations/base/state_machine_base.rb +182 -0
- data/lib/omnizip/implementations/seven_zip/lzma/decoder.rb +421 -0
- data/lib/omnizip/implementations/seven_zip/lzma/encoder.rb +465 -0
- data/lib/omnizip/implementations/seven_zip/lzma/match_finder.rb +288 -0
- data/lib/omnizip/implementations/seven_zip/lzma/range_decoder.rb +200 -0
- data/lib/omnizip/implementations/seven_zip/lzma/range_encoder.rb +197 -0
- data/lib/omnizip/implementations/seven_zip/lzma/state_machine.rb +141 -0
- data/lib/omnizip/implementations/seven_zip/lzma2/encoder.rb +519 -0
- data/lib/omnizip/implementations/xz_utils/lzma2/decoder.rb +723 -0
- data/lib/omnizip/implementations/xz_utils/lzma2/encoder.rb +750 -0
- data/lib/omnizip/io/buffered_input.rb +146 -0
- data/lib/omnizip/io/buffered_output.rb +105 -0
- data/lib/omnizip/io/stream_manager.rb +115 -0
- data/lib/omnizip/link_handler/hard_link.rb +79 -0
- data/lib/omnizip/link_handler/symbolic_link.rb +74 -0
- data/lib/omnizip/link_handler.rb +124 -0
- data/lib/omnizip/metadata/archive_metadata.rb +114 -0
- data/lib/omnizip/metadata/entry_metadata.rb +146 -0
- data/lib/omnizip/metadata/metadata_editor.rb +171 -0
- data/lib/omnizip/metadata/metadata_registry.rb +64 -0
- data/lib/omnizip/metadata/metadata_validator.rb +99 -0
- data/lib/omnizip/metadata.rb +57 -0
- data/lib/omnizip/models/.keep +0 -0
- data/lib/omnizip/models/algorithm_metadata.rb +73 -0
- data/lib/omnizip/models/compression_options.rb +71 -0
- data/lib/omnizip/models/conversion_options.rb +87 -0
- data/lib/omnizip/models/conversion_result.rb +135 -0
- data/lib/omnizip/models/eta_result.rb +46 -0
- data/lib/omnizip/models/extraction_rule.rb +115 -0
- data/lib/omnizip/models/filter_chain.rb +144 -0
- data/lib/omnizip/models/filter_config.rb +183 -0
- data/lib/omnizip/models/match_result.rb +124 -0
- data/lib/omnizip/models/optimization_suggestion.rb +91 -0
- data/lib/omnizip/models/parallel_options.rb +104 -0
- data/lib/omnizip/models/performance_result.rb +79 -0
- data/lib/omnizip/models/profile_report.rb +82 -0
- data/lib/omnizip/models/progress_options.rb +38 -0
- data/lib/omnizip/models/split_options.rb +116 -0
- data/lib/omnizip/optimization_registry.rb +81 -0
- data/lib/omnizip/parallel/job_queue.rb +209 -0
- data/lib/omnizip/parallel/job_scheduler.rb +203 -0
- data/lib/omnizip/parallel/parallel_compressor.rb +347 -0
- data/lib/omnizip/parallel/parallel_extractor.rb +329 -0
- data/lib/omnizip/parallel/worker_pool.rb +223 -0
- data/lib/omnizip/parallel.rb +149 -0
- data/lib/omnizip/parity/chunked_block_processor.rb +196 -0
- data/lib/omnizip/parity/galois16.rb +145 -0
- data/lib/omnizip/parity/models/creator_packet.rb +73 -0
- data/lib/omnizip/parity/models/file_description_packet.rb +133 -0
- data/lib/omnizip/parity/models/ifsc_packet.rb +123 -0
- data/lib/omnizip/parity/models/main_packet.rb +128 -0
- data/lib/omnizip/parity/models/packet.rb +156 -0
- data/lib/omnizip/parity/models/packet_registry.rb +109 -0
- data/lib/omnizip/parity/models/recovery_slice_packet.rb +78 -0
- data/lib/omnizip/parity/par2_creator.rb +531 -0
- data/lib/omnizip/parity/par2_repairer.rb +407 -0
- data/lib/omnizip/parity/par2_verifier.rb +364 -0
- data/lib/omnizip/parity/par2cmdline_algorithm.rb +110 -0
- data/lib/omnizip/parity/par2cmdline_coefficients.rb +78 -0
- data/lib/omnizip/parity/reed_solomon_decoder.rb +266 -0
- data/lib/omnizip/parity/reed_solomon_encoder.rb +111 -0
- data/lib/omnizip/parity/reed_solomon_matrix.rb +342 -0
- data/lib/omnizip/parity.rb +186 -0
- data/lib/omnizip/password/encryption_registry.rb +65 -0
- data/lib/omnizip/password/encryption_strategy.rb +96 -0
- data/lib/omnizip/password/password_validator.rb +129 -0
- data/lib/omnizip/password/winzip_aes_strategy.rb +192 -0
- data/lib/omnizip/password/zip_crypto_strategy.rb +141 -0
- data/lib/omnizip/password.rb +87 -0
- data/lib/omnizip/pipe/stream_compressor.rb +124 -0
- data/lib/omnizip/pipe/stream_decompressor.rb +174 -0
- data/lib/omnizip/pipe.rb +121 -0
- data/lib/omnizip/platform/ntfs_streams.rb +201 -0
- data/lib/omnizip/platform.rb +189 -0
- data/lib/omnizip/profile/archive_profile.rb +39 -0
- data/lib/omnizip/profile/balanced_profile.rb +33 -0
- data/lib/omnizip/profile/binary_profile.rb +36 -0
- data/lib/omnizip/profile/compression_profile.rb +158 -0
- data/lib/omnizip/profile/custom_profile.rb +157 -0
- data/lib/omnizip/profile/fast_profile.rb +33 -0
- data/lib/omnizip/profile/maximum_profile.rb +33 -0
- data/lib/omnizip/profile/profile_detector.rb +110 -0
- data/lib/omnizip/profile/profile_registry.rb +161 -0
- data/lib/omnizip/profile/text_profile.rb +36 -0
- data/lib/omnizip/profile.rb +190 -0
- data/lib/omnizip/profiler/memory_profiler.rb +66 -0
- data/lib/omnizip/profiler/method_profiler.rb +49 -0
- data/lib/omnizip/profiler/report_generator.rb +169 -0
- data/lib/omnizip/profiler.rb +204 -0
- data/lib/omnizip/progress/callback_reporter.rb +36 -0
- data/lib/omnizip/progress/console_reporter.rb +62 -0
- data/lib/omnizip/progress/log_reporter.rb +91 -0
- data/lib/omnizip/progress/operation_progress.rb +118 -0
- data/lib/omnizip/progress/progress_bar.rb +156 -0
- data/lib/omnizip/progress/progress_reporter.rb +40 -0
- data/lib/omnizip/progress/progress_tracker.rb +190 -0
- data/lib/omnizip/progress/silent_reporter.rb +24 -0
- data/lib/omnizip/progress.rb +127 -0
- data/lib/omnizip/rubyzip_compat.rb +63 -0
- data/lib/omnizip/temp/safe_extract.rb +168 -0
- data/lib/omnizip/temp/temp_file.rb +124 -0
- data/lib/omnizip/temp/temp_file_pool.rb +109 -0
- data/lib/omnizip/temp.rb +181 -0
- data/lib/omnizip/version.rb +5 -0
- data/lib/omnizip/zip/entry.rb +156 -0
- data/lib/omnizip/zip/file.rb +485 -0
- data/lib/omnizip/zip/input_stream.rb +273 -0
- data/lib/omnizip/zip/output_stream.rb +324 -0
- data/lib/omnizip.rb +156 -0
- data/readme-docs/advanced-features.adoc +515 -0
- data/readme-docs/api-usage.adoc +444 -0
- data/readme-docs/architecture.adoc +449 -0
- data/readme-docs/archive-formats.adoc +479 -0
- data/readme-docs/cli-usage.adoc +222 -0
- data/readme-docs/compression-algorithms.adoc +442 -0
- data/readme-docs/compression-profiles.adoc +247 -0
- data/readme-docs/encryption-checksums.adoc +328 -0
- data/readme-docs/format-converter.adoc +325 -0
- data/readme-docs/installation.adoc +228 -0
- data/readme-docs/par2-archives.adoc +608 -0
- data/readme-docs/performance-profiler.adoc +389 -0
- data/readme-docs/preprocessing-filters.adoc +280 -0
- data/xz-file-format-1.2.1.txt +1174 -0
- metadata +617 -0
|
@@ -0,0 +1,2055 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Copyright (C) 2025 Ribose Inc.
|
|
4
|
+
#
|
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining a
|
|
6
|
+
# copy of this software and associated documentation files (the "Software"),
|
|
7
|
+
# to deal in the Software without restriction, including without limitation
|
|
8
|
+
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
9
|
+
# and/or sell copies of the Software, and to permit persons to whom the
|
|
10
|
+
# Software is furnished to do so, subject to the following conditions:
|
|
11
|
+
#
|
|
12
|
+
# The above copyright notice and this permission notice shall be included in
|
|
13
|
+
# all copies or substantial portions of the Software.
|
|
14
|
+
#
|
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
20
|
+
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
21
|
+
# DEALINGS IN THE SOFTWARE.
|
|
22
|
+
|
|
23
|
+
require_relative "constants"
|
|
24
|
+
require_relative "../../implementations/seven_zip/lzma/state_machine"
|
|
25
|
+
require_relative "literal_decoder"
|
|
26
|
+
require_relative "length_coder"
|
|
27
|
+
require_relative "distance_coder"
|
|
28
|
+
require_relative "range_decoder"
|
|
29
|
+
require_relative "bit_model"
|
|
30
|
+
|
|
31
|
+
module Omnizip
|
|
32
|
+
module Algorithms
|
|
33
|
+
# LZMA XZ Utils implementation
|
|
34
|
+
#
|
|
35
|
+
# This namespace contains the XZ Utils implementation of LZMA decoder.
|
|
36
|
+
# XZ Utils is based on LZMA SDK but has been MODIFIED SIGNIFICANTLY.
|
|
37
|
+
# This implementation is for XZ format (.xz files) ONLY.
|
|
38
|
+
#
|
|
39
|
+
# Reference: XZ Utils source tree, src/liblzma/lzma/lzma_decoder.c
|
|
40
|
+
#
|
|
41
|
+
# XZ Utils LZMA decoder
|
|
42
|
+
#
|
|
43
|
+
# This class implements XZ Utils' LZMA decoder (NOT LZMA SDK/7-Zip!)
|
|
44
|
+
# XZ Utils is based on LZMA SDK but has been MODIFIED SIGNIFICANTLY.
|
|
45
|
+
# Reference: XZ Utils source tree, src/liblzma/lzma/lzma_decoder.c
|
|
46
|
+
#
|
|
47
|
+
# This decoder is used for:
|
|
48
|
+
# - XZ format (.xz files)
|
|
49
|
+
# - LZMA2 compression in XZ format
|
|
50
|
+
#
|
|
51
|
+
module XzUtilsDecoderDebug
|
|
52
|
+
# Debug helper: forwards its arguments to puts only when the LZMA_DEBUG environment variable is set
|
|
53
|
+
# Set ENV['LZMA_DEBUG'] to any value (e.g. 'true') to enable all debug output; the value itself is not inspected, so even 'false' enables it
|
|
54
|
+
def self.debug_puts(*args)
|
|
55
|
+
puts(*args) if ENV["LZMA_DEBUG"]
|
|
56
|
+
end
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
# XZ Utils implementation of LZMA decoder
|
|
60
|
+
#
|
|
61
|
+
# IMPORTANT: This is NOT the LZMA SDK/7-Zip decoder!
|
|
62
|
+
# XZ Utils modified LZMA significantly - this is XZ format only.
|
|
63
|
+
# Components integrated:
|
|
64
|
+
# - LiteralDecoder: Matched/unmatched literal decoding
|
|
65
|
+
# - StateMachine: 12-state FSM for probability model selection
|
|
66
|
+
# - LengthCoder: 3-level length decoding (low/mid/high)
|
|
67
|
+
# - DistanceCoder: 64-slot distance decoding with aligned bits
|
|
68
|
+
#
|
|
69
|
+
# The decoder follows XZ Utils' exact decoding sequence:
|
|
70
|
+
# 1. Read LZMA header (property byte, dict size, uncompressed size)
|
|
71
|
+
# 2. Initialize range decoder and probability models
|
|
72
|
+
# 3. Decode loop:
|
|
73
|
+
# - Decode is_match bit
|
|
74
|
+
# - If literal: decode byte (matched/unmatched)
|
|
75
|
+
# - If match: decode length and distance
|
|
76
|
+
# - Update state machine
|
|
77
|
+
# - Write to output
|
|
78
|
+
# 4. Handle EOS marker
|
|
79
|
+
#
|
|
80
|
+
# @example Basic usage
|
|
81
|
+
# decoder = Omnizip::Algorithms::XzUtilsDecoder.new(input)
|
|
82
|
+
# data = decoder.decode_stream
|
|
83
|
+
#
|
|
84
|
+
# @example With output stream
|
|
85
|
+
# decoder = Omnizip::Algorithms::XzUtilsDecoder.new(input)
|
|
86
|
+
# File.open('output.txt', 'wb') { |f| decoder.decode_stream(f) }
|
|
87
|
+
class XzUtilsDecoder
|
|
88
|
+
include LZMA::Constants
|
|
89
|
+
|
|
90
|
+
# Alias for nested classes for easier access
|
|
91
|
+
BitModel = LZMA::BitModel
|
|
92
|
+
LengthCoder = LZMA::LengthCoder
|
|
93
|
+
DistanceCoder = LZMA::DistanceCoder
|
|
94
|
+
LiteralDecoder = LZMA::LiteralDecoder
|
|
95
|
+
RangeDecoder = LZMA::RangeDecoder
|
|
96
|
+
SdkStateMachine = Implementations::SevenZip::LZMA::StateMachine
|
|
97
|
+
|
|
98
|
+
attr_reader :lc, :lp, :pb, :dict_size, :uncompressed_size
|
|
99
|
+
|
|
100
|
+
# XZ Utils dictionary constants (from lz_decoder.h)
|
|
101
|
+
# See: XZ Utils source, src/liblzma/lz/lz_decoder.h
|
|
102
|
+
LZ_DICT_REPEAT_MAX = 288
|
|
103
|
+
LZ_DICT_INIT_POS = 2 * LZ_DICT_REPEAT_MAX # = 576
|
|
104
|
+
|
|
105
|
+
# Initialize the XZ Utils LZMA decoder
|
|
106
|
+
#
|
|
107
|
+
# @param input [IO] Input stream of compressed data
|
|
108
|
+
# @param options [Hash] Decoding options
|
|
109
|
+
# @option options [Boolean] :lzma2_mode If true, initialize without reading header
|
|
110
|
+
# (for LZMA2 use, requires lc, lp, pb, dict_size, uncompressed_size)
|
|
111
|
+
# @option options [Integer] :lc Literal context bits (required for lzma2_mode)
|
|
112
|
+
# @option options [Integer] :lp Literal position bits (required for lzma2_mode)
|
|
113
|
+
# @option options [Integer] :pb Position bits (required for lzma2_mode)
|
|
114
|
+
# @option options [Integer] :dict_size Dictionary size (required for lzma2_mode)
|
|
115
|
+
# @option options [Integer] :uncompressed_size Uncompressed size (required for lzma2_mode)
|
|
116
|
+
# @option options [String] :preloaded_data Data to preload into dictionary (for LZMA2
|
|
117
|
+
# uncompressed chunks followed by compressed chunks)
|
|
118
|
+
# @option options [Boolean] :validate_size If true, validate decoded size matches uncompressed_size
|
|
119
|
+
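# (default: false, only for .lzma format)
# @option options [Boolean, nil] :allow_eopm Controls the check made when the known size is reached:
# true skips the "range decoder finished" check, false always enforces it,
# nil (default) defers to decode_stream's check_rc_finished argument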
|
|
120
|
+
def initialize(input, options = {})
|
|
121
|
+
@input = input
|
|
122
|
+
@decoder_id = object_id # Track decoder instance ID
|
|
123
|
+
puts "DEBUG LZMA::Decoder.new created[#{@decoder_id}]" if ENV["LZMA_DEBUG_DECODE_STREAM"]
|
|
124
|
+
if ENV["LZMA_DEBUG_DECODE_STREAM"]
|
|
125
|
+
warn "SDK Decoder #{@decoder_id} created"
|
|
126
|
+
end
|
|
127
|
+
@input = input
|
|
128
|
+
|
|
129
|
+
# Check for preloaded data (from LZMA2 uncompressed chunks)
|
|
130
|
+
@preloaded_data = options[:preloaded_data]
|
|
131
|
+
@validate_size = options.fetch(:validate_size, false)
|
|
132
|
+
@allow_eopm = options.fetch(:allow_eopm, nil)
|
|
133
|
+
|
|
134
|
+
if options[:lzma2_mode]
|
|
135
|
+
# Direct initialization for LZMA2 use (XZ Utils pattern)
|
|
136
|
+
# The LZMA2 decoder provides parameters directly, no header to read
|
|
137
|
+
# See: /Users/mulgogi/src/external/xz/src/liblzma/lzma/lzma2_decoder.c:140-141
|
|
138
|
+
@lc = options.fetch(:lc)
|
|
139
|
+
@lp = options.fetch(:lp)
|
|
140
|
+
@pb = options.fetch(:pb)
|
|
141
|
+
@dict_size = options.fetch(:dict_size)
|
|
142
|
+
@uncompressed_size = options.fetch(:uncompressed_size)
|
|
143
|
+
else
|
|
144
|
+
# Standalone LZMA file - read header from input
|
|
145
|
+
read_header
|
|
146
|
+
end
|
|
147
|
+
validate_parameters
|
|
148
|
+
init_models
|
|
149
|
+
init_coders
|
|
150
|
+
end
|
|
151
|
+
|
|
152
|
+
# Decode a compressed stream
|
|
153
|
+
#
|
|
154
|
+
# Main decoding loop following SDK's LzmaDec_DecodeToDic logic:
|
|
155
|
+
# 1. Initialize range decoder
|
|
156
|
+
# 2. Process each position: decode literals/matches
|
|
157
|
+
# 3. Detect EOS marker
|
|
158
|
+
# 4. Return decompressed data
|
|
159
|
+
#
|
|
160
|
+
# XZ Utils dictionary system (from lz_decoder.h):
|
|
161
|
+
# - pos starts at LZ_DICT_INIT_POS (576)
|
|
162
|
+
# - full = pos - LZ_DICT_INIT_POS (count of valid bytes)
|
|
163
|
+
# - has_wrapped = false until dictionary buffer wraps
|
|
164
|
+
# - Distance validation: full > distance
|
|
165
|
+
#
|
|
166
|
+
# @param output [IO, nil] Optional output stream (if nil, returns String)
|
|
167
|
+
# @param preserve_dict [Boolean] Whether to preserve dictionary from previous decode
|
|
168
|
+
# @param check_rc_finished [Boolean] Whether to check if range decoder is finished
|
|
169
|
+
# @return [String, Integer] Decompressed data or bytes written
|
|
170
|
+
def decode_stream(output = nil, preserve_dict: false, check_rc_finished: true)
  @decode_stream_call_count ||= 0
  @decode_stream_call_count += 1
  call_num = @decode_stream_call_count

  # Read every debug switch once per call; looking them up in ENV for each
  # decoded symbol is pure overhead on the hot path.
  debug          = ENV["LZMA_DEBUG"]
  debug_stream   = ENV["LZMA_DEBUG_DECODE_STREAM"]
  debug_reset    = ENV["LZMA_DEBUG_RESET"]
  debug_limit    = ENV["LZMA_DEBUG_LIMIT"]
  debug_iter     = ENV["LZMA_DEBUG_ITER"]
  debug_pos      = ENV["LZMA_DEBUG_POS"]
  debug_pos_227  = ENV["LZMA_DEBUG_POS_227"]
  debug_dict_buf = ENV["DEBUG_DICT_BUF"]

  # Trace-only peek at the range decoder's internal [range, code] pair.
  rc_state = lambda do
    [@range_decoder.instance_variable_get(:@range),
     @range_decoder.instance_variable_get(:@code)]
  end

  if debug && @dict_full&.between?(200, 230)
    puts "DEBUG decode_stream START (call ##{call_num}): @pos=#{@pos.inspect}, @dict_full=#{@dict_full.inspect}, preserve_dict=#{preserve_dict}, @uncompressed_size=#{@uncompressed_size.inspect}"
  end
  if debug_stream
    warn "DEBUG decode_stream[#{@decoder_id}] START: preserve_dict=#{preserve_dict}, @pos=#{@pos.inspect}, @dict_full=#{@dict_full.inspect}, @dict_buf.object_id=#{@dict_buf&.object_id || 'nil'}, @dict_buf.size=#{@dict_buf&.size || 'nil'}"
  end

  # For LZMA2 the range decoder persists across chunks (XZ Utils pattern,
  # src/liblzma/lzma/lzma2_decoder.c); it is only created here when absent.
  if debug
    input_pos = begin
      @input.pos
    rescue StandardError
      "N/A"
    end
    warn "DEBUG: decode_stream - reusing range decoder @input.pos=#{input_pos}, @range_decoder.class=#{@range_decoder.class}"
  end

  unless @range_decoder
    warn "DEBUG: decode_stream - creating NEW range decoder" if debug
    @range_decoder = RangeDecoder.new(@input)
  end

  # Empty payload: nothing to decode. Honour the documented return contract:
  # bytes written (0) when an output stream was supplied, otherwise an empty
  # binary string.
  if @uncompressed_size == 0
    warn "DEBUG: decode_stream - empty input (uncompressed_size=0), returning immediately" if debug
    return output ? 0 : "".b
  end

  @debug_iter = 0

  # Bytes decoded in this chunk; @uncompressed_size may describe only the
  # current chunk, so match lengths are limited relative to this counter.
  @chunk_bytes_decoded = 0
  if debug && @dict_full&.between?(220, 240)
    puts "DEBUG: chunk_bytes_decoded reset to 0 for chunk (call_num=#{call_num}, dict_full=#{@dict_full})"
  end

  # The state machine persists across chunks only when preserve_dict is set.
  @state = SdkStateMachine.new if @state.nil? || !preserve_dict

  puts "DEBUG: Checking @dict_buf.nil? = #{@dict_buf.nil?}, preserve_dict=#{preserve_dict}" if debug_reset

  # First call: allocate the dictionary (XZ Utils layout from lz_decoder.c,
  # where valid data starts at LZ_DICT_INIT_POS) and load any preloaded data.
  if @dict_buf.nil?
    @dict_buf = Array.new(@dict_size + LZ_DICT_INIT_POS, 0)
    @pos = LZ_DICT_INIT_POS
    @dict_full = 0
    @has_wrapped = false

    # Preloaded data must be in the dictionary before decoding so that
    # matches can reference it.
    if @preloaded_data && !@preloaded_data.empty?
      warn "DEBUG: Preloading #{@preloaded_data.bytesize} bytes into dictionary[#{@decoder_id}]" if debug_reset
      preload = @preloaded_data.bytes
      @dict_buf[@pos, preload.size] = preload
      @pos += preload.size
      @dict_full = @pos - LZ_DICT_INIT_POS
      if debug_reset
        warn "DEBUG: After preload - @pos=#{@pos}, @dict_full=#{@dict_full}"
        warn " Preloaded data (hex): #{@preloaded_data[0..50].unpack1('H*')}"
      end
      @preloaded_data = nil # Clear after loading
    end

    if debug_reset
      warn "DEBUG: Dictionary init in decode_stream[#{@decoder_id}] - @pos=#{@pos}, @dict_full=#{@dict_full}, @dict_buf.size=#{@dict_buf.size}, @dict_buf.object_id=#{@dict_buf.object_id}"
      warn " Sample values: [576]=#{@dict_buf[576]}, [577]=#{@dict_buf[577]}, [578]=#{@dict_buf[578]}, [583]=#{@dict_buf[583]}"
    end
  end

  # Taken AFTER dictionary initialisation so that preloaded data and earlier
  # chunks are excluded from this call's returned output.
  start_pos = @pos || LZ_DICT_INIT_POS
  if debug && @dict_full&.between?(200, 230)
    puts "DEBUG: start_pos=#{start_pos}, @pos=#{@pos.inspect}, @dict_full=#{@dict_full.inspect}, preserve_dict=#{preserve_dict}, @decoder_id=#{@decoder_id}"
  end
  if debug && @dict_full&.between?(225, 230)
    puts "DEBUG: start_pos=#{start_pos}, @pos=#{@pos.inspect}, @dict_full=#{@dict_full.inspect}, @uncompressed_size=#{@uncompressed_size}, @decoder_id=#{@decoder_id}"
  end

  # Rep distances start at 0 and persist across chunks only with preserve_dict.
  if @rep0.nil? || @rep1.nil? || @rep2.nil? || @rep3.nil? || !preserve_dict
    if debug && @dict_full&.between?(200, 230)
      puts "DEBUG: Resetting rep distances to 0 (rep0.nil?=#{@rep0.nil?}, preserve_dict=#{preserve_dict})"
    end
    @rep0 = 0
    @rep1 = 0
    @rep2 = 0
    @rep3 = 0
  end

  # Stop position for a known size (XZ Utils: dict.limit = dict.pos + size);
  # nil means the size is unknown and only an EOS marker ends the loop.
  limit = @uncompressed_size == 0xFFFFFFFFFFFFFFFF ? nil : start_pos + @uncompressed_size

  if debug_limit && @dict_full&.between?(220, 240)
    puts "DEBUG LIMIT CALCULATION: start_pos=#{start_pos}, @uncompressed_size=#{@uncompressed_size}, limit=#{limit.inspect}"
  end
  if debug_limit && @dict_full&.between?(290, 300)
    puts "DEBUG LIMIT CALCULATION at dict_full=#{@dict_full}: start_pos=#{start_pos}, @uncompressed_size=#{@uncompressed_size}, limit=#{limit.inspect}, @decoder_id=#{@decoder_id}"
  end

  # Loop invariants: is_match models are laid out as [state][pos_state] with
  # a stride of (1 << pb).
  pos_state_count = 1 << @pb
  pos_mask = pos_state_count - 1

  iteration = 0
  loop do
    iteration += 1
    if debug_iter && @dict_full&.between?(200, 500)
      puts "DEBUG ITERATION ##{iteration}: pos=#{@pos}, dict_full=#{@dict_full}, limit=#{limit.inspect}"
    end

    if debug_limit
      compare_result = begin
        limit && @pos >= limit
      rescue StandardError
        "ERROR"
      end
      XzUtilsDecoderDebug.debug_puts "DEBUG LIMIT: iter=#{iteration}, pos=#{@pos.inspect}, dict_full=#{@dict_full}, limit=#{limit.inspect}, pos >= limit: #{compare_result}"
    end

    # A known limit with no position means the decoder state is unusable.
    raise "Invalid state: @pos=#{@pos.inspect}, limit=#{limit.inspect}" if limit && @pos.nil?

    if debug_limit
      XzUtilsDecoderDebug.debug_puts "DEBUG LIMIT: iter=#{iteration}, pos=#{@pos}, dict_full=#{@dict_full}, limit=#{limit}"
    end

    # Select the is_match probability model for this state and position.
    pos_state = @pos & pos_mask
    model_index = (@state.value * pos_state_count) + pos_state

    if debug_pos_227
      XzUtilsDecoderDebug.debug_puts "DEBUG: Before is_match at pos=#{@pos}, dict_full=#{@dict_full}, state=#{@state.value}, pos_state=#{pos_state}"
    end

    @debug_iter += 1

    if @dict_full.between?(255, 257)
      # Range/code must be captured BEFORE decode_bit mutates them.
      if debug
        range, code = rc_state.call
        model = @is_match_models[model_index]
        XzUtilsDecoderDebug.debug_puts " [IS_MATCH] pos=#{@pos}, dict_full=#{@dict_full}, state=#{@state.value}, pos_state=#{pos_state}, model_index=#{model_index}"
        XzUtilsDecoderDebug.debug_puts " BEFORE decode: range=0x#{range.to_s(16)}, code=0x#{code.to_s(16)}, prob=#{model.probability}"
      end
      # NOTE(review): this global is not read in the visible part of the file;
      # it is presumably consumed by tracing code elsewhere - confirm before
      # removing. Kept ungated to preserve that behaviour.
      $trace_is_match_0_0 = (@dict_full == 256)
    end

    if debug_iter
      range, code = rc_state.call
      model = @is_match_models[model_index]
      bound = (range >> 11) * model.probability
      XzUtilsDecoderDebug.debug_puts ""
      XzUtilsDecoderDebug.debug_puts "ITER #{@debug_iter}:"
      XzUtilsDecoderDebug.debug_puts " pos=#{@pos}, state=#{@state.value}, pos_state=#{pos_state}, model_index=#{model_index}"
      XzUtilsDecoderDebug.debug_puts " dict_full=#{@dict_full}"
      XzUtilsDecoderDebug.debug_puts " range=0x#{range.to_s(16)}, code=0x#{code.to_s(16)}, model.prob=#{model.probability}"
      XzUtilsDecoderDebug.debug_puts " bound=0x#{bound.to_s(16)}, code < bound: #{code < bound}"
    end

    is_match = @range_decoder.decode_bit(@is_match_models[model_index])

    if debug && @dict_full.between?(50, 62)
      range_val, code_val = rc_state.call
      prob_val = @is_match_models[model_index].probability
      XzUtilsDecoderDebug.debug_puts "\n=== dict_full=#{@dict_full}: is_match=#{is_match}, state=#{@state.value}, pos_state=#{pos_state} ==="
      XzUtilsDecoderDebug.debug_puts " model_index=#{model_index}, prob=#{prob_val}"
      XzUtilsDecoderDebug.debug_puts " range=0x#{range_val.to_s(16).upcase}, code=0x#{code_val.to_s(16).upcase}"
    end

    XzUtilsDecoderDebug.debug_puts " is_match=#{is_match}" if debug_iter

    if debug && @dict_full.between?(255, 257)
      XzUtilsDecoderDebug.debug_puts " AFTER decode: is_match=#{is_match}"
      XzUtilsDecoderDebug.debug_puts " (is_match=0 means literal, is_match=1 means match)"
    end

    if debug && @dict_full == 227
      puts "DEBUG CORRUPTION POINT: dict_full=#{@dict_full}, pos=#{@pos}"
      puts " is_match=#{is_match}, state=#{@state.value}"
      range_val, code_val = rc_state.call
      puts " range=0x#{range_val.to_s(16)}, code=0x#{code_val.to_s(16)}"
      window = @dict_buf[[@pos - 5, LZ_DICT_INIT_POS].max...[@pos + 5, @dict_buf.size - 1].min]
      puts " dict_buf[#{@pos - 5}...#{@pos + 5}] = #{window.inspect}"
    end

    if debug && @dict_full.between?(224, 235)
      puts "DEBUG pos #{@dict_full}: is_match=#{is_match}, state=#{@state.value}"
      puts(is_match.zero? ? " Next byte should be literal" : " Next byte should be match")
    end

    # Debug-only probe: checks that the first 256 output bytes equal 0..255.
    if debug && @dict_full == 256
      XzUtilsDecoderDebug.debug_puts ""
      XzUtilsDecoderDebug.debug_puts " Verifying first 256 bytes:"
      XzUtilsDecoderDebug.debug_puts " Byte 253: @dict_buf[#{LZ_DICT_INIT_POS + 253}]=#{@dict_buf[LZ_DICT_INIT_POS + 253].inspect} (expected 'i'=0x69)"
      XzUtilsDecoderDebug.debug_puts " Byte 254: @dict_buf[#{LZ_DICT_INIT_POS + 254}]=#{@dict_buf[LZ_DICT_INIT_POS + 254].inspect} (expected 'n'=0x6E)"
      XzUtilsDecoderDebug.debug_puts " Byte 255: @dict_buf[#{LZ_DICT_INIT_POS + 255}]=#{@dict_buf[LZ_DICT_INIT_POS + 255].inspect} (expected ' '=0x20)"
      all_correct = true
      256.times do |i|
        actual = @dict_buf[LZ_DICT_INIT_POS + i]
        next if actual == i

        all_correct = false
        puts " Byte #{i}: expected 0x#{i.to_s(16)}, got 0x#{actual.to_s(16)} MISMATCH!" if i >= 253
      end
      XzUtilsDecoderDebug.debug_puts " First 256 bytes: #{all_correct ? 'ALL CORRECT ✓' : 'HAS MISMATCH'}"
      XzUtilsDecoderDebug.debug_puts ""
    end

    if debug && @pos.between?(605, 615)
      warn "DEBUG: is_match at pos=#{@pos}, state=#{@state.value}, pos_state=#{pos_state}, model_index=#{model_index}, is_match=#{is_match}"
    end

    if is_match.zero?
      decode_literal

      if debug && @dict_full.between?(45, 65)
        last_byte = @dict_buf[@pos - 1]
        range_after, code_after = rc_state.call
        puts " literal decoded: 0x#{last_byte.to_s(16).upcase} ('#{last_byte.chr}') at pos=#{@pos - 1}, dict_full=#{@dict_full}"
        puts " AFTER: range=0x#{range_after.to_s(16).upcase}, code=0x#{code_after.to_s(16).upcase}"
      end

      if debug_iter
        last_byte = @dict_buf[@pos - 1]
        puts " literal byte=0x#{last_byte.to_s(16)} ('#{last_byte.chr}')"
      end

      # limit is nil for unknown-size streams; comparing against nil raises.
      if debug_pos && limit && @pos >= limit
        puts "DEBUG: Literal overshoot: pos=#{@pos}, limit=#{limit}, delta=#{@pos - limit}"
      end
    elsif decode_match
      # decode_match returns true when the end-of-stream marker was decoded.
      break
    end

    # Known size reached (XZ Utils: dict.pos == dict.limit).
    if limit && @pos >= limit
      if debug_limit
        puts "DEBUG LIMIT TRIGGERED (call #{call_num}): pos=#{@pos}, limit=#{limit}, dict_full=#{@dict_full}, chunk_bytes_decoded=#{@chunk_bytes_decoded}"
      end

      # Whether the range decoder must be finished (code == 0) at this point:
      # - @allow_eopm true  -> an end-of-payload marker may still follow
      # - @allow_eopm false -> it must be finished
      # - @allow_eopm nil   -> defer to the check_rc_finished argument
      should_check = case @allow_eopm
                     when true then false
                     when false then true
                     else check_rc_finished
                     end

      if should_check
        unless @range_decoder.code.zero?
          raise Omnizip::DecompressionError,
                "LZMA stream finished with leftover compressed data (range_decoder.code=#{@range_decoder.code}, expected 0). This indicates corruption in the compressed stream or an invalid EOPM for LZMA2."
        end
        break
      elsif @range_decoder.code.zero?
        break
      end
      # Otherwise keep decoding until the end-of-payload marker is found.
    end

    if debug_limit && limit && @pos >= limit - 10 && @pos < limit + 10
      puts "DEBUG NEAR LIMIT (call #{call_num}): pos=#{@pos}, limit=#{limit}, dict_full=#{@dict_full}, chunk_bytes_decoded=#{@chunk_bytes_decoded}, remaining=#{@uncompressed_size ? @uncompressed_size - @chunk_bytes_decoded : 'N/A'}"
    end

    if debug_limit && limit && @pos >= limit && @pos < limit + 10
      puts "DEBUG PASSED LIMIT: pos=#{@pos}, limit=#{limit}, dict_full=#{@dict_full}, delta=#{@pos - limit}"
    end

    if debug_pos && limit && @pos >= limit
      XzUtilsDecoderDebug.debug_puts "DEBUG: Overshoot detected: pos=#{@pos}, limit=#{limit}, delta=#{@pos - limit}"
    end
  end

  # Size validation (only when validate_size was requested and the size is
  # known): the decoded total must match the declared size, and no input may
  # remain where none is expected.
  if @validate_size && @uncompressed_size && @uncompressed_size != 0xFFFFFFFFFFFFFFFF
    actual_decoded_size = @pos - LZ_DICT_INIT_POS

    if actual_decoded_size != @uncompressed_size
      raise Omnizip::DecompressionError,
            "LZMA stream size mismatch: expected #{@uncompressed_size} bytes, decoded #{actual_decoded_size} bytes. The file may be corrupted or have an invalid uncompressed size field."
    end

    # Trailing-data check:
    # - EOPM allowed: only when the range decoder finished (code == 0);
    #   otherwise remaining input is still part of the stream.
    # - EOPM not allowed: any remaining input means the declared size was
    #   too small.
    trailing_error =
      if @allow_eopm
        if @range_decoder&.code&.zero?
          "LZMA_Alone file has data after the end-of-payload marker. The file may be corrupted or contain concatenated streams."
        end
      else
        "LZMA_Alone file has more compressed data than expected. The uncompressed size field (#{@uncompressed_size} bytes) appears to be too small."
      end
    stream = @range_decoder&.instance_variable_get(:@stream)

    if trailing_error && stream
      begin
        next_byte = stream.getbyte
        if next_byte
          # Put the byte back so the stream position is left unchanged.
          stream.ungetbyte(next_byte) if stream.respond_to?(:ungetbyte)
          raise Omnizip::DecompressionError, trailing_error
        end
      rescue IOError, EOFError
        # Stream doesn't support peeking or is exhausted, that's fine
      end
    end
  end

  # Return only the bytes produced by THIS call (start_pos...@pos), so each
  # LZMA2 chunk yields just its own output.
  if debug_dict_buf
    XzUtilsDecoderDebug.debug_puts "DEBUG: start_pos=#{start_pos}, @pos=#{@pos.inspect}, @dict_buf.size=#{@dict_buf.size}, LZ_DICT_INIT_POS=#{LZ_DICT_INIT_POS}"
  end
  valid_bytes = @dict_buf[start_pos...@pos]
  if debug && call_num == 2
    puts "DEBUG RETURN CALCULATION: call_num=#{call_num}, start_pos=#{start_pos}, @pos=#{@pos}, valid_bytes.size=#{valid_bytes.size}, dict_full=#{@dict_full}, chunk_bytes_decoded=#{@chunk_bytes_decoded}"
  end

  # Replace nil slots (possible during dictionary reset transitions) with 0.
  valid_bytes = valid_bytes.map { |byte| byte || 0 }
  XzUtilsDecoderDebug.debug_puts "DEBUG: valid_bytes=#{valid_bytes.size}" if debug_dict_buf

  valid_data = valid_bytes.pack("C*").force_encoding(Encoding::BINARY)
  return valid_data unless output

  output.write(valid_data)
  valid_data.bytesize
end
|
|
636
|
+
|
|
637
|
+
# Reset the decoder state for reuse with new properties
|
|
638
|
+
#
|
|
639
|
+
# XZ Utils pattern (lzma_decoder.c:1034-1083):
|
|
640
|
+
# - Resets state machine and rep distances
|
|
641
|
+
# - Resets range decoder
|
|
642
|
+
# - Reinitializes all probability models
|
|
643
|
+
# - Preserves dictionary (managed externally by LZMA2 decoder)
|
|
644
|
+
#
|
|
645
|
+
# @param new_lc [Integer, nil] New lc value (if nil, keeps current)
|
|
646
|
+
# @param new_lp [Integer, nil] New lp value (if nil, keeps current)
|
|
647
|
+
# @param new_pb [Integer, nil] New pb value (if nil, keeps current)
|
|
648
|
+
# @param preserve_dict [Boolean] If true, preserve dictionary state (pos, dict_full)
|
|
649
|
+
# @return [void]
|
|
650
|
+
def reset(new_lc: nil, new_lp: nil, new_pb: nil, preserve_dict: false)
  # The trace switch is looked up once and reused for every message below.
  trace_reset = ENV["LZMA_DEBUG_RESET"]

  if trace_reset
    warn "DEBUG reset[#{@decoder_id}] called: preserve_dict=#{preserve_dict}, @pos=#{@pos.inspect}, @dict_full=#{@dict_full.inspect}, @dict_buf.size=#{@dict_buf&.size || 'nil'}, nil_count=#{@dict_buf&.count(nil) || 'N/A'}"
  end

  # Leftover investigation trace for resets happening around output
  # offsets 220..230.
  if @dict_full&.between?(220, 230)
    XzUtilsDecoderDebug.debug_puts "\n=== reset called at dict_full=#{@dict_full} ==="
    XzUtilsDecoderDebug.debug_puts " preserve_dict=#{preserve_dict}"
    XzUtilsDecoderDebug.debug_puts " Before reset: rep0/1/2/3=(#{@rep0},#{@rep1},#{@rep2},#{@rep3})"
  end

  # Any supplied property counts as a change, even if the value is the same
  # as the current one (0 is truthy in Ruby, so lc=0 is honoured).
  properties_changed = [new_lc, new_lp, new_pb].any?
  @lc = new_lc if new_lc
  @lp = new_lp if new_lp
  @pb = new_pb if new_pb

  # A reset always starts from a brand-new state machine and zeroed
  # repeat distances.
  @state = SdkStateMachine.new
  @rep0 = @rep1 = @rep2 = @rep3 = 0

  # The range decoder (when one exists) is reset right here rather than
  # being deferred to the next decode call.
  if @range_decoder
    @range_decoder.reset
    if trace_reset
      warn "DEBUG reset[#{@decoder_id}]: Reset range_decoder, code=0x#{@range_decoder.code.to_s(16)}, init_bytes_remaining=#{@range_decoder.instance_variable_get(:@init_bytes_remaining)}"
    end
  end

  if properties_changed
    # New lc/lp/pb values: model arrays and coders are rebuilt from scratch.
    if trace_reset
      warn "DEBUG reset[#{@decoder_id}]: Properties changed, calling init_models (create new arrays)"
    end
    init_models
    if trace_reset
      warn "DEBUG reset[#{@decoder_id}]: Properties changed, calling init_coders (create new coders)"
    end
    init_coders
  else
    # Same properties: existing models are reset in place, coders are kept.
    if trace_reset
      warn "DEBUG reset[#{@decoder_id}]: No properties changed, calling reset_models (reset in place)"
    end
    reset_models
    if trace_reset
      warn "DEBUG reset[#{@decoder_id}]: No properties changed, skipping init_coders (preserve existing coders)"
    end
  end

  # With preserve_dict the dictionary buffer and its counters stay untouched.
  return nil if preserve_dict

  # Full dictionary reset: fresh zero-filled buffer, write position back at
  # the initial offset, nothing decoded yet.
  @dict_buf = Array.new(@dict_size + LZ_DICT_INIT_POS, 0)
  @pos = LZ_DICT_INIT_POS
  @dict_full = 0
  @has_wrapped = false
  if trace_reset
    warn "DEBUG reset after dict reset[#{@decoder_id}]: @pos=#{@pos}, @dict_full=#{@dict_full}, @dict_buf.size=#{@dict_buf.size}, @dict_buf.object_id=#{@dict_buf.object_id}, nil_count=#{@dict_buf.count(nil)}"
    # Spot-check a few slots of the freshly allocated buffer.
    warn " Sample values: [576]=#{@dict_buf[576]}, [577]=#{@dict_buf[577]}, [578]=#{@dict_buf[578]}, [583]=#{@dict_buf[583]}"
  end

  nil
end
|
|
737
|
+
|
|
738
|
+
# Reset all probability models in place (without creating new arrays)
|
|
739
|
+
#
|
|
740
|
+
# This matches XZ Utils init_temporals behavior for control >= 0xA0.
|
|
741
|
+
# Unlike init_models which creates new arrays, this resets existing
|
|
742
|
+
# BitModels in place to preserve object identity for any references.
|
|
743
|
+
#
|
|
744
|
+
# @return [void]
|
|
745
|
+
def reset_models
  # Every entry of each bit-model array is reset in place; the arrays and
  # the model objects inside them keep their identity.
  [
    @literal_models,
    @is_match_models,
    @is_rep_models,
    @is_rep0_models,
    @is_rep1_models,
    @is_rep2_models,
    @is_rep0_long_models
  ].each { |models| models.each(&:reset) }

  # The coders own their probability models and reset them themselves.
  [@length_coder, @rep_length_coder].each(&:reset_models)
  @distance_coder.reset_models
end
|
|
764
|
+
|
|
765
|
+
# Reset state machine, rep distances and probability models; keep the dictionary
|
|
766
|
+
#
|
|
767
|
+
# XZ Utils pattern for state reset only (control >= 0xA0):
|
|
768
|
+
# - Reset state machine
|
|
769
|
+
# - Reset rep distances
|
|
770
|
+
# - Reset probability models (via reset_models)
|
|
771
|
+
# - Reset range decoder (rc_reset + rc_read_init)
|
|
772
|
+
# - PRESERVE dictionary content (no dict_reset)
|
|
773
|
+
#
|
|
774
|
+
# XZ Utils source (lzma2_decoder.c):
|
|
775
|
+
# - For control >= 0xA0: calls lzma_lzma_decoder_reset(decoder, NULL)
|
|
776
|
+
# - lzma_lzma_decoder_reset always calls init_temporals which resets probability models
|
|
777
|
+
#
|
|
778
|
+
# @return [void]
|
|
779
|
+
# Prepare state reset - called BEFORE setting new input
|
|
780
|
+
#
|
|
781
|
+
# Resets state machine, rep distances, and probability models.
|
|
782
|
+
# The range decoder will be reset in finish_state_reset AFTER
|
|
783
|
+
# the new input is set (to match XZ Utils lzma_decoder_reset behavior).
|
|
784
|
+
#
|
|
785
|
+
# For LZMA2 control >= 0xC0, this is called before set_input to reset
|
|
786
|
+
# everything except the range decoder for the new chunk.
|
|
787
|
+
#
|
|
788
|
+
# @return [void]
|
|
789
|
+
def prepare_state_reset
  # Tracing switch, looked up once for the whole call.
  tracing = ENV["LZMA_DEBUG"]

  if tracing
    XzUtilsDecoderDebug.debug_puts "\n=== prepare_state_reset called (decoder_id=#{@decoder_id}) ==="
    XzUtilsDecoderDebug.debug_puts " Before reset: rep0/1/2/3=(#{@rep0},#{@rep1},#{@rep2},#{@rep3})"
  end

  # Brand-new state machine and zeroed repeat distances.
  @state = SdkStateMachine.new
  @rep0 = @rep1 = @rep2 = @rep3 = 0

  if tracing
    XzUtilsDecoderDebug.debug_puts " After reset: rep0/1/2/3=(#{@rep0},#{@rep1},#{@rep2},#{@rep3})"
  end

  # Probability models are reset in place. The range decoder is deliberately
  # left alone here; finish_state_reset takes care of it.
  reset_models

  if tracing
    XzUtilsDecoderDebug.debug_puts "=== end prepare_state_reset (range decoder will be reset in finish_state_reset) ==="
  end

  nil
end
|
|
819
|
+
|
|
820
|
+
# Reset state machine and probability models - preserves rep distances
|
|
821
|
+
#
|
|
822
|
+
# This is used for control >= 0xA0 but < 0xC0 where we want
|
|
823
|
+
# to reset the state machine but preserve rep distances from
|
|
824
|
+
# the previous chunk.
|
|
825
|
+
#
|
|
826
|
+
# @return [void]
|
|
827
|
+
def reset_state_machine_only
  # Leftover investigation trace: only emitted while the decoded-byte count
  # sits between 220 and 230.
  in_trace_window = @dict_full&.between?(220, 230)

  if in_trace_window
    XzUtilsDecoderDebug.debug_puts "\n=== reset_state_machine_only called at dict_full=#{@dict_full} (decoder_id=#{@decoder_id}) ==="
    XzUtilsDecoderDebug.debug_puts " Before reset: rep0/1/2/3=(#{@rep0},#{@rep1},#{@rep2},#{@rep3})"
  end

  # Fresh state machine plus in-place model reset. The repeat distances are
  # intentionally not touched.
  @state = SdkStateMachine.new
  reset_models

  if in_trace_window
    XzUtilsDecoderDebug.debug_puts " After reset: rep0/1/2/3=(#{@rep0},#{@rep1},#{@rep2},#{@rep3}) (preserved)"
  end

  nil
end
|
|
847
|
+
|
|
848
|
+
# Finish state reset - called AFTER setting new input
|
|
849
|
+
#
|
|
850
|
+
# Resets the range decoder to read from the new input stream.
|
|
851
|
+
# This completes the state reset process started by prepare_state_reset.
|
|
852
|
+
#
|
|
853
|
+
# XZ Utils pattern (lzma_decoder.c:1034-1083):
|
|
854
|
+
# - rc_reset is called as part of lzma_decoder_reset
|
|
855
|
+
# - rc_reset sets range = UINT32_MAX, code = 0, init_bytes_left = 5
|
|
856
|
+
# - The 5 initialization bytes are read during the first normalize calls
|
|
857
|
+
#
|
|
858
|
+
# @return [void]
|
|
859
|
+
def finish_state_reset
  # Without a range decoder there is nothing to reset.
  return unless @range_decoder

  tracing = ENV["LZMA_DEBUG"]

  # Best-effort attribute read for trace output only: any failure to query
  # the input is reported as "N/A" instead of interrupting the reset.
  probe = lambda do |attribute|
    @input.public_send(attribute)
  rescue StandardError
    "N/A"
  end

  if tracing
    input_pos = probe.call(:pos)
    input_size = probe.call(:size)
    XzUtilsDecoderDebug.debug_puts "=== finish_state_reset: resetting range_decoder, input pos=#{input_pos}, size=#{input_size}"
  end

  # Re-arm the range decoder for the data that follows.
  @range_decoder.reset

  if tracing
    input_pos_after = probe.call(:pos)
    XzUtilsDecoderDebug.debug_puts "=== finish_state_reset: after reset, input pos=#{input_pos_after}, range_decoder.code=0x#{@range_decoder.code.to_s(16)}"
  end
end
|
|
888
|
+
|
|
889
|
+
def reset_state_only
  # Backward-compatible one-shot form of the two-phase reset: runs the
  # prepare step, then the finish step, and returns whatever the finish
  # step returns.
  prepare_state_reset
  finish_state_reset
end
|
|
894
|
+
|
|
895
|
+
# Reset only the range decoder for next chunk
|
|
896
|
+
#
|
|
897
|
+
# XZ Utils pattern (lzma_decoder.c:1014-1017):
|
|
898
|
+
# When LZMA chunk ends (LZMA_STREAM_END), reset range decoder
|
|
899
|
+
# for next LZMA2 chunk, but preserve state and probability models.
|
|
900
|
+
#
|
|
901
|
+
# Note: This method is a no-op in our implementation because
|
|
902
|
+
# decode_stream creates a fresh RangeDecoder for each chunk.
|
|
903
|
+
# The range decoder initialization happens automatically when
|
|
904
|
+
# the new RangeDecoder is created with the new input.
|
|
905
|
+
#
|
|
906
|
+
# @return [void]
|
|
907
|
+
def reset_range_decoder
  # Deliberately empty; kept so existing callers keep working.
  # NOTE(review): the original remark says decode_stream builds a fresh
  # RangeDecoder per chunk, while set_input reuses an existing one - confirm
  # which description is current.
end
|
|
910
|
+
|
|
911
|
+
# Set new input stream for chunked decoding
|
|
912
|
+
#
|
|
913
|
+
# For LZMA2, the range decoder is persistent across chunks and is
|
|
914
|
+
# reset separately via prepare_state_reset + finish_state_reset.
|
|
915
|
+
# This method just updates the input stream reference.
|
|
916
|
+
#
|
|
917
|
+
# @param new_input [IO] New input stream
|
|
918
|
+
# @return [void]
|
|
919
|
+
def set_input(new_input)
  @input = new_input

  # DEBUG: dump the head of the new input stream. Only active when
  # LZMA_DEBUG is set and the decoded-byte count is between 220 and 230.
  if ENV.fetch("LZMA_DEBUG", nil) && @dict_full && @dict_full >= 220 && @dict_full <= 230
    puts "\n=== set_input at dict_full=#{@dict_full} ==="
    puts " new_input.size=#{new_input.size}"
    puts " new_input.pos=#{new_input.pos}"
    puts " new_input.class=#{new_input.class}"

    # Remember where the caller left the stream. Peeking must put the
    # stream back at this offset, not at 0, otherwise enabling the trace
    # would change which bytes get decoded.
    original_pos = new_input.pos

    # Peek at up to 10 bytes.
    first_bytes = []
    10.times do
      byte = new_input.getbyte
      break if byte.nil?

      first_bytes << byte
    end
    puts " First 10 bytes: #{first_bytes.map { |b| "0x#{b.to_s(16).upcase}" }.join(' ')}"

    # Restore, read one byte again to show the restore worked, restore again.
    new_input.pos = original_pos
    test_byte = new_input.getbyte
    puts " Test getbyte: 0x#{test_byte.to_s(16).upcase}" if test_byte
    new_input.pos = original_pos
  end

  if @range_decoder.nil?
    # First input: build the range decoder on top of it.
    @range_decoder = RangeDecoder.new(@input)
    if ENV["LZMA_DEBUG"]
      XzUtilsDecoderDebug.debug_puts "=== set_input: created NEW range_decoder, input has #{@input.size} bytes"
    end
  else
    # Later inputs: the existing range decoder holds a reference to the old
    # stream, so point it at the new one.
    @range_decoder.update_stream(@input)
    if ENV["LZMA_DEBUG"]
      XzUtilsDecoderDebug.debug_puts "=== set_input: reusing range_decoder, new input has #{@input.size} bytes, pos=#{@input.pos}"
    end
  end
end
|
|
960
|
+
|
|
961
|
+
# Add uncompressed data to the dictionary
|
|
962
|
+
#
|
|
963
|
+
# XZ Utils pattern (lzma2_decoder.c:195, dict_write):
|
|
964
|
+
# - Copy uncompressed data to the dictionary as-is
|
|
965
|
+
# - Update dict_full to reflect new data
|
|
966
|
+
# - This allows subsequent compressed chunks to reference the data
|
|
967
|
+
#
|
|
968
|
+
# This is used by LZMA2 decoder for uncompressed chunks (control=0x1 or 0x2)
|
|
969
|
+
#
|
|
970
|
+
# @param data [String] Uncompressed data to add to dictionary
|
|
971
|
+
# @return [void]
|
|
972
|
+
def add_to_dictionary(data)
  tracing = ENV["LZMA_DEBUG"]

  if tracing
    old_dict_full = @dict_full
    XzUtilsDecoderDebug.debug_puts "=== add_to_dictionary: adding #{data.bytesize} bytes to dictionary[#{@decoder_id}], current dict_full=#{@dict_full}, pos=#{@pos}"
  end

  # Store the bytes verbatim starting at the current write position and
  # advance the position past each one.
  data.each_byte.with_index(@pos) do |byte, slot|
    @dict_buf[slot] = byte
    @pos = slot + 1
  end

  # Number of valid bytes, measured from the initial write offset and
  # capped at the configured dictionary size.
  @dict_full = [@pos - LZ_DICT_INIT_POS, @dict_size].min

  if tracing
    XzUtilsDecoderDebug.debug_puts "=== add_to_dictionary: after adding, dict_full=#{@dict_full} (was #{old_dict_full}), pos=#{@pos}"
  end
end
|
|
995
|
+
|
|
996
|
+
# Set uncompressed size for chunked decoding
|
|
997
|
+
#
|
|
998
|
+
# XZ Utils pattern (lzma2_decoder.c:140-141):
|
|
999
|
+
# Pass the chunk's uncompressed_size to the LZMA decoder
|
|
1000
|
+
# for each LZMA2 chunk.
|
|
1001
|
+
#
|
|
1002
|
+
# @param size [Integer] Uncompressed size for current chunk
|
|
1003
|
+
# @param allow_eopm [Boolean] Whether to allow end-of-payload marker
|
|
1004
|
+
# @return [void]
|
|
1005
|
+
def set_uncompressed_size(size, allow_eopm: true)
  # Record the expected output size and whether an end marker is acceptable.
  @uncompressed_size, @allow_eopm = size, allow_eopm

  # Everything below is trace output only.
  return unless ENV["LZMA_DEBUG"]

  puts "DEBUG set_uncompressed_size: size=#{size}, @decoder_id=#{@decoder_id}, @dict_full=#{@dict_full}"
end
|
|
1013
|
+
|
|
1014
|
+
private
|
|
1015
|
+
|
|
1016
|
+
# Read and parse LZMA header
|
|
1017
|
+
#
|
|
1018
|
+
# SDK header format:
|
|
1019
|
+
# - Property byte: (lc + lp*9 + pb*45)
|
|
1020
|
+
# - Dictionary size: 4 bytes little-endian
|
|
1021
|
+
# - Uncompressed size: 8 bytes (0xFF for unknown size)
|
|
1022
|
+
#
|
|
1023
|
+
# @return [void]
|
|
1024
|
+
# @raise [RuntimeError] If header is invalid
|
|
1025
|
+
def read_header
  # Property byte: lc + lp * 9 + pb * 45.
  props = @input.getbyte
  raise "Invalid LZMA header" if props.nil?

  remainder, lc = props.divmod(9)
  pb, lp = remainder.divmod(5)

  # Reads `width` bytes from the input and assembles them as an unsigned
  # little-endian integer. Raises if the input ends early.
  read_little_endian = lambda do |width|
    (0...width).reduce(0) do |value, index|
      byte = @input.getbyte
      raise "Incomplete header" if byte.nil?

      value | (byte << (index * 8))
    end
  end

  # Both fields are fully read before anything is stored, so a truncated
  # header leaves every instance variable untouched instead of leaving
  # properties or a half-assembled value behind.
  dict_size = read_little_endian.call(4)         # dictionary size, 4 bytes
  uncompressed_size = read_little_endian.call(8) # uncompressed size, 8 bytes

  @lc = lc
  @lp = lp
  @pb = pb
  @dict_size = dict_size
  @uncompressed_size = uncompressed_size
end
|
|
1053
|
+
|
|
1054
|
+
# Validate parameters
|
|
1055
|
+
#
|
|
1056
|
+
# @return [void]
|
|
1057
|
+
# @raise [RuntimeError] If parameters are invalid
|
|
1058
|
+
def validate_parameters
  # Accepted ranges: lc in 0..8, lp in 0..4, pb in 0..4. The first value
  # outside its range aborts with a RuntimeError naming it.
  raise "Invalid lc (#{@lc})" if @lc < 0 || @lc > 8
  raise "Invalid lp (#{@lp})" if @lp < 0 || @lp > 4
  raise "Invalid pb (#{@pb})" if @pb < 0 || @pb > 4
end
|
|
1063
|
+
|
|
1064
|
+
# Initialize probability models
|
|
1065
|
+
#
|
|
1066
|
+
# SDK allocates models following exact structure from LzmaDec.c:
|
|
1067
|
+
# - Literal models: (1 << (lc+lp)) contexts * 0x300 models each
|
|
1068
|
+
# - Match models: NUM_STATES * pos_states models (where pos_states = 1 << pb)
|
|
1069
|
+
# - Rep models: NUM_STATES models each
|
|
1070
|
+
#
|
|
1071
|
+
# Must match SdkEncoder's model structure exactly.
|
|
1072
|
+
# CRITICAL: When pb changes, models must be recreated with new pos_states!
|
|
1073
|
+
#
|
|
1074
|
+
# @return [void]
|
|
1075
|
+
def init_models
  # Number of position states follows the current pb; stored for indexing.
  @pos_states = 1 << @pb

  # Builds an array of `count` independent, freshly constructed BitModels.
  fresh_models = ->(count) { Array.new(count) { BitModel.new } }

  # Literal models. The largest context value is
  #   (0x100 << lp) - (0x100 >> lc)
  # and its base offset is (context * 3) << lc. A further 0x300 slots above
  # that base cover matched-literal indexing, and +1 makes the top index
  # itself addressable.
  largest_context = (0x100 << @lp) - (0x100 >> @lc)
  top_literal_index = ((largest_context * 3) << @lc) + 0x300
  @literal_models = fresh_models.call(top_literal_index + 1)

  # Match / rep decision models. The two tables indexed by (state, pos_state)
  # are sized from the current pos_states, so they follow pb changes.
  state_by_pos = NUM_STATES * @pos_states
  @is_match_models = fresh_models.call(state_by_pos)
  @is_rep_models = fresh_models.call(NUM_STATES)
  @is_rep0_models = fresh_models.call(NUM_STATES)
  @is_rep1_models = fresh_models.call(NUM_STATES)
  @is_rep2_models = fresh_models.call(NUM_STATES)
  @is_rep0_long_models = fresh_models.call(state_by_pos)
end
|
|
1110
|
+
|
|
1111
|
+
# Initialize SDK coders
|
|
1112
|
+
#
|
|
1113
|
+
# @return [void]
|
|
1114
|
+
def init_coders
  position_state_count = 1 << @pb

  # Fresh coder objects; both length coders are sized by the current number
  # of position states.
  @literal_decoder = LiteralDecoder.new
  @length_coder = LengthCoder.new(position_state_count)
  @rep_length_coder = LengthCoder.new(position_state_count)
  @distance_coder = DistanceCoder.new(NUM_LEN_TO_POS_STATES)

  # Optional object-identity trace for debugging model sharing.
  if ENV["TRACE_MODEL_INIT"]
    slot_encoders = @distance_coder.instance_variable_get(:@slot_encoders)
    puts "[XzUtilsDecoder.init] slot_encoders len_state=0 object_id=#{slot_encoders[0].object_id}"
    puts "[XzUtilsDecoder.init] slot_encoders[0][1] object_id=#{slot_encoders[0][1].object_id}"
    puts "[XzUtilsDecoder.init] is_match_models object_id=#{@is_match_models.object_id}"
    puts "[XzUtilsDecoder.init] is_match_models[0] object_id=#{@is_match_models[0].object_id}"
  end

  # Keep the stored position-state count in step with pb.
  @pos_states = position_state_count
end
|
|
1132
|
+
|
|
1133
|
+
# Reset distance coder probability models
|
|
1134
|
+
#
|
|
1135
|
+
# Called during state reset (control >= 0xA0) to reset the distance
|
|
1136
|
+
# coder's probability models to initial values. This matches XZ Utils
|
|
1137
|
+
# behavior where init_temporals resets all probability models.
|
|
1138
|
+
#
|
|
1139
|
+
# @return [void]
|
|
1140
|
+
def reset_distance_coder
  # Delegates to the distance coder, which resets its own probability
  # models; whatever it returns is handed back unchanged.
  @distance_coder.reset_models
end
|
|
1143
|
+
|
|
1144
|
+
# Decode a literal byte
|
|
1145
|
+
#
|
|
1146
|
+
# SDK decoding sequence (from LzmaDec.c):
|
|
1147
|
+
# 1. Calculate literal state
|
|
1148
|
+
# 2. Decode literal (matched or unmatched based on state)
|
|
1149
|
+
# 3. Update state machine
|
|
1150
|
+
# 4. Update dictionary and position
|
|
1151
|
+
#
|
|
1152
|
+
# XZ Utils dict_put pattern (from lz_decoder.h:270-276):
|
|
1153
|
+
# dict->buf[dict->pos++] = byte;
|
|
1154
|
+
# if (!dict->has_wrapped)
|
|
1155
|
+
# dict->full = dict->pos - LZ_DICT_INIT_POS;
|
|
1156
|
+
#
|
|
1157
|
+
# @return [void]
|
|
1158
|
+
def decode_literal
|
|
1159
|
+
# DEBUG: Trace literals around position 224-227
|
|
1160
|
+
old_dict_full = @dict_full
|
|
1161
|
+
|
|
1162
|
+
# DEBUG: Track how many times we're called for each position
|
|
1163
|
+
if ENV["LZMA_DEBUG_DECODE_LITERAL"]
|
|
1164
|
+
caller_info = caller(1..1).first
|
|
1165
|
+
XzUtilsDecoderDebug.debug_puts "DEBUG decode_literal[#{@decoder_id}]: pos=#{@pos}, dict_full=#{@dict_full}, from=#{caller_info.label}"
|
|
1166
|
+
end
|
|
1167
|
+
|
|
1168
|
+
# DEBUG: Check array integrity before decode
|
|
1169
|
+
if ENV.fetch("LZMA_DEBUG_ARRAY",
|
|
1170
|
+
nil) && @dict_full.positive? && @pos > 1
|
|
1171
|
+
idx = @pos - 1
|
|
1172
|
+
if @dict_buf[idx].nil?
|
|
1173
|
+
raise "DEBUG before decode: @dict_buf[#{idx}] is nil! @pos=#{@pos}, @dict_full=#{@dict_full}, @dict_buf.size=#{@dict_buf.size}, nil_count=#{@dict_buf.count(nil)}"
|
|
1174
|
+
end
|
|
1175
|
+
end
|
|
1176
|
+
|
|
1177
|
+
# Calculate literal state using SDK formula
|
|
1178
|
+
lit_state = calculate_literal_state
|
|
1179
|
+
|
|
1180
|
+
# DEBUG: Trace lit_state at position 61
|
|
1181
|
+
if @dict_full == 61 && ENV["TRACE_LITERAL_61"]
|
|
1182
|
+
XzUtilsDecoderDebug.debug_puts "=== CALC_LITERAL_STATE at dict_full=61 ==="
|
|
1183
|
+
XzUtilsDecoderDebug.debug_puts " prev_byte=#{@dict_full.positive? ? @dict_buf[@pos - 1] : 0}"
|
|
1184
|
+
XzUtilsDecoderDebug.debug_puts " lit_state=#{lit_state}"
|
|
1185
|
+
XzUtilsDecoderDebug.debug_puts " lc=#{@lc}, lp=#{@lp}"
|
|
1186
|
+
XzUtilsDecoderDebug.debug_puts " state.value=#{@state.value}"
|
|
1187
|
+
XzUtilsDecoderDebug.debug_puts " use_matched_literal?=#{@state.use_matched_literal?}"
|
|
1188
|
+
XzUtilsDecoderDebug.debug_puts " dict_full.positive?=#{@dict_full.positive?}"
|
|
1189
|
+
puts
|
|
1190
|
+
end
|
|
1191
|
+
|
|
1192
|
+
# Decode literal (matched or unmatched)
|
|
1193
|
+
# Check if dictionary has any valid bytes (XZ Utils: dict->full > 0)
|
|
1194
|
+
if @state.use_matched_literal? && @dict_full.positive?
|
|
1195
|
+
# DEBUG: Track which branch is taken
|
|
1196
|
+
if @dict_full == 61 && ENV["TRACE_LITERAL_61"]
|
|
1197
|
+
XzUtilsDecoderDebug.debug_puts " TAKING MATCHED LITERAL PATH"
|
|
1198
|
+
XzUtilsDecoderDebug.debug_puts " rep0=#{@rep0}"
|
|
1199
|
+
match_byte_pos_calc = LZ_DICT_INIT_POS + @dict_full - @rep0 - 1
|
|
1200
|
+
XzUtilsDecoderDebug.debug_puts " match_byte_pos (calc)=#{match_byte_pos_calc}"
|
|
1201
|
+
puts
|
|
1202
|
+
end
|
|
1203
|
+
|
|
1204
|
+
# Matched literal: use match byte from dictionary at distance rep0
|
|
1205
|
+
# XZ Utils dict_get pattern: dict->buf[dict->pos - distance - 1]
|
|
1206
|
+
# IMPORTANT: dict->pos in XZ Utils is the actual output position (dict->full),
|
|
1207
|
+
# not the buffer position with offset!
|
|
1208
|
+
# omnizip uses @pos for buffer position (includes LZ_DICT_INIT_POS offset)
|
|
1209
|
+
# and @dict_full for actual output position (starts at 0)
|
|
1210
|
+
# So we must convert: buffer_pos = LZ_DICT_INIT_POS + (output_pos - rep0 - 1)
|
|
1211
|
+
match_byte_pos = LZ_DICT_INIT_POS + @dict_full - @rep0 - 1
|
|
1212
|
+
match_byte = @dict_buf[match_byte_pos]
|
|
1213
|
+
if ENV["LZMA_DEBUG"]
|
|
1214
|
+
warn "DEBUG: matched literal - dict_full=#{@dict_full}, rep0=#{@rep0}, reading dict_buf[#{match_byte_pos}]=0x#{match_byte.to_s(16).upcase} ('#{match_byte.chr}'), lit_state=#{lit_state}, state=#{@state.value}"
|
|
1215
|
+
end
|
|
1216
|
+
byte = @literal_decoder.decode_matched(match_byte, lit_state, @lc,
|
|
1217
|
+
@range_decoder, @literal_models)
|
|
1218
|
+
|
|
1219
|
+
# DEBUG: Trace decoded byte at position 61
|
|
1220
|
+
if @dict_full == 61 && ENV["TRACE_LITERAL_61"]
|
|
1221
|
+
XzUtilsDecoderDebug.debug_puts " DECODED MATCHED LITERAL: 0x#{byte.to_s(16).upcase} ('#{byte.chr}')"
|
|
1222
|
+
XzUtilsDecoderDebug.debug_puts " match_byte=0x#{match_byte.to_s(16).upcase} ('#{match_byte.chr}')"
|
|
1223
|
+
puts
|
|
1224
|
+
end
|
|
1225
|
+
else
|
|
1226
|
+
# Unmatched literal: simple 8-bit decoding
|
|
1227
|
+
if @dict_full == 61 && ENV["TRACE_LITERAL_61"]
|
|
1228
|
+
XzUtilsDecoderDebug.debug_puts " TAKING UNMATCHED LITERAL PATH"
|
|
1229
|
+
puts
|
|
1230
|
+
end
|
|
1231
|
+
|
|
1232
|
+
if ENV["LZMA_DEBUG"]
|
|
1233
|
+
warn "DEBUG: calling decode_unmatched - pos=#{@pos}, lit_state=#{lit_state}"
|
|
1234
|
+
end
|
|
1235
|
+
byte = @literal_decoder.decode_unmatched(lit_state, @lc,
|
|
1236
|
+
@range_decoder, @literal_models)
|
|
1237
|
+
end
|
|
1238
|
+
|
|
1239
|
+
if ENV["LZMA_DEBUG"]
|
|
1240
|
+
warn "DEBUG: decode_literal RETURNED - pos=#{@pos}, byte=#{'0x%02X' % byte} ('#{byte.chr}'), lit_state=#{lit_state}"
|
|
1241
|
+
end
|
|
1242
|
+
|
|
1243
|
+
# DEBUG: Check byte value at critical positions
|
|
1244
|
+
if @dict_full == 256
|
|
1245
|
+
XzUtilsDecoderDebug.debug_puts "DEBUG: About to store 257th byte (pos=#{@pos}, dict_full=#{@dict_full})"
|
|
1246
|
+
XzUtilsDecoderDebug.debug_puts " byte.class=#{byte.class}"
|
|
1247
|
+
XzUtilsDecoderDebug.debug_puts " byte=#{byte.inspect}"
|
|
1248
|
+
XzUtilsDecoderDebug.debug_puts " byte.is_a?(Integer)=#{byte.is_a?(Integer)}"
|
|
1249
|
+
if byte.is_a?(Integer)
|
|
1250
|
+
XzUtilsDecoderDebug.debug_puts " byte value=#{byte}"
|
|
1251
|
+
XzUtilsDecoderDebug.debug_puts " Expected byte value=0"
|
|
1252
|
+
else
|
|
1253
|
+
XzUtilsDecoderDebug.debug_puts " byte is not an Integer!"
|
|
1254
|
+
XzUtilsDecoderDebug.debug_puts " byte.ord=#{byte.ord}"
|
|
1255
|
+
end
|
|
1256
|
+
end
|
|
1257
|
+
|
|
1258
|
+
if ENV["LZMA_DEBUG"]
|
|
1259
|
+
decoded_bytes = @dict_full.positive? ? @dict_buf[LZ_DICT_INIT_POS..].map(&:chr).join : ""
|
|
1260
|
+
warn "DEBUG: decode_literal - pos=#{@pos}, byte=#{'0x%02X' % byte} ('#{byte.chr}'), state=#{@state.value}, dict_full=#{@dict_full}, decoded_so_far='#{decoded_bytes[-10..]}'"
|
|
1261
|
+
end
|
|
1262
|
+
|
|
1263
|
+
# DEBUG: Detailed trace around position 256
|
|
1264
|
+
if ENV["LZMA_DEBUG"] && @dict_full.between?(230, 265)
|
|
1265
|
+
expected = @dict_full % 256
|
|
1266
|
+
match = byte == expected ? "✓" : "✗ MISMATCH!"
|
|
1267
|
+
XzUtilsDecoderDebug.debug_puts " [LITERAL] dict_full=#{@dict_full}: 0x#{byte.to_s(16).upcase} (expected 0x#{expected.to_s(16).upcase}) #{match}"
|
|
1268
|
+
if @dict_full == 233
|
|
1269
|
+
XzUtilsDecoderDebug.debug_puts " DETAILED TRACE at dict_full=233 (pos=#{@pos}):"
|
|
1270
|
+
XzUtilsDecoderDebug.debug_puts " byte=0x#{byte.to_s(16)} ('#{begin
|
|
1271
|
+
byte.chr
|
|
1272
|
+
rescue StandardError
|
|
1273
|
+
'?'
|
|
1274
|
+
end}')"
|
|
1275
|
+
XzUtilsDecoderDebug.debug_puts " state.value=#{@state.value}, lit_state=#{lit_state}"
|
|
1276
|
+
XzUtilsDecoderDebug.debug_puts " use_matched_literal?=#{@state.use_matched_literal?}"
|
|
1277
|
+
prev_byte_val = @dict_full.positive? ? @dict_buf[@pos - 1] : "N/A"
|
|
1278
|
+
XzUtilsDecoderDebug.debug_puts " prev_byte=#{prev_byte_val.inspect} (#{if prev_byte_val.is_a?(Integer)
|
|
1279
|
+
"0x#{prev_byte_val.to_s(16)} ('#{begin
|
|
1280
|
+
prev_byte_val.chr
|
|
1281
|
+
rescue StandardError
|
|
1282
|
+
'?'
|
|
1283
|
+
end}')"
|
|
1284
|
+
else
|
|
1285
|
+
'N/A'
|
|
1286
|
+
end})"
|
|
1287
|
+
XzUtilsDecoderDebug.debug_puts " range_decoder.range=0x#{@range_decoder.range.to_s(16)}, range_decoder.code=0x#{@range_decoder.code.to_s(16)}"
|
|
1288
|
+
XzUtilsDecoderDebug.debug_puts " input.pos=#{@input.pos}, input.size=#{@input.size}"
|
|
1289
|
+
end
|
|
1290
|
+
if @dict_full == 256
|
|
1291
|
+
XzUtilsDecoderDebug.debug_puts " pos=#{@pos}, lit_state=#{lit_state}, state.value=#{@state.value}"
|
|
1292
|
+
XzUtilsDecoderDebug.debug_puts " use_matched_literal?=#{@state.use_matched_literal?}"
|
|
1293
|
+
end
|
|
1294
|
+
end
|
|
1295
|
+
|
|
1296
|
+
# Update state and dictionary
|
|
1297
|
+
# XZ Utils dict_put pattern:
|
|
1298
|
+
# dict->buf[dict->pos++] = byte;
|
|
1299
|
+
# if (!dict->has_wrapped)
|
|
1300
|
+
# dict->full = dict->pos - LZ_DICT_INIT_POS;
|
|
1301
|
+
@state.update_literal
|
|
1302
|
+
warn "DEBUG: After update_literal - state=#{@state.value}" if ENV["LZMA_DEBUG"]
|
|
1303
|
+
|
|
1304
|
+
# Write to dictionary buffer at current position
|
|
1305
|
+
# XZ Utils dict_put pattern: dict->buf[dict->pos++] = byte;
|
|
1306
|
+
# DEBUG: Check byte value at critical position
|
|
1307
|
+
if @pos == 576 + 256
|
|
1308
|
+
XzUtilsDecoderDebug.debug_puts "DEBUG: Storing byte at pos 832 (256th decoded byte)"
|
|
1309
|
+
XzUtilsDecoderDebug.debug_puts " byte.class=#{byte.class}"
|
|
1310
|
+
XzUtilsDecoderDebug.debug_puts " byte=#{byte}"
|
|
1311
|
+
XzUtilsDecoderDebug.debug_puts " byte.ord=#{byte.is_a?(String) ? byte.ord : 'N/A (not a string)'}"
|
|
1312
|
+
XzUtilsDecoderDebug.debug_puts " Integer value=#{byte.is_a?(Integer) ? byte : byte.ord}"
|
|
1313
|
+
end
|
|
1314
|
+
@dict_buf[@pos] = byte
|
|
1315
|
+
# DEBUG: Track array size changes
|
|
1316
|
+
if ENV["LZMA_DEBUG_ARRAY_WRITE"] && @dict_buf.size != (@dict_size + LZ_DICT_INIT_POS)
|
|
1317
|
+
XzUtilsDecoderDebug.debug_puts "DEBUG: Array expanded! pos=#{@pos}, byte=#{byte}, old_size=#{@dict_buf.size - 1}, new_size=#{@dict_buf.size}, decoder_id=#{@decoder_id}"
|
|
1318
|
+
XzUtilsDecoderDebug.debug_puts " Writing beyond original size caused expansion!"
|
|
1319
|
+
end
|
|
1320
|
+
if ENV["LZMA_DEBUG_ARRAY_WRITE"]
|
|
1321
|
+
XzUtilsDecoderDebug.debug_puts "DEBUG write[#{@decoder_id}]: pos=#{@pos}, byte=#{byte}, dict_buf.size=#{@dict_buf.size}, dict_buf.object_id=#{@dict_buf.object_id}, nil_count=#{@dict_buf.count(nil)}"
|
|
1322
|
+
end
|
|
1323
|
+
if ENV["LZMA_DEBUG_ARRAY"]
|
|
1324
|
+
# Verify the write succeeded
|
|
1325
|
+
if @dict_buf[@pos] != byte
|
|
1326
|
+
raise "DEBUG after write: @dict_buf[#{@pos}] = #{@dict_buf[@pos].inspect}, expected #{byte}! nil_count=#{@dict_buf.count(nil)}"
|
|
1327
|
+
end
|
|
1328
|
+
if @dict_full.positive? && @pos > LZ_DICT_INIT_POS && @dict_buf[@pos - 1].nil?
|
|
1329
|
+
raise "DEBUG after write: @dict_buf[#{@pos - 1}] is nil! @pos=#{@pos}, @dict_full=#{@dict_full}"
|
|
1330
|
+
end
|
|
1331
|
+
end
|
|
1332
|
+
@pos += 1
|
|
1333
|
+
|
|
1334
|
+
# ARM64 DEBUG: Trace first 20 bytes being written to dictionary
|
|
1335
|
+
if ENV["TRACE_ARM64_BYTES"]
|
|
1336
|
+
@arm64_trace ||= []
|
|
1337
|
+
if @arm64_trace.size < 20
|
|
1338
|
+
@arm64_trace << [@dict_full, @pos, byte.class, byte.is_a?(Integer) ? byte : byte.ord, @dict_buf[@pos]]
|
|
1339
|
+
if @arm64_trace.size == 20
|
|
1340
|
+
# Dump the trace
|
|
1341
|
+
puts "\n=== ARM64 BYTE TRACE (first 20 bytes) ==="
|
|
1342
|
+
puts "Decoder ID: #{@decoder_id}"
|
|
1343
|
+
@arm64_trace.each_with_index do |entry, i|
|
|
1344
|
+
df, p, _, val, stored = entry
|
|
1345
|
+
puts " [#{i + 1}] dict_full=#{df.to_s.rjust(6)}, pos=#{p.to_s.rjust(6)}, byte=#{val.to_s.rjust(3)} (0x#{val.to_s(16).upcase.rjust(2, '0')}) stored=#{stored.inspect}"
|
|
1346
|
+
end
|
|
1347
|
+
puts "=========================================\n"
|
|
1348
|
+
$stderr.flush
|
|
1349
|
+
end
|
|
1350
|
+
end
|
|
1351
|
+
end
|
|
1352
|
+
|
|
1353
|
+
# Update dict_full (XZ Utils pattern)
|
|
1354
|
+
# When dict_full reaches dict_size, the dictionary is full
|
|
1355
|
+
# After that, dict_full stays at dict_size and has_wrapped = true
|
|
1356
|
+
unless @has_wrapped
|
|
1357
|
+
@dict_full = @pos - LZ_DICT_INIT_POS
|
|
1358
|
+
# Check if we've reached the maximum dictionary size
|
|
1359
|
+
if @dict_full >= @dict_size
|
|
1360
|
+
@has_wrapped = true
|
|
1361
|
+
@dict_full = @dict_size
|
|
1362
|
+
end
|
|
1363
|
+
end
|
|
1364
|
+
|
|
1365
|
+
# Track bytes decoded in this chunk (for match length limiting)
|
|
1366
|
+
# IMPORTANT: Always increment this, even after dictionary wraps!
|
|
1367
|
+
# This is needed for correct match length limiting when @uncompressed_size is set.
|
|
1368
|
+
# XZ Utils uses dict.limit for this, but we use @chunk_bytes_decoded.
|
|
1369
|
+
if @uncompressed_size != 0xFFFFFFFFFFFFFFFF
|
|
1370
|
+
@chunk_bytes_decoded += 1
|
|
1371
|
+
end
|
|
1372
|
+
|
|
1373
|
+
# DEBUG: Show literal decode for position 220-230
|
|
1374
|
+
if old_dict_full.between?(220, 230)
|
|
1375
|
+
XzUtilsDecoderDebug.debug_puts "\n=== decode_literal at dict_full=#{old_dict_full} ==="
|
|
1376
|
+
XzUtilsDecoderDebug.debug_puts " Decoded: 0x#{byte.to_s(16)} ('#{byte.chr}')"
|
|
1377
|
+
XzUtilsDecoderDebug.debug_puts " rep0/1/2/3=(#{@rep0},#{@rep1},#{@rep2},#{@rep3})"
|
|
1378
|
+
end
|
|
1379
|
+
end
|
|
1380
|
+
|
|
1381
|
+
# Decode a match
#
# Decoding sequence:
# 1. Decode the is_rep bit using the model selected by the current state
# 2. is_rep == 0: regular match, delegated to decode_regular_match
#    (length, distance, EOS marker check, dictionary copy, state/rep update)
# 3. is_rep != 0: rep match, delegated to decode_rep_match
#    (rep distance selection, length, dictionary copy, state/rep update)
#
# Diagnostics are opt-in through the LZMA_DEBUG and TRACE_IS_REP environment
# variables. Each flag is read once per call, and every trace is written
# with warn (stderr) so enabling them never mixes debug text into data that
# is being written to stdout.
#
# @return [Boolean] True if EOS marker detected, false otherwise
def decode_match
  pos_state = @pos & ((1 << @pb) - 1)

  # ENV lookups are comparatively expensive; cache them for this call.
  debug = ENV["LZMA_DEBUG"]
  trace_is_rep = ENV["TRACE_IS_REP"]
  # @dict_full is not modified until a match has been copied, so this
  # window test stays valid for the whole method.
  deep_debug = debug && @dict_full == 227

  # DEEP DEBUG: Trace every detail at position 227
  if deep_debug
    warn "\n=== DEEP DEBUG at dict_full=227 ==="
    warn " State: #{@state.value}, pos_state=#{pos_state}"
    warn " Rep distances BEFORE: (#{@rep0},#{@rep1},#{@rep2},#{@rep3})"
    warn " Range decoder: range=0x#{@range_decoder.range.to_s(16).upcase}, code=0x#{@range_decoder.code.to_s(16).upcase}"
    input_buffer = @range_decoder.instance_variable_get(:@input)
    warn " Input buffer: #{input_buffer ? input_buffer.size : 'nil'} bytes"
  end

  # Decode is_rep bit
  is_rep_model = @is_rep_models[@state.value]
  if trace_is_rep
    range_val = @range_decoder.range
    code_val = @range_decoder.code
    bound_calc = (range_val >> 11) * is_rep_model.probability
    warn "[XzUtilsDecoder.decode_match] Before is_rep: state.value=#{@state.value}"
    warn " is_rep_model.object_id=#{is_rep_model.object_id}, prob=#{is_rep_model.probability}"
    warn " range=#{range_val} (0x#{range_val.to_s(16)}), code=#{code_val} (0x#{code_val.to_s(16)})"
    warn " bound=(#{range_val} >> 11) * #{is_rep_model.probability} = #{bound_calc} (0x#{bound_calc.to_s(16)})"
    warn " code < bound? #{code_val < bound_calc}"
  end

  is_rep = @range_decoder.decode_bit(is_rep_model)

  if trace_is_rep
    range_val = @range_decoder.range
    code_val = @range_decoder.code
    warn "[XzUtilsDecoder.decode_match] Decoded is_rep=#{is_rep} with prob=#{is_rep_model.probability}"
    warn " After is_rep: range=#{range_val} (0x#{range_val.to_s(16)}), code=#{code_val} (0x#{code_val.to_s(16)})"
  end

  if deep_debug
    warn " Decoded is_rep bit: #{is_rep} (#{is_rep_model.probability})"
    warn " After is_rep: range=0x#{@range_decoder.range.to_s(16).upcase}, code=0x#{@range_decoder.code.to_s(16).upcase}"
  end

  if debug
    warn "DEBUG: decode_match START - is_rep=#{is_rep}, state.value=#{@state.value}, pos_state=#{pos_state}, rep0/1/2/3=(#{@rep0},#{@rep1},#{@rep2},#{@rep3})"
  end

  in_trace_window = debug && @dict_full.between?(220, 240)

  if is_rep.zero?
    # Regular match (not rep)
    warn "DEBUG pos #{@dict_full}: Regular match (not rep)" if in_trace_window
    # decode_regular_match reports true when it consumed the EOS marker
    return true if decode_regular_match(pos_state)
  else
    # Rep match - decode_rep_match selects which rep distance to use
    warn "DEBUG pos #{@dict_full}: Rep match (is_rep=1)" if in_trace_window
    decode_rep_match(pos_state)
  end

  false # No EOS marker detected
end
|
|
1457
|
+
|
|
1458
|
+
# Decode a regular (non-rep) match
|
|
1459
|
+
#
|
|
1460
|
+
# XZ Utils dict_repeat pattern (from lz_decoder.h:203-263):
|
|
1461
|
+
# - Validate distance: dict->full > distance
|
|
1462
|
+
# - Calculate back = dict->pos - distance - 1
|
|
1463
|
+
# - If distance >= dict->pos: back += dict->size - LZ_DICT_REPEAT_MAX
|
|
1464
|
+
# - Copy bytes from back position
|
|
1465
|
+
# - Update dict->full if !has_wrapped
|
|
1466
|
+
#
|
|
1467
|
+
# @param pos_state [Integer] Position state
|
|
1468
|
+
# @return [Boolean] True if EOS marker detected, false otherwise
|
|
1469
|
+
def decode_regular_match(pos_state)
|
|
1470
|
+
# DEBUG: Trace matches around dict_full = 60-63
|
|
1471
|
+
old_dict_full = @dict_full
|
|
1472
|
+
old_rep0 = @rep0
|
|
1473
|
+
old_state = @state.value
|
|
1474
|
+
|
|
1475
|
+
# Decode match length
|
|
1476
|
+
length_encoded = @length_coder.decode(@range_decoder,
|
|
1477
|
+
pos_state)
|
|
1478
|
+
length = length_encoded + MATCH_LEN_MIN
|
|
1479
|
+
|
|
1480
|
+
# Calculate length state for distance decoding
|
|
1481
|
+
# XZ Utils formula (from lzma_common.h get_dist_state macro):
|
|
1482
|
+
# ((len) < DIST_STATES + MATCH_LEN_MIN ? (len) - MATCH_LEN_MIN : DIST_STATES - 1)
|
|
1483
|
+
# This gives: len=2→0, len=3→1, len=4→2, len=5→3, len=6+→3
|
|
1484
|
+
len_state = if length < NUM_LEN_TO_POS_STATES + MATCH_LEN_MIN
|
|
1485
|
+
length - MATCH_LEN_MIN
|
|
1486
|
+
else
|
|
1487
|
+
NUM_LEN_TO_POS_STATES - 1
|
|
1488
|
+
end
|
|
1489
|
+
|
|
1490
|
+
# DEBUG: Show bytes being copied
|
|
1491
|
+
if old_dict_full.between?(210, 230) || ENV["LZMA_DEBUG_DISTANCE"]
|
|
1492
|
+
XzUtilsDecoderDebug.debug_puts "\n=== decode_regular_match at dict_full=#{old_dict_full} ===" if old_dict_full.between?(210, 230)
|
|
1493
|
+
puts "[DISTANCE_DECODER] decode_regular_match at dict_full=#{old_dict_full}" if ENV["LZMA_DEBUG_DISTANCE"]
|
|
1494
|
+
XzUtilsDecoderDebug.debug_puts " pos_state=#{pos_state}" if old_dict_full.between?(210, 230)
|
|
1495
|
+
puts "[DISTANCE_DECODER] pos_state=#{pos_state}" if ENV["LZMA_DEBUG_DISTANCE"]
|
|
1496
|
+
XzUtilsDecoderDebug.debug_puts " state=#{old_state}" if old_dict_full.between?(210, 230)
|
|
1497
|
+
puts "[DISTANCE_DECODER] state=#{old_state}" if ENV["LZMA_DEBUG_DISTANCE"]
|
|
1498
|
+
XzUtilsDecoderDebug.debug_puts " length_encoded=#{length_encoded} length=#{length}" if old_dict_full.between?(210, 230)
|
|
1499
|
+
puts "[DISTANCE_DECODER] length_encoded=#{length_encoded} length=#{length}" if ENV["LZMA_DEBUG_DISTANCE"]
|
|
1500
|
+
XzUtilsDecoderDebug.debug_puts " len_state=#{len_state}" if old_dict_full.between?(210, 230)
|
|
1501
|
+
puts "[DISTANCE_DECODER] len_state=#{len_state}" if ENV["LZMA_DEBUG_DISTANCE"]
|
|
1502
|
+
XzUtilsDecoderDebug.debug_puts " rep0_before=#{old_rep0}" if old_dict_full.between?(210, 230)
|
|
1503
|
+
puts "[DISTANCE_DECODER] rep0_before=#{old_rep0}" if ENV["LZMA_DEBUG_DISTANCE"]
|
|
1504
|
+
end
|
|
1505
|
+
|
|
1506
|
+
if ENV["LZMA_DEBUG"] && old_dict_full.between?(220, 230)
|
|
1507
|
+
puts "DEBUG decode_regular_match at dict_full=#{old_dict_full}: length=#{length}"
|
|
1508
|
+
end
|
|
1509
|
+
|
|
1510
|
+
# Decode match distance
|
|
1511
|
+
# XZ Utils stores distance in rep0 without +1
|
|
1512
|
+
# The distance coder returns 0-based distance
|
|
1513
|
+
rep0 = @distance_coder.decode(@range_decoder, len_state)
|
|
1514
|
+
|
|
1515
|
+
# DEBUG
|
|
1516
|
+
if (ENV.fetch("LZMA_DEBUG", nil) && old_dict_full.between?(210, 230)) || old_dict_full == 293
|
|
1517
|
+
puts " rep0_decoded=#{rep0} (distance = #{rep0})"
|
|
1518
|
+
puts " buffer_back calculation: back=#{@dict_full - rep0 - 1}"
|
|
1519
|
+
end
|
|
1520
|
+
if ENV["LZMA_DEBUG"] && rep0 > 100000
|
|
1521
|
+
puts " [LARGE_DISTANCE at dict_full=#{old_dict_full}] rep0=#{rep0}, call count=#{$distance_decode_count || 'unknown'}"
|
|
1522
|
+
end
|
|
1523
|
+
|
|
1524
|
+
# Check for SDK EOS marker FIRST (before validation)
|
|
1525
|
+
# EOS marker: rep0 == UINT32_MAX (0xFFFFFFFF)
|
|
1526
|
+
# XZ Utils checks: if (rep0 == UINT32_MAX) goto eopm;
|
|
1527
|
+
# EOPM is only allowed if @allow_eopm is true OR uncompressed_size is unknown
|
|
1528
|
+
# Reference: XZ Utils lzma_decoder.c:697-705, 874-888
|
|
1529
|
+
if rep0 == 0xFFFFFFFF
|
|
1530
|
+
if @allow_eopm || @uncompressed_size == 0xFFFFFFFFFFFFFFFF
|
|
1531
|
+
# XZ Utils pattern after detecting EOPM:
|
|
1532
|
+
# 1. Normalize range decoder (may read more input bytes)
|
|
1533
|
+
# 2. Check if range decoder is finished (code == 0)
|
|
1534
|
+
# Reference: lzma_decoder.c:881-887 (SEQ_EOPM case)
|
|
1535
|
+
@range_decoder.normalize
|
|
1536
|
+
|
|
1537
|
+
# Check if range decoder is finished (code == 0)
|
|
1538
|
+
unless @range_decoder.code.zero?
|
|
1539
|
+
raise Omnizip::DecompressionError,
|
|
1540
|
+
"EOPM detected but range decoder not finished (code=#{@range_decoder.code}). Corrupted stream."
|
|
1541
|
+
end
|
|
1542
|
+
|
|
1543
|
+
return true # EOS marker detected, stop decoding
|
|
1544
|
+
else
|
|
1545
|
+
raise Omnizip::DecompressionError,
|
|
1546
|
+
"End-of-payload marker (EOPM) detected but not allowed (LZMA2 streams cannot have EOPM)"
|
|
1547
|
+
end
|
|
1548
|
+
end
|
|
1549
|
+
|
|
1550
|
+
# Validate distance using XZ Utils dict_is_distance_valid pattern
|
|
1551
|
+
# XZ Utils: lzma_decoder.c:876 - dict_is_distance_valid(&dict, rep0)
|
|
1552
|
+
# See lz_decoder.h:194-198: return dict->full > distance;
|
|
1553
|
+
# XZ Utils dict->full = dict->pos - LZ_DICT_INIT_POS (same as our @dict_full)
|
|
1554
|
+
# The distance is valid if: dict->full > distance
|
|
1555
|
+
unless @dict_full > rep0
|
|
1556
|
+
raise Omnizip::DecompressionError,
|
|
1557
|
+
"Invalid distance: #{rep0} (dict_full: #{@dict_full})"
|
|
1558
|
+
end
|
|
1559
|
+
|
|
1560
|
+
# Additional validation: ensure distance doesn't exceed absolute dictionary size
|
|
1561
|
+
# Only validate against absolute dictionary size to prevent buffer overflow
|
|
1562
|
+
# The wrapping logic (below) handles rep0 >= @dict_full cases
|
|
1563
|
+
if rep0 > @dict_size + @dict_full
|
|
1564
|
+
raise Omnizip::DecompressionError,
|
|
1565
|
+
"Invalid distance: #{rep0} exceeds maximum possible (dict_size=#{@dict_size}, dict_full=#{@dict_full})"
|
|
1566
|
+
end
|
|
1567
|
+
|
|
1568
|
+
# IMPORTANT: Limit match length to not exceed uncompressed_size
|
|
1569
|
+
# XZ Utils handles this by setting dict.limit and checking before each write
|
|
1570
|
+
# We need to ensure we don't exceed the target size
|
|
1571
|
+
if @uncompressed_size != 0xFFFFFFFFFFFFFFFF
|
|
1572
|
+
# Calculate how many bytes we can still decode in THIS chunk
|
|
1573
|
+
# @chunk_bytes_decoded is the bytes decoded in this chunk (starts from 0)
|
|
1574
|
+
# @uncompressed_size is the target for THIS chunk (not cumulative)
|
|
1575
|
+
remaining = @uncompressed_size - @chunk_bytes_decoded
|
|
1576
|
+
if length > remaining
|
|
1577
|
+
if ENV["LZMA_DEBUG"] || remaining <= 5
|
|
1578
|
+
puts "DEBUG: Limiting match length from #{length} to #{remaining} (chunk_bytes_decoded=#{@chunk_bytes_decoded}, uncompressed_size=#{@uncompressed_size}, dict_full=#{@dict_full})"
|
|
1579
|
+
end
|
|
1580
|
+
length = remaining
|
|
1581
|
+
end
|
|
1582
|
+
end
|
|
1583
|
+
|
|
1584
|
+
# Copy matched data from dictionary using XZ Utils dict_repeat pattern
|
|
1585
|
+
# See lz_decoder.h:211-213:
|
|
1586
|
+
# back = dict->pos - distance - 1;
|
|
1587
|
+
# if (distance >= dict->pos)
|
|
1588
|
+
# back += dict->size - LZ_DICT_REPEAT_MAX;
|
|
1589
|
+
#
|
|
1590
|
+
# Note: dict->pos in XZ Utils is the actual data position (same as our @dict_full)
|
|
1591
|
+
# Our @pos includes the LZ_DICT_INIT_POS offset, so we use @dict_full for calculations
|
|
1592
|
+
#
|
|
1593
|
+
# dict->size in XZ Utils = dict_size + 2 * LZ_DICT_REPEAT_MAX
|
|
1594
|
+
# Our dict_buf size = @dict_size + LZ_DICT_INIT_POS = @dict_size + 2 * LZ_DICT_REPEAT_MAX
|
|
1595
|
+
back = @dict_full - rep0 - 1
|
|
1596
|
+
if rep0 >= @dict_full
|
|
1597
|
+
# Distance wraps to the end of the dictionary buffer
|
|
1598
|
+
# XZ Utils: back += dict->size - LZ_DICT_REPEAT_MAX;
|
|
1599
|
+
# Our dict_buf size = @dict_size + LZ_DICT_INIT_POS = @dict_size + 2 * LZ_DICT_REPEAT_MAX
|
|
1600
|
+
# So: back += (@dict_size + 2 * LZ_DICT_REPEAT_MAX) - LZ_DICT_REPEAT_MAX
|
|
1601
|
+
# = back + @dict_size + LZ_DICT_REPEAT_MAX
|
|
1602
|
+
back += @dict_size + LZ_DICT_REPEAT_MAX
|
|
1603
|
+
end
|
|
1604
|
+
# Convert to buffer position (add LZ_DICT_INIT_POS offset)
|
|
1605
|
+
buffer_back = back + LZ_DICT_INIT_POS
|
|
1606
|
+
|
|
1607
|
+
# DEBUG: Show buffer position for position 217
|
|
1608
|
+
if old_dict_full.between?(210, 230)
|
|
1609
|
+
XzUtilsDecoderDebug.debug_puts " buffer_back=#{buffer_back}, back=#{back}"
|
|
1610
|
+
bytes_at_back = @dict_buf[buffer_back, 3]
|
|
1611
|
+
bytes_hex = if bytes_at_back.is_a?(String)
|
|
1612
|
+
bytes_at_back.bytes.map do |b|
|
|
1613
|
+
"%02x" % b
|
|
1614
|
+
end.join(" ")
|
|
1615
|
+
else
|
|
1616
|
+
bytes_at_back.map do |b|
|
|
1617
|
+
"%02x" % b
|
|
1618
|
+
end.join(" ")
|
|
1619
|
+
end
|
|
1620
|
+
XzUtilsDecoderDebug.debug_puts " First 3 bytes at buffer_back: #{bytes_hex} (#{bytes_at_back.inspect})"
|
|
1621
|
+
end
|
|
1622
|
+
|
|
1623
|
+
if ENV["LZMA_DEBUG"]
|
|
1624
|
+
b0 = @dict_buf[buffer_back]
|
|
1625
|
+
b1 = @dict_buf[buffer_back + 1]
|
|
1626
|
+
b2 = @dict_buf[buffer_back + 2]
|
|
1627
|
+
b0_str = b0 ? "0x#{b0.to_s(16).upcase}" : "nil"
|
|
1628
|
+
b1_str = b1 ? "0x#{b1.to_s(16).upcase}" : "nil"
|
|
1629
|
+
b2_str = b2 ? "0x#{b2.to_s(16).upcase}" : "nil"
|
|
1630
|
+
b0_chr = b0 ? "'#{b0.chr}'" : "nil"
|
|
1631
|
+
b1_chr = b1 ? "'#{b1.chr}'" : "nil"
|
|
1632
|
+
b2_chr = b2 ? "'#{b2.chr}'" : "nil"
|
|
1633
|
+
warn "DEBUG: copy from buffer_back=#{buffer_back} (distance #{rep0}): #{b0_str} (#{b0_chr}) #{b1_str} (#{b1_chr}) #{b2_str} (#{b2_chr})"
|
|
1634
|
+
warn "DEBUG: pos_before=#{@pos} (output #{@pos - LZ_DICT_INIT_POS}), len=#{length}, pos_after=#{@pos + length} (output #{@pos + length - LZ_DICT_INIT_POS})"
|
|
1635
|
+
# Show what the dictionary contains at key positions (simplified)
|
|
1636
|
+
warn "DEBUG: dict_buf size=#{@dict_buf.size}, allocated=#{@dict_size + 608}"
|
|
1637
|
+
end
|
|
1638
|
+
|
|
1639
|
+
# Copy bytes from dictionary and extend buffer as needed
|
|
1640
|
+
# XZ Utils dict_repeat pattern: dict->buf[dict->pos++] = dict->buf[back++]
|
|
1641
|
+
if ENV["LZMA_DEBUG"] && old_dict_full.between?(220, 260)
|
|
1642
|
+
puts " Copying #{length} bytes from buffer_back=#{buffer_back} to @pos=#{@pos}, dict_full=#{@dict_full}"
|
|
1643
|
+
puts " Source bytes: #{@dict_buf[buffer_back, length].inspect}"
|
|
1644
|
+
puts " First 5 target bytes before copy: #{@dict_buf[@pos,
|
|
1645
|
+
5].inspect}"
|
|
1646
|
+
end
|
|
1647
|
+
length.times do |i|
|
|
1648
|
+
byte = @dict_buf[buffer_back + i]
|
|
1649
|
+
if ENV["LZMA_DEBUG"]
|
|
1650
|
+
warn "DEBUG: copy iteration #{i}: reading dict_buf[#{buffer_back + i}]=0x#{byte.to_s(16).upcase} ('#{byte.chr}'), writing to dict_buf[#{@pos + i}]"
|
|
1651
|
+
end
|
|
1652
|
+
@dict_buf[@pos + i] = byte
|
|
1653
|
+
if ENV["LZMA_DEBUG_ARRAY_WRITE"] && @dict_buf.size != (@dict_size + LZ_DICT_INIT_POS)
|
|
1654
|
+
XzUtilsDecoderDebug.debug_puts "DEBUG: Array expanded during copy! write_pos=#{@pos + i}, byte=#{byte}, old_size=#{@dict_buf.size - 1}, new_size=#{@dict_buf.size}, decoder_id=#{@decoder_id}"
|
|
1655
|
+
end
|
|
1656
|
+
end
|
|
1657
|
+
if ENV["LZMA_DEBUG"] && old_dict_full.between?(220, 230)
|
|
1658
|
+
puts " After copy: #{@dict_buf[@pos, length].inspect}"
|
|
1659
|
+
end
|
|
1660
|
+
|
|
1661
|
+
# Update state and position
|
|
1662
|
+
@state.update_match
|
|
1663
|
+
warn "DEBUG: After update_match - state=#{@state.value}" if ENV["LZMA_DEBUG"]
|
|
1664
|
+
@pos += length
|
|
1665
|
+
|
|
1666
|
+
# Update dict_full (XZ Utils pattern)
|
|
1667
|
+
# When dict_full reaches dict_size, the dictionary is full
|
|
1668
|
+
# After that, dict_full stays at dict_size and has_wrapped = true
|
|
1669
|
+
unless @has_wrapped
|
|
1670
|
+
@dict_full = @pos - LZ_DICT_INIT_POS
|
|
1671
|
+
# Check if we've reached the maximum dictionary size
|
|
1672
|
+
if @dict_full >= @dict_size
|
|
1673
|
+
@has_wrapped = true
|
|
1674
|
+
@dict_full = @dict_size
|
|
1675
|
+
end
|
|
1676
|
+
end
|
|
1677
|
+
|
|
1678
|
+
# Track bytes decoded in this chunk (for match length limiting)
|
|
1679
|
+
# IMPORTANT: Increment by length for match copies (multiple bytes at once)
|
|
1680
|
+
# This is needed for correct match length limiting when @uncompressed_size is set.
|
|
1681
|
+
# XZ Utils uses dict.limit for this, but we use @chunk_bytes_decoded.
|
|
1682
|
+
if @uncompressed_size != 0xFFFFFFFFFFFFFFFF
|
|
1683
|
+
@chunk_bytes_decoded += length
|
|
1684
|
+
end
|
|
1685
|
+
|
|
1686
|
+
# Update rep distances - rotate and set new rep0
|
|
1687
|
+
# SDK rotation: rep3←rep2, rep2←rep1, rep1←rep0, rep0←rep0
|
|
1688
|
+
# XZ Utils stores the actual distance in rep0 (no +1)
|
|
1689
|
+
if ENV["LZMA_DEBUG"]
|
|
1690
|
+
warn "DEBUG: Before rotation - rep0/1/2/3=(#{@rep0},#{@rep1},#{@rep2},#{@rep3}), new distance=#{rep0}"
|
|
1691
|
+
end
|
|
1692
|
+
|
|
1693
|
+
# DEBUG: Trace rep rotation for position 224
|
|
1694
|
+
if ENV["LZMA_DEBUG"] && old_dict_full.between?(220, 230)
|
|
1695
|
+
puts "\n=== Rep rotation after match at dict_full=#{old_dict_full} ==="
|
|
1696
|
+
puts " Before: rep0/1/2/3=(#{@rep0},#{@rep1},#{@rep2},#{@rep3})"
|
|
1697
|
+
puts " Setting rep0 to: #{rep0.inspect}"
|
|
1698
|
+
end
|
|
1699
|
+
|
|
1700
|
+
@rep3 = @rep2
|
|
1701
|
+
@rep2 = @rep1
|
|
1702
|
+
@rep1 = @rep0
|
|
1703
|
+
@rep0 = rep0
|
|
1704
|
+
|
|
1705
|
+
if ENV["LZMA_DEBUG"]
|
|
1706
|
+
warn "DEBUG: After rotation - rep0/1/2/3=(#{@rep0},#{@rep1},#{@rep2},#{@rep3})"
|
|
1707
|
+
end
|
|
1708
|
+
|
|
1709
|
+
# DEBUG: Show final rep values
|
|
1710
|
+
if ENV["LZMA_DEBUG"] && old_dict_full.between?(220, 230)
|
|
1711
|
+
puts " After: rep0/1/2/3=(#{@rep0},#{@rep1},#{@rep2},#{@rep3})"
|
|
1712
|
+
end
|
|
1713
|
+
|
|
1714
|
+
# DEBUG: Verify rep0 is actually set
|
|
1715
|
+
if old_dict_full.between?(220, 230)
|
|
1716
|
+
actual_rep0 = @rep0
|
|
1717
|
+
XzUtilsDecoderDebug.debug_puts " VERIFICATION: @rep0=#{actual_rep0.inspect}, @rep0.object_id=#{@rep0.object_id}"
|
|
1718
|
+
end
|
|
1719
|
+
|
|
1720
|
+
# DEBUG: Trace range/code state after match at dict_full 56-62
|
|
1721
|
+
if ENV.fetch("LZMA_DEBUG",
|
|
1722
|
+
nil) && old_dict_full >= 56 && old_dict_full <= 62
|
|
1723
|
+
range_after = @range_decoder.instance_variable_get(:@range)
|
|
1724
|
+
code_after = @range_decoder.instance_variable_get(:@code)
|
|
1725
|
+
XzUtilsDecoderDebug.debug_puts " AFTER match (dict_full #{old_dict_full}→#{@dict_full}): range=0x#{range_after.to_s(16).upcase}, code=0x#{code_after.to_s(16).upcase}"
|
|
1726
|
+
end
|
|
1727
|
+
|
|
1728
|
+
false # Not EOS, continue decoding
|
|
1729
|
+
end
|
|
1730
|
+
|
|
1731
|
+
# Decode a rep match
#
# Rep match decoding (XZ Utils lzma_decoder.c):
# - is_rep0 == 0: use rep0
#   - is_rep0_long == 0: short rep (length=1, no rotation)
#   - is_rep0_long == 1: long rep (decode length, keep rep0)
# - is_rep1 == 0: use rep1, swapping it with rep0
# - is_rep2 == 0: use rep2, rotating rep2→rep0
# - otherwise: use rep3, rotating rep3→rep0
# After the rotation @rep0 always holds the distance to copy from.
#
# Diagnostics are opt-in through LZMA_DEBUG: the flag is read once per
# call, raw traces go to stderr via warn, and nothing is printed unless
# the flag is set (XzUtilsDecoderDebug.debug_puts calls keep whatever
# gating that helper applies).
#
# @param pos_state [Integer] Position state
# @return [Boolean] Always false (rep matches are never EOS)
# @raise [Omnizip::DecompressionError] if the selected distance is not
#   smaller than @dict_full (same error class as decode_regular_match)
def decode_rep_match(pos_state)
  debug = ENV["LZMA_DEBUG"]
  # Only built when tracing so the normal path allocates nothing extra.
  describe = if debug
               lambda do |b|
                 next b.inspect unless b.is_a?(Integer)

                 "#{b} (0x#{b.to_s(16)} '#{b.between?(0, 255) ? b.chr : '?'}')"
               end
             end

  # @dict_full is not modified until the copy has finished, so these
  # window tests hold for every trace emitted before that point.
  old_dict_full = @dict_full
  old_rep0 = @rep0
  in_210_230 = old_dict_full.between?(210, 230)
  in_220_230 = old_dict_full.between?(220, 230)

  if debug
    warn "DEBUG: decode_rep_match START[#{@decoder_id}] - rep0/1/2/3=(#{@rep0},#{@rep1},#{@rep2},#{@rep3})"
  end

  if in_220_230
    XzUtilsDecoderDebug.debug_puts "\n=== decode_rep_match at dict_full=#{old_dict_full} (decoder_id=#{@decoder_id}) ==="
    XzUtilsDecoderDebug.debug_puts " At START: rep0/1/2/3=(#{@rep0},#{@rep1},#{@rep2},#{@rep3})"
    XzUtilsDecoderDebug.debug_puts " old_rep0=#{old_rep0} (captured @rep0)"
    XzUtilsDecoderDebug.debug_puts " @rep0.object_id=#{@rep0.object_id}"
  end

  # Decode which rep distance to use
  is_rep0 = @range_decoder.decode_bit(@is_rep0_models[@state.value])

  if debug
    if in_220_230
      warn "DEBUG rep match selection at dict_full=#{@dict_full}: is_rep0=#{is_rep0}, rep0/1/2/3 before=(#{@rep0},#{@rep1},#{@rep2},#{@rep3})"
      warn " state.value=#{@state.value}, pos_state=#{pos_state}, model_index=#{(@state.value * (1 << @pb)) + pos_state}"
    end
    warn "DEBUG: decode_rep_match - is_rep0=#{is_rep0}"
  end

  if is_rep0.zero?
    # Use rep0
    warn "DEBUG rep match using rep0" if debug && in_220_230
    # XZ Utils: is_rep0_long[state][pos_state], flattened to
    # NUM_STATES * (1 << pb) entries
    is_rep0_long = @range_decoder.decode_bit(
      @is_rep0_long_models[(@state.value * (1 << @pb)) + pos_state],
    )
    warn "DEBUG: decode_rep_match - is_rep0_long=#{is_rep0_long}" if debug

    if is_rep0_long.zero?
      # Short rep (length=1)
      length = 1
      @state.update_short_rep
    else
      # Long rep with rep0
      length = @rep_length_coder.decode(@range_decoder, pos_state) + MATCH_LEN_MIN
      @state.update_rep
    end
  else
    # Not rep0, check rep1/rep2/rep3
    warn "DEBUG rep match NOT using rep0 (is_rep0=#{is_rep0})" if debug && in_220_230

    # Parallel assignment evaluates the right-hand side first, which gives
    # the XZ Utils "distance = repN; ...; rep0 = distance" rotation.
    if @range_decoder.decode_bit(@is_rep1_models[@state.value]).zero?
      @rep1, @rep0 = @rep0, @rep1
    elsif @range_decoder.decode_bit(@is_rep2_models[@state.value]).zero?
      @rep2, @rep1, @rep0 = @rep1, @rep0, @rep2
    else
      @rep3, @rep2, @rep1, @rep0 = @rep2, @rep1, @rep0, @rep3
    end

    # Decode length for rep1/2/3
    length = @rep_length_coder.decode(@range_decoder, pos_state) + MATCH_LEN_MIN
    @state.update_rep
  end

  # After rotation, rep0 always contains the distance to use
  # (XZ Utils stores distances without +1 offset)
  distance = @rep0

  if debug && in_220_230
    warn "DEBUG rep match after rotation: dict_full=#{old_dict_full}, distance=#{distance}, rep0/1/2/3=(#{@rep0},#{@rep1},#{@rep2},#{@rep3})"
  end

  if in_210_230
    XzUtilsDecoderDebug.debug_puts "\n=== decode_rep_match at dict_full=#{old_dict_full} ==="
    XzUtilsDecoderDebug.debug_puts " old_rep0=#{old_rep0}, new rep0=#{@rep0} (distance=#{distance})"
    XzUtilsDecoderDebug.debug_puts " pos_state=#{pos_state}"
  end

  if debug
    warn "DEBUG: decode_rep_match - length=#{length}, distance=#{distance}, dict_full=#{@dict_full}, rep0/1/2/3=(#{@rep0},#{@rep1},#{@rep2},#{@rep3})"
  end

  # XZ Utils dict_is_distance_valid: return dict->full > distance;
  # distance=0 is valid as soon as at least one byte is available.
  unless @dict_full > distance
    raise Omnizip::DecompressionError,
          "Invalid rep distance: #{distance} (dict_full: #{@dict_full})"
  end

  # Limit match length so the chunk never exceeds @uncompressed_size.
  # @chunk_bytes_decoded counts bytes produced in this chunk (starts at 0).
  if @uncompressed_size != 0xFFFFFFFFFFFFFFFF
    remaining = @uncompressed_size - @chunk_bytes_decoded
    if length > remaining
      if debug
        warn "DEBUG REP: Limiting rep match length from #{length} to #{remaining} (chunk_bytes_decoded=#{@chunk_bytes_decoded}, uncompressed_size=#{@uncompressed_size}, dict_full=#{@dict_full})"
      end
      length = remaining
    end
  end

  # Source position, XZ Utils dict_repeat: back = dict->pos - distance - 1.
  # The validation above guarantees distance < @dict_full, so back is never
  # negative and no wrap-around adjustment can apply here.
  back = @dict_full - distance - 1
  # Convert to buffer position (add LZ_DICT_INIT_POS offset)
  buffer_back = back + LZ_DICT_INIT_POS

  if debug && in_220_230
    warn "DEBUG rep match copy at dict_full=#{@dict_full}: @dict_full=#{@dict_full}, distance=#{distance}, back=#{back}, buffer_back=#{buffer_back}"
  end

  if in_210_230
    XzUtilsDecoderDebug.debug_puts " back calculation: @dict_full=#{@dict_full}, distance=#{distance}"
    XzUtilsDecoderDebug.debug_puts " back=#{back}, buffer_back=#{buffer_back}"
    bytes_at_back = @dict_buf[buffer_back, 3]
    raw_bytes = bytes_at_back.is_a?(String) ? bytes_at_back.bytes : Array(bytes_at_back)
    # Slots that were never written may be nil; show them instead of raising.
    bytes_hex = raw_bytes.map { |b| b.is_a?(Integer) ? format("%02x", b) : b.inspect }.join(" ")
    XzUtilsDecoderDebug.debug_puts " First 3 bytes at buffer_back: #{bytes_hex} (#{bytes_at_back.inspect})"
  end

  if debug && old_dict_full.between?(250, 260)
    warn " Rep match copy at dict_full=#{@dict_full}: length=#{length}, distance=#{distance}, @pos=#{@pos} (will write to #{@pos}...#{@pos + length - 1})"
    warn " Reading from @pos-1=#{@pos - 1}, source byte = #{describe.call(@dict_buf[@pos - 1])}"
    warn " Before copy: @dict_buf[#{@pos}...#{@pos + length - 1}] = #{@dict_buf[@pos, length].inspect}"
  end

  # Snapshot trace taken before the first byte is written (previously
  # evaluated inside the copy loop on its first iteration).
  if debug && old_dict_full == 227 && length.positive?
    warn "DEBUG dict_copy at dict_full=227, i=0:"
    warn " buffer_back=#{buffer_back}, byte=#{describe.call(@dict_buf[buffer_back])}"
    warn " Writing to @pos=#{@pos}"
    warn " dict_buf[buffer_back...buffer_back+10] = #{@dict_buf[buffer_back, 10].inspect}"
    warn " dict_buf[buffer_back+1=#{buffer_back + 1}] = #{describe.call(@dict_buf[buffer_back + 1])}"
    prev_5 = if buffer_back > 4
               @dict_buf[(buffer_back - 5)..(buffer_back - 1)].map { |b| describe.call(b) }.join(", ")
             else
               "N/A"
             end
    warn " Previous 5 bytes: [#{prev_5}]"
    warn " Current dict_full=#{@dict_full}, @pos=#{@pos}"
  end

  # XZ Utils dict_repeat pattern: dict->buf[dict->pos++] = dict->buf[back++]
  # Copy one byte at a time: source and target may overlap, and each
  # freshly written byte can be a source for a later iteration.
  length.times do |i|
    @dict_buf[@pos + i] = @dict_buf[buffer_back + i]
  end

  # Update position
  @pos += length

  # Update dict_full (XZ Utils pattern): once it reaches dict_size the
  # dictionary is full; it then stays at dict_size with has_wrapped = true.
  unless @has_wrapped
    @dict_full = @pos - LZ_DICT_INIT_POS
    if @dict_full >= @dict_size
      @has_wrapped = true
      @dict_full = @dict_size
    end
  end

  # Track bytes decoded in this chunk (drives the length limiting above).
  @chunk_bytes_decoded += length if @uncompressed_size != 0xFFFFFFFFFFFFFFFF

  false # Rep matches are never EOS
end
|
|
1995
|
+
|
|
1996
|
+
# Calculate literal state index
|
|
1997
|
+
# XZ Utils literal_subcoder formula (from lzma_common.h:141-143):
|
|
1998
|
+
# ((probs) + 3 * (((((pos) << 8) + (prev_byte)) & (literal_mask)) << (lc)))
|
|
1999
|
+
# where literal_mask = (0x100 << lp) - (0x100 >> lc)   (literal_mask_calc)
|
|
2000
|
+
#
|
|
2001
|
+
# The key insight is that (pos << 8) + prev_byte is computed FIRST,
|
|
2002
|
+
# then masked, THEN shifted by lc. This is different from our old formula
|
|
2003
|
+
# which added pos_part and prev_part separately.
|
|
2004
|
+
#
|
|
2005
|
+
# IMPORTANT: The literal_subcoder macro returns:
|
|
2006
|
+
# probs + 3 * context_value_shifted
|
|
2007
|
+
# where context_value_shifted = context_value << lc
|
|
2008
|
+
#
|
|
2009
|
+
# For our implementation, we return context_value (unshifted) so that
|
|
2010
|
+
# the literal decoder can calculate the correct offset: 3 * context_value
|
|
2011
|
+
#
|
|
2012
|
+
# This creates (1 << (lc + lp)) unique contexts
|
|
2013
|
+
#
|
|
2014
|
+
# @return [Integer] Masked literal context value, not yet shifted by lc (for lc=3, lp=0 the mask is 0xE0, so a multiple of 32 in 0..224)
|
|
2015
|
+
def calculate_literal_state
  # Builds the literal-coder context from the number of decoded bytes
  # (@dict_full) and the most recently written dictionary byte, masked the
  # same way as XZ Utils' literal_mask_calc.
  #
  # Returns the masked value WITHOUT the final "<< lc" shift. For lc=3,
  # lp=0 the mask is 0xE0, so the result is one of 0, 32, 64, ..., 224.
  #
  # Raises RuntimeError only when the previous byte is nil and the
  # LZMA_DEBUG_NIL_BYTE environment variable is set.

  # Read the trace switch once; it gates both debug dumps below.
  trace_enabled = ENV["LZMA_DEBUG_CALC_STATE"]

  # Optional dump of the buffer state before it is read. Only emitted at
  # the moment exactly 8 bytes have been decoded.
  if trace_enabled && @dict_full == 8
    XzUtilsDecoderDebug.debug_puts "DEBUG before calc_state[#{@decoder_id}]: pos=#{@pos}, dict_full=#{@dict_full}"
    XzUtilsDecoderDebug.debug_puts " @dict_buf.object_id=#{@dict_buf.object_id}, size=#{@dict_buf.size}"
    XzUtilsDecoderDebug.debug_puts " Accessing index #{@pos - 1}: value=#{@dict_buf[@pos - 1].inspect}"
  end

  # XZ Utils dict_get0 pattern: dict->buf[dict->pos - 1].
  # Before anything has been decoded there is no previous byte, so use 0.
  prev_byte = @dict_full.positive? ? @dict_buf[@pos - 1] : 0

  # Safeguard: a nil slot is treated as 0 so decoding can continue, unless
  # LZMA_DEBUG_NIL_BYTE is set, in which case fail loudly with diagnostics.
  if prev_byte.nil?
    if ENV["LZMA_DEBUG_NIL_BYTE"]
      raise "DEBUG: prev_byte is nil! decoder_id=#{@decoder_id}, @pos=#{@pos}, @dict_full=#{@dict_full}, @dict_buf.size=#{@dict_buf&.size || 'nil'}, accessing index #{@pos - 1}, nil_count=#{@dict_buf&.count(nil) || 'N/A'}, @dict_buf.object_id=#{@dict_buf&.object_id || 'nil'}"
    end

    prev_byte = 0
  end

  if trace_enabled
    XzUtilsDecoderDebug.debug_puts "DEBUG calc_state[#{@decoder_id}]: pos=#{@pos}, dict_full=#{@dict_full}, @dict_buf.object_id=#{@dict_buf.object_id}, prev_byte=#{prev_byte}"
  end

  # XZ Utils literal_mask_calc (lzma_common.h):
  #   literal_mask = (0x100 << lp) - (0x100 >> lc)
  # For lc=3, lp=0: (256 << 0) - (256 >> 3) = 256 - 32 = 224 (0xE0)
  literal_mask = (0x100 << @lp) - (0x100 >> @lc)

  # @dict_full is used as the position term instead of @pos because @pos
  # starts at LZ_DICT_INIT_POS rather than 0.
  # NOTE(review): the dictionary-copy code above stops advancing @dict_full
  # once the dictionary has wrapped; confirm the position bits used for
  # lp > 0 are still correct after that point.
  ((@dict_full << 8) + prev_byte) & literal_mask
end
|
|
2053
|
+
end
|
|
2054
|
+
end
|
|
2055
|
+
end
|