omnizip 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (511) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +32 -0
  4. data/.rubocop_todo.yml +754 -0
  5. data/COPYING +502 -0
  6. data/Gemfile +17 -0
  7. data/LICENSE +12 -0
  8. data/README.adoc +1045 -0
  9. data/Rakefile +12 -0
  10. data/benchmark/README.md +260 -0
  11. data/benchmark/benchmark_suite.rb +125 -0
  12. data/benchmark/compression_bench.rb +181 -0
  13. data/benchmark/filter_bench.rb +180 -0
  14. data/benchmark/models/benchmark_result.rb +59 -0
  15. data/benchmark/models/comparison_result.rb +69 -0
  16. data/benchmark/profile_suite.rb +167 -0
  17. data/benchmark/reporter.rb +150 -0
  18. data/benchmark/run_benchmarks.rb +66 -0
  19. data/benchmark/test_data.rb +137 -0
  20. data/config/formats/rar3_spec.yml +91 -0
  21. data/config/formats/rar5_spec.yml +102 -0
  22. data/docs/.github/workflows/docs.yml +142 -0
  23. data/docs/.gitignore +21 -0
  24. data/docs/.lychee.toml +67 -0
  25. data/docs/Gemfile +13 -0
  26. data/docs/RAR_WRITE_SUPPORT.md +26 -0
  27. data/docs/README.md +101 -0
  28. data/docs/_config.yml +112 -0
  29. data/docs/assets/logo.svg +1 -0
  30. data/docs/assets/omnizip-logo.pdf +1540 -11
  31. data/docs/comparison/feature-matrix.adoc +694 -0
  32. data/docs/comparison/index.adoc +113 -0
  33. data/docs/comparison/vs-7zip.adoc +309 -0
  34. data/docs/comparison/vs-peazip.adoc +77 -0
  35. data/docs/comparison/vs-rubyzip.adoc +342 -0
  36. data/docs/comparison/vs-winrar.adoc +100 -0
  37. data/docs/compatibility.adoc +579 -0
  38. data/docs/concepts/index.adoc +129 -0
  39. data/docs/developer/architecture.adoc +256 -0
  40. data/docs/developer/contributing.adoc +158 -0
  41. data/docs/developer/index.adoc +25 -0
  42. data/docs/developer/testing.adoc +212 -0
  43. data/docs/getting-started/basic-usage.adoc +271 -0
  44. data/docs/getting-started/index.adoc +42 -0
  45. data/docs/getting-started/installation.adoc +138 -0
  46. data/docs/getting-started/quick-start.adoc +185 -0
  47. data/docs/getting-started/your-first-archive.adoc +218 -0
  48. data/docs/guides/advanced-features/encryption.adoc +300 -0
  49. data/docs/guides/advanced-features/index.adoc +49 -0
  50. data/docs/guides/advanced-features/parallel-processing.adoc +246 -0
  51. data/docs/guides/advanced-features/progress-tracking.adoc +320 -0
  52. data/docs/guides/advanced-features/streaming.adoc +212 -0
  53. data/docs/guides/archive-formats/gzip-format.adoc +107 -0
  54. data/docs/guides/archive-formats/index.adoc +130 -0
  55. data/docs/guides/archive-formats/rar-format.adoc +104 -0
  56. data/docs/guides/archive-formats/rar5.adoc +521 -0
  57. data/docs/guides/archive-formats/seven-zip-format.adoc +35 -0
  58. data/docs/guides/archive-formats/tar-format.adoc +106 -0
  59. data/docs/guides/archive-formats/xz-format.adoc +118 -0
  60. data/docs/guides/archive-formats/zip-format.adoc +35 -0
  61. data/docs/guides/compression-algorithms/bzip2.adoc +113 -0
  62. data/docs/guides/compression-algorithms/deflate.adoc +319 -0
  63. data/docs/guides/compression-algorithms/index.adoc +190 -0
  64. data/docs/guides/compression-algorithms/lzma.adoc +398 -0
  65. data/docs/guides/compression-algorithms/lzma2.adoc +327 -0
  66. data/docs/guides/compression-algorithms/ppmd.adoc +316 -0
  67. data/docs/guides/compression-algorithms/zstandard.adoc +361 -0
  68. data/docs/guides/creating-archives.adoc +354 -0
  69. data/docs/guides/extracting-archives.adoc +53 -0
  70. data/docs/guides/format-conversion.adoc +64 -0
  71. data/docs/guides/index.adoc +49 -0
  72. data/docs/guides/migration-rubyzip.adoc +217 -0
  73. data/docs/guides/parity-archives.adoc +605 -0
  74. data/docs/guides/performance-tuning.adoc +88 -0
  75. data/docs/index.adoc +218 -0
  76. data/docs/lychee.toml +67 -0
  77. data/docs/reference/api/overview.adoc +188 -0
  78. data/docs/reference/cli/compress-command.adoc +114 -0
  79. data/docs/reference/cli/overview.adoc +140 -0
  80. data/docs/reference/index.adoc +26 -0
  81. data/docs/resources/faq.adoc +185 -0
  82. data/docs/resources/quick-reference.adoc +222 -0
  83. data/docs/troubleshooting/index.adoc +208 -0
  84. data/examples/api_comparison.rb +205 -0
  85. data/examples/deflate64_example.rb +96 -0
  86. data/examples/par2_demo.rb +121 -0
  87. data/examples/quick_start_native.rb +150 -0
  88. data/examples/quick_start_rubyzip.rb +115 -0
  89. data/examples/rubyzip_compatibility_demo.rb +194 -0
  90. data/exe/omnizip +27 -0
  91. data/lib/omnizip/algorithm.rb +130 -0
  92. data/lib/omnizip/algorithm_registry.rb +86 -0
  93. data/lib/omnizip/algorithms/.keep +0 -0
  94. data/lib/omnizip/algorithms/bzip2/bwt.rb +225 -0
  95. data/lib/omnizip/algorithms/bzip2/decoder.rb +193 -0
  96. data/lib/omnizip/algorithms/bzip2/encoder.rb +237 -0
  97. data/lib/omnizip/algorithms/bzip2/huffman.rb +206 -0
  98. data/lib/omnizip/algorithms/bzip2/mtf.rb +101 -0
  99. data/lib/omnizip/algorithms/bzip2/rle.rb +151 -0
  100. data/lib/omnizip/algorithms/bzip2.rb +130 -0
  101. data/lib/omnizip/algorithms/deflate/constants.rb +28 -0
  102. data/lib/omnizip/algorithms/deflate/decoder.rb +38 -0
  103. data/lib/omnizip/algorithms/deflate/encoder.rb +46 -0
  104. data/lib/omnizip/algorithms/deflate.rb +128 -0
  105. data/lib/omnizip/algorithms/deflate64/constants.rb +45 -0
  106. data/lib/omnizip/algorithms/deflate64/decoder.rb +153 -0
  107. data/lib/omnizip/algorithms/deflate64/encoder.rb +98 -0
  108. data/lib/omnizip/algorithms/deflate64/huffman_coder.rb +354 -0
  109. data/lib/omnizip/algorithms/deflate64/lz77_encoder.rb +142 -0
  110. data/lib/omnizip/algorithms/deflate64.rb +109 -0
  111. data/lib/omnizip/algorithms/lzma/bit_model.rb +120 -0
  112. data/lib/omnizip/algorithms/lzma/constants.rb +112 -0
  113. data/lib/omnizip/algorithms/lzma/decoder.rb +148 -0
  114. data/lib/omnizip/algorithms/lzma/dictionary.rb +69 -0
  115. data/lib/omnizip/algorithms/lzma/distance_coder.rb +415 -0
  116. data/lib/omnizip/algorithms/lzma/encoder.rb +142 -0
  117. data/lib/omnizip/algorithms/lzma/length_coder.rb +260 -0
  118. data/lib/omnizip/algorithms/lzma/literal_decoder.rb +320 -0
  119. data/lib/omnizip/algorithms/lzma/literal_encoder.rb +210 -0
  120. data/lib/omnizip/algorithms/lzma/lzip_decoder.rb +341 -0
  121. data/lib/omnizip/algorithms/lzma/lzma_alone_decoder.rb +192 -0
  122. data/lib/omnizip/algorithms/lzma/lzma_state.rb +128 -0
  123. data/lib/omnizip/algorithms/lzma/match.rb +32 -0
  124. data/lib/omnizip/algorithms/lzma/match_finder.rb +205 -0
  125. data/lib/omnizip/algorithms/lzma/match_finder_config.rb +142 -0
  126. data/lib/omnizip/algorithms/lzma/match_finder_factory.rb +88 -0
  127. data/lib/omnizip/algorithms/lzma/optimal_encoder.rb +130 -0
  128. data/lib/omnizip/algorithms/lzma/probability_models.rb +72 -0
  129. data/lib/omnizip/algorithms/lzma/range_coder.rb +85 -0
  130. data/lib/omnizip/algorithms/lzma/range_decoder.rb +434 -0
  131. data/lib/omnizip/algorithms/lzma/range_encoder.rb +194 -0
  132. data/lib/omnizip/algorithms/lzma/state.rb +127 -0
  133. data/lib/omnizip/algorithms/lzma/xz_buffered_range_encoder.rb +325 -0
  134. data/lib/omnizip/algorithms/lzma/xz_encoder.rb +426 -0
  135. data/lib/omnizip/algorithms/lzma/xz_encoder_fast.rb +645 -0
  136. data/lib/omnizip/algorithms/lzma/xz_match_finder_adapter.rb +227 -0
  137. data/lib/omnizip/algorithms/lzma/xz_price_calculator.rb +169 -0
  138. data/lib/omnizip/algorithms/lzma/xz_probability_models.rb +261 -0
  139. data/lib/omnizip/algorithms/lzma/xz_range_encoder.rb +223 -0
  140. data/lib/omnizip/algorithms/lzma/xz_range_encoder_exact.rb +331 -0
  141. data/lib/omnizip/algorithms/lzma/xz_state.rb +116 -0
  142. data/lib/omnizip/algorithms/lzma/xz_utils_decoder.rb +2055 -0
  143. data/lib/omnizip/algorithms/lzma.rb +238 -0
  144. data/lib/omnizip/algorithms/lzma2/chunk_manager.rb +182 -0
  145. data/lib/omnizip/algorithms/lzma2/constants.rb +41 -0
  146. data/lib/omnizip/algorithms/lzma2/encoder.rb +147 -0
  147. data/lib/omnizip/algorithms/lzma2/lzma2_chunk.rb +161 -0
  148. data/lib/omnizip/algorithms/lzma2/properties.rb +179 -0
  149. data/lib/omnizip/algorithms/lzma2/simple_lzma2_encoder.rb +127 -0
  150. data/lib/omnizip/algorithms/lzma2/xz_encoder_adapter.rb +85 -0
  151. data/lib/omnizip/algorithms/lzma2.rb +141 -0
  152. data/lib/omnizip/algorithms/ppmd7/constants.rb +74 -0
  153. data/lib/omnizip/algorithms/ppmd7/context.rb +154 -0
  154. data/lib/omnizip/algorithms/ppmd7/decoder.rb +126 -0
  155. data/lib/omnizip/algorithms/ppmd7/encoder.rb +163 -0
  156. data/lib/omnizip/algorithms/ppmd7/model.rb +248 -0
  157. data/lib/omnizip/algorithms/ppmd7/symbol_state.rb +57 -0
  158. data/lib/omnizip/algorithms/ppmd7.rb +116 -0
  159. data/lib/omnizip/algorithms/ppmd8/constants.rb +61 -0
  160. data/lib/omnizip/algorithms/ppmd8/context.rb +34 -0
  161. data/lib/omnizip/algorithms/ppmd8/decoder.rb +107 -0
  162. data/lib/omnizip/algorithms/ppmd8/encoder.rb +138 -0
  163. data/lib/omnizip/algorithms/ppmd8/model.rb +250 -0
  164. data/lib/omnizip/algorithms/ppmd8/restoration_method.rb +78 -0
  165. data/lib/omnizip/algorithms/ppmd8.rb +82 -0
  166. data/lib/omnizip/algorithms/ppmd_base.rb +138 -0
  167. data/lib/omnizip/algorithms/sevenzip_lzma2.rb +123 -0
  168. data/lib/omnizip/algorithms/xz_lzma2.rb +118 -0
  169. data/lib/omnizip/algorithms/zstandard/constants.rb +25 -0
  170. data/lib/omnizip/algorithms/zstandard/decoder.rb +46 -0
  171. data/lib/omnizip/algorithms/zstandard/encoder.rb +51 -0
  172. data/lib/omnizip/algorithms/zstandard.rb +138 -0
  173. data/lib/omnizip/buffer/memory_archive.rb +251 -0
  174. data/lib/omnizip/buffer/memory_extractor.rb +224 -0
  175. data/lib/omnizip/buffer.rb +176 -0
  176. data/lib/omnizip/checksum_registry.rb +114 -0
  177. data/lib/omnizip/checksums/crc32.rb +100 -0
  178. data/lib/omnizip/checksums/crc64.rb +101 -0
  179. data/lib/omnizip/checksums/crc_base.rb +158 -0
  180. data/lib/omnizip/checksums/verifier.rb +131 -0
  181. data/lib/omnizip/chunked/memory_manager.rb +194 -0
  182. data/lib/omnizip/chunked/reader.rb +78 -0
  183. data/lib/omnizip/chunked/writer.rb +120 -0
  184. data/lib/omnizip/chunked.rb +129 -0
  185. data/lib/omnizip/cli/output_formatter.rb +104 -0
  186. data/lib/omnizip/cli.rb +572 -0
  187. data/lib/omnizip/commands/.keep +0 -0
  188. data/lib/omnizip/commands/archive_create_command.rb +427 -0
  189. data/lib/omnizip/commands/archive_extract_command.rb +272 -0
  190. data/lib/omnizip/commands/archive_list_command.rb +218 -0
  191. data/lib/omnizip/commands/archive_repair_command.rb +131 -0
  192. data/lib/omnizip/commands/archive_verify_command.rb +117 -0
  193. data/lib/omnizip/commands/compress_command.rb +117 -0
  194. data/lib/omnizip/commands/decompress_command.rb +120 -0
  195. data/lib/omnizip/commands/list_command.rb +53 -0
  196. data/lib/omnizip/commands/metadata_command.rb +153 -0
  197. data/lib/omnizip/commands/parity_create_command.rb +122 -0
  198. data/lib/omnizip/commands/parity_repair_command.rb +122 -0
  199. data/lib/omnizip/commands/parity_verify_command.rb +124 -0
  200. data/lib/omnizip/commands/profile_list_command.rb +56 -0
  201. data/lib/omnizip/commands/profile_show_command.rb +44 -0
  202. data/lib/omnizip/convenience.rb +359 -0
  203. data/lib/omnizip/converter/conversion_registry.rb +49 -0
  204. data/lib/omnizip/converter/conversion_strategy.rb +121 -0
  205. data/lib/omnizip/converter/seven_zip_to_zip_strategy.rb +97 -0
  206. data/lib/omnizip/converter/zip_to_seven_zip_strategy.rb +112 -0
  207. data/lib/omnizip/converter.rb +105 -0
  208. data/lib/omnizip/crypto/aes256/cipher.rb +100 -0
  209. data/lib/omnizip/crypto/aes256/constants.rb +28 -0
  210. data/lib/omnizip/crypto/aes256/key_derivation.rb +101 -0
  211. data/lib/omnizip/crypto/aes256.rb +102 -0
  212. data/lib/omnizip/error.rb +106 -0
  213. data/lib/omnizip/eta/exponential_smoothing_estimator.rb +98 -0
  214. data/lib/omnizip/eta/moving_average_estimator.rb +99 -0
  215. data/lib/omnizip/eta/rate_calculator.rb +104 -0
  216. data/lib/omnizip/eta/sample_history.rb +143 -0
  217. data/lib/omnizip/eta/time_estimator.rb +106 -0
  218. data/lib/omnizip/eta.rb +63 -0
  219. data/lib/omnizip/extraction/filter_chain.rb +177 -0
  220. data/lib/omnizip/extraction/glob_pattern.rb +140 -0
  221. data/lib/omnizip/extraction/pattern_matcher.rb +70 -0
  222. data/lib/omnizip/extraction/predicate_pattern.rb +52 -0
  223. data/lib/omnizip/extraction/regex_pattern.rb +50 -0
  224. data/lib/omnizip/extraction/selective_extractor.rb +240 -0
  225. data/lib/omnizip/extraction.rb +111 -0
  226. data/lib/omnizip/file_type/mime_classifier.rb +144 -0
  227. data/lib/omnizip/file_type.rb +113 -0
  228. data/lib/omnizip/filter.rb +139 -0
  229. data/lib/omnizip/filter_pipeline.rb +108 -0
  230. data/lib/omnizip/filter_registry.rb +166 -0
  231. data/lib/omnizip/filters/bcj.rb +279 -0
  232. data/lib/omnizip/filters/bcj2/constants.rb +53 -0
  233. data/lib/omnizip/filters/bcj2/decoder.rb +200 -0
  234. data/lib/omnizip/filters/bcj2/encoder.rb +61 -0
  235. data/lib/omnizip/filters/bcj2/stream_data.rb +93 -0
  236. data/lib/omnizip/filters/bcj2.rb +99 -0
  237. data/lib/omnizip/filters/bcj_arm.rb +176 -0
  238. data/lib/omnizip/filters/bcj_arm64.rb +244 -0
  239. data/lib/omnizip/filters/bcj_ia64.rb +196 -0
  240. data/lib/omnizip/filters/bcj_ppc.rb +190 -0
  241. data/lib/omnizip/filters/bcj_sparc.rb +176 -0
  242. data/lib/omnizip/filters/bcj_x86.rb +193 -0
  243. data/lib/omnizip/filters/delta.rb +196 -0
  244. data/lib/omnizip/filters/filter_base.rb +72 -0
  245. data/lib/omnizip/filters/registry.rb +123 -0
  246. data/lib/omnizip/filters/xz_delta.rb +258 -0
  247. data/lib/omnizip/format_detector.rb +162 -0
  248. data/lib/omnizip/format_registry.rb +59 -0
  249. data/lib/omnizip/formats/.keep +0 -0
  250. data/lib/omnizip/formats/bzip2_file.rb +172 -0
  251. data/lib/omnizip/formats/cpio/constants.rb +55 -0
  252. data/lib/omnizip/formats/cpio/entry.rb +385 -0
  253. data/lib/omnizip/formats/cpio/reader.rb +196 -0
  254. data/lib/omnizip/formats/cpio/writer.rb +234 -0
  255. data/lib/omnizip/formats/cpio.rb +140 -0
  256. data/lib/omnizip/formats/format_spec_loader.rb +230 -0
  257. data/lib/omnizip/formats/gzip.rb +238 -0
  258. data/lib/omnizip/formats/iso/directory_builder.rb +297 -0
  259. data/lib/omnizip/formats/iso/directory_record.rb +152 -0
  260. data/lib/omnizip/formats/iso/joliet.rb +204 -0
  261. data/lib/omnizip/formats/iso/path_table.rb +125 -0
  262. data/lib/omnizip/formats/iso/reader.rb +197 -0
  263. data/lib/omnizip/formats/iso/rock_ridge.rb +349 -0
  264. data/lib/omnizip/formats/iso/volume_builder.rb +320 -0
  265. data/lib/omnizip/formats/iso/volume_descriptor.rb +168 -0
  266. data/lib/omnizip/formats/iso/writer.rb +530 -0
  267. data/lib/omnizip/formats/iso.rb +140 -0
  268. data/lib/omnizip/formats/lzip.rb +175 -0
  269. data/lib/omnizip/formats/lzma_alone.rb +171 -0
  270. data/lib/omnizip/formats/rar/archive_repairer.rb +243 -0
  271. data/lib/omnizip/formats/rar/archive_verifier.rb +195 -0
  272. data/lib/omnizip/formats/rar/block_parser.rb +243 -0
  273. data/lib/omnizip/formats/rar/compression/bit_stream.rb +180 -0
  274. data/lib/omnizip/formats/rar/compression/dispatcher.rb +217 -0
  275. data/lib/omnizip/formats/rar/compression/lz77_huffman/decoder.rb +216 -0
  276. data/lib/omnizip/formats/rar/compression/lz77_huffman/encoder.rb +158 -0
  277. data/lib/omnizip/formats/rar/compression/lz77_huffman/huffman_builder.rb +217 -0
  278. data/lib/omnizip/formats/rar/compression/lz77_huffman/huffman_coder.rb +189 -0
  279. data/lib/omnizip/formats/rar/compression/lz77_huffman/match_finder.rb +135 -0
  280. data/lib/omnizip/formats/rar/compression/lz77_huffman/sliding_window.rb +165 -0
  281. data/lib/omnizip/formats/rar/compression/ppmd/context.rb +105 -0
  282. data/lib/omnizip/formats/rar/compression/ppmd/decoder.rb +219 -0
  283. data/lib/omnizip/formats/rar/compression/ppmd/encoder.rb +262 -0
  284. data/lib/omnizip/formats/rar/compression_method_registry.rb +106 -0
  285. data/lib/omnizip/formats/rar/constants.rb +82 -0
  286. data/lib/omnizip/formats/rar/decompressor.rb +238 -0
  287. data/lib/omnizip/formats/rar/external_writer.rb +312 -0
  288. data/lib/omnizip/formats/rar/header.rb +192 -0
  289. data/lib/omnizip/formats/rar/license_validator.rb +109 -0
  290. data/lib/omnizip/formats/rar/models/rar_archive.rb +77 -0
  291. data/lib/omnizip/formats/rar/models/rar_entry.rb +65 -0
  292. data/lib/omnizip/formats/rar/models/rar_volume.rb +56 -0
  293. data/lib/omnizip/formats/rar/parity_handler.rb +292 -0
  294. data/lib/omnizip/formats/rar/rar5/compression/lzma.rb +202 -0
  295. data/lib/omnizip/formats/rar/rar5/compression/lzss.rb +578 -0
  296. data/lib/omnizip/formats/rar/rar5/compression/store.rb +60 -0
  297. data/lib/omnizip/formats/rar/rar5/crc32.rb +39 -0
  298. data/lib/omnizip/formats/rar/rar5/encryption/aes256_cbc.rb +97 -0
  299. data/lib/omnizip/formats/rar/rar5/encryption/encryption_header.rb +114 -0
  300. data/lib/omnizip/formats/rar/rar5/encryption/encryption_manager.rb +166 -0
  301. data/lib/omnizip/formats/rar/rar5/encryption/key_derivation.rb +97 -0
  302. data/lib/omnizip/formats/rar/rar5/header.rb +187 -0
  303. data/lib/omnizip/formats/rar/rar5/models/encryption_options.rb +74 -0
  304. data/lib/omnizip/formats/rar/rar5/models/recovery_options.rb +63 -0
  305. data/lib/omnizip/formats/rar/rar5/models/solid_options.rb +63 -0
  306. data/lib/omnizip/formats/rar/rar5/models/volume_options.rb +74 -0
  307. data/lib/omnizip/formats/rar/rar5/multi_volume/ARCHITECTURE.md +290 -0
  308. data/lib/omnizip/formats/rar/rar5/multi_volume/volume_manager.rb +264 -0
  309. data/lib/omnizip/formats/rar/rar5/multi_volume/volume_splitter.rb +155 -0
  310. data/lib/omnizip/formats/rar/rar5/multi_volume/volume_writer.rb +194 -0
  311. data/lib/omnizip/formats/rar/rar5/solid/solid_encoder.rb +109 -0
  312. data/lib/omnizip/formats/rar/rar5/solid/solid_manager.rb +142 -0
  313. data/lib/omnizip/formats/rar/rar5/solid/solid_stream.rb +121 -0
  314. data/lib/omnizip/formats/rar/rar5/vint.rb +65 -0
  315. data/lib/omnizip/formats/rar/rar5/writer.rb +466 -0
  316. data/lib/omnizip/formats/rar/rar_format_base.rb +241 -0
  317. data/lib/omnizip/formats/rar/reader.rb +366 -0
  318. data/lib/omnizip/formats/rar/recovery_record.rb +245 -0
  319. data/lib/omnizip/formats/rar/volume_manager.rb +168 -0
  320. data/lib/omnizip/formats/rar/writer.rb +431 -0
  321. data/lib/omnizip/formats/rar.rb +205 -0
  322. data/lib/omnizip/formats/rar3/compressor.rb +73 -0
  323. data/lib/omnizip/formats/rar3/decompressor.rb +66 -0
  324. data/lib/omnizip/formats/rar3/reader.rb +386 -0
  325. data/lib/omnizip/formats/rar3/writer.rb +219 -0
  326. data/lib/omnizip/formats/rar5/compressor.rb +73 -0
  327. data/lib/omnizip/formats/rar5/decompressor.rb +66 -0
  328. data/lib/omnizip/formats/rar5/reader.rb +342 -0
  329. data/lib/omnizip/formats/rar5/writer.rb +214 -0
  330. data/lib/omnizip/formats/seven_zip/coder_chain.rb +150 -0
  331. data/lib/omnizip/formats/seven_zip/constants.rb +126 -0
  332. data/lib/omnizip/formats/seven_zip/encoded_header.rb +114 -0
  333. data/lib/omnizip/formats/seven_zip/encrypted_header.rb +142 -0
  334. data/lib/omnizip/formats/seven_zip/file_collector.rb +144 -0
  335. data/lib/omnizip/formats/seven_zip/header.rb +106 -0
  336. data/lib/omnizip/formats/seven_zip/header_encryptor.rb +134 -0
  337. data/lib/omnizip/formats/seven_zip/header_writer.rb +466 -0
  338. data/lib/omnizip/formats/seven_zip/models/coder_info.rb +30 -0
  339. data/lib/omnizip/formats/seven_zip/models/file_entry.rb +58 -0
  340. data/lib/omnizip/formats/seven_zip/models/folder.rb +69 -0
  341. data/lib/omnizip/formats/seven_zip/models/stream_info.rb +42 -0
  342. data/lib/omnizip/formats/seven_zip/parser.rb +660 -0
  343. data/lib/omnizip/formats/seven_zip/reader.rb +458 -0
  344. data/lib/omnizip/formats/seven_zip/split_archive_reader.rb +632 -0
  345. data/lib/omnizip/formats/seven_zip/split_archive_writer.rb +315 -0
  346. data/lib/omnizip/formats/seven_zip/stream_compressor.rb +151 -0
  347. data/lib/omnizip/formats/seven_zip/stream_decompressor.rb +162 -0
  348. data/lib/omnizip/formats/seven_zip/writer.rb +740 -0
  349. data/lib/omnizip/formats/seven_zip.rb +93 -0
  350. data/lib/omnizip/formats/tar/constants.rb +73 -0
  351. data/lib/omnizip/formats/tar/entry.rb +94 -0
  352. data/lib/omnizip/formats/tar/header.rb +168 -0
  353. data/lib/omnizip/formats/tar/reader.rb +121 -0
  354. data/lib/omnizip/formats/tar/writer.rb +216 -0
  355. data/lib/omnizip/formats/tar.rb +84 -0
  356. data/lib/omnizip/formats/xz/reader.rb +116 -0
  357. data/lib/omnizip/formats/xz.rb +237 -0
  358. data/lib/omnizip/formats/xz_impl/block_decoder.rb +754 -0
  359. data/lib/omnizip/formats/xz_impl/block_encoder.rb +306 -0
  360. data/lib/omnizip/formats/xz_impl/block_header.rb +210 -0
  361. data/lib/omnizip/formats/xz_impl/block_header_parser.rb +186 -0
  362. data/lib/omnizip/formats/xz_impl/constants.rb +49 -0
  363. data/lib/omnizip/formats/xz_impl/index_decoder.rb +174 -0
  364. data/lib/omnizip/formats/xz_impl/index_encoder.rb +122 -0
  365. data/lib/omnizip/formats/xz_impl/stream_decoder.rb +468 -0
  366. data/lib/omnizip/formats/xz_impl/stream_encoder.rb +99 -0
  367. data/lib/omnizip/formats/xz_impl/stream_footer.rb +81 -0
  368. data/lib/omnizip/formats/xz_impl/stream_footer_parser.rb +117 -0
  369. data/lib/omnizip/formats/xz_impl/stream_header.rb +55 -0
  370. data/lib/omnizip/formats/xz_impl/stream_header_parser.rb +108 -0
  371. data/lib/omnizip/formats/xz_impl/vli.rb +128 -0
  372. data/lib/omnizip/formats/xz_impl/writer.rb +421 -0
  373. data/lib/omnizip/formats/zip/central_directory_header.rb +195 -0
  374. data/lib/omnizip/formats/zip/constants.rb +69 -0
  375. data/lib/omnizip/formats/zip/end_of_central_directory.rb +133 -0
  376. data/lib/omnizip/formats/zip/local_file_header.rb +138 -0
  377. data/lib/omnizip/formats/zip/reader.rb +250 -0
  378. data/lib/omnizip/formats/zip/unix_extra_field.rb +153 -0
  379. data/lib/omnizip/formats/zip/writer.rb +375 -0
  380. data/lib/omnizip/formats/zip/zip64_end_of_central_directory.rb +104 -0
  381. data/lib/omnizip/formats/zip/zip64_end_of_central_directory_locator.rb +66 -0
  382. data/lib/omnizip/formats/zip/zip64_extra_field.rb +114 -0
  383. data/lib/omnizip/formats/zip.rb +50 -0
  384. data/lib/omnizip/implementations/base/lzma2_decoder_base.rb +75 -0
  385. data/lib/omnizip/implementations/base/lzma2_encoder_base.rb +128 -0
  386. data/lib/omnizip/implementations/base/lzma_decoder_base.rb +83 -0
  387. data/lib/omnizip/implementations/base/lzma_encoder_base.rb +108 -0
  388. data/lib/omnizip/implementations/base/state_machine_base.rb +182 -0
  389. data/lib/omnizip/implementations/seven_zip/lzma/decoder.rb +421 -0
  390. data/lib/omnizip/implementations/seven_zip/lzma/encoder.rb +465 -0
  391. data/lib/omnizip/implementations/seven_zip/lzma/match_finder.rb +288 -0
  392. data/lib/omnizip/implementations/seven_zip/lzma/range_decoder.rb +200 -0
  393. data/lib/omnizip/implementations/seven_zip/lzma/range_encoder.rb +197 -0
  394. data/lib/omnizip/implementations/seven_zip/lzma/state_machine.rb +141 -0
  395. data/lib/omnizip/implementations/seven_zip/lzma2/encoder.rb +519 -0
  396. data/lib/omnizip/implementations/xz_utils/lzma2/decoder.rb +723 -0
  397. data/lib/omnizip/implementations/xz_utils/lzma2/encoder.rb +750 -0
  398. data/lib/omnizip/io/buffered_input.rb +146 -0
  399. data/lib/omnizip/io/buffered_output.rb +105 -0
  400. data/lib/omnizip/io/stream_manager.rb +115 -0
  401. data/lib/omnizip/link_handler/hard_link.rb +79 -0
  402. data/lib/omnizip/link_handler/symbolic_link.rb +74 -0
  403. data/lib/omnizip/link_handler.rb +124 -0
  404. data/lib/omnizip/metadata/archive_metadata.rb +114 -0
  405. data/lib/omnizip/metadata/entry_metadata.rb +146 -0
  406. data/lib/omnizip/metadata/metadata_editor.rb +171 -0
  407. data/lib/omnizip/metadata/metadata_registry.rb +64 -0
  408. data/lib/omnizip/metadata/metadata_validator.rb +99 -0
  409. data/lib/omnizip/metadata.rb +57 -0
  410. data/lib/omnizip/models/.keep +0 -0
  411. data/lib/omnizip/models/algorithm_metadata.rb +73 -0
  412. data/lib/omnizip/models/compression_options.rb +71 -0
  413. data/lib/omnizip/models/conversion_options.rb +87 -0
  414. data/lib/omnizip/models/conversion_result.rb +135 -0
  415. data/lib/omnizip/models/eta_result.rb +46 -0
  416. data/lib/omnizip/models/extraction_rule.rb +115 -0
  417. data/lib/omnizip/models/filter_chain.rb +144 -0
  418. data/lib/omnizip/models/filter_config.rb +183 -0
  419. data/lib/omnizip/models/match_result.rb +124 -0
  420. data/lib/omnizip/models/optimization_suggestion.rb +91 -0
  421. data/lib/omnizip/models/parallel_options.rb +104 -0
  422. data/lib/omnizip/models/performance_result.rb +79 -0
  423. data/lib/omnizip/models/profile_report.rb +82 -0
  424. data/lib/omnizip/models/progress_options.rb +38 -0
  425. data/lib/omnizip/models/split_options.rb +116 -0
  426. data/lib/omnizip/optimization_registry.rb +81 -0
  427. data/lib/omnizip/parallel/job_queue.rb +209 -0
  428. data/lib/omnizip/parallel/job_scheduler.rb +203 -0
  429. data/lib/omnizip/parallel/parallel_compressor.rb +347 -0
  430. data/lib/omnizip/parallel/parallel_extractor.rb +329 -0
  431. data/lib/omnizip/parallel/worker_pool.rb +223 -0
  432. data/lib/omnizip/parallel.rb +149 -0
  433. data/lib/omnizip/parity/chunked_block_processor.rb +196 -0
  434. data/lib/omnizip/parity/galois16.rb +145 -0
  435. data/lib/omnizip/parity/models/creator_packet.rb +73 -0
  436. data/lib/omnizip/parity/models/file_description_packet.rb +133 -0
  437. data/lib/omnizip/parity/models/ifsc_packet.rb +123 -0
  438. data/lib/omnizip/parity/models/main_packet.rb +128 -0
  439. data/lib/omnizip/parity/models/packet.rb +156 -0
  440. data/lib/omnizip/parity/models/packet_registry.rb +109 -0
  441. data/lib/omnizip/parity/models/recovery_slice_packet.rb +78 -0
  442. data/lib/omnizip/parity/par2_creator.rb +531 -0
  443. data/lib/omnizip/parity/par2_repairer.rb +407 -0
  444. data/lib/omnizip/parity/par2_verifier.rb +364 -0
  445. data/lib/omnizip/parity/par2cmdline_algorithm.rb +110 -0
  446. data/lib/omnizip/parity/par2cmdline_coefficients.rb +78 -0
  447. data/lib/omnizip/parity/reed_solomon_decoder.rb +266 -0
  448. data/lib/omnizip/parity/reed_solomon_encoder.rb +111 -0
  449. data/lib/omnizip/parity/reed_solomon_matrix.rb +342 -0
  450. data/lib/omnizip/parity.rb +186 -0
  451. data/lib/omnizip/password/encryption_registry.rb +65 -0
  452. data/lib/omnizip/password/encryption_strategy.rb +96 -0
  453. data/lib/omnizip/password/password_validator.rb +129 -0
  454. data/lib/omnizip/password/winzip_aes_strategy.rb +192 -0
  455. data/lib/omnizip/password/zip_crypto_strategy.rb +141 -0
  456. data/lib/omnizip/password.rb +87 -0
  457. data/lib/omnizip/pipe/stream_compressor.rb +124 -0
  458. data/lib/omnizip/pipe/stream_decompressor.rb +174 -0
  459. data/lib/omnizip/pipe.rb +121 -0
  460. data/lib/omnizip/platform/ntfs_streams.rb +201 -0
  461. data/lib/omnizip/platform.rb +189 -0
  462. data/lib/omnizip/profile/archive_profile.rb +39 -0
  463. data/lib/omnizip/profile/balanced_profile.rb +33 -0
  464. data/lib/omnizip/profile/binary_profile.rb +36 -0
  465. data/lib/omnizip/profile/compression_profile.rb +158 -0
  466. data/lib/omnizip/profile/custom_profile.rb +157 -0
  467. data/lib/omnizip/profile/fast_profile.rb +33 -0
  468. data/lib/omnizip/profile/maximum_profile.rb +33 -0
  469. data/lib/omnizip/profile/profile_detector.rb +110 -0
  470. data/lib/omnizip/profile/profile_registry.rb +161 -0
  471. data/lib/omnizip/profile/text_profile.rb +36 -0
  472. data/lib/omnizip/profile.rb +190 -0
  473. data/lib/omnizip/profiler/memory_profiler.rb +66 -0
  474. data/lib/omnizip/profiler/method_profiler.rb +49 -0
  475. data/lib/omnizip/profiler/report_generator.rb +169 -0
  476. data/lib/omnizip/profiler.rb +204 -0
  477. data/lib/omnizip/progress/callback_reporter.rb +36 -0
  478. data/lib/omnizip/progress/console_reporter.rb +62 -0
  479. data/lib/omnizip/progress/log_reporter.rb +91 -0
  480. data/lib/omnizip/progress/operation_progress.rb +118 -0
  481. data/lib/omnizip/progress/progress_bar.rb +156 -0
  482. data/lib/omnizip/progress/progress_reporter.rb +40 -0
  483. data/lib/omnizip/progress/progress_tracker.rb +190 -0
  484. data/lib/omnizip/progress/silent_reporter.rb +24 -0
  485. data/lib/omnizip/progress.rb +127 -0
  486. data/lib/omnizip/rubyzip_compat.rb +63 -0
  487. data/lib/omnizip/temp/safe_extract.rb +168 -0
  488. data/lib/omnizip/temp/temp_file.rb +124 -0
  489. data/lib/omnizip/temp/temp_file_pool.rb +109 -0
  490. data/lib/omnizip/temp.rb +181 -0
  491. data/lib/omnizip/version.rb +5 -0
  492. data/lib/omnizip/zip/entry.rb +156 -0
  493. data/lib/omnizip/zip/file.rb +485 -0
  494. data/lib/omnizip/zip/input_stream.rb +273 -0
  495. data/lib/omnizip/zip/output_stream.rb +324 -0
  496. data/lib/omnizip.rb +156 -0
  497. data/readme-docs/advanced-features.adoc +515 -0
  498. data/readme-docs/api-usage.adoc +444 -0
  499. data/readme-docs/architecture.adoc +449 -0
  500. data/readme-docs/archive-formats.adoc +479 -0
  501. data/readme-docs/cli-usage.adoc +222 -0
  502. data/readme-docs/compression-algorithms.adoc +442 -0
  503. data/readme-docs/compression-profiles.adoc +247 -0
  504. data/readme-docs/encryption-checksums.adoc +328 -0
  505. data/readme-docs/format-converter.adoc +325 -0
  506. data/readme-docs/installation.adoc +228 -0
  507. data/readme-docs/par2-archives.adoc +608 -0
  508. data/readme-docs/performance-profiler.adoc +389 -0
  509. data/readme-docs/preprocessing-filters.adoc +280 -0
  510. data/xz-file-format-1.2.1.txt +1174 -0
  511. metadata +617 -0
@@ -0,0 +1,2055 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Copyright (C) 2025 Ribose Inc.
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a
6
+ # copy of this software and associated documentation files (the "Software"),
7
+ # to deal in the Software without restriction, including without limitation
8
+ # the rights to use, copy, modify, merge, publish, distribute, sublicense,
9
+ # and/or sell copies of the Software, and to permit persons to whom the
10
+ # Software is furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in
13
+ # all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
+ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21
+ # DEALINGS IN THE SOFTWARE.
22
+
23
+ require_relative "constants"
24
+ require_relative "../../implementations/seven_zip/lzma/state_machine"
25
+ require_relative "literal_decoder"
26
+ require_relative "length_coder"
27
+ require_relative "distance_coder"
28
+ require_relative "range_decoder"
29
+ require_relative "bit_model"
30
+
31
+ module Omnizip
32
+ module Algorithms
33
+ # LZMA XZ Utils implementation
34
+ #
35
+ # This namespace contains the XZ Utils implementation of LZMA decoder.
36
+ # XZ Utils is based on LZMA SDK but has been MODIFIED SIGNIFICANTLY.
37
+ # This implementation is for XZ format (.xz files) ONLY.
38
+ #
39
+ # Reference: XZ Utils source tree, src/liblzma/lzma/lzma_decoder.c
40
+ #
41
+ # XZ Utils LZMA decoder
42
+ #
43
+ # This class implements XZ Utils' LZMA decoder (NOT LZMA SDK/7-Zip!)
44
+ # XZ Utils is based on LZMA SDK but has been MODIFIED SIGNIFICANTLY.
45
+ # Reference: XZ Utils source tree, src/liblzma/lzma/lzma_decoder.c
46
+ #
47
+ # This decoder is used for:
48
+ # - XZ format (.xz files)
49
+ # - LZMA2 compression in XZ format
50
+ #
51
module XzUtilsDecoderDebug
  # Print debug messages to standard output when debugging is enabled.
  #
  # Debugging is enabled whenever the LZMA_DEBUG environment variable is set
  # to any value (the value itself is not inspected, so "0" or "false" also
  # enable it).
  #
  # A block may be given to defer building an expensive message: it is only
  # evaluated when debugging is enabled, and its result (a String or an Array
  # of Strings) is printed after any positional arguments.
  #
  # @param args [Array<Object>] values handed to Kernel#puts
  # @yieldreturn [String, Array<String>, nil] lazily built message(s)
  # @return [nil]
  def self.debug_puts(*args)
    return unless ENV["LZMA_DEBUG"]

    args += Array(yield) if block_given?
    puts(*args)
  end
end
58
+
59
+ # XZ Utils implementation of LZMA decoder
60
+ #
61
+ # IMPORTANT: This is NOT the LZMA SDK/7-Zip decoder!
62
+ # XZ Utils modified LZMA significantly - this is XZ format only.
63
+ # Components integrated:
64
+ # - LiteralDecoder: Matched/unmatched literal decoding
65
+ # - StateMachine: 12-state FSM for probability model selection
66
+ # - LengthCoder: 3-level length decoding (low/mid/high)
67
+ # - DistanceCoder: 64-slot distance decoding with aligned bits
68
+ #
69
+ # The decoder follows XZ Utils' exact decoding sequence:
70
+ # 1. Read LZMA header (property byte, dict size, uncompressed size)
71
+ # 2. Initialize range decoder and probability models
72
+ # 3. Decode loop:
73
+ # - Decode is_match bit
74
+ # - If literal: decode byte (matched/unmatched)
75
+ # - If match: decode length and distance
76
+ # - Update state machine
77
+ # - Write to output
78
+ # 4. Handle EOS marker
79
+ #
80
+ # @example Basic usage
81
+ # decoder = Omnizip::Algorithms::XzUtilsDecoder.new(input)
82
+ # data = decoder.decode_stream
83
+ #
84
+ # @example With output stream
85
+ # decoder = Omnizip::Algorithms::XzUtilsDecoder.new(input)
86
+ # File.open('output.txt', 'wb') { |f| decoder.decode_stream(f) }
87
+ class XzUtilsDecoder
88
+ include LZMA::Constants
89
+
90
+ # Alias for nested classes for easier access
91
+ BitModel = LZMA::BitModel
92
+ LengthCoder = LZMA::LengthCoder
93
+ DistanceCoder = LZMA::DistanceCoder
94
+ LiteralDecoder = LZMA::LiteralDecoder
95
+ RangeDecoder = LZMA::RangeDecoder
96
+ SdkStateMachine = Implementations::SevenZip::LZMA::StateMachine
97
+
98
+ attr_reader :lc, :lp, :pb, :dict_size, :uncompressed_size
99
+
100
+ # XZ Utils dictionary constants (from lz_decoder.h)
101
+ # See: XZ Utils source tree, src/liblzma/lz/lz_decoder.h
102
+ LZ_DICT_REPEAT_MAX = 288
103
+ LZ_DICT_INIT_POS = 2 * LZ_DICT_REPEAT_MAX # = 576
104
+
105
# Initialize the XZ Utils LZMA decoder.
#
# In the default (standalone) mode the LZMA header is read from +input+ via
# read_header. In +:lzma2_mode+ no header is read; the caller must supply
# every coding parameter explicitly.
#
# @param input [IO] Input stream of compressed data
# @param options [Hash] Decoding options
# @option options [Boolean] :lzma2_mode If true, initialize without reading
#   a header (requires :lc, :lp, :pb, :dict_size and :uncompressed_size)
# @option options [Integer] :lc Literal context bits (required for lzma2_mode)
# @option options [Integer] :lp Literal position bits (required for lzma2_mode)
# @option options [Integer] :pb Position bits (required for lzma2_mode)
# @option options [Integer] :dict_size Dictionary size (required for lzma2_mode)
# @option options [Integer] :uncompressed_size Uncompressed size (required
#   for lzma2_mode)
# @option options [String] :preloaded_data Data to preload into the
#   dictionary before decoding (for LZMA2 uncompressed chunks followed by
#   compressed chunks)
# @option options [Boolean] :validate_size If true, decode_stream verifies
#   that the decoded size matches uncompressed_size (default: false)
# @option options [Boolean, nil] :allow_eopm Tri-state consulted by
#   decode_stream once the known size has been reached: true lets decoding
#   continue towards an end-of-payload marker, false requires the range
#   decoder to be finished, nil (default) defers to decode_stream's
#   +check_rc_finished+ argument
# @raise [KeyError] if lzma2_mode is set and a required parameter is missing
def initialize(input, options = {})
  @input = input
  @decoder_id = object_id # Identifies this instance in debug traces

  if ENV["LZMA_DEBUG_DECODE_STREAM"]
    puts "DEBUG LZMA::Decoder.new created[#{@decoder_id}]"
    warn "SDK Decoder #{@decoder_id} created"
  end

  @preloaded_data = options[:preloaded_data]
  @validate_size = options.fetch(:validate_size, false)
  @allow_eopm = options.fetch(:allow_eopm, nil)

  if options[:lzma2_mode]
    # The LZMA2 decoder hands the parameters over directly; there is no
    # header in the stream to read them from.
    @lc = options.fetch(:lc)
    @lp = options.fetch(:lp)
    @pb = options.fetch(:pb)
    @dict_size = options.fetch(:dict_size)
    @uncompressed_size = options.fetch(:uncompressed_size)
  else
    # Standalone LZMA stream: the parameters come from its header.
    read_header
  end

  validate_parameters
  init_models
  init_coders
end
151
+
152
# Decode one LZMA stream (or one LZMA2 chunk) from the input.
#
# Every pass of the main loop decodes an is_match bit and then either a
# literal (decode_literal) or a match (decode_match). A truthy result from
# decode_match is treated as the end-of-stream marker and ends the loop.
# When the uncompressed size is known, decoding also stops once the write
# position reaches start position + uncompressed size.
#
# Dictionary handling follows the XZ Utils layout: the write position starts
# at LZ_DICT_INIT_POS, and only the bytes produced by this call are
# returned, so successive LZMA2 chunks never re-emit earlier output.
#
# Diagnostic traces are controlled by the LZMA_DEBUG, LZMA_DEBUG_ITER,
# LZMA_DEBUG_LIMIT, LZMA_DEBUG_POS, LZMA_DEBUG_RESET,
# LZMA_DEBUG_DECODE_STREAM and DEBUG_DICT_BUF environment variables, which
# are read once per call.
#
# @param output [IO, nil] Optional output stream (if nil, a String is returned)
# @param preserve_dict [Boolean] Keep the state machine and rep distances
#   left by the previous call instead of resetting them
# @param check_rc_finished [Boolean] Only consulted when the :allow_eopm
#   option was not given: whether the range decoder must be finished
#   (code == 0) once the known size has been produced
# @return [String, Integer] Binary String of decoded data, or the number of
#   bytes written when +output+ is given
# @raise [Omnizip::DecompressionError] on leftover compressed data, a size
#   mismatch (with :validate_size) or unexpected trailing input
# @raise [RuntimeError] if the size is known but the write position is unset
def decode_stream(output = nil, preserve_dict: false, check_rc_finished: true)
  @decode_stream_call_count = (@decode_stream_call_count || 0) + 1
  call_num = @decode_stream_call_count

  # Environment lookups are loop-invariant, so read them once up front.
  trace = ENV["LZMA_DEBUG"]
  trace_iter = ENV["LZMA_DEBUG_ITER"]
  trace_limit = ENV["LZMA_DEBUG_LIMIT"]
  trace_pos = ENV["LZMA_DEBUG_POS"]

  if ENV["LZMA_DEBUG_DECODE_STREAM"]
    warn "DEBUG decode_stream[#{@decoder_id}] START: preserve_dict=#{preserve_dict}, @pos=#{@pos.inspect}, @dict_full=#{@dict_full.inspect}, @dict_buf.object_id=#{@dict_buf&.object_id || 'nil'}, @dict_buf.size=#{@dict_buf&.size || 'nil'}"
  end
  if trace
    input_pos = begin
      @input.pos
    rescue StandardError
      "N/A"
    end
    warn "DEBUG: decode_stream - reusing range decoder @input.pos=#{input_pos}, @range_decoder.class=#{@range_decoder.class}"
  end

  # The range decoder persists across calls (LZMA2 chunks); create it only
  # when nothing has installed one yet.
  unless @range_decoder
    warn "DEBUG: decode_stream - creating NEW range decoder" if trace
    @range_decoder = RangeDecoder.new(@input)
  end

  # Nothing to decode. Honour the documented return contract: a byte count
  # when writing to +output+, otherwise a fresh (unfrozen) binary String.
  if @uncompressed_size == 0
    warn "DEBUG: decode_stream - empty input (uncompressed_size=0), returning immediately" if trace
    return output ? 0 : "".b
  end

  @debug_iter = 0
  # Bytes produced in this call; @uncompressed_size describes only the
  # current chunk in multi-chunk streams.
  @chunk_bytes_decoded = 0

  # State machine and rep distances survive only when the caller asks to
  # preserve them and they already exist.
  @state = SdkStateMachine.new if @state.nil? || !preserve_dict

  puts "DEBUG: Checking @dict_buf.nil? = #{@dict_buf.nil?}, preserve_dict=#{preserve_dict}" if ENV["LZMA_DEBUG_RESET"]
  ensure_dictionary_ready

  # Must be taken after dictionary setup so preloaded bytes are excluded
  # from the data returned by this call.
  start_pos = @pos || LZ_DICT_INIT_POS

  if !preserve_dict || [@rep0, @rep1, @rep2, @rep3].any?(&:nil?)
    @rep0 = @rep1 = @rep2 = @rep3 = 0
  end

  # nil means "size unknown": decode until the end-of-stream marker.
  limit = if @uncompressed_size == 0xFFFFFFFFFFFFFFFF
            nil
          else
            start_pos + @uncompressed_size
          end
  if trace_limit
    puts "DEBUG LIMIT CALCULATION: start_pos=#{start_pos}, @uncompressed_size=#{@uncompressed_size}, limit=#{limit.inspect}"
  end

  # Whether the range decoder must be finished once the limit is reached.
  # An explicit :allow_eopm wins; otherwise the method argument decides.
  must_finish = case @allow_eopm
                when true then false
                when false then true
                else check_rc_finished
                end

  iteration = 0
  loop do
    iteration += 1

    if limit && @pos.nil?
      raise "Invalid state: @pos=#{@pos.inspect}, limit=#{limit.inspect}"
    end

    if trace_limit
      XzUtilsDecoderDebug.debug_puts "DEBUG LIMIT: iter=#{iteration}, pos=#{@pos}, dict_full=#{@dict_full}, limit=#{limit}"
    end

    # is_match models are laid out as [state][pos_state] with a stride of
    # (1 << pb).
    pos_state = @pos & ((1 << @pb) - 1)
    model_index = (@state.value * (1 << @pb)) + pos_state
    @debug_iter += 1

    # NOTE(review): this global is not read anywhere in this method; it is
    # presumably consumed by tracing code elsewhere, so the assignment is
    # kept exactly as it was. Confirm and remove if unused.
    $trace_is_match_0_0 = (@dict_full == 256) if @dict_full.between?(255, 257)

    if trace_iter
      range = @range_decoder.instance_variable_get(:@range)
      code = @range_decoder.instance_variable_get(:@code)
      model = @is_match_models[model_index]
      bound = (range >> 11) * model.probability
      XzUtilsDecoderDebug.debug_puts ""
      XzUtilsDecoderDebug.debug_puts "ITER #{@debug_iter}:"
      XzUtilsDecoderDebug.debug_puts " pos=#{@pos}, state=#{@state.value}, pos_state=#{pos_state}, model_index=#{model_index}"
      XzUtilsDecoderDebug.debug_puts " dict_full=#{@dict_full}"
      XzUtilsDecoderDebug.debug_puts " range=0x#{range.to_s(16)}, code=0x#{code.to_s(16)}, model.prob=#{model.probability}"
      XzUtilsDecoderDebug.debug_puts " bound=0x#{bound.to_s(16)}, code < bound: #{code < bound}"
    end

    is_match = @range_decoder.decode_bit(@is_match_models[model_index])
    XzUtilsDecoderDebug.debug_puts " is_match=#{is_match}" if trace_iter

    if is_match.zero?
      decode_literal

      if trace_iter
        last_byte = @dict_buf[@pos - 1]
        puts " literal byte=0x#{last_byte.to_s(16)} ('#{last_byte.chr}')"
      end
      if trace_pos && limit && @pos >= limit
        puts "DEBUG: Literal overshoot: pos=#{@pos}, limit=#{limit}, delta=#{@pos - limit}"
      end
    elsif decode_match
      # decode_match reports the end-of-stream marker with a truthy result.
      break
    end

    if limit && @pos >= limit
      if trace_limit
        puts "DEBUG LIMIT TRIGGERED (call #{call_num}): pos=#{@pos}, limit=#{limit}, dict_full=#{@dict_full}, chunk_bytes_decoded=#{@chunk_bytes_decoded}"
      end

      # All expected bytes are out. A finished range decoder (code == 0)
      # ends the stream; otherwise this is corruption unless an
      # end-of-payload marker is still allowed to follow.
      finished = @range_decoder.code.zero?
      if must_finish && !finished
        raise Omnizip::DecompressionError,
              "LZMA stream finished with leftover compressed data (range_decoder.code=#{@range_decoder.code}, expected 0). This indicates corruption in the compressed stream or an invalid EOPM for LZMA2."
      end
      break if finished
    end

    if trace_limit && limit
      if @pos >= limit - 10 && @pos < limit + 10
        puts "DEBUG NEAR LIMIT (call #{call_num}): pos=#{@pos}, limit=#{limit}, dict_full=#{@dict_full}, chunk_bytes_decoded=#{@chunk_bytes_decoded}, remaining=#{@uncompressed_size ? @uncompressed_size - @chunk_bytes_decoded : 'N/A'}"
      end
      if @pos >= limit && @pos < limit + 10
        puts "DEBUG PASSED LIMIT: pos=#{@pos}, limit=#{limit}, dict_full=#{@dict_full}, delta=#{@pos - limit}"
      end
    end
    if trace_pos && limit && @pos >= limit
      XzUtilsDecoderDebug.debug_puts "DEBUG: Overshoot detected: pos=#{@pos}, limit=#{limit}, delta=#{@pos - limit}"
    end
  end

  verify_decoded_size
  take_output(start_pos, output)
end

# Allocate the dictionary on first use and copy any preloaded data into it.
# Does nothing when a dictionary buffer already exists.
private def ensure_dictionary_ready
  return unless @dict_buf.nil?

  @dict_buf = Array.new(@dict_size + LZ_DICT_INIT_POS, 0)
  @pos = LZ_DICT_INIT_POS
  @dict_full = 0
  @has_wrapped = false

  # Preloaded bytes must be in place before decoding so that matches can
  # reference them.
  if @preloaded_data && !@preloaded_data.empty?
    if ENV["LZMA_DEBUG_RESET"]
      warn "DEBUG: Preloading #{@preloaded_data.bytesize} bytes into dictionary[#{@decoder_id}]"
    end
    preloaded = @preloaded_data.bytes
    @dict_buf[@pos, preloaded.size] = preloaded
    @pos += preloaded.size
    @dict_full = @pos - LZ_DICT_INIT_POS
    @preloaded_data = nil # Loaded once only
  end

  return unless ENV["LZMA_DEBUG_RESET"]

  warn "DEBUG: Dictionary init in decode_stream[#{@decoder_id}] - @pos=#{@pos}, @dict_full=#{@dict_full}, @dict_buf.size=#{@dict_buf.size}, @dict_buf.object_id=#{@dict_buf.object_id}"
end

# With :validate_size and a known size, check that exactly that many bytes
# were decoded and that no unexpected input follows the stream.
private def verify_decoded_size
  return unless @validate_size && @uncompressed_size && @uncompressed_size != 0xFFFFFFFFFFFFFFFF

  actual_decoded_size = @pos - LZ_DICT_INIT_POS
  if actual_decoded_size != @uncompressed_size
    raise Omnizip::DecompressionError,
          "LZMA stream size mismatch: expected #{@uncompressed_size} bytes, decoded #{actual_decoded_size} bytes. The file may be corrupted or have an invalid uncompressed size field."
  end

  stream = @range_decoder&.instance_variable_get(:@stream)
  return unless stream

  # Without EOPM support any further input is an error. With it, further
  # input is only an error once the range decoder is finished; otherwise
  # the remaining bytes still belong to the compressed stream.
  message = if !@allow_eopm
              "LZMA_Alone file has more compressed data than expected. The uncompressed size field (#{@uncompressed_size} bytes) appears to be too small."
            elsif @range_decoder.code&.zero?
              "LZMA_Alone file has data after the end-of-payload marker. The file may be corrupted or contain concatenated streams."
            end
  return unless message && input_remaining?(stream)

  raise Omnizip::DecompressionError, message
end

# Peek at +stream+: true when at least one more byte can be read. The byte
# is pushed back when the stream supports it. I/O errors count as "no data".
private def input_remaining?(stream)
  next_byte = stream.getbyte
  return false unless next_byte

  stream.ungetbyte(next_byte) if stream.respond_to?(:ungetbyte)
  true
rescue IOError
  false
end

# Pack the dictionary bytes written since +start_pos+ and either return them
# or write them to +output+ (returning the byte count).
private def take_output(start_pos, output)
  if ENV["DEBUG_DICT_BUF"]
    XzUtilsDecoderDebug.debug_puts "DEBUG: start_pos=#{start_pos}, @pos=#{@pos.inspect}, @dict_buf.size=#{@dict_buf.size}, LZ_DICT_INIT_POS=#{LZ_DICT_INIT_POS}"
  end

  # Unset (nil) slots are emitted as zero bytes.
  valid_bytes = @dict_buf[start_pos...@pos].map { |byte| byte || 0 }
  XzUtilsDecoderDebug.debug_puts "DEBUG: valid_bytes=#{valid_bytes.size}" if ENV["DEBUG_DICT_BUF"]

  valid_data = valid_bytes.pack("C*").force_encoding(Encoding::BINARY)
  return valid_data unless output

  output.write(valid_data)
  valid_data.bytesize
end
636
+
637
+ # Reset the decoder state for reuse with new properties
638
+ #
639
+ # XZ Utils pattern (lzma_decoder.c:1034-1083):
640
+ # - Resets state machine and rep distances
641
+ # - Resets range decoder
642
+ # - Reinitializes all probability models
643
+ # - Preserves dictionary (managed externally by LZMA2 decoder)
644
+ #
645
+ # @param new_lc [Integer, nil] New lc value (if nil, keeps current)
646
+ # @param new_lp [Integer, nil] New lp value (if nil, keeps current)
647
+ # @param new_pb [Integer, nil] New pb value (if nil, keeps current)
648
+ # @param preserve_dict [Boolean] If true, preserve dictionary state (pos, dict_full)
649
+ # @return [void]
650
def reset(new_lc: nil, new_lp: nil, new_pb: nil, preserve_dict: false)
  # Re-arms the decoder for further decoding: installs a fresh state machine,
  # zeroes the four rep distances, resets the range decoder (when one exists)
  # and re-initialises the probability models. Supplying any of new_lc /
  # new_lp / new_pb rebuilds models and coders from scratch; otherwise the
  # existing models are reset in place. Unless preserve_dict is true the
  # dictionary buffer, position and fill counters are reinitialised as well.
  # Always returns nil.
  trace = ENV["LZMA_DEBUG_RESET"]
  if trace
    warn "DEBUG reset[#{@decoder_id}] called: preserve_dict=#{preserve_dict}, @pos=#{@pos.inspect}, @dict_full=#{@dict_full.inspect}, @dict_buf.size=#{@dict_buf&.size || 'nil'}, nil_count=#{@dict_buf&.count(nil) || 'N/A'}"
  end

  # Narrow diagnostic window kept from the original investigation.
  if @dict_full && @dict_full.between?(220, 230)
    XzUtilsDecoderDebug.debug_puts "\n=== reset called at dict_full=#{@dict_full} ==="
    XzUtilsDecoderDebug.debug_puts " preserve_dict=#{preserve_dict}"
    XzUtilsDecoderDebug.debug_puts " Before reset: rep0/1/2/3=(#{@rep0},#{@rep1},#{@rep2},#{@rep3})"
  end

  # Any explicitly supplied property counts as a change (0 is truthy in Ruby).
  properties_changed = [new_lc, new_lp, new_pb].any?
  @lc = new_lc if new_lc
  @lp = new_lp if new_lp
  @pb = new_pb if new_pb

  # Fresh state machine and zeroed rep distances on every reset.
  @state = SdkStateMachine.new
  @rep0 = @rep1 = @rep2 = @rep3 = 0

  # The range decoder is reset here rather than deferred to decode_stream.
  if @range_decoder
    @range_decoder.reset
    if trace
      warn "DEBUG reset[#{@decoder_id}]: Reset range_decoder, code=0x#{@range_decoder.code.to_s(16)}, init_bytes_remaining=#{@range_decoder.instance_variable_get(:@init_bytes_remaining)}"
    end
  end

  if properties_changed
    # New lc/lp/pb: allocate new model arrays and new coders.
    warn "DEBUG reset[#{@decoder_id}]: Properties changed, calling init_models (create new arrays)" if trace
    init_models
    warn "DEBUG reset[#{@decoder_id}]: Properties changed, calling init_coders (create new coders)" if trace
    init_coders
  else
    # Same properties: reset the existing models in place, keep the coders.
    warn "DEBUG reset[#{@decoder_id}]: No properties changed, calling reset_models (reset in place)" if trace
    reset_models
    warn "DEBUG reset[#{@decoder_id}]: No properties changed, skipping init_coders (preserve existing coders)" if trace
  end

  return if preserve_dict

  # Dictionary reset: new zero-filled buffer, position back at the start.
  @dict_buf = Array.new(@dict_size + LZ_DICT_INIT_POS, 0)
  @pos = LZ_DICT_INIT_POS
  @dict_full = 0
  @has_wrapped = false
  if trace
    warn "DEBUG reset after dict reset[#{@decoder_id}]: @pos=#{@pos}, @dict_full=#{@dict_full}, @dict_buf.size=#{@dict_buf.size}, @dict_buf.object_id=#{@dict_buf.object_id}, nil_count=#{@dict_buf.count(nil)}"
    # Spot-check a few slots of the new buffer.
    warn " Sample values: [576]=#{@dict_buf[576]}, [577]=#{@dict_buf[577]}, [578]=#{@dict_buf[578]}, [583]=#{@dict_buf[583]}"
  end

  nil
end
736
+ end
737
+
738
+ # Reset all probability models in place (without creating new arrays)
739
+ #
740
+ # This matches XZ Utils init_temporals behavior for control >= 0xA0.
741
+ # Unlike init_models which creates new arrays, this resets existing
742
+ # BitModels in place to preserve object identity for any references.
743
+ #
744
+ # @return [void]
745
def reset_models
  # Resets every existing probability model in place (no new arrays are
  # allocated), then asks each coder to reset its own models.
  [
    @literal_models,
    @is_match_models,
    @is_rep_models,
    @is_rep0_models,
    @is_rep1_models,
    @is_rep2_models,
    @is_rep0_long_models
  ].each do |model_group|
    model_group.each(&:reset)
  end

  # Length coders first, distance coder last (its result is the return value).
  [@length_coder, @rep_length_coder].each(&:reset_models)
  @distance_coder.reset_models
end
764
+
765
+ # Reset state machine, rep distances and probability models; preserve the dictionary
766
+ #
767
+ # XZ Utils pattern for state reset only (control >= 0xA0):
768
+ # - Reset state machine
769
+ # - Reset rep distances
770
+ # - Reset probability models (via reset_models)
771
+ # - Reset range decoder (rc_reset + rc_read_init)
772
+ # - PRESERVE dictionary content (no dict_reset)
773
+ #
774
+ # XZ Utils source (lzma2_decoder.c):
775
+ # - For control >= 0xA0: calls lzma_lzma_decoder_reset(decoder, NULL)
776
+ # - lzma_lzma_decoder_reset always calls init_temporals which resets probability models
777
+ #
778
+ # @return [void]
779
+ # Prepare state reset - called BEFORE setting new input
780
+ #
781
+ # Resets state machine, rep distances, and probability models.
782
+ # The range decoder will be reset in finish_state_reset AFTER
783
+ # the new input is set (to match XZ Utils lzma_decoder_reset behavior).
784
+ #
785
+ # For LZMA2 control >= 0xC0, this is called before set_input to reset
786
+ # everything except the range decoder for the new chunk.
787
+ #
788
+ # @return [void]
789
def prepare_state_reset
  # First half of a state reset: replaces the state machine, zeroes the rep
  # distances and resets the probability models. The range decoder is left
  # alone here; finish_state_reset handles it once the new input is set.
  # Always returns nil.
  if ENV["LZMA_DEBUG"]
    XzUtilsDecoderDebug.debug_puts "\n=== prepare_state_reset called (decoder_id=#{@decoder_id}) ==="
    XzUtilsDecoderDebug.debug_puts " Before reset: rep0/1/2/3=(#{@rep0},#{@rep1},#{@rep2},#{@rep3})"
  end

  @state = SdkStateMachine.new
  @rep0 = @rep1 = @rep2 = @rep3 = 0

  XzUtilsDecoderDebug.debug_puts " After reset: rep0/1/2/3=(#{@rep0},#{@rep1},#{@rep2},#{@rep3})" if ENV["LZMA_DEBUG"]

  # In-place reset of all probability models.
  reset_models

  XzUtilsDecoderDebug.debug_puts "=== end prepare_state_reset (range decoder will be reset in finish_state_reset) ===" if ENV["LZMA_DEBUG"]

  nil
end
819
+
820
+ # Reset state machine and probability models - preserves rep distances
821
+ #
822
+ # This is used for control >= 0xA0 but < 0xC0 where we want
823
+ # to reset the state machine but preserve rep distances from
824
+ # the previous chunk.
825
+ #
826
+ # @return [void]
827
def reset_state_machine_only
  # Replaces the state machine and resets the probability models in place.
  # The rep distances (@rep0..@rep3) are deliberately left untouched.
  # Always returns nil.
  in_trace_window = @dict_full && @dict_full.between?(220, 230)
  if in_trace_window
    XzUtilsDecoderDebug.debug_puts "\n=== reset_state_machine_only called at dict_full=#{@dict_full} (decoder_id=#{@decoder_id}) ==="
    XzUtilsDecoderDebug.debug_puts " Before reset: rep0/1/2/3=(#{@rep0},#{@rep1},#{@rep2},#{@rep3})"
  end

  @state = SdkStateMachine.new
  reset_models

  # @dict_full is not modified above, so the same window check still applies.
  if in_trace_window
    XzUtilsDecoderDebug.debug_puts " After reset: rep0/1/2/3=(#{@rep0},#{@rep1},#{@rep2},#{@rep3}) (preserved)"
  end

  nil
end
847
+
848
+ # Finish state reset - called AFTER setting new input
849
+ #
850
+ # Resets the range decoder to read from the new input stream.
851
+ # This completes the state reset process started by prepare_state_reset.
852
+ #
853
+ # XZ Utils pattern (lzma_decoder.c:1034-1083):
854
+ # - rc_reset is called as part of lzma_decoder_reset
855
+ # - rc_reset sets range = UINT32_MAX, code = 0, init_bytes_left = 5
856
+ # - The 5 initialization bytes are read during the first normalize calls
857
+ #
858
+ # @return [void]
859
def finish_state_reset
  # Second half of a state reset: resets the range decoder, if one exists,
  # so it starts afresh on the input installed via set_input. Does nothing
  # when no range decoder has been created yet.
  return unless @range_decoder

  # Reads an attribute of @input for tracing; any failure (for example a
  # missing or non-seekable input) is reported as "N/A" instead of raising.
  probe_input = lambda do |attribute|
    @input.public_send(attribute)
  rescue StandardError
    "N/A"
  end

  if ENV["LZMA_DEBUG"]
    input_pos = probe_input.call(:pos)
    input_size = probe_input.call(:size)
    XzUtilsDecoderDebug.debug_puts "=== finish_state_reset: resetting range_decoder, input pos=#{input_pos}, size=#{input_size}"
  end

  @range_decoder.reset

  if ENV["LZMA_DEBUG"]
    input_pos_after = probe_input.call(:pos)
    XzUtilsDecoderDebug.debug_puts "=== finish_state_reset: after reset, input pos=#{input_pos_after}, range_decoder.code=0x#{@range_decoder.code.to_s(16)}"
  end
end
888
+
889
def reset_state_only
  # Backward-compatible one-shot variant: runs both halves of the state
  # reset back to back and returns whatever finish_state_reset returns.
  prepare_state_reset
  finish_state_reset
end
894
+
895
+ # Reset only the range decoder for next chunk
896
+ #
897
+ # XZ Utils pattern (lzma_decoder.c:1014-1017):
898
+ # When LZMA chunk ends (LZMA_STREAM_END), reset range decoder
899
+ # for next LZMA2 chunk, but preserve state and probability models.
900
+ #
901
+ # Note: This method is a no-op in our implementation because
902
+ # decode_stream creates a fresh RangeDecoder for each chunk.
903
+ # The range decoder initialization happens automatically when
904
+ # the new RangeDecoder is created with the new input.
905
+ #
906
+ # @return [void]
907
def reset_range_decoder
  # Intentionally a no-op: nothing is reset here and nil is returned.
  nil
end
910
+
911
+ # Set new input stream for chunked decoding
912
+ #
913
+ # For LZMA2, the range decoder is persistent across chunks and is
914
+ # reset separately via prepare_state_reset + finish_state_reset.
915
+ # This method just updates the input stream reference.
916
+ #
917
+ # @param new_input [IO] New input stream
918
+ # @return [void]
919
def set_input(new_input)
  # Installs new_input as the decoder's input stream and points the range
  # decoder at it (creating the range decoder on first use).
  #
  # new_input must respond to getbyte; the tracing paths additionally use
  # size and pos / pos=.
  #
  # Fix: the debug peek used to call rewind, which moved the stream to
  # offset 0 whenever tracing was enabled and so changed decoding behaviour
  # for inputs not positioned at 0. The position found on entry is now
  # saved and restored, so tracing is side-effect free.
  @input = new_input

  if ENV.fetch("LZMA_DEBUG", nil) && @dict_full && @dict_full >= 220 && @dict_full <= 230
    puts "\n=== set_input at dict_full=#{@dict_full} ==="
    puts " new_input.size=#{new_input.size}"
    puts " new_input.pos=#{new_input.pos}"
    puts " new_input.class=#{new_input.class}"

    original_pos = new_input.pos

    # Peek at up to 10 bytes starting from the current position.
    first_bytes = []
    10.times do
      byte = new_input.getbyte
      break if byte.nil?

      first_bytes << byte
    end
    puts " First 10 bytes: #{first_bytes.map { |b| "0x#{b.to_s(16).upcase}" }.join(' ')}"

    new_input.pos = original_pos
    test_byte = new_input.getbyte
    puts " Test getbyte: 0x#{test_byte.to_s(16).upcase}" if test_byte
    new_input.pos = original_pos
  end

  if @range_decoder.nil?
    # First chunk: create the range decoder on the new input.
    @range_decoder = RangeDecoder.new(@input)
    if ENV["LZMA_DEBUG"]
      XzUtilsDecoderDebug.debug_puts "=== set_input: created NEW range_decoder, input has #{@input.size} bytes"
    end
  else
    # Later chunks: the existing range decoder keeps its own stream
    # reference, so it has to be told about the new input.
    @range_decoder.update_stream(@input)
    if ENV["LZMA_DEBUG"]
      XzUtilsDecoderDebug.debug_puts "=== set_input: reusing range_decoder, new input has #{@input.size} bytes, pos=#{@input.pos}"
    end
  end
end
960
+
961
+ # Add uncompressed data to the dictionary
962
+ #
963
+ # XZ Utils pattern (lzma2_decoder.c:195, dict_write):
964
+ # - Copy uncompressed data to the dictionary as-is
965
+ # - Update dict_full to reflect new data
966
+ # - This allows subsequent compressed chunks to reference the data
967
+ #
968
+ # This is used by LZMA2 decoder for uncompressed chunks (control=0x1 or 0x2)
969
+ #
970
+ # @param data [String] Uncompressed data to add to dictionary
971
+ # @return [void]
972
def add_to_dictionary(data)
  # Appends the bytes of data (a String) verbatim to the dictionary buffer
  # at the current position, so that later chunks can reference them, then
  # updates the fill counter.
  #
  # Fix: the fill bookkeeping now follows the same rule decode_literal uses
  # when it stores a byte. @dict_full is only recomputed while the
  # dictionary has not yet filled, and @has_wrapped is set once
  # @dict_full reaches @dict_size. Previously @has_wrapped was never
  # updated here, leaving it stale until the next literal was decoded.
  if ENV["LZMA_DEBUG"]
    old_dict_full = @dict_full
    XzUtilsDecoderDebug.debug_puts "=== add_to_dictionary: adding #{data.bytesize} bytes to dictionary[#{@decoder_id}], current dict_full=#{@dict_full}, pos=#{@pos}"
  end

  data.each_byte do |byte|
    @dict_buf[@pos] = byte
    @pos += 1
  end

  unless @has_wrapped
    @dict_full = @pos - LZ_DICT_INIT_POS
    # Clamp at capacity and remember that the dictionary is now full.
    if @dict_full >= @dict_size
      @has_wrapped = true
      @dict_full = @dict_size
    end
  end

  if ENV["LZMA_DEBUG"]
    XzUtilsDecoderDebug.debug_puts "=== add_to_dictionary: after adding, dict_full=#{@dict_full} (was #{old_dict_full}), pos=#{@pos}"
  end
end
995
+
996
+ # Set uncompressed size for chunked decoding
997
+ #
998
+ # XZ Utils pattern (lzma2_decoder.c:140-141):
999
+ # Pass the chunk's uncompressed_size to the LZMA decoder
1000
+ # for each LZMA2 chunk.
1001
+ #
1002
+ # @param size [Integer] Uncompressed size for current chunk
1003
+ # @param allow_eopm [Boolean] Whether to allow end-of-payload marker
1004
+ # @return [void]
1005
def set_uncompressed_size(size, allow_eopm: true)
  # Records the uncompressed size and the allow_eopm flag for the chunk
  # about to be decoded. Returns nil.
  @uncompressed_size = size
  @allow_eopm = allow_eopm
  return unless ENV["LZMA_DEBUG"]

  puts "DEBUG set_uncompressed_size: size=#{size}, @decoder_id=#{@decoder_id}, @dict_full=#{@dict_full}"
end
1013
+
1014
+ private
1015
+
1016
+ # Read and parse LZMA header
1017
+ #
1018
+ # SDK header format:
1019
+ # - Property byte: (lc + lp*9 + pb*45)
1020
+ # - Dictionary size: 4 bytes little-endian
1021
+ # - Uncompressed size: 8 bytes (0xFF for unknown size)
1022
+ #
1023
+ # @return [void]
1024
+ # @raise [RuntimeError] If header is invalid
1025
def read_header
  # Parses the 13-byte LZMA header from @input:
  #   1 byte  - properties, encoded as lc + lp*9 + pb*45
  #   4 bytes - dictionary size, little-endian
  #   8 bytes - uncompressed size, little-endian
  # Raises RuntimeError when the stream ends before the header is complete.
  props = @input.getbyte
  raise "Invalid LZMA header" if props.nil?

  # Peel the three packed fields off the property byte.
  packed, @lc = props.divmod(9)
  @pb, @lp = packed.divmod(5)

  # Fetches the next header byte or fails if the stream is exhausted.
  next_byte = lambda do
    value = @input.getbyte
    raise "Incomplete header" if value.nil?

    value
  end

  @dict_size = 0
  4.times { |index| @dict_size |= next_byte.call << (8 * index) }

  @uncompressed_size = 0
  8.times { |index| @uncompressed_size |= next_byte.call << (8 * index) }
end
1053
+
1054
+ # Validate parameters
1055
+ #
1056
+ # @return [void]
1057
+ # @raise [RuntimeError] If parameters are invalid
1058
def validate_parameters
  # Checks lc (0..8), lp (0..4) and pb (0..4) in that order and raises
  # RuntimeError naming the first value that is out of range.
  limits = { "lc" => [@lc, 8], "lp" => [@lp, 4], "pb" => [@pb, 4] }
  limits.each do |label, (value, upper)|
    raise "Invalid #{label} (#{value})" unless value.between?(0, upper)
  end

  nil
end
1063
+
1064
+ # Initialize probability models
1065
+ #
1066
+ # SDK allocates models following exact structure from LzmaDec.c:
1067
+ # - Literal models: (1 << (lc+lp)) contexts * 0x300 models each
1068
+ # - Match models: NUM_STATES * pos_states models (where pos_states = 1 << pb)
1069
+ # - Rep models: NUM_STATES models each
1070
+ #
1071
+ # Must match SdkEncoder's model structure exactly.
1072
+ # CRITICAL: When pb changes, models must be recreated with new pos_states!
1073
+ #
1074
+ # @return [void]
1075
def init_models
  # Allocates brand-new probability model arrays sized for the current
  # lc / lp / pb values. Position-dependent tables use 1 << @pb entries
  # per state, so they must be rebuilt whenever pb changes.
  build = ->(count) { Array.new(count) { BitModel.new } }

  @pos_states = 1 << @pb

  # Literal table: sized for the largest context value
  # ((0x100 << lp) - (0x100 >> lc)), scaled by 3 << lc, plus 0x300 of
  # headroom for matched-literal indexing, plus one.
  literal_mask = (0x100 << @lp) - (0x100 >> @lc)
  highest_index = ((literal_mask * 3) << @lc) + 0x300
  @literal_models = build.call(highest_index + 1)

  # Match / rep decision tables.
  @is_match_models = build.call(NUM_STATES * @pos_states)
  @is_rep_models = build.call(NUM_STATES)
  @is_rep0_models = build.call(NUM_STATES)
  @is_rep1_models = build.call(NUM_STATES)
  @is_rep2_models = build.call(NUM_STATES)
  @is_rep0_long_models = build.call(NUM_STATES * @pos_states)
end
1110
+
1111
+ # Initialize SDK coders
1112
+ #
1113
+ # @return [void]
1114
def init_coders
  # Creates fresh literal, length, rep-length and distance coders for the
  # current pb value and records the matching pos_states count.
  state_count = 1 << @pb

  @literal_decoder = LiteralDecoder.new
  @length_coder = LengthCoder.new(state_count)
  @rep_length_coder = LengthCoder.new(state_count)
  @distance_coder = DistanceCoder.new(NUM_LEN_TO_POS_STATES)

  if ENV["TRACE_MODEL_INIT"]
    # Object identities, to tell freshly created models from reused ones.
    puts "[XzUtilsDecoder.init] slot_encoders len_state=0 object_id=#{@distance_coder.instance_variable_get(:@slot_encoders)[0].object_id}"
    puts "[XzUtilsDecoder.init] slot_encoders[0][1] object_id=#{@distance_coder.instance_variable_get(:@slot_encoders)[0][1].object_id}"
    puts "[XzUtilsDecoder.init] is_match_models object_id=#{@is_match_models.object_id}"
    puts "[XzUtilsDecoder.init] is_match_models[0] object_id=#{@is_match_models[0].object_id}"
  end

  # Keep the cached pos_states in step with the coders just built.
  @pos_states = state_count
end
1132
+
1133
+ # Reset distance coder probability models
1134
+ #
1135
+ # Called during state reset (control >= 0xA0) to reset the distance
1136
+ # coder's probability models to initial values. This matches XZ Utils
1137
+ # behavior where init_temporals resets all probability models.
1138
+ #
1139
+ # @return [void]
1140
def reset_distance_coder
  # Delegates to the distance coder, which resets its own probability
  # models; the coder's return value is passed through.
  @distance_coder.reset_models
end
1143
+
1144
+ # Decode a literal byte
1145
+ #
1146
+ # SDK decoding sequence (from LzmaDec.c):
1147
+ # 1. Calculate literal state
1148
+ # 2. Decode literal (matched or unmatched based on state)
1149
+ # 3. Update state machine
1150
+ # 4. Update dictionary and position
1151
+ #
1152
+ # XZ Utils dict_put pattern (from lz_decoder.h:270-276):
1153
+ # dict->buf[dict->pos++] = byte;
1154
+ # if (!dict->has_wrapped)
1155
+ # dict->full = dict->pos - LZ_DICT_INIT_POS;
1156
+ #
1157
+ # @return [void]
1158
def decode_literal
  # Decodes one literal byte and appends it to the dictionary.
  #
  # Steps: compute the literal state, decode the byte (matched against the
  # byte at distance rep0 when the state machine asks for it and the
  # dictionary is non-empty, unmatched otherwise), advance the state
  # machine, store the byte at @pos, then update @dict_full /
  # @has_wrapped and the per-chunk byte counter.
  #
  # Debug-path fixes in this version (decode logic itself is unchanged):
  # * LZMA_DEBUG_DECODE_LITERAL used caller(1..1).first.label; caller returns
  #   Strings, which have no #label, so enabling the trace raised
  #   NoMethodError. caller_locations is used instead.
  # * TRACE_ARM64_BYTES recorded @dict_buf[@pos] after @pos had been
  #   advanced, i.e. the next (unwritten) slot. It now records the slot
  #   that was just written.
  # * The ARM64 dump is written with puts, so $stdout is flushed rather
  #   than $stderr.
  old_dict_full = @dict_full

  if ENV["LZMA_DEBUG_DECODE_LITERAL"]
    caller_info = caller_locations(1..1).first
    XzUtilsDecoderDebug.debug_puts "DEBUG decode_literal[#{@decoder_id}]: pos=#{@pos}, dict_full=#{@dict_full}, from=#{caller_info&.label}"
  end

  # Optional integrity check: the previously written slot must not be nil.
  if ENV.fetch("LZMA_DEBUG_ARRAY", nil) && @dict_full.positive? && @pos > 1
    idx = @pos - 1
    if @dict_buf[idx].nil?
      raise "DEBUG before decode: @dict_buf[#{idx}] is nil! @pos=#{@pos}, @dict_full=#{@dict_full}, @dict_buf.size=#{@dict_buf.size}, nil_count=#{@dict_buf.count(nil)}"
    end
  end

  lit_state = calculate_literal_state

  if @dict_full == 61 && ENV["TRACE_LITERAL_61"]
    XzUtilsDecoderDebug.debug_puts "=== CALC_LITERAL_STATE at dict_full=61 ==="
    XzUtilsDecoderDebug.debug_puts " prev_byte=#{@dict_full.positive? ? @dict_buf[@pos - 1] : 0}"
    XzUtilsDecoderDebug.debug_puts " lit_state=#{lit_state}"
    XzUtilsDecoderDebug.debug_puts " lc=#{@lc}, lp=#{@lp}"
    XzUtilsDecoderDebug.debug_puts " state.value=#{@state.value}"
    XzUtilsDecoderDebug.debug_puts " use_matched_literal?=#{@state.use_matched_literal?}"
    XzUtilsDecoderDebug.debug_puts " dict_full.positive?=#{@dict_full.positive?}"
    puts
  end

  # Matched decoding needs a previous byte, hence the non-empty check.
  if @state.use_matched_literal? && @dict_full.positive?
    if @dict_full == 61 && ENV["TRACE_LITERAL_61"]
      XzUtilsDecoderDebug.debug_puts " TAKING MATCHED LITERAL PATH"
      XzUtilsDecoderDebug.debug_puts " rep0=#{@rep0}"
      match_byte_pos_calc = LZ_DICT_INIT_POS + @dict_full - @rep0 - 1
      XzUtilsDecoderDebug.debug_puts " match_byte_pos (calc)=#{match_byte_pos_calc}"
      puts
    end

    # The match byte sits rep0 + 1 bytes behind the current output position.
    # @dict_full counts output bytes from 0, so the buffer index is offset
    # by LZ_DICT_INIT_POS.
    # NOTE(review): once @has_wrapped is set, @dict_full stops growing while
    # this method keeps advancing @pos; confirm this index is still the
    # intended one in that situation.
    match_byte_pos = LZ_DICT_INIT_POS + @dict_full - @rep0 - 1
    match_byte = @dict_buf[match_byte_pos]
    if ENV["LZMA_DEBUG"]
      warn "DEBUG: matched literal - dict_full=#{@dict_full}, rep0=#{@rep0}, reading dict_buf[#{match_byte_pos}]=0x#{match_byte.to_s(16).upcase} ('#{match_byte.chr}'), lit_state=#{lit_state}, state=#{@state.value}"
    end
    byte = @literal_decoder.decode_matched(match_byte, lit_state, @lc,
                                           @range_decoder, @literal_models)

    if @dict_full == 61 && ENV["TRACE_LITERAL_61"]
      XzUtilsDecoderDebug.debug_puts " DECODED MATCHED LITERAL: 0x#{byte.to_s(16).upcase} ('#{byte.chr}')"
      XzUtilsDecoderDebug.debug_puts " match_byte=0x#{match_byte.to_s(16).upcase} ('#{match_byte.chr}')"
      puts
    end
  else
    if @dict_full == 61 && ENV["TRACE_LITERAL_61"]
      XzUtilsDecoderDebug.debug_puts " TAKING UNMATCHED LITERAL PATH"
      puts
    end

    if ENV["LZMA_DEBUG"]
      warn "DEBUG: calling decode_unmatched - pos=#{@pos}, lit_state=#{lit_state}"
    end
    byte = @literal_decoder.decode_unmatched(lit_state, @lc,
                                             @range_decoder, @literal_models)
  end

  if ENV["LZMA_DEBUG"]
    warn "DEBUG: decode_literal RETURNED - pos=#{@pos}, byte=#{'0x%02X' % byte} ('#{byte.chr}'), lit_state=#{lit_state}"
  end

  # Diagnostic for the byte stored right after the first 256 output bytes.
  if @dict_full == 256
    XzUtilsDecoderDebug.debug_puts "DEBUG: About to store 257th byte (pos=#{@pos}, dict_full=#{@dict_full})"
    XzUtilsDecoderDebug.debug_puts " byte.class=#{byte.class}"
    XzUtilsDecoderDebug.debug_puts " byte=#{byte.inspect}"
    XzUtilsDecoderDebug.debug_puts " byte.is_a?(Integer)=#{byte.is_a?(Integer)}"
    if byte.is_a?(Integer)
      XzUtilsDecoderDebug.debug_puts " byte value=#{byte}"
      XzUtilsDecoderDebug.debug_puts " Expected byte value=0"
    else
      XzUtilsDecoderDebug.debug_puts " byte is not an Integer!"
      XzUtilsDecoderDebug.debug_puts " byte.ord=#{byte.ord}"
    end
  end

  if ENV["LZMA_DEBUG"]
    decoded_bytes = @dict_full.positive? ? @dict_buf[LZ_DICT_INIT_POS..].map(&:chr).join : ""
    warn "DEBUG: decode_literal - pos=#{@pos}, byte=#{'0x%02X' % byte} ('#{byte.chr}'), state=#{@state.value}, dict_full=#{@dict_full}, decoded_so_far='#{decoded_bytes[-10..]}'"
  end

  # Detailed trace for output positions 230..265.
  if ENV["LZMA_DEBUG"] && @dict_full.between?(230, 265)
    expected = @dict_full % 256
    match = byte == expected ? "✓" : "✗ MISMATCH!"
    XzUtilsDecoderDebug.debug_puts " [LITERAL] dict_full=#{@dict_full}: 0x#{byte.to_s(16).upcase} (expected 0x#{expected.to_s(16).upcase}) #{match}"
    if @dict_full == 233
      byte_char = begin
        byte.chr
      rescue StandardError
        '?'
      end
      XzUtilsDecoderDebug.debug_puts " DETAILED TRACE at dict_full=233 (pos=#{@pos}):"
      XzUtilsDecoderDebug.debug_puts " byte=0x#{byte.to_s(16)} ('#{byte_char}')"
      XzUtilsDecoderDebug.debug_puts " state.value=#{@state.value}, lit_state=#{lit_state}"
      XzUtilsDecoderDebug.debug_puts " use_matched_literal?=#{@state.use_matched_literal?}"
      prev_byte_val = @dict_full.positive? ? @dict_buf[@pos - 1] : "N/A"
      prev_byte_desc =
        if prev_byte_val.is_a?(Integer)
          prev_char = begin
            prev_byte_val.chr
          rescue StandardError
            '?'
          end
          "0x#{prev_byte_val.to_s(16)} ('#{prev_char}')"
        else
          'N/A'
        end
      XzUtilsDecoderDebug.debug_puts " prev_byte=#{prev_byte_val.inspect} (#{prev_byte_desc})"
      XzUtilsDecoderDebug.debug_puts " range_decoder.range=0x#{@range_decoder.range.to_s(16)}, range_decoder.code=0x#{@range_decoder.code.to_s(16)}"
      XzUtilsDecoderDebug.debug_puts " input.pos=#{@input.pos}, input.size=#{@input.size}"
    end
    if @dict_full == 256
      XzUtilsDecoderDebug.debug_puts " pos=#{@pos}, lit_state=#{lit_state}, state.value=#{@state.value}"
      XzUtilsDecoderDebug.debug_puts " use_matched_literal?=#{@state.use_matched_literal?}"
    end
  end

  # Advance the state machine before the byte is stored.
  @state.update_literal
  warn "DEBUG: After update_literal - state=#{@state.value}" if ENV["LZMA_DEBUG"]

  # Hard-coded buffer index 832 (= 576 + 256), kept as written.
  if @pos == 576 + 256
    XzUtilsDecoderDebug.debug_puts "DEBUG: Storing byte at pos 832 (256th decoded byte)"
    XzUtilsDecoderDebug.debug_puts " byte.class=#{byte.class}"
    XzUtilsDecoderDebug.debug_puts " byte=#{byte}"
    XzUtilsDecoderDebug.debug_puts " byte.ord=#{byte.is_a?(String) ? byte.ord : 'N/A (not a string)'}"
    XzUtilsDecoderDebug.debug_puts " Integer value=#{byte.is_a?(Integer) ? byte : byte.ord}"
  end

  # dict_put: store the byte at the current position.
  @dict_buf[@pos] = byte

  if ENV["LZMA_DEBUG_ARRAY_WRITE"] && @dict_buf.size != (@dict_size + LZ_DICT_INIT_POS)
    XzUtilsDecoderDebug.debug_puts "DEBUG: Array expanded! pos=#{@pos}, byte=#{byte}, old_size=#{@dict_buf.size - 1}, new_size=#{@dict_buf.size}, decoder_id=#{@decoder_id}"
    XzUtilsDecoderDebug.debug_puts " Writing beyond original size caused expansion!"
  end
  if ENV["LZMA_DEBUG_ARRAY_WRITE"]
    XzUtilsDecoderDebug.debug_puts "DEBUG write[#{@decoder_id}]: pos=#{@pos}, byte=#{byte}, dict_buf.size=#{@dict_buf.size}, dict_buf.object_id=#{@dict_buf.object_id}, nil_count=#{@dict_buf.count(nil)}"
  end
  if ENV["LZMA_DEBUG_ARRAY"]
    # Verify the write landed and the previous slot is still populated.
    if @dict_buf[@pos] != byte
      raise "DEBUG after write: @dict_buf[#{@pos}] = #{@dict_buf[@pos].inspect}, expected #{byte}! nil_count=#{@dict_buf.count(nil)}"
    end
    if @dict_full.positive? && @pos > LZ_DICT_INIT_POS && @dict_buf[@pos - 1].nil?
      raise "DEBUG after write: @dict_buf[#{@pos - 1}] is nil! @pos=#{@pos}, @dict_full=#{@dict_full}"
    end
  end
  @pos += 1

  # Optional trace of the first 20 bytes written to the dictionary.
  if ENV["TRACE_ARM64_BYTES"]
    @arm64_trace ||= []
    if @arm64_trace.size < 20
      # @pos has already been advanced, so the byte just written is at @pos - 1.
      @arm64_trace << [@dict_full, @pos, byte.class, byte.is_a?(Integer) ? byte : byte.ord, @dict_buf[@pos - 1]]
      if @arm64_trace.size == 20
        puts "\n=== ARM64 BYTE TRACE (first 20 bytes) ==="
        puts "Decoder ID: #{@decoder_id}"
        @arm64_trace.each_with_index do |entry, i|
          df, p, _, val, stored = entry
          puts " [#{i + 1}] dict_full=#{df.to_s.rjust(6)}, pos=#{p.to_s.rjust(6)}, byte=#{val.to_s.rjust(3)} (0x#{val.to_s(16).upcase.rjust(2, '0')}) stored=#{stored.inspect}"
        end
        puts "=========================================\n"
        $stdout.flush
      end
    end
  end

  # Until the dictionary is full, @dict_full tracks the number of bytes
  # written; once it reaches @dict_size it is pinned there.
  unless @has_wrapped
    @dict_full = @pos - LZ_DICT_INIT_POS
    if @dict_full >= @dict_size
      @has_wrapped = true
      @dict_full = @dict_size
    end
  end

  # Count bytes produced in this chunk whenever a concrete uncompressed
  # size is known (the all-ones value means "unknown size").
  if @uncompressed_size != 0xFFFFFFFFFFFFFFFF
    @chunk_bytes_decoded += 1
  end

  if old_dict_full.between?(220, 230)
    XzUtilsDecoderDebug.debug_puts "\n=== decode_literal at dict_full=#{old_dict_full} ==="
    XzUtilsDecoderDebug.debug_puts " Decoded: 0x#{byte.to_s(16)} ('#{byte.chr}')"
    XzUtilsDecoderDebug.debug_puts " rep0/1/2/3=(#{@rep0},#{@rep1},#{@rep2},#{@rep3})"
  end
end
1380
+
1381
+ # Decode a match
1382
+ #
1383
+ # SDK decoding sequence:
1384
+ # 1. Decode is_rep bit
1385
+ # 2. If regular match:
1386
+ # - Decode match length using length coder
1387
+ # - Decode match distance using distance coder
1388
+ # 3. If rep match:
1389
+ # - Decode which rep distance to use (rep0/1/2/3)
1390
+ # - Decode rep match length
1391
+ # 4. Check for EOS marker
1392
+ # 5. Copy matched data from dictionary
1393
+ # 6. Update state machine and rep distances
1394
+ # 7. Update dictionary and position
1395
+ #
1396
+ # @return [Boolean] True if EOS marker detected, false otherwise
1397
def decode_match
  # Decodes the is_rep bit and dispatches to decode_regular_match or
  # decode_rep_match. Returns true only when decode_regular_match reports
  # an end-of-stream marker; otherwise returns false.
  #
  # Debug switches are read once per call. All tracing is written with
  # `warn` (stderr) so diagnostics never interleave with data a caller may
  # be writing to stdout.
  debug = ENV["LZMA_DEBUG"]
  trace_is_rep = ENV["TRACE_IS_REP"]
  deep_debug = debug && @dict_full == 227

  # Position state: the low @pb bits of the current buffer position.
  pos_state = @pos & ((1 << @pb) - 1)

  if deep_debug
    input_buffer = @range_decoder.instance_variable_get(:@input)
    warn "\n=== DEEP DEBUG at dict_full=227 ==="
    warn " State: #{@state.value}, pos_state=#{pos_state}"
    warn " Rep distances BEFORE: (#{@rep0},#{@rep1},#{@rep2},#{@rep3})"
    warn " Range decoder: range=0x#{@range_decoder.range.to_s(16).upcase}, code=0x#{@range_decoder.code.to_s(16).upcase}"
    warn " Input buffer: #{input_buffer ? input_buffer.size : 'nil'} bytes"
  end

  # The is_rep probability model is selected by the current state value.
  is_rep_model = @is_rep_models[@state.value]

  if trace_is_rep
    range_val = @range_decoder.range
    code_val = @range_decoder.code
    bound_calc = (range_val >> 11) * is_rep_model.probability
    warn "[XzUtilsDecoder.decode_match] Before is_rep: state.value=#{@state.value}"
    warn " is_rep_model.object_id=#{is_rep_model.object_id}, prob=#{is_rep_model.probability}"
    warn " range=#{range_val} (0x#{range_val.to_s(16)}), code=#{code_val} (0x#{code_val.to_s(16)})"
    warn " bound=(#{range_val} >> 11) * #{is_rep_model.probability} = #{bound_calc} (0x#{bound_calc.to_s(16)})"
    warn " code < bound? #{code_val < bound_calc}"
  end

  is_rep = @range_decoder.decode_bit(is_rep_model)

  if trace_is_rep
    range_val = @range_decoder.range
    code_val = @range_decoder.code
    warn "[XzUtilsDecoder.decode_match] Decoded is_rep=#{is_rep} with prob=#{is_rep_model.probability}"
    warn " After is_rep: range=#{range_val} (0x#{range_val.to_s(16)}), code=#{code_val} (0x#{code_val.to_s(16)})"
  end

  if deep_debug
    warn " Decoded is_rep bit: #{is_rep} (#{is_rep_model.probability})"
    warn " After is_rep: range=0x#{@range_decoder.range.to_s(16).upcase}, code=0x#{@range_decoder.code.to_s(16).upcase}"
  end

  if debug
    warn "DEBUG: decode_match START - is_rep=#{is_rep}, state.value=#{@state.value}, pos_state=#{pos_state}, rep0/1/2/3=(#{@rep0},#{@rep1},#{@rep2},#{@rep3})"
  end

  trace_kind = debug && @dict_full.between?(220, 240)

  if is_rep.zero?
    # Regular match: only this path can report an end-of-stream marker.
    warn "DEBUG pos #{@dict_full}: Regular match (not rep)" if trace_kind
    return true if decode_regular_match(pos_state)
  else
    # Rep match: reuses one of the four remembered distances.
    warn "DEBUG pos #{@dict_full}: Rep match (is_rep=1)" if trace_kind
    decode_rep_match(pos_state)
  end

  false # No EOS marker detected
end
1457
+
1458
+ # Decode a regular (non-rep) match
1459
+ #
1460
+ # XZ Utils dict_repeat pattern (from lz_decoder.h:203-263):
1461
+ # - Validate distance: dict->full > distance
1462
+ # - Calculate back = dict->pos - distance - 1
1463
+ # - If distance >= dict->pos: back += dict->size - LZ_DICT_REPEAT_MAX
1464
+ # - Copy bytes from back position
1465
+ # - Update dict->full if !has_wrapped
1466
+ #
1467
+ # @param pos_state [Integer] Position state
1468
+ # @return [Boolean] True if EOS marker detected, false otherwise
1469
def decode_regular_match(pos_state)
  # Decodes one regular (non-rep) match: length first, then distance.
  # On a valid distance it copies `length` bytes inside @dict_buf, advances
  # @pos / @dict_full / @chunk_bytes_decoded, updates the state machine and
  # rotates the rep distances. Returns true when the end-of-payload marker
  # was decoded (nothing is copied in that case), false otherwise.
  #
  # Raises Omnizip::DecompressionError for an invalid distance, for a
  # disallowed end-of-payload marker, or when the range decoder is not
  # finished at the marker.
  #
  # Debug switches are read once per call (not once per copied byte) and
  # every ENV-gated diagnostic goes to stderr. Nothing is printed unless a
  # debug variable is set.
  debug = ENV["LZMA_DEBUG"]
  debug_distance = ENV["LZMA_DEBUG_DISTANCE"]
  debug_array_write = ENV["LZMA_DEBUG_ARRAY_WRITE"]
  size_known = @uncompressed_size != 0xFFFFFFFFFFFFFFFF

  # Snapshot values used only for tracing.
  old_dict_full = @dict_full
  old_rep0 = @rep0
  old_state = @state.value
  wide_window = old_dict_full.between?(210, 230)
  narrow_window = old_dict_full.between?(220, 230)

  # Decode match length.
  length_encoded = @length_coder.decode(@range_decoder, pos_state)
  length = length_encoded + MATCH_LEN_MIN

  # Length state for the distance coder: len - MATCH_LEN_MIN, capped at
  # the last state (len=2→0, len=3→1, ... up to NUM_LEN_TO_POS_STATES - 1).
  len_state = [length_encoded, NUM_LEN_TO_POS_STATES - 1].min

  if wide_window
    XzUtilsDecoderDebug.debug_puts "\n=== decode_regular_match at dict_full=#{old_dict_full} ==="
    XzUtilsDecoderDebug.debug_puts " pos_state=#{pos_state}"
    XzUtilsDecoderDebug.debug_puts " state=#{old_state}"
    XzUtilsDecoderDebug.debug_puts " length_encoded=#{length_encoded} length=#{length}"
    XzUtilsDecoderDebug.debug_puts " len_state=#{len_state}"
    XzUtilsDecoderDebug.debug_puts " rep0_before=#{old_rep0}"
  end

  if debug_distance
    warn "[DISTANCE_DECODER] decode_regular_match at dict_full=#{old_dict_full}"
    warn "[DISTANCE_DECODER] pos_state=#{pos_state}"
    warn "[DISTANCE_DECODER] state=#{old_state}"
    warn "[DISTANCE_DECODER] length_encoded=#{length_encoded} length=#{length}"
    warn "[DISTANCE_DECODER] len_state=#{len_state}"
    warn "[DISTANCE_DECODER] rep0_before=#{old_rep0}"
  end

  if debug && narrow_window
    warn "DEBUG decode_regular_match at dict_full=#{old_dict_full}: length=#{length}"
  end

  # Decode match distance (0-based: 0 means "the previous byte").
  rep0 = @distance_coder.decode(@range_decoder, len_state)

  if debug && wide_window
    warn " rep0_decoded=#{rep0} (distance = #{rep0})"
    warn " buffer_back calculation: back=#{@dict_full - rep0 - 1}"
  end
  if debug && rep0 > 100000
    warn " [LARGE_DISTANCE at dict_full=#{old_dict_full}] rep0=#{rep0}, call count=#{$distance_decode_count || 'unknown'}"
  end

  # End-of-payload marker: distance == UINT32_MAX. It is checked before
  # distance validation and is accepted only when explicitly allowed or
  # when the uncompressed size is unknown.
  if rep0 == 0xFFFFFFFF
    unless @allow_eopm || !size_known
      raise Omnizip::DecompressionError,
            "End-of-payload marker (EOPM) detected but not allowed (LZMA2 streams cannot have EOPM)"
    end

    # Normalize first (may consume input), then require code == 0.
    @range_decoder.normalize
    unless @range_decoder.code.zero?
      raise Omnizip::DecompressionError,
            "EOPM detected but range decoder not finished (code=#{@range_decoder.code}). Corrupted stream."
    end

    return true # EOS marker detected, stop decoding
  end

  # A distance is valid only if it points at an already decoded byte
  # (dict_full > distance). The former second check against
  # @dict_size + @dict_full could never fire after this one and is gone.
  unless @dict_full > rep0
    raise Omnizip::DecompressionError,
          "Invalid distance: #{rep0} (dict_full: #{@dict_full})"
  end

  # Never produce more bytes than the declared size of this chunk.
  if size_known
    remaining = @uncompressed_size - @chunk_bytes_decoded
    if length > remaining
      if debug
        warn "DEBUG: Limiting match length from #{length} to #{remaining} (chunk_bytes_decoded=#{@chunk_bytes_decoded}, uncompressed_size=#{@uncompressed_size}, dict_full=#{@dict_full})"
      end
      length = remaining
    end
  end

  # Source offset of the match. Because rep0 < @dict_full is guaranteed
  # above, `back` is never negative and no wrap adjustment applies.
  # NOTE(review): once @has_wrapped is set, @dict_full stays at @dict_size,
  # so this offset stops following @pos unless @pos is reset elsewhere —
  # confirm against the code that manages the buffer.
  back = @dict_full - rep0 - 1
  buffer_back = back + LZ_DICT_INIT_POS

  if wide_window
    bytes_at_back = @dict_buf[buffer_back, 3]
    raw = bytes_at_back.is_a?(String) ? bytes_at_back.bytes : Array(bytes_at_back)
    # Unwritten slots may be nil; render them instead of raising.
    bytes_hex = raw.map { |b| b.nil? ? "--" : format("%02x", b) }.join(" ")
    XzUtilsDecoderDebug.debug_puts " buffer_back=#{buffer_back}, back=#{back}"
    XzUtilsDecoderDebug.debug_puts " First 3 bytes at buffer_back: #{bytes_hex} (#{bytes_at_back.inspect})"
  end

  if debug
    preview = (0..2).map do |offset|
      b = @dict_buf[buffer_back + offset]
      b ? "0x#{b.to_s(16).upcase} ('#{b.chr}')" : "nil (nil)"
    end
    warn "DEBUG: copy from buffer_back=#{buffer_back} (distance #{rep0}): #{preview.join(' ')}"
    warn "DEBUG: pos_before=#{@pos} (output #{@pos - LZ_DICT_INIT_POS}), len=#{length}, pos_after=#{@pos + length} (output #{@pos + length - LZ_DICT_INIT_POS})"
    warn "DEBUG: dict_buf size=#{@dict_buf.size}, allocated=#{@dict_size + 608}"
  end

  if debug && old_dict_full.between?(220, 260)
    warn " Copying #{length} bytes from buffer_back=#{buffer_back} to @pos=#{@pos}, dict_full=#{@dict_full}"
    warn " Source bytes: #{@dict_buf[buffer_back, length].inspect}"
    warn " First 5 target bytes before copy: #{@dict_buf[@pos, 5].inspect}"
  end

  # Byte-by-byte copy: source and destination may overlap (distance <
  # length), in which case freshly written bytes are read back again.
  expected_size = @dict_size + LZ_DICT_INIT_POS
  length.times do |i|
    byte = @dict_buf[buffer_back + i]
    if debug
      warn "DEBUG: copy iteration #{i}: reading dict_buf[#{buffer_back + i}]=0x#{byte.to_s(16).upcase} ('#{byte.chr}'), writing to dict_buf[#{@pos + i}]"
    end
    @dict_buf[@pos + i] = byte
    if debug_array_write && @dict_buf.size != expected_size
      XzUtilsDecoderDebug.debug_puts "DEBUG: Array expanded during copy! write_pos=#{@pos + i}, byte=#{byte}, old_size=#{@dict_buf.size - 1}, new_size=#{@dict_buf.size}, decoder_id=#{@decoder_id}"
    end
  end

  if debug && narrow_window
    warn " After copy: #{@dict_buf[@pos, length].inspect}"
  end

  # Update state and position.
  @state.update_match
  warn "DEBUG: After update_match - state=#{@state.value}" if debug
  @pos += length

  # dict_full follows the position until it reaches dict_size, then it is
  # pinned there and has_wrapped is set.
  unless @has_wrapped
    @dict_full = @pos - LZ_DICT_INIT_POS
    if @dict_full >= @dict_size
      @has_wrapped = true
      @dict_full = @dict_size
    end
  end

  # Bytes produced in this chunk; feeds the length limiting above.
  @chunk_bytes_decoded += length if size_known

  if debug
    warn "DEBUG: Before rotation - rep0/1/2/3=(#{@rep0},#{@rep1},#{@rep2},#{@rep3}), new distance=#{rep0}"
  end
  if debug && narrow_window
    warn "\n=== Rep rotation after match at dict_full=#{old_dict_full} ==="
    warn " Before: rep0/1/2/3=(#{@rep0},#{@rep1},#{@rep2},#{@rep3})"
    warn " Setting rep0 to: #{rep0.inspect}"
  end

  # Rotate rep distances: rep3←rep2, rep2←rep1, rep1←rep0, rep0←new.
  @rep3 = @rep2
  @rep2 = @rep1
  @rep1 = @rep0
  @rep0 = rep0

  if debug
    warn "DEBUG: After rotation - rep0/1/2/3=(#{@rep0},#{@rep1},#{@rep2},#{@rep3})"
  end
  if debug && narrow_window
    warn " After: rep0/1/2/3=(#{@rep0},#{@rep1},#{@rep2},#{@rep3})"
  end

  if narrow_window
    XzUtilsDecoderDebug.debug_puts " VERIFICATION: @rep0=#{@rep0.inspect}, @rep0.object_id=#{@rep0.object_id}"
  end

  if debug && old_dict_full.between?(56, 62)
    XzUtilsDecoderDebug.debug_puts " AFTER match (dict_full #{old_dict_full}→#{@dict_full}): range=0x#{@range_decoder.range.to_s(16).upcase}, code=0x#{@range_decoder.code.to_s(16).upcase}"
  end

  false # Not EOS, continue decoding
end
1730
+
1731
+ # Decode a rep match
1732
+ #
1733
+ # SDK rep match decoding (from XZ Utils lzma_decoder.c):
1734
+ # - is_rep0: Use rep0
1735
+ # - is_rep0_long=0: Short rep (length=1, don't rotate)
1736
+ # - is_rep0_long=1: Long rep (decode length, keep rep0)
1737
+ # - is_rep1: Use rep1, rotate rep1→rep0
1738
+ # - is_rep2: Use rep2, rotate rep2→rep0
1739
+ # - Otherwise: Use rep3, rotate rep3→rep0
1740
+ # After rotation, rep0 always contains the actual distance to use
1741
+ #
1742
+ # @param pos_state [Integer] Position state
1743
+ # @return [Boolean] Always false (rep matches are never EOS)
1744
def decode_rep_match(pos_state)
  # Decodes one rep match: selects which remembered distance to reuse
  # (rep0..rep3), rotates the rep registers so the chosen distance ends up
  # in @rep0, decodes the length (1 for a short rep), then copies the bytes
  # inside @dict_buf and advances @pos / @dict_full / @chunk_bytes_decoded.
  # Always returns false (a rep match is never an end-of-stream marker).
  #
  # Raises Omnizip::DecompressionError when the selected distance does not
  # point at already decoded data — the same error class the regular-match
  # path uses, so callers can handle both uniformly.
  #
  # All `puts`-style tracing is gated on LZMA_DEBUG and written to stderr;
  # without the variable set this method prints nothing itself.
  debug = ENV["LZMA_DEBUG"]
  size_known = @uncompressed_size != 0xFFFFFFFFFFFFFFFF

  # Snapshot values used only for tracing. @dict_full is not modified until
  # the end of the method, so the windows below hold for the whole call.
  old_dict_full = @dict_full
  old_rep0 = @rep0
  wide_window = old_dict_full.between?(210, 230)
  narrow_window = old_dict_full.between?(220, 230)
  trace = debug && narrow_window

  if debug
    warn "DEBUG: decode_rep_match START[#{@decoder_id}] - rep0/1/2/3=(#{@rep0},#{@rep1},#{@rep2},#{@rep3})"
  end

  if narrow_window
    XzUtilsDecoderDebug.debug_puts "\n=== decode_rep_match at dict_full=#{old_dict_full} (decoder_id=#{@decoder_id}) ==="
    XzUtilsDecoderDebug.debug_puts " At START: rep0/1/2/3=(#{@rep0},#{@rep1},#{@rep2},#{@rep3})"
    XzUtilsDecoderDebug.debug_puts " old_rep0=#{old_rep0} (captured @rep0)"
    XzUtilsDecoderDebug.debug_puts " @rep0.object_id=#{@rep0.object_id}"
  end

  # Decode which rep distance to use.
  is_rep0 = @range_decoder.decode_bit(@is_rep0_models[@state.value])

  if trace
    warn "DEBUG rep match selection at dict_full=#{@dict_full}: is_rep0=#{is_rep0}, rep0/1/2/3 before=(#{@rep0},#{@rep1},#{@rep2},#{@rep3})"
    warn " state.value=#{@state.value}, pos_state=#{pos_state}, model_index=#{(@state.value * (1 << @pb)) + pos_state}"
  end
  warn "DEBUG: decode_rep_match - is_rep0=#{is_rep0}" if debug

  if is_rep0.zero?
    # rep0 is reused as-is; one more bit separates short rep from long rep.
    warn "DEBUG rep match using rep0" if trace
    long_model = @is_rep0_long_models[(@state.value * (1 << @pb)) + pos_state]
    is_rep0_long = @range_decoder.decode_bit(long_model)
    warn "DEBUG: decode_rep_match - is_rep0_long=#{is_rep0_long}" if debug

    if is_rep0_long.zero?
      # Short rep: exactly one byte.
      length = 1
      @state.update_short_rep
    else
      # Long rep with rep0.
      length = @rep_length_coder.decode(@range_decoder, pos_state) + MATCH_LEN_MIN
      @state.update_rep
    end
  else
    warn "DEBUG rep match NOT using rep0 (is_rep0=#{is_rep0})" if trace

    is_rep1 = @range_decoder.decode_bit(@is_rep1_models[@state.value])

    if is_rep1.zero?
      # rep1 selected: swap it with rep0.
      @rep0, @rep1 = @rep1, @rep0
    else
      is_rep2 = @range_decoder.decode_bit(@is_rep2_models[@state.value])

      if is_rep2.zero?
        # rep2 selected; rep3 is left untouched.
        selected = @rep2
      else
        # rep3 selected; the old rep2 moves down into rep3.
        selected = @rep3
        @rep3 = @rep2
      end
      @rep2 = @rep1
      @rep1 = @rep0
      @rep0 = selected
    end

    # Decode length for rep1/2/3.
    length = @rep_length_coder.decode(@range_decoder, pos_state) + MATCH_LEN_MIN
    @state.update_rep
  end

  # After rotation, rep0 always holds the distance to use (0-based).
  distance = @rep0

  if trace
    warn "DEBUG rep match after rotation: dict_full=#{old_dict_full}, distance=#{distance}, rep0/1/2/3=(#{@rep0},#{@rep1},#{@rep2},#{@rep3})"
  end

  if wide_window
    XzUtilsDecoderDebug.debug_puts "\n=== decode_rep_match at dict_full=#{old_dict_full} ==="
    XzUtilsDecoderDebug.debug_puts " old_rep0=#{old_rep0}, new rep0=#{@rep0} (distance=#{distance})"
    XzUtilsDecoderDebug.debug_puts " pos_state=#{pos_state}"
  end

  if debug
    warn "DEBUG: decode_rep_match - length=#{length}, distance=#{distance}, dict_full=#{@dict_full}, rep0/1/2/3=(#{@rep0},#{@rep1},#{@rep2},#{@rep3})"
  end

  # A distance is valid only if it points at an already decoded byte
  # (dict_full > distance); distance 0 is fine once one byte exists.
  unless @dict_full > distance
    raise Omnizip::DecompressionError,
          "Invalid rep distance: #{distance} (dict_full: #{@dict_full})"
  end

  # Never produce more bytes than the declared size of this chunk.
  if size_known
    remaining = @uncompressed_size - @chunk_bytes_decoded
    if length > remaining
      if debug
        warn "DEBUG REP: Limiting rep match length from #{length} to #{remaining} (chunk_bytes_decoded=#{@chunk_bytes_decoded}, uncompressed_size=#{@uncompressed_size}, dict_full=#{@dict_full})"
      end
      length = remaining
    end
  end

  # Source offset of the match. Because distance < @dict_full is guaranteed
  # above, `back` is never negative and no wrap adjustment applies.
  # NOTE(review): once @has_wrapped is set, @dict_full stays at @dict_size,
  # so this offset stops following @pos unless @pos is reset elsewhere —
  # confirm against the code that manages the buffer.
  back = @dict_full - distance - 1
  buffer_back = back + LZ_DICT_INIT_POS

  if trace
    warn "DEBUG rep match copy at dict_full=#{@dict_full}: @dict_full=#{@dict_full}, distance=#{distance}, back=#{back}, buffer_back=#{buffer_back}"
  end

  if wide_window
    bytes_at_back = @dict_buf[buffer_back, 3]
    raw = bytes_at_back.is_a?(String) ? bytes_at_back.bytes : Array(bytes_at_back)
    # Unwritten slots may be nil; render them instead of raising.
    bytes_hex = raw.map { |b| b.nil? ? "--" : format("%02x", b) }.join(" ")
    XzUtilsDecoderDebug.debug_puts " back calculation: @dict_full=#{@dict_full}, distance=#{distance}"
    XzUtilsDecoderDebug.debug_puts " back=#{back}, wrapped_back=#{back}, buffer_back=#{buffer_back}"
    XzUtilsDecoderDebug.debug_puts " First 3 bytes at buffer_back: #{bytes_hex} (#{bytes_at_back.inspect})"
  end

  if debug
    # Tolerant formatters: debug output must never raise on nil/odd bytes.
    printable = ->(b) { b.is_a?(Integer) && b.between?(0, 255) ? b.chr : "?" }
    hex = ->(b) { b.is_a?(Integer) ? b.to_s(16) : b.inspect }

    if old_dict_full.between?(250, 260)
      source_val = @dict_buf[@pos - 1]
      warn " Rep match copy at dict_full=#{@dict_full}: length=#{length}, distance=#{distance}, @pos=#{@pos} (will write to #{@pos}...#{@pos + length - 1})"
      warn " Reading from @pos-1=#{@pos - 1}, source byte = #{source_val} (0x#{hex.call(source_val)} '#{printable.call(source_val)}')"
      warn " Before copy: @dict_buf[#{@pos}...#{@pos + length - 1}] = #{@dict_buf[@pos, length].inspect}"
    end

    if old_dict_full == 227 && length.positive?
      first_byte = @dict_buf[buffer_back]
      next_byte = @dict_buf[buffer_back + 1]
      prev_5 = if buffer_back > 4
                 Array(@dict_buf[(buffer_back - 5)..(buffer_back - 1)]).map do |b|
                   "0x#{hex.call(b).upcase} (#{printable.call(b)})"
                 end.join(", ")
               else
                 "N/A"
               end
      warn "DEBUG dict_copy at dict_full=227, i=0:"
      warn " buffer_back=#{buffer_back}, byte=#{first_byte} ('#{printable.call(first_byte)}')"
      warn " Writing to @pos=#{@pos}"
      warn " dict_buf[buffer_back...buffer_back+10] = #{@dict_buf[buffer_back, 10].inspect}"
      warn " dict_buf[buffer_back+1=#{buffer_back + 1}] = #{next_byte.inspect} ('#{printable.call(next_byte)}')"
      warn " Previous 5 bytes: [#{prev_5}]"
      warn " Current dict_full=#{@dict_full}, @pos=#{@pos}"
    end
  end

  # Byte-by-byte copy: source and destination may overlap (distance <
  # length), in which case freshly written bytes are read back again.
  length.times do |i|
    @dict_buf[@pos + i] = @dict_buf[buffer_back + i]
  end

  # Update position.
  @pos += length

  # dict_full follows the position until it reaches dict_size, then it is
  # pinned there and has_wrapped is set.
  unless @has_wrapped
    @dict_full = @pos - LZ_DICT_INIT_POS
    if @dict_full >= @dict_size
      @has_wrapped = true
      @dict_full = @dict_size
    end
  end

  # Bytes produced in this chunk; feeds the length limiting above.
  @chunk_bytes_decoded += length if size_known

  false # Rep matches are never EOS
end
1995
+
1996
+ # Calculate literal state index
1997
+ # XZ Utils literal_subcoder formula (from lzma_common.h:141-143):
1998
+ # ((probs) + 3 * (((((pos) << 8) + (prev_byte)) & (literal_mask)) << (lc)))
1999
+ # where literal_mask = (0x100 << lp) - (0x100 >> lc) (the literal_mask_calc macro)
2000
+ #
2001
+ # The key insight is that (pos << 8) + prev_byte is computed FIRST,
2002
+ # then masked, THEN shifted by lc. This is different from our old formula
2003
+ # which added pos_part and prev_part separately.
2004
+ #
2005
+ # IMPORTANT: The literal_subcoder macro returns:
2006
+ # probs + 3 * context_value_shifted
2007
+ # where context_value_shifted = context_value << lc
2008
+ #
2009
+ # For our implementation, we return context_value (unshifted) so that
2010
+ # the literal decoder can calculate the correct offset: 3 * context_value
2011
+ #
2012
+ # This creates (1 << (lc + lp)) unique contexts
2013
+ #
2014
+ # @return [Integer] Literal context value before the << lc shift (a multiple of 32 in 0..224 for lc=3, lp=0)
2015
def calculate_literal_state
  # Computes the literal context from the number of decoded bytes
  # (@dict_full) and the previously decoded byte:
  #
  #   ((@dict_full << 8) + prev_byte) & literal_mask
  #
  # The value is returned masked but NOT shifted by lc. For lc=3, lp=0 the
  # mask is 0xE0, so the result is a multiple of 32 in 0..224 (the top three
  # bits of prev_byte, still in place).
  #
  # The tracing switch is read once per call; this runs for every literal.
  trace = ENV["LZMA_DEBUG_CALC_STATE"]

  if trace && @dict_full == 8
    XzUtilsDecoderDebug.debug_puts "DEBUG before calc_state[#{@decoder_id}]: pos=#{@pos}, dict_full=#{@dict_full}"
    XzUtilsDecoderDebug.debug_puts " @dict_buf.object_id=#{@dict_buf.object_id}, size=#{@dict_buf.size}"
    XzUtilsDecoderDebug.debug_puts " Accessing index #{@pos - 1}: value=#{@dict_buf[@pos - 1].inspect}"
  end

  # Previous byte lives at @pos - 1; before anything is decoded it is 0.
  prev_byte = @dict_full.positive? ? @dict_buf[@pos - 1] : 0

  # Safeguard: a nil slot is treated as 0 unless LZMA_DEBUG_NIL_BYTE asks
  # for a hard failure with diagnostics.
  if prev_byte.nil?
    if ENV["LZMA_DEBUG_NIL_BYTE"]
      raise "DEBUG: prev_byte is nil! decoder_id=#{@decoder_id}, @pos=#{@pos}, @dict_full=#{@dict_full}, @dict_buf.size=#{@dict_buf&.size || 'nil'}, accessing index #{@pos - 1}, nil_count=#{@dict_buf&.count(nil) || 'N/A'}, @dict_buf.object_id=#{@dict_buf&.object_id || 'nil'}"
    end

    prev_byte = 0
  end

  if trace
    XzUtilsDecoderDebug.debug_puts "DEBUG calc_state[#{@decoder_id}]: pos=#{@pos}, dict_full=#{@dict_full}, @dict_buf.object_id=#{@dict_buf.object_id}, prev_byte=#{prev_byte}"
  end

  # literal_mask = (0x100 << lp) - (0x100 >> lc)
  # e.g. lc=3, lp=0: 256 - 32 = 224 (0xE0).
  literal_mask = (0x100 << @lp) - (0x100 >> @lc)

  # NOTE(review): the position bits come from @dict_full, which the match
  # and literal paths pin at @dict_size once @has_wrapped is set; for lp > 0
  # that would freeze the position part of the context — confirm whether
  # streams longer than the dictionary are handled elsewhere.
  ((@dict_full << 8) + prev_byte) & literal_mask
end
2053
+ end
2054
+ end
2055
+ end