omnizip 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (511) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +32 -0
  4. data/.rubocop_todo.yml +754 -0
  5. data/COPYING +502 -0
  6. data/Gemfile +17 -0
  7. data/LICENSE +12 -0
  8. data/README.adoc +1045 -0
  9. data/Rakefile +12 -0
  10. data/benchmark/README.md +260 -0
  11. data/benchmark/benchmark_suite.rb +125 -0
  12. data/benchmark/compression_bench.rb +181 -0
  13. data/benchmark/filter_bench.rb +180 -0
  14. data/benchmark/models/benchmark_result.rb +59 -0
  15. data/benchmark/models/comparison_result.rb +69 -0
  16. data/benchmark/profile_suite.rb +167 -0
  17. data/benchmark/reporter.rb +150 -0
  18. data/benchmark/run_benchmarks.rb +66 -0
  19. data/benchmark/test_data.rb +137 -0
  20. data/config/formats/rar3_spec.yml +91 -0
  21. data/config/formats/rar5_spec.yml +102 -0
  22. data/docs/.github/workflows/docs.yml +142 -0
  23. data/docs/.gitignore +21 -0
  24. data/docs/.lychee.toml +67 -0
  25. data/docs/Gemfile +13 -0
  26. data/docs/RAR_WRITE_SUPPORT.md +26 -0
  27. data/docs/README.md +101 -0
  28. data/docs/_config.yml +112 -0
  29. data/docs/assets/logo.svg +1 -0
  30. data/docs/assets/omnizip-logo.pdf +1540 -11
  31. data/docs/comparison/feature-matrix.adoc +694 -0
  32. data/docs/comparison/index.adoc +113 -0
  33. data/docs/comparison/vs-7zip.adoc +309 -0
  34. data/docs/comparison/vs-peazip.adoc +77 -0
  35. data/docs/comparison/vs-rubyzip.adoc +342 -0
  36. data/docs/comparison/vs-winrar.adoc +100 -0
  37. data/docs/compatibility.adoc +579 -0
  38. data/docs/concepts/index.adoc +129 -0
  39. data/docs/developer/architecture.adoc +256 -0
  40. data/docs/developer/contributing.adoc +158 -0
  41. data/docs/developer/index.adoc +25 -0
  42. data/docs/developer/testing.adoc +212 -0
  43. data/docs/getting-started/basic-usage.adoc +271 -0
  44. data/docs/getting-started/index.adoc +42 -0
  45. data/docs/getting-started/installation.adoc +138 -0
  46. data/docs/getting-started/quick-start.adoc +185 -0
  47. data/docs/getting-started/your-first-archive.adoc +218 -0
  48. data/docs/guides/advanced-features/encryption.adoc +300 -0
  49. data/docs/guides/advanced-features/index.adoc +49 -0
  50. data/docs/guides/advanced-features/parallel-processing.adoc +246 -0
  51. data/docs/guides/advanced-features/progress-tracking.adoc +320 -0
  52. data/docs/guides/advanced-features/streaming.adoc +212 -0
  53. data/docs/guides/archive-formats/gzip-format.adoc +107 -0
  54. data/docs/guides/archive-formats/index.adoc +130 -0
  55. data/docs/guides/archive-formats/rar-format.adoc +104 -0
  56. data/docs/guides/archive-formats/rar5.adoc +521 -0
  57. data/docs/guides/archive-formats/seven-zip-format.adoc +35 -0
  58. data/docs/guides/archive-formats/tar-format.adoc +106 -0
  59. data/docs/guides/archive-formats/xz-format.adoc +118 -0
  60. data/docs/guides/archive-formats/zip-format.adoc +35 -0
  61. data/docs/guides/compression-algorithms/bzip2.adoc +113 -0
  62. data/docs/guides/compression-algorithms/deflate.adoc +319 -0
  63. data/docs/guides/compression-algorithms/index.adoc +190 -0
  64. data/docs/guides/compression-algorithms/lzma.adoc +398 -0
  65. data/docs/guides/compression-algorithms/lzma2.adoc +327 -0
  66. data/docs/guides/compression-algorithms/ppmd.adoc +316 -0
  67. data/docs/guides/compression-algorithms/zstandard.adoc +361 -0
  68. data/docs/guides/creating-archives.adoc +354 -0
  69. data/docs/guides/extracting-archives.adoc +53 -0
  70. data/docs/guides/format-conversion.adoc +64 -0
  71. data/docs/guides/index.adoc +49 -0
  72. data/docs/guides/migration-rubyzip.adoc +217 -0
  73. data/docs/guides/parity-archives.adoc +605 -0
  74. data/docs/guides/performance-tuning.adoc +88 -0
  75. data/docs/index.adoc +218 -0
  76. data/docs/lychee.toml +67 -0
  77. data/docs/reference/api/overview.adoc +188 -0
  78. data/docs/reference/cli/compress-command.adoc +114 -0
  79. data/docs/reference/cli/overview.adoc +140 -0
  80. data/docs/reference/index.adoc +26 -0
  81. data/docs/resources/faq.adoc +185 -0
  82. data/docs/resources/quick-reference.adoc +222 -0
  83. data/docs/troubleshooting/index.adoc +208 -0
  84. data/examples/api_comparison.rb +205 -0
  85. data/examples/deflate64_example.rb +96 -0
  86. data/examples/par2_demo.rb +121 -0
  87. data/examples/quick_start_native.rb +150 -0
  88. data/examples/quick_start_rubyzip.rb +115 -0
  89. data/examples/rubyzip_compatibility_demo.rb +194 -0
  90. data/exe/omnizip +27 -0
  91. data/lib/omnizip/algorithm.rb +130 -0
  92. data/lib/omnizip/algorithm_registry.rb +86 -0
  93. data/lib/omnizip/algorithms/.keep +0 -0
  94. data/lib/omnizip/algorithms/bzip2/bwt.rb +225 -0
  95. data/lib/omnizip/algorithms/bzip2/decoder.rb +193 -0
  96. data/lib/omnizip/algorithms/bzip2/encoder.rb +237 -0
  97. data/lib/omnizip/algorithms/bzip2/huffman.rb +206 -0
  98. data/lib/omnizip/algorithms/bzip2/mtf.rb +101 -0
  99. data/lib/omnizip/algorithms/bzip2/rle.rb +151 -0
  100. data/lib/omnizip/algorithms/bzip2.rb +130 -0
  101. data/lib/omnizip/algorithms/deflate/constants.rb +28 -0
  102. data/lib/omnizip/algorithms/deflate/decoder.rb +38 -0
  103. data/lib/omnizip/algorithms/deflate/encoder.rb +46 -0
  104. data/lib/omnizip/algorithms/deflate.rb +128 -0
  105. data/lib/omnizip/algorithms/deflate64/constants.rb +45 -0
  106. data/lib/omnizip/algorithms/deflate64/decoder.rb +153 -0
  107. data/lib/omnizip/algorithms/deflate64/encoder.rb +98 -0
  108. data/lib/omnizip/algorithms/deflate64/huffman_coder.rb +354 -0
  109. data/lib/omnizip/algorithms/deflate64/lz77_encoder.rb +142 -0
  110. data/lib/omnizip/algorithms/deflate64.rb +109 -0
  111. data/lib/omnizip/algorithms/lzma/bit_model.rb +120 -0
  112. data/lib/omnizip/algorithms/lzma/constants.rb +112 -0
  113. data/lib/omnizip/algorithms/lzma/decoder.rb +148 -0
  114. data/lib/omnizip/algorithms/lzma/dictionary.rb +69 -0
  115. data/lib/omnizip/algorithms/lzma/distance_coder.rb +415 -0
  116. data/lib/omnizip/algorithms/lzma/encoder.rb +142 -0
  117. data/lib/omnizip/algorithms/lzma/length_coder.rb +260 -0
  118. data/lib/omnizip/algorithms/lzma/literal_decoder.rb +320 -0
  119. data/lib/omnizip/algorithms/lzma/literal_encoder.rb +210 -0
  120. data/lib/omnizip/algorithms/lzma/lzip_decoder.rb +341 -0
  121. data/lib/omnizip/algorithms/lzma/lzma_alone_decoder.rb +192 -0
  122. data/lib/omnizip/algorithms/lzma/lzma_state.rb +128 -0
  123. data/lib/omnizip/algorithms/lzma/match.rb +32 -0
  124. data/lib/omnizip/algorithms/lzma/match_finder.rb +205 -0
  125. data/lib/omnizip/algorithms/lzma/match_finder_config.rb +142 -0
  126. data/lib/omnizip/algorithms/lzma/match_finder_factory.rb +88 -0
  127. data/lib/omnizip/algorithms/lzma/optimal_encoder.rb +130 -0
  128. data/lib/omnizip/algorithms/lzma/probability_models.rb +72 -0
  129. data/lib/omnizip/algorithms/lzma/range_coder.rb +85 -0
  130. data/lib/omnizip/algorithms/lzma/range_decoder.rb +434 -0
  131. data/lib/omnizip/algorithms/lzma/range_encoder.rb +194 -0
  132. data/lib/omnizip/algorithms/lzma/state.rb +127 -0
  133. data/lib/omnizip/algorithms/lzma/xz_buffered_range_encoder.rb +325 -0
  134. data/lib/omnizip/algorithms/lzma/xz_encoder.rb +426 -0
  135. data/lib/omnizip/algorithms/lzma/xz_encoder_fast.rb +645 -0
  136. data/lib/omnizip/algorithms/lzma/xz_match_finder_adapter.rb +227 -0
  137. data/lib/omnizip/algorithms/lzma/xz_price_calculator.rb +169 -0
  138. data/lib/omnizip/algorithms/lzma/xz_probability_models.rb +261 -0
  139. data/lib/omnizip/algorithms/lzma/xz_range_encoder.rb +223 -0
  140. data/lib/omnizip/algorithms/lzma/xz_range_encoder_exact.rb +331 -0
  141. data/lib/omnizip/algorithms/lzma/xz_state.rb +116 -0
  142. data/lib/omnizip/algorithms/lzma/xz_utils_decoder.rb +2055 -0
  143. data/lib/omnizip/algorithms/lzma.rb +238 -0
  144. data/lib/omnizip/algorithms/lzma2/chunk_manager.rb +182 -0
  145. data/lib/omnizip/algorithms/lzma2/constants.rb +41 -0
  146. data/lib/omnizip/algorithms/lzma2/encoder.rb +147 -0
  147. data/lib/omnizip/algorithms/lzma2/lzma2_chunk.rb +161 -0
  148. data/lib/omnizip/algorithms/lzma2/properties.rb +179 -0
  149. data/lib/omnizip/algorithms/lzma2/simple_lzma2_encoder.rb +127 -0
  150. data/lib/omnizip/algorithms/lzma2/xz_encoder_adapter.rb +85 -0
  151. data/lib/omnizip/algorithms/lzma2.rb +141 -0
  152. data/lib/omnizip/algorithms/ppmd7/constants.rb +74 -0
  153. data/lib/omnizip/algorithms/ppmd7/context.rb +154 -0
  154. data/lib/omnizip/algorithms/ppmd7/decoder.rb +126 -0
  155. data/lib/omnizip/algorithms/ppmd7/encoder.rb +163 -0
  156. data/lib/omnizip/algorithms/ppmd7/model.rb +248 -0
  157. data/lib/omnizip/algorithms/ppmd7/symbol_state.rb +57 -0
  158. data/lib/omnizip/algorithms/ppmd7.rb +116 -0
  159. data/lib/omnizip/algorithms/ppmd8/constants.rb +61 -0
  160. data/lib/omnizip/algorithms/ppmd8/context.rb +34 -0
  161. data/lib/omnizip/algorithms/ppmd8/decoder.rb +107 -0
  162. data/lib/omnizip/algorithms/ppmd8/encoder.rb +138 -0
  163. data/lib/omnizip/algorithms/ppmd8/model.rb +250 -0
  164. data/lib/omnizip/algorithms/ppmd8/restoration_method.rb +78 -0
  165. data/lib/omnizip/algorithms/ppmd8.rb +82 -0
  166. data/lib/omnizip/algorithms/ppmd_base.rb +138 -0
  167. data/lib/omnizip/algorithms/sevenzip_lzma2.rb +123 -0
  168. data/lib/omnizip/algorithms/xz_lzma2.rb +118 -0
  169. data/lib/omnizip/algorithms/zstandard/constants.rb +25 -0
  170. data/lib/omnizip/algorithms/zstandard/decoder.rb +46 -0
  171. data/lib/omnizip/algorithms/zstandard/encoder.rb +51 -0
  172. data/lib/omnizip/algorithms/zstandard.rb +138 -0
  173. data/lib/omnizip/buffer/memory_archive.rb +251 -0
  174. data/lib/omnizip/buffer/memory_extractor.rb +224 -0
  175. data/lib/omnizip/buffer.rb +176 -0
  176. data/lib/omnizip/checksum_registry.rb +114 -0
  177. data/lib/omnizip/checksums/crc32.rb +100 -0
  178. data/lib/omnizip/checksums/crc64.rb +101 -0
  179. data/lib/omnizip/checksums/crc_base.rb +158 -0
  180. data/lib/omnizip/checksums/verifier.rb +131 -0
  181. data/lib/omnizip/chunked/memory_manager.rb +194 -0
  182. data/lib/omnizip/chunked/reader.rb +78 -0
  183. data/lib/omnizip/chunked/writer.rb +120 -0
  184. data/lib/omnizip/chunked.rb +129 -0
  185. data/lib/omnizip/cli/output_formatter.rb +104 -0
  186. data/lib/omnizip/cli.rb +572 -0
  187. data/lib/omnizip/commands/.keep +0 -0
  188. data/lib/omnizip/commands/archive_create_command.rb +427 -0
  189. data/lib/omnizip/commands/archive_extract_command.rb +272 -0
  190. data/lib/omnizip/commands/archive_list_command.rb +218 -0
  191. data/lib/omnizip/commands/archive_repair_command.rb +131 -0
  192. data/lib/omnizip/commands/archive_verify_command.rb +117 -0
  193. data/lib/omnizip/commands/compress_command.rb +117 -0
  194. data/lib/omnizip/commands/decompress_command.rb +120 -0
  195. data/lib/omnizip/commands/list_command.rb +53 -0
  196. data/lib/omnizip/commands/metadata_command.rb +153 -0
  197. data/lib/omnizip/commands/parity_create_command.rb +122 -0
  198. data/lib/omnizip/commands/parity_repair_command.rb +122 -0
  199. data/lib/omnizip/commands/parity_verify_command.rb +124 -0
  200. data/lib/omnizip/commands/profile_list_command.rb +56 -0
  201. data/lib/omnizip/commands/profile_show_command.rb +44 -0
  202. data/lib/omnizip/convenience.rb +359 -0
  203. data/lib/omnizip/converter/conversion_registry.rb +49 -0
  204. data/lib/omnizip/converter/conversion_strategy.rb +121 -0
  205. data/lib/omnizip/converter/seven_zip_to_zip_strategy.rb +97 -0
  206. data/lib/omnizip/converter/zip_to_seven_zip_strategy.rb +112 -0
  207. data/lib/omnizip/converter.rb +105 -0
  208. data/lib/omnizip/crypto/aes256/cipher.rb +100 -0
  209. data/lib/omnizip/crypto/aes256/constants.rb +28 -0
  210. data/lib/omnizip/crypto/aes256/key_derivation.rb +101 -0
  211. data/lib/omnizip/crypto/aes256.rb +102 -0
  212. data/lib/omnizip/error.rb +106 -0
  213. data/lib/omnizip/eta/exponential_smoothing_estimator.rb +98 -0
  214. data/lib/omnizip/eta/moving_average_estimator.rb +99 -0
  215. data/lib/omnizip/eta/rate_calculator.rb +104 -0
  216. data/lib/omnizip/eta/sample_history.rb +143 -0
  217. data/lib/omnizip/eta/time_estimator.rb +106 -0
  218. data/lib/omnizip/eta.rb +63 -0
  219. data/lib/omnizip/extraction/filter_chain.rb +177 -0
  220. data/lib/omnizip/extraction/glob_pattern.rb +140 -0
  221. data/lib/omnizip/extraction/pattern_matcher.rb +70 -0
  222. data/lib/omnizip/extraction/predicate_pattern.rb +52 -0
  223. data/lib/omnizip/extraction/regex_pattern.rb +50 -0
  224. data/lib/omnizip/extraction/selective_extractor.rb +240 -0
  225. data/lib/omnizip/extraction.rb +111 -0
  226. data/lib/omnizip/file_type/mime_classifier.rb +144 -0
  227. data/lib/omnizip/file_type.rb +113 -0
  228. data/lib/omnizip/filter.rb +139 -0
  229. data/lib/omnizip/filter_pipeline.rb +108 -0
  230. data/lib/omnizip/filter_registry.rb +166 -0
  231. data/lib/omnizip/filters/bcj.rb +279 -0
  232. data/lib/omnizip/filters/bcj2/constants.rb +53 -0
  233. data/lib/omnizip/filters/bcj2/decoder.rb +200 -0
  234. data/lib/omnizip/filters/bcj2/encoder.rb +61 -0
  235. data/lib/omnizip/filters/bcj2/stream_data.rb +93 -0
  236. data/lib/omnizip/filters/bcj2.rb +99 -0
  237. data/lib/omnizip/filters/bcj_arm.rb +176 -0
  238. data/lib/omnizip/filters/bcj_arm64.rb +244 -0
  239. data/lib/omnizip/filters/bcj_ia64.rb +196 -0
  240. data/lib/omnizip/filters/bcj_ppc.rb +190 -0
  241. data/lib/omnizip/filters/bcj_sparc.rb +176 -0
  242. data/lib/omnizip/filters/bcj_x86.rb +193 -0
  243. data/lib/omnizip/filters/delta.rb +196 -0
  244. data/lib/omnizip/filters/filter_base.rb +72 -0
  245. data/lib/omnizip/filters/registry.rb +123 -0
  246. data/lib/omnizip/filters/xz_delta.rb +258 -0
  247. data/lib/omnizip/format_detector.rb +162 -0
  248. data/lib/omnizip/format_registry.rb +59 -0
  249. data/lib/omnizip/formats/.keep +0 -0
  250. data/lib/omnizip/formats/bzip2_file.rb +172 -0
  251. data/lib/omnizip/formats/cpio/constants.rb +55 -0
  252. data/lib/omnizip/formats/cpio/entry.rb +385 -0
  253. data/lib/omnizip/formats/cpio/reader.rb +196 -0
  254. data/lib/omnizip/formats/cpio/writer.rb +234 -0
  255. data/lib/omnizip/formats/cpio.rb +140 -0
  256. data/lib/omnizip/formats/format_spec_loader.rb +230 -0
  257. data/lib/omnizip/formats/gzip.rb +238 -0
  258. data/lib/omnizip/formats/iso/directory_builder.rb +297 -0
  259. data/lib/omnizip/formats/iso/directory_record.rb +152 -0
  260. data/lib/omnizip/formats/iso/joliet.rb +204 -0
  261. data/lib/omnizip/formats/iso/path_table.rb +125 -0
  262. data/lib/omnizip/formats/iso/reader.rb +197 -0
  263. data/lib/omnizip/formats/iso/rock_ridge.rb +349 -0
  264. data/lib/omnizip/formats/iso/volume_builder.rb +320 -0
  265. data/lib/omnizip/formats/iso/volume_descriptor.rb +168 -0
  266. data/lib/omnizip/formats/iso/writer.rb +530 -0
  267. data/lib/omnizip/formats/iso.rb +140 -0
  268. data/lib/omnizip/formats/lzip.rb +175 -0
  269. data/lib/omnizip/formats/lzma_alone.rb +171 -0
  270. data/lib/omnizip/formats/rar/archive_repairer.rb +243 -0
  271. data/lib/omnizip/formats/rar/archive_verifier.rb +195 -0
  272. data/lib/omnizip/formats/rar/block_parser.rb +243 -0
  273. data/lib/omnizip/formats/rar/compression/bit_stream.rb +180 -0
  274. data/lib/omnizip/formats/rar/compression/dispatcher.rb +217 -0
  275. data/lib/omnizip/formats/rar/compression/lz77_huffman/decoder.rb +216 -0
  276. data/lib/omnizip/formats/rar/compression/lz77_huffman/encoder.rb +158 -0
  277. data/lib/omnizip/formats/rar/compression/lz77_huffman/huffman_builder.rb +217 -0
  278. data/lib/omnizip/formats/rar/compression/lz77_huffman/huffman_coder.rb +189 -0
  279. data/lib/omnizip/formats/rar/compression/lz77_huffman/match_finder.rb +135 -0
  280. data/lib/omnizip/formats/rar/compression/lz77_huffman/sliding_window.rb +165 -0
  281. data/lib/omnizip/formats/rar/compression/ppmd/context.rb +105 -0
  282. data/lib/omnizip/formats/rar/compression/ppmd/decoder.rb +219 -0
  283. data/lib/omnizip/formats/rar/compression/ppmd/encoder.rb +262 -0
  284. data/lib/omnizip/formats/rar/compression_method_registry.rb +106 -0
  285. data/lib/omnizip/formats/rar/constants.rb +82 -0
  286. data/lib/omnizip/formats/rar/decompressor.rb +238 -0
  287. data/lib/omnizip/formats/rar/external_writer.rb +312 -0
  288. data/lib/omnizip/formats/rar/header.rb +192 -0
  289. data/lib/omnizip/formats/rar/license_validator.rb +109 -0
  290. data/lib/omnizip/formats/rar/models/rar_archive.rb +77 -0
  291. data/lib/omnizip/formats/rar/models/rar_entry.rb +65 -0
  292. data/lib/omnizip/formats/rar/models/rar_volume.rb +56 -0
  293. data/lib/omnizip/formats/rar/parity_handler.rb +292 -0
  294. data/lib/omnizip/formats/rar/rar5/compression/lzma.rb +202 -0
  295. data/lib/omnizip/formats/rar/rar5/compression/lzss.rb +578 -0
  296. data/lib/omnizip/formats/rar/rar5/compression/store.rb +60 -0
  297. data/lib/omnizip/formats/rar/rar5/crc32.rb +39 -0
  298. data/lib/omnizip/formats/rar/rar5/encryption/aes256_cbc.rb +97 -0
  299. data/lib/omnizip/formats/rar/rar5/encryption/encryption_header.rb +114 -0
  300. data/lib/omnizip/formats/rar/rar5/encryption/encryption_manager.rb +166 -0
  301. data/lib/omnizip/formats/rar/rar5/encryption/key_derivation.rb +97 -0
  302. data/lib/omnizip/formats/rar/rar5/header.rb +187 -0
  303. data/lib/omnizip/formats/rar/rar5/models/encryption_options.rb +74 -0
  304. data/lib/omnizip/formats/rar/rar5/models/recovery_options.rb +63 -0
  305. data/lib/omnizip/formats/rar/rar5/models/solid_options.rb +63 -0
  306. data/lib/omnizip/formats/rar/rar5/models/volume_options.rb +74 -0
  307. data/lib/omnizip/formats/rar/rar5/multi_volume/ARCHITECTURE.md +290 -0
  308. data/lib/omnizip/formats/rar/rar5/multi_volume/volume_manager.rb +264 -0
  309. data/lib/omnizip/formats/rar/rar5/multi_volume/volume_splitter.rb +155 -0
  310. data/lib/omnizip/formats/rar/rar5/multi_volume/volume_writer.rb +194 -0
  311. data/lib/omnizip/formats/rar/rar5/solid/solid_encoder.rb +109 -0
  312. data/lib/omnizip/formats/rar/rar5/solid/solid_manager.rb +142 -0
  313. data/lib/omnizip/formats/rar/rar5/solid/solid_stream.rb +121 -0
  314. data/lib/omnizip/formats/rar/rar5/vint.rb +65 -0
  315. data/lib/omnizip/formats/rar/rar5/writer.rb +466 -0
  316. data/lib/omnizip/formats/rar/rar_format_base.rb +241 -0
  317. data/lib/omnizip/formats/rar/reader.rb +366 -0
  318. data/lib/omnizip/formats/rar/recovery_record.rb +245 -0
  319. data/lib/omnizip/formats/rar/volume_manager.rb +168 -0
  320. data/lib/omnizip/formats/rar/writer.rb +431 -0
  321. data/lib/omnizip/formats/rar.rb +205 -0
  322. data/lib/omnizip/formats/rar3/compressor.rb +73 -0
  323. data/lib/omnizip/formats/rar3/decompressor.rb +66 -0
  324. data/lib/omnizip/formats/rar3/reader.rb +386 -0
  325. data/lib/omnizip/formats/rar3/writer.rb +219 -0
  326. data/lib/omnizip/formats/rar5/compressor.rb +73 -0
  327. data/lib/omnizip/formats/rar5/decompressor.rb +66 -0
  328. data/lib/omnizip/formats/rar5/reader.rb +342 -0
  329. data/lib/omnizip/formats/rar5/writer.rb +214 -0
  330. data/lib/omnizip/formats/seven_zip/coder_chain.rb +150 -0
  331. data/lib/omnizip/formats/seven_zip/constants.rb +126 -0
  332. data/lib/omnizip/formats/seven_zip/encoded_header.rb +114 -0
  333. data/lib/omnizip/formats/seven_zip/encrypted_header.rb +142 -0
  334. data/lib/omnizip/formats/seven_zip/file_collector.rb +144 -0
  335. data/lib/omnizip/formats/seven_zip/header.rb +106 -0
  336. data/lib/omnizip/formats/seven_zip/header_encryptor.rb +134 -0
  337. data/lib/omnizip/formats/seven_zip/header_writer.rb +466 -0
  338. data/lib/omnizip/formats/seven_zip/models/coder_info.rb +30 -0
  339. data/lib/omnizip/formats/seven_zip/models/file_entry.rb +58 -0
  340. data/lib/omnizip/formats/seven_zip/models/folder.rb +69 -0
  341. data/lib/omnizip/formats/seven_zip/models/stream_info.rb +42 -0
  342. data/lib/omnizip/formats/seven_zip/parser.rb +660 -0
  343. data/lib/omnizip/formats/seven_zip/reader.rb +458 -0
  344. data/lib/omnizip/formats/seven_zip/split_archive_reader.rb +632 -0
  345. data/lib/omnizip/formats/seven_zip/split_archive_writer.rb +315 -0
  346. data/lib/omnizip/formats/seven_zip/stream_compressor.rb +151 -0
  347. data/lib/omnizip/formats/seven_zip/stream_decompressor.rb +162 -0
  348. data/lib/omnizip/formats/seven_zip/writer.rb +740 -0
  349. data/lib/omnizip/formats/seven_zip.rb +93 -0
  350. data/lib/omnizip/formats/tar/constants.rb +73 -0
  351. data/lib/omnizip/formats/tar/entry.rb +94 -0
  352. data/lib/omnizip/formats/tar/header.rb +168 -0
  353. data/lib/omnizip/formats/tar/reader.rb +121 -0
  354. data/lib/omnizip/formats/tar/writer.rb +216 -0
  355. data/lib/omnizip/formats/tar.rb +84 -0
  356. data/lib/omnizip/formats/xz/reader.rb +116 -0
  357. data/lib/omnizip/formats/xz.rb +237 -0
  358. data/lib/omnizip/formats/xz_impl/block_decoder.rb +754 -0
  359. data/lib/omnizip/formats/xz_impl/block_encoder.rb +306 -0
  360. data/lib/omnizip/formats/xz_impl/block_header.rb +210 -0
  361. data/lib/omnizip/formats/xz_impl/block_header_parser.rb +186 -0
  362. data/lib/omnizip/formats/xz_impl/constants.rb +49 -0
  363. data/lib/omnizip/formats/xz_impl/index_decoder.rb +174 -0
  364. data/lib/omnizip/formats/xz_impl/index_encoder.rb +122 -0
  365. data/lib/omnizip/formats/xz_impl/stream_decoder.rb +468 -0
  366. data/lib/omnizip/formats/xz_impl/stream_encoder.rb +99 -0
  367. data/lib/omnizip/formats/xz_impl/stream_footer.rb +81 -0
  368. data/lib/omnizip/formats/xz_impl/stream_footer_parser.rb +117 -0
  369. data/lib/omnizip/formats/xz_impl/stream_header.rb +55 -0
  370. data/lib/omnizip/formats/xz_impl/stream_header_parser.rb +108 -0
  371. data/lib/omnizip/formats/xz_impl/vli.rb +128 -0
  372. data/lib/omnizip/formats/xz_impl/writer.rb +421 -0
  373. data/lib/omnizip/formats/zip/central_directory_header.rb +195 -0
  374. data/lib/omnizip/formats/zip/constants.rb +69 -0
  375. data/lib/omnizip/formats/zip/end_of_central_directory.rb +133 -0
  376. data/lib/omnizip/formats/zip/local_file_header.rb +138 -0
  377. data/lib/omnizip/formats/zip/reader.rb +250 -0
  378. data/lib/omnizip/formats/zip/unix_extra_field.rb +153 -0
  379. data/lib/omnizip/formats/zip/writer.rb +375 -0
  380. data/lib/omnizip/formats/zip/zip64_end_of_central_directory.rb +104 -0
  381. data/lib/omnizip/formats/zip/zip64_end_of_central_directory_locator.rb +66 -0
  382. data/lib/omnizip/formats/zip/zip64_extra_field.rb +114 -0
  383. data/lib/omnizip/formats/zip.rb +50 -0
  384. data/lib/omnizip/implementations/base/lzma2_decoder_base.rb +75 -0
  385. data/lib/omnizip/implementations/base/lzma2_encoder_base.rb +128 -0
  386. data/lib/omnizip/implementations/base/lzma_decoder_base.rb +83 -0
  387. data/lib/omnizip/implementations/base/lzma_encoder_base.rb +108 -0
  388. data/lib/omnizip/implementations/base/state_machine_base.rb +182 -0
  389. data/lib/omnizip/implementations/seven_zip/lzma/decoder.rb +421 -0
  390. data/lib/omnizip/implementations/seven_zip/lzma/encoder.rb +465 -0
  391. data/lib/omnizip/implementations/seven_zip/lzma/match_finder.rb +288 -0
  392. data/lib/omnizip/implementations/seven_zip/lzma/range_decoder.rb +200 -0
  393. data/lib/omnizip/implementations/seven_zip/lzma/range_encoder.rb +197 -0
  394. data/lib/omnizip/implementations/seven_zip/lzma/state_machine.rb +141 -0
  395. data/lib/omnizip/implementations/seven_zip/lzma2/encoder.rb +519 -0
  396. data/lib/omnizip/implementations/xz_utils/lzma2/decoder.rb +723 -0
  397. data/lib/omnizip/implementations/xz_utils/lzma2/encoder.rb +750 -0
  398. data/lib/omnizip/io/buffered_input.rb +146 -0
  399. data/lib/omnizip/io/buffered_output.rb +105 -0
  400. data/lib/omnizip/io/stream_manager.rb +115 -0
  401. data/lib/omnizip/link_handler/hard_link.rb +79 -0
  402. data/lib/omnizip/link_handler/symbolic_link.rb +74 -0
  403. data/lib/omnizip/link_handler.rb +124 -0
  404. data/lib/omnizip/metadata/archive_metadata.rb +114 -0
  405. data/lib/omnizip/metadata/entry_metadata.rb +146 -0
  406. data/lib/omnizip/metadata/metadata_editor.rb +171 -0
  407. data/lib/omnizip/metadata/metadata_registry.rb +64 -0
  408. data/lib/omnizip/metadata/metadata_validator.rb +99 -0
  409. data/lib/omnizip/metadata.rb +57 -0
  410. data/lib/omnizip/models/.keep +0 -0
  411. data/lib/omnizip/models/algorithm_metadata.rb +73 -0
  412. data/lib/omnizip/models/compression_options.rb +71 -0
  413. data/lib/omnizip/models/conversion_options.rb +87 -0
  414. data/lib/omnizip/models/conversion_result.rb +135 -0
  415. data/lib/omnizip/models/eta_result.rb +46 -0
  416. data/lib/omnizip/models/extraction_rule.rb +115 -0
  417. data/lib/omnizip/models/filter_chain.rb +144 -0
  418. data/lib/omnizip/models/filter_config.rb +183 -0
  419. data/lib/omnizip/models/match_result.rb +124 -0
  420. data/lib/omnizip/models/optimization_suggestion.rb +91 -0
  421. data/lib/omnizip/models/parallel_options.rb +104 -0
  422. data/lib/omnizip/models/performance_result.rb +79 -0
  423. data/lib/omnizip/models/profile_report.rb +82 -0
  424. data/lib/omnizip/models/progress_options.rb +38 -0
  425. data/lib/omnizip/models/split_options.rb +116 -0
  426. data/lib/omnizip/optimization_registry.rb +81 -0
  427. data/lib/omnizip/parallel/job_queue.rb +209 -0
  428. data/lib/omnizip/parallel/job_scheduler.rb +203 -0
  429. data/lib/omnizip/parallel/parallel_compressor.rb +347 -0
  430. data/lib/omnizip/parallel/parallel_extractor.rb +329 -0
  431. data/lib/omnizip/parallel/worker_pool.rb +223 -0
  432. data/lib/omnizip/parallel.rb +149 -0
  433. data/lib/omnizip/parity/chunked_block_processor.rb +196 -0
  434. data/lib/omnizip/parity/galois16.rb +145 -0
  435. data/lib/omnizip/parity/models/creator_packet.rb +73 -0
  436. data/lib/omnizip/parity/models/file_description_packet.rb +133 -0
  437. data/lib/omnizip/parity/models/ifsc_packet.rb +123 -0
  438. data/lib/omnizip/parity/models/main_packet.rb +128 -0
  439. data/lib/omnizip/parity/models/packet.rb +156 -0
  440. data/lib/omnizip/parity/models/packet_registry.rb +109 -0
  441. data/lib/omnizip/parity/models/recovery_slice_packet.rb +78 -0
  442. data/lib/omnizip/parity/par2_creator.rb +531 -0
  443. data/lib/omnizip/parity/par2_repairer.rb +407 -0
  444. data/lib/omnizip/parity/par2_verifier.rb +364 -0
  445. data/lib/omnizip/parity/par2cmdline_algorithm.rb +110 -0
  446. data/lib/omnizip/parity/par2cmdline_coefficients.rb +78 -0
  447. data/lib/omnizip/parity/reed_solomon_decoder.rb +266 -0
  448. data/lib/omnizip/parity/reed_solomon_encoder.rb +111 -0
  449. data/lib/omnizip/parity/reed_solomon_matrix.rb +342 -0
  450. data/lib/omnizip/parity.rb +186 -0
  451. data/lib/omnizip/password/encryption_registry.rb +65 -0
  452. data/lib/omnizip/password/encryption_strategy.rb +96 -0
  453. data/lib/omnizip/password/password_validator.rb +129 -0
  454. data/lib/omnizip/password/winzip_aes_strategy.rb +192 -0
  455. data/lib/omnizip/password/zip_crypto_strategy.rb +141 -0
  456. data/lib/omnizip/password.rb +87 -0
  457. data/lib/omnizip/pipe/stream_compressor.rb +124 -0
  458. data/lib/omnizip/pipe/stream_decompressor.rb +174 -0
  459. data/lib/omnizip/pipe.rb +121 -0
  460. data/lib/omnizip/platform/ntfs_streams.rb +201 -0
  461. data/lib/omnizip/platform.rb +189 -0
  462. data/lib/omnizip/profile/archive_profile.rb +39 -0
  463. data/lib/omnizip/profile/balanced_profile.rb +33 -0
  464. data/lib/omnizip/profile/binary_profile.rb +36 -0
  465. data/lib/omnizip/profile/compression_profile.rb +158 -0
  466. data/lib/omnizip/profile/custom_profile.rb +157 -0
  467. data/lib/omnizip/profile/fast_profile.rb +33 -0
  468. data/lib/omnizip/profile/maximum_profile.rb +33 -0
  469. data/lib/omnizip/profile/profile_detector.rb +110 -0
  470. data/lib/omnizip/profile/profile_registry.rb +161 -0
  471. data/lib/omnizip/profile/text_profile.rb +36 -0
  472. data/lib/omnizip/profile.rb +190 -0
  473. data/lib/omnizip/profiler/memory_profiler.rb +66 -0
  474. data/lib/omnizip/profiler/method_profiler.rb +49 -0
  475. data/lib/omnizip/profiler/report_generator.rb +169 -0
  476. data/lib/omnizip/profiler.rb +204 -0
  477. data/lib/omnizip/progress/callback_reporter.rb +36 -0
  478. data/lib/omnizip/progress/console_reporter.rb +62 -0
  479. data/lib/omnizip/progress/log_reporter.rb +91 -0
  480. data/lib/omnizip/progress/operation_progress.rb +118 -0
  481. data/lib/omnizip/progress/progress_bar.rb +156 -0
  482. data/lib/omnizip/progress/progress_reporter.rb +40 -0
  483. data/lib/omnizip/progress/progress_tracker.rb +190 -0
  484. data/lib/omnizip/progress/silent_reporter.rb +24 -0
  485. data/lib/omnizip/progress.rb +127 -0
  486. data/lib/omnizip/rubyzip_compat.rb +63 -0
  487. data/lib/omnizip/temp/safe_extract.rb +168 -0
  488. data/lib/omnizip/temp/temp_file.rb +124 -0
  489. data/lib/omnizip/temp/temp_file_pool.rb +109 -0
  490. data/lib/omnizip/temp.rb +181 -0
  491. data/lib/omnizip/version.rb +5 -0
  492. data/lib/omnizip/zip/entry.rb +156 -0
  493. data/lib/omnizip/zip/file.rb +485 -0
  494. data/lib/omnizip/zip/input_stream.rb +273 -0
  495. data/lib/omnizip/zip/output_stream.rb +324 -0
  496. data/lib/omnizip.rb +156 -0
  497. data/readme-docs/advanced-features.adoc +515 -0
  498. data/readme-docs/api-usage.adoc +444 -0
  499. data/readme-docs/architecture.adoc +449 -0
  500. data/readme-docs/archive-formats.adoc +479 -0
  501. data/readme-docs/cli-usage.adoc +222 -0
  502. data/readme-docs/compression-algorithms.adoc +442 -0
  503. data/readme-docs/compression-profiles.adoc +247 -0
  504. data/readme-docs/encryption-checksums.adoc +328 -0
  505. data/readme-docs/format-converter.adoc +325 -0
  506. data/readme-docs/installation.adoc +228 -0
  507. data/readme-docs/par2-archives.adoc +608 -0
  508. data/readme-docs/performance-profiler.adoc +389 -0
  509. data/readme-docs/preprocessing-filters.adoc +280 -0
  510. data/xz-file-format-1.2.1.txt +1174 -0
  511. metadata +617 -0
@@ -0,0 +1,750 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Copyright (C) 2025 Ribose Inc.
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a
6
+ # copy of this software and associated documentation files (the "Software"),
7
+ # to deal in the Software without restriction, including without limitation
8
+ # the rights to use, copy, modify, merge, publish, distribute, sublicense,
9
+ # and/or sell copies of the Software, and to permit persons to whom the
10
+ # Software is furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in
13
+ # all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
+ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21
+ # DEALINGS IN THE SOFTWARE.
22
+
23
+ require "stringio"
24
+ require_relative "../../base/lzma2_encoder_base"
25
+ require_relative "../../../algorithms/lzma"
26
+ require_relative "../../../algorithms/lzma2/constants"
27
+ require_relative "../../../algorithms/lzma2/lzma2_chunk"
28
+ require_relative "../../../algorithms/lzma2/properties"
29
+ require_relative "../../../algorithms/lzma/dictionary"
30
+ require_relative "../../../algorithms/lzma/lzma_state"
31
+ require_relative "../../../algorithms/lzma/xz_probability_models"
32
+ require_relative "../../../algorithms/lzma/match_finder"
33
+ require_relative "../../../algorithms/lzma/optimal_encoder"
34
+ require_relative "../../../algorithms/lzma/xz_range_encoder_exact"
35
+ require_relative "../../../algorithms/lzma/constants"
36
+
37
+ module Omnizip
38
+ module Implementations
39
+ module XZUtils
40
+ module LZMA2
41
+ # XZ Utils LZMA2 encoder implementation.
42
+ #
43
+ # This is the original XzLZMA2Encoder moved from algorithms/lzma2/xz_lzma2_encoder.rb
44
+ # to the new namespace structure.
45
+ #
46
+ # Ported from XZ Utils liblzma/lzma2_encoder.c
47
+ #
48
# Byte-oriented slicing shim used when copying range-encoder output.
#
# When String responds to #byteslice (the branch expected on current
# Rubies) the call is simply forwarded. Otherwise the slice is rebuilt from
# the byte array, yielding an empty string when the range is out of bounds.
module StringCompat
  class << self
    if String.method_defined?(:byteslice)
      # @param string [String] source string
      # @param start [Integer] byte offset of the first byte to copy
      # @param length [Integer] number of bytes to copy
      # @return [String, nil] whatever String#byteslice returns
      def byteslice(string, start, length)
        string.byteslice(start, length)
      end
    else
      # @param string [String] source string
      # @param start [Integer] byte offset of the first byte to copy
      # @param length [Integer] number of bytes to copy
      # @return [String] the selected bytes, or "" when nothing is selected
      def byteslice(string, start, length)
        picked = string.bytes[start, length]
        picked ? picked.pack("C*") : ""
      end
    end
  end
end
60
+
61
+ # Constants
62
+ UINT32_MAX = 0xFFFFFFFF
63
+ REPS = 4
64
+
65
+ # XZ Utils LZMA2 encoder.
66
+ class Encoder < Base::LZMA2EncoderBase
67
+ include Omnizip::Algorithms::LZMA::Constants
68
+
69
+ # XZ Utils limits (from lzma2_encoder.h)
70
+ # Maximum UNCOMPRESSED size per chunk: 2MB
71
+ UNCOMPRESSED_MAX = 1 << 21 # 2,097,152 bytes
72
+ # Maximum COMPRESSED size per chunk: 64KB
73
+ COMPRESSED_MAX = 1 << 16 # 65,536 bytes
74
+
75
# Build an encoder together with the state it keeps between chunks.
#
# @param options [Hash] Encoding options
# @option options [Integer] :dict_size Dictionary size (default: 8MB)
# @option options [Integer] :lc Literal context bits (default: 3)
# @option options [Integer] :lp Literal position bits (default: 0)
# @option options [Integer] :pb Position bits (default: 2)
# @option options [Boolean] :standalone If true, write property byte at start (default: true)
def initialize(options = {})
  settings = {
    dict_size: options.fetch(:dict_size, 8 * 1024 * 1024),
    lc: options.fetch(:lc, 3),
    lp: options.fetch(:lp, 0),
    pb: options.fetch(:pb, 2),
    standalone: options.fetch(:standalone, true),
  }

  super(**settings)

  # Coding state that is carried from one chunk to the next.
  lzma = Omnizip::Algorithms::LZMA
  @dictionary = lzma::Dictionary.new(settings[:dict_size])
  @state = lzma::LZMAState.new(0)
  @models = lzma::XzProbabilityModels.new(*settings.values_at(:lc, :lp, :pb))
  @match_finder = lzma::MatchFinder.new(@dictionary)
  @optimal = lzma::OptimalEncoder.new(mode: :fast)

  # Byte preceding the current position; feeds the literal context.
  @prev_byte = 0

  # Chunk-header flags. The very first chunk always asks for a dictionary
  # reset; the other two start out cleared.
  @need_properties = false
  @need_state_reset = false
  @need_dictionary_reset = true
end
114
+
115
# Compress +input_data+ into a complete LZMA2 stream.
#
# The input is consumed in slices of at most UNCOMPRESSED_MAX bytes, each
# slice becoming one chunk. When @standalone is set, the stream starts with
# the dictionary-size property byte. A 0x00 end marker is always appended.
#
# @param input_data [String] bytes to compress
# @return [String] binary string holding the encoded stream
def encode(input_data)
  # Every encoding session starts with a fresh match finder.
  @match_finder.reset

  stream = StringIO.new
  stream.set_encoding(Encoding::BINARY)

  # Only standalone streams carry the dictionary-size byte up front.
  stream.putc(encode_dict_size(@dict_size)) if @standalone

  source = StringIO.new(input_data)
  source.set_encoding(Encoding::BINARY)

  until source.eof?
    piece = source.read(UNCOMPRESSED_MAX)
    break if piece.nil? || piece.empty?

    stream.write(encode_chunk(piece).to_bytes)

    # All three header flags are cleared once a chunk has been written.
    # NOTE(review): encode_chunk sets @need_state_reset after emitting an
    # uncompressed chunk, and it is cleared again right here, so the flag
    # never reaches the following chunk -- confirm this is intended.
    @need_properties = false
    @need_state_reset = false
    @need_dictionary_reset = false
  end

  # The end marker is written regardless of @standalone, which only
  # controls the leading property byte.
  stream.write([0x00].pack("C"))

  stream.string
end
155
+
156
# Names the encoder implementation provided by this class.
#
# @return [Symbol] always :xz_utils
def implementation_name
  :xz_utils
end
162
+
163
+ private
164
+
165
# Turn one slice of input into a single LZMA2 chunk.
#
# The slice is always run through try_compress first. The compressed form
# is kept only when its payload is strictly smaller than the raw payload
# (headers are not part of the comparison); otherwise the slice is stored
# uncompressed. Afterwards the slice is appended to the dictionary and
# @prev_byte is set to its last byte.
#
# @param uncompressed_data [String] raw bytes of this chunk
# @return [Omnizip::Algorithms::LZMA2::LZMA2Chunk] the chunk to serialise
def encode_chunk(uncompressed_data)
  compressed = try_compress(uncompressed_data)
  chunk_class = Omnizip::Algorithms::LZMA2::LZMA2Chunk

  chunk =
    if compressed.bytesize < uncompressed_data.bytesize
      # Compression helped. compressed_size is the full encoder output,
      # flush bytes included, and the lc/lp/pb properties byte is always
      # supplied: (pb * 5 + lp) * 9 + lc.
      chunk_class.new(
        chunk_type: :compressed,
        uncompressed_data: uncompressed_data,
        compressed_data: compressed,
        compressed_size: compressed.bytesize,
        properties: (((@pb * 5) + @lp) * 9) + @lc,
        need_dict_reset: @need_dictionary_reset,
        need_state_reset: @need_state_reset,
        need_props: true,
      )
    else
      # Compression did not help: store the bytes as they are.
      stored = chunk_class.new(
        chunk_type: :uncompressed,
        uncompressed_data: uncompressed_data,
        compressed_data: "",
        need_dict_reset: @need_dictionary_reset,
        need_state_reset: false,
        need_props: false,
      )
      # Ask for a state reset on the chunk that follows a stored one.
      @need_state_reset = true
      stored
    end

  # The dictionary is extended once per chunk, whichever form was chosen.
  @dictionary.append(uncompressed_data)

  last_index = uncompressed_data.bytesize - 1
  @prev_byte = uncompressed_data.getbyte(last_index) if last_index >= 0

  chunk
end
218
+
219
# Run the LZMA symbol encoder over +data+ and return the raw range-coder
# output, flush bytes included.
#
# Note that this updates @state, @models and @prev_byte (through the
# encode_* helpers) and feeds the match finder, whether or not the caller
# ends up using the result.
#
# @param data [String] bytes of the current chunk
# @return [String] binary range-encoder output for the chunk
def try_compress(data)
  sink = StringIO.new
  sink.set_encoding(Encoding::BINARY)
  encoder = Omnizip::Algorithms::LZMA::XzRangeEncoder.new(sink)

  # Hand the whole chunk to the match finder before encoding anything.
  @match_finder.feed(data)

  # Prime the match finder up to (dictionary size + chunk size - 2),
  # clamped at zero so tiny inputs do not produce a negative position.
  prime_margin = 2
  prime_until = @dictionary.buffer.bytesize + data.bytesize - prime_margin
  @match_finder.skip(prime_until.negative? ? 0 : prime_until)

  # Offset of this chunk inside the match finder buffer; also kept in an
  # ivar because encode_literal needs it for matched literals.
  start_pos = @dictionary.buffer.bytesize
  @current_start_pos = start_pos

  pos = 0
  until pos >= data.bytesize
    # Drain the symbol queue before it grows too large.
    encode_queued_symbols(encoder, sink) if encoder.count > 20

    match_pos = start_pos + pos
    @match_finder.find_matches(match_pos)

    distance, length = @optimal.find_optimal(
      match_pos, @match_finder, @state, @state.reps, @models
    )

    puts "[DEBUG] pos=#{pos} distance=#{distance} length=#{length} state=#{@state.value} reps=#{@state.reps.inspect}" if ENV["DEBUG"]

    # UINT32_MAX (not zero) marks a literal: distance 0 means rep0.
    if distance == UINT32_MAX || length == 1
      encode_literal(data.getbyte(pos), encoder, pos)
      pos += 1
      next
    end

    if distance < REPS
      # Values 0..3 select rep0..rep3.
      encode_repeated_match(distance, length, encoder, pos, match_pos)
    else
      # Anything else is a fresh match, stored as distance + REPS.
      encode_match(distance - REPS, length, encoder, pos, match_pos, data)
    end
    pos += length
  end

  # Pending symbols go out first, then the flush symbols are queued and
  # written as well.
  encode_queued_symbols(encoder, sink)
  encoder.queue_flush
  encode_queued_symbols(encoder, sink)

  full_output = sink.string

  puts "[DEBUG] try_compress: full_output.size=#{full_output.bytesize}, encoder.out_total=#{encoder.out_total}" if ENV["DEBUG_FLUSH"]

  full_output
end
314
+
315
# Drain the range encoder's queued symbols into +output+.
#
# This method used to be defined twice in a row. The first copy used its
# scratch buffer and position counter before creating them and called
# encode_symbols twice; it only stayed harmless because the second copy
# replaced it. A single definition remains.
#
# @param encoder [#count, #encode_symbols] range encoder holding queued symbols
# @param output [StringIO] binary stream that receives the encoded bytes
# @return [Integer, nil] number of bytes appended to +output+, or nil when
#   the encoder had nothing queued
def encode_queued_symbols(encoder, output)
  return if encoder.count.zero? # rubocop:disable Style/CollectionQuerying

  # Scratch area the encoder writes into; out_pos reports how much of it
  # was actually filled.
  capacity = 10_000
  scratch = "\0" * capacity
  out_pos = Omnizip::Algorithms::LZMA::IntRef.new(0)

  size_before = output.size

  encoder.encode_symbols(scratch, out_pos, capacity)

  # Copy exactly the bytes that were produced, using a byte-exact slice.
  if out_pos.value.positive?
    output.write(StringCompat.byteslice(scratch, 0, out_pos.value))
  end

  output.size - size_before
end
367
+
368
# Queue the symbols for a single literal byte.
#
# The is_match bit and the literal context are taken from the state as it
# is before the literal transition; the matched/normal decision is also
# made before @state.update_literal! runs.
#
# @param symbol [Integer] literal byte (0-255)
# @param encoder [#queue_bit] range encoder receiving the bits
# @param pos [Integer] offset of the byte inside the current chunk
def encode_literal(symbol, encoder, pos)
  pos_state = pos & ((1 << @pb) - 1)

  # is_match = 0 announces a literal.
  encoder.queue_bit(@models.is_match[@state.value][pos_state], 0)

  literal_offset = get_literal_state(pos, @prev_byte)

  # Decide on the coding path first, then advance the state.
  use_matched = @state.use_matched_literal?
  @state.update_literal!

  match_byte = nil
  if use_matched
    # Byte found rep0 + 1 positions back in the match finder buffer.
    source_index = @current_start_pos + pos - @state.reps[0] - 1
    window = @match_finder.buffer
    in_window = source_index >= 0 && source_index < window.bytesize
    match_byte = window.getbyte(source_index) if in_window
  end

  if match_byte.nil?
    # Plain 8-bit tree; also the fallback when no match byte is available.
    encode_normal_literal(literal_offset, symbol, encoder)
  else
    encode_matched_literal(literal_offset, match_byte, symbol, encoder)
  end

  @prev_byte = symbol
end
411
+
412
# Queue the symbols for a normal (non-repeated) match.
#
# The is_match and is_rep bits use the state as it is on entry. The state
# is then advanced with update_match! before the length and distance are
# queued.
#
# @param distance [Integer] match distance as passed by try_compress
#   (the optimal encoder's value minus REPS)
# @param length [Integer] match length
# @param encoder [#queue_bit] range encoder receiving the bits
# @param pos [Integer] offset of the match inside the current chunk
# @param match_pos [Integer] offset of the match inside the match finder buffer
# @param _input_data [String] unused
def encode_match(distance, length, encoder, pos, match_pos, _input_data)
  pos_state = pos & ((1 << @pb) - 1)

  # is_match = 1, is_rep = 0.
  encoder.queue_bit(@models.is_match[@state.value][pos_state], 1)
  encoder.queue_bit(@models.is_rep[@state.value], 0)

  @state.update_match!(distance)

  encode_match_length(length, pos_state, encoder)
  encode_distance(distance, length, encoder)

  # The literal context for the next symbol is the last byte covered by
  # this match. It is read at the match's own end position instead of
  # being derived from the distance: the earlier
  # `match_pos - distance + length - 1` only hits the right byte under one
  # distance convention, and encode_literal adds 1 to reps[0] when it looks
  # back, which suggests distances here are zero-based.
  last_byte_pos = match_pos + length - 1
  window = @match_finder.buffer
  @prev_byte = window.getbyte(last_byte_pos) if last_byte_pos >= 0 && last_byte_pos < window.bytesize
end
439
+
440
# Queue the symbols for a repeated match (one that reuses rep0..rep3).
# Ported from the XZ Utils rep_match function.
#
# For rep1..rep3 the selected distance is moved to the front of the rep
# list. It is read BEFORE the other slots are shifted down; reading it
# afterwards (as the code did previously) picks up a slot that has already
# been overwritten and promotes the wrong distance.
#
# NOTE(review): lengths are queued through encode_match_length, which uses
# @models.match_len_encoder. XZ Utils uses a separate length coder for
# repeated matches -- confirm whether that applies here.
#
# @param rep [Integer] index of the repeated distance (0-3)
# @param length [Integer] match length; 1 means a short rep
# @param encoder [#queue_bit] range encoder receiving the bits
# @param pos [Integer] offset of the match inside the current chunk
# @param match_pos [Integer] offset of the match inside the match finder buffer
# @raise [RuntimeError] when the selected rep slot holds nil
def encode_repeated_match(rep, length, encoder, pos, match_pos)
  pos_state = pos & ((1 << @pb) - 1)

  # is_match = 1, is_rep = 1; both use the state as it is on entry.
  encoder.queue_bit(@models.is_match[@state.value][pos_state], 1)
  encoder.queue_bit(@models.is_rep[@state.value], 1)

  prob_is_rep0 = @models.is_rep0[@state.value]
  if rep.zero?
    # rep0: the rep list stays as it is.
    encoder.queue_bit(prob_is_rep0, 0)

    prob_is_rep0_long = @models.is_rep0_long[@state.value][pos_state]
    encoder.queue_bit(prob_is_rep0_long, length == 1 ? 0 : 1)
  else
    # Capture the chosen distance while the rep list is still intact.
    distance = @state.reps[rep]
    if distance.nil?
      raise "Distance is nil for rep #{rep}, reps=#{@state.reps.inspect}"
    end

    encoder.queue_bit(prob_is_rep0, 1)

    prob_is_rep1 = @models.is_rep1[@state.value]
    if rep == 1
      encoder.queue_bit(prob_is_rep1, 0)
    else
      encoder.queue_bit(prob_is_rep1, 1)
      encoder.queue_bit(@models.is_rep2[@state.value], rep - 2)

      # Shift the older slots down, oldest first.
      @state.reps[3] = @state.reps[2] if rep == 3
      @state.reps[2] = @state.reps[1]
    end

    @state.reps[1] = @state.reps[0]
    @state.reps[0] = distance
  end

  if length == 1
    @state.update_short_rep!
  else
    encode_match_length(length, pos_state, encoder)
    @state.update_long_rep!
  end

  # The literal context for the next symbol is the last byte covered by
  # this match, read at the match's own end position so that it does not
  # depend on how distances are stored in the rep list.
  last_byte_pos = match_pos + length - 1
  window = @match_finder.buffer
  @prev_byte = window.getbyte(last_byte_pos) if last_byte_pos >= 0 && last_byte_pos < window.bytesize
end
513
+
514
# Compute the base index of a literal subcoder in the flat literal
# probability array.
#
# Ported from the XZ Utils literal_subcoder macro in lzma_common.h:
#   #define literal_subcoder(probs, lc, literal_mask, pos, prev_byte) \
#       ((probs) + UINT32_C(3) * \
#           (((((pos) << 8) + (prev_byte)) & (literal_mask)) << (lc)))
# where literal_mask = (0x100 << lp) - (0x100 >> lc).
#
# The factor of 3 from the macro is applied here; the previous version left
# it out. Each subcoder spans 0x300 entries (encode_matched_literal indexes
# up to offset + 0x2FF), so without the factor neighbouring contexts would
# sit only 0x100 apart and share probability slots.
#
# @param pos [Integer] position of the literal
# @param prev_byte [Integer] byte preceding the literal (0-255)
# @return [Integer] base offset into the flat literal array
def get_literal_state(pos, prev_byte)
  literal_mask = (0x100 << @lp) - (0x100 >> @lc)
  context = (((pos << 8) + prev_byte) & literal_mask) << @lc
  3 * context
end
528
+
529
# Look up the byte +distance+ positions back in the dictionary.
#
# @param distance [Integer] how far back to look
# @return [Object, nil] result of @dictionary.get_byte, or nil when
#   +distance+ is not positive or exceeds the dictionary buffer size
def get_dictionary_byte(distance)
  return nil unless distance.positive?
  return nil if distance > @dictionary.buffer.bytesize

  @dictionary.get_byte(distance)
end
536
+
537
# Queue a literal through the plain 8-bit tree, most significant bit first.
# Ported from XZ Utils rc_bittree() for normal literals.
#
# @param literal_offset [Integer] Base offset into flat literal array
# @param symbol [Integer] The literal byte to encode (0-255)
# @param encoder [#queue_bit] The range encoder
def encode_normal_literal(literal_offset, symbol, encoder)
  node = 1
  8.downto(1) do |width|
    bit = symbol[width - 1]
    encoder.queue_bit(@models.literal[literal_offset + node], bit)
    # Descend: the bits seen so far form the index of the next node.
    node = (node << 1) | bit
  end
end
550
+
551
# Queue a literal that is coded relative to a match byte.
# Ported from XZ Utils literal_matched() in lzma_encoder.c.
#
# @param literal_offset [Integer] Base offset into flat literal array
# @param match_byte [Integer] The match byte to compare against
# @param symbol [Integer] The literal byte to encode (0-255)
# @param encoder [#queue_bit] The range encoder
def encode_matched_literal(literal_offset, match_byte, symbol, encoder)
  mask = 0x100
  code = symbol + 0x100 # marker bit above the eight data bits
  reference = match_byte

  # One bit per round; the marker reaches bit 16 after eight rounds.
  until code >= 0x10000
    reference <<= 1
    slot = mask + (reference & mask) + (code >> 8)

    encoder.queue_bit(@models.literal[literal_offset + slot], (code >> 7) & 1)

    code <<= 1
    # mask drops to zero once the literal and the match byte diverge.
    mask &= ~(reference ^ code)
  end
end
575
+
576
# Queue a match length using the three-tier low / mid / high scheme of
# @models.match_len_encoder.
#
# @param length [Integer] match length (MATCH_LEN_MIN or more)
# @param pos_state [Integer] position state selecting the low/mid tree
# @param encoder [#queue_bit] range encoder receiving the bits
def encode_match_length(length, pos_state, encoder)
  coder = @models.match_len_encoder
  len = length - MATCH_LEN_MIN

  # Low tier: choice = 0.
  if len < LEN_LOW_SYMBOLS
    encoder.queue_bit(coder.choice, 0)
    return encode_bittree(coder.low[pos_state], NUM_LEN_LOW_BITS, len, encoder)
  end

  encoder.queue_bit(coder.choice, 1)
  len -= LEN_LOW_SYMBOLS

  if len < LEN_MID_SYMBOLS
    # Mid tier: choice = 1, choice2 = 0.
    encoder.queue_bit(coder.choice2, 0)
    encode_bittree(coder.mid[pos_state], NUM_LEN_MID_BITS, len, encoder)
  else
    # High tier: choice = 1, choice2 = 1; one tree shared by all pos states.
    encoder.queue_bit(coder.choice2, 1)
    encode_bittree(coder.high, NUM_LEN_HIGH_BITS, len - LEN_MID_SYMBOLS, encoder)
  end
end
612
+
613
# Queue a match distance: first its slot, then (for larger slots) the
# footer bits that pin down the exact value.
#
# @param distance [Integer] match distance
# @param length [Integer] match length, used to pick the slot tree
# @param encoder [#queue_bit, #queue_direct_bits] range encoder
def encode_distance(distance, length, encoder)
  dist_slot = get_dist_slot(distance)

  # The slot tree is chosen by the length-derived state.
  slot_probs = @models.dist_slot[get_len_to_pos_state(length)]
  encode_bittree(slot_probs, NUM_DIST_SLOT_BITS, dist_slot, encoder)

  # Small slots identify the distance completely.
  return if dist_slot < START_POS_MODEL_INDEX

  footer_bits = (dist_slot >> 1) - 1
  base = (2 | (dist_slot & 1)) << footer_bits
  remainder = distance - base

  if dist_slot < END_POS_MODEL_INDEX
    # Modelled footer: reverse bit tree inside dist_special, starting at
    # base - dist_slot - 1.
    encode_bittree_reverse(
      @models.dist_special, remainder, footer_bits, base - dist_slot - 1, encoder
    )
  else
    # Upper bits go out as direct bits, the low DIST_ALIGN_BITS through the
    # alignment tree.
    encoder.queue_direct_bits(
      remainder >> DIST_ALIGN_BITS, footer_bits - DIST_ALIGN_BITS
    )
    encode_bittree_reverse(
      @models.dist_align,
      remainder & ((1 << DIST_ALIGN_BITS) - 1),
      DIST_ALIGN_BITS,
      0,
      encoder,
    )
  end
end
661
+
662
# Queue +value+ through a bit tree, most significant bit first.
#
# @param probs [#[]] probability table indexed by tree node
# @param num_bits [Integer] number of bits to emit
# @param value [Integer] value whose low +num_bits+ bits are emitted
# @param encoder [#queue_bit] range encoder receiving the bits
def encode_bittree(probs, num_bits, value, encoder)
  node = 1
  num_bits.downto(1) do |width|
    bit = value[width - 1]
    encoder.queue_bit(probs[node], bit)
    node = (node << 1) | bit
  end
end
671
+
672
# Queue +value+ through a bit tree, least significant bit first.
#
# @param probs [#[]] probability table
# @param value [Integer] value whose low +num_bits+ bits are emitted
# @param num_bits [Integer] number of bits to emit
# @param offset [Integer] added to the node index on every lookup in +probs+
# @param encoder [#queue_bit] range encoder receiving the bits
def encode_bittree_reverse(probs, value, num_bits, offset, encoder)
  node = 1
  num_bits.times do |index|
    bit = value[index]
    encoder.queue_bit(probs[offset + node], bit)
    node = (node << 1) | bit
  end
end
681
+
682
# Map a distance to its distance slot.
#
# @param distance [Integer] match distance
# @return [Integer] the distance itself below 4, otherwise the result of
#   fast_pos_small (below NUM_FULL_DISTANCES) or fast_pos_large
def get_dist_slot(distance)
  return fast_pos_large(distance) unless distance < NUM_FULL_DISTANCES
  return distance if distance < 4

  fast_pos_small(distance)
end
690
+
691
# Slot computation for small distances.
#
# Equivalent to halving the distance until it is at most 3, adding 2 to the
# slot for every halving, and finally adding what is left of the distance.
#
# @param distance [Integer] match distance
# @return [Integer] distance slot (the distance itself when it is 3 or less)
def fast_pos_small(distance)
  return distance unless distance > 3

  # Number of halvings needed to leave only the two leading bits.
  halvings = distance.bit_length - 2
  (halvings * 2) + (distance >> halvings)
end
701
+
702
# Slot computation for large distances.
#
# @param distance [Integer] match distance
# @return [Integer] twice the index of the highest set bit (searched from
#   bit 31 downwards) plus the bit directly below it
def fast_pos_large(distance)
  # Highest bit position at which the shifted distance is non-zero;
  # -1 when no position in 31..0 qualifies.
  top = 31.downto(0).find { |bit| (distance >> bit) != 0 } || -1

  (top << 1) + ((distance >> (top - 1)) & 1)
end
714
+
715
# Map a match length to the state that selects the distance-slot tree.
#
# @param length [Integer] match length
# @return [Integer] length - MATCH_LEN_MIN, capped at NUM_LEN_TO_POS_STATES - 1
def get_len_to_pos_state(length)
  [length - MATCH_LEN_MIN, NUM_LEN_TO_POS_STATES - 1].min
end
720
+
721
# Encode a dictionary size as the LZMA2 property byte.
#
# Property value p (0..39) stands for a dictionary of
# (2 | (p & 1)) << (p / 2 + 11) bytes: even values are powers of two
# starting at 4 KiB, odd values are 2^n + 2^(n-1). Value 40 is the maximum.
#
# The smallest value whose size covers the request is returned, so the
# advertised dictionary is never smaller than the one asked for. The
# previous version mapped every non-power-of-two size to the odd code for
# 2^n + 2^(n-1), which is too small for sizes above that (7 MiB came out as
# the code for 6 MiB).
#
# @param dict_size [Integer] Dictionary size
# @return [Integer] Property byte (0-40)
def encode_dict_size(dict_size)
  # Requests below the format minimum are raised to it.
  wanted = [dict_size, Omnizip::Algorithms::LZMA2Const::DICT_SIZE_MIN].max

  (0...40).find { |prop| ((2 | (prop & 1)) << ((prop / 2) + 11)) >= wanted } || 40
end
746
+ end
747
+ end
748
+ end
749
+ end
750
+ end