omnizip 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (511) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +32 -0
  4. data/.rubocop_todo.yml +754 -0
  5. data/COPYING +502 -0
  6. data/Gemfile +17 -0
  7. data/LICENSE +12 -0
  8. data/README.adoc +1045 -0
  9. data/Rakefile +12 -0
  10. data/benchmark/README.md +260 -0
  11. data/benchmark/benchmark_suite.rb +125 -0
  12. data/benchmark/compression_bench.rb +181 -0
  13. data/benchmark/filter_bench.rb +180 -0
  14. data/benchmark/models/benchmark_result.rb +59 -0
  15. data/benchmark/models/comparison_result.rb +69 -0
  16. data/benchmark/profile_suite.rb +167 -0
  17. data/benchmark/reporter.rb +150 -0
  18. data/benchmark/run_benchmarks.rb +66 -0
  19. data/benchmark/test_data.rb +137 -0
  20. data/config/formats/rar3_spec.yml +91 -0
  21. data/config/formats/rar5_spec.yml +102 -0
  22. data/docs/.github/workflows/docs.yml +142 -0
  23. data/docs/.gitignore +21 -0
  24. data/docs/.lychee.toml +67 -0
  25. data/docs/Gemfile +13 -0
  26. data/docs/RAR_WRITE_SUPPORT.md +26 -0
  27. data/docs/README.md +101 -0
  28. data/docs/_config.yml +112 -0
  29. data/docs/assets/logo.svg +1 -0
  30. data/docs/assets/omnizip-logo.pdf +1540 -11
  31. data/docs/comparison/feature-matrix.adoc +694 -0
  32. data/docs/comparison/index.adoc +113 -0
  33. data/docs/comparison/vs-7zip.adoc +309 -0
  34. data/docs/comparison/vs-peazip.adoc +77 -0
  35. data/docs/comparison/vs-rubyzip.adoc +342 -0
  36. data/docs/comparison/vs-winrar.adoc +100 -0
  37. data/docs/compatibility.adoc +579 -0
  38. data/docs/concepts/index.adoc +129 -0
  39. data/docs/developer/architecture.adoc +256 -0
  40. data/docs/developer/contributing.adoc +158 -0
  41. data/docs/developer/index.adoc +25 -0
  42. data/docs/developer/testing.adoc +212 -0
  43. data/docs/getting-started/basic-usage.adoc +271 -0
  44. data/docs/getting-started/index.adoc +42 -0
  45. data/docs/getting-started/installation.adoc +138 -0
  46. data/docs/getting-started/quick-start.adoc +185 -0
  47. data/docs/getting-started/your-first-archive.adoc +218 -0
  48. data/docs/guides/advanced-features/encryption.adoc +300 -0
  49. data/docs/guides/advanced-features/index.adoc +49 -0
  50. data/docs/guides/advanced-features/parallel-processing.adoc +246 -0
  51. data/docs/guides/advanced-features/progress-tracking.adoc +320 -0
  52. data/docs/guides/advanced-features/streaming.adoc +212 -0
  53. data/docs/guides/archive-formats/gzip-format.adoc +107 -0
  54. data/docs/guides/archive-formats/index.adoc +130 -0
  55. data/docs/guides/archive-formats/rar-format.adoc +104 -0
  56. data/docs/guides/archive-formats/rar5.adoc +521 -0
  57. data/docs/guides/archive-formats/seven-zip-format.adoc +35 -0
  58. data/docs/guides/archive-formats/tar-format.adoc +106 -0
  59. data/docs/guides/archive-formats/xz-format.adoc +118 -0
  60. data/docs/guides/archive-formats/zip-format.adoc +35 -0
  61. data/docs/guides/compression-algorithms/bzip2.adoc +113 -0
  62. data/docs/guides/compression-algorithms/deflate.adoc +319 -0
  63. data/docs/guides/compression-algorithms/index.adoc +190 -0
  64. data/docs/guides/compression-algorithms/lzma.adoc +398 -0
  65. data/docs/guides/compression-algorithms/lzma2.adoc +327 -0
  66. data/docs/guides/compression-algorithms/ppmd.adoc +316 -0
  67. data/docs/guides/compression-algorithms/zstandard.adoc +361 -0
  68. data/docs/guides/creating-archives.adoc +354 -0
  69. data/docs/guides/extracting-archives.adoc +53 -0
  70. data/docs/guides/format-conversion.adoc +64 -0
  71. data/docs/guides/index.adoc +49 -0
  72. data/docs/guides/migration-rubyzip.adoc +217 -0
  73. data/docs/guides/parity-archives.adoc +605 -0
  74. data/docs/guides/performance-tuning.adoc +88 -0
  75. data/docs/index.adoc +218 -0
  76. data/docs/lychee.toml +67 -0
  77. data/docs/reference/api/overview.adoc +188 -0
  78. data/docs/reference/cli/compress-command.adoc +114 -0
  79. data/docs/reference/cli/overview.adoc +140 -0
  80. data/docs/reference/index.adoc +26 -0
  81. data/docs/resources/faq.adoc +185 -0
  82. data/docs/resources/quick-reference.adoc +222 -0
  83. data/docs/troubleshooting/index.adoc +208 -0
  84. data/examples/api_comparison.rb +205 -0
  85. data/examples/deflate64_example.rb +96 -0
  86. data/examples/par2_demo.rb +121 -0
  87. data/examples/quick_start_native.rb +150 -0
  88. data/examples/quick_start_rubyzip.rb +115 -0
  89. data/examples/rubyzip_compatibility_demo.rb +194 -0
  90. data/exe/omnizip +27 -0
  91. data/lib/omnizip/algorithm.rb +130 -0
  92. data/lib/omnizip/algorithm_registry.rb +86 -0
  93. data/lib/omnizip/algorithms/.keep +0 -0
  94. data/lib/omnizip/algorithms/bzip2/bwt.rb +225 -0
  95. data/lib/omnizip/algorithms/bzip2/decoder.rb +193 -0
  96. data/lib/omnizip/algorithms/bzip2/encoder.rb +237 -0
  97. data/lib/omnizip/algorithms/bzip2/huffman.rb +206 -0
  98. data/lib/omnizip/algorithms/bzip2/mtf.rb +101 -0
  99. data/lib/omnizip/algorithms/bzip2/rle.rb +151 -0
  100. data/lib/omnizip/algorithms/bzip2.rb +130 -0
  101. data/lib/omnizip/algorithms/deflate/constants.rb +28 -0
  102. data/lib/omnizip/algorithms/deflate/decoder.rb +38 -0
  103. data/lib/omnizip/algorithms/deflate/encoder.rb +46 -0
  104. data/lib/omnizip/algorithms/deflate.rb +128 -0
  105. data/lib/omnizip/algorithms/deflate64/constants.rb +45 -0
  106. data/lib/omnizip/algorithms/deflate64/decoder.rb +153 -0
  107. data/lib/omnizip/algorithms/deflate64/encoder.rb +98 -0
  108. data/lib/omnizip/algorithms/deflate64/huffman_coder.rb +354 -0
  109. data/lib/omnizip/algorithms/deflate64/lz77_encoder.rb +142 -0
  110. data/lib/omnizip/algorithms/deflate64.rb +109 -0
  111. data/lib/omnizip/algorithms/lzma/bit_model.rb +120 -0
  112. data/lib/omnizip/algorithms/lzma/constants.rb +112 -0
  113. data/lib/omnizip/algorithms/lzma/decoder.rb +148 -0
  114. data/lib/omnizip/algorithms/lzma/dictionary.rb +69 -0
  115. data/lib/omnizip/algorithms/lzma/distance_coder.rb +415 -0
  116. data/lib/omnizip/algorithms/lzma/encoder.rb +142 -0
  117. data/lib/omnizip/algorithms/lzma/length_coder.rb +260 -0
  118. data/lib/omnizip/algorithms/lzma/literal_decoder.rb +320 -0
  119. data/lib/omnizip/algorithms/lzma/literal_encoder.rb +210 -0
  120. data/lib/omnizip/algorithms/lzma/lzip_decoder.rb +341 -0
  121. data/lib/omnizip/algorithms/lzma/lzma_alone_decoder.rb +192 -0
  122. data/lib/omnizip/algorithms/lzma/lzma_state.rb +128 -0
  123. data/lib/omnizip/algorithms/lzma/match.rb +32 -0
  124. data/lib/omnizip/algorithms/lzma/match_finder.rb +205 -0
  125. data/lib/omnizip/algorithms/lzma/match_finder_config.rb +142 -0
  126. data/lib/omnizip/algorithms/lzma/match_finder_factory.rb +88 -0
  127. data/lib/omnizip/algorithms/lzma/optimal_encoder.rb +130 -0
  128. data/lib/omnizip/algorithms/lzma/probability_models.rb +72 -0
  129. data/lib/omnizip/algorithms/lzma/range_coder.rb +85 -0
  130. data/lib/omnizip/algorithms/lzma/range_decoder.rb +434 -0
  131. data/lib/omnizip/algorithms/lzma/range_encoder.rb +194 -0
  132. data/lib/omnizip/algorithms/lzma/state.rb +127 -0
  133. data/lib/omnizip/algorithms/lzma/xz_buffered_range_encoder.rb +325 -0
  134. data/lib/omnizip/algorithms/lzma/xz_encoder.rb +426 -0
  135. data/lib/omnizip/algorithms/lzma/xz_encoder_fast.rb +645 -0
  136. data/lib/omnizip/algorithms/lzma/xz_match_finder_adapter.rb +227 -0
  137. data/lib/omnizip/algorithms/lzma/xz_price_calculator.rb +169 -0
  138. data/lib/omnizip/algorithms/lzma/xz_probability_models.rb +261 -0
  139. data/lib/omnizip/algorithms/lzma/xz_range_encoder.rb +223 -0
  140. data/lib/omnizip/algorithms/lzma/xz_range_encoder_exact.rb +331 -0
  141. data/lib/omnizip/algorithms/lzma/xz_state.rb +116 -0
  142. data/lib/omnizip/algorithms/lzma/xz_utils_decoder.rb +2055 -0
  143. data/lib/omnizip/algorithms/lzma.rb +238 -0
  144. data/lib/omnizip/algorithms/lzma2/chunk_manager.rb +182 -0
  145. data/lib/omnizip/algorithms/lzma2/constants.rb +41 -0
  146. data/lib/omnizip/algorithms/lzma2/encoder.rb +147 -0
  147. data/lib/omnizip/algorithms/lzma2/lzma2_chunk.rb +161 -0
  148. data/lib/omnizip/algorithms/lzma2/properties.rb +179 -0
  149. data/lib/omnizip/algorithms/lzma2/simple_lzma2_encoder.rb +127 -0
  150. data/lib/omnizip/algorithms/lzma2/xz_encoder_adapter.rb +85 -0
  151. data/lib/omnizip/algorithms/lzma2.rb +141 -0
  152. data/lib/omnizip/algorithms/ppmd7/constants.rb +74 -0
  153. data/lib/omnizip/algorithms/ppmd7/context.rb +154 -0
  154. data/lib/omnizip/algorithms/ppmd7/decoder.rb +126 -0
  155. data/lib/omnizip/algorithms/ppmd7/encoder.rb +163 -0
  156. data/lib/omnizip/algorithms/ppmd7/model.rb +248 -0
  157. data/lib/omnizip/algorithms/ppmd7/symbol_state.rb +57 -0
  158. data/lib/omnizip/algorithms/ppmd7.rb +116 -0
  159. data/lib/omnizip/algorithms/ppmd8/constants.rb +61 -0
  160. data/lib/omnizip/algorithms/ppmd8/context.rb +34 -0
  161. data/lib/omnizip/algorithms/ppmd8/decoder.rb +107 -0
  162. data/lib/omnizip/algorithms/ppmd8/encoder.rb +138 -0
  163. data/lib/omnizip/algorithms/ppmd8/model.rb +250 -0
  164. data/lib/omnizip/algorithms/ppmd8/restoration_method.rb +78 -0
  165. data/lib/omnizip/algorithms/ppmd8.rb +82 -0
  166. data/lib/omnizip/algorithms/ppmd_base.rb +138 -0
  167. data/lib/omnizip/algorithms/sevenzip_lzma2.rb +123 -0
  168. data/lib/omnizip/algorithms/xz_lzma2.rb +118 -0
  169. data/lib/omnizip/algorithms/zstandard/constants.rb +25 -0
  170. data/lib/omnizip/algorithms/zstandard/decoder.rb +46 -0
  171. data/lib/omnizip/algorithms/zstandard/encoder.rb +51 -0
  172. data/lib/omnizip/algorithms/zstandard.rb +138 -0
  173. data/lib/omnizip/buffer/memory_archive.rb +251 -0
  174. data/lib/omnizip/buffer/memory_extractor.rb +224 -0
  175. data/lib/omnizip/buffer.rb +176 -0
  176. data/lib/omnizip/checksum_registry.rb +114 -0
  177. data/lib/omnizip/checksums/crc32.rb +100 -0
  178. data/lib/omnizip/checksums/crc64.rb +101 -0
  179. data/lib/omnizip/checksums/crc_base.rb +158 -0
  180. data/lib/omnizip/checksums/verifier.rb +131 -0
  181. data/lib/omnizip/chunked/memory_manager.rb +194 -0
  182. data/lib/omnizip/chunked/reader.rb +78 -0
  183. data/lib/omnizip/chunked/writer.rb +120 -0
  184. data/lib/omnizip/chunked.rb +129 -0
  185. data/lib/omnizip/cli/output_formatter.rb +104 -0
  186. data/lib/omnizip/cli.rb +572 -0
  187. data/lib/omnizip/commands/.keep +0 -0
  188. data/lib/omnizip/commands/archive_create_command.rb +427 -0
  189. data/lib/omnizip/commands/archive_extract_command.rb +272 -0
  190. data/lib/omnizip/commands/archive_list_command.rb +218 -0
  191. data/lib/omnizip/commands/archive_repair_command.rb +131 -0
  192. data/lib/omnizip/commands/archive_verify_command.rb +117 -0
  193. data/lib/omnizip/commands/compress_command.rb +117 -0
  194. data/lib/omnizip/commands/decompress_command.rb +120 -0
  195. data/lib/omnizip/commands/list_command.rb +53 -0
  196. data/lib/omnizip/commands/metadata_command.rb +153 -0
  197. data/lib/omnizip/commands/parity_create_command.rb +122 -0
  198. data/lib/omnizip/commands/parity_repair_command.rb +122 -0
  199. data/lib/omnizip/commands/parity_verify_command.rb +124 -0
  200. data/lib/omnizip/commands/profile_list_command.rb +56 -0
  201. data/lib/omnizip/commands/profile_show_command.rb +44 -0
  202. data/lib/omnizip/convenience.rb +359 -0
  203. data/lib/omnizip/converter/conversion_registry.rb +49 -0
  204. data/lib/omnizip/converter/conversion_strategy.rb +121 -0
  205. data/lib/omnizip/converter/seven_zip_to_zip_strategy.rb +97 -0
  206. data/lib/omnizip/converter/zip_to_seven_zip_strategy.rb +112 -0
  207. data/lib/omnizip/converter.rb +105 -0
  208. data/lib/omnizip/crypto/aes256/cipher.rb +100 -0
  209. data/lib/omnizip/crypto/aes256/constants.rb +28 -0
  210. data/lib/omnizip/crypto/aes256/key_derivation.rb +101 -0
  211. data/lib/omnizip/crypto/aes256.rb +102 -0
  212. data/lib/omnizip/error.rb +106 -0
  213. data/lib/omnizip/eta/exponential_smoothing_estimator.rb +98 -0
  214. data/lib/omnizip/eta/moving_average_estimator.rb +99 -0
  215. data/lib/omnizip/eta/rate_calculator.rb +104 -0
  216. data/lib/omnizip/eta/sample_history.rb +143 -0
  217. data/lib/omnizip/eta/time_estimator.rb +106 -0
  218. data/lib/omnizip/eta.rb +63 -0
  219. data/lib/omnizip/extraction/filter_chain.rb +177 -0
  220. data/lib/omnizip/extraction/glob_pattern.rb +140 -0
  221. data/lib/omnizip/extraction/pattern_matcher.rb +70 -0
  222. data/lib/omnizip/extraction/predicate_pattern.rb +52 -0
  223. data/lib/omnizip/extraction/regex_pattern.rb +50 -0
  224. data/lib/omnizip/extraction/selective_extractor.rb +240 -0
  225. data/lib/omnizip/extraction.rb +111 -0
  226. data/lib/omnizip/file_type/mime_classifier.rb +144 -0
  227. data/lib/omnizip/file_type.rb +113 -0
  228. data/lib/omnizip/filter.rb +139 -0
  229. data/lib/omnizip/filter_pipeline.rb +108 -0
  230. data/lib/omnizip/filter_registry.rb +166 -0
  231. data/lib/omnizip/filters/bcj.rb +279 -0
  232. data/lib/omnizip/filters/bcj2/constants.rb +53 -0
  233. data/lib/omnizip/filters/bcj2/decoder.rb +200 -0
  234. data/lib/omnizip/filters/bcj2/encoder.rb +61 -0
  235. data/lib/omnizip/filters/bcj2/stream_data.rb +93 -0
  236. data/lib/omnizip/filters/bcj2.rb +99 -0
  237. data/lib/omnizip/filters/bcj_arm.rb +176 -0
  238. data/lib/omnizip/filters/bcj_arm64.rb +244 -0
  239. data/lib/omnizip/filters/bcj_ia64.rb +196 -0
  240. data/lib/omnizip/filters/bcj_ppc.rb +190 -0
  241. data/lib/omnizip/filters/bcj_sparc.rb +176 -0
  242. data/lib/omnizip/filters/bcj_x86.rb +193 -0
  243. data/lib/omnizip/filters/delta.rb +196 -0
  244. data/lib/omnizip/filters/filter_base.rb +72 -0
  245. data/lib/omnizip/filters/registry.rb +123 -0
  246. data/lib/omnizip/filters/xz_delta.rb +258 -0
  247. data/lib/omnizip/format_detector.rb +162 -0
  248. data/lib/omnizip/format_registry.rb +59 -0
  249. data/lib/omnizip/formats/.keep +0 -0
  250. data/lib/omnizip/formats/bzip2_file.rb +172 -0
  251. data/lib/omnizip/formats/cpio/constants.rb +55 -0
  252. data/lib/omnizip/formats/cpio/entry.rb +385 -0
  253. data/lib/omnizip/formats/cpio/reader.rb +196 -0
  254. data/lib/omnizip/formats/cpio/writer.rb +234 -0
  255. data/lib/omnizip/formats/cpio.rb +140 -0
  256. data/lib/omnizip/formats/format_spec_loader.rb +230 -0
  257. data/lib/omnizip/formats/gzip.rb +238 -0
  258. data/lib/omnizip/formats/iso/directory_builder.rb +297 -0
  259. data/lib/omnizip/formats/iso/directory_record.rb +152 -0
  260. data/lib/omnizip/formats/iso/joliet.rb +204 -0
  261. data/lib/omnizip/formats/iso/path_table.rb +125 -0
  262. data/lib/omnizip/formats/iso/reader.rb +197 -0
  263. data/lib/omnizip/formats/iso/rock_ridge.rb +349 -0
  264. data/lib/omnizip/formats/iso/volume_builder.rb +320 -0
  265. data/lib/omnizip/formats/iso/volume_descriptor.rb +168 -0
  266. data/lib/omnizip/formats/iso/writer.rb +530 -0
  267. data/lib/omnizip/formats/iso.rb +140 -0
  268. data/lib/omnizip/formats/lzip.rb +175 -0
  269. data/lib/omnizip/formats/lzma_alone.rb +171 -0
  270. data/lib/omnizip/formats/rar/archive_repairer.rb +243 -0
  271. data/lib/omnizip/formats/rar/archive_verifier.rb +195 -0
  272. data/lib/omnizip/formats/rar/block_parser.rb +243 -0
  273. data/lib/omnizip/formats/rar/compression/bit_stream.rb +180 -0
  274. data/lib/omnizip/formats/rar/compression/dispatcher.rb +217 -0
  275. data/lib/omnizip/formats/rar/compression/lz77_huffman/decoder.rb +216 -0
  276. data/lib/omnizip/formats/rar/compression/lz77_huffman/encoder.rb +158 -0
  277. data/lib/omnizip/formats/rar/compression/lz77_huffman/huffman_builder.rb +217 -0
  278. data/lib/omnizip/formats/rar/compression/lz77_huffman/huffman_coder.rb +189 -0
  279. data/lib/omnizip/formats/rar/compression/lz77_huffman/match_finder.rb +135 -0
  280. data/lib/omnizip/formats/rar/compression/lz77_huffman/sliding_window.rb +165 -0
  281. data/lib/omnizip/formats/rar/compression/ppmd/context.rb +105 -0
  282. data/lib/omnizip/formats/rar/compression/ppmd/decoder.rb +219 -0
  283. data/lib/omnizip/formats/rar/compression/ppmd/encoder.rb +262 -0
  284. data/lib/omnizip/formats/rar/compression_method_registry.rb +106 -0
  285. data/lib/omnizip/formats/rar/constants.rb +82 -0
  286. data/lib/omnizip/formats/rar/decompressor.rb +238 -0
  287. data/lib/omnizip/formats/rar/external_writer.rb +312 -0
  288. data/lib/omnizip/formats/rar/header.rb +192 -0
  289. data/lib/omnizip/formats/rar/license_validator.rb +109 -0
  290. data/lib/omnizip/formats/rar/models/rar_archive.rb +77 -0
  291. data/lib/omnizip/formats/rar/models/rar_entry.rb +65 -0
  292. data/lib/omnizip/formats/rar/models/rar_volume.rb +56 -0
  293. data/lib/omnizip/formats/rar/parity_handler.rb +292 -0
  294. data/lib/omnizip/formats/rar/rar5/compression/lzma.rb +202 -0
  295. data/lib/omnizip/formats/rar/rar5/compression/lzss.rb +578 -0
  296. data/lib/omnizip/formats/rar/rar5/compression/store.rb +60 -0
  297. data/lib/omnizip/formats/rar/rar5/crc32.rb +39 -0
  298. data/lib/omnizip/formats/rar/rar5/encryption/aes256_cbc.rb +97 -0
  299. data/lib/omnizip/formats/rar/rar5/encryption/encryption_header.rb +114 -0
  300. data/lib/omnizip/formats/rar/rar5/encryption/encryption_manager.rb +166 -0
  301. data/lib/omnizip/formats/rar/rar5/encryption/key_derivation.rb +97 -0
  302. data/lib/omnizip/formats/rar/rar5/header.rb +187 -0
  303. data/lib/omnizip/formats/rar/rar5/models/encryption_options.rb +74 -0
  304. data/lib/omnizip/formats/rar/rar5/models/recovery_options.rb +63 -0
  305. data/lib/omnizip/formats/rar/rar5/models/solid_options.rb +63 -0
  306. data/lib/omnizip/formats/rar/rar5/models/volume_options.rb +74 -0
  307. data/lib/omnizip/formats/rar/rar5/multi_volume/ARCHITECTURE.md +290 -0
  308. data/lib/omnizip/formats/rar/rar5/multi_volume/volume_manager.rb +264 -0
  309. data/lib/omnizip/formats/rar/rar5/multi_volume/volume_splitter.rb +155 -0
  310. data/lib/omnizip/formats/rar/rar5/multi_volume/volume_writer.rb +194 -0
  311. data/lib/omnizip/formats/rar/rar5/solid/solid_encoder.rb +109 -0
  312. data/lib/omnizip/formats/rar/rar5/solid/solid_manager.rb +142 -0
  313. data/lib/omnizip/formats/rar/rar5/solid/solid_stream.rb +121 -0
  314. data/lib/omnizip/formats/rar/rar5/vint.rb +65 -0
  315. data/lib/omnizip/formats/rar/rar5/writer.rb +466 -0
  316. data/lib/omnizip/formats/rar/rar_format_base.rb +241 -0
  317. data/lib/omnizip/formats/rar/reader.rb +366 -0
  318. data/lib/omnizip/formats/rar/recovery_record.rb +245 -0
  319. data/lib/omnizip/formats/rar/volume_manager.rb +168 -0
  320. data/lib/omnizip/formats/rar/writer.rb +431 -0
  321. data/lib/omnizip/formats/rar.rb +205 -0
  322. data/lib/omnizip/formats/rar3/compressor.rb +73 -0
  323. data/lib/omnizip/formats/rar3/decompressor.rb +66 -0
  324. data/lib/omnizip/formats/rar3/reader.rb +386 -0
  325. data/lib/omnizip/formats/rar3/writer.rb +219 -0
  326. data/lib/omnizip/formats/rar5/compressor.rb +73 -0
  327. data/lib/omnizip/formats/rar5/decompressor.rb +66 -0
  328. data/lib/omnizip/formats/rar5/reader.rb +342 -0
  329. data/lib/omnizip/formats/rar5/writer.rb +214 -0
  330. data/lib/omnizip/formats/seven_zip/coder_chain.rb +150 -0
  331. data/lib/omnizip/formats/seven_zip/constants.rb +126 -0
  332. data/lib/omnizip/formats/seven_zip/encoded_header.rb +114 -0
  333. data/lib/omnizip/formats/seven_zip/encrypted_header.rb +142 -0
  334. data/lib/omnizip/formats/seven_zip/file_collector.rb +144 -0
  335. data/lib/omnizip/formats/seven_zip/header.rb +106 -0
  336. data/lib/omnizip/formats/seven_zip/header_encryptor.rb +134 -0
  337. data/lib/omnizip/formats/seven_zip/header_writer.rb +466 -0
  338. data/lib/omnizip/formats/seven_zip/models/coder_info.rb +30 -0
  339. data/lib/omnizip/formats/seven_zip/models/file_entry.rb +58 -0
  340. data/lib/omnizip/formats/seven_zip/models/folder.rb +69 -0
  341. data/lib/omnizip/formats/seven_zip/models/stream_info.rb +42 -0
  342. data/lib/omnizip/formats/seven_zip/parser.rb +660 -0
  343. data/lib/omnizip/formats/seven_zip/reader.rb +458 -0
  344. data/lib/omnizip/formats/seven_zip/split_archive_reader.rb +632 -0
  345. data/lib/omnizip/formats/seven_zip/split_archive_writer.rb +315 -0
  346. data/lib/omnizip/formats/seven_zip/stream_compressor.rb +151 -0
  347. data/lib/omnizip/formats/seven_zip/stream_decompressor.rb +162 -0
  348. data/lib/omnizip/formats/seven_zip/writer.rb +740 -0
  349. data/lib/omnizip/formats/seven_zip.rb +93 -0
  350. data/lib/omnizip/formats/tar/constants.rb +73 -0
  351. data/lib/omnizip/formats/tar/entry.rb +94 -0
  352. data/lib/omnizip/formats/tar/header.rb +168 -0
  353. data/lib/omnizip/formats/tar/reader.rb +121 -0
  354. data/lib/omnizip/formats/tar/writer.rb +216 -0
  355. data/lib/omnizip/formats/tar.rb +84 -0
  356. data/lib/omnizip/formats/xz/reader.rb +116 -0
  357. data/lib/omnizip/formats/xz.rb +237 -0
  358. data/lib/omnizip/formats/xz_impl/block_decoder.rb +754 -0
  359. data/lib/omnizip/formats/xz_impl/block_encoder.rb +306 -0
  360. data/lib/omnizip/formats/xz_impl/block_header.rb +210 -0
  361. data/lib/omnizip/formats/xz_impl/block_header_parser.rb +186 -0
  362. data/lib/omnizip/formats/xz_impl/constants.rb +49 -0
  363. data/lib/omnizip/formats/xz_impl/index_decoder.rb +174 -0
  364. data/lib/omnizip/formats/xz_impl/index_encoder.rb +122 -0
  365. data/lib/omnizip/formats/xz_impl/stream_decoder.rb +468 -0
  366. data/lib/omnizip/formats/xz_impl/stream_encoder.rb +99 -0
  367. data/lib/omnizip/formats/xz_impl/stream_footer.rb +81 -0
  368. data/lib/omnizip/formats/xz_impl/stream_footer_parser.rb +117 -0
  369. data/lib/omnizip/formats/xz_impl/stream_header.rb +55 -0
  370. data/lib/omnizip/formats/xz_impl/stream_header_parser.rb +108 -0
  371. data/lib/omnizip/formats/xz_impl/vli.rb +128 -0
  372. data/lib/omnizip/formats/xz_impl/writer.rb +421 -0
  373. data/lib/omnizip/formats/zip/central_directory_header.rb +195 -0
  374. data/lib/omnizip/formats/zip/constants.rb +69 -0
  375. data/lib/omnizip/formats/zip/end_of_central_directory.rb +133 -0
  376. data/lib/omnizip/formats/zip/local_file_header.rb +138 -0
  377. data/lib/omnizip/formats/zip/reader.rb +250 -0
  378. data/lib/omnizip/formats/zip/unix_extra_field.rb +153 -0
  379. data/lib/omnizip/formats/zip/writer.rb +375 -0
  380. data/lib/omnizip/formats/zip/zip64_end_of_central_directory.rb +104 -0
  381. data/lib/omnizip/formats/zip/zip64_end_of_central_directory_locator.rb +66 -0
  382. data/lib/omnizip/formats/zip/zip64_extra_field.rb +114 -0
  383. data/lib/omnizip/formats/zip.rb +50 -0
  384. data/lib/omnizip/implementations/base/lzma2_decoder_base.rb +75 -0
  385. data/lib/omnizip/implementations/base/lzma2_encoder_base.rb +128 -0
  386. data/lib/omnizip/implementations/base/lzma_decoder_base.rb +83 -0
  387. data/lib/omnizip/implementations/base/lzma_encoder_base.rb +108 -0
  388. data/lib/omnizip/implementations/base/state_machine_base.rb +182 -0
  389. data/lib/omnizip/implementations/seven_zip/lzma/decoder.rb +421 -0
  390. data/lib/omnizip/implementations/seven_zip/lzma/encoder.rb +465 -0
  391. data/lib/omnizip/implementations/seven_zip/lzma/match_finder.rb +288 -0
  392. data/lib/omnizip/implementations/seven_zip/lzma/range_decoder.rb +200 -0
  393. data/lib/omnizip/implementations/seven_zip/lzma/range_encoder.rb +197 -0
  394. data/lib/omnizip/implementations/seven_zip/lzma/state_machine.rb +141 -0
  395. data/lib/omnizip/implementations/seven_zip/lzma2/encoder.rb +519 -0
  396. data/lib/omnizip/implementations/xz_utils/lzma2/decoder.rb +723 -0
  397. data/lib/omnizip/implementations/xz_utils/lzma2/encoder.rb +750 -0
  398. data/lib/omnizip/io/buffered_input.rb +146 -0
  399. data/lib/omnizip/io/buffered_output.rb +105 -0
  400. data/lib/omnizip/io/stream_manager.rb +115 -0
  401. data/lib/omnizip/link_handler/hard_link.rb +79 -0
  402. data/lib/omnizip/link_handler/symbolic_link.rb +74 -0
  403. data/lib/omnizip/link_handler.rb +124 -0
  404. data/lib/omnizip/metadata/archive_metadata.rb +114 -0
  405. data/lib/omnizip/metadata/entry_metadata.rb +146 -0
  406. data/lib/omnizip/metadata/metadata_editor.rb +171 -0
  407. data/lib/omnizip/metadata/metadata_registry.rb +64 -0
  408. data/lib/omnizip/metadata/metadata_validator.rb +99 -0
  409. data/lib/omnizip/metadata.rb +57 -0
  410. data/lib/omnizip/models/.keep +0 -0
  411. data/lib/omnizip/models/algorithm_metadata.rb +73 -0
  412. data/lib/omnizip/models/compression_options.rb +71 -0
  413. data/lib/omnizip/models/conversion_options.rb +87 -0
  414. data/lib/omnizip/models/conversion_result.rb +135 -0
  415. data/lib/omnizip/models/eta_result.rb +46 -0
  416. data/lib/omnizip/models/extraction_rule.rb +115 -0
  417. data/lib/omnizip/models/filter_chain.rb +144 -0
  418. data/lib/omnizip/models/filter_config.rb +183 -0
  419. data/lib/omnizip/models/match_result.rb +124 -0
  420. data/lib/omnizip/models/optimization_suggestion.rb +91 -0
  421. data/lib/omnizip/models/parallel_options.rb +104 -0
  422. data/lib/omnizip/models/performance_result.rb +79 -0
  423. data/lib/omnizip/models/profile_report.rb +82 -0
  424. data/lib/omnizip/models/progress_options.rb +38 -0
  425. data/lib/omnizip/models/split_options.rb +116 -0
  426. data/lib/omnizip/optimization_registry.rb +81 -0
  427. data/lib/omnizip/parallel/job_queue.rb +209 -0
  428. data/lib/omnizip/parallel/job_scheduler.rb +203 -0
  429. data/lib/omnizip/parallel/parallel_compressor.rb +347 -0
  430. data/lib/omnizip/parallel/parallel_extractor.rb +329 -0
  431. data/lib/omnizip/parallel/worker_pool.rb +223 -0
  432. data/lib/omnizip/parallel.rb +149 -0
  433. data/lib/omnizip/parity/chunked_block_processor.rb +196 -0
  434. data/lib/omnizip/parity/galois16.rb +145 -0
  435. data/lib/omnizip/parity/models/creator_packet.rb +73 -0
  436. data/lib/omnizip/parity/models/file_description_packet.rb +133 -0
  437. data/lib/omnizip/parity/models/ifsc_packet.rb +123 -0
  438. data/lib/omnizip/parity/models/main_packet.rb +128 -0
  439. data/lib/omnizip/parity/models/packet.rb +156 -0
  440. data/lib/omnizip/parity/models/packet_registry.rb +109 -0
  441. data/lib/omnizip/parity/models/recovery_slice_packet.rb +78 -0
  442. data/lib/omnizip/parity/par2_creator.rb +531 -0
  443. data/lib/omnizip/parity/par2_repairer.rb +407 -0
  444. data/lib/omnizip/parity/par2_verifier.rb +364 -0
  445. data/lib/omnizip/parity/par2cmdline_algorithm.rb +110 -0
  446. data/lib/omnizip/parity/par2cmdline_coefficients.rb +78 -0
  447. data/lib/omnizip/parity/reed_solomon_decoder.rb +266 -0
  448. data/lib/omnizip/parity/reed_solomon_encoder.rb +111 -0
  449. data/lib/omnizip/parity/reed_solomon_matrix.rb +342 -0
  450. data/lib/omnizip/parity.rb +186 -0
  451. data/lib/omnizip/password/encryption_registry.rb +65 -0
  452. data/lib/omnizip/password/encryption_strategy.rb +96 -0
  453. data/lib/omnizip/password/password_validator.rb +129 -0
  454. data/lib/omnizip/password/winzip_aes_strategy.rb +192 -0
  455. data/lib/omnizip/password/zip_crypto_strategy.rb +141 -0
  456. data/lib/omnizip/password.rb +87 -0
  457. data/lib/omnizip/pipe/stream_compressor.rb +124 -0
  458. data/lib/omnizip/pipe/stream_decompressor.rb +174 -0
  459. data/lib/omnizip/pipe.rb +121 -0
  460. data/lib/omnizip/platform/ntfs_streams.rb +201 -0
  461. data/lib/omnizip/platform.rb +189 -0
  462. data/lib/omnizip/profile/archive_profile.rb +39 -0
  463. data/lib/omnizip/profile/balanced_profile.rb +33 -0
  464. data/lib/omnizip/profile/binary_profile.rb +36 -0
  465. data/lib/omnizip/profile/compression_profile.rb +158 -0
  466. data/lib/omnizip/profile/custom_profile.rb +157 -0
  467. data/lib/omnizip/profile/fast_profile.rb +33 -0
  468. data/lib/omnizip/profile/maximum_profile.rb +33 -0
  469. data/lib/omnizip/profile/profile_detector.rb +110 -0
  470. data/lib/omnizip/profile/profile_registry.rb +161 -0
  471. data/lib/omnizip/profile/text_profile.rb +36 -0
  472. data/lib/omnizip/profile.rb +190 -0
  473. data/lib/omnizip/profiler/memory_profiler.rb +66 -0
  474. data/lib/omnizip/profiler/method_profiler.rb +49 -0
  475. data/lib/omnizip/profiler/report_generator.rb +169 -0
  476. data/lib/omnizip/profiler.rb +204 -0
  477. data/lib/omnizip/progress/callback_reporter.rb +36 -0
  478. data/lib/omnizip/progress/console_reporter.rb +62 -0
  479. data/lib/omnizip/progress/log_reporter.rb +91 -0
  480. data/lib/omnizip/progress/operation_progress.rb +118 -0
  481. data/lib/omnizip/progress/progress_bar.rb +156 -0
  482. data/lib/omnizip/progress/progress_reporter.rb +40 -0
  483. data/lib/omnizip/progress/progress_tracker.rb +190 -0
  484. data/lib/omnizip/progress/silent_reporter.rb +24 -0
  485. data/lib/omnizip/progress.rb +127 -0
  486. data/lib/omnizip/rubyzip_compat.rb +63 -0
  487. data/lib/omnizip/temp/safe_extract.rb +168 -0
  488. data/lib/omnizip/temp/temp_file.rb +124 -0
  489. data/lib/omnizip/temp/temp_file_pool.rb +109 -0
  490. data/lib/omnizip/temp.rb +181 -0
  491. data/lib/omnizip/version.rb +5 -0
  492. data/lib/omnizip/zip/entry.rb +156 -0
  493. data/lib/omnizip/zip/file.rb +485 -0
  494. data/lib/omnizip/zip/input_stream.rb +273 -0
  495. data/lib/omnizip/zip/output_stream.rb +324 -0
  496. data/lib/omnizip.rb +156 -0
  497. data/readme-docs/advanced-features.adoc +515 -0
  498. data/readme-docs/api-usage.adoc +444 -0
  499. data/readme-docs/architecture.adoc +449 -0
  500. data/readme-docs/archive-formats.adoc +479 -0
  501. data/readme-docs/cli-usage.adoc +222 -0
  502. data/readme-docs/compression-algorithms.adoc +442 -0
  503. data/readme-docs/compression-profiles.adoc +247 -0
  504. data/readme-docs/encryption-checksums.adoc +328 -0
  505. data/readme-docs/format-converter.adoc +325 -0
  506. data/readme-docs/installation.adoc +228 -0
  507. data/readme-docs/par2-archives.adoc +608 -0
  508. data/readme-docs/performance-profiler.adoc +389 -0
  509. data/readme-docs/preprocessing-filters.adoc +280 -0
  510. data/xz-file-format-1.2.1.txt +1174 -0
  511. metadata +617 -0
@@ -0,0 +1,1174 @@
1
+
2
+ The .xz File Format
3
+ ===================
4
+
5
+ Version 1.2.1 (2024-04-08)
6
+
7
+
8
+ 0. Preface
9
+ 0.1. Notices and Acknowledgements
10
+ 0.2. Getting the Latest Version
11
+ 0.3. Version History
12
+ 1. Conventions
13
+ 1.1. Byte and Its Representation
14
+ 1.2. Multibyte Integers
15
+ 2. Overall Structure of .xz File
16
+ 2.1. Stream
17
+ 2.1.1. Stream Header
18
+ 2.1.1.1. Header Magic Bytes
19
+ 2.1.1.2. Stream Flags
20
+ 2.1.1.3. CRC32
21
+ 2.1.2. Stream Footer
22
+ 2.1.2.1. CRC32
23
+ 2.1.2.2. Backward Size
24
+ 2.1.2.3. Stream Flags
25
+ 2.1.2.4. Footer Magic Bytes
26
+ 2.2. Stream Padding
27
+ 3. Block
28
+ 3.1. Block Header
29
+ 3.1.1. Block Header Size
30
+ 3.1.2. Block Flags
31
+ 3.1.3. Compressed Size
32
+ 3.1.4. Uncompressed Size
33
+ 3.1.5. List of Filter Flags
34
+ 3.1.6. Header Padding
35
+ 3.1.7. CRC32
36
+ 3.2. Compressed Data
37
+ 3.3. Block Padding
38
+ 3.4. Check
39
+ 4. Index
40
+ 4.1. Index Indicator
41
+ 4.2. Number of Records
42
+ 4.3. List of Records
43
+ 4.3.1. Unpadded Size
44
+ 4.3.2. Uncompressed Size
45
+ 4.4. Index Padding
46
+ 4.5. CRC32
47
+ 5. Filter Chains
48
+ 5.1. Alignment
49
+ 5.2. Security
50
+ 5.3. Filters
51
+ 5.3.1. LZMA2
52
+ 5.3.2. Branch/Call/Jump Filters for Executables
53
+ 5.3.3. Delta
54
+ 5.3.3.1. Format of the Encoded Output
55
+ 5.4. Custom Filter IDs
56
+ 5.4.1. Reserved Custom Filter ID Ranges
57
+ 6. Cyclic Redundancy Checks
58
+ 7. References
59
+
60
+
61
+ 0. Preface
62
+
63
+ This document describes the .xz file format (filename suffix
64
+ ".xz", MIME type "application/x-xz"). It is intended that this
65
+ format replace the old .lzma format used by LZMA SDK and
66
+ LZMA Utils.
67
+
68
+
69
+ 0.1. Notices and Acknowledgements
70
+
71
+ This file format was designed by Lasse Collin
72
+ <lasse.collin@tukaani.org> and Igor Pavlov.
73
+
74
+ Special thanks for helping with this document goes to
75
+ Ville Koskinen. Thanks for helping with this document goes to
76
+ Mark Adler, H. Peter Anvin, Mikko Pouru, and Lars Wirzenius.
77
+
78
+ This document has been put into the public domain.
79
+
80
+
81
+ 0.2. Getting the Latest Version
82
+
83
+ The latest official version of this document can be downloaded
84
+ from <https://tukaani.org/xz/xz-file-format.txt>.
85
+
86
+ Specific versions of this document have a filename
87
+ xz-file-format-X.Y.Z.txt where X.Y.Z is the version number.
88
+ For example, the version 1.0.0 of this document is available
89
+ at <https://tukaani.org/xz/xz-file-format-1.0.0.txt>.
90
+
91
+
92
+ 0.3. Version History
93
+
94
+ Version Date Description
95
+
96
+ 1.2.1 2024-04-08 The URLs of this specification and
97
+ XZ Utils were changed back to the
98
+ original ones in Sections 0.2 and 7.
99
+
100
+ 1.2.0 2024-01-19 Added RISC-V filter and updated URLs in
101
+ Sections 0.2 and 7. The URL of this
102
+ specification was changed.
103
+
104
+ 1.1.0 2022-12-11 Added ARM64 filter and clarified 32-bit
105
+ ARM endianness in Section 5.3.2,
106
+ language improvements in Section 5.4
107
+
108
+ 1.0.4 2009-08-27 Language improvements in Sections 1.2,
109
+ 2.1.1.2, 3.1.1, 3.1.2, and 5.3.1
110
+
111
+ 1.0.3 2009-06-05 Spelling fixes in Sections 5.1 and 5.4
112
+
113
+ 1.0.2 2009-06-04 Typo fixes in Sections 4 and 5.3.1
114
+
115
+ 1.0.1 2009-06-01 Typo fix in Section 0.3 and minor
116
+ clarifications to Sections 2, 2.2,
117
+ 3.3, 4.4, and 5.3.2
118
+
119
+ 1.0.0 2009-01-14 The first official version
120
+
121
+
122
+ 1. Conventions
123
+
124
+ The key words "MUST", "MUST NOT", "REQUIRED", "SHOULD",
125
+ "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
126
+ document are to be interpreted as described in [RFC-2119].
127
+
128
+ Indicating a warning means displaying a message, returning
129
+ appropriate exit status, or doing something else to let the
130
+ user know that something worth warning occurred. The operation
131
+ SHOULD still finish if a warning is indicated.
132
+
133
+ Indicating an error means displaying a message, returning
134
+ appropriate exit status, or doing something else to let the
135
+ user know that something prevented successfully finishing the
136
+ operation. The operation MUST be aborted once an error has
137
+ been indicated.
138
+
139
+
140
+ 1.1. Byte and Its Representation
141
+
142
+ In this document, byte is always 8 bits.
143
+
144
+ A "null byte" has all bits unset. That is, the value of a null
145
+ byte is 0x00.
146
+
147
+ To represent byte blocks, this document uses notation that
148
+ is similar to the notation used in [RFC-1952]:
149
+
150
+ +-------+
151
+ | Foo | One byte.
152
+ +-------+
153
+
154
+ +---+---+
155
+ | Foo | Two bytes; that is, some of the vertical bars
156
+ +---+---+ can be missing.
157
+
158
+ +=======+
159
+ | Foo | Zero or more bytes.
160
+ +=======+
161
+
162
+ In this document, a boxed byte or a byte sequence declared
163
+ using this notation is called "a field". The example field
164
+ above would be called "the Foo field" or plain "Foo".
165
+
166
+ If there are many fields, they may be split to multiple lines.
167
+ This is indicated with an arrow ("--->"):
168
+
169
+ +=====+
170
+ | Foo |
171
+ +=====+
172
+
173
+ +=====+
174
+ ---> | Bar |
175
+ +=====+
176
+
177
+ The above is equivalent to this:
178
+
179
+ +=====+=====+
180
+ | Foo | Bar |
181
+ +=====+=====+
182
+
183
+
184
+ 1.2. Multibyte Integers
185
+
186
+ Multibyte integers of static length, such as CRC values,
187
+ are stored in little endian byte order (least significant
188
+ byte first).
189
+
190
+ When smaller values are more likely than bigger values (for
191
+ example file sizes), multibyte integers are encoded in a
192
+ variable-length representation:
193
+ - Numbers in the range [0, 127] are copied as is, and take
194
+ one byte of space.
195
+ - Bigger numbers will occupy two or more bytes. All but the
196
+ last byte of the multibyte representation have the highest
197
+ (eighth) bit set.
198
+
199
+ For now, the value of the variable-length integers is limited
200
+ to 63 bits, which limits the encoded size of the integer to
201
+ nine bytes. These limits may be increased in the future if
202
+ needed.
203
+
204
+ The following C code illustrates encoding and decoding of
205
+ variable-length integers. The functions return the number of
206
+ bytes occupied by the integer (1-9), or zero on error.
207
+
208
+ #include <stddef.h>
209
+ #include <inttypes.h>
210
+
211
+ size_t
212
+ encode(uint8_t buf[static 9], uint64_t num)
213
+ {
214
+ if (num > UINT64_MAX / 2)
215
+ return 0;
216
+
217
+ size_t i = 0;
218
+
219
+ while (num >= 0x80) {
220
+ buf[i++] = (uint8_t)(num) | 0x80;
221
+ num >>= 7;
222
+ }
223
+
224
+ buf[i++] = (uint8_t)(num);
225
+
226
+ return i;
227
+ }
228
+
229
+ size_t
230
+ decode(const uint8_t buf[], size_t size_max, uint64_t *num)
231
+ {
232
+ if (size_max == 0)
233
+ return 0;
234
+
235
+ if (size_max > 9)
236
+ size_max = 9;
237
+
238
+ *num = buf[0] & 0x7F;
239
+ size_t i = 0;
240
+
241
+ while (buf[i++] & 0x80) {
242
+ if (i >= size_max || buf[i] == 0x00)
243
+ return 0;
244
+
245
+ *num |= (uint64_t)(buf[i] & 0x7F) << (i * 7);
246
+ }
247
+
248
+ return i;
249
+ }
250
+
251
+
252
+ 2. Overall Structure of .xz File
253
+
254
+ Standalone .xz files consist of one or more Streams which may
255
+ have Stream Padding between or after them:
256
+
257
+ +========+================+========+================+
258
+ | Stream | Stream Padding | Stream | Stream Padding | ...
259
+ +========+================+========+================+
260
+
261
+ The sizes of Stream and Stream Padding are always multiples
262
+ of four bytes, thus the size of every valid .xz file MUST be
263
+ a multiple of four bytes.
264
+
265
+ While a typical file contains only one Stream and no Stream
266
+ Padding, a decoder handling standalone .xz files SHOULD support
267
+ files that have more than one Stream or Stream Padding.
268
+
269
+ In contrast to standalone .xz files, when the .xz file format
270
+ is used as an internal part of some other file format or
271
+ communication protocol, it usually is expected that the decoder
272
+ stops after the first Stream, and doesn't look for Stream
273
+ Padding or possibly other Streams.
274
+
275
+
276
+ 2.1. Stream
277
+
278
+ +-+-+-+-+-+-+-+-+-+-+-+-+=======+=======+ +=======+
279
+ | Stream Header | Block | Block | ... | Block |
280
+ +-+-+-+-+-+-+-+-+-+-+-+-+=======+=======+ +=======+
281
+
282
+ +=======+-+-+-+-+-+-+-+-+-+-+-+-+
283
+ ---> | Index | Stream Footer |
284
+ +=======+-+-+-+-+-+-+-+-+-+-+-+-+
285
+
286
+ All the above fields have a size that is a multiple of four. If
287
+ Stream is used as an internal part of another file format, it
288
+ is RECOMMENDED to make the Stream start at an offset that is
289
+ a multiple of four bytes.
290
+
291
+ Stream Header, Index, and Stream Footer are always present in
292
+ a Stream. The maximum size of the Index field is 16 GiB (2^34).
293
+
294
+ There are zero or more Blocks. The maximum number of Blocks is
295
+ limited only by the maximum size of the Index field.
296
+
297
+ Total size of a Stream MUST be less than 8 EiB (2^63 bytes).
298
+ The same limit applies to the total amount of uncompressed
299
+ data stored in a Stream.
300
+
301
+ If an implementation supports handling .xz files with multiple
302
+ concatenated Streams, it MAY apply the above limits to the file
303
+ as a whole instead of limiting on a per-Stream basis.
304
+
305
+
306
+ 2.1.1. Stream Header
307
+
308
+ +---+---+---+---+---+---+-------+------+--+--+--+--+
309
+ | Header Magic Bytes | Stream Flags | CRC32 |
310
+ +---+---+---+---+---+---+-------+------+--+--+--+--+
311
+
312
+
313
+ 2.1.1.1. Header Magic Bytes
314
+
315
+ The first six (6) bytes of the Stream are the so-called Header
316
+ Magic Bytes. They can be used to identify the file type.
317
+
318
+ Using a C array and ASCII:
319
+ const uint8_t HEADER_MAGIC[6]
320
+ = { 0xFD, '7', 'z', 'X', 'Z', 0x00 };
321
+
322
+ In plain hexadecimal:
323
+ FD 37 7A 58 5A 00
324
+
325
+ Notes:
326
+ - The first byte (0xFD) was chosen so that the files cannot
327
+ be erroneously detected as being in .lzma format, in which
328
+ the first byte is in the range [0x00, 0xE0].
329
+ - The sixth byte (0x00) was chosen to prevent applications
330
+ from misdetecting the file as a text file.
331
+
332
+ If the Header Magic Bytes don't match, the decoder MUST
333
+ indicate an error.
334
+
335
+
336
+ 2.1.1.2. Stream Flags
337
+
338
+ The first byte of Stream Flags is always a null byte. In the
339
+ future, this byte may be used to indicate a new Stream version
340
+ or other Stream properties.
341
+
342
+ The second byte of Stream Flags is a bit field:
343
+
344
+ Bit(s) Mask Description
345
+ 0-3 0x0F Type of Check (see Section 3.4):
346
+ ID Size Check name
347
+ 0x00 0 bytes None
348
+ 0x01 4 bytes CRC32
349
+ 0x02 4 bytes (Reserved)
350
+ 0x03 4 bytes (Reserved)
351
+ 0x04 8 bytes CRC64
352
+ 0x05 8 bytes (Reserved)
353
+ 0x06 8 bytes (Reserved)
354
+ 0x07 16 bytes (Reserved)
355
+ 0x08 16 bytes (Reserved)
356
+ 0x09 16 bytes (Reserved)
357
+ 0x0A 32 bytes SHA-256
358
+ 0x0B 32 bytes (Reserved)
359
+ 0x0C 32 bytes (Reserved)
360
+ 0x0D 64 bytes (Reserved)
361
+ 0x0E 64 bytes (Reserved)
362
+ 0x0F 64 bytes (Reserved)
363
+ 4-7 0xF0 Reserved for future use; MUST be zero for now.
364
+
365
+ Implementations SHOULD support at least the Check IDs 0x00
366
+ (None) and 0x01 (CRC32). Supporting other Check IDs is
367
+ OPTIONAL. If an unsupported Check is used, the decoder SHOULD
368
+ indicate a warning or error.
369
+
370
+ If any reserved bit is set, the decoder MUST indicate an error.
371
+ It is possible that there is a new field present which the
372
+ decoder is not aware of, and can thus parse the Stream Header
373
+ incorrectly.
374
+
375
+
376
+ 2.1.1.3. CRC32
377
+
378
+ The CRC32 is calculated from the Stream Flags field. It is
379
+ stored as an unsigned 32-bit little endian integer. If the
380
+ calculated value does not match the stored one, the decoder
381
+ MUST indicate an error.
382
+
383
+ The idea is that Stream Flags would always be two bytes, even
384
+ if new features are needed. This way old decoders will be able
385
+ to verify the CRC32 calculated from Stream Flags, and thus
386
+ distinguish between corrupt files (CRC32 doesn't match) and
387
+ files that the decoder doesn't support (CRC32 matches but
388
+ Stream Flags has reserved bits set).
389
+
390
+
391
+ 2.1.2. Stream Footer
392
+
393
+ +-+-+-+-+---+---+---+---+-------+------+----------+---------+
394
+ | CRC32 | Backward Size | Stream Flags | Footer Magic Bytes |
395
+ +-+-+-+-+---+---+---+---+-------+------+----------+---------+
396
+
397
+
398
+ 2.1.2.1. CRC32
399
+
400
+ The CRC32 is calculated from the Backward Size and Stream Flags
401
+ fields. It is stored as an unsigned 32-bit little endian
402
+ integer. If the calculated value does not match the stored one,
403
+ the decoder MUST indicate an error.
404
+
405
+ The reason to have the CRC32 field before the Backward Size and
406
+ Stream Flags fields is to keep the four-byte fields aligned to
407
+ a multiple of four bytes.
408
+
409
+
410
+ 2.1.2.2. Backward Size
411
+
412
+ Backward Size is stored as a 32-bit little endian integer,
413
+ which indicates the size of the Index field as a multiple of
414
+ four bytes, minimum value being four bytes:
415
+
416
+ real_backward_size = (stored_backward_size + 1) * 4;
417
+
418
+ If the stored value does not match the real size of the Index
419
+ field, the decoder MUST indicate an error.
420
+
421
+ Using a fixed-size integer to store Backward Size makes
422
+ it slightly simpler to parse the Stream Footer when the
423
+ application needs to parse the Stream backwards.
424
+
425
+
426
+ 2.1.2.3. Stream Flags
427
+
428
+ This is a copy of the Stream Flags field from the Stream
429
+ Header. The information stored to Stream Flags is needed
430
+ when parsing the Stream backwards. The decoder MUST compare
431
+ the Stream Flags fields in both Stream Header and Stream
432
+ Footer, and indicate an error if they are not identical.
433
+
434
+
435
+ 2.1.2.4. Footer Magic Bytes
436
+
437
+ As the last step of the decoding process, the decoder MUST
438
+ verify the existence of Footer Magic Bytes. If they don't
439
+ match, an error MUST be indicated.
440
+
441
+ Using a C array and ASCII:
442
+ const uint8_t FOOTER_MAGIC[2] = { 'Y', 'Z' };
443
+
444
+ In hexadecimal:
445
+ 59 5A
446
+
447
+ The primary reason to have Footer Magic Bytes is to make
448
+ it easier to detect incomplete files quickly, without
449
+ uncompressing. If the file does not end with Footer Magic Bytes
450
+ (excluding Stream Padding described in Section 2.2), it cannot
451
+ be undamaged, unless someone has intentionally appended garbage
452
+ after the end of the Stream.
453
+
454
+
455
+ 2.2. Stream Padding
456
+
457
+ Only the decoders that support decoding of concatenated Streams
458
+ MUST support Stream Padding.
459
+
460
+ Stream Padding MUST contain only null bytes. To preserve the
461
+ four-byte alignment of consecutive Streams, the size of Stream
462
+ Padding MUST be a multiple of four bytes. Empty Stream Padding
463
+ is allowed. If these requirements are not met, the decoder MUST
464
+ indicate an error.
465
+
466
+ Note that non-empty Stream Padding is allowed at the end of the
467
+ file; there doesn't need to be a new Stream after non-empty
468
+ Stream Padding. This can be convenient in certain situations
469
+ [GNU-tar].
470
+
471
+ The possibility of Stream Padding MUST be taken into account
472
+ when designing an application that parses Streams backwards,
473
+ and the application supports concatenated Streams.
474
+
475
+
476
+ 3. Block
477
+
478
+ +==============+=================+===============+=======+
479
+ | Block Header | Compressed Data | Block Padding | Check |
480
+ +==============+=================+===============+=======+
481
+
482
+
483
+ 3.1. Block Header
484
+
485
+ +-------------------+-------------+=================+
486
+ | Block Header Size | Block Flags | Compressed Size |
487
+ +-------------------+-------------+=================+
488
+
489
+ +===================+======================+
490
+ ---> | Uncompressed Size | List of Filter Flags |
491
+ +===================+======================+
492
+
493
+ +================+--+--+--+--+
494
+ ---> | Header Padding | CRC32 |
495
+ +================+--+--+--+--+
496
+
497
+
498
+ 3.1.1. Block Header Size
499
+
500
+ This field overlaps with the Index Indicator field (see
501
+ Section 4.1).
502
+
503
+ This field contains the size of the Block Header field,
504
+ including the Block Header Size field itself. Valid values are
505
+ in the range [0x01, 0xFF], which indicate the size of the Block
506
+ Header as multiples of four bytes, minimum size being eight
507
+ bytes:
508
+
509
+ real_header_size = (encoded_header_size + 1) * 4;
510
+
511
+ If a Block Header bigger than 1024 bytes is needed in the
512
+ future, a new field can be added between the Block Header and
513
+ Compressed Data fields. The presence of this new field would
514
+ be indicated in the Block Header field.
515
+
516
+
517
+ 3.1.2. Block Flags
518
+
519
+ The Block Flags field is a bit field:
520
+
521
+ Bit(s) Mask Description
522
+ 0-1 0x03 Number of filters (1-4)
523
+ 2-5 0x3C Reserved for future use; MUST be zero for now.
524
+ 6 0x40 The Compressed Size field is present.
525
+ 7 0x80 The Uncompressed Size field is present.
526
+
527
+ If any reserved bit is set, the decoder MUST indicate an error.
528
+ It is possible that there is a new field present which the
529
+ decoder is not aware of, and can thus parse the Block Header
530
+ incorrectly.
531
+
532
+
533
+ 3.1.3. Compressed Size
534
+
535
+ This field is present only if the appropriate bit is set in
536
+ the Block Flags field (see Section 3.1.2).
537
+
538
+ The Compressed Size field contains the size of the Compressed
539
+ Data field, which MUST be non-zero. Compressed Size is stored
540
+ using the encoding described in Section 1.2. If the Compressed
541
+ Size doesn't match the size of the Compressed Data field, the
542
+ decoder MUST indicate an error.
543
+
544
+
545
+ 3.1.4. Uncompressed Size
546
+
547
+ This field is present only if the appropriate bit is set in
548
+ the Block Flags field (see Section 3.1.2).
549
+
550
+ The Uncompressed Size field contains the size of the Block
551
+ after uncompressing. Uncompressed Size is stored using the
552
+ encoding described in Section 1.2. If the Uncompressed Size
553
+ does not match the real uncompressed size, the decoder MUST
554
+ indicate an error.
555
+
556
+ Storing the Compressed Size and Uncompressed Size fields serves
557
+ several purposes:
558
+ - The decoder knows how much memory it needs to allocate
559
+ for a temporary buffer in multithreaded mode.
560
+ - Simple error detection: wrong size indicates a broken file.
561
+ - Seeking forwards to a specific location in streamed mode.
562
+
563
+ It should be noted that the only reliable way to determine
564
+ the real uncompressed size is to uncompress the Block,
565
+ because the Block Header and Index fields may contain
566
+ (intentionally or unintentionally) invalid information.
567
+
568
+
569
+ 3.1.5. List of Filter Flags
570
+
571
+ +================+================+ +================+
572
+ | Filter 0 Flags | Filter 1 Flags | ... | Filter n Flags |
573
+ +================+================+ +================+
574
+
575
+ The number of Filter Flags fields is stored in the Block Flags
576
+ field (see Section 3.1.2).
577
+
578
+ The format of each Filter Flags field is as follows:
579
+
580
+ +===========+====================+===================+
581
+ | Filter ID | Size of Properties | Filter Properties |
582
+ +===========+====================+===================+
583
+
584
+ Both Filter ID and Size of Properties are stored using the
585
+ encoding described in Section 1.2. Size of Properties indicates
586
+ the size of the Filter Properties field as bytes. The list of
587
+ officially defined Filter IDs and the formats of their Filter
588
+ Properties are described in Section 5.3.
589
+
590
+ Filter IDs greater than or equal to 0x4000_0000_0000_0000
591
+ (2^62) are reserved for implementation-specific internal use.
592
+ These Filter IDs MUST never be used in List of Filter Flags.
593
+
594
+
595
+ 3.1.6. Header Padding
596
+
597
+ This field contains as many null bytes as are needed to make
598
+ the Block Header have the size specified in Block Header Size.
599
+ If any of the bytes are not null bytes, the decoder MUST
600
+ indicate an error. It is possible that there is a new field
601
+ present which the decoder is not aware of, and can thus parse
602
+ the Block Header incorrectly.
603
+
604
+
605
+ 3.1.7. CRC32
606
+
607
+ The CRC32 is calculated over everything in the Block Header
608
+ field except the CRC32 field itself. It is stored as an
609
+ unsigned 32-bit little endian integer. If the calculated
610
+ value does not match the stored one, the decoder MUST indicate
611
+ an error.
612
+
613
+ Verifying the CRC32 of the Block Header before parsing the
614
+ actual contents allows the decoder to distinguish between
615
+ corrupt and unsupported files.
616
+
617
+
618
+ 3.2. Compressed Data
619
+
620
+ The format of Compressed Data depends on Block Flags and List
621
+ of Filter Flags. Excluding the descriptions of the simplest
622
+ filters in Section 5.3, the format of the filter-specific
623
+ encoded data is out of scope of this document.
624
+
625
+
626
+ 3.3. Block Padding
627
+
628
+ Block Padding MUST contain 0-3 null bytes to make the size of
629
+ the Block a multiple of four bytes. This can be needed when
630
+ the size of Compressed Data is not a multiple of four. If any
631
+ of the bytes in Block Padding are not null bytes, the decoder
632
+ MUST indicate an error.
633
+
634
+
635
+ 3.4. Check
636
+
637
+ The type and size of the Check field depends on which bits
638
+ are set in the Stream Flags field (see Section 2.1.1.2).
639
+
640
+ The Check, when used, is calculated from the original
641
+ uncompressed data. If the calculated Check does not match the
642
+ stored one, the decoder MUST indicate an error. If the selected
643
+ type of Check is not supported by the decoder, it SHOULD
644
+ indicate a warning or error.
645
+
646
+
647
+ 4. Index
648
+
649
+ +-----------------+===================+
650
+ | Index Indicator | Number of Records |
651
+ +-----------------+===================+
652
+
653
+ +=================+===============+-+-+-+-+
654
+ ---> | List of Records | Index Padding | CRC32 |
655
+ +=================+===============+-+-+-+-+
656
+
657
+ Index serves several purposes. Using it, one can
658
+ - verify that all Blocks in a Stream have been processed;
659
+ - find out the uncompressed size of a Stream; and
660
+ - quickly access the beginning of any Block (random access).
661
+
662
+
663
+ 4.1. Index Indicator
664
+
665
+ This field overlaps with the Block Header Size field (see
666
+ Section 3.1.1). The value of Index Indicator is always 0x00.
667
+
668
+
669
+ 4.2. Number of Records
670
+
671
+ This field indicates how many Records there are in the List
672
+ of Records field, and thus how many Blocks there are in the
673
+ Stream. The value is stored using the encoding described in
674
+ Section 1.2. If the decoder has decoded all the Blocks of the
675
+ Stream, and then notices that the Number of Records doesn't
676
+ match the real number of Blocks, the decoder MUST indicate an
677
+ error.
678
+
679
+
680
+ 4.3. List of Records
681
+
682
+ List of Records consists of as many Records as indicated by the
683
+ Number of Records field:
684
+
685
+ +========+========+
686
+ | Record | Record | ...
687
+ +========+========+
688
+
689
+ Each Record contains information about one Block:
690
+
691
+ +===============+===================+
692
+ | Unpadded Size | Uncompressed Size |
693
+ +===============+===================+
694
+
695
+ If the decoder has decoded all the Blocks of the Stream, it
696
+ MUST verify that the contents of the Records match the real
697
+ Unpadded Size and Uncompressed Size of the respective Blocks.
698
+
699
+ Implementation hint: It is possible to verify the Index with
700
+ constant memory usage by calculating for example SHA-256 of
701
+ both the real size values and the List of Records, then
702
+ comparing the hash values. Implementing this using
703
+ non-cryptographic hash like CRC32 SHOULD be avoided unless
704
+ small code size is important.
705
+
706
+ If the decoder supports random-access reading, it MUST verify
707
+ that Unpadded Size and Uncompressed Size of every completely
708
+ decoded Block match the sizes stored in the Index. If only
709
+ a partial Block is decoded, the decoder MUST verify that the
710
+ processed sizes don't exceed the sizes stored in the Index.
711
+
712
+
713
+ 4.3.1. Unpadded Size
714
+
715
+ This field indicates the size of the Block excluding the Block
716
+ Padding field. That is, Unpadded Size is the size of the Block
717
+ Header, Compressed Data, and Check fields. Unpadded Size is
718
+ stored using the encoding described in Section 1.2. The value
719
+ MUST never be zero; with the current structure of Blocks, the
720
+ actual minimum value for Unpadded Size is five.
721
+
722
+ Implementation note: Because the size of the Block Padding
723
+ field is not included in Unpadded Size, calculating the total
724
+ size of a Stream or doing random-access reading requires
725
+ calculating the actual size of the Blocks by rounding Unpadded
726
+ Sizes up to the next multiple of four.
727
+
728
+ The reason to exclude Block Padding from Unpadded Size is to
729
+ ease making a raw copy of Compressed Data without Block
730
+ Padding. This can be useful, for example, if someone wants
731
+ to convert Streams to some other file format quickly.
732
+
733
+
734
+ 4.3.2. Uncompressed Size
735
+
736
+ This field indicates the Uncompressed Size of the respective
737
+ Block as bytes. The value is stored using the encoding
738
+ described in Section 1.2.
739
+
740
+
741
+ 4.4. Index Padding
742
+
743
+ This field MUST contain 0-3 null bytes to pad the Index to
744
+ a multiple of four bytes. If any of the bytes are not null
745
+ bytes, the decoder MUST indicate an error.
746
+
747
+
748
+ 4.5. CRC32
749
+
750
+ The CRC32 is calculated over everything in the Index field
751
+ except the CRC32 field itself. The CRC32 is stored as an
752
+ unsigned 32-bit little endian integer. If the calculated
753
+ value does not match the stored one, the decoder MUST indicate
754
+ an error.
755
+
756
+
757
+ 5. Filter Chains
758
+
759
+ The Block Flags field defines how many filters are used. When
760
+ more than one filter is used, the filters are chained; that is,
761
+ the output of one filter is the input of another filter. The
762
+ following figure illustrates the direction of data flow.
763
+
764
+ v Uncompressed Data ^
765
+ | Filter 0 |
766
+ Encoder | Filter 1 | Decoder
767
+ | Filter n |
768
+ v Compressed Data ^
769
+
770
+
771
+ 5.1. Alignment
772
+
773
+ Alignment of uncompressed input data is usually the job of
774
+ the application producing the data. For example, to get the
775
+ best results, an archiver tool should make sure that all
776
+ PowerPC executable files in the archive stream start at
777
+ offsets that are multiples of four bytes.
778
+
779
+ Some filters, for example LZMA2, can be configured to take
780
+ advantage of specified alignment of input data. Note that
781
+ taking advantage of aligned input can be beneficial also when
782
+ a filter is not the first filter in the chain. For example,
783
+ if you compress PowerPC executables, you may want to use the
784
+ PowerPC filter and chain that with the LZMA2 filter. Because
785
+ not only the input but also the output alignment of the PowerPC
786
+ filter is four bytes, it is now beneficial to set LZMA2
787
+ settings so that the LZMA2 encoder can take advantage of its
788
+ four-byte-aligned input data.
789
+
790
+ The output of the last filter in the chain is stored to the
791
+ Compressed Data field, which is guaranteed to be aligned
792
+ to a multiple of four bytes relative to the beginning of the
793
+ Stream. This can increase
794
+ - speed, if the filtered data is handled multiple bytes at
795
+ a time by the filter-specific encoder and decoder,
796
+ because accessing aligned data in computer memory is
797
+ usually faster; and
798
+ - compression ratio, if the output data is later compressed
799
+ with an external compression tool.
800
+
801
+
802
+ 5.2. Security
803
+
804
+ If filters were allowed to be chained freely, it would be
805
+ possible to create malicious files, that would be very slow to
806
+ decode. Such files could be used to create denial of service
807
+ attacks.
808
+
809
+ Slow files could occur when multiple filters are chained:
810
+
811
+ v Compressed input data
812
+ | Filter 1 decoder (last filter)
813
+ | Filter 0 decoder (non-last filter)
814
+ v Uncompressed output data
815
+
816
+ The decoder of the last filter in the chain produces a lot of
817
+ output from little input. Another filter in the chain takes the
818
+ output of the last filter, and produces very little output
819
+ while consuming a lot of input. As a result, a lot of data is
820
+ moved inside the filter chain, but the filter chain as a whole
821
+ gets very little work done.
822
+
823
+ To prevent this kind of slow files, there are restrictions on
824
+ how the filters can be chained. These restrictions MUST be
825
+ taken into account when designing new filters.
826
+
827
+ The maximum number of filters in the chain has been limited to
828
+ four, thus there can be at most three non-last filters.
829
+ Of these three non-last filters, only two are allowed to change
830
+ the size of the data.
831
+
832
+ The non-last filters, that change the size of the data, MUST
833
+ have a limit on how much the decoder can compress the data: the
834
+ decoder SHOULD produce at least n bytes of output when the
835
+ filter is given 2n bytes of input. This limit is not
836
+ absolute, but significant deviations MUST be avoided.
837
+
838
+ The above limitations guarantee that if the last filter in the
839
+ chain produces 4n bytes of output, the chain as a whole will
840
+ produce at least n bytes of output.
841
+
842
+
843
+ 5.3. Filters
844
+
845
+ 5.3.1. LZMA2
846
+
847
+ LZMA (Lempel-Ziv-Markov chain-Algorithm) is a general-purpose
848
+ compression algorithm with high compression ratio and fast
849
+ decompression. LZMA is based on LZ77 and range coding
850
+ algorithms.
851
+
852
+ LZMA2 is an extension on top of the original LZMA. LZMA2 uses
853
+ LZMA internally, but adds support for flushing the encoder,
854
+ uncompressed chunks, eases stateful decoder implementations,
855
+ and improves support for multithreading. Thus, the plain LZMA
856
+ will not be supported in this file format.
857
+
858
+ Filter ID: 0x21
859
+ Size of Filter Properties: 1 byte
860
+ Changes size of data: Yes
861
+ Allow as a non-last filter: No
862
+ Allow as the last filter: Yes
863
+
864
+ Preferred alignment:
865
+ Input data: Adjustable to 1/2/4/8/16 byte(s)
866
+ Output data: 1 byte
867
+
868
+ The format of the one-byte Filter Properties field is as
869
+ follows:
870
+
871
+ Bits Mask Description
872
+ 0-5 0x3F Dictionary Size
873
+ 6-7 0xC0 Reserved for future use; MUST be zero for now.
874
+
875
+ Dictionary Size is encoded with one-bit mantissa and five-bit
876
+ exponent. The smallest dictionary size is 4 KiB and the biggest
877
+ is 4 GiB.
878
+
879
+ Raw value Mantissa Exponent Dictionary size
880
+ 0 2 11 4 KiB
881
+ 1 3 11 6 KiB
882
+ 2 2 12 8 KiB
883
+ 3 3 12 12 KiB
884
+ 4 2 13 16 KiB
885
+ 5 3 13 24 KiB
886
+ 6 2 14 32 KiB
887
+ ... ... ... ...
888
+ 35 3 28 768 MiB
888
+ 36 2 29 1024 MiB
890
+ 37 3 29 1536 MiB
891
+ 38 2 30 2048 MiB
892
+ 39 3 30 3072 MiB
893
+ 40 2 31 4096 MiB - 1 B
894
+
895
+ Instead of having a table in the decoder, the dictionary size
896
+ can be decoded using the following C code:
897
+
898
+ const uint8_t bits = get_dictionary_flags() & 0x3F;
899
+ if (bits > 40)
900
+ return DICTIONARY_TOO_BIG; // Bigger than 4 GiB
901
+
902
+ uint32_t dictionary_size;
903
+ if (bits == 40) {
904
+ dictionary_size = UINT32_MAX;
905
+ } else {
906
+ dictionary_size = 2 | (bits & 1);
907
+ dictionary_size <<= bits / 2 + 11;
908
+ }
909
+
910
+
911
+ 5.3.2. Branch/Call/Jump Filters for Executables
912
+
913
+ These filters convert relative branch, call, and jump
914
+ instructions to their absolute counterparts in executable
915
+ files. This conversion increases redundancy and thus
916
+ compression ratio.
917
+
918
+ Size of Filter Properties: 0 or 4 bytes
919
+ Changes size of data: No
920
+ Allow as a non-last filter: Yes
921
+ Allow as the last filter: No
922
+
923
+ Below is the list of filters in this category. The alignment
924
+ is the same for both input and output data.
925
+
926
+ Filter ID Alignment Description
927
+ 0x04 1 byte x86 filter (BCJ)
928
+ 0x05 4 bytes PowerPC (big endian) filter
929
+ 0x06 16 bytes IA64 filter
930
+ 0x07 4 bytes ARM filter [1]
931
+ 0x08 2 bytes ARM Thumb filter [1]
932
+ 0x09 4 bytes SPARC filter
933
+ 0x0A 4 bytes ARM64 filter [2]
934
+ 0x0B 2 bytes RISC-V filter
935
+
936
+ [1] These are for little endian instruction encoding.
937
+ This must not be confused with data endianness.
938
+ A processor configured for big endian data access
939
+ may still use little endian instruction encoding.
940
+ The filters don't care about the data endianness.
941
+
942
+ [2] 4096-byte alignment gives the best results
943
+ because the address in the ADRP instruction
944
+ is a multiple of 4096 bytes.
945
+
946
+ If the size of Filter Properties is four bytes, the Filter
947
+ Properties field contains the start offset used for address
948
+ conversions. It is stored as an unsigned 32-bit little endian
949
+ integer. The start offset MUST be a multiple of the alignment
950
+ of the filter as listed in the table above; if it isn't, the
951
+ decoder MUST indicate an error. If the size of Filter
952
+ Properties is zero, the start offset is zero.
953
+
954
+ Setting the start offset may be useful if an executable has
955
+ multiple sections, and there are many cross-section calls.
956
+ Taking advantage of this feature usually requires usage of
957
+ the Subblock filter, whose design is not complete yet.
958
+
959
+
960
+ 5.3.3. Delta
961
+
962
+ The Delta filter may increase compression ratio when the value
963
+ of the next byte correlates with the value of an earlier byte
964
+ at a specified distance.
965
+
966
+ Filter ID: 0x03
967
+ Size of Filter Properties: 1 byte
968
+ Changes size of data: No
969
+ Allow as a non-last filter: Yes
970
+ Allow as the last filter: No
971
+
972
+ Preferred alignment:
973
+ Input data: 1 byte
974
+ Output data: Same as the original input data
975
+
976
+ The Properties byte indicates the delta distance, which can be
977
+ 1-256 bytes backwards from the current byte: 0x00 indicates
978
+ a distance of 1 byte and 0xFF a distance of 256 bytes.
979
+
980
+
981
+ 5.3.3.1. Format of the Encoded Output
982
+
983
+ The code below illustrates both encoding and decoding with
984
+ the Delta filter.
985
+
986
+ // Distance is in the range [1, 256].
987
+ const unsigned int distance = get_properties_byte() + 1;
988
+ uint8_t pos = 0;
989
+ uint8_t delta[256];
990
+
991
+ memset(delta, 0, sizeof(delta));
992
+
993
+ while (1) {
994
+ const int byte = read_byte();
995
+ if (byte == EOF)
996
+ break;
997
+
998
+ uint8_t tmp = delta[(uint8_t)(distance + pos)];
999
+ if (is_encoder) {
1000
+ tmp = (uint8_t)(byte) - tmp;
1001
+ delta[pos] = (uint8_t)(byte);
1002
+ } else {
1003
+ tmp = (uint8_t)(byte) + tmp;
1004
+ delta[pos] = tmp;
1005
+ }
1006
+
1007
+ write_byte(tmp);
1008
+ --pos;
1009
+ }
1010
+
1011
+
1012
+ 5.4. Custom Filter IDs
1013
+
1014
+ If a developer wants to use custom Filter IDs, there are two
1015
+ choices. The first choice is to contact Lasse Collin and ask
1016
+ him to allocate a range of IDs for the developer.
1017
+
1018
+ The second choice is to generate a 40-bit random integer
1019
+ which the developer can use as a personal Developer ID.
1020
+ To minimize the risk of collisions, Developer ID has to be
1021
+ a randomly generated integer, not a manually selected "hex word".
1022
+ The following command, which works on many free operating
1023
+ systems, can be used to generate Developer ID:
1024
+
1025
+ dd if=/dev/urandom bs=5 count=1 | hexdump
1026
+
1027
+ The developer can then use the Developer ID to create unique
1028
+ (well, hopefully unique) Filter IDs.
1029
+
1030
+ Bits Mask Description
1031
+ 0-15 0x0000_0000_0000_FFFF Filter ID
1032
+ 16-55 0x00FF_FFFF_FFFF_0000 Developer ID
1033
+ 56-62 0x3F00_0000_0000_0000 Static prefix: 0x3F
1034
+
1035
+ The resulting 63-bit integer will use 9 bytes of space when
1036
+ stored using the encoding described in Section 1.2. To get
1037
+ a shorter ID, see the beginning of this Section for how to
1038
+ request a custom ID range.
1039
+
1040
+
1041
+ 5.4.1. Reserved Custom Filter ID Ranges
1042
+
1043
+ Range Description
1044
+ 0x0000_0300 - 0x0000_04FF Reserved to ease .7z compatibility
1045
+ 0x0002_0000 - 0x0007_FFFF Reserved to ease .7z compatibility
1046
+ 0x0200_0000 - 0x07FF_FFFF Reserved to ease .7z compatibility
1047
+
1048
+
1049
+ 6. Cyclic Redundancy Checks
1050
+
1051
+ There are several incompatible variations to calculate CRC32
1052
+ and CRC64. For simplicity and clarity, complete examples are
1053
+ provided to calculate the checks as they are used in this file
1054
+ format. Implementations MAY use different code as long as it
1055
+ gives identical results.
1056
+
1057
+ The program below reads data from standard input, calculates
1058
+ the CRC32 and CRC64 values, and prints the calculated values
1059
+ as big endian hexadecimal strings to standard output.
1060
+
1061
+ #include <stddef.h>
1062
+ #include <inttypes.h>
1063
+ #include <stdio.h>
1064
+
1065
+ uint32_t crc32_table[256];
1066
+ uint64_t crc64_table[256];
1067
+
1068
+ void
1069
+ init(void)
1070
+ {
1071
+ static const uint32_t poly32 = UINT32_C(0xEDB88320);
1072
+ static const uint64_t poly64
1073
+ = UINT64_C(0xC96C5795D7870F42);
1074
+
1075
+ for (size_t i = 0; i < 256; ++i) {
1076
+ uint32_t crc32 = i;
1077
+ uint64_t crc64 = i;
1078
+
1079
+ for (size_t j = 0; j < 8; ++j) {
1080
+ if (crc32 & 1)
1081
+ crc32 = (crc32 >> 1) ^ poly32;
1082
+ else
1083
+ crc32 >>= 1;
1084
+
1085
+ if (crc64 & 1)
1086
+ crc64 = (crc64 >> 1) ^ poly64;
1087
+ else
1088
+ crc64 >>= 1;
1089
+ }
1090
+
1091
+ crc32_table[i] = crc32;
1092
+ crc64_table[i] = crc64;
1093
+ }
1094
+ }
1095
+
1096
+ uint32_t
1097
+ crc32(const uint8_t *buf, size_t size, uint32_t crc)
1098
+ {
1099
+ crc = ~crc;
1100
+ for (size_t i = 0; i < size; ++i)
1101
+ crc = crc32_table[buf[i] ^ (crc & 0xFF)]
1102
+ ^ (crc >> 8);
1103
+ return ~crc;
1104
+ }
1105
+
1106
+ uint64_t
1107
+ crc64(const uint8_t *buf, size_t size, uint64_t crc)
1108
+ {
1109
+ crc = ~crc;
1110
+ for (size_t i = 0; i < size; ++i)
1111
+ crc = crc64_table[buf[i] ^ (crc & 0xFF)]
1112
+ ^ (crc >> 8);
1113
+ return ~crc;
1114
+ }
1115
+
1116
+ int
1117
+ main()
1118
+ {
1119
+ init();
1120
+
1121
+ uint32_t value32 = 0;
1122
+ uint64_t value64 = 0;
1123
+ uint64_t total_size = 0;
1124
+ uint8_t buf[8192];
1125
+
1126
+ while (1) {
1127
+ const size_t buf_size
1128
+ = fread(buf, 1, sizeof(buf), stdin);
1129
+ if (buf_size == 0)
1130
+ break;
1131
+
1132
+ total_size += buf_size;
1133
+ value32 = crc32(buf, buf_size, value32);
1134
+ value64 = crc64(buf, buf_size, value64);
1135
+ }
1136
+
1137
+ printf("Bytes: %" PRIu64 "\n", total_size);
1138
+ printf("CRC-32: 0x%08" PRIX32 "\n", value32);
1139
+ printf("CRC-64: 0x%016" PRIX64 "\n", value64);
1140
+
1141
+ return 0;
1142
+ }
1143
+
1144
+
1145
+ 7. References
1146
+
1147
+ LZMA SDK - The original LZMA implementation
1148
+ https://7-zip.org/sdk.html
1149
+
1150
+ LZMA Utils - LZMA adapted to POSIX-like systems
1151
+ https://tukaani.org/lzma/
1152
+
1153
+ XZ Utils - The next generation of LZMA Utils
1154
+ https://tukaani.org/xz/
1155
+
1156
+ [RFC-1952]
1157
+ GZIP file format specification version 4.3
1158
+ https://www.ietf.org/rfc/rfc1952.txt
1159
+ - Notation of byte boxes in section "2.1. Overall conventions"
1160
+
1161
+ [RFC-2119]
1162
+ Key words for use in RFCs to Indicate Requirement Levels
1163
+ https://www.ietf.org/rfc/rfc2119.txt
1164
+
1165
+ [GNU-tar]
1166
+ GNU tar 1.35 manual
1167
+ https://www.gnu.org/software/tar/manual/html_node/Blocking-Factor.html
1168
+ - Node 9.4.2 "Blocking Factor", paragraph that begins
1169
+ "gzip will complain about trailing garbage"
1170
+ - Note that this URL points to the latest version of the
1171
+ manual, and may some day not contain the note which is in
1172
+ 1.35. For the exact version of the manual, download GNU
1173
+ tar 1.35: ftp://ftp.gnu.org/pub/gnu/tar/tar-1.35.tar.gz
1174
+