html-to-markdown 2.24.6 → 2.25.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/html-to-markdown-rb/native/Cargo.lock +9 -32
  5. data/ext/html-to-markdown-rb/native/Cargo.toml +1 -1
  6. data/lib/html_to_markdown/version.rb +1 -1
  7. data/rust-vendor/html-to-markdown-rs/Cargo.toml +0 -1
  8. data/rust-vendor/html-to-markdown-rs/src/converter/main_helpers.rs +1 -1
  9. data/rust-vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +20 -5
  10. data/rust-vendor/html-to-markdown-rs/src/lib.rs +1 -0
  11. data/rust-vendor/{markup5ever_rcdom/lib.rs → html-to-markdown-rs/src/rcdom.rs} +56 -91
  12. data/rust-vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +157 -0
  13. data/rust-vendor/memmap2/.cargo-checksum.json +1 -1
  14. data/rust-vendor/memmap2/.cargo_vcs_info.json +1 -1
  15. data/rust-vendor/memmap2/CHANGELOG.md +8 -0
  16. data/rust-vendor/memmap2/Cargo.lock +1 -1
  17. data/rust-vendor/memmap2/Cargo.toml +2 -1
  18. data/rust-vendor/memmap2/Cargo.toml.orig +2 -1
  19. data/rust-vendor/memmap2/src/lib.rs +25 -1
  20. data/rust-vendor/memmap2/src/stub.rs +1 -4
  21. data/rust-vendor/memmap2/src/unix.rs +14 -1
  22. data/rust-vendor/png/.cargo-checksum.json +1 -1
  23. data/rust-vendor/png/.cargo_vcs_info.json +1 -1
  24. data/rust-vendor/png/CHANGES.md +44 -0
  25. data/rust-vendor/png/Cargo.lock +124 -171
  26. data/rust-vendor/png/Cargo.toml +1 -1
  27. data/rust-vendor/png/Cargo.toml.orig +1 -1
  28. data/rust-vendor/png/benches/expand_paletted.rs +5 -5
  29. data/rust-vendor/png/benches/unfilter.rs +3 -3
  30. data/rust-vendor/png/src/adam7.rs +17 -10
  31. data/rust-vendor/png/src/common.rs +8 -8
  32. data/rust-vendor/png/src/decoder/mod.rs +53 -20
  33. data/rust-vendor/png/src/decoder/stream.rs +263 -78
  34. data/rust-vendor/png/src/decoder/unfiltering_buffer.rs +210 -53
  35. data/rust-vendor/png/src/decoder/zlib.rs +130 -90
  36. data/rust-vendor/png/src/encoder.rs +4 -2
  37. data/rust-vendor/png/src/{filter.rs → filter/mod.rs} +100 -367
  38. data/rust-vendor/png/src/filter/optimization-notes.md +104 -0
  39. data/rust-vendor/png/src/filter/paeth.rs +398 -0
  40. data/rust-vendor/png/src/filter/simd.rs +308 -0
  41. data/rust-vendor/png/src/lib.rs +1 -0
  42. data/rust-vendor/syn/.cargo-checksum.json +1 -1
  43. data/rust-vendor/syn/.cargo_vcs_info.json +1 -1
  44. data/rust-vendor/syn/Cargo.lock +40 -41
  45. data/rust-vendor/syn/Cargo.toml +1 -1
  46. data/rust-vendor/syn/Cargo.toml.orig +1 -1
  47. data/rust-vendor/syn/src/item.rs +61 -40
  48. data/rust-vendor/syn/src/lib.rs +2 -1
  49. data/rust-vendor/syn/tests/test_item.rs +54 -0
  50. data/rust-vendor/unicode-ident/.cargo-checksum.json +1 -1
  51. data/rust-vendor/unicode-ident/.cargo_vcs_info.json +1 -1
  52. data/rust-vendor/unicode-ident/Cargo.lock +21 -21
  53. data/rust-vendor/unicode-ident/Cargo.toml +1 -1
  54. data/rust-vendor/unicode-ident/Cargo.toml.orig +1 -1
  55. data/rust-vendor/unicode-ident/src/lib.rs +1 -1
  56. data/rust-vendor/unicode-ident/src/tables.rs +87 -97
  57. data/rust-vendor/unicode-ident/tests/static_size.rs +1 -1
  58. metadata +7 -177
  59. data/rust-vendor/markup5ever_rcdom/.cargo-checksum.json +0 -1
  60. data/rust-vendor/markup5ever_rcdom/.cargo_vcs_info.json +0 -7
  61. data/rust-vendor/markup5ever_rcdom/Cargo.lock +0 -658
  62. data/rust-vendor/markup5ever_rcdom/Cargo.toml +0 -109
  63. data/rust-vendor/markup5ever_rcdom/Cargo.toml.orig +0 -42
  64. data/rust-vendor/markup5ever_rcdom/LICENSE-APACHE +0 -201
  65. data/rust-vendor/markup5ever_rcdom/LICENSE-MIT +0 -25
  66. data/rust-vendor/markup5ever_rcdom/README.md +0 -7
  67. data/rust-vendor/markup5ever_rcdom/custom-html5lib-tokenizer-tests/regression.test +0 -69
  68. data/rust-vendor/markup5ever_rcdom/data/test/ignore +0 -1
  69. data/rust-vendor/markup5ever_rcdom/examples/hello_xml.rs +0 -39
  70. data/rust-vendor/markup5ever_rcdom/examples/html2html.rs +0 -51
  71. data/rust-vendor/markup5ever_rcdom/examples/print-rcdom.rs +0 -78
  72. data/rust-vendor/markup5ever_rcdom/examples/xml_tree_printer.rs +0 -67
  73. data/rust-vendor/markup5ever_rcdom/html5lib-tests/.gitattributes +0 -2
  74. data/rust-vendor/markup5ever_rcdom/html5lib-tests/.github/workflows/downstream.yml +0 -76
  75. data/rust-vendor/markup5ever_rcdom/html5lib-tests/.github/workflows/lint.yml +0 -25
  76. data/rust-vendor/markup5ever_rcdom/html5lib-tests/.gitignore +0 -79
  77. data/rust-vendor/markup5ever_rcdom/html5lib-tests/AUTHORS.rst +0 -34
  78. data/rust-vendor/markup5ever_rcdom/html5lib-tests/LICENSE +0 -21
  79. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/chardet/test_big5.txt +0 -51
  80. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/scripted/tests1.dat +0 -5
  81. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/test-yahoo-jp.dat +0 -10
  82. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/tests1.dat +0 -388
  83. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/tests2.dat +0 -115
  84. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint +0 -6
  85. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/__init__.py +0 -0
  86. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/__init__.py +0 -0
  87. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/LICENSE +0 -18
  88. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/__init__.py +0 -0
  89. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/lexer.py +0 -211
  90. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/lexer.pyi +0 -34
  91. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/parser.py +0 -872
  92. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/parser.pyi +0 -83
  93. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/py.typed +0 -0
  94. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/util.py +0 -72
  95. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/util.pyi +0 -7
  96. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/vendor.txt +0 -1
  97. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor-patches/funcparserlib.patch +0 -24
  98. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/lint.py +0 -280
  99. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/parser.py +0 -177
  100. data/rust-vendor/markup5ever_rcdom/html5lib-tests/pyproject.toml +0 -7
  101. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/core.test +0 -125
  102. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/injectmeta.test +0 -66
  103. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/optionaltags.test +0 -965
  104. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/options.test +0 -60
  105. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/whitespace.test +0 -51
  106. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/README.md +0 -107
  107. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/contentModelFlags.test +0 -93
  108. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/domjs.test +0 -335
  109. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/entities.test +0 -542
  110. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/escapeFlag.test +0 -36
  111. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/namedEntities.test +0 -42422
  112. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/numericEntities.test +0 -1677
  113. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/pendingSpecChanges.test +0 -9
  114. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test1.test +0 -353
  115. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test2.test +0 -275
  116. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test3.test +0 -11233
  117. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test4.test +0 -532
  118. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/unicodeChars.test +0 -1577
  119. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/unicodeCharsProblematic.test +0 -41
  120. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/xmlViolation.test +0 -20
  121. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/README.md +0 -108
  122. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/adoption01.dat +0 -354
  123. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/adoption02.dat +0 -39
  124. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/blocks.dat +0 -695
  125. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/comments01.dat +0 -217
  126. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/doctype01.dat +0 -474
  127. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/domjs-unsafe.dat +0 -0
  128. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/entities01.dat +0 -943
  129. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/entities02.dat +0 -309
  130. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/foreign-fragment.dat +0 -645
  131. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/html5test-com.dat +0 -301
  132. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/inbody01.dat +0 -54
  133. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/isindex.dat +0 -49
  134. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/main-element.dat +0 -46
  135. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/math.dat +0 -104
  136. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/menuitem-element.dat +0 -240
  137. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/namespace-sensitivity.dat +0 -22
  138. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/noscript01.dat +0 -237
  139. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/pending-spec-changes-plain-text-unsafe.dat +0 -0
  140. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/pending-spec-changes.dat +0 -46
  141. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/plain-text-unsafe.dat +0 -0
  142. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/quirks01.dat +0 -53
  143. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/ruby.dat +0 -302
  144. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scriptdata01.dat +0 -372
  145. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scripted/adoption01.dat +0 -16
  146. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scripted/ark.dat +0 -27
  147. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scripted/webkit01.dat +0 -30
  148. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/search-element.dat +0 -46
  149. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/svg.dat +0 -104
  150. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tables01.dat +0 -322
  151. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/template.dat +0 -1673
  152. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests1.dat +0 -1956
  153. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests10.dat +0 -849
  154. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests11.dat +0 -523
  155. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests12.dat +0 -62
  156. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests14.dat +0 -75
  157. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests15.dat +0 -216
  158. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests16.dat +0 -2602
  159. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests17.dat +0 -179
  160. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests18.dat +0 -558
  161. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests19.dat +0 -1398
  162. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests2.dat +0 -831
  163. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests20.dat +0 -842
  164. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests21.dat +0 -306
  165. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests22.dat +0 -190
  166. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests23.dat +0 -168
  167. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests24.dat +0 -79
  168. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests25.dat +0 -288
  169. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests26.dat +0 -453
  170. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests3.dat +0 -305
  171. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests4.dat +0 -74
  172. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests5.dat +0 -210
  173. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests6.dat +0 -663
  174. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests7.dat +0 -453
  175. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests8.dat +0 -165
  176. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests9.dat +0 -472
  177. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests_innerHTML_1.dat +0 -843
  178. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tricky01.dat +0 -336
  179. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/webkit01.dat +0 -785
  180. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/webkit02.dat +0 -554
  181. data/rust-vendor/markup5ever_rcdom/tests/foreach_html5lib_test/mod.rs +0 -41
  182. data/rust-vendor/markup5ever_rcdom/tests/html-driver.rs +0 -29
  183. data/rust-vendor/markup5ever_rcdom/tests/html-serializer.rs +0 -265
  184. data/rust-vendor/markup5ever_rcdom/tests/html-tokenizer.rs +0 -487
  185. data/rust-vendor/markup5ever_rcdom/tests/html-tree-builder.rs +0 -298
  186. data/rust-vendor/markup5ever_rcdom/tests/html-tree-sink.rs +0 -141
  187. data/rust-vendor/markup5ever_rcdom/tests/util/find_tests.rs +0 -34
  188. data/rust-vendor/markup5ever_rcdom/tests/util/runner.rs +0 -48
  189. data/rust-vendor/markup5ever_rcdom/tests/xml-driver.rs +0 -101
  190. data/rust-vendor/markup5ever_rcdom/tests/xml-tokenizer.rs +0 -374
  191. data/rust-vendor/markup5ever_rcdom/tests/xml-tree-builder.rs +0 -237
  192. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/AUTHORS.rst +0 -9
  193. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/LICENSE +0 -21
  194. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/README.md +0 -92
  195. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/comments.test +0 -274
  196. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/doctype.test +0 -3232
  197. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/entities.test +0 -283
  198. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/eof.test +0 -113
  199. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/namedEntities.test +0 -42210
  200. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/numericEntities.test +0 -1349
  201. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/test1.test +0 -162
  202. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/test2.test +0 -64
  203. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/unicodeChars.test +0 -1295
  204. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tree-construction/README.md +0 -104
  205. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tree-construction/namespace.dat +0 -119
  206. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tree-construction/test1.dat +0 -124
  207. data/rust-vendor/xml5ever/.cargo-checksum.json +0 -1
  208. data/rust-vendor/xml5ever/.cargo_vcs_info.json +0 -6
  209. data/rust-vendor/xml5ever/Cargo.lock +0 -752
  210. data/rust-vendor/xml5ever/Cargo.toml +0 -69
  211. data/rust-vendor/xml5ever/Cargo.toml.orig +0 -29
  212. data/rust-vendor/xml5ever/LICENSE-APACHE +0 -201
  213. data/rust-vendor/xml5ever/LICENSE-MIT +0 -25
  214. data/rust-vendor/xml5ever/README.md +0 -72
  215. data/rust-vendor/xml5ever/benches/xml5ever.rs +0 -77
  216. data/rust-vendor/xml5ever/data/bench/strong.xml +0 -1
  217. data/rust-vendor/xml5ever/examples/README.md +0 -223
  218. data/rust-vendor/xml5ever/examples/example.xml +0 -3
  219. data/rust-vendor/xml5ever/examples/simple_xml_tokenizer.rs +0 -81
  220. data/rust-vendor/xml5ever/examples/xml_tokenizer.rs +0 -115
  221. data/rust-vendor/xml5ever/src/driver.rs +0 -90
  222. data/rust-vendor/xml5ever/src/lib.rs +0 -47
  223. data/rust-vendor/xml5ever/src/macros.rs +0 -18
  224. data/rust-vendor/xml5ever/src/serialize/mod.rs +0 -216
  225. data/rust-vendor/xml5ever/src/tokenizer/char_ref/mod.rs +0 -456
  226. data/rust-vendor/xml5ever/src/tokenizer/interface.rs +0 -116
  227. data/rust-vendor/xml5ever/src/tokenizer/mod.rs +0 -1344
  228. data/rust-vendor/xml5ever/src/tokenizer/qname.rs +0 -84
  229. data/rust-vendor/xml5ever/src/tokenizer/states.rs +0 -167
  230. data/rust-vendor/xml5ever/src/tree_builder/mod.rs +0 -774
  231. data/rust-vendor/xml5ever/src/tree_builder/types.rs +0 -37
@@ -297,7 +297,7 @@ impl<'a, W: Write> Encoder<'a, W> {
297
297
  self.info.bit_depth = depth;
298
298
  }
299
299
 
300
- /// Set compression parameters, see [Compression] for the available options.
300
+ /// Set compression parameters, see [`Compression`] for the available options.
301
301
  pub fn set_compression(&mut self, compression: Compression) {
302
302
  self.set_deflate_compression(DeflateCompression::from_simple(compression));
303
303
  self.set_filter(Filter::from_simple(compression));
@@ -305,7 +305,9 @@ impl<'a, W: Write> Encoder<'a, W> {
305
305
 
306
306
  /// Provides in-depth customization of DEFLATE compression options.
307
307
  ///
308
- /// For a simpler selection of compression options see [Self::set_compression].
308
+ /// For a simpler selection of compression options see [`set_compression`].
309
+ ///
310
+ /// [`set_compression`]: Self::set_compression
309
311
  pub fn set_deflate_compression(&mut self, compression: DeflateCompression) {
310
312
  self.options.compression = compression;
311
313
  }
@@ -2,6 +2,11 @@ use core::convert::TryInto;
2
2
 
3
3
  use crate::{common::BytesPerPixel, Compression};
4
4
 
5
+ mod paeth;
6
+
7
+ #[cfg(feature = "unstable")]
8
+ mod simd;
9
+
5
10
  /// The byte level filter applied to scanlines to prepare them for compression.
6
11
  ///
7
12
  /// Compression in general benefits from repetitive data. The filter is a content-aware method of
@@ -20,6 +25,7 @@ pub enum Filter {
20
25
  Avg,
21
26
  Paeth,
22
27
  Adaptive,
28
+ MinEntropy,
23
29
  }
24
30
 
25
31
  impl Default for Filter {
@@ -88,91 +94,11 @@ impl RowFilter {
88
94
  Filter::Up => Some(Self::Up),
89
95
  Filter::Avg => Some(Self::Avg),
90
96
  Filter::Paeth => Some(Self::Paeth),
91
- Filter::Adaptive => None,
97
+ Filter::Adaptive | Filter::MinEntropy => None,
92
98
  }
93
99
  }
94
100
  }
95
101
 
96
- fn filter_paeth(a: u8, b: u8, c: u8) -> u8 {
97
- // On ARM this algorithm performs much better than the one above adapted from stb,
98
- // and this is the better-studied algorithm we've always used here,
99
- // so we default to it on all non-x86 platforms.
100
- let pa = (i16::from(b) - i16::from(c)).abs();
101
- let pb = (i16::from(a) - i16::from(c)).abs();
102
- let pc = ((i16::from(a) - i16::from(c)) + (i16::from(b) - i16::from(c))).abs();
103
-
104
- let mut out = a;
105
- let mut min = pa;
106
-
107
- if pb < min {
108
- min = pb;
109
- out = b;
110
- }
111
- if pc < min {
112
- out = c;
113
- }
114
-
115
- out
116
- }
117
-
118
- fn filter_paeth_stbi(a: u8, b: u8, c: u8) -> u8 {
119
- // Decoding optimizes better with this algorithm than with `filter_paeth`
120
- //
121
- // This formulation looks very different from the reference in the PNG spec, but is
122
- // actually equivalent and has favorable data dependencies and admits straightforward
123
- // generation of branch-free code, which helps performance significantly.
124
- //
125
- // Adapted from public domain PNG implementation:
126
- // https://github.com/nothings/stb/blob/5c205738c191bcb0abc65c4febfa9bd25ff35234/stb_image.h#L4657-L4668
127
- let thresh = i16::from(c) * 3 - (i16::from(a) + i16::from(b));
128
- let lo = a.min(b);
129
- let hi = a.max(b);
130
- let t0 = if hi as i16 <= thresh { lo } else { c };
131
- let t1 = if thresh <= lo as i16 { hi } else { t0 };
132
- t1
133
- }
134
-
135
- fn filter_paeth_fpnge(a: u8, b: u8, c: u8) -> u8 {
136
- // This is an optimized version of the paeth filter from the PNG specification, proposed by
137
- // Luca Versari for [FPNGE](https://www.lucaversari.it/FJXL_and_FPNGE.pdf). It operates
138
- // entirely on unsigned 8-bit quantities, making it more conducive to vectorization.
139
- //
140
- // p = a + b - c
141
- // pa = |p - a| = |a + b - c - a| = |b - c| = max(b, c) - min(b, c)
142
- // pb = |p - b| = |a + b - c - b| = |a - c| = max(a, c) - min(a, c)
143
- // pc = |p - c| = |a + b - c - c| = |(b - c) + (a - c)| = ...
144
- //
145
- // Further optimizing the calculation of `pc` is a bit trickier. However, notice that:
146
- //
147
- // a > c && b > c
148
- // ==> (a - c) > 0 && (b - c) > 0
149
- // ==> pc > (a - c) && pc > (b - c)
150
- // ==> pc > |a - c| && pc > |b - c|
151
- // ==> pc > pb && pc > pa
152
- //
153
- // Meaning that if `c` is smaller than `a` and `b`, the value of `pc` is irrelevant. Similar
154
- // reasoning applies if `c` is larger than the other two inputs. Assuming that `c >= a` and
155
- // `c <= b` or vice versa:
156
- //
157
- // pc = ||b - c| - |a - c|| = |pa - pb| = max(pa, pb) - min(pa, pb)
158
- //
159
- let pa = b.max(c) - c.min(b);
160
- let pb = a.max(c) - c.min(a);
161
- let pc = if (a < c) == (c < b) {
162
- pa.max(pb) - pa.min(pb)
163
- } else {
164
- 255
165
- };
166
-
167
- if pa <= pb && pa <= pc {
168
- a
169
- } else if pb <= pc {
170
- b
171
- } else {
172
- c
173
- }
174
- }
175
-
176
102
  pub(crate) fn unfilter(
177
103
  mut filter: RowFilter,
178
104
  tbpp: BytesPerPixel,
@@ -190,110 +116,6 @@ pub(crate) fn unfilter(
190
116
  }
191
117
  }
192
118
 
193
- // Auto-vectorization notes
194
- // ========================
195
- //
196
- // [2023/01 @okaneco] - Notes on optimizing decoding filters
197
- //
198
- // Links:
199
- // [PR]: https://github.com/image-rs/image-png/pull/382
200
- // [SWAR]: http://aggregate.org/SWAR/over.html
201
- // [AVG]: http://aggregate.org/MAGIC/#Average%20of%20Integers
202
- //
203
- // #382 heavily refactored and optimized the following filters making the
204
- // implementation nonobvious. These comments function as a summary of that
205
- // PR with an explanation of the choices made below.
206
- //
207
- // #382 originally started with trying to optimize using a technique called
208
- // SWAR, SIMD Within a Register. SWAR uses regular integer types like `u32`
209
- // and `u64` as SIMD registers to perform vertical operations in parallel,
210
- // usually involving bit-twiddling. This allowed each `BytesPerPixel` (bpp)
211
- // pixel to be decoded in parallel: 3bpp and 4bpp in a `u32`, 6bpp and 8bpp
212
- // in a `u64`. The `Sub` filter looked like the following code block, `Avg`
213
- // was similar but used a bitwise average method from [AVG]:
214
- // ```
215
- // // See "Unpartitioned Operations With Correction Code" from [SWAR]
216
- // fn swar_add_u32(x: u32, y: u32) -> u32 {
217
- // // 7-bit addition so there's no carry over the most significant bit
218
- // let n = (x & 0x7f7f7f7f) + (y & 0x7f7f7f7f); // 0x7F = 0b_0111_1111
219
- // // 1-bit parity/XOR addition to fill in the missing MSB
220
- // n ^ (x ^ y) & 0x80808080 // 0x80 = 0b_1000_0000
221
- // }
222
- //
223
- // let mut prev =
224
- // u32::from_ne_bytes([current[0], current[1], current[2], current[3]]);
225
- // for chunk in current[4..].chunks_exact_mut(4) {
226
- // let cur = u32::from_ne_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
227
- // let new_chunk = swar_add_u32(cur, prev);
228
- // chunk.copy_from_slice(&new_chunk.to_ne_bytes());
229
- // prev = new_chunk;
230
- // }
231
- // ```
232
- // While this provided a measurable increase, @fintelia found that this idea
233
- // could be taken even further by unrolling the chunks component-wise and
234
- // avoiding unnecessary byte-shuffling by using byte arrays instead of
235
- // `u32::from|to_ne_bytes`. The bitwise operations were no longer necessary
236
- // so they were reverted to their obvious arithmetic equivalent. Lastly,
237
- // `TryInto` was used instead of `copy_from_slice`. The `Sub` code now
238
- // looked like this (with asserts to remove `0..bpp` bounds checks):
239
- // ```
240
- // assert!(len > 3);
241
- // let mut prev = [current[0], current[1], current[2], current[3]];
242
- // for chunk in current[4..].chunks_exact_mut(4) {
243
- // let new_chunk = [
244
- // chunk[0].wrapping_add(prev[0]),
245
- // chunk[1].wrapping_add(prev[1]),
246
- // chunk[2].wrapping_add(prev[2]),
247
- // chunk[3].wrapping_add(prev[3]),
248
- // ];
249
- // *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk;
250
- // prev = new_chunk;
251
- // }
252
- // ```
253
- // The compiler was able to optimize the code to be even faster and this
254
- // method even sped up Paeth filtering! Assertions were experimentally
255
- // added within loop bodies which produced better instructions but no
256
- // difference in speed. Finally, the code was refactored to remove manual
257
- // slicing and start the previous pixel chunks with arrays of `[0; N]`.
258
- // ```
259
- // let mut prev = [0; 4];
260
- // for chunk in current.chunks_exact_mut(4) {
261
- // let new_chunk = [
262
- // chunk[0].wrapping_add(prev[0]),
263
- // chunk[1].wrapping_add(prev[1]),
264
- // chunk[2].wrapping_add(prev[2]),
265
- // chunk[3].wrapping_add(prev[3]),
266
- // ];
267
- // *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk;
268
- // prev = new_chunk;
269
- // }
270
- // ```
271
- // While we're not manually bit-twiddling anymore, a possible takeaway from
272
- // this is to "think in SWAR" when dealing with small byte arrays. Unrolling
273
- // array operations and performing them component-wise may unlock previously
274
- // unavailable optimizations from the compiler, even when using the
275
- // `chunks_exact` methods for their potential auto-vectorization benefits.
276
- //
277
- // `std::simd` notes
278
- // =================
279
- //
280
- // In the past we have experimented with `std::simd` for unfiltering. This
281
- // experiment was removed in https://github.com/image-rs/image-png/pull/585
282
- // because:
283
- //
284
- // * The crate's microbenchmarks showed that `std::simd` didn't have a
285
- // significant advantage over auto-vectorization for most filters, except
286
- // for Paeth unfiltering - see
287
- // https://github.com/image-rs/image-png/pull/414#issuecomment-1736655668
288
- // * In the crate's microbenchmarks `std::simd` seemed to help with Paeth
289
- // unfiltering only on x86/x64, with mixed results on ARM - see
290
- // https://github.com/image-rs/image-png/pull/539#issuecomment-2512748043
291
- // * In Chromium end-to-end microbenchmarks `std::simd` either didn't help
292
- // or resulted in a small regression (as measured on x64). See
293
- // https://crrev.com/c/6090592.
294
- // * Field trial data from some "real world" scenarios shows that
295
- // performance can be quite good without relying on `std::simd` - see
296
- // https://github.com/image-rs/image-png/discussions/562#discussioncomment-13303307
297
119
  match filter {
298
120
  NoFilter => {}
299
121
  Sub => match tbpp {
@@ -532,150 +354,7 @@ pub(crate) fn unfilter(
532
354
  }
533
355
  }
534
356
  },
535
- #[allow(unreachable_code)]
536
- Paeth => {
537
- // Select the fastest Paeth filter implementation based on the target architecture.
538
- let filter_paeth_decode = if cfg!(target_arch = "x86_64") {
539
- filter_paeth_stbi
540
- } else {
541
- filter_paeth
542
- };
543
-
544
- // Paeth filter pixels:
545
- // C B D
546
- // A X
547
- match tbpp {
548
- BytesPerPixel::One => {
549
- let mut a_bpp = [0; 1];
550
- let mut c_bpp = [0; 1];
551
- for (chunk, b_bpp) in current.chunks_exact_mut(1).zip(previous.chunks_exact(1))
552
- {
553
- let new_chunk = [chunk[0]
554
- .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0]))];
555
- *TryInto::<&mut [u8; 1]>::try_into(chunk).unwrap() = new_chunk;
556
- a_bpp = new_chunk;
557
- c_bpp = b_bpp.try_into().unwrap();
558
- }
559
- }
560
- BytesPerPixel::Two => {
561
- let mut a_bpp = [0; 2];
562
- let mut c_bpp = [0; 2];
563
- for (chunk, b_bpp) in current.chunks_exact_mut(2).zip(previous.chunks_exact(2))
564
- {
565
- let new_chunk = [
566
- chunk[0]
567
- .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])),
568
- chunk[1]
569
- .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])),
570
- ];
571
- *TryInto::<&mut [u8; 2]>::try_into(chunk).unwrap() = new_chunk;
572
- a_bpp = new_chunk;
573
- c_bpp = b_bpp.try_into().unwrap();
574
- }
575
- }
576
- BytesPerPixel::Three => {
577
- let mut a_bpp = [0; 3];
578
- let mut c_bpp = [0; 3];
579
-
580
- let mut previous = &previous[..previous.len() / 3 * 3];
581
- let current_len = current.len();
582
- let mut current = &mut current[..current_len / 3 * 3];
583
-
584
- while let ([c0, c1, c2, c_rest @ ..], [p0, p1, p2, p_rest @ ..]) =
585
- (current, previous)
586
- {
587
- current = c_rest;
588
- previous = p_rest;
589
-
590
- *c0 = c0.wrapping_add(filter_paeth_decode(a_bpp[0], *p0, c_bpp[0]));
591
- *c1 = c1.wrapping_add(filter_paeth_decode(a_bpp[1], *p1, c_bpp[1]));
592
- *c2 = c2.wrapping_add(filter_paeth_decode(a_bpp[2], *p2, c_bpp[2]));
593
-
594
- a_bpp = [*c0, *c1, *c2];
595
- c_bpp = [*p0, *p1, *p2];
596
- }
597
- }
598
- BytesPerPixel::Four => {
599
- // Using the `simd` module here has no effect on Linux
600
- // and appears to regress performance on Windows, so we don't use it here.
601
- // See https://github.com/image-rs/image-png/issues/567
602
-
603
- let mut a_bpp = [0; 4];
604
- let mut c_bpp = [0; 4];
605
-
606
- let mut previous = &previous[..previous.len() & !3];
607
- let current_len = current.len();
608
- let mut current = &mut current[..current_len & !3];
609
-
610
- while let ([c0, c1, c2, c3, c_rest @ ..], [p0, p1, p2, p3, p_rest @ ..]) =
611
- (current, previous)
612
- {
613
- current = c_rest;
614
- previous = p_rest;
615
-
616
- *c0 = c0.wrapping_add(filter_paeth_decode(a_bpp[0], *p0, c_bpp[0]));
617
- *c1 = c1.wrapping_add(filter_paeth_decode(a_bpp[1], *p1, c_bpp[1]));
618
- *c2 = c2.wrapping_add(filter_paeth_decode(a_bpp[2], *p2, c_bpp[2]));
619
- *c3 = c3.wrapping_add(filter_paeth_decode(a_bpp[3], *p3, c_bpp[3]));
620
-
621
- a_bpp = [*c0, *c1, *c2, *c3];
622
- c_bpp = [*p0, *p1, *p2, *p3];
623
- }
624
- }
625
- BytesPerPixel::Six => {
626
- let mut a_bpp = [0; 6];
627
- let mut c_bpp = [0; 6];
628
- for (chunk, b_bpp) in current.chunks_exact_mut(6).zip(previous.chunks_exact(6))
629
- {
630
- let new_chunk = [
631
- chunk[0]
632
- .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])),
633
- chunk[1]
634
- .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])),
635
- chunk[2]
636
- .wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])),
637
- chunk[3]
638
- .wrapping_add(filter_paeth_decode(a_bpp[3], b_bpp[3], c_bpp[3])),
639
- chunk[4]
640
- .wrapping_add(filter_paeth_decode(a_bpp[4], b_bpp[4], c_bpp[4])),
641
- chunk[5]
642
- .wrapping_add(filter_paeth_decode(a_bpp[5], b_bpp[5], c_bpp[5])),
643
- ];
644
- *TryInto::<&mut [u8; 6]>::try_into(chunk).unwrap() = new_chunk;
645
- a_bpp = new_chunk;
646
- c_bpp = b_bpp.try_into().unwrap();
647
- }
648
- }
649
- BytesPerPixel::Eight => {
650
- let mut a_bpp = [0; 8];
651
- let mut c_bpp = [0; 8];
652
- for (chunk, b_bpp) in current.chunks_exact_mut(8).zip(previous.chunks_exact(8))
653
- {
654
- let new_chunk = [
655
- chunk[0]
656
- .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])),
657
- chunk[1]
658
- .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])),
659
- chunk[2]
660
- .wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])),
661
- chunk[3]
662
- .wrapping_add(filter_paeth_decode(a_bpp[3], b_bpp[3], c_bpp[3])),
663
- chunk[4]
664
- .wrapping_add(filter_paeth_decode(a_bpp[4], b_bpp[4], c_bpp[4])),
665
- chunk[5]
666
- .wrapping_add(filter_paeth_decode(a_bpp[5], b_bpp[5], c_bpp[5])),
667
- chunk[6]
668
- .wrapping_add(filter_paeth_decode(a_bpp[6], b_bpp[6], c_bpp[6])),
669
- chunk[7]
670
- .wrapping_add(filter_paeth_decode(a_bpp[7], b_bpp[7], c_bpp[7])),
671
- ];
672
- *TryInto::<&mut [u8; 8]>::try_into(chunk).unwrap() = new_chunk;
673
- a_bpp = new_chunk;
674
- c_bpp = b_bpp.try_into().unwrap();
675
- }
676
- }
677
- }
678
- }
357
+ Paeth => paeth::unfilter(tbpp, previous, current),
679
358
  }
680
359
  }
681
360
 
@@ -795,7 +474,7 @@ fn filter_internal(
795
474
  .zip(&mut c_chunks)
796
475
  {
797
476
  for i in 0..CHUNK_SIZE {
798
- out[i] = cur[i].wrapping_sub(filter_paeth_fpnge(a[i], b[i], c[i]));
477
+ out[i] = cur[i].wrapping_sub(paeth::filter_paeth_fpnge(a[i], b[i], c[i]));
799
478
  }
800
479
  }
801
480
 
@@ -807,17 +486,47 @@ fn filter_internal(
807
486
  .zip(b_chunks.remainder())
808
487
  .zip(c_chunks.remainder())
809
488
  {
810
- *out = cur.wrapping_sub(filter_paeth_fpnge(a, b, c));
489
+ *out = cur.wrapping_sub(paeth::filter_paeth_fpnge(a, b, c));
811
490
  }
812
491
 
813
492
  for i in 0..bpp {
814
- output[i] = current[i].wrapping_sub(filter_paeth_fpnge(0, previous[i], 0));
493
+ output[i] = current[i].wrapping_sub(paeth::filter_paeth_fpnge(0, previous[i], 0));
815
494
  }
816
495
  Paeth
817
496
  }
818
497
  }
819
498
  }
820
499
 
500
+ fn adaptive_filter(
501
+ f: impl Fn(&[u8]) -> u64,
502
+ bpp: usize,
503
+ len: usize,
504
+ previous: &[u8],
505
+ current: &[u8],
506
+ output: &mut [u8],
507
+ ) -> RowFilter {
508
+ use RowFilter::*;
509
+
510
+ let mut min_cost: u64 = u64::MAX;
511
+ let mut filter_choice = RowFilter::NoFilter;
512
+ for &filter in [Up, Sub, Avg, Paeth].iter() {
513
+ filter_internal(filter, bpp, len, previous, current, output);
514
+ let cost = f(output);
515
+ if cost <= min_cost {
516
+ min_cost = cost;
517
+ filter_choice = filter;
518
+
519
+ if cost == 0 {
520
+ return filter_choice;
521
+ }
522
+ }
523
+ }
524
+ if filter_choice != Paeth {
525
+ filter_internal(filter_choice, bpp, len, previous, current, output);
526
+ }
527
+ filter_choice
528
+ }
529
+
821
530
  pub(crate) fn filter(
822
531
  method: Filter,
823
532
  bpp: BytesPerPixel,
@@ -825,28 +534,12 @@ pub(crate) fn filter(
825
534
  current: &[u8],
826
535
  output: &mut [u8],
827
536
  ) -> RowFilter {
828
- use RowFilter::*;
829
537
  let bpp = bpp.into_usize();
830
538
  let len = current.len();
831
539
 
832
540
  match method {
833
- Filter::Adaptive => {
834
- let mut min_sum: u64 = u64::MAX;
835
- let mut filter_choice = RowFilter::NoFilter;
836
- for &filter in [Sub, Up, Avg, Paeth].iter() {
837
- filter_internal(filter, bpp, len, previous, current, output);
838
- let sum = sum_buffer(output);
839
- if sum <= min_sum {
840
- min_sum = sum;
841
- filter_choice = filter;
842
- }
843
- }
844
-
845
- if filter_choice != Paeth {
846
- filter_internal(filter_choice, bpp, len, previous, current, output);
847
- }
848
- filter_choice
849
- }
541
+ Filter::Adaptive => adaptive_filter(sum_buffer, bpp, len, previous, current, output),
542
+ Filter::MinEntropy => adaptive_filter(entropy, bpp, len, previous, current, output),
850
543
  _ => {
851
544
  let filter = RowFilter::from_method(method).unwrap();
852
545
  filter_internal(filter, bpp, len, previous, current, output)
@@ -854,6 +547,63 @@ pub(crate) fn filter(
854
547
  }
855
548
  }
856
549
 
550
+ /// Estimate the value of i * log2(i) without using floating point operations,
551
+ /// implementation originally from oxipng.
552
+ fn ilog2i(i: u32) -> u32 {
553
+ let log = 32 - i.leading_zeros() - 1;
554
+ i * log + ((i - (1 << log)) << 1)
555
+ }
556
+
557
+ fn entropy(buf: &[u8]) -> u64 {
558
+ let mut counts = [[0_u32; 256]; 4];
559
+ let mut total = 0;
560
+
561
+ // Count the number of occurrences of each byte value.
562
+ let mut chunks = buf.chunks_exact(8);
563
+ for chunk in &mut chunks {
564
+ // Runs of zeros are common and very compressible, so treat them as free.
565
+ if chunk == [0; 8] {
566
+ continue;
567
+ }
568
+
569
+ // Scatter the counts into 4 separate arrays to reduce contention.
570
+ for j in 0..2 {
571
+ counts[0][chunk[j * 4] as usize] += 1;
572
+ counts[1][chunk[1 + j * 4] as usize] += 1;
573
+ counts[2][chunk[2 + j * 4] as usize] += 1;
574
+ counts[3][chunk[3 + j * 4] as usize] += 1;
575
+ }
576
+ total += 8;
577
+ }
578
+ for &lit in chunks.remainder() {
579
+ counts[0][lit as usize] += 1;
580
+ total += 1;
581
+ }
582
+
583
+ // If the input is entirely zeros, short-circuit the entropy calculation.
584
+ if counts[0][0] == total {
585
+ return 0;
586
+ }
587
+
588
+ // Consolidate the counts.
589
+ //
590
+ // Upstream bug: <https://github.com/rust-lang/rust-clippy/issues/11529>
591
+ #[allow(clippy::needless_range_loop)]
592
+ for i in 0..256 {
593
+ counts[0][i] += counts[1][i] + counts[2][i] + counts[3][i];
594
+ }
595
+
596
+ // Compute the entropy.
597
+ let mut entropy = ilog2i(total);
598
+ for &count in &counts[0] {
599
+ if count > 0 {
600
+ entropy = entropy.saturating_sub(ilog2i(count));
601
+ }
602
+ }
603
+
604
+ entropy as u64
605
+ }
606
+
857
607
  // Helper function for Adaptive filter buffer summation
858
608
  fn sum_buffer(buf: &[u8]) -> u64 {
859
609
  const CHUNK_SIZE: usize = 32;
@@ -926,23 +676,6 @@ mod test {
926
676
  }
927
677
  }
928
678
 
929
- #[test]
930
- #[ignore] // takes ~20s without optimizations
931
- fn paeth_impls_are_equivalent() {
932
- for a in 0..=255 {
933
- for b in 0..=255 {
934
- for c in 0..=255 {
935
- let baseline = filter_paeth(a, b, c);
936
- let fpnge = filter_paeth_fpnge(a, b, c);
937
- let stbi = filter_paeth_stbi(a, b, c);
938
-
939
- assert_eq!(baseline, fpnge);
940
- assert_eq!(baseline, stbi);
941
- }
942
- }
943
- }
944
- }
945
-
946
679
  #[test]
947
680
  fn roundtrip_ascending_previous_line() {
948
681
  // A multiple of 8, 6, 4, 3, 2, 1
@@ -0,0 +1,104 @@
1
+ Auto-vectorization notes
2
+ ========================
3
+
4
+ [2023/01 @okaneco] - Notes on optimizing decoding filters
5
+
6
+ Links:
7
+ [PR]: https://github.com/image-rs/image-png/pull/382
8
+ [SWAR]: http://aggregate.org/SWAR/over.html
9
+ [AVG]: http://aggregate.org/MAGIC/#Average%20of%20Integers
10
+
11
+ #382 heavily refactored and optimized the following filters making the
12
+ implementation nonobvious. These comments function as a summary of that
13
+ PR with an explanation of the choices made below.
14
+
15
+ #382 originally started with trying to optimize using a technique called
16
+ SWAR, SIMD Within a Register. SWAR uses regular integer types like `u32`
17
+ and `u64` as SIMD registers to perform vertical operations in parallel,
18
+ usually involving bit-twiddling. This allowed each `BytesPerPixel` (bpp)
19
+ pixel to be decoded in parallel: 3bpp and 4bpp in a `u32`, 6bpp and 8bpp
20
+ in a `u64`. The `Sub` filter looked like the following code block, `Avg`
21
+ was similar but used a bitwise average method from [AVG]:
22
+ ```
23
+ // See "Unpartitioned Operations With Correction Code" from [SWAR]
24
+ fn swar_add_u32(x: u32, y: u32) -> u32 {
25
+ // 7-bit addition so there's no carry over the most significant bit
26
+ let n = (x & 0x7f7f7f7f) + (y & 0x7f7f7f7f); // 0x7F = 0b_0111_1111
27
+ // 1-bit parity/XOR addition to fill in the missing MSB
28
+ n ^ (x ^ y) & 0x80808080 // 0x80 = 0b_1000_0000
29
+ }
30
+
31
+ let mut prev =
32
+ u32::from_ne_bytes([current[0], current[1], current[2], current[3]]);
33
+ for chunk in current[4..].chunks_exact_mut(4) {
34
+ let cur = u32::from_ne_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
35
+ let new_chunk = swar_add_u32(cur, prev);
36
+ chunk.copy_from_slice(&new_chunk.to_ne_bytes());
37
+ prev = new_chunk;
38
+ }
39
+ ```
40
+ While this provided a measurable speedup, @fintelia found that this idea
41
+ could be taken even further by unrolling the chunks component-wise and
42
+ avoiding unnecessary byte-shuffling by using byte arrays instead of
43
+ `u32::from|to_ne_bytes`. The bitwise operations were no longer necessary
44
+ so they were reverted to their obvious arithmetic equivalent. Lastly,
45
+ `TryInto` was used instead of `copy_from_slice`. The `Sub` code now
46
+ looked like this (with asserts to remove `0..bpp` bounds checks):
47
+ ```
48
+ assert!(len > 3);
49
+ let mut prev = [current[0], current[1], current[2], current[3]];
50
+ for chunk in current[4..].chunks_exact_mut(4) {
51
+ let new_chunk = [
52
+ chunk[0].wrapping_add(prev[0]),
53
+ chunk[1].wrapping_add(prev[1]),
54
+ chunk[2].wrapping_add(prev[2]),
55
+ chunk[3].wrapping_add(prev[3]),
56
+ ];
57
+ *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk;
58
+ prev = new_chunk;
59
+ }
60
+ ```
61
+ The compiler was able to optimize the code to be even faster and this
62
+ method even sped up Paeth filtering! Assertions were experimentally
63
+ added within loop bodies which produced better instructions but no
64
+ difference in speed. Finally, the code was refactored to remove manual
65
+ slicing and start the previous pixel chunks with arrays of `[0; N]`.
66
+ ```
67
+ let mut prev = [0; 4];
68
+ for chunk in current.chunks_exact_mut(4) {
69
+ let new_chunk = [
70
+ chunk[0].wrapping_add(prev[0]),
71
+ chunk[1].wrapping_add(prev[1]),
72
+ chunk[2].wrapping_add(prev[2]),
73
+ chunk[3].wrapping_add(prev[3]),
74
+ ];
75
+ *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk;
76
+ prev = new_chunk;
77
+ }
78
+ ```
79
+ While we're not manually bit-twiddling anymore, a possible takeaway from
80
+ this is to "think in SWAR" when dealing with small byte arrays. Unrolling
81
+ array operations and performing them component-wise may unlock previously
82
+ unavailable optimizations from the compiler, even when using the
83
+ `chunks_exact` methods for their potential auto-vectorization benefits.
84
+
85
+ `std::simd` notes
86
+ =================
87
+
88
+ In the past we have experimented with `std::simd` for unfiltering. This
89
+ experiment was removed in https://github.com/image-rs/image-png/pull/585
90
+ because:
91
+
92
+ * The crate's microbenchmarks showed that `std::simd` didn't have a
93
+ significant advantage over auto-vectorization for most filters, except
94
+ for Paeth unfiltering - see
95
+ https://github.com/image-rs/image-png/pull/414#issuecomment-1736655668
96
+ * In the crate's microbenchmarks `std::simd` seemed to help with Paeth
97
+ unfiltering only on x86/x64, with mixed results on ARM - see
98
+ https://github.com/image-rs/image-png/pull/539#issuecomment-2512748043
99
+ * In Chromium end-to-end microbenchmarks `std::simd` either didn't help
100
+ or resulted in a small regression (as measured on x64). See
101
+ https://crrev.com/c/6090592.
102
+ * Field trial data from some "real world" scenarios shows that
103
+ performance can be quite good without relying on `std::simd` - see
104
+ https://github.com/image-rs/image-png/discussions/562#discussioncomment-13303307