html-to-markdown 2.24.6 → 2.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (203)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/ext/html-to-markdown-rb/native/Cargo.lock +3 -26
  4. data/ext/html-to-markdown-rb/native/Cargo.toml +1 -1
  5. data/lib/html_to_markdown/version.rb +1 -1
  6. data/rust-vendor/html-to-markdown-rs/Cargo.toml +0 -1
  7. data/rust-vendor/html-to-markdown-rs/src/converter/main_helpers.rs +1 -1
  8. data/rust-vendor/html-to-markdown-rs/src/lib.rs +1 -0
  9. data/rust-vendor/{markup5ever_rcdom/lib.rs → html-to-markdown-rs/src/rcdom.rs} +53 -91
  10. data/rust-vendor/png/.cargo-checksum.json +1 -1
  11. data/rust-vendor/png/.cargo_vcs_info.json +1 -1
  12. data/rust-vendor/png/CHANGES.md +44 -0
  13. data/rust-vendor/png/Cargo.lock +124 -171
  14. data/rust-vendor/png/Cargo.toml +1 -1
  15. data/rust-vendor/png/Cargo.toml.orig +1 -1
  16. data/rust-vendor/png/benches/expand_paletted.rs +5 -5
  17. data/rust-vendor/png/benches/unfilter.rs +3 -3
  18. data/rust-vendor/png/src/adam7.rs +17 -10
  19. data/rust-vendor/png/src/common.rs +8 -8
  20. data/rust-vendor/png/src/decoder/mod.rs +53 -20
  21. data/rust-vendor/png/src/decoder/stream.rs +263 -78
  22. data/rust-vendor/png/src/decoder/unfiltering_buffer.rs +210 -53
  23. data/rust-vendor/png/src/decoder/zlib.rs +130 -90
  24. data/rust-vendor/png/src/encoder.rs +4 -2
  25. data/rust-vendor/png/src/{filter.rs → filter/mod.rs} +100 -367
  26. data/rust-vendor/png/src/filter/optimization-notes.md +104 -0
  27. data/rust-vendor/png/src/filter/paeth.rs +398 -0
  28. data/rust-vendor/png/src/filter/simd.rs +308 -0
  29. data/rust-vendor/png/src/lib.rs +1 -0
  30. metadata +7 -177
  31. data/rust-vendor/markup5ever_rcdom/.cargo-checksum.json +0 -1
  32. data/rust-vendor/markup5ever_rcdom/.cargo_vcs_info.json +0 -7
  33. data/rust-vendor/markup5ever_rcdom/Cargo.lock +0 -658
  34. data/rust-vendor/markup5ever_rcdom/Cargo.toml +0 -109
  35. data/rust-vendor/markup5ever_rcdom/Cargo.toml.orig +0 -42
  36. data/rust-vendor/markup5ever_rcdom/LICENSE-APACHE +0 -201
  37. data/rust-vendor/markup5ever_rcdom/LICENSE-MIT +0 -25
  38. data/rust-vendor/markup5ever_rcdom/README.md +0 -7
  39. data/rust-vendor/markup5ever_rcdom/custom-html5lib-tokenizer-tests/regression.test +0 -69
  40. data/rust-vendor/markup5ever_rcdom/data/test/ignore +0 -1
  41. data/rust-vendor/markup5ever_rcdom/examples/hello_xml.rs +0 -39
  42. data/rust-vendor/markup5ever_rcdom/examples/html2html.rs +0 -51
  43. data/rust-vendor/markup5ever_rcdom/examples/print-rcdom.rs +0 -78
  44. data/rust-vendor/markup5ever_rcdom/examples/xml_tree_printer.rs +0 -67
  45. data/rust-vendor/markup5ever_rcdom/html5lib-tests/.gitattributes +0 -2
  46. data/rust-vendor/markup5ever_rcdom/html5lib-tests/.github/workflows/downstream.yml +0 -76
  47. data/rust-vendor/markup5ever_rcdom/html5lib-tests/.github/workflows/lint.yml +0 -25
  48. data/rust-vendor/markup5ever_rcdom/html5lib-tests/.gitignore +0 -79
  49. data/rust-vendor/markup5ever_rcdom/html5lib-tests/AUTHORS.rst +0 -34
  50. data/rust-vendor/markup5ever_rcdom/html5lib-tests/LICENSE +0 -21
  51. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/chardet/test_big5.txt +0 -51
  52. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/scripted/tests1.dat +0 -5
  53. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/test-yahoo-jp.dat +0 -10
  54. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/tests1.dat +0 -388
  55. data/rust-vendor/markup5ever_rcdom/html5lib-tests/encoding/tests2.dat +0 -115
  56. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint +0 -6
  57. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/__init__.py +0 -0
  58. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/__init__.py +0 -0
  59. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/LICENSE +0 -18
  60. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/__init__.py +0 -0
  61. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/lexer.py +0 -211
  62. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/lexer.pyi +0 -34
  63. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/parser.py +0 -872
  64. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/parser.pyi +0 -83
  65. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/py.typed +0 -0
  66. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/util.py +0 -72
  67. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/funcparserlib/util.pyi +0 -7
  68. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor/vendor.txt +0 -1
  69. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/_vendor-patches/funcparserlib.patch +0 -24
  70. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/lint.py +0 -280
  71. data/rust-vendor/markup5ever_rcdom/html5lib-tests/lint_lib/parser.py +0 -177
  72. data/rust-vendor/markup5ever_rcdom/html5lib-tests/pyproject.toml +0 -7
  73. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/core.test +0 -125
  74. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/injectmeta.test +0 -66
  75. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/optionaltags.test +0 -965
  76. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/options.test +0 -60
  77. data/rust-vendor/markup5ever_rcdom/html5lib-tests/serializer/whitespace.test +0 -51
  78. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/README.md +0 -107
  79. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/contentModelFlags.test +0 -93
  80. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/domjs.test +0 -335
  81. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/entities.test +0 -542
  82. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/escapeFlag.test +0 -36
  83. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/namedEntities.test +0 -42422
  84. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/numericEntities.test +0 -1677
  85. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/pendingSpecChanges.test +0 -9
  86. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test1.test +0 -353
  87. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test2.test +0 -275
  88. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test3.test +0 -11233
  89. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/test4.test +0 -532
  90. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/unicodeChars.test +0 -1577
  91. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/unicodeCharsProblematic.test +0 -41
  92. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tokenizer/xmlViolation.test +0 -20
  93. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/README.md +0 -108
  94. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/adoption01.dat +0 -354
  95. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/adoption02.dat +0 -39
  96. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/blocks.dat +0 -695
  97. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/comments01.dat +0 -217
  98. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/doctype01.dat +0 -474
  99. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/domjs-unsafe.dat +0 -0
  100. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/entities01.dat +0 -943
  101. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/entities02.dat +0 -309
  102. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/foreign-fragment.dat +0 -645
  103. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/html5test-com.dat +0 -301
  104. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/inbody01.dat +0 -54
  105. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/isindex.dat +0 -49
  106. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/main-element.dat +0 -46
  107. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/math.dat +0 -104
  108. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/menuitem-element.dat +0 -240
  109. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/namespace-sensitivity.dat +0 -22
  110. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/noscript01.dat +0 -237
  111. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/pending-spec-changes-plain-text-unsafe.dat +0 -0
  112. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/pending-spec-changes.dat +0 -46
  113. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/plain-text-unsafe.dat +0 -0
  114. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/quirks01.dat +0 -53
  115. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/ruby.dat +0 -302
  116. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scriptdata01.dat +0 -372
  117. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scripted/adoption01.dat +0 -16
  118. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scripted/ark.dat +0 -27
  119. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/scripted/webkit01.dat +0 -30
  120. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/search-element.dat +0 -46
  121. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/svg.dat +0 -104
  122. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tables01.dat +0 -322
  123. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/template.dat +0 -1673
  124. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests1.dat +0 -1956
  125. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests10.dat +0 -849
  126. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests11.dat +0 -523
  127. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests12.dat +0 -62
  128. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests14.dat +0 -75
  129. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests15.dat +0 -216
  130. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests16.dat +0 -2602
  131. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests17.dat +0 -179
  132. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests18.dat +0 -558
  133. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests19.dat +0 -1398
  134. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests2.dat +0 -831
  135. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests20.dat +0 -842
  136. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests21.dat +0 -306
  137. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests22.dat +0 -190
  138. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests23.dat +0 -168
  139. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests24.dat +0 -79
  140. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests25.dat +0 -288
  141. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests26.dat +0 -453
  142. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests3.dat +0 -305
  143. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests4.dat +0 -74
  144. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests5.dat +0 -210
  145. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests6.dat +0 -663
  146. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests7.dat +0 -453
  147. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests8.dat +0 -165
  148. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests9.dat +0 -472
  149. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tests_innerHTML_1.dat +0 -843
  150. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/tricky01.dat +0 -336
  151. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/webkit01.dat +0 -785
  152. data/rust-vendor/markup5ever_rcdom/html5lib-tests/tree-construction/webkit02.dat +0 -554
  153. data/rust-vendor/markup5ever_rcdom/tests/foreach_html5lib_test/mod.rs +0 -41
  154. data/rust-vendor/markup5ever_rcdom/tests/html-driver.rs +0 -29
  155. data/rust-vendor/markup5ever_rcdom/tests/html-serializer.rs +0 -265
  156. data/rust-vendor/markup5ever_rcdom/tests/html-tokenizer.rs +0 -487
  157. data/rust-vendor/markup5ever_rcdom/tests/html-tree-builder.rs +0 -298
  158. data/rust-vendor/markup5ever_rcdom/tests/html-tree-sink.rs +0 -141
  159. data/rust-vendor/markup5ever_rcdom/tests/util/find_tests.rs +0 -34
  160. data/rust-vendor/markup5ever_rcdom/tests/util/runner.rs +0 -48
  161. data/rust-vendor/markup5ever_rcdom/tests/xml-driver.rs +0 -101
  162. data/rust-vendor/markup5ever_rcdom/tests/xml-tokenizer.rs +0 -374
  163. data/rust-vendor/markup5ever_rcdom/tests/xml-tree-builder.rs +0 -237
  164. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/AUTHORS.rst +0 -9
  165. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/LICENSE +0 -21
  166. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/README.md +0 -92
  167. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/comments.test +0 -274
  168. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/doctype.test +0 -3232
  169. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/entities.test +0 -283
  170. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/eof.test +0 -113
  171. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/namedEntities.test +0 -42210
  172. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/numericEntities.test +0 -1349
  173. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/test1.test +0 -162
  174. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/test2.test +0 -64
  175. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tokenizer/unicodeChars.test +0 -1295
  176. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tree-construction/README.md +0 -104
  177. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tree-construction/namespace.dat +0 -119
  178. data/rust-vendor/markup5ever_rcdom/xml5lib-tests/tree-construction/test1.dat +0 -124
  179. data/rust-vendor/xml5ever/.cargo-checksum.json +0 -1
  180. data/rust-vendor/xml5ever/.cargo_vcs_info.json +0 -6
  181. data/rust-vendor/xml5ever/Cargo.lock +0 -752
  182. data/rust-vendor/xml5ever/Cargo.toml +0 -69
  183. data/rust-vendor/xml5ever/Cargo.toml.orig +0 -29
  184. data/rust-vendor/xml5ever/LICENSE-APACHE +0 -201
  185. data/rust-vendor/xml5ever/LICENSE-MIT +0 -25
  186. data/rust-vendor/xml5ever/README.md +0 -72
  187. data/rust-vendor/xml5ever/benches/xml5ever.rs +0 -77
  188. data/rust-vendor/xml5ever/data/bench/strong.xml +0 -1
  189. data/rust-vendor/xml5ever/examples/README.md +0 -223
  190. data/rust-vendor/xml5ever/examples/example.xml +0 -3
  191. data/rust-vendor/xml5ever/examples/simple_xml_tokenizer.rs +0 -81
  192. data/rust-vendor/xml5ever/examples/xml_tokenizer.rs +0 -115
  193. data/rust-vendor/xml5ever/src/driver.rs +0 -90
  194. data/rust-vendor/xml5ever/src/lib.rs +0 -47
  195. data/rust-vendor/xml5ever/src/macros.rs +0 -18
  196. data/rust-vendor/xml5ever/src/serialize/mod.rs +0 -216
  197. data/rust-vendor/xml5ever/src/tokenizer/char_ref/mod.rs +0 -456
  198. data/rust-vendor/xml5ever/src/tokenizer/interface.rs +0 -116
  199. data/rust-vendor/xml5ever/src/tokenizer/mod.rs +0 -1344
  200. data/rust-vendor/xml5ever/src/tokenizer/qname.rs +0 -84
  201. data/rust-vendor/xml5ever/src/tokenizer/states.rs +0 -167
  202. data/rust-vendor/xml5ever/src/tree_builder/mod.rs +0 -774
  203. data/rust-vendor/xml5ever/src/tree_builder/types.rs +0 -37
@@ -297,7 +297,7 @@ impl<'a, W: Write> Encoder<'a, W> {
297
297
  self.info.bit_depth = depth;
298
298
  }
299
299
 
300
- /// Set compression parameters, see [Compression] for the available options.
300
+ /// Set compression parameters, see [`Compression`] for the available options.
301
301
  pub fn set_compression(&mut self, compression: Compression) {
302
302
  self.set_deflate_compression(DeflateCompression::from_simple(compression));
303
303
  self.set_filter(Filter::from_simple(compression));
@@ -305,7 +305,9 @@ impl<'a, W: Write> Encoder<'a, W> {
305
305
 
306
306
  /// Provides in-depth customization of DEFLATE compression options.
307
307
  ///
308
- /// For a simpler selection of compression options see [Self::set_compression].
308
+ /// For a simpler selection of compression options see [`set_compression`].
309
+ ///
310
+ /// [`set_compression`]: Self::set_compression
309
311
  pub fn set_deflate_compression(&mut self, compression: DeflateCompression) {
310
312
  self.options.compression = compression;
311
313
  }
@@ -2,6 +2,11 @@ use core::convert::TryInto;
2
2
 
3
3
  use crate::{common::BytesPerPixel, Compression};
4
4
 
5
+ mod paeth;
6
+
7
+ #[cfg(feature = "unstable")]
8
+ mod simd;
9
+
5
10
  /// The byte level filter applied to scanlines to prepare them for compression.
6
11
  ///
7
12
  /// Compression in general benefits from repetitive data. The filter is a content-aware method of
@@ -20,6 +25,7 @@ pub enum Filter {
20
25
  Avg,
21
26
  Paeth,
22
27
  Adaptive,
28
+ MinEntropy,
23
29
  }
24
30
 
25
31
  impl Default for Filter {
@@ -88,91 +94,11 @@ impl RowFilter {
88
94
  Filter::Up => Some(Self::Up),
89
95
  Filter::Avg => Some(Self::Avg),
90
96
  Filter::Paeth => Some(Self::Paeth),
91
- Filter::Adaptive => None,
97
+ Filter::Adaptive | Filter::MinEntropy => None,
92
98
  }
93
99
  }
94
100
  }
95
101
 
96
- fn filter_paeth(a: u8, b: u8, c: u8) -> u8 {
97
- // On ARM this algorithm performs much better than the one above adapted from stb,
98
- // and this is the better-studied algorithm we've always used here,
99
- // so we default to it on all non-x86 platforms.
100
- let pa = (i16::from(b) - i16::from(c)).abs();
101
- let pb = (i16::from(a) - i16::from(c)).abs();
102
- let pc = ((i16::from(a) - i16::from(c)) + (i16::from(b) - i16::from(c))).abs();
103
-
104
- let mut out = a;
105
- let mut min = pa;
106
-
107
- if pb < min {
108
- min = pb;
109
- out = b;
110
- }
111
- if pc < min {
112
- out = c;
113
- }
114
-
115
- out
116
- }
117
-
118
- fn filter_paeth_stbi(a: u8, b: u8, c: u8) -> u8 {
119
- // Decoding optimizes better with this algorithm than with `filter_paeth`
120
- //
121
- // This formulation looks very different from the reference in the PNG spec, but is
122
- // actually equivalent and has favorable data dependencies and admits straightforward
123
- // generation of branch-free code, which helps performance significantly.
124
- //
125
- // Adapted from public domain PNG implementation:
126
- // https://github.com/nothings/stb/blob/5c205738c191bcb0abc65c4febfa9bd25ff35234/stb_image.h#L4657-L4668
127
- let thresh = i16::from(c) * 3 - (i16::from(a) + i16::from(b));
128
- let lo = a.min(b);
129
- let hi = a.max(b);
130
- let t0 = if hi as i16 <= thresh { lo } else { c };
131
- let t1 = if thresh <= lo as i16 { hi } else { t0 };
132
- t1
133
- }
134
-
135
- fn filter_paeth_fpnge(a: u8, b: u8, c: u8) -> u8 {
136
- // This is an optimized version of the paeth filter from the PNG specification, proposed by
137
- // Luca Versari for [FPNGE](https://www.lucaversari.it/FJXL_and_FPNGE.pdf). It operates
138
- // entirely on unsigned 8-bit quantities, making it more conducive to vectorization.
139
- //
140
- // p = a + b - c
141
- // pa = |p - a| = |a + b - c - a| = |b - c| = max(b, c) - min(b, c)
142
- // pb = |p - b| = |a + b - c - b| = |a - c| = max(a, c) - min(a, c)
143
- // pc = |p - c| = |a + b - c - c| = |(b - c) + (a - c)| = ...
144
- //
145
- // Further optimizing the calculation of `pc` a bit tricker. However, notice that:
146
- //
147
- // a > c && b > c
148
- // ==> (a - c) > 0 && (b - c) > 0
149
- // ==> pc > (a - c) && pc > (b - c)
150
- // ==> pc > |a - c| && pc > |b - c|
151
- // ==> pc > pb && pc > pa
152
- //
153
- // Meaning that if `c` is smaller than `a` and `b`, the value of `pc` is irrelevant. Similar
154
- // reasoning applies if `c` is larger than the other two inputs. Assuming that `c >= b` and
155
- // `c <= b` or vice versa:
156
- //
157
- // pc = ||b - c| - |a - c|| = |pa - pb| = max(pa, pb) - min(pa, pb)
158
- //
159
- let pa = b.max(c) - c.min(b);
160
- let pb = a.max(c) - c.min(a);
161
- let pc = if (a < c) == (c < b) {
162
- pa.max(pb) - pa.min(pb)
163
- } else {
164
- 255
165
- };
166
-
167
- if pa <= pb && pa <= pc {
168
- a
169
- } else if pb <= pc {
170
- b
171
- } else {
172
- c
173
- }
174
- }
175
-
176
102
  pub(crate) fn unfilter(
177
103
  mut filter: RowFilter,
178
104
  tbpp: BytesPerPixel,
@@ -190,110 +116,6 @@ pub(crate) fn unfilter(
190
116
  }
191
117
  }
192
118
 
193
- // Auto-vectorization notes
194
- // ========================
195
- //
196
- // [2023/01 @okaneco] - Notes on optimizing decoding filters
197
- //
198
- // Links:
199
- // [PR]: https://github.com/image-rs/image-png/pull/382
200
- // [SWAR]: http://aggregate.org/SWAR/over.html
201
- // [AVG]: http://aggregate.org/MAGIC/#Average%20of%20Integers
202
- //
203
- // #382 heavily refactored and optimized the following filters making the
204
- // implementation nonobvious. These comments function as a summary of that
205
- // PR with an explanation of the choices made below.
206
- //
207
- // #382 originally started with trying to optimize using a technique called
208
- // SWAR, SIMD Within a Register. SWAR uses regular integer types like `u32`
209
- // and `u64` as SIMD registers to perform vertical operations in parallel,
210
- // usually involving bit-twiddling. This allowed each `BytesPerPixel` (bpp)
211
- // pixel to be decoded in parallel: 3bpp and 4bpp in a `u32`, 6bpp and 8pp
212
- // in a `u64`. The `Sub` filter looked like the following code block, `Avg`
213
- // was similar but used a bitwise average method from [AVG]:
214
- // ```
215
- // // See "Unpartitioned Operations With Correction Code" from [SWAR]
216
- // fn swar_add_u32(x: u32, y: u32) -> u32 {
217
- // // 7-bit addition so there's no carry over the most significant bit
218
- // let n = (x & 0x7f7f7f7f) + (y & 0x7f7f7f7f); // 0x7F = 0b_0111_1111
219
- // // 1-bit parity/XOR addition to fill in the missing MSB
220
- // n ^ (x ^ y) & 0x80808080 // 0x80 = 0b_1000_0000
221
- // }
222
- //
223
- // let mut prev =
224
- // u32::from_ne_bytes([current[0], current[1], current[2], current[3]]);
225
- // for chunk in current[4..].chunks_exact_mut(4) {
226
- // let cur = u32::from_ne_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
227
- // let new_chunk = swar_add_u32(cur, prev);
228
- // chunk.copy_from_slice(&new_chunk.to_ne_bytes());
229
- // prev = new_chunk;
230
- // }
231
- // ```
232
- // While this provided a measurable increase, @fintelia found that this idea
233
- // could be taken even further by unrolling the chunks component-wise and
234
- // avoiding unnecessary byte-shuffling by using byte arrays instead of
235
- // `u32::from|to_ne_bytes`. The bitwise operations were no longer necessary
236
- // so they were reverted to their obvious arithmetic equivalent. Lastly,
237
- // `TryInto` was used instead of `copy_from_slice`. The `Sub` code now
238
- // looked like this (with asserts to remove `0..bpp` bounds checks):
239
- // ```
240
- // assert!(len > 3);
241
- // let mut prev = [current[0], current[1], current[2], current[3]];
242
- // for chunk in current[4..].chunks_exact_mut(4) {
243
- // let new_chunk = [
244
- // chunk[0].wrapping_add(prev[0]),
245
- // chunk[1].wrapping_add(prev[1]),
246
- // chunk[2].wrapping_add(prev[2]),
247
- // chunk[3].wrapping_add(prev[3]),
248
- // ];
249
- // *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk;
250
- // prev = new_chunk;
251
- // }
252
- // ```
253
- // The compiler was able to optimize the code to be even faster and this
254
- // method even sped up Paeth filtering! Assertions were experimentally
255
- // added within loop bodies which produced better instructions but no
256
- // difference in speed. Finally, the code was refactored to remove manual
257
- // slicing and start the previous pixel chunks with arrays of `[0; N]`.
258
- // ```
259
- // let mut prev = [0; 4];
260
- // for chunk in current.chunks_exact_mut(4) {
261
- // let new_chunk = [
262
- // chunk[0].wrapping_add(prev[0]),
263
- // chunk[1].wrapping_add(prev[1]),
264
- // chunk[2].wrapping_add(prev[2]),
265
- // chunk[3].wrapping_add(prev[3]),
266
- // ];
267
- // *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk;
268
- // prev = new_chunk;
269
- // }
270
- // ```
271
- // While we're not manually bit-twiddling anymore, a possible takeaway from
272
- // this is to "think in SWAR" when dealing with small byte arrays. Unrolling
273
- // array operations and performing them component-wise may unlock previously
274
- // unavailable optimizations from the compiler, even when using the
275
- // `chunks_exact` methods for their potential auto-vectorization benefits.
276
- //
277
- // `std::simd` notes
278
- // =================
279
- //
280
- // In the past we have experimented with `std::simd` for unfiltering. This
281
- // experiment was removed in https://github.com/image-rs/image-png/pull/585
282
- // because:
283
- //
284
- // * The crate's microbenchmarks showed that `std::simd` didn't have a
285
- // significant advantage over auto-vectorization for most filters, except
286
- // for Paeth unfiltering - see
287
- // https://github.com/image-rs/image-png/pull/414#issuecomment-1736655668
288
- // * In the crate's microbenchmarks `std::simd` seemed to help with Paeth
289
- // unfiltering only on x86/x64, with mixed results on ARM - see
290
- // https://github.com/image-rs/image-png/pull/539#issuecomment-2512748043
291
- // * In Chromium end-to-end microbenchmarks `std::simd` either didn't help
292
- // or resulted in a small regression (as measured on x64). See
293
- // https://crrev.com/c/6090592.
294
- // * Field trial data from some "real world" scenarios shows that
295
- // performance can be quite good without relying on `std::simd` - see
296
- // https://github.com/image-rs/image-png/discussions/562#discussioncomment-13303307
297
119
  match filter {
298
120
  NoFilter => {}
299
121
  Sub => match tbpp {
@@ -532,150 +354,7 @@ pub(crate) fn unfilter(
532
354
  }
533
355
  }
534
356
  },
535
- #[allow(unreachable_code)]
536
- Paeth => {
537
- // Select the fastest Paeth filter implementation based on the target architecture.
538
- let filter_paeth_decode = if cfg!(target_arch = "x86_64") {
539
- filter_paeth_stbi
540
- } else {
541
- filter_paeth
542
- };
543
-
544
- // Paeth filter pixels:
545
- // C B D
546
- // A X
547
- match tbpp {
548
- BytesPerPixel::One => {
549
- let mut a_bpp = [0; 1];
550
- let mut c_bpp = [0; 1];
551
- for (chunk, b_bpp) in current.chunks_exact_mut(1).zip(previous.chunks_exact(1))
552
- {
553
- let new_chunk = [chunk[0]
554
- .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0]))];
555
- *TryInto::<&mut [u8; 1]>::try_into(chunk).unwrap() = new_chunk;
556
- a_bpp = new_chunk;
557
- c_bpp = b_bpp.try_into().unwrap();
558
- }
559
- }
560
- BytesPerPixel::Two => {
561
- let mut a_bpp = [0; 2];
562
- let mut c_bpp = [0; 2];
563
- for (chunk, b_bpp) in current.chunks_exact_mut(2).zip(previous.chunks_exact(2))
564
- {
565
- let new_chunk = [
566
- chunk[0]
567
- .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])),
568
- chunk[1]
569
- .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])),
570
- ];
571
- *TryInto::<&mut [u8; 2]>::try_into(chunk).unwrap() = new_chunk;
572
- a_bpp = new_chunk;
573
- c_bpp = b_bpp.try_into().unwrap();
574
- }
575
- }
576
- BytesPerPixel::Three => {
577
- let mut a_bpp = [0; 3];
578
- let mut c_bpp = [0; 3];
579
-
580
- let mut previous = &previous[..previous.len() / 3 * 3];
581
- let current_len = current.len();
582
- let mut current = &mut current[..current_len / 3 * 3];
583
-
584
- while let ([c0, c1, c2, c_rest @ ..], [p0, p1, p2, p_rest @ ..]) =
585
- (current, previous)
586
- {
587
- current = c_rest;
588
- previous = p_rest;
589
-
590
- *c0 = c0.wrapping_add(filter_paeth_decode(a_bpp[0], *p0, c_bpp[0]));
591
- *c1 = c1.wrapping_add(filter_paeth_decode(a_bpp[1], *p1, c_bpp[1]));
592
- *c2 = c2.wrapping_add(filter_paeth_decode(a_bpp[2], *p2, c_bpp[2]));
593
-
594
- a_bpp = [*c0, *c1, *c2];
595
- c_bpp = [*p0, *p1, *p2];
596
- }
597
- }
598
- BytesPerPixel::Four => {
599
- // Using the `simd` module here has no effect on Linux
600
- // and appears to regress performance on Windows, so we don't use it here.
601
- // See https://github.com/image-rs/image-png/issues/567
602
-
603
- let mut a_bpp = [0; 4];
604
- let mut c_bpp = [0; 4];
605
-
606
- let mut previous = &previous[..previous.len() & !3];
607
- let current_len = current.len();
608
- let mut current = &mut current[..current_len & !3];
609
-
610
- while let ([c0, c1, c2, c3, c_rest @ ..], [p0, p1, p2, p3, p_rest @ ..]) =
611
- (current, previous)
612
- {
613
- current = c_rest;
614
- previous = p_rest;
615
-
616
- *c0 = c0.wrapping_add(filter_paeth_decode(a_bpp[0], *p0, c_bpp[0]));
617
- *c1 = c1.wrapping_add(filter_paeth_decode(a_bpp[1], *p1, c_bpp[1]));
618
- *c2 = c2.wrapping_add(filter_paeth_decode(a_bpp[2], *p2, c_bpp[2]));
619
- *c3 = c3.wrapping_add(filter_paeth_decode(a_bpp[3], *p3, c_bpp[3]));
620
-
621
- a_bpp = [*c0, *c1, *c2, *c3];
622
- c_bpp = [*p0, *p1, *p2, *p3];
623
- }
624
- }
625
- BytesPerPixel::Six => {
626
- let mut a_bpp = [0; 6];
627
- let mut c_bpp = [0; 6];
628
- for (chunk, b_bpp) in current.chunks_exact_mut(6).zip(previous.chunks_exact(6))
629
- {
630
- let new_chunk = [
631
- chunk[0]
632
- .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])),
633
- chunk[1]
634
- .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])),
635
- chunk[2]
636
- .wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])),
637
- chunk[3]
638
- .wrapping_add(filter_paeth_decode(a_bpp[3], b_bpp[3], c_bpp[3])),
639
- chunk[4]
640
- .wrapping_add(filter_paeth_decode(a_bpp[4], b_bpp[4], c_bpp[4])),
641
- chunk[5]
642
- .wrapping_add(filter_paeth_decode(a_bpp[5], b_bpp[5], c_bpp[5])),
643
- ];
644
- *TryInto::<&mut [u8; 6]>::try_into(chunk).unwrap() = new_chunk;
645
- a_bpp = new_chunk;
646
- c_bpp = b_bpp.try_into().unwrap();
647
- }
648
- }
649
- BytesPerPixel::Eight => {
650
- let mut a_bpp = [0; 8];
651
- let mut c_bpp = [0; 8];
652
- for (chunk, b_bpp) in current.chunks_exact_mut(8).zip(previous.chunks_exact(8))
653
- {
654
- let new_chunk = [
655
- chunk[0]
656
- .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])),
657
- chunk[1]
658
- .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])),
659
- chunk[2]
660
- .wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])),
661
- chunk[3]
662
- .wrapping_add(filter_paeth_decode(a_bpp[3], b_bpp[3], c_bpp[3])),
663
- chunk[4]
664
- .wrapping_add(filter_paeth_decode(a_bpp[4], b_bpp[4], c_bpp[4])),
665
- chunk[5]
666
- .wrapping_add(filter_paeth_decode(a_bpp[5], b_bpp[5], c_bpp[5])),
667
- chunk[6]
668
- .wrapping_add(filter_paeth_decode(a_bpp[6], b_bpp[6], c_bpp[6])),
669
- chunk[7]
670
- .wrapping_add(filter_paeth_decode(a_bpp[7], b_bpp[7], c_bpp[7])),
671
- ];
672
- *TryInto::<&mut [u8; 8]>::try_into(chunk).unwrap() = new_chunk;
673
- a_bpp = new_chunk;
674
- c_bpp = b_bpp.try_into().unwrap();
675
- }
676
- }
677
- }
678
- }
357
+ Paeth => paeth::unfilter(tbpp, previous, current),
679
358
  }
680
359
  }
681
360
 
@@ -795,7 +474,7 @@ fn filter_internal(
795
474
  .zip(&mut c_chunks)
796
475
  {
797
476
  for i in 0..CHUNK_SIZE {
798
- out[i] = cur[i].wrapping_sub(filter_paeth_fpnge(a[i], b[i], c[i]));
477
+ out[i] = cur[i].wrapping_sub(paeth::filter_paeth_fpnge(a[i], b[i], c[i]));
799
478
  }
800
479
  }
801
480
 
@@ -807,17 +486,47 @@ fn filter_internal(
807
486
  .zip(b_chunks.remainder())
808
487
  .zip(c_chunks.remainder())
809
488
  {
810
- *out = cur.wrapping_sub(filter_paeth_fpnge(a, b, c));
489
+ *out = cur.wrapping_sub(paeth::filter_paeth_fpnge(a, b, c));
811
490
  }
812
491
 
813
492
  for i in 0..bpp {
814
- output[i] = current[i].wrapping_sub(filter_paeth_fpnge(0, previous[i], 0));
493
+ output[i] = current[i].wrapping_sub(paeth::filter_paeth_fpnge(0, previous[i], 0));
815
494
  }
816
495
  Paeth
817
496
  }
818
497
  }
819
498
  }
820
499
 
500
+ fn adaptive_filter(
501
+ f: impl Fn(&[u8]) -> u64,
502
+ bpp: usize,
503
+ len: usize,
504
+ previous: &[u8],
505
+ current: &[u8],
506
+ output: &mut [u8],
507
+ ) -> RowFilter {
508
+ use RowFilter::*;
509
+
510
+ let mut min_cost: u64 = u64::MAX;
511
+ let mut filter_choice = RowFilter::NoFilter;
512
+ for &filter in [Up, Sub, Avg, Paeth].iter() {
513
+ filter_internal(filter, bpp, len, previous, current, output);
514
+ let cost = f(output);
515
+ if cost <= min_cost {
516
+ min_cost = cost;
517
+ filter_choice = filter;
518
+
519
+ if cost == 0 {
520
+ return filter_choice;
521
+ }
522
+ }
523
+ }
524
+ if filter_choice != Paeth {
525
+ filter_internal(filter_choice, bpp, len, previous, current, output);
526
+ }
527
+ filter_choice
528
+ }
529
+
821
530
  pub(crate) fn filter(
822
531
  method: Filter,
823
532
  bpp: BytesPerPixel,
@@ -825,28 +534,12 @@ pub(crate) fn filter(
825
534
  current: &[u8],
826
535
  output: &mut [u8],
827
536
  ) -> RowFilter {
828
- use RowFilter::*;
829
537
  let bpp = bpp.into_usize();
830
538
  let len = current.len();
831
539
 
832
540
  match method {
833
- Filter::Adaptive => {
834
- let mut min_sum: u64 = u64::MAX;
835
- let mut filter_choice = RowFilter::NoFilter;
836
- for &filter in [Sub, Up, Avg, Paeth].iter() {
837
- filter_internal(filter, bpp, len, previous, current, output);
838
- let sum = sum_buffer(output);
839
- if sum <= min_sum {
840
- min_sum = sum;
841
- filter_choice = filter;
842
- }
843
- }
844
-
845
- if filter_choice != Paeth {
846
- filter_internal(filter_choice, bpp, len, previous, current, output);
847
- }
848
- filter_choice
849
- }
541
+ Filter::Adaptive => adaptive_filter(sum_buffer, bpp, len, previous, current, output),
542
+ Filter::MinEntropy => adaptive_filter(entropy, bpp, len, previous, current, output),
850
543
  _ => {
851
544
  let filter = RowFilter::from_method(method).unwrap();
852
545
  filter_internal(filter, bpp, len, previous, current, output)
@@ -854,6 +547,63 @@ pub(crate) fn filter(
854
547
  }
855
548
  }
856
549
 
550
/// Estimate the value of `i * log2(i)` without using floating point operations,
/// implementation originally from oxipng.
///
/// The estimate is `i * floor(log2(i))` plus twice the distance from `i` to
/// the power of two at or below it, i.e. a linear interpolation that is exact
/// whenever `i` is a power of two.
///
/// `ilog2i(0)` returns 0 instead of underflowing, and results that do not fit
/// in a `u32` saturate at `u32::MAX` instead of overflowing.
fn ilog2i(i: u32) -> u32 {
    if i == 0 {
        return 0;
    }
    // Index of the highest set bit, i.e. floor(log2(i)); `i != 0` keeps this
    // subtraction from underflowing.
    let log = 31 - i.leading_zeros();
    // `i - 2^log` is strictly below `2^log <= 2^31`, so doubling it cannot
    // overflow; only the product and the final sum need to saturate.
    let above_pow2 = (i - (1 << log)) << 1;
    i.saturating_mul(log).saturating_add(above_pow2)
}

/// Integer estimate of the entropy of `buf`, used as a cost where lower means
/// more compressible.
///
/// Aligned groups of eight zero bytes are skipped entirely and therefore do
/// not contribute to the cost. A buffer that is empty or consists only of
/// zeros has a cost of 0.
fn entropy(buf: &[u8]) -> u64 {
    let mut counts = [[0_u32; 256]; 4];
    let mut total: u32 = 0;

    // Count the number of occurrences of each byte value.
    let mut chunks = buf.chunks_exact(8);
    for chunk in &mut chunks {
        // Runs of zeros are common and very compressible, so treat them as free.
        if chunk == [0; 8] {
            continue;
        }

        // Scatter the counts into 4 separate arrays to reduce contention.
        for quad in chunk.chunks_exact(4) {
            counts[0][quad[0] as usize] += 1;
            counts[1][quad[1] as usize] += 1;
            counts[2][quad[2] as usize] += 1;
            counts[3][quad[3] as usize] += 1;
        }
        total += 8;
    }
    for &lit in chunks.remainder() {
        counts[0][lit as usize] += 1;
        total += 1;
    }

    // If the input is entirely zeros, short-circuit the entropy calculation.
    // Only trailing zero bytes reach `counts[0][0]` without a non-zero chunk
    // also raising `total`, so equality means nothing but zeros was seen.
    if counts[0][0] == total {
        return 0;
    }

    // Consolidate the counts.
    //
    // Upstream bug: <https://github.com/rust-lang/rust-clippy/issues/11529>
    #[allow(clippy::needless_range_loop)]
    for i in 0..256 {
        counts[0][i] += counts[1][i] + counts[2][i] + counts[3][i];
    }

    // Compute the entropy: total * log2(total) - sum(count * log2(count)).
    let mut entropy = ilog2i(total);
    for &count in &counts[0] {
        if count > 0 {
            entropy = entropy.saturating_sub(ilog2i(count));
        }
    }

    u64::from(entropy)
}
606
+
857
607
  // Helper function for Adaptive filter buffer summation
858
608
  fn sum_buffer(buf: &[u8]) -> u64 {
859
609
  const CHUNK_SIZE: usize = 32;
@@ -926,23 +676,6 @@ mod test {
926
676
  }
927
677
  }
928
678
 
929
- #[test]
930
- #[ignore] // takes ~20s without optimizations
931
- fn paeth_impls_are_equivalent() {
932
- for a in 0..=255 {
933
- for b in 0..=255 {
934
- for c in 0..=255 {
935
- let baseline = filter_paeth(a, b, c);
936
- let fpnge = filter_paeth_fpnge(a, b, c);
937
- let stbi = filter_paeth_stbi(a, b, c);
938
-
939
- assert_eq!(baseline, fpnge);
940
- assert_eq!(baseline, stbi);
941
- }
942
- }
943
- }
944
- }
945
-
946
679
  #[test]
947
680
  fn roundtrip_ascending_previous_line() {
948
681
  // A multiple of 8, 6, 4, 3, 2, 1
@@ -0,0 +1,104 @@
1
+ Auto-vectorization notes
2
+ ========================
3
+
4
+ [2023/01 @okaneco] - Notes on optimizing decoding filters
5
+
6
+ Links:
7
+ [PR]: https://github.com/image-rs/image-png/pull/382
8
+ [SWAR]: http://aggregate.org/SWAR/over.html
9
+ [AVG]: http://aggregate.org/MAGIC/#Average%20of%20Integers
10
+
11
+ #382 heavily refactored and optimized the following filters making the
12
+ implementation nonobvious. These comments function as a summary of that
13
+ PR with an explanation of the choices made below.
14
+
15
+ #382 originally started with trying to optimize using a technique called
16
+ SWAR, SIMD Within a Register. SWAR uses regular integer types like `u32`
17
+ and `u64` as SIMD registers to perform vertical operations in parallel,
18
+ usually involving bit-twiddling. This allowed each `BytesPerPixel` (bpp)
19
+ pixel to be decoded in parallel: 3bpp and 4bpp in a `u32`, 6bpp and 8bpp
20
+ in a `u64`. The `Sub` filter looked like the following code block, `Avg`
21
+ was similar but used a bitwise average method from [AVG]:
22
+ ```
23
+ // See "Unpartitioned Operations With Correction Code" from [SWAR]
24
+ fn swar_add_u32(x: u32, y: u32) -> u32 {
25
+ // 7-bit addition so there's no carry over the most significant bit
26
+ let n = (x & 0x7f7f7f7f) + (y & 0x7f7f7f7f); // 0x7F = 0b_0111_1111
27
+ // 1-bit parity/XOR addition to fill in the missing MSB
28
+ n ^ (x ^ y) & 0x80808080 // 0x80 = 0b_1000_0000
29
+ }
30
+
31
+ let mut prev =
32
+ u32::from_ne_bytes([current[0], current[1], current[2], current[3]]);
33
+ for chunk in current[4..].chunks_exact_mut(4) {
34
+ let cur = u32::from_ne_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
35
+ let new_chunk = swar_add_u32(cur, prev);
36
+ chunk.copy_from_slice(&new_chunk.to_ne_bytes());
37
+ prev = new_chunk;
38
+ }
39
+ ```
40
+ While this provided a measurable increase, @fintelia found that this idea
41
+ could be taken even further by unrolling the chunks component-wise and
42
+ avoiding unnecessary byte-shuffling by using byte arrays instead of
43
+ `u32::from|to_ne_bytes`. The bitwise operations were no longer necessary
44
+ so they were reverted to their obvious arithmetic equivalent. Lastly,
45
+ `TryInto` was used instead of `copy_from_slice`. The `Sub` code now
46
+ looked like this (with asserts to remove `0..bpp` bounds checks):
47
+ ```
48
+ assert!(len > 3);
49
+ let mut prev = [current[0], current[1], current[2], current[3]];
50
+ for chunk in current[4..].chunks_exact_mut(4) {
51
+ let new_chunk = [
52
+ chunk[0].wrapping_add(prev[0]),
53
+ chunk[1].wrapping_add(prev[1]),
54
+ chunk[2].wrapping_add(prev[2]),
55
+ chunk[3].wrapping_add(prev[3]),
56
+ ];
57
+ *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk;
58
+ prev = new_chunk;
59
+ }
60
+ ```
61
+ The compiler was able to optimize the code to be even faster and this
62
+ method even sped up Paeth filtering! Assertions were experimentally
63
+ added within loop bodies which produced better instructions but no
64
+ difference in speed. Finally, the code was refactored to remove manual
65
+ slicing and start the previous pixel chunks with arrays of `[0; N]`.
66
+ ```
67
+ let mut prev = [0; 4];
68
+ for chunk in current.chunks_exact_mut(4) {
69
+ let new_chunk = [
70
+ chunk[0].wrapping_add(prev[0]),
71
+ chunk[1].wrapping_add(prev[1]),
72
+ chunk[2].wrapping_add(prev[2]),
73
+ chunk[3].wrapping_add(prev[3]),
74
+ ];
75
+ *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk;
76
+ prev = new_chunk;
77
+ }
78
+ ```
79
+ While we're not manually bit-twiddling anymore, a possible takeaway from
80
+ this is to "think in SWAR" when dealing with small byte arrays. Unrolling
81
+ array operations and performing them component-wise may unlock previously
82
+ unavailable optimizations from the compiler, even when using the
83
+ `chunks_exact` methods for their potential auto-vectorization benefits.
84
+
85
+ `std::simd` notes
86
+ =================
87
+
88
+ In the past we have experimented with `std::simd` for unfiltering. This
89
+ experiment was removed in https://github.com/image-rs/image-png/pull/585
90
+ because:
91
+
92
+ * The crate's microbenchmarks showed that `std::simd` didn't have a
93
+ significant advantage over auto-vectorization for most filters, except
94
+ for Paeth unfiltering - see
95
+ https://github.com/image-rs/image-png/pull/414#issuecomment-1736655668
96
+ * In the crate's microbenchmarks `std::simd` seemed to help with Paeth
97
+ unfiltering only on x86/x64, with mixed results on ARM - see
98
+ https://github.com/image-rs/image-png/pull/539#issuecomment-2512748043
99
+ * In Chromium end-to-end microbenchmarks `std::simd` either didn't help
100
+ or resulted in a small regression (as measured on x64). See
101
+ https://crrev.com/c/6090592.
102
+ * Field trial data from some "real world" scenarios shows that
103
+ performance can be quite good without relying on `std::simd` - see
104
+ https://github.com/image-rs/image-png/discussions/562#discussioncomment-13303307