RubyGems - html-to-markdown - Versions diffs - 2.24.6 → 2.25.0 - Mend

html-to-markdown 2.24.6 → 2.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (203) hide show

data/rust-vendor/png/src/encoder.rs CHANGED Viewed

@@ -297,7 +297,7 @@ impl<'a, W: Write> Encoder<'a, W> {
         self.info.bit_depth = depth;
     }
-    /// Set compression parameters, see [Compression] for the available options.
+    /// Set compression parameters, see [`Compression`] for the available options.
     pub fn set_compression(&mut self, compression: Compression) {
         self.set_deflate_compression(DeflateCompression::from_simple(compression));
         self.set_filter(Filter::from_simple(compression));
@@ -305,7 +305,9 @@ impl<'a, W: Write> Encoder<'a, W> {
     /// Provides in-depth customization of DEFLATE compression options.
     ///
-    /// For a simpler selection of compression options see [Self::set_compression].
+    /// For a simpler selection of compression options see [`set_compression`].
+    ///
+    /// [`set_compression`]: Self::set_compression
     pub fn set_deflate_compression(&mut self, compression: DeflateCompression) {
         self.options.compression = compression;
     }

data/rust-vendor/png/src/{filter.rs → filter/mod.rs} RENAMED Viewed

@@ -2,6 +2,11 @@ use core::convert::TryInto;
 use crate::{common::BytesPerPixel, Compression};
+mod paeth;
+#[cfg(feature = "unstable")]
+mod simd;
 /// The byte level filter applied to scanlines to prepare them for compression.
 ///
 /// Compression in general benefits from repetitive data. The filter is a content-aware method of
@@ -20,6 +25,7 @@ pub enum Filter {
     Avg,
     Paeth,
     Adaptive,
+    MinEntropy,
 }
 impl Default for Filter {
@@ -88,91 +94,11 @@ impl RowFilter {
             Filter::Up => Some(Self::Up),
             Filter::Avg => Some(Self::Avg),
             Filter::Paeth => Some(Self::Paeth),
-            Filter::Adaptive => None,
+            Filter::Adaptive | Filter::MinEntropy => None,
         }
     }
 }
-fn filter_paeth(a: u8, b: u8, c: u8) -> u8 {
-    // On ARM this algorithm performs much better than the one above adapted from stb,
-    // and this is the better-studied algorithm we've always used here,
-    // so we default to it on all non-x86 platforms.
-    let pa = (i16::from(b) - i16::from(c)).abs();
-    let pb = (i16::from(a) - i16::from(c)).abs();
-    let pc = ((i16::from(a) - i16::from(c)) + (i16::from(b) - i16::from(c))).abs();
-    let mut out = a;
-    let mut min = pa;
-    if pb < min {
-        min = pb;
-        out = b;
-    }
-    if pc < min {
-        out = c;
-    }
-    out
-}
-fn filter_paeth_stbi(a: u8, b: u8, c: u8) -> u8 {
-    // Decoding optimizes better with this algorithm than with `filter_paeth`
-    //
-    // This formulation looks very different from the reference in the PNG spec, but is
-    // actually equivalent and has favorable data dependencies and admits straightforward
-    // generation of branch-free code, which helps performance significantly.
-    //
-    // Adapted from public domain PNG implementation:
-    // https://github.com/nothings/stb/blob/5c205738c191bcb0abc65c4febfa9bd25ff35234/stb_image.h#L4657-L4668
-    let thresh = i16::from(c) * 3 - (i16::from(a) + i16::from(b));
-    let lo = a.min(b);
-    let hi = a.max(b);
-    let t0 = if hi as i16 <= thresh { lo } else { c };
-    let t1 = if thresh <= lo as i16 { hi } else { t0 };
-    t1
-}
-fn filter_paeth_fpnge(a: u8, b: u8, c: u8) -> u8 {
-    // This is an optimized version of the paeth filter from the PNG specification, proposed by
-    // Luca Versari for [FPNGE](https://www.lucaversari.it/FJXL_and_FPNGE.pdf). It operates
-    // entirely on unsigned 8-bit quantities, making it more conducive to vectorization.
-    //
-    //     p = a + b - c
-    //     pa = |p - a| = |a + b - c - a| = |b - c| = max(b, c) - min(b, c)
-    //     pb = |p - b| = |a + b - c - b| = |a - c| = max(a, c) - min(a, c)
-    //     pc = |p - c| = |a + b - c - c| = |(b - c) + (a - c)| = ...
-    //
-    // Further optimizing the calculation of `pc` is a bit trickier. However, notice that:
-    //
-    //        a > c && b > c
-    //    ==> (a - c) > 0 && (b - c) > 0
-    //    ==> pc > (a - c) && pc > (b - c)
-    //    ==> pc > |a - c| && pc > |b - c|
-    //    ==> pc > pb && pc > pa
-    //
-    // Meaning that if `c` is smaller than `a` and `b`, the value of `pc` is irrelevant. Similar
-    // reasoning applies if `c` is larger than the other two inputs. Assuming that `c >= a` and
-    // `c <= b` or vice versa:
-    //
-    //     pc = ||b - c| - |a - c|| =  |pa - pb| = max(pa, pb) - min(pa, pb)
-    //
-    let pa = b.max(c) - c.min(b);
-    let pb = a.max(c) - c.min(a);
-    let pc = if (a < c) == (c < b) {
-        pa.max(pb) - pa.min(pb)
-    } else {
-        255
-    };
-    if pa <= pb && pa <= pc {
-        a
-    } else if pb <= pc {
-        b
-    } else {
-        c
-    }
-}
 pub(crate) fn unfilter(
     mut filter: RowFilter,
     tbpp: BytesPerPixel,
@@ -190,110 +116,6 @@ pub(crate) fn unfilter(
         }
     }
-    // Auto-vectorization notes
-    // ========================
-    //
-    // [2023/01 @okaneco] - Notes on optimizing decoding filters
-    //
-    // Links:
-    // [PR]: https://github.com/image-rs/image-png/pull/382
-    // [SWAR]: http://aggregate.org/SWAR/over.html
-    // [AVG]: http://aggregate.org/MAGIC/#Average%20of%20Integers
-    //
-    // #382 heavily refactored and optimized the following filters making the
-    // implementation nonobvious. These comments function as a summary of that
-    // PR with an explanation of the choices made below.
-    //
-    // #382 originally started with trying to optimize using a technique called
-    // SWAR, SIMD Within a Register. SWAR uses regular integer types like `u32`
-    // and `u64` as SIMD registers to perform vertical operations in parallel,
-    // usually involving bit-twiddling. This allowed each `BytesPerPixel` (bpp)
-    // pixel to be decoded in parallel: 3bpp and 4bpp in a `u32`, 6bpp and 8bpp
-    // in a `u64`. The `Sub` filter looked like the following code block, `Avg`
-    // was similar but used a bitwise average method from [AVG]:
-    // ```
-    // // See "Unpartitioned Operations With Correction Code" from [SWAR]
-    // fn swar_add_u32(x: u32, y: u32) -> u32 {
-    //     // 7-bit addition so there's no carry over the most significant bit
-    //     let n = (x & 0x7f7f7f7f) + (y & 0x7f7f7f7f); // 0x7F = 0b_0111_1111
-    //     // 1-bit parity/XOR addition to fill in the missing MSB
-    //     n ^ (x ^ y) & 0x80808080                     // 0x80 = 0b_1000_0000
-    // }
-    //
-    // let mut prev =
-    //     u32::from_ne_bytes([current[0], current[1], current[2], current[3]]);
-    // for chunk in current[4..].chunks_exact_mut(4) {
-    //     let cur = u32::from_ne_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
-    //     let new_chunk = swar_add_u32(cur, prev);
-    //     chunk.copy_from_slice(&new_chunk.to_ne_bytes());
-    //     prev = new_chunk;
-    // }
-    // ```
-    // While this provided a measurable increase, @fintelia found that this idea
-    // could be taken even further by unrolling the chunks component-wise and
-    // avoiding unnecessary byte-shuffling by using byte arrays instead of
-    // `u32::from|to_ne_bytes`. The bitwise operations were no longer necessary
-    // so they were reverted to their obvious arithmetic equivalent. Lastly,
-    // `TryInto` was used instead of `copy_from_slice`. The `Sub` code now
-    // looked like this (with asserts to remove `0..bpp` bounds checks):
-    // ```
-    // assert!(len > 3);
-    // let mut prev = [current[0], current[1], current[2], current[3]];
-    // for chunk in current[4..].chunks_exact_mut(4) {
-    //     let new_chunk = [
-    //         chunk[0].wrapping_add(prev[0]),
-    //         chunk[1].wrapping_add(prev[1]),
-    //         chunk[2].wrapping_add(prev[2]),
-    //         chunk[3].wrapping_add(prev[3]),
-    //     ];
-    //     *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk;
-    //     prev = new_chunk;
-    // }
-    // ```
-    // The compiler was able to optimize the code to be even faster and this
-    // method even sped up Paeth filtering! Assertions were experimentally
-    // added within loop bodies which produced better instructions but no
-    // difference in speed. Finally, the code was refactored to remove manual
-    // slicing and start the previous pixel chunks with arrays of `[0; N]`.
-    // ```
-    // let mut prev = [0; 4];
-    // for chunk in current.chunks_exact_mut(4) {
-    //     let new_chunk = [
-    //         chunk[0].wrapping_add(prev[0]),
-    //         chunk[1].wrapping_add(prev[1]),
-    //         chunk[2].wrapping_add(prev[2]),
-    //         chunk[3].wrapping_add(prev[3]),
-    //     ];
-    //     *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk;
-    //     prev = new_chunk;
-    // }
-    // ```
-    // While we're not manually bit-twiddling anymore, a possible takeaway from
-    // this is to "think in SWAR" when dealing with small byte arrays. Unrolling
-    // array operations and performing them component-wise may unlock previously
-    // unavailable optimizations from the compiler, even when using the
-    // `chunks_exact` methods for their potential auto-vectorization benefits.
-    //
-    // `std::simd` notes
-    // =================
-    //
-    // In the past we have experimented with `std::simd` for unfiltering.  This
-    // experiment was removed in https://github.com/image-rs/image-png/pull/585
-    // because:
-    //
-    // * The crate's microbenchmarks showed that `std::simd` didn't have a
-    //   significant advantage over auto-vectorization for most filters, except
-    //   for Paeth unfiltering - see
-    //   https://github.com/image-rs/image-png/pull/414#issuecomment-1736655668
-    // * In the crate's microbenchmarks `std::simd` seemed to help with Paeth
-    //   unfiltering only on x86/x64, with mixed results on ARM - see
-    //   https://github.com/image-rs/image-png/pull/539#issuecomment-2512748043
-    // * In Chromium end-to-end microbenchmarks `std::simd` either didn't help
-    //   or resulted in a small regression (as measured on x64).  See
-    //   https://crrev.com/c/6090592.
-    // * Field trial data from some "real world" scenarios shows that
-    //   performance can be quite good without relying on `std::simd` - see
-    //   https://github.com/image-rs/image-png/discussions/562#discussioncomment-13303307
     match filter {
         NoFilter => {}
         Sub => match tbpp {
@@ -532,150 +354,7 @@ pub(crate) fn unfilter(
                 }
             }
         },
-        #[allow(unreachable_code)]
-        Paeth => {
-            // Select the fastest Paeth filter implementation based on the target architecture.
-            let filter_paeth_decode = if cfg!(target_arch = "x86_64") {
-                filter_paeth_stbi
-            } else {
-                filter_paeth
-            };
-            // Paeth filter pixels:
-            // C B D
-            // A X
-            match tbpp {
-                BytesPerPixel::One => {
-                    let mut a_bpp = [0; 1];
-                    let mut c_bpp = [0; 1];
-                    for (chunk, b_bpp) in current.chunks_exact_mut(1).zip(previous.chunks_exact(1))
-                    {
-                        let new_chunk = [chunk[0]
-                            .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0]))];
-                        *TryInto::<&mut [u8; 1]>::try_into(chunk).unwrap() = new_chunk;
-                        a_bpp = new_chunk;
-                        c_bpp = b_bpp.try_into().unwrap();
-                    }
-                }
-                BytesPerPixel::Two => {
-                    let mut a_bpp = [0; 2];
-                    let mut c_bpp = [0; 2];
-                    for (chunk, b_bpp) in current.chunks_exact_mut(2).zip(previous.chunks_exact(2))
-                    {
-                        let new_chunk = [
-                            chunk[0]
-                                .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])),
-                            chunk[1]
-                                .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])),
-                        ];
-                        *TryInto::<&mut [u8; 2]>::try_into(chunk).unwrap() = new_chunk;
-                        a_bpp = new_chunk;
-                        c_bpp = b_bpp.try_into().unwrap();
-                    }
-                }
-                BytesPerPixel::Three => {
-                    let mut a_bpp = [0; 3];
-                    let mut c_bpp = [0; 3];
-                    let mut previous = &previous[..previous.len() / 3 * 3];
-                    let current_len = current.len();
-                    let mut current = &mut current[..current_len / 3 * 3];
-                    while let ([c0, c1, c2, c_rest @ ..], [p0, p1, p2, p_rest @ ..]) =
-                        (current, previous)
-                    {
-                        current = c_rest;
-                        previous = p_rest;
-                        *c0 = c0.wrapping_add(filter_paeth_decode(a_bpp[0], *p0, c_bpp[0]));
-                        *c1 = c1.wrapping_add(filter_paeth_decode(a_bpp[1], *p1, c_bpp[1]));
-                        *c2 = c2.wrapping_add(filter_paeth_decode(a_bpp[2], *p2, c_bpp[2]));
-                        a_bpp = [*c0, *c1, *c2];
-                        c_bpp = [*p0, *p1, *p2];
-                    }
-                }
-                BytesPerPixel::Four => {
-                    // Using the `simd` module here has no effect on Linux
-                    // and appears to regress performance on Windows, so we don't use it here.
-                    // See https://github.com/image-rs/image-png/issues/567
-                    let mut a_bpp = [0; 4];
-                    let mut c_bpp = [0; 4];
-                    let mut previous = &previous[..previous.len() & !3];
-                    let current_len = current.len();
-                    let mut current = &mut current[..current_len & !3];
-                    while let ([c0, c1, c2, c3, c_rest @ ..], [p0, p1, p2, p3, p_rest @ ..]) =
-                        (current, previous)
-                    {
-                        current = c_rest;
-                        previous = p_rest;
-                        *c0 = c0.wrapping_add(filter_paeth_decode(a_bpp[0], *p0, c_bpp[0]));
-                        *c1 = c1.wrapping_add(filter_paeth_decode(a_bpp[1], *p1, c_bpp[1]));
-                        *c2 = c2.wrapping_add(filter_paeth_decode(a_bpp[2], *p2, c_bpp[2]));
-                        *c3 = c3.wrapping_add(filter_paeth_decode(a_bpp[3], *p3, c_bpp[3]));
-                        a_bpp = [*c0, *c1, *c2, *c3];
-                        c_bpp = [*p0, *p1, *p2, *p3];
-                    }
-                }
-                BytesPerPixel::Six => {
-                    let mut a_bpp = [0; 6];
-                    let mut c_bpp = [0; 6];
-                    for (chunk, b_bpp) in current.chunks_exact_mut(6).zip(previous.chunks_exact(6))
-                    {
-                        let new_chunk = [
-                            chunk[0]
-                                .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])),
-                            chunk[1]
-                                .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])),
-                            chunk[2]
-                                .wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])),
-                            chunk[3]
-                                .wrapping_add(filter_paeth_decode(a_bpp[3], b_bpp[3], c_bpp[3])),
-                            chunk[4]
-                                .wrapping_add(filter_paeth_decode(a_bpp[4], b_bpp[4], c_bpp[4])),
-                            chunk[5]
-                                .wrapping_add(filter_paeth_decode(a_bpp[5], b_bpp[5], c_bpp[5])),
-                        ];
-                        *TryInto::<&mut [u8; 6]>::try_into(chunk).unwrap() = new_chunk;
-                        a_bpp = new_chunk;
-                        c_bpp = b_bpp.try_into().unwrap();
-                    }
-                }
-                BytesPerPixel::Eight => {
-                    let mut a_bpp = [0; 8];
-                    let mut c_bpp = [0; 8];
-                    for (chunk, b_bpp) in current.chunks_exact_mut(8).zip(previous.chunks_exact(8))
-                    {
-                        let new_chunk = [
-                            chunk[0]
-                                .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])),
-                            chunk[1]
-                                .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])),
-                            chunk[2]
-                                .wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])),
-                            chunk[3]
-                                .wrapping_add(filter_paeth_decode(a_bpp[3], b_bpp[3], c_bpp[3])),
-                            chunk[4]
-                                .wrapping_add(filter_paeth_decode(a_bpp[4], b_bpp[4], c_bpp[4])),
-                            chunk[5]
-                                .wrapping_add(filter_paeth_decode(a_bpp[5], b_bpp[5], c_bpp[5])),
-                            chunk[6]
-                                .wrapping_add(filter_paeth_decode(a_bpp[6], b_bpp[6], c_bpp[6])),
-                            chunk[7]
-                                .wrapping_add(filter_paeth_decode(a_bpp[7], b_bpp[7], c_bpp[7])),
-                        ];
-                        *TryInto::<&mut [u8; 8]>::try_into(chunk).unwrap() = new_chunk;
-                        a_bpp = new_chunk;
-                        c_bpp = b_bpp.try_into().unwrap();
-                    }
-                }
-            }
-        }
+        Paeth => paeth::unfilter(tbpp, previous, current),
     }
 }
@@ -795,7 +474,7 @@ fn filter_internal(
                 .zip(&mut c_chunks)
             {
                 for i in 0..CHUNK_SIZE {
-                    out[i] = cur[i].wrapping_sub(filter_paeth_fpnge(a[i], b[i], c[i]));
+                    out[i] = cur[i].wrapping_sub(paeth::filter_paeth_fpnge(a[i], b[i], c[i]));
                 }
             }
@@ -807,17 +486,47 @@ fn filter_internal(
                 .zip(b_chunks.remainder())
                 .zip(c_chunks.remainder())
             {
-                *out = cur.wrapping_sub(filter_paeth_fpnge(a, b, c));
+                *out = cur.wrapping_sub(paeth::filter_paeth_fpnge(a, b, c));
             }
             for i in 0..bpp {
-                output[i] = current[i].wrapping_sub(filter_paeth_fpnge(0, previous[i], 0));
+                output[i] = current[i].wrapping_sub(paeth::filter_paeth_fpnge(0, previous[i], 0));
             }
             Paeth
         }
     }
 }
+fn adaptive_filter(
+    f: impl Fn(&[u8]) -> u64,
+    bpp: usize,
+    len: usize,
+    previous: &[u8],
+    current: &[u8],
+    output: &mut [u8],
+) -> RowFilter {
+    use RowFilter::*;
+    let mut min_cost: u64 = u64::MAX;
+    let mut filter_choice = RowFilter::NoFilter;
+    for &filter in [Up, Sub, Avg, Paeth].iter() {
+        filter_internal(filter, bpp, len, previous, current, output);
+        let cost = f(output);
+        if cost <= min_cost {
+            min_cost = cost;
+            filter_choice = filter;
+            if cost == 0 {
+                return filter_choice;
+            }
+        }
+    }
+    if filter_choice != Paeth {
+        filter_internal(filter_choice, bpp, len, previous, current, output);
+    }
+    filter_choice
+}
 pub(crate) fn filter(
     method: Filter,
     bpp: BytesPerPixel,
@@ -825,28 +534,12 @@ pub(crate) fn filter(
     current: &[u8],
     output: &mut [u8],
 ) -> RowFilter {
-    use RowFilter::*;
     let bpp = bpp.into_usize();
     let len = current.len();
     match method {
-        Filter::Adaptive => {
-            let mut min_sum: u64 = u64::MAX;
-            let mut filter_choice = RowFilter::NoFilter;
-            for &filter in [Sub, Up, Avg, Paeth].iter() {
-                filter_internal(filter, bpp, len, previous, current, output);
-                let sum = sum_buffer(output);
-                if sum <= min_sum {
-                    min_sum = sum;
-                    filter_choice = filter;
-                }
-            }
-            if filter_choice != Paeth {
-                filter_internal(filter_choice, bpp, len, previous, current, output);
-            }
-            filter_choice
-        }
+        Filter::Adaptive => adaptive_filter(sum_buffer, bpp, len, previous, current, output),
+        Filter::MinEntropy => adaptive_filter(entropy, bpp, len, previous, current, output),
         _ => {
             let filter = RowFilter::from_method(method).unwrap();
             filter_internal(filter, bpp, len, previous, current, output)
@@ -854,6 +547,63 @@ pub(crate) fn filter(
     }
 }
+/// Estimate the value of i * log2(i) without using floating point operations,
+/// implementation originally from oxipng.
+fn ilog2i(i: u32) -> u32 {
+    let log = 32 - i.leading_zeros() - 1;
+    i * log + ((i - (1 << log)) << 1)
+}
+fn entropy(buf: &[u8]) -> u64 {
+    let mut counts = [[0_u32; 256]; 4];
+    let mut total = 0;
+    // Count the number of occurrences of each byte value.
+    let mut chunks = buf.chunks_exact(8);
+    for chunk in &mut chunks {
+        // Runs of zeros are common and very compressible, so treat them as free.
+        if chunk == [0; 8] {
+            continue;
+        }
+        // Scatter the counts into 4 separate arrays to reduce contention.
+        for j in 0..2 {
+            counts[0][chunk[j * 4] as usize] += 1;
+            counts[1][chunk[1 + j * 4] as usize] += 1;
+            counts[2][chunk[2 + j * 4] as usize] += 1;
+            counts[3][chunk[3 + j * 4] as usize] += 1;
+        }
+        total += 8;
+    }
+    for &lit in chunks.remainder() {
+        counts[0][lit as usize] += 1;
+        total += 1;
+    }
+    // If the input is entirely zeros, short-circuit the entropy calculation.
+    if counts[0][0] == total {
+        return 0;
+    }
+    // Consolidate the counts.
+    //
+    // Upstream bug: <https://github.com/rust-lang/rust-clippy/issues/11529>
+    #[allow(clippy::needless_range_loop)]
+    for i in 0..256 {
+        counts[0][i] += counts[1][i] + counts[2][i] + counts[3][i];
+    }
+    // Compute the entropy.
+    let mut entropy = ilog2i(total);
+    for &count in &counts[0] {
+        if count > 0 {
+            entropy = entropy.saturating_sub(ilog2i(count));
+        }
+    }
+    entropy as u64
+}
 // Helper function for Adaptive filter buffer summation
 fn sum_buffer(buf: &[u8]) -> u64 {
     const CHUNK_SIZE: usize = 32;
@@ -926,23 +676,6 @@ mod test {
         }
     }
-    #[test]
-    #[ignore] // takes ~20s without optimizations
-    fn paeth_impls_are_equivalent() {
-        for a in 0..=255 {
-            for b in 0..=255 {
-                for c in 0..=255 {
-                    let baseline = filter_paeth(a, b, c);
-                    let fpnge = filter_paeth_fpnge(a, b, c);
-                    let stbi = filter_paeth_stbi(a, b, c);
-                    assert_eq!(baseline, fpnge);
-                    assert_eq!(baseline, stbi);
-                }
-            }
-        }
-    }
     #[test]
     fn roundtrip_ascending_previous_line() {
         // A multiple of 8, 6, 4, 3, 2, 1

data/rust-vendor/png/src/filter/optimization-notes.md ADDED Viewed

@@ -0,0 +1,104 @@
+Auto-vectorization notes
+========================
+[2023/01 @okaneco] - Notes on optimizing decoding filters
+Links:
+[PR]: https://github.com/image-rs/image-png/pull/382
+[SWAR]: http://aggregate.org/SWAR/over.html
+[AVG]: http://aggregate.org/MAGIC/#Average%20of%20Integers
+#382 heavily refactored and optimized the following filters making the
+implementation nonobvious. These comments function as a summary of that
+PR with an explanation of the choices made below.
+#382 originally started with trying to optimize using a technique called
+SWAR, SIMD Within a Register. SWAR uses regular integer types like `u32`
+and `u64` as SIMD registers to perform vertical operations in parallel,
+usually involving bit-twiddling. This allowed each `BytesPerPixel` (bpp)
+pixel to be decoded in parallel: 3bpp and 4bpp in a `u32`, 6bpp and 8bpp
+in a `u64`. The `Sub` filter looked like the following code block, `Avg`
+was similar but used a bitwise average method from [AVG]:
+```
+// See "Unpartitioned Operations With Correction Code" from [SWAR]
+fn swar_add_u32(x: u32, y: u32) -> u32 {
+    // 7-bit addition so there's no carry over the most significant bit
+    let n = (x & 0x7f7f7f7f) + (y & 0x7f7f7f7f); // 0x7F = 0b_0111_1111
+    // 1-bit parity/XOR addition to fill in the missing MSB
+    n ^ (x ^ y) & 0x80808080                     // 0x80 = 0b_1000_0000
+}
+let mut prev =
+    u32::from_ne_bytes([current[0], current[1], current[2], current[3]]);
+for chunk in current[4..].chunks_exact_mut(4) {
+    let cur = u32::from_ne_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
+    let new_chunk = swar_add_u32(cur, prev);
+    chunk.copy_from_slice(&new_chunk.to_ne_bytes());
+    prev = new_chunk;
+}
+```
+While this provided a measurable increase, @fintelia found that this idea
+could be taken even further by unrolling the chunks component-wise and
+avoiding unnecessary byte-shuffling by using byte arrays instead of
+`u32::from|to_ne_bytes`. The bitwise operations were no longer necessary
+so they were reverted to their obvious arithmetic equivalent. Lastly,
+`TryInto` was used instead of `copy_from_slice`. The `Sub` code now
+looked like this (with asserts to remove `0..bpp` bounds checks):
+```
+assert!(len > 3);
+let mut prev = [current[0], current[1], current[2], current[3]];
+for chunk in current[4..].chunks_exact_mut(4) {
+    let new_chunk = [
+        chunk[0].wrapping_add(prev[0]),
+        chunk[1].wrapping_add(prev[1]),
+        chunk[2].wrapping_add(prev[2]),
+        chunk[3].wrapping_add(prev[3]),
+    ];
+    *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk;
+    prev = new_chunk;
+}
+```
+The compiler was able to optimize the code to be even faster and this
+method even sped up Paeth filtering! Assertions were experimentally
+added within loop bodies which produced better instructions but no
+difference in speed. Finally, the code was refactored to remove manual
+slicing and start the previous pixel chunks with arrays of `[0; N]`.
+```
+let mut prev = [0; 4];
+for chunk in current.chunks_exact_mut(4) {
+    let new_chunk = [
+        chunk[0].wrapping_add(prev[0]),
+        chunk[1].wrapping_add(prev[1]),
+        chunk[2].wrapping_add(prev[2]),
+        chunk[3].wrapping_add(prev[3]),
+    ];
+    *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk;
+    prev = new_chunk;
+}
+```
+While we're not manually bit-twiddling anymore, a possible takeaway from
+this is to "think in SWAR" when dealing with small byte arrays. Unrolling
+array operations and performing them component-wise may unlock previously
+unavailable optimizations from the compiler, even when using the
+`chunks_exact` methods for their potential auto-vectorization benefits.
+`std::simd` notes
+=================
+In the past we have experimented with `std::simd` for unfiltering.  This
+experiment was removed in https://github.com/image-rs/image-png/pull/585
+because:
+* The crate's microbenchmarks showed that `std::simd` didn't have a
+  significant advantage over auto-vectorization for most filters, except
+  for Paeth unfiltering - see
+  https://github.com/image-rs/image-png/pull/414#issuecomment-1736655668
+* In the crate's microbenchmarks `std::simd` seemed to help with Paeth
+  unfiltering only on x86/x64, with mixed results on ARM - see
+  https://github.com/image-rs/image-png/pull/539#issuecomment-2512748043
+* In Chromium end-to-end microbenchmarks `std::simd` either didn't help
+  or resulted in a small regression (as measured on x64).  See
+  https://crrev.com/c/6090592.
+* Field trial data from some "real world" scenarios shows that
+  performance can be quite good without relying on `std::simd` - see
+  https://github.com/image-rs/image-png/discussions/562#discussioncomment-13303307