@img/sharp-libvips-dev 1.0.0 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (230) hide show
  1. package/include/aom/aom_encoder.h +3 -3
  2. package/include/aom/aomcx.h +17 -8
  3. package/include/expat.h +21 -10
  4. package/include/expat_config.h +11 -5
  5. package/include/ffi.h +12 -25
  6. package/include/fontconfig/fontconfig.h +5 -3
  7. package/include/freetype2/freetype/config/ftoption.h +1 -1
  8. package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -7
  9. package/include/gio-unix-2.0/gio/gunixinputstream.h +0 -5
  10. package/include/gio-unix-2.0/gio/gunixoutputstream.h +0 -5
  11. package/include/glib-2.0/gio/gappinfo.h +0 -7
  12. package/include/glib-2.0/gio/gapplication.h +6 -0
  13. package/include/glib-2.0/gio/gapplicationcommandline.h +12 -1
  14. package/include/glib-2.0/gio/gasyncinitable.h +0 -7
  15. package/include/glib-2.0/gio/gasyncresult.h +0 -6
  16. package/include/glib-2.0/gio/gbufferedinputstream.h +0 -5
  17. package/include/glib-2.0/gio/gbufferedoutputstream.h +0 -5
  18. package/include/glib-2.0/gio/gbytesicon.h +0 -5
  19. package/include/glib-2.0/gio/gcancellable.h +0 -5
  20. package/include/glib-2.0/gio/gconverter.h +0 -7
  21. package/include/glib-2.0/gio/gconverterinputstream.h +0 -6
  22. package/include/glib-2.0/gio/gconverteroutputstream.h +0 -6
  23. package/include/glib-2.0/gio/gdatagrambased.h +0 -7
  24. package/include/glib-2.0/gio/gdatainputstream.h +0 -6
  25. package/include/glib-2.0/gio/gdataoutputstream.h +0 -6
  26. package/include/glib-2.0/gio/gdbusinterface.h +0 -8
  27. package/include/glib-2.0/gio/gdbusinterfaceskeleton.h +0 -8
  28. package/include/glib-2.0/gio/gdbusmessage.h +2 -1
  29. package/include/glib-2.0/gio/gdbusobjectmanagerclient.h +0 -8
  30. package/include/glib-2.0/gio/gdbusobjectmanagerserver.h +0 -8
  31. package/include/glib-2.0/gio/gdbusobjectproxy.h +0 -8
  32. package/include/glib-2.0/gio/gdbusobjectskeleton.h +0 -8
  33. package/include/glib-2.0/gio/gdbusproxy.h +0 -8
  34. package/include/glib-2.0/gio/gdebugcontroller.h +0 -8
  35. package/include/glib-2.0/gio/gdebugcontrollerdbus.h +0 -7
  36. package/include/glib-2.0/gio/gdtlsserverconnection.h +0 -8
  37. package/include/glib-2.0/gio/gemblem.h +0 -5
  38. package/include/glib-2.0/gio/gemblemedicon.h +0 -5
  39. package/include/glib-2.0/gio/gfile.h +0 -10
  40. package/include/glib-2.0/gio/gfileenumerator.h +0 -5
  41. package/include/glib-2.0/gio/gfileicon.h +0 -5
  42. package/include/glib-2.0/gio/gfileinfo.h +0 -5
  43. package/include/glib-2.0/gio/gfileinputstream.h +0 -8
  44. package/include/glib-2.0/gio/gfileiostream.h +0 -8
  45. package/include/glib-2.0/gio/gfilemonitor.h +0 -5
  46. package/include/glib-2.0/gio/gfilenamecompleter.h +0 -5
  47. package/include/glib-2.0/gio/gfileoutputstream.h +0 -8
  48. package/include/glib-2.0/gio/gfilterinputstream.h +0 -5
  49. package/include/glib-2.0/gio/gfilteroutputstream.h +0 -5
  50. package/include/glib-2.0/gio/gicon.h +0 -5
  51. package/include/glib-2.0/gio/ginitable.h +0 -7
  52. package/include/glib-2.0/gio/ginputstream.h +0 -5
  53. package/include/glib-2.0/gio/gio-autocleanups.h +4 -0
  54. package/include/glib-2.0/gio/gio-visibility.h +34 -0
  55. package/include/glib-2.0/gio/gioenums.h +6 -1
  56. package/include/glib-2.0/gio/giomodule.h +0 -5
  57. package/include/glib-2.0/gio/giostream.h +0 -5
  58. package/include/glib-2.0/gio/giotypes.h +5 -108
  59. package/include/glib-2.0/gio/gloadableicon.h +0 -6
  60. package/include/glib-2.0/gio/gmemoryinputstream.h +0 -5
  61. package/include/glib-2.0/gio/gmemoryoutputstream.h +0 -5
  62. package/include/glib-2.0/gio/gmountoperation.h +0 -6
  63. package/include/glib-2.0/gio/gnetworking.h +4 -0
  64. package/include/glib-2.0/gio/goutputstream.h +0 -9
  65. package/include/glib-2.0/gio/gpollableinputstream.h +0 -7
  66. package/include/glib-2.0/gio/gpollableoutputstream.h +0 -7
  67. package/include/glib-2.0/gio/gproxy.h +0 -7
  68. package/include/glib-2.0/gio/gproxyaddressenumerator.h +0 -8
  69. package/include/glib-2.0/gio/gseekable.h +0 -5
  70. package/include/glib-2.0/gio/gsettingsbackend.h +0 -5
  71. package/include/glib-2.0/gio/gsimpleactiongroup.h +0 -7
  72. package/include/glib-2.0/gio/gsimpleasyncresult.h +0 -5
  73. package/include/glib-2.0/gio/gsimpleproxyresolver.h +0 -5
  74. package/include/glib-2.0/gio/gsocket.h +13 -0
  75. package/include/glib-2.0/gio/gsocketaddressenumerator.h +0 -6
  76. package/include/glib-2.0/gio/gsocketconnectable.h +0 -5
  77. package/include/glib-2.0/gio/gtask.h +12 -0
  78. package/include/glib-2.0/gio/gthemedicon.h +0 -5
  79. package/include/glib-2.0/gio/gtlsserverconnection.h +0 -8
  80. package/include/glib-2.0/gio/gunixcredentialsmessage.h +0 -8
  81. package/include/glib-2.0/gio/gvfs.h +0 -5
  82. package/include/glib-2.0/gio/gvolume.h +2 -2
  83. package/include/glib-2.0/gio/gvolumemonitor.h +0 -5
  84. package/include/glib-2.0/girepository/gi-visibility.h +986 -0
  85. package/include/glib-2.0/girepository/giarginfo.h +100 -0
  86. package/include/glib-2.0/girepository/gibaseinfo.h +129 -0
  87. package/include/glib-2.0/girepository/gicallableinfo.h +119 -0
  88. package/include/glib-2.0/girepository/gicallbackinfo.h +60 -0
  89. package/include/glib-2.0/girepository/giconstantinfo.h +72 -0
  90. package/include/glib-2.0/girepository/gienuminfo.h +82 -0
  91. package/include/glib-2.0/girepository/gifieldinfo.h +84 -0
  92. package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
  93. package/include/glib-2.0/girepository/gifunctioninfo.h +117 -0
  94. package/include/glib-2.0/girepository/giinterfaceinfo.h +120 -0
  95. package/include/glib-2.0/girepository/giobjectinfo.h +230 -0
  96. package/include/glib-2.0/girepository/gipropertyinfo.h +77 -0
  97. package/include/glib-2.0/girepository/giregisteredtypeinfo.h +75 -0
  98. package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
  99. package/include/glib-2.0/girepository/girepository.h +247 -0
  100. package/include/glib-2.0/girepository/girffi.h +129 -0
  101. package/include/glib-2.0/girepository/gisignalinfo.h +72 -0
  102. package/include/glib-2.0/girepository/gistructinfo.h +102 -0
  103. package/include/glib-2.0/girepository/gitypeinfo.h +144 -0
  104. package/include/glib-2.0/girepository/gitypelib.h +61 -0
  105. package/include/glib-2.0/girepository/gitypes.h +421 -0
  106. package/include/glib-2.0/girepository/giunioninfo.h +105 -0
  107. package/include/glib-2.0/girepository/giunresolvedinfo.h +60 -0
  108. package/include/glib-2.0/girepository/givalueinfo.h +65 -0
  109. package/include/glib-2.0/girepository/givfuncinfo.h +88 -0
  110. package/include/glib-2.0/glib/deprecated/gcompletion.h +1 -1
  111. package/include/glib-2.0/glib/deprecated/grel.h +0 -23
  112. package/include/glib-2.0/glib/deprecated/gthread.h +10 -6
  113. package/include/glib-2.0/glib/gatomic.h +20 -20
  114. package/include/glib-2.0/glib/gbitlock.h +31 -0
  115. package/include/glib-2.0/glib/gbookmarkfile.h +39 -1
  116. package/include/glib-2.0/glib/gchecksum.h +0 -10
  117. package/include/glib-2.0/glib/gdate.h +0 -9
  118. package/include/glib-2.0/glib/gdatetime.h +33 -1
  119. package/include/glib-2.0/glib/gdir.h +5 -0
  120. package/include/glib-2.0/glib/ghmac.h +0 -9
  121. package/include/glib-2.0/glib/glib-autocleanups.h +4 -0
  122. package/include/glib-2.0/glib/glib-visibility.h +34 -0
  123. package/include/glib-2.0/glib/gmacros.h +1 -0
  124. package/include/glib-2.0/glib/gmessages.h +11 -0
  125. package/include/glib-2.0/glib/gpathbuf.h +0 -7
  126. package/include/glib-2.0/glib/gslice.h +2 -0
  127. package/include/glib-2.0/glib/gstdio.h +1 -1
  128. package/include/glib-2.0/glib/gstrfuncs.h +24 -18
  129. package/include/glib-2.0/glib/gstrvbuilder.h +4 -8
  130. package/include/glib-2.0/glib/gtestutils.h +5 -0
  131. package/include/glib-2.0/glib/gthread.h +216 -3
  132. package/include/glib-2.0/glib/gunicode.h +12 -2
  133. package/include/glib-2.0/glib/gvarianttype.h +1 -10
  134. package/include/glib-2.0/glib/gversionmacros.h +9 -0
  135. package/include/glib-2.0/glib/gwin32.h +4 -4
  136. package/include/glib-2.0/glib-unix.h +214 -0
  137. package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
  138. package/include/glib-2.0/gobject/gbinding.h +0 -8
  139. package/include/glib-2.0/gobject/gbindinggroup.h +0 -8
  140. package/include/glib-2.0/gobject/gclosure.h +1 -9
  141. package/include/glib-2.0/gobject/genums.h +6 -6
  142. package/include/glib-2.0/gobject/glib-types.h +44 -0
  143. package/include/glib-2.0/gobject/gobject-autocleanups.h +4 -0
  144. package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
  145. package/include/glib-2.0/gobject/gobject.h +1 -16
  146. package/include/glib-2.0/gobject/gparam.h +3 -12
  147. package/include/glib-2.0/gobject/gsignal.h +16 -6
  148. package/include/glib-2.0/gobject/gsignalgroup.h +0 -8
  149. package/include/glib-2.0/gobject/gtype.h +53 -20
  150. package/include/glib-2.0/gobject/gtypemodule.h +0 -7
  151. package/include/glib-2.0/gobject/gtypeplugin.h +0 -6
  152. package/include/glib-2.0/gobject/gvaluearray.h +0 -7
  153. package/include/glib-2.0/gobject/gvaluecollector.h +1 -11
  154. package/include/glib-2.0/gobject/gvaluetypes.h +2 -0
  155. package/include/hwy/aligned_allocator.h +171 -6
  156. package/include/hwy/base.h +1765 -543
  157. package/include/hwy/cache_control.h +24 -6
  158. package/include/hwy/detect_compiler_arch.h +23 -2
  159. package/include/hwy/detect_targets.h +56 -13
  160. package/include/hwy/foreach_target.h +24 -0
  161. package/include/hwy/highway.h +20 -3
  162. package/include/hwy/ops/arm_neon-inl.h +1086 -667
  163. package/include/hwy/ops/arm_sve-inl.h +1091 -235
  164. package/include/hwy/ops/emu128-inl.h +271 -196
  165. package/include/hwy/ops/generic_ops-inl.h +2270 -399
  166. package/include/hwy/ops/ppc_vsx-inl.h +1786 -563
  167. package/include/hwy/ops/rvv-inl.h +1043 -311
  168. package/include/hwy/ops/scalar-inl.h +189 -159
  169. package/include/hwy/ops/set_macros-inl.h +66 -6
  170. package/include/hwy/ops/shared-inl.h +175 -56
  171. package/include/hwy/ops/wasm_128-inl.h +153 -136
  172. package/include/hwy/ops/x86_128-inl.h +1647 -646
  173. package/include/hwy/ops/x86_256-inl.h +1003 -370
  174. package/include/hwy/ops/x86_512-inl.h +948 -353
  175. package/include/hwy/per_target.h +4 -0
  176. package/include/hwy/profiler.h +648 -0
  177. package/include/hwy/robust_statistics.h +2 -2
  178. package/include/hwy/targets.h +18 -11
  179. package/include/hwy/timer.h +11 -0
  180. package/include/lcms2.h +46 -7
  181. package/include/lcms2_plugin.h +4 -4
  182. package/include/libheif/heif_version.h +2 -2
  183. package/include/libpng16/png.h +32 -29
  184. package/include/libpng16/pngconf.h +2 -2
  185. package/include/libpng16/pnglibconf.h +7 -2
  186. package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
  187. package/include/libxml2/libxml/HTMLparser.h +23 -0
  188. package/include/libxml2/libxml/SAX.h +0 -2
  189. package/include/libxml2/libxml/SAX2.h +0 -2
  190. package/include/libxml2/libxml/c14n.h +0 -2
  191. package/include/libxml2/libxml/dict.h +1 -0
  192. package/include/libxml2/libxml/encoding.h +16 -14
  193. package/include/libxml2/libxml/entities.h +4 -0
  194. package/include/libxml2/libxml/globals.h +15 -503
  195. package/include/libxml2/libxml/hash.h +57 -61
  196. package/include/libxml2/libxml/nanoftp.h +2 -2
  197. package/include/libxml2/libxml/parser.h +137 -18
  198. package/include/libxml2/libxml/parserInternals.h +1 -0
  199. package/include/libxml2/libxml/relaxng.h +2 -1
  200. package/include/libxml2/libxml/schemasInternals.h +1 -0
  201. package/include/libxml2/libxml/schematron.h +1 -0
  202. package/include/libxml2/libxml/threads.h +4 -11
  203. package/include/libxml2/libxml/tree.h +68 -20
  204. package/include/libxml2/libxml/uri.h +2 -1
  205. package/include/libxml2/libxml/valid.h +2 -0
  206. package/include/libxml2/libxml/xmlIO.h +65 -13
  207. package/include/libxml2/libxml/xmlerror.h +37 -8
  208. package/include/libxml2/libxml/xmlmemory.h +37 -40
  209. package/include/libxml2/libxml/xmlreader.h +6 -0
  210. package/include/libxml2/libxml/xmlregexp.h +2 -9
  211. package/include/libxml2/libxml/xmlsave.h +9 -0
  212. package/include/libxml2/libxml/xmlschemas.h +3 -0
  213. package/include/libxml2/libxml/xmlversion.h +28 -43
  214. package/include/libxml2/libxml/xpath.h +1 -1
  215. package/include/libxml2/libxml/xpathInternals.h +2 -1
  216. package/include/libxml2/libxml/xpointer.h +5 -4
  217. package/include/pango-1.0/pango/pango-features.h +3 -3
  218. package/include/pango-1.0/pango/pango-fontmap.h +7 -0
  219. package/include/pixman-1/pixman-version.h +3 -3
  220. package/include/pixman-1/pixman.h +9 -2
  221. package/include/png.h +32 -29
  222. package/include/pngconf.h +2 -2
  223. package/include/pnglibconf.h +7 -2
  224. package/include/vips/connection.h +9 -3
  225. package/include/vips/util.h +0 -9
  226. package/include/vips/version.h +4 -4
  227. package/include/zconf.h +3 -0
  228. package/include/zlib.h +3 -3
  229. package/package.json +1 -1
  230. package/versions.json +15 -15
@@ -17,6 +17,9 @@
17
17
 
18
18
  // Target-independent types/functions defined after target-specific ops.
19
19
 
20
+ // The "include guards" in this file that check HWY_TARGET_TOGGLE serve to skip
21
+ // the generic implementation here if native ops are already defined.
22
+
20
23
  #include "hwy/base.h"
21
24
 
22
25
  // Define detail::Shuffle1230 etc, but only when viewing the current header;
@@ -194,6 +197,21 @@ HWY_API void SafeCopyN(const size_t num, D d, const T* HWY_RESTRICT from,
194
197
  #endif
195
198
  }
196
199
 
200
+ // ------------------------------ MaskFalse
201
+ #if (defined(HWY_NATIVE_MASK_FALSE) == defined(HWY_TARGET_TOGGLE))
202
+ #ifdef HWY_NATIVE_MASK_FALSE
203
+ #undef HWY_NATIVE_MASK_FALSE
204
+ #else
205
+ #define HWY_NATIVE_MASK_FALSE
206
+ #endif
207
+
208
+ template <class D>
209
+ HWY_API Mask<D> MaskFalse(D d) {
210
+ return MaskFromVec(Zero(d));
211
+ }
212
+
213
+ #endif // HWY_NATIVE_MASK_FALSE
214
+
197
215
  // ------------------------------ BitwiseIfThenElse
198
216
  #if (defined(HWY_NATIVE_BITWISE_IF_THEN_ELSE) == defined(HWY_TARGET_TOGGLE))
199
217
  #ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
@@ -209,9 +227,634 @@ HWY_API V BitwiseIfThenElse(V mask, V yes, V no) {
209
227
 
210
228
  #endif // HWY_NATIVE_BITWISE_IF_THEN_ELSE
211
229
 
212
- // "Include guard": skip if native instructions are available. The generic
213
- // implementation is currently shared between x86_* and wasm_*, and is too large
214
- // to duplicate.
230
+ // ------------------------------ PromoteMaskTo
231
+
232
+ #if (defined(HWY_NATIVE_PROMOTE_MASK_TO) == defined(HWY_TARGET_TOGGLE))
233
+ #ifdef HWY_NATIVE_PROMOTE_MASK_TO
234
+ #undef HWY_NATIVE_PROMOTE_MASK_TO
235
+ #else
236
+ #define HWY_NATIVE_PROMOTE_MASK_TO
237
+ #endif
238
+
239
+ template <class DTo, class DFrom>
240
+ HWY_API Mask<DTo> PromoteMaskTo(DTo d_to, DFrom d_from, Mask<DFrom> m) {
241
+ static_assert(
242
+ sizeof(TFromD<DTo>) > sizeof(TFromD<DFrom>),
243
+ "sizeof(TFromD<DTo>) must be greater than sizeof(TFromD<DFrom>)");
244
+ static_assert(
245
+ IsSame<Mask<DFrom>, Mask<Rebind<TFromD<DFrom>, DTo>>>(),
246
+ "Mask<DFrom> must be the same type as Mask<Rebind<TFromD<DFrom>, DTo>>");
247
+
248
+ const RebindToSigned<decltype(d_to)> di_to;
249
+ const RebindToSigned<decltype(d_from)> di_from;
250
+
251
+ return MaskFromVec(BitCast(
252
+ d_to, PromoteTo(di_to, BitCast(di_from, VecFromMask(d_from, m)))));
253
+ }
254
+
255
+ #endif // HWY_NATIVE_PROMOTE_MASK_TO
256
+
257
+ // ------------------------------ DemoteMaskTo
258
+
259
+ #if (defined(HWY_NATIVE_DEMOTE_MASK_TO) == defined(HWY_TARGET_TOGGLE))
260
+ #ifdef HWY_NATIVE_DEMOTE_MASK_TO
261
+ #undef HWY_NATIVE_DEMOTE_MASK_TO
262
+ #else
263
+ #define HWY_NATIVE_DEMOTE_MASK_TO
264
+ #endif
265
+
266
+ template <class DTo, class DFrom>
267
+ HWY_API Mask<DTo> DemoteMaskTo(DTo d_to, DFrom d_from, Mask<DFrom> m) {
268
+ static_assert(sizeof(TFromD<DTo>) < sizeof(TFromD<DFrom>),
269
+ "sizeof(TFromD<DTo>) must be less than sizeof(TFromD<DFrom>)");
270
+ static_assert(
271
+ IsSame<Mask<DFrom>, Mask<Rebind<TFromD<DFrom>, DTo>>>(),
272
+ "Mask<DFrom> must be the same type as Mask<Rebind<TFromD<DFrom>, DTo>>");
273
+
274
+ const RebindToSigned<decltype(d_to)> di_to;
275
+ const RebindToSigned<decltype(d_from)> di_from;
276
+
277
+ return MaskFromVec(
278
+ BitCast(d_to, DemoteTo(di_to, BitCast(di_from, VecFromMask(d_from, m)))));
279
+ }
280
+
281
+ #endif // HWY_NATIVE_DEMOTE_MASK_TO
282
+
283
+ // ------------------------------ CombineMasks
284
+
285
+ #if (defined(HWY_NATIVE_COMBINE_MASKS) == defined(HWY_TARGET_TOGGLE))
286
+ #ifdef HWY_NATIVE_COMBINE_MASKS
287
+ #undef HWY_NATIVE_COMBINE_MASKS
288
+ #else
289
+ #define HWY_NATIVE_COMBINE_MASKS
290
+ #endif
291
+
292
+ #if HWY_TARGET != HWY_SCALAR
293
+ template <class D>
294
+ HWY_API Mask<D> CombineMasks(D d, Mask<Half<D>> hi, Mask<Half<D>> lo) {
295
+ const Half<decltype(d)> dh;
296
+ return MaskFromVec(Combine(d, VecFromMask(dh, hi), VecFromMask(dh, lo)));
297
+ }
298
+ #endif
299
+
300
+ #endif // HWY_NATIVE_COMBINE_MASKS
301
+
302
+ // ------------------------------ LowerHalfOfMask
303
+
304
+ #if (defined(HWY_NATIVE_LOWER_HALF_OF_MASK) == defined(HWY_TARGET_TOGGLE))
305
+ #ifdef HWY_NATIVE_LOWER_HALF_OF_MASK
306
+ #undef HWY_NATIVE_LOWER_HALF_OF_MASK
307
+ #else
308
+ #define HWY_NATIVE_LOWER_HALF_OF_MASK
309
+ #endif
310
+
311
+ template <class D>
312
+ HWY_API Mask<D> LowerHalfOfMask(D d, Mask<Twice<D>> m) {
313
+ const Twice<decltype(d)> dt;
314
+ return MaskFromVec(LowerHalf(d, VecFromMask(dt, m)));
315
+ }
316
+
317
+ #endif // HWY_NATIVE_LOWER_HALF_OF_MASK
318
+
319
+ // ------------------------------ UpperHalfOfMask
320
+
321
+ #if (defined(HWY_NATIVE_UPPER_HALF_OF_MASK) == defined(HWY_TARGET_TOGGLE))
322
+ #ifdef HWY_NATIVE_UPPER_HALF_OF_MASK
323
+ #undef HWY_NATIVE_UPPER_HALF_OF_MASK
324
+ #else
325
+ #define HWY_NATIVE_UPPER_HALF_OF_MASK
326
+ #endif
327
+
328
+ #if HWY_TARGET != HWY_SCALAR
329
+ template <class D>
330
+ HWY_API Mask<D> UpperHalfOfMask(D d, Mask<Twice<D>> m) {
331
+ const Twice<decltype(d)> dt;
332
+ return MaskFromVec(UpperHalf(d, VecFromMask(dt, m)));
333
+ }
334
+ #endif
335
+
336
+ #endif // HWY_NATIVE_UPPER_HALF_OF_MASK
337
+
338
+ // ------------------------------ OrderedDemote2MasksTo
339
+
340
+ #if (defined(HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO) == \
341
+ defined(HWY_TARGET_TOGGLE))
342
+ #ifdef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
343
+ #undef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
344
+ #else
345
+ #define HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
346
+ #endif
347
+
348
+ #if HWY_TARGET != HWY_SCALAR
349
+ template <class DTo, class DFrom>
350
+ HWY_API Mask<DTo> OrderedDemote2MasksTo(DTo d_to, DFrom d_from, Mask<DFrom> a,
351
+ Mask<DFrom> b) {
352
+ static_assert(
353
+ sizeof(TFromD<DTo>) == sizeof(TFromD<DFrom>) / 2,
354
+ "sizeof(TFromD<DTo>) must be equal to sizeof(TFromD<DFrom>) / 2");
355
+ static_assert(IsSame<Mask<DTo>, Mask<Repartition<TFromD<DTo>, DFrom>>>(),
356
+ "Mask<DTo> must be the same type as "
357
+ "Mask<Repartition<TFromD<DTo>, DFrom>>>()");
358
+
359
+ const RebindToSigned<decltype(d_from)> di_from;
360
+ const RebindToSigned<decltype(d_to)> di_to;
361
+
362
+ const auto va = BitCast(di_from, VecFromMask(d_from, a));
363
+ const auto vb = BitCast(di_from, VecFromMask(d_from, b));
364
+ return MaskFromVec(BitCast(d_to, OrderedDemote2To(di_to, va, vb)));
365
+ }
366
+ #endif
367
+
368
+ #endif // HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
369
+
370
+ // ------------------------------ InterleaveWholeLower/InterleaveWholeUpper
371
+ #if (defined(HWY_NATIVE_INTERLEAVE_WHOLE) == defined(HWY_TARGET_TOGGLE))
372
+ #ifdef HWY_NATIVE_INTERLEAVE_WHOLE
373
+ #undef HWY_NATIVE_INTERLEAVE_WHOLE
374
+ #else
375
+ #define HWY_NATIVE_INTERLEAVE_WHOLE
376
+ #endif
377
+
378
+ #if HWY_TARGET != HWY_SCALAR
379
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
380
+ HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
381
+ // InterleaveWholeLower(d, a, b) is equivalent to InterleaveLower(a, b) if
382
+ // D().MaxBytes() <= 16 is true
383
+ return InterleaveLower(d, a, b);
384
+ }
385
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
386
+ HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
387
+ // InterleaveWholeUpper(d, a, b) is equivalent to InterleaveUpper(a, b) if
388
+ // D().MaxBytes() <= 16 is true
389
+ return InterleaveUpper(d, a, b);
390
+ }
391
+
392
+ // InterleaveWholeLower/InterleaveWholeUpper for 32-byte vectors on AVX2/AVX3
393
+ // is implemented in x86_256-inl.h.
394
+
395
+ // InterleaveWholeLower/InterleaveWholeUpper for 64-byte vectors on AVX3 is
396
+ // implemented in x86_512-inl.h.
397
+
398
+ // InterleaveWholeLower/InterleaveWholeUpper for 32-byte vectors on WASM_EMU256
399
+ // is implemented in wasm_256-inl.h.
400
+ #endif // HWY_TARGET != HWY_SCALAR
401
+
402
+ #endif // HWY_NATIVE_INTERLEAVE_WHOLE
403
+
404
+ #if HWY_TARGET != HWY_SCALAR
405
+ // The InterleaveWholeLower without the optional D parameter is generic for all
406
+ // vector lengths.
407
+ template <class V>
408
+ HWY_API V InterleaveWholeLower(V a, V b) {
409
+ return InterleaveWholeLower(DFromV<V>(), a, b);
410
+ }
411
+ #endif // HWY_TARGET != HWY_SCALAR
412
+
413
+ // ------------------------------ AddSub
414
+
415
+ template <class V, HWY_IF_LANES_D(DFromV<V>, 1)>
416
+ HWY_API V AddSub(V a, V b) {
417
+ // AddSub(a, b) for a one-lane vector is equivalent to Sub(a, b)
418
+ return Sub(a, b);
419
+ }
420
+
421
+ // AddSub for F32x2, F32x4, and F64x2 vectors is implemented in x86_128-inl.h on
422
+ // SSSE3/SSE4/AVX2/AVX3
423
+
424
+ // AddSub for F32x8 and F64x4 vectors is implemented in x86_256-inl.h on
425
+ // AVX2/AVX3
426
+ template <class V, HWY_IF_V_SIZE_GT_V(V, ((HWY_TARGET <= HWY_SSSE3 &&
427
+ hwy::IsFloat3264<TFromV<V>>())
428
+ ? 32
429
+ : sizeof(TFromV<V>)))>
430
+ HWY_API V AddSub(V a, V b) {
431
+ using D = DFromV<decltype(a)>;
432
+ using T = TFromD<D>;
433
+ using TNegate = If<!hwy::IsSigned<T>(), MakeSigned<T>, T>;
434
+
435
+ const D d;
436
+ const Rebind<TNegate, D> d_negate;
437
+
438
+ // Negate the even lanes of b
439
+ const auto negated_even_b = OddEven(b, BitCast(d, Neg(BitCast(d_negate, b))));
440
+
441
+ return Add(a, negated_even_b);
442
+ }
443
+
444
+ // ------------------------------ MaskedAddOr etc.
445
+ #if (defined(HWY_NATIVE_MASKED_ARITH) == defined(HWY_TARGET_TOGGLE))
446
+ #ifdef HWY_NATIVE_MASKED_ARITH
447
+ #undef HWY_NATIVE_MASKED_ARITH
448
+ #else
449
+ #define HWY_NATIVE_MASKED_ARITH
450
+ #endif
451
+
452
+ template <class V, class M>
453
+ HWY_API V MaskedMinOr(V no, M m, V a, V b) {
454
+ return IfThenElse(m, Min(a, b), no);
455
+ }
456
+
457
+ template <class V, class M>
458
+ HWY_API V MaskedMaxOr(V no, M m, V a, V b) {
459
+ return IfThenElse(m, Max(a, b), no);
460
+ }
461
+
462
+ template <class V, class M>
463
+ HWY_API V MaskedAddOr(V no, M m, V a, V b) {
464
+ return IfThenElse(m, Add(a, b), no);
465
+ }
466
+
467
+ template <class V, class M>
468
+ HWY_API V MaskedSubOr(V no, M m, V a, V b) {
469
+ return IfThenElse(m, Sub(a, b), no);
470
+ }
471
+
472
+ template <class V, class M>
473
+ HWY_API V MaskedMulOr(V no, M m, V a, V b) {
474
+ return IfThenElse(m, Mul(a, b), no);
475
+ }
476
+
477
+ template <class V, class M>
478
+ HWY_API V MaskedDivOr(V no, M m, V a, V b) {
479
+ return IfThenElse(m, Div(a, b), no);
480
+ }
481
+
482
+ template <class V, class M>
483
+ HWY_API V MaskedModOr(V no, M m, V a, V b) {
484
+ return IfThenElse(m, Mod(a, b), no);
485
+ }
486
+
487
+ template <class V, class M>
488
+ HWY_API V MaskedSatAddOr(V no, M m, V a, V b) {
489
+ return IfThenElse(m, SaturatedAdd(a, b), no);
490
+ }
491
+
492
+ template <class V, class M>
493
+ HWY_API V MaskedSatSubOr(V no, M m, V a, V b) {
494
+ return IfThenElse(m, SaturatedSub(a, b), no);
495
+ }
496
+ #endif // HWY_NATIVE_MASKED_ARITH
497
+
498
+ // ------------------------------ IfNegativeThenNegOrUndefIfZero
499
+
500
+ #if (defined(HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG) == \
501
+ defined(HWY_TARGET_TOGGLE))
502
+ #ifdef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
503
+ #undef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
504
+ #else
505
+ #define HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
506
+ #endif
507
+
508
+ template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
509
+ HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v) {
510
+ #if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
511
+ // MaskedSubOr is more efficient than IfNegativeThenElse on RVV/SVE
512
+ const auto zero = Zero(DFromV<V>());
513
+ return MaskedSubOr(v, Lt(mask, zero), zero, v);
514
+ #else
515
+ return IfNegativeThenElse(mask, Neg(v), v);
516
+ #endif
517
+ }
518
+
519
+ #endif // HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
520
+
521
+ template <class V, HWY_IF_FLOAT_V(V)>
522
+ HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v) {
523
+ return CopySign(v, Xor(mask, v));
524
+ }
525
+
526
+ // ------------------------------ SaturatedNeg
527
+
528
+ #if (defined(HWY_NATIVE_SATURATED_NEG_8_16_32) == defined(HWY_TARGET_TOGGLE))
529
+ #ifdef HWY_NATIVE_SATURATED_NEG_8_16_32
530
+ #undef HWY_NATIVE_SATURATED_NEG_8_16_32
531
+ #else
532
+ #define HWY_NATIVE_SATURATED_NEG_8_16_32
533
+ #endif
534
+
535
+ template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
536
+ HWY_IF_SIGNED_V(V)>
537
+ HWY_API V SaturatedNeg(V v) {
538
+ const DFromV<decltype(v)> d;
539
+ return SaturatedSub(Zero(d), v);
540
+ }
541
+
542
+ template <class V, HWY_IF_I32(TFromV<V>)>
543
+ HWY_API V SaturatedNeg(V v) {
544
+ const DFromV<decltype(v)> d;
545
+
546
+ #if HWY_TARGET == HWY_RVV || \
547
+ (HWY_TARGET >= HWY_PPC10 && HWY_TARGET <= HWY_PPC8) || \
548
+ (HWY_TARGET >= HWY_SVE2_128 && HWY_TARGET <= HWY_NEON_WITHOUT_AES)
549
+ // RVV/NEON/SVE/PPC have native I32 SaturatedSub instructions
550
+ return SaturatedSub(Zero(d), v);
551
+ #else
552
+ // ~v[i] - ((v[i] > LimitsMin<int32_t>()) ? -1 : 0) is equivalent to
553
+ // (v[i] > LimitsMin<int32_t>) ? (-v[i]) : LimitsMax<int32_t>() since
554
+ // -v[i] == ~v[i] + 1 == ~v[i] - (-1) and
555
+ // ~LimitsMin<int32_t>() == LimitsMax<int32_t>().
556
+ return Sub(Not(v), VecFromMask(d, Gt(v, Set(d, LimitsMin<int32_t>()))));
557
+ #endif
558
+ }
559
+ #endif // HWY_NATIVE_SATURATED_NEG_8_16_32
560
+
561
+ #if (defined(HWY_NATIVE_SATURATED_NEG_64) == defined(HWY_TARGET_TOGGLE))
562
+ #ifdef HWY_NATIVE_SATURATED_NEG_64
563
+ #undef HWY_NATIVE_SATURATED_NEG_64
564
+ #else
565
+ #define HWY_NATIVE_SATURATED_NEG_64
566
+ #endif
567
+
568
+ template <class V, HWY_IF_I64(TFromV<V>)>
569
+ HWY_API V SaturatedNeg(V v) {
570
+ #if HWY_TARGET == HWY_RVV || \
571
+ (HWY_TARGET >= HWY_SVE2_128 && HWY_TARGET <= HWY_NEON_WITHOUT_AES)
572
+ // RVV/NEON/SVE have native I64 SaturatedSub instructions
573
+ const DFromV<decltype(v)> d;
574
+ return SaturatedSub(Zero(d), v);
575
+ #else
576
+ const auto neg_v = Neg(v);
577
+ return Add(neg_v, BroadcastSignBit(And(v, neg_v)));
578
+ #endif
579
+ }
580
+ #endif // HWY_NATIVE_SATURATED_NEG_64
581
+
582
+ // ------------------------------ SaturatedAbs
583
+
584
+ #if (defined(HWY_NATIVE_SATURATED_ABS) == defined(HWY_TARGET_TOGGLE))
585
+ #ifdef HWY_NATIVE_SATURATED_ABS
586
+ #undef HWY_NATIVE_SATURATED_ABS
587
+ #else
588
+ #define HWY_NATIVE_SATURATED_ABS
589
+ #endif
590
+
591
+ template <class V, HWY_IF_SIGNED_V(V)>
592
+ HWY_API V SaturatedAbs(V v) {
593
+ return Max(v, SaturatedNeg(v));
594
+ }
595
+
596
+ #endif
597
+
598
+ // ------------------------------ Reductions
599
+
600
+ // Targets follow one of two strategies. If HWY_NATIVE_REDUCE_SCALAR is toggled,
601
+ // they (RVV/SVE/Armv8/Emu128) implement ReduceSum and SumOfLanes via Set.
602
+ // Otherwise, they (Armv7/PPC/scalar/WASM/x86) define zero to most of the
603
+ // SumOfLanes overloads. For the latter group, we here define the remaining
604
+ // overloads, plus ReduceSum which uses them plus GetLane.
605
+ #if (defined(HWY_NATIVE_REDUCE_SCALAR) == defined(HWY_TARGET_TOGGLE))
606
+ #ifdef HWY_NATIVE_REDUCE_SCALAR
607
+ #undef HWY_NATIVE_REDUCE_SCALAR
608
+ #else
609
+ #define HWY_NATIVE_REDUCE_SCALAR
610
+ #endif
611
+
612
+ namespace detail {
613
+
614
+ // Allows reusing the same shuffle code for SumOfLanes/MinOfLanes/MaxOfLanes.
615
+ struct AddFunc {
616
+ template <class V>
617
+ V operator()(V a, V b) const {
618
+ return Add(a, b);
619
+ }
620
+ };
621
+
622
+ struct MinFunc {
623
+ template <class V>
624
+ V operator()(V a, V b) const {
625
+ return Min(a, b);
626
+ }
627
+ };
628
+
629
+ struct MaxFunc {
630
+ template <class V>
631
+ V operator()(V a, V b) const {
632
+ return Max(a, b);
633
+ }
634
+ };
635
+
636
+ // No-op for vectors of at most one block.
637
+ template <class D, class Func, HWY_IF_V_SIZE_LE_D(D, 16)>
638
+ HWY_INLINE VFromD<D> ReduceAcrossBlocks(D, Func, VFromD<D> v) {
639
+ return v;
640
+ }
641
+
642
+ // Reduces a lane with its counterpart in other block(s). Shared by AVX2 and
643
+ // WASM_EMU256. AVX3 has its own overload.
644
+ template <class D, class Func, HWY_IF_V_SIZE_D(D, 32)>
645
+ HWY_INLINE VFromD<D> ReduceAcrossBlocks(D /*d*/, Func f, VFromD<D> v) {
646
+ return f(v, SwapAdjacentBlocks(v));
647
+ }
648
+
649
+ // These return the reduction result broadcasted across all lanes. They assume
650
+ // the caller has already reduced across blocks.
651
+
652
+ template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 2)>
653
+ HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v10) {
654
+ return f(v10, Reverse2(d, v10));
655
+ }
656
+
657
+ template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 4)>
658
+ HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v3210) {
659
+ const VFromD<D> v0123 = Reverse4(d, v3210);
660
+ const VFromD<D> v03_12_12_03 = f(v3210, v0123);
661
+ const VFromD<D> v12_03_03_12 = Reverse2(d, v03_12_12_03);
662
+ return f(v03_12_12_03, v12_03_03_12);
663
+ }
664
+
665
+ template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 8)>
666
+ HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v76543210) {
667
+ // The upper half is reversed from the lower half; omit for brevity.
668
+ const VFromD<D> v34_25_16_07 = f(v76543210, Reverse8(d, v76543210));
669
+ const VFromD<D> v0347_1625_1625_0347 =
670
+ f(v34_25_16_07, Reverse4(d, v34_25_16_07));
671
+ return f(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347));
672
+ }
673
+
674
+ template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 16), HWY_IF_U8_D(D)>
675
+ HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v) {
676
+ const RepartitionToWide<decltype(d)> dw;
677
+ using VW = VFromD<decltype(dw)>;
678
+ const VW vw = BitCast(dw, v);
679
+ // f is commutative, so no need to adapt for HWY_IS_LITTLE_ENDIAN.
680
+ const VW even = And(vw, Set(dw, 0xFF));
681
+ const VW odd = ShiftRight<8>(vw);
682
+ const VW reduced = ReduceWithinBlocks(dw, f, f(even, odd));
683
+ #if HWY_IS_LITTLE_ENDIAN
684
+ return DupEven(BitCast(d, reduced));
685
+ #else
686
+ return DupOdd(BitCast(d, reduced));
687
+ #endif
688
+ }
689
+
690
+ template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 16), HWY_IF_I8_D(D)>
691
+ HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v) {
692
+ const RepartitionToWide<decltype(d)> dw;
693
+ using VW = VFromD<decltype(dw)>;
694
+ const VW vw = BitCast(dw, v);
695
+ // Sign-extend
696
+ // f is commutative, so no need to adapt for HWY_IS_LITTLE_ENDIAN.
697
+ const VW even = ShiftRight<8>(ShiftLeft<8>(vw));
698
+ const VW odd = ShiftRight<8>(vw);
699
+ const VW reduced = ReduceWithinBlocks(dw, f, f(even, odd));
700
+ #if HWY_IS_LITTLE_ENDIAN
701
+ return DupEven(BitCast(d, reduced));
702
+ #else
703
+ return DupOdd(BitCast(d, reduced));
704
+ #endif
705
+ }
706
+
707
+ } // namespace detail
708
+
709
+ template <class D, HWY_IF_SUM_OF_LANES_D(D)>
710
+ HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
711
+ const detail::AddFunc f;
712
+ v = detail::ReduceAcrossBlocks(d, f, v);
713
+ return detail::ReduceWithinBlocks(d, f, v);
714
+ }
715
+ template <class D, HWY_IF_MINMAX_OF_LANES_D(D)>
716
+ HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
717
+ const detail::MinFunc f;
718
+ v = detail::ReduceAcrossBlocks(d, f, v);
719
+ return detail::ReduceWithinBlocks(d, f, v);
720
+ }
721
+ template <class D, HWY_IF_MINMAX_OF_LANES_D(D)>
722
+ HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
723
+ const detail::MaxFunc f;
724
+ v = detail::ReduceAcrossBlocks(d, f, v);
725
+ return detail::ReduceWithinBlocks(d, f, v);
726
+ }
727
+
728
+ template <class D, HWY_IF_REDUCE_D(D)>
729
+ HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
730
+ return GetLane(SumOfLanes(d, v));
731
+ }
732
+ template <class D, HWY_IF_REDUCE_D(D)>
733
+ HWY_API TFromD<D> ReduceMin(D d, VFromD<D> v) {
734
+ return GetLane(MinOfLanes(d, v));
735
+ }
736
+ template <class D, HWY_IF_REDUCE_D(D)>
737
+ HWY_API TFromD<D> ReduceMax(D d, VFromD<D> v) {
738
+ return GetLane(MaxOfLanes(d, v));
739
+ }
740
+
741
+ #endif // HWY_NATIVE_REDUCE_SCALAR
742
+
743
+ // Corner cases for both generic and native implementations:
744
+ // N=1 (native covers N=2 e.g. for u64x2 and even u32x2 on Arm)
745
+ template <class D, HWY_IF_LANES_D(D, 1)>
746
+ HWY_API TFromD<D> ReduceSum(D /*d*/, VFromD<D> v) {
747
+ return GetLane(v);
748
+ }
749
+ template <class D, HWY_IF_LANES_D(D, 1)>
750
+ HWY_API TFromD<D> ReduceMin(D /*d*/, VFromD<D> v) {
751
+ return GetLane(v);
752
+ }
753
+ template <class D, HWY_IF_LANES_D(D, 1)>
754
+ HWY_API TFromD<D> ReduceMax(D /*d*/, VFromD<D> v) {
755
+ return GetLane(v);
756
+ }
757
+
758
+ template <class D, HWY_IF_LANES_D(D, 1)>
759
+ HWY_API VFromD<D> SumOfLanes(D /* tag */, VFromD<D> v) {
760
+ return v;
761
+ }
762
+ template <class D, HWY_IF_LANES_D(D, 1)>
763
+ HWY_API VFromD<D> MinOfLanes(D /* tag */, VFromD<D> v) {
764
+ return v;
765
+ }
766
+ template <class D, HWY_IF_LANES_D(D, 1)>
767
+ HWY_API VFromD<D> MaxOfLanes(D /* tag */, VFromD<D> v) {
768
+ return v;
769
+ }
770
+
771
+ // N=4 for 8-bit is still less than the minimum native size.
772
+
773
+ // ARMv7 NEON/PPC/RVV/SVE have target-specific implementations of the N=4 I8/U8
774
+ // ReduceSum operations
775
+ #if (defined(HWY_NATIVE_REDUCE_SUM_4_UI8) == defined(HWY_TARGET_TOGGLE))
776
+ #ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
777
+ #undef HWY_NATIVE_REDUCE_SUM_4_UI8
778
+ #else
779
+ #define HWY_NATIVE_REDUCE_SUM_4_UI8
780
+ #endif
781
+ template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
782
+ HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
783
+ const Twice<RepartitionToWide<decltype(d)>> dw;
784
+ return static_cast<TFromD<D>>(ReduceSum(dw, PromoteTo(dw, v)));
785
+ }
786
+ #endif // HWY_NATIVE_REDUCE_SUM_4_UI8
787
+
788
+ // RVV/SVE have target-specific implementations of the N=4 I8/U8
789
+ // ReduceMin/ReduceMax operations
790
+ #if (defined(HWY_NATIVE_REDUCE_MINMAX_4_UI8) == defined(HWY_TARGET_TOGGLE))
791
+ #ifdef HWY_NATIVE_REDUCE_MINMAX_4_UI8
792
+ #undef HWY_NATIVE_REDUCE_MINMAX_4_UI8
793
+ #else
794
+ #define HWY_NATIVE_REDUCE_MINMAX_4_UI8
795
+ #endif
796
+ template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
797
+ HWY_API TFromD<D> ReduceMin(D d, VFromD<D> v) {
798
+ const Twice<RepartitionToWide<decltype(d)>> dw;
799
+ return static_cast<TFromD<D>>(ReduceMin(dw, PromoteTo(dw, v)));
800
+ }
801
+ template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
802
+ HWY_API TFromD<D> ReduceMax(D d, VFromD<D> v) {
803
+ const Twice<RepartitionToWide<decltype(d)>> dw;
804
+ return static_cast<TFromD<D>>(ReduceMax(dw, PromoteTo(dw, v)));
805
+ }
806
+ #endif // HWY_NATIVE_REDUCE_MINMAX_4_UI8
807
+
808
+ // ------------------------------ IsInf, IsFinite
809
+
810
+ // AVX3 has target-specific implementations of these.
811
+ #if (defined(HWY_NATIVE_ISINF) == defined(HWY_TARGET_TOGGLE))
812
+ #ifdef HWY_NATIVE_ISINF
813
+ #undef HWY_NATIVE_ISINF
814
+ #else
815
+ #define HWY_NATIVE_ISINF
816
+ #endif
817
+
818
+ template <class V, class D = DFromV<V>>
819
+ HWY_API MFromD<D> IsInf(const V v) {
820
+ using T = TFromD<D>;
821
+ const D d;
822
+ const RebindToUnsigned<decltype(d)> du;
823
+ const VFromD<decltype(du)> vu = BitCast(du, v);
824
+ // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
825
+ return RebindMask(
826
+ d,
827
+ Eq(Add(vu, vu),
828
+ Set(du, static_cast<MakeUnsigned<T>>(hwy::MaxExponentTimes2<T>()))));
829
+ }
830
+
831
+ // Returns whether normal/subnormal/zero.
832
+ template <class V, class D = DFromV<V>>
833
+ HWY_API MFromD<D> IsFinite(const V v) {
834
+ using T = TFromD<D>;
835
+ const D d;
836
+ const RebindToUnsigned<decltype(d)> du;
837
+ const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison
838
+ const VFromD<decltype(du)> vu = BitCast(du, v);
839
+ // 'Shift left' to clear the sign bit. MSVC seems to generate incorrect code
840
+ // for AVX2 if we instead add vu + vu.
841
+ #if HWY_COMPILER_MSVC
842
+ const VFromD<decltype(du)> shl = ShiftLeft<1>(vu);
843
+ #else
844
+ const VFromD<decltype(du)> shl = Add(vu, vu);
845
+ #endif
846
+
847
+ // Then shift right so we can compare with the max exponent (cannot compare
848
+ // with MaxExponentTimes2 directly because it is negative and non-negative
849
+ // floats would be greater).
850
+ const VFromD<decltype(di)> exp =
851
+ BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(shl));
852
+ return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
853
+ }
854
+
855
+ #endif // HWY_NATIVE_ISINF
856
+
857
+ // ------------------------------ LoadInterleaved2
215
858
 
216
859
  #if HWY_IDE || \
217
860
  (defined(HWY_NATIVE_LOAD_STORE_INTERLEAVED) == defined(HWY_TARGET_TOGGLE))
@@ -221,8 +864,6 @@ HWY_API V BitwiseIfThenElse(V mask, V yes, V no) {
221
864
  #define HWY_NATIVE_LOAD_STORE_INTERLEAVED
222
865
  #endif
223
866
 
224
- // ------------------------------ LoadInterleaved2
225
-
226
867
  template <class D, HWY_IF_LANES_GT_D(D, 1)>
227
868
  HWY_API void LoadInterleaved2(D d, const TFromD<D>* HWY_RESTRICT unaligned,
228
869
  VFromD<D>& v0, VFromD<D>& v1) {
@@ -277,6 +918,7 @@ HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
277
918
  VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
278
919
  const RebindToUnsigned<decltype(d)> du;
279
920
  using V = VFromD<D>;
921
+ using VU = VFromD<decltype(du)>;
280
922
  // Compact notation so these fit on one line: 12 := v1[2].
281
923
  V A; // 05 24 14 04 23 13 03 22 12 02 21 11 01 20 10 00
282
924
  V B; // 1a 0a 29 19 09 28 18 08 27 17 07 26 16 06 25 15
@@ -284,33 +926,33 @@ HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
284
926
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
285
927
  // Compress all lanes belonging to v0 into consecutive lanes.
286
928
  constexpr uint8_t Z = 0x80;
287
- alignas(16) static constexpr uint8_t kIdx_v0A[16] = {
288
- 0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z};
289
- alignas(16) static constexpr uint8_t kIdx_v0B[16] = {
290
- Z, Z, Z, Z, Z, Z, 2, 5, 8, 11, 14, Z, Z, Z, Z, Z};
291
- alignas(16) static constexpr uint8_t kIdx_v0C[16] = {
292
- Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 1, 4, 7, 10, 13};
293
- alignas(16) static constexpr uint8_t kIdx_v1A[16] = {
294
- 1, 4, 7, 10, 13, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z};
295
- alignas(16) static constexpr uint8_t kIdx_v1B[16] = {
296
- Z, Z, Z, Z, Z, 0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z};
297
- alignas(16) static constexpr uint8_t kIdx_v1C[16] = {
298
- Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 2, 5, 8, 11, 14};
299
- alignas(16) static constexpr uint8_t kIdx_v2A[16] = {
300
- 2, 5, 8, 11, 14, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z};
301
- alignas(16) static constexpr uint8_t kIdx_v2B[16] = {
302
- Z, Z, Z, Z, Z, 1, 4, 7, 10, 13, Z, Z, Z, Z, Z, Z};
303
- alignas(16) static constexpr uint8_t kIdx_v2C[16] = {
304
- Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0, 3, 6, 9, 12, 15};
305
- const V v0L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v0A)));
306
- const V v0M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v0B)));
307
- const V v0U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v0C)));
308
- const V v1L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v1A)));
309
- const V v1M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v1B)));
310
- const V v1U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v1C)));
311
- const V v2L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v2A)));
312
- const V v2M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v2B)));
313
- const V v2U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v2C)));
929
+ const VU idx_v0A =
930
+ Dup128VecFromValues(du, 0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
931
+ const VU idx_v0B =
932
+ Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, 2, 5, 8, 11, 14, Z, Z, Z, Z, Z);
933
+ const VU idx_v0C =
934
+ Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 1, 4, 7, 10, 13);
935
+ const VU idx_v1A =
936
+ Dup128VecFromValues(du, 1, 4, 7, 10, 13, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
937
+ const VU idx_v1B =
938
+ Dup128VecFromValues(du, Z, Z, Z, Z, Z, 0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z);
939
+ const VU idx_v1C =
940
+ Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 2, 5, 8, 11, 14);
941
+ const VU idx_v2A =
942
+ Dup128VecFromValues(du, 2, 5, 8, 11, 14, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
943
+ const VU idx_v2B =
944
+ Dup128VecFromValues(du, Z, Z, Z, Z, Z, 1, 4, 7, 10, 13, Z, Z, Z, Z, Z, Z);
945
+ const VU idx_v2C =
946
+ Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0, 3, 6, 9, 12, 15);
947
+ const V v0L = BitCast(d, TableLookupBytesOr0(A, idx_v0A));
948
+ const V v0M = BitCast(d, TableLookupBytesOr0(B, idx_v0B));
949
+ const V v0U = BitCast(d, TableLookupBytesOr0(C, idx_v0C));
950
+ const V v1L = BitCast(d, TableLookupBytesOr0(A, idx_v1A));
951
+ const V v1M = BitCast(d, TableLookupBytesOr0(B, idx_v1B));
952
+ const V v1U = BitCast(d, TableLookupBytesOr0(C, idx_v1C));
953
+ const V v2L = BitCast(d, TableLookupBytesOr0(A, idx_v2A));
954
+ const V v2M = BitCast(d, TableLookupBytesOr0(B, idx_v2B));
955
+ const V v2U = BitCast(d, TableLookupBytesOr0(C, idx_v2C));
314
956
  v0 = Xor3(v0L, v0M, v0U);
315
957
  v1 = Xor3(v1L, v1M, v1U);
316
958
  v2 = Xor3(v2L, v2M, v2U);
@@ -322,30 +964,40 @@ HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
322
964
  VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
323
965
  const RebindToUnsigned<decltype(d)> du;
324
966
  using V = VFromD<D>;
967
+ using VU = VFromD<decltype(du)>;
325
968
  V A; // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0]
326
969
  V B; // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2]
327
970
  V C; // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5]
328
971
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
329
972
  // Compress all lanes belonging to v0 into consecutive lanes.
330
973
  constexpr uint8_t Z = 0x80;
331
- alignas(16) static constexpr uint8_t kIdx_v0A[16] = {0, 3, 6, Z, Z, Z, Z, Z};
332
- alignas(16) static constexpr uint8_t kIdx_v0B[16] = {Z, Z, Z, 1, 4, 7, Z, Z};
333
- alignas(16) static constexpr uint8_t kIdx_v0C[16] = {Z, Z, Z, Z, Z, Z, 2, 5};
334
- alignas(16) static constexpr uint8_t kIdx_v1A[16] = {1, 4, 7, Z, Z, Z, Z, Z};
335
- alignas(16) static constexpr uint8_t kIdx_v1B[16] = {Z, Z, Z, 2, 5, Z, Z, Z};
336
- alignas(16) static constexpr uint8_t kIdx_v1C[16] = {Z, Z, Z, Z, Z, 0, 3, 6};
337
- alignas(16) static constexpr uint8_t kIdx_v2A[16] = {2, 5, Z, Z, Z, Z, Z, Z};
338
- alignas(16) static constexpr uint8_t kIdx_v2B[16] = {Z, Z, 0, 3, 6, Z, Z, Z};
339
- alignas(16) static constexpr uint8_t kIdx_v2C[16] = {Z, Z, Z, Z, Z, 1, 4, 7};
340
- const V v0L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v0A)));
341
- const V v0M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v0B)));
342
- const V v0U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v0C)));
343
- const V v1L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v1A)));
344
- const V v1M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v1B)));
345
- const V v1U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v1C)));
346
- const V v2L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v2A)));
347
- const V v2M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v2B)));
348
- const V v2U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v2C)));
974
+ const VU idx_v0A =
975
+ Dup128VecFromValues(du, 0, 3, 6, Z, Z, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
976
+ const VU idx_v0B =
977
+ Dup128VecFromValues(du, Z, Z, Z, 1, 4, 7, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
978
+ const VU idx_v0C =
979
+ Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, 2, 5, 0, 0, 0, 0, 0, 0, 0, 0);
980
+ const VU idx_v1A =
981
+ Dup128VecFromValues(du, 1, 4, 7, Z, Z, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
982
+ const VU idx_v1B =
983
+ Dup128VecFromValues(du, Z, Z, Z, 2, 5, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
984
+ const VU idx_v1C =
985
+ Dup128VecFromValues(du, Z, Z, Z, Z, Z, 0, 3, 6, 0, 0, 0, 0, 0, 0, 0, 0);
986
+ const VU idx_v2A =
987
+ Dup128VecFromValues(du, 2, 5, Z, Z, Z, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
988
+ const VU idx_v2B =
989
+ Dup128VecFromValues(du, Z, Z, 0, 3, 6, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
990
+ const VU idx_v2C =
991
+ Dup128VecFromValues(du, Z, Z, Z, Z, Z, 1, 4, 7, 0, 0, 0, 0, 0, 0, 0, 0);
992
+ const V v0L = BitCast(d, TableLookupBytesOr0(A, idx_v0A));
993
+ const V v0M = BitCast(d, TableLookupBytesOr0(B, idx_v0B));
994
+ const V v0U = BitCast(d, TableLookupBytesOr0(C, idx_v0C));
995
+ const V v1L = BitCast(d, TableLookupBytesOr0(A, idx_v1A));
996
+ const V v1M = BitCast(d, TableLookupBytesOr0(B, idx_v1B));
997
+ const V v1U = BitCast(d, TableLookupBytesOr0(C, idx_v1C));
998
+ const V v2L = BitCast(d, TableLookupBytesOr0(A, idx_v2A));
999
+ const V v2M = BitCast(d, TableLookupBytesOr0(B, idx_v2B));
1000
+ const V v2U = BitCast(d, TableLookupBytesOr0(C, idx_v2C));
349
1001
  v0 = Xor3(v0L, v0M, v0U);
350
1002
  v1 = Xor3(v1L, v1M, v1U);
351
1003
  v2 = Xor3(v2L, v2M, v2U);
@@ -358,6 +1010,7 @@ HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
358
1010
  const RebindToUnsigned<decltype(d)> du;
359
1011
  const Repartition<uint8_t, decltype(du)> du8;
360
1012
  using V = VFromD<D>;
1013
+ using VU8 = VFromD<decltype(du8)>;
361
1014
  V A; // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0]
362
1015
  V B; // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2]
363
1016
  V C; // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5]
@@ -365,33 +1018,33 @@ HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
365
1018
  // Compress all lanes belonging to v0 into consecutive lanes. Same as above,
366
1019
  // but each element of the array contains a byte index for a byte of a lane.
367
1020
  constexpr uint8_t Z = 0x80;
368
- alignas(16) static constexpr uint8_t kIdx_v0A[16] = {
369
- 0x00, 0x01, 0x06, 0x07, 0x0C, 0x0D, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z};
370
- alignas(16) static constexpr uint8_t kIdx_v0B[16] = {
371
- Z, Z, Z, Z, Z, Z, 0x02, 0x03, 0x08, 0x09, 0x0E, 0x0F, Z, Z, Z, Z};
372
- alignas(16) static constexpr uint8_t kIdx_v0C[16] = {
373
- Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0x04, 0x05, 0x0A, 0x0B};
374
- alignas(16) static constexpr uint8_t kIdx_v1A[16] = {
375
- 0x02, 0x03, 0x08, 0x09, 0x0E, 0x0F, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z};
376
- alignas(16) static constexpr uint8_t kIdx_v1B[16] = {
377
- Z, Z, Z, Z, Z, Z, 0x04, 0x05, 0x0A, 0x0B, Z, Z, Z, Z, Z, Z};
378
- alignas(16) static constexpr uint8_t kIdx_v1C[16] = {
379
- Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0x00, 0x01, 0x06, 0x07, 0x0C, 0x0D};
380
- alignas(16) static constexpr uint8_t kIdx_v2A[16] = {
381
- 0x04, 0x05, 0x0A, 0x0B, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z};
382
- alignas(16) static constexpr uint8_t kIdx_v2B[16] = {
383
- Z, Z, Z, Z, 0x00, 0x01, 0x06, 0x07, 0x0C, 0x0D, Z, Z, Z, Z, Z, Z};
384
- alignas(16) static constexpr uint8_t kIdx_v2C[16] = {
385
- Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0x02, 0x03, 0x08, 0x09, 0x0E, 0x0F};
386
- const V v0L = TableLookupBytesOr0(A, BitCast(d, LoadDup128(du8, kIdx_v0A)));
387
- const V v0M = TableLookupBytesOr0(B, BitCast(d, LoadDup128(du8, kIdx_v0B)));
388
- const V v0U = TableLookupBytesOr0(C, BitCast(d, LoadDup128(du8, kIdx_v0C)));
389
- const V v1L = TableLookupBytesOr0(A, BitCast(d, LoadDup128(du8, kIdx_v1A)));
390
- const V v1M = TableLookupBytesOr0(B, BitCast(d, LoadDup128(du8, kIdx_v1B)));
391
- const V v1U = TableLookupBytesOr0(C, BitCast(d, LoadDup128(du8, kIdx_v1C)));
392
- const V v2L = TableLookupBytesOr0(A, BitCast(d, LoadDup128(du8, kIdx_v2A)));
393
- const V v2M = TableLookupBytesOr0(B, BitCast(d, LoadDup128(du8, kIdx_v2B)));
394
- const V v2U = TableLookupBytesOr0(C, BitCast(d, LoadDup128(du8, kIdx_v2C)));
1021
+ const VU8 idx_v0A = Dup128VecFromValues(du8, 0x00, 0x01, 0x06, 0x07, 0x0C,
1022
+ 0x0D, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
1023
+ const VU8 idx_v0B = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, 0x02, 0x03,
1024
+ 0x08, 0x09, 0x0E, 0x0F, Z, Z, Z, Z);
1025
+ const VU8 idx_v0C = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
1026
+ Z, 0x04, 0x05, 0x0A, 0x0B);
1027
+ const VU8 idx_v1A = Dup128VecFromValues(du8, 0x02, 0x03, 0x08, 0x09, 0x0E,
1028
+ 0x0F, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
1029
+ const VU8 idx_v1B = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, 0x04, 0x05,
1030
+ 0x0A, 0x0B, Z, Z, Z, Z, Z, Z);
1031
+ const VU8 idx_v1C = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
1032
+ 0x00, 0x01, 0x06, 0x07, 0x0C, 0x0D);
1033
+ const VU8 idx_v2A = Dup128VecFromValues(du8, 0x04, 0x05, 0x0A, 0x0B, Z, Z, Z,
1034
+ Z, Z, Z, Z, Z, Z, Z, Z, Z);
1035
+ const VU8 idx_v2B = Dup128VecFromValues(du8, Z, Z, Z, Z, 0x00, 0x01, 0x06,
1036
+ 0x07, 0x0C, 0x0D, Z, Z, Z, Z, Z, Z);
1037
+ const VU8 idx_v2C = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
1038
+ 0x02, 0x03, 0x08, 0x09, 0x0E, 0x0F);
1039
+ const V v0L = TableLookupBytesOr0(A, BitCast(d, idx_v0A));
1040
+ const V v0M = TableLookupBytesOr0(B, BitCast(d, idx_v0B));
1041
+ const V v0U = TableLookupBytesOr0(C, BitCast(d, idx_v0C));
1042
+ const V v1L = TableLookupBytesOr0(A, BitCast(d, idx_v1A));
1043
+ const V v1M = TableLookupBytesOr0(B, BitCast(d, idx_v1B));
1044
+ const V v1U = TableLookupBytesOr0(C, BitCast(d, idx_v1C));
1045
+ const V v2L = TableLookupBytesOr0(A, BitCast(d, idx_v2A));
1046
+ const V v2M = TableLookupBytesOr0(B, BitCast(d, idx_v2B));
1047
+ const V v2U = TableLookupBytesOr0(C, BitCast(d, idx_v2C));
395
1048
  v0 = Xor3(v0L, v0M, v0U);
396
1049
  v1 = Xor3(v1L, v1M, v1U);
397
1050
  v2 = Xor3(v2L, v2M, v2U);
@@ -644,16 +1297,16 @@ HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
644
1297
  // v0[5], v2[4],v1[4],v0[4] .. v2[0],v1[0],v0[0]. We're expanding v0 lanes
645
1298
  // to their place, with 0x80 so lanes to be filled from other vectors are 0
646
1299
  // to enable blending by ORing together.
647
- alignas(16) static constexpr uint8_t tbl_v0[16] = {
648
- 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
649
- 3, 0x80, 0x80, 4, 0x80, 0x80, 5};
650
- alignas(16) static constexpr uint8_t tbl_v1[16] = {
651
- 0x80, 0, 0x80, 0x80, 1, 0x80, //
652
- 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
1300
+ const VFromD<decltype(du)> shuf_A0 =
1301
+ Dup128VecFromValues(du, 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3,
1302
+ 0x80, 0x80, 4, 0x80, 0x80, 5);
1303
+ // Cannot reuse shuf_A0 because it contains 5.
1304
+ const VFromD<decltype(du)> shuf_A1 =
1305
+ Dup128VecFromValues(du, 0x80, 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,
1306
+ 3, 0x80, 0x80, 4, 0x80, 0x80);
653
1307
  // The interleaved vectors will be named A, B, C; temporaries with suffix
654
1308
  // 0..2 indicate which input vector's lanes they hold.
655
- const auto shuf_A0 = LoadDup128(du, tbl_v0);
656
- const auto shuf_A1 = LoadDup128(du, tbl_v1); // cannot reuse shuf_A0 (has 5)
1309
+ // cannot reuse shuf_A0 (has 5)
657
1310
  const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
658
1311
  const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // 5..4..3..2..1..0
659
1312
  const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // ..4..3..2..1..0.
@@ -693,19 +1346,16 @@ HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
693
1346
  // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be
694
1347
  // filled from other vectors are 0 for blending. Note that these are byte
695
1348
  // indices for 16-bit lanes.
696
- alignas(16) static constexpr uint8_t tbl_v1[16] = {
697
- 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80,
698
- 2, 3, 0x80, 0x80, 0x80, 0x80, 4, 5};
699
- alignas(16) static constexpr uint8_t tbl_v2[16] = {
700
- 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80,
701
- 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80};
1349
+ const VFromD<decltype(du8)> shuf_A1 =
1350
+ Dup128VecFromValues(du8, 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80, 2, 3,
1351
+ 0x80, 0x80, 0x80, 0x80, 4, 5);
1352
+ const VFromD<decltype(du8)> shuf_A2 =
1353
+ Dup128VecFromValues(du8, 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80,
1354
+ 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80);
702
1355
 
703
1356
  // The interleaved vectors will be named A, B, C; temporaries with suffix
704
1357
  // 0..2 indicate which input vector's lanes they hold.
705
- const auto shuf_A1 = LoadDup128(du8, tbl_v1); // 2..1..0.
706
- // .2..1..0
707
1358
  const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);
708
- const auto shuf_A2 = LoadDup128(du8, tbl_v2); // ..1..0..
709
1359
 
710
1360
  const auto A0 = TableLookupBytesOr0(v0, shuf_A0);
711
1361
  const auto A1 = TableLookupBytesOr0(v1, shuf_A1);
@@ -1104,19 +1754,22 @@ HWY_INLINE VFromD<DTo> LoadNResizeBitCast(DTo d_to, DFrom d_from,
1104
1754
 
1105
1755
  } // namespace detail
1106
1756
 
1107
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 1)>
1757
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 1),
1758
+ HWY_IF_NOT_BF16_D(D)>
1108
1759
  HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
1109
1760
  size_t num_lanes) {
1110
1761
  return (num_lanes > 0) ? LoadU(d, p) : Zero(d);
1111
1762
  }
1112
1763
 
1113
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 1)>
1764
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 1),
1765
+ HWY_IF_NOT_BF16_D(D)>
1114
1766
  HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
1115
1767
  size_t num_lanes) {
1116
1768
  return (num_lanes > 0) ? LoadU(d, p) : no;
1117
1769
  }
1118
1770
 
1119
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)>
1771
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2),
1772
+ HWY_IF_NOT_BF16_D(D)>
1120
1773
  HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
1121
1774
  size_t num_lanes) {
1122
1775
  const FixedTag<TFromD<D>, 1> d1;
@@ -1126,7 +1779,8 @@ HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
1126
1779
  return detail::LoadNResizeBitCast(d, d1, LoadU(d1, p));
1127
1780
  }
1128
1781
 
1129
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)>
1782
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2),
1783
+ HWY_IF_NOT_BF16_D(D)>
1130
1784
  HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
1131
1785
  size_t num_lanes) {
1132
1786
  const FixedTag<TFromD<D>, 1> d1;
@@ -1136,7 +1790,8 @@ HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
1136
1790
  return InterleaveLower(ResizeBitCast(d, LoadU(d1, p)), no);
1137
1791
  }
1138
1792
 
1139
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)>
1793
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4),
1794
+ HWY_IF_NOT_BF16_D(D)>
1140
1795
  HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
1141
1796
  size_t num_lanes) {
1142
1797
  const FixedTag<TFromD<D>, 2> d2;
@@ -1151,7 +1806,8 @@ HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
1151
1806
  return (num_lanes == 2) ? v_lo : InsertLane(v_lo, 2, p[2]);
1152
1807
  }
1153
1808
 
1154
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)>
1809
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4),
1810
+ HWY_IF_NOT_BF16_D(D)>
1155
1811
  HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
1156
1812
  size_t num_lanes) {
1157
1813
  const FixedTag<TFromD<D>, 2> d2;
@@ -1166,7 +1822,8 @@ HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
1166
1822
  return (num_lanes == 2) ? v_lo : InsertLane(v_lo, 2, p[2]);
1167
1823
  }
1168
1824
 
1169
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)>
1825
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8),
1826
+ HWY_IF_NOT_BF16_D(D)>
1170
1827
  HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
1171
1828
  size_t num_lanes) {
1172
1829
  const FixedTag<TFromD<D>, 4> d4;
@@ -1201,7 +1858,8 @@ HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
1201
1858
  }
1202
1859
  }
1203
1860
 
1204
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)>
1861
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8),
1862
+ HWY_IF_NOT_BF16_D(D)>
1205
1863
  HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
1206
1864
  size_t num_lanes) {
1207
1865
  const FixedTag<TFromD<D>, 4> d4;
@@ -1238,7 +1896,8 @@ HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
1238
1896
  }
1239
1897
  }
1240
1898
 
1241
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 16)>
1899
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 16),
1900
+ HWY_IF_NOT_BF16_D(D)>
1242
1901
  HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
1243
1902
  size_t num_lanes) {
1244
1903
  const FixedTag<TFromD<D>, 8> d8;
@@ -1283,7 +1942,8 @@ HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
1283
1942
  }
1284
1943
  }
1285
1944
 
1286
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 16)>
1945
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 16),
1946
+ HWY_IF_NOT_BF16_D(D)>
1287
1947
  HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
1288
1948
  size_t num_lanes) {
1289
1949
  const FixedTag<TFromD<D>, 8> d8;
@@ -1338,7 +1998,7 @@ HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
1338
1998
 
1339
1999
  #if HWY_MAX_BYTES >= 32
1340
2000
 
1341
- template <class D, HWY_IF_V_SIZE_GT_D(D, 16)>
2001
+ template <class D, HWY_IF_V_SIZE_GT_D(D, 16), HWY_IF_NOT_BF16_D(D)>
1342
2002
  HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
1343
2003
  size_t num_lanes) {
1344
2004
  if (num_lanes >= Lanes(d)) return LoadU(d, p);
@@ -1354,7 +2014,7 @@ HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
1354
2014
  }
1355
2015
  }
1356
2016
 
1357
- template <class D, HWY_IF_V_SIZE_GT_D(D, 16)>
2017
+ template <class D, HWY_IF_V_SIZE_GT_D(D, 16), HWY_IF_NOT_BF16_D(D)>
1358
2018
  HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
1359
2019
  size_t num_lanes) {
1360
2020
  if (num_lanes >= Lanes(d)) return LoadU(d, p);
@@ -1374,7 +2034,23 @@ HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
1374
2034
  }
1375
2035
 
1376
2036
  #endif // HWY_MAX_BYTES >= 32
1377
- #else // !HWY_MEM_OPS_MIGHT_FAULT || HWY_HAVE_SCALABLE
2037
+
2038
+ template <class D, HWY_IF_BF16_D(D)>
2039
+ HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
2040
+ size_t num_lanes) {
2041
+ const RebindToUnsigned<D> du;
2042
+ return BitCast(d, LoadN(du, detail::U16LanePointer(p), num_lanes));
2043
+ }
2044
+
2045
+ template <class D, HWY_IF_BF16_D(D)>
2046
+ HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
2047
+ size_t num_lanes) {
2048
+ const RebindToUnsigned<D> du;
2049
+ return BitCast(
2050
+ d, LoadNOr(BitCast(du, no), du, detail::U16LanePointer(p), num_lanes));
2051
+ }
2052
+
2053
+ #else // !HWY_MEM_OPS_MIGHT_FAULT || HWY_HAVE_SCALABLE
1378
2054
 
1379
2055
  // For SVE and non-sanitizer AVX-512; RVV has its own specialization.
1380
2056
  template <class D>
@@ -1549,9 +2225,7 @@ HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
1549
2225
 
1550
2226
  BlendedStore(v, FirstN(d, clamped_max_lanes_to_store), d, p);
1551
2227
 
1552
- #if HWY_MEM_OPS_MIGHT_FAULT
1553
2228
  detail::MaybeUnpoison(p, clamped_max_lanes_to_store);
1554
- #endif
1555
2229
  }
1556
2230
  #endif // HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE
1557
2231
 
@@ -1649,6 +2323,7 @@ HWY_API VFromD<D> GatherOffset(D d, const T* HWY_RESTRICT base,
1649
2323
  HWY_ALIGN T lanes[MaxLanes(d)];
1650
2324
  const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
1651
2325
  for (size_t i = 0; i < MaxLanes(d); ++i) {
2326
+ HWY_DASSERT(offset_lanes[i] >= 0);
1652
2327
  CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
1653
2328
  }
1654
2329
  return Load(d, lanes);
@@ -1666,6 +2341,7 @@ HWY_API VFromD<D> GatherIndex(D d, const T* HWY_RESTRICT base,
1666
2341
 
1667
2342
  HWY_ALIGN T lanes[MaxLanes(d)];
1668
2343
  for (size_t i = 0; i < MaxLanes(d); ++i) {
2344
+ HWY_DASSERT(index_lanes[i] >= 0);
1669
2345
  lanes[i] = base[index_lanes[i]];
1670
2346
  }
1671
2347
  return Load(d, lanes);
@@ -1687,11 +2363,37 @@ HWY_API VFromD<D> MaskedGatherIndex(MFromD<D> m, D d,
1687
2363
 
1688
2364
  HWY_ALIGN T lanes[MaxLanes(d)];
1689
2365
  for (size_t i = 0; i < MaxLanes(d); ++i) {
2366
+ HWY_DASSERT(index_lanes[i] >= 0);
1690
2367
  lanes[i] = mask_lanes[i] ? base[index_lanes[i]] : T{0};
1691
2368
  }
1692
2369
  return Load(d, lanes);
1693
2370
  }
1694
2371
 
2372
+ template <class D, typename T = TFromD<D>>
2373
+ HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D d,
2374
+ const T* HWY_RESTRICT base,
2375
+ VFromD<RebindToSigned<D>> index) {
2376
+ const RebindToSigned<D> di;
2377
+ using TI = TFromD<decltype(di)>;
2378
+ static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
2379
+
2380
+ HWY_ALIGN TI index_lanes[MaxLanes(di)];
2381
+ Store(index, di, index_lanes);
2382
+
2383
+ HWY_ALIGN TI mask_lanes[MaxLanes(di)];
2384
+ Store(BitCast(di, VecFromMask(d, m)), di, mask_lanes);
2385
+
2386
+ HWY_ALIGN T no_lanes[MaxLanes(d)];
2387
+ Store(no, d, no_lanes);
2388
+
2389
+ HWY_ALIGN T lanes[MaxLanes(d)];
2390
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
2391
+ HWY_DASSERT(index_lanes[i] >= 0);
2392
+ lanes[i] = mask_lanes[i] ? base[index_lanes[i]] : no_lanes[i];
2393
+ }
2394
+ return Load(d, lanes);
2395
+ }
2396
+
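// Editorial sketch in plain C++ (not from the package diff above): a scalar
// reference for the MaskedGatherIndexOr added here. The only difference from
// MaskedGatherIndex is that masked-off lanes receive the corresponding lane of
// `no` instead of zero; indices are assumed non-negative, as asserted above.
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<float> MaskedGatherIndexOrScalar(const std::vector<float>& no,
                                             const std::vector<bool>& mask,
                                             const float* base,
                                             const std::vector<int32_t>& index) {
  std::vector<float> out(index.size());
  for (size_t i = 0; i < index.size(); ++i) {
    out[i] = mask[i] ? base[index[i]] : no[i];
  }
  return out;
}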
1695
2397
  #endif // (defined(HWY_NATIVE_GATHER) == defined(HWY_TARGET_TOGGLE))
1696
2398
 
1697
2399
  // ------------------------------ ScatterN/GatherN
@@ -1733,10 +2435,14 @@ HWY_API V AbsDiff(V a, V b) {
1733
2435
  #define HWY_NATIVE_SUMS_OF_8_ABS_DIFF
1734
2436
  #endif
1735
2437
 
1736
- template <class V, HWY_IF_U8_D(DFromV<V>),
2438
+ template <class V, HWY_IF_UI8_D(DFromV<V>),
1737
2439
  HWY_IF_V_SIZE_GT_D(DFromV<V>, (HWY_TARGET == HWY_SCALAR ? 0 : 4))>
1738
- HWY_API Vec<Repartition<uint64_t, DFromV<V>>> SumsOf8AbsDiff(V a, V b) {
1739
- return SumsOf8(AbsDiff(a, b));
2440
+ HWY_API Vec<RepartitionToWideX3<DFromV<V>>> SumsOf8AbsDiff(V a, V b) {
2441
+ const DFromV<decltype(a)> d;
2442
+ const RebindToUnsigned<decltype(d)> du;
2443
+ const RepartitionToWideX3<decltype(d)> dw;
2444
+
2445
+ return BitCast(dw, SumsOf8(BitCast(du, AbsDiff(a, b))));
1740
2446
  }
1741
2447
 
1742
2448
  #endif // HWY_NATIVE_SUMS_OF_8_ABS_DIFF
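// Editorial sketch in plain C++ (not from the package diff above): a scalar
// reference for SumsOf8AbsDiff with u8 inputs. Each u64 output lane holds the
// sum of |a[j] - b[j]| over 8 adjacent byte lanes; the change above merely
// widens the overload to also accept i8 vectors by bitcasting the absolute
// difference to u8 before SumsOf8.
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<uint64_t> SumsOf8AbsDiffScalar(const std::vector<uint8_t>& a,
                                           const std::vector<uint8_t>& b) {
  std::vector<uint64_t> out(a.size() / 8, 0);
  for (size_t i = 0; i < out.size(); ++i) {
    for (size_t j = 0; j < 8; ++j) {
      const int diff =
          static_cast<int>(a[8 * i + j]) - static_cast<int>(b[8 * i + j]);
      out[i] += static_cast<uint64_t>(diff < 0 ? -diff : diff);
    }
  }
  return out;
}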
@@ -1923,6 +2629,248 @@ HWY_API VFromD<D> PromoteUpperTo(D d, V v) {
1923
2629
  #endif // HWY_TARGET != HWY_SCALAR
1924
2630
  #endif // HWY_NATIVE_PROMOTE_UPPER_TO
1925
2631
 
2632
+ // ------------------------------ PromoteEvenTo/PromoteOddTo
2633
+
2634
+ #if HWY_TARGET != HWY_SCALAR
2635
+ namespace detail {
2636
+
2637
+ // Tag dispatch is used in detail::PromoteEvenTo and detail::PromoteOddTo as
2638
+ // there are target-specific specializations for some of the
2639
+ // detail::PromoteEvenTo and detail::PromoteOddTo cases on
2640
+ // SVE/PPC/SSE2/SSSE3/SSE4/AVX2.
2641
+
2642
+ // All targets except HWY_SCALAR use the implementations of
2643
+ // detail::PromoteEvenTo and detail::PromoteOddTo in generic_ops-inl.h for at
2644
+ // least some of the PromoteEvenTo and PromoteOddTo cases.
2645
+
2646
+ // Signed to signed PromoteEvenTo/PromoteOddTo
2647
+ template <size_t kToLaneSize, class D, class V>
2648
+ HWY_INLINE VFromD<D> PromoteEvenTo(
2649
+ hwy::SignedTag /*to_type_tag*/,
2650
+ hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
2651
+ hwy::SignedTag /*from_type_tag*/, D d_to, V v) {
2652
+ #if HWY_IS_LITTLE_ENDIAN
2653
+ // On little-endian targets, need to shift each lane of the bitcasted vector
2654
+ // left by kToLaneSize * 4 bits to get the bits of the even source lanes into
2655
+ // the upper kToLaneSize * 4 bits of even_in_hi.
2656
+ const auto even_in_hi = ShiftLeft<kToLaneSize * 4>(BitCast(d_to, v));
2657
+ #else
2658
+ // On big-endian targets, the bits of the even source lanes are already in
2659
+ // the upper kToLaneSize * 4 bits of the lanes of the bitcasted vector.
2660
+ const auto even_in_hi = BitCast(d_to, v);
2661
+ #endif
2662
+
2663
+ // Right-shift even_in_hi by kToLaneSize * 4 bits
2664
+ return ShiftRight<kToLaneSize * 4>(even_in_hi);
2665
+ }
2666
+
2667
+ template <size_t kToLaneSize, class D, class V>
2668
+ HWY_INLINE VFromD<D> PromoteOddTo(
2669
+ hwy::SignedTag /*to_type_tag*/,
2670
+ hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
2671
+ hwy::SignedTag /*from_type_tag*/, D d_to, V v) {
2672
+ #if HWY_IS_LITTLE_ENDIAN
2673
+ // On little-endian targets, the bits of the odd source lanes are already in
2674
+ // the upper kToLaneSize * 4 bits of the lanes of the bitcasted vector.
2675
+ const auto odd_in_hi = BitCast(d_to, v);
2676
+ #else
2677
+ // On big-endian targets, need to shift each lane of the bitcasted vector left
2678
+ // by kToLaneSize * 4 bits to get the bits of the odd source lanes into the
2679
+ // upper kToLaneSize * 4 bits of odd_in_hi.
2680
+ const auto odd_in_hi = ShiftLeft<kToLaneSize * 4>(BitCast(d_to, v));
2681
+ #endif
2682
+
2683
+ // Right-shift odd_in_hi by kToLaneSize * 4 bits
2684
+ return ShiftRight<kToLaneSize * 4>(odd_in_hi);
2685
+ }
2686
+
2687
+ // Unsigned to unsigned PromoteEvenTo/PromoteOddTo
2688
+ template <size_t kToLaneSize, class D, class V>
2689
+ HWY_INLINE VFromD<D> PromoteEvenTo(
2690
+ hwy::UnsignedTag /*to_type_tag*/,
2691
+ hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
2692
+ hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
2693
+ #if HWY_IS_LITTLE_ENDIAN
2694
+ // On little-endian targets, the bits of the even source lanes are already
2695
+ // in the lower kToLaneSize * 4 bits of the lanes of the bitcasted vector.
2696
+
2697
+ // Simply need to zero out the upper bits of each lane of the bitcasted
2698
+ // vector.
2699
+ return And(BitCast(d_to, v),
2700
+ Set(d_to, static_cast<TFromD<D>>(LimitsMax<TFromV<V>>())));
2701
+ #else
2702
+ // On big-endian targets, need to shift each lane of the bitcasted vector
2703
+ // right by kToLaneSize * 4 bits to get the bits of the even source lanes into
2704
+ // the lower kToLaneSize * 4 bits of the result.
2705
+
2706
+ // The right shift below will zero out the upper kToLaneSize * 4 bits of the
2707
+ // result.
2708
+ return ShiftRight<kToLaneSize * 4>(BitCast(d_to, v));
2709
+ #endif
2710
+ }
2711
+
2712
+ template <size_t kToLaneSize, class D, class V>
2713
+ HWY_INLINE VFromD<D> PromoteOddTo(
2714
+ hwy::UnsignedTag /*to_type_tag*/,
2715
+ hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
2716
+ hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
2717
+ #if HWY_IS_LITTLE_ENDIAN
2718
+ // On little-endian targets, need to shift each lane of the bitcasted vector
2719
+ // right by kToLaneSize * 4 bits to get the bits of the odd source lanes into
2720
+ // the lower kToLaneSize * 4 bits of the result.
2721
+
2722
+ // The right shift below will zero out the upper kToLaneSize * 4 bits of the
2723
+ // result.
2724
+ return ShiftRight<kToLaneSize * 4>(BitCast(d_to, v));
2725
+ #else
2726
+ // On big-endian targets, the bits of the odd source lanes are already
2727
+ // in the lower kToLaneSize * 4 bits of the lanes of the bitcasted vector.
2728
+
2729
+ // Simply need to zero out the upper bits of each lane of the bitcasted
2730
+ // vector.
2731
+ return And(BitCast(d_to, v),
2732
+ Set(d_to, static_cast<TFromD<D>>(LimitsMax<TFromV<V>>())));
2733
+ #endif
2734
+ }
2735
+
2736
+ // Unsigned to signed: Same as unsigned->unsigned PromoteEvenTo/PromoteOddTo
2737
+ // followed by BitCast to signed
2738
+ template <size_t kToLaneSize, class D, class V>
2739
+ HWY_INLINE VFromD<D> PromoteEvenTo(
2740
+ hwy::SignedTag /*to_type_tag*/,
2741
+ hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
2742
+ hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
2743
+ const RebindToUnsigned<decltype(d_to)> du_to;
2744
+ return BitCast(d_to,
2745
+ PromoteEvenTo(hwy::UnsignedTag(), hwy::SizeTag<kToLaneSize>(),
2746
+ hwy::UnsignedTag(), du_to, v));
2747
+ }
2748
+
2749
+ template <size_t kToLaneSize, class D, class V>
2750
+ HWY_INLINE VFromD<D> PromoteOddTo(
2751
+ hwy::SignedTag /*to_type_tag*/,
2752
+ hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
2753
+ hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
2754
+ const RebindToUnsigned<decltype(d_to)> du_to;
2755
+ return BitCast(d_to,
2756
+ PromoteOddTo(hwy::UnsignedTag(), hwy::SizeTag<kToLaneSize>(),
2757
+ hwy::UnsignedTag(), du_to, v));
2758
+ }
2759
+
2760
+ // BF16->F32 PromoteEvenTo
2761
+
2762
+ // NOTE: It is possible for FromTypeTag to be hwy::SignedTag or hwy::UnsignedTag
2763
+ // instead of hwy::FloatTag on targets that use scalable vectors.
2764
+
2765
+ // VBF16 is considered to be a bfloat16_t vector if TFromV<VBF16> is the same
2766
+ // type as TFromV<VFromD<Repartition<bfloat16_t, DF32>>>
2767
+
2768
+ // The BF16->F32 PromoteEvenTo overload is only enabled if VBF16 is considered
2769
+ // to be a bfloat16_t vector.
2770
+ template <class FromTypeTag, class DF32, class VBF16,
2771
+ class VBF16_2 = VFromD<Repartition<bfloat16_t, DF32>>,
2772
+ hwy::EnableIf<IsSame<TFromV<VBF16>, TFromV<VBF16_2>>()>* = nullptr>
2773
+ HWY_INLINE VFromD<DF32> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
2774
+ hwy::SizeTag<4> /*to_lane_size_tag*/,
2775
+ FromTypeTag /*from_type_tag*/, DF32 d_to,
2776
+ VBF16 v) {
2777
+ const RebindToUnsigned<decltype(d_to)> du_to;
2778
+ #if HWY_IS_LITTLE_ENDIAN
2779
+ // On little-endian platforms, need to shift left each lane of the bitcasted
2780
+ // vector by 16 bits.
2781
+ return BitCast(d_to, ShiftLeft<16>(BitCast(du_to, v)));
2782
+ #else
2783
+ // On big-endian platforms, the even lanes of the source vector are already
2784
+ // in the upper 16 bits of the lanes of the bitcasted vector.
2785
+
2786
+ // Need to simply zero out the lower 16 bits of each lane of the bitcasted
2787
+ // vector.
2788
+ return BitCast(d_to,
2789
+ And(BitCast(du_to, v), Set(du_to, uint32_t{0xFFFF0000u})));
2790
+ #endif
2791
+ }
2792
+
2793
+ // BF16->F32 PromoteOddTo
2794
+
2795
+ // NOTE: It is possible for FromTypeTag to be hwy::SignedTag or hwy::UnsignedTag
2796
+ // instead of hwy::FloatTag on targets that use scalable vectors.
2797
+
2798
+ // VBF16 is considered to be a bfloat16_t vector if TFromV<VBF16> is the same
2799
+ // type as TFromV<VFromD<Repartition<bfloat16_t, DF32>>>
2800
+
2801
+ // The BF16->F32 PromoteOddTo overload is only enabled if VBF16 is considered
2802
+ // to be a bfloat16_t vector.
2803
+ template <class FromTypeTag, class DF32, class VBF16,
2804
+ class VBF16_2 = VFromD<Repartition<bfloat16_t, DF32>>,
2805
+ hwy::EnableIf<IsSame<TFromV<VBF16>, TFromV<VBF16_2>>()>* = nullptr>
2806
+ HWY_INLINE VFromD<DF32> PromoteOddTo(hwy::FloatTag /*to_type_tag*/,
2807
+ hwy::SizeTag<4> /*to_lane_size_tag*/,
2808
+ FromTypeTag /*from_type_tag*/, DF32 d_to,
2809
+ VBF16 v) {
2810
+ const RebindToUnsigned<decltype(d_to)> du_to;
2811
+ #if HWY_IS_LITTLE_ENDIAN
2812
+ // On little-endian platforms, the odd lanes of the source vector are already
2813
+ // in the upper 16 bits of the lanes of the bitcasted vector.
2814
+
2815
+ // Need to simply zero out the lower 16 bits of each lane of the bitcasted
2816
+ // vector.
2817
+ return BitCast(d_to,
2818
+ And(BitCast(du_to, v), Set(du_to, uint32_t{0xFFFF0000u})));
2819
+ #else
2820
+ // On big-endian platforms, need to shift left each lane of the bitcasted
2821
+ // vector by 16 bits.
2822
+ return BitCast(d_to, ShiftLeft<16>(BitCast(du_to, v)));
2823
+ #endif
2824
+ }
2825
+
2826
+ // Default PromoteEvenTo/PromoteOddTo implementations
2827
+ template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D,
2828
+ class V, HWY_IF_LANES_D(D, 1)>
2829
+ HWY_INLINE VFromD<D> PromoteEvenTo(
2830
+ ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
2831
+ FromTypeTag /*from_type_tag*/, D d_to, V v) {
2832
+ return PromoteLowerTo(d_to, v);
2833
+ }
2834
+
2835
+ template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D,
2836
+ class V, HWY_IF_LANES_GT_D(D, 1)>
2837
+ HWY_INLINE VFromD<D> PromoteEvenTo(
2838
+ ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
2839
+ FromTypeTag /*from_type_tag*/, D d_to, V v) {
2840
+ const DFromV<decltype(v)> d;
2841
+ return PromoteLowerTo(d_to, ConcatEven(d, v, v));
2842
+ }
2843
+
2844
+ template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D,
2845
+ class V>
2846
+ HWY_INLINE VFromD<D> PromoteOddTo(
2847
+ ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
2848
+ FromTypeTag /*from_type_tag*/, D d_to, V v) {
2849
+ const DFromV<decltype(v)> d;
2850
+ return PromoteLowerTo(d_to, ConcatOdd(d, v, v));
2851
+ }
2852
+
2853
+ } // namespace detail
2854
+
2855
+ template <class D, class V, HWY_IF_T_SIZE_D(D, 2 * sizeof(TFromV<V>)),
2856
+ class V2 = VFromD<Repartition<TFromV<V>, D>>,
2857
+ HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_V(V2))>
2858
+ HWY_API VFromD<D> PromoteEvenTo(D d, V v) {
2859
+ return detail::PromoteEvenTo(hwy::TypeTag<TFromD<D>>(),
2860
+ hwy::SizeTag<sizeof(TFromD<D>)>(),
2861
+ hwy::TypeTag<TFromV<V>>(), d, v);
2862
+ }
2863
+
2864
+ template <class D, class V, HWY_IF_T_SIZE_D(D, 2 * sizeof(TFromV<V>)),
2865
+ class V2 = VFromD<Repartition<TFromV<V>, D>>,
2866
+ HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_V(V2))>
2867
+ HWY_API VFromD<D> PromoteOddTo(D d, V v) {
2868
+ return detail::PromoteOddTo(hwy::TypeTag<TFromD<D>>(),
2869
+ hwy::SizeTag<sizeof(TFromD<D>)>(),
2870
+ hwy::TypeTag<TFromV<V>>(), d, v);
2871
+ }
2872
+ #endif // HWY_TARGET != HWY_SCALAR
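// Editorial sketch in plain C++ (not from the package diff above): the per-pair
// bit manipulation behind the SignedTag PromoteEvenTo/PromoteOddTo cases, for
// one 32-bit chunk holding two adjacent i16 lanes on a little-endian target.
// The even (lower-indexed) lane occupies bits [0, 16), so a left shift by 16
// followed by an arithmetic right shift sign-extends it to i32; the odd lane
// only needs the arithmetic right shift.
#include <cstdint>

int32_t PromoteEvenLaneI16(uint32_t packed_pair) {
  // Assumes the usual two's-complement wraparound on the uint->int conversion.
  return static_cast<int32_t>(packed_pair << 16) >> 16;
}

int32_t PromoteOddLaneI16(uint32_t packed_pair) {
  return static_cast<int32_t>(packed_pair) >> 16;
}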
2873
+
1926
2874
  // ------------------------------ float16_t <-> float
1927
2875
 
1928
2876
  #if (defined(HWY_NATIVE_F16C) == defined(HWY_TARGET_TOGGLE))
@@ -1956,41 +2904,237 @@ HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<float16_t, D>> v) {
1956
2904
 
1957
2905
  template <class D, HWY_IF_F16_D(D)>
1958
2906
  HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<float, D>> v) {
1959
- const RebindToUnsigned<decltype(df16)> du16;
1960
- const Rebind<uint32_t, decltype(df16)> du32;
1961
- const RebindToSigned<decltype(du32)> di32;
1962
- using VU32 = VFromD<decltype(du32)>;
1963
- using VI32 = VFromD<decltype(di32)>;
1964
-
1965
- const VU32 bits32 = BitCast(du32, v);
1966
- const VU32 sign = ShiftRight<31>(bits32);
1967
- const VU32 biased_exp32 = And(ShiftRight<23>(bits32), Set(du32, 0xFF));
1968
- const VU32 mantissa32 = And(bits32, Set(du32, 0x7FFFFF));
1969
-
1970
- const VI32 k15 = Set(di32, 15);
1971
- const VI32 exp = Min(Sub(BitCast(di32, biased_exp32), Set(di32, 127)), k15);
1972
- const MFromD<decltype(di32)> is_tiny = Lt(exp, Set(di32, -24));
1973
-
1974
- const MFromD<decltype(di32)> is_subnormal = Lt(exp, Set(di32, -14));
1975
- const VU32 biased_exp16 =
1976
- BitCast(du32, IfThenZeroElse(is_subnormal, Add(exp, k15)));
1977
- const VU32 sub_exp = BitCast(du32, Sub(Set(di32, -14), exp)); // [1, 11)
1978
- // Clamp shift counts to prevent warnings in emu_128 Shr.
1979
- const VU32 k31 = Set(du32, 31);
1980
- const VU32 shift_m = Min(Add(Set(du32, 13), sub_exp), k31);
1981
- const VU32 shift_1 = Min(Sub(Set(du32, 10), sub_exp), k31);
1982
- const VU32 sub_m = Add(Shl(Set(du32, 1), shift_1), Shr(mantissa32, shift_m));
1983
- const VU32 mantissa16 = IfThenElse(RebindMask(du32, is_subnormal), sub_m,
1984
- ShiftRight<13>(mantissa32)); // <1024
1985
-
1986
- const VU32 sign16 = ShiftLeft<15>(sign);
1987
- const VU32 normal16 = Or3(sign16, ShiftLeft<10>(biased_exp16), mantissa16);
1988
- const VI32 bits16 = IfThenZeroElse(is_tiny, BitCast(di32, normal16));
1989
- return BitCast(df16, DemoteTo(du16, bits16));
2907
+ const RebindToSigned<decltype(df16)> di16;
2908
+ const Rebind<int32_t, decltype(df16)> di32;
2909
+ const RebindToFloat<decltype(di32)> df32;
2910
+ const RebindToUnsigned<decltype(df32)> du32;
2911
+
2912
+ // There are 23 fractional bits (plus the implied 1 bit) in the mantissa of
2913
+ // a F32, and there are 10 fractional bits (plus the implied 1 bit) in the
2914
+ // mantissa of a F16
2915
+
2916
+ // We want the unbiased exponent of round_incr[i] to be at least (-14) + 13 as
2917
+ // 2^(-14) is the smallest positive normal F16 value and as we want 13
2918
+ // mantissa bits (including the implicit 1 bit) to the left of the
2919
+ // F32 mantissa bits in rounded_val[i] since 23 - 10 is equal to 13
2920
+
2921
+ // The biased exponent of round_incr[i] needs to be at least 126 as
2922
+ // (-14) + 13 + 127 is equal to 126
2923
+
2924
+ // We also want the biased exponent of round_incr[i] to be less than or equal
2925
+ // to 255 (which is equal to MaxExponentField<float>())
2926
+
2927
+ // The biased F32 exponent of round_incr[i] is equal to
2928
+ // HWY_MAX(HWY_MIN(((BitCastScalar<uint32_t>(v[i]) >> 23) & 255) + 13, 255), 126)
2929
+
2930
+ // hi9_bits[i] is equal to the upper 9 bits of v[i]
2931
+ const auto hi9_bits = ShiftRight<23>(BitCast(du32, v));
2932
+
2933
+ const auto k13 = Set(du32, uint32_t{13u});
2934
+
2935
+ // Minimum biased F32 exponent of round_incr
2936
+ const auto k126 = Set(du32, uint32_t{126u});
2937
+
2938
+ // round_incr_hi9_bits[i] is equivalent to
2939
+ // (hi9_bits[i] & 0x100) |
2940
+ // HWY_MAX(HWY_MIN((hi9_bits[i] & 0xFF) + 13, 255), 126)
2941
+
2942
+ #if HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128
2943
+ const auto k255 = Set(du32, uint32_t{255u});
2944
+ const auto round_incr_hi9_bits = BitwiseIfThenElse(
2945
+ k255, Max(Min(Add(And(hi9_bits, k255), k13), k255), k126), hi9_bits);
2946
+ #else
2947
+ // On targets other than SCALAR and EMU128, the exponent bits of hi9_bits can
2948
+ // be incremented by 13 and clamped to the [13, 255] range without overflowing
2949
+ // into the sign bit of hi9_bits by using U8 SaturatedAdd as there are 8
2950
+ // exponent bits in an F32
2951
+
2952
+ // U8 Max can be used on targets other than SCALAR and EMU128 to clamp
2953
+ // ((hi9_bits & 0xFF) + 13) to the [126, 255] range without affecting the sign
2954
+ // bit
2955
+
2956
+ const Repartition<uint8_t, decltype(du32)> du32_as_u8;
2957
+ const auto round_incr_hi9_bits = BitCast(
2958
+ du32,
2959
+ Max(SaturatedAdd(BitCast(du32_as_u8, hi9_bits), BitCast(du32_as_u8, k13)),
2960
+ BitCast(du32_as_u8, k126)));
2961
+ #endif
2962
+
2963
+ // (round_incr_hi9_bits >> 8) is equal to (hi9_bits >> 8), and
2964
+ // (round_incr_hi9_bits & 0xFF) is equal to
2965
+ // HWY_MAX(HWY_MIN((hi9_bits[i] & 0xFF) + 13, 255), 126)
2966
+
2967
+ const auto round_incr = BitCast(df32, ShiftLeft<23>(round_incr_hi9_bits));
2968
+
2969
+ // Add round_incr[i] to v[i] to round the mantissa to the nearest F16 mantissa
2970
+ // and to move the fractional bits of the resulting non-NaN mantissa down to
2971
+ // the lower 10 bits of rounded_val if (v[i] + round_incr[i]) is a non-NaN
2972
+ // value
2973
+ const auto rounded_val = Add(v, round_incr);
2974
+
2975
+ // rounded_val_bits is the bits of rounded_val as a U32
2976
+ const auto rounded_val_bits = BitCast(du32, rounded_val);
2977
+
2978
+ // rounded_val[i] is known to have the same biased exponent as round_incr[i]
2979
+ // as |round_incr[i]| > 2^12*|v[i]| is true if round_incr[i] is a finite
2980
+ // value, round_incr[i] and v[i] both have the same sign, and |round_incr[i]|
2981
+ // is either a power of 2 that is greater than or equal to 2^-1 or infinity.
2982
+
2983
+ // If rounded_val[i] is a finite F32 value, then
2984
+ // (rounded_val_bits[i] & 0x00000FFF) is the bit representation of the
2985
+ // rounded mantissa of rounded_val[i] as a UQ2.10 fixed point number that is
2986
+ // in the range [0, 2].
2987
+
2988
+ // In other words, (rounded_val_bits[i] & 0x00000FFF) is between 0 and 0x0800,
2989
+ // with (rounded_val_bits[i] & 0x000003FF) being the fractional bits of the
2990
+ // resulting F16 mantissa, if rounded_val[i] is a finite F32 value.
2991
+
2992
+ // (rounded_val_bits[i] & 0x007FF000) == 0 is guaranteed to be true if
2993
+ // rounded_val[i] is a non-NaN value
2994
+
2995
+ // The biased exponent of rounded_val[i] is guaranteed to be at least 126 as
2996
+ // the biased exponent of round_incr[i] is at least 126 and as both v[i] and
2997
+ // round_incr[i] have the same sign bit
2998
+
2999
+ // The ULP of a F32 value with a biased exponent of 126 is equal to
3000
+ // 2^(126 - 127 - 23), which is equal to 2^(-24) (which is also the ULP of a
3001
+ // F16 value with a biased exponent of 0 or 1 as (1 - 15 - 10) is equal to
3002
+ // -24)
3003
+
3004
+ // The biased exponent (before subtracting by 126) needs to be clamped to the
3005
+ // [126, 157] range as 126 + 31 is equal to 157 and as 31 is the largest
3006
+ // biased exponent of a F16.
3007
+
3008
+ // The biased exponent of the resulting F16 value is equal to
3009
+ // HWY_MIN((round_incr_hi9_bits[i] & 0xFF) +
3010
+ // ((rounded_val_bits[i] >> 10) & 0xFF), 157) - 126
3011
+
3012
+ #if HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128
3013
+ auto f16_exp_bits =
3014
+ Min(Add(ShiftLeft<10>(And(round_incr_hi9_bits, k255)),
3015
+ And(rounded_val_bits,
3016
+ Set(du32, static_cast<uint32_t>(uint32_t{0xFFu} << 10)))),
3017
+ Set(du32, static_cast<uint32_t>(uint32_t{157u} << 10)));
3018
+ #else
3019
+ auto f16_exp_bits = ShiftLeft<10>(BitCast(
3020
+ du32,
3021
+ Min(SaturatedAdd(BitCast(du32_as_u8, round_incr_hi9_bits),
3022
+ BitCast(du32_as_u8, ShiftRight<10>(rounded_val_bits))),
3023
+ BitCast(du32_as_u8, Set(du32, uint32_t{157})))));
3024
+ #endif
3025
+
3026
+ f16_exp_bits =
3027
+ Sub(f16_exp_bits, Set(du32, static_cast<uint32_t>(uint32_t{126u} << 10)));
3028
+
3029
+ const auto f16_unmasked_mant_bits =
3030
+ BitCast(di32, Or(rounded_val, VecFromMask(df32, IsNaN(rounded_val))));
3031
+
3032
+ const auto f16_exp_mant_bits =
3033
+ OrAnd(BitCast(di32, f16_exp_bits), f16_unmasked_mant_bits,
3034
+ Set(di32, int32_t{0x03FF}));
3035
+
3036
+ // f16_bits_as_i32 is the F16 bits sign-extended to an I32 (with the upper 17
3037
+ // bits of f16_bits_as_i32[i] set to the sign bit of rounded_val[i]) to allow
3038
+ // efficient truncation of the F16 bits to an I16 using an I32->I16 DemoteTo
3039
+ // operation
3040
+ const auto f16_bits_as_i32 =
3041
+ OrAnd(f16_exp_mant_bits, ShiftRight<16>(BitCast(di32, rounded_val_bits)),
3042
+ Set(di32, static_cast<int32_t>(0xFFFF8000u)));
3043
+ return BitCast(df16, DemoteTo(di16, f16_bits_as_i32));
1990
3044
  }
1991
3045
 
1992
3046
  #endif // HWY_NATIVE_F16C
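// Editorial sketch in plain C++ (not from the package diff above): the core of
// the rounding-by-addition trick used in the F32->F16 DemoteTo above, for a
// positive input x whose magnitude lies in the normal F16 range. Adding a power
// of two 2^13 times larger than x's binade makes the FPU round x's mantissa to
// 10 fractional bits (round-to-nearest-even), which is exactly the F16 mantissa
// rounding; the vector code then reassembles sign, exponent and these bits.
#include <cstdint>
#include <cstring>

uint32_t F16FractionBitsOfPositiveNormal(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  const uint32_t biased_exp = (bits >> 23) & 0xFF;
  // round_incr = 2^(biased_exp + 13 - 127), i.e. 2^13 times x's binade.
  const uint32_t incr_bits = (biased_exp + 13) << 23;
  float round_incr;
  std::memcpy(&round_incr, &incr_bits, sizeof(round_incr));
  const float rounded = x + round_incr;
  uint32_t rounded_bits;
  std::memcpy(&rounded_bits, &rounded, sizeof(rounded_bits));
  // The low 10 bits now hold the fractional F16 mantissa; bits 10 and up hold
  // the integer part (implied 1 plus any rounding carry), which the vector
  // code folds into the exponent.
  return rounded_bits & 0x3FF;
}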
1993
3047
 
3048
+ // ------------------------------ F64->F16 DemoteTo
3049
+ #if (defined(HWY_NATIVE_DEMOTE_F64_TO_F16) == defined(HWY_TARGET_TOGGLE))
3050
+ #ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
3051
+ #undef HWY_NATIVE_DEMOTE_F64_TO_F16
3052
+ #else
3053
+ #define HWY_NATIVE_DEMOTE_F64_TO_F16
3054
+ #endif
3055
+
3056
+ #if HWY_HAVE_FLOAT64
3057
+ template <class D, HWY_IF_F16_D(D)>
3058
+ HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
3059
+ const Rebind<double, D> df64;
3060
+ const Rebind<uint64_t, D> du64;
3061
+ const Rebind<float, D> df32;
3062
+
3063
+ // The mantissa bits of v[i] are first rounded using round-to-odd rounding to
3064
+ // the nearest F64 value that has the lower 29 bits zeroed out to ensure that
3065
+ // the result is correctly rounded to a F16.
3066
+
3067
+ const auto vf64_rounded = OrAnd(
3068
+ And(v,
3069
+ BitCast(df64, Set(du64, static_cast<uint64_t>(0xFFFFFFFFE0000000u)))),
3070
+ BitCast(df64, Add(BitCast(du64, v),
3071
+ Set(du64, static_cast<uint64_t>(0x000000001FFFFFFFu)))),
3072
+ BitCast(df64, Set(du64, static_cast<uint64_t>(0x0000000020000000ULL))));
3073
+
3074
+ return DemoteTo(df16, DemoteTo(df32, vf64_rounded));
3075
+ }
3076
+ #endif // HWY_HAVE_FLOAT64
3077
+
3078
+ #endif // HWY_NATIVE_DEMOTE_F64_TO_F16
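// Editorial sketch in plain C++ (not from the package diff above): a scalar
// version of the round-to-odd step used in the F64->F16 DemoteTo. The low 29
// mantissa bits are cleared and, if any of them were set, bit 29 is forced to
// 1. This preserves enough information that the following F64->F32->F16 double
// rounding still produces the correctly rounded F16 result.
#include <cstdint>
#include <cstring>

double RoundMantissaToOdd(double v) {
  uint64_t bits;
  std::memcpy(&bits, &v, sizeof(bits));
  const uint64_t kept = bits & 0xFFFFFFFFE0000000ULL;
  const uint64_t sticky =
      (bits & 0x000000001FFFFFFFULL) ? 0x0000000020000000ULL : 0u;
  const uint64_t rounded = kept | sticky;
  double result;
  std::memcpy(&result, &rounded, sizeof(result));
  return result;
}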
3079
+
3080
+ // ------------------------------ F16->F64 PromoteTo
3081
+ #if (defined(HWY_NATIVE_PROMOTE_F16_TO_F64) == defined(HWY_TARGET_TOGGLE))
3082
+ #ifdef HWY_NATIVE_PROMOTE_F16_TO_F64
3083
+ #undef HWY_NATIVE_PROMOTE_F16_TO_F64
3084
+ #else
3085
+ #define HWY_NATIVE_PROMOTE_F16_TO_F64
3086
+ #endif
3087
+
3088
+ #if HWY_HAVE_FLOAT64
3089
+ template <class D, HWY_IF_F64_D(D)>
3090
+ HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<float16_t, D>> v) {
3091
+ return PromoteTo(df64, PromoteTo(Rebind<float, D>(), v));
3092
+ }
3093
+ #endif // HWY_HAVE_FLOAT64
3094
+
3095
+ #endif // HWY_NATIVE_PROMOTE_F16_TO_F64
3096
+
3097
+ // ------------------------------ SumsOf2
3098
+
3099
+ #if HWY_TARGET != HWY_SCALAR
3100
+ namespace detail {
3101
+
3102
+ template <class TypeTag, size_t kLaneSize, class V>
3103
+ HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
3104
+ TypeTag /*type_tag*/, hwy::SizeTag<kLaneSize> /*lane_size_tag*/, V v) {
3105
+ const DFromV<decltype(v)> d;
3106
+ const RepartitionToWide<decltype(d)> dw;
3107
+ return Add(PromoteEvenTo(dw, v), PromoteOddTo(dw, v));
3108
+ }
3109
+
3110
+ } // namespace detail
3111
+
3112
+ template <class V>
3113
+ HWY_API VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(V v) {
3114
+ return detail::SumsOf2(hwy::TypeTag<TFromV<V>>(),
3115
+ hwy::SizeTag<sizeof(TFromV<V>)>(), v);
3116
+ }
3117
+ #endif // HWY_TARGET != HWY_SCALAR
3118
+
3119
+ // ------------------------------ SumsOf4
3120
+
3121
+ namespace detail {
3122
+
3123
+ template <class TypeTag, size_t kLaneSize, class V>
3124
+ HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
3125
+ TypeTag /*type_tag*/, hwy::SizeTag<kLaneSize> /*lane_size_tag*/, V v) {
3126
+ using hwy::HWY_NAMESPACE::SumsOf2;
3127
+ return SumsOf2(SumsOf2(v));
3128
+ }
3129
+
3130
+ } // namespace detail
3131
+
3132
+ template <class V>
3133
+ HWY_API VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(V v) {
3134
+ return detail::SumsOf4(hwy::TypeTag<TFromV<V>>(),
3135
+ hwy::SizeTag<sizeof(TFromV<V>)>(), v);
3136
+ }
3137
+
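// Editorial sketch in plain C++ (not from the package diff above): scalar
// reference for SumsOf2/SumsOf4 on u8 lanes. Each output lane sums 2 (resp. 4)
// adjacent input lanes and is twice (resp. four times) as wide, so the sums
// cannot wrap.
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<uint16_t> SumsOf2Scalar(const std::vector<uint8_t>& in) {
  std::vector<uint16_t> out(in.size() / 2);
  for (size_t i = 0; i < out.size(); ++i) {
    out[i] = static_cast<uint16_t>(in[2 * i] + in[2 * i + 1]);
  }
  return out;
}

std::vector<uint32_t> SumsOf4Scalar(const std::vector<uint8_t>& in) {
  std::vector<uint32_t> out(in.size() / 4);
  for (size_t i = 0; i < out.size(); ++i) {
    out[i] = static_cast<uint32_t>(in[4 * i]) + in[4 * i + 1] +
             in[4 * i + 2] + in[4 * i + 3];
  }
  return out;
}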
1994
3138
  // ------------------------------ OrderedTruncate2To
1995
3139
 
1996
3140
  #if HWY_IDE || \
@@ -2206,8 +3350,7 @@ HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
2206
3350
  #if HWY_TARGET == HWY_SCALAR
2207
3351
  const uint64_t u64_val = GetLane(v);
2208
3352
  const float f32_val = static_cast<float>(u64_val);
2209
- uint32_t f32_bits;
2210
- CopySameSize(&f32_val, &f32_bits);
3353
+ const uint32_t f32_bits = BitCastScalar<uint32_t>(f32_val);
2211
3354
  return Set(d, static_cast<uint64_t>(f32_bits >> 23));
2212
3355
  #else
2213
3356
  const Repartition<uint32_t, decltype(d)> du32;
@@ -2320,30 +3463,29 @@ HWY_INLINE V SubBytesMulInverseAndAffineLookup(V state, V affine_tblL,
2320
3463
 
2321
3464
  // Change polynomial basis to GF(2^4)
2322
3465
  {
2323
- alignas(16) static constexpr uint8_t basisL[16] = {
2324
- 0x00, 0x70, 0x2A, 0x5A, 0x98, 0xE8, 0xB2, 0xC2,
2325
- 0x08, 0x78, 0x22, 0x52, 0x90, 0xE0, 0xBA, 0xCA};
2326
- alignas(16) static constexpr uint8_t basisU[16] = {
2327
- 0x00, 0x4D, 0x7C, 0x31, 0x7D, 0x30, 0x01, 0x4C,
2328
- 0x81, 0xCC, 0xFD, 0xB0, 0xFC, 0xB1, 0x80, 0xCD};
3466
+ const VFromD<decltype(du)> basisL =
3467
+ Dup128VecFromValues(du, 0x00, 0x70, 0x2A, 0x5A, 0x98, 0xE8, 0xB2, 0xC2,
3468
+ 0x08, 0x78, 0x22, 0x52, 0x90, 0xE0, 0xBA, 0xCA);
3469
+ const VFromD<decltype(du)> basisU =
3470
+ Dup128VecFromValues(du, 0x00, 0x4D, 0x7C, 0x31, 0x7D, 0x30, 0x01, 0x4C,
3471
+ 0x81, 0xCC, 0xFD, 0xB0, 0xFC, 0xB1, 0x80, 0xCD);
2329
3472
  const auto sL = And(state, mask);
2330
3473
  const auto sU = ShiftRight<4>(state); // byte shift => upper bits are zero
2331
- const auto gf4L = TableLookupBytes(LoadDup128(du, basisL), sL);
2332
- const auto gf4U = TableLookupBytes(LoadDup128(du, basisU), sU);
3474
+ const auto gf4L = TableLookupBytes(basisL, sL);
3475
+ const auto gf4U = TableLookupBytes(basisU, sU);
2333
3476
  state = Xor(gf4L, gf4U);
2334
3477
  }
2335
3478
 
2336
3479
  // Inversion in GF(2^4). Elements 0 represent "infinity" (division by 0) and
2337
3480
  // cause TableLookupBytesOr0 to return 0.
2338
- alignas(16) static constexpr uint8_t kZetaInv[16] = {
2339
- 0x80, 7, 11, 15, 6, 10, 4, 1, 9, 8, 5, 2, 12, 14, 13, 3};
2340
- alignas(16) static constexpr uint8_t kInv[16] = {
2341
- 0x80, 1, 8, 13, 15, 6, 5, 14, 2, 12, 11, 10, 9, 3, 7, 4};
2342
- const auto tbl = LoadDup128(du, kInv);
3481
+ const VFromD<decltype(du)> zetaInv = Dup128VecFromValues(
3482
+ du, 0x80, 7, 11, 15, 6, 10, 4, 1, 9, 8, 5, 2, 12, 14, 13, 3);
3483
+ const VFromD<decltype(du)> tbl = Dup128VecFromValues(
3484
+ du, 0x80, 1, 8, 13, 15, 6, 5, 14, 2, 12, 11, 10, 9, 3, 7, 4);
2343
3485
  const auto sL = And(state, mask); // L=low nibble, U=upper
2344
3486
  const auto sU = ShiftRight<4>(state); // byte shift => upper bits are zero
2345
3487
  const auto sX = Xor(sU, sL);
2346
- const auto invL = TableLookupBytes(LoadDup128(du, kZetaInv), sL);
3488
+ const auto invL = TableLookupBytes(zetaInv, sL);
2347
3489
  const auto invU = TableLookupBytes(tbl, sU);
2348
3490
  const auto invX = TableLookupBytes(tbl, sX);
2349
3491
  const auto outL = Xor(sX, TableLookupBytesOr0(tbl, Xor(invL, invU)));
@@ -2359,26 +3501,25 @@ HWY_INLINE V SubBytes(V state) {
2359
3501
  const DFromV<V> du;
2360
3502
  // Linear skew (cannot bake 0x63 bias into the table because out* indices
2361
3503
  // may have the infinity flag set).
2362
- alignas(16) static constexpr uint8_t kAffineL[16] = {
2363
- 0x00, 0xC7, 0xBD, 0x6F, 0x17, 0x6D, 0xD2, 0xD0,
2364
- 0x78, 0xA8, 0x02, 0xC5, 0x7A, 0xBF, 0xAA, 0x15};
2365
- alignas(16) static constexpr uint8_t kAffineU[16] = {
2366
- 0x00, 0x6A, 0xBB, 0x5F, 0xA5, 0x74, 0xE4, 0xCF,
2367
- 0xFA, 0x35, 0x2B, 0x41, 0xD1, 0x90, 0x1E, 0x8E};
2368
- return Xor(SubBytesMulInverseAndAffineLookup(state, LoadDup128(du, kAffineL),
2369
- LoadDup128(du, kAffineU)),
3504
+ const VFromD<decltype(du)> affineL =
3505
+ Dup128VecFromValues(du, 0x00, 0xC7, 0xBD, 0x6F, 0x17, 0x6D, 0xD2, 0xD0,
3506
+ 0x78, 0xA8, 0x02, 0xC5, 0x7A, 0xBF, 0xAA, 0x15);
3507
+ const VFromD<decltype(du)> affineU =
3508
+ Dup128VecFromValues(du, 0x00, 0x6A, 0xBB, 0x5F, 0xA5, 0x74, 0xE4, 0xCF,
3509
+ 0xFA, 0x35, 0x2B, 0x41, 0xD1, 0x90, 0x1E, 0x8E);
3510
+ return Xor(SubBytesMulInverseAndAffineLookup(state, affineL, affineU),
2370
3511
  Set(du, uint8_t{0x63}));
2371
3512
  }
2372
3513
 
2373
3514
  template <class V> // u8
2374
3515
  HWY_INLINE V InvSubBytes(V state) {
2375
3516
  const DFromV<V> du;
2376
- alignas(16) static constexpr uint8_t kGF2P4InvToGF2P8InvL[16]{
2377
- 0x00, 0x40, 0xF9, 0x7E, 0x53, 0xEA, 0x87, 0x13,
2378
- 0x2D, 0x3E, 0x94, 0xD4, 0xB9, 0x6D, 0xAA, 0xC7};
2379
- alignas(16) static constexpr uint8_t kGF2P4InvToGF2P8InvU[16]{
2380
- 0x00, 0x1D, 0x44, 0x93, 0x0F, 0x56, 0xD7, 0x12,
2381
- 0x9C, 0x8E, 0xC5, 0xD8, 0x59, 0x81, 0x4B, 0xCA};
3517
+ const VFromD<decltype(du)> gF2P4InvToGF2P8InvL =
3518
+ Dup128VecFromValues(du, 0x00, 0x40, 0xF9, 0x7E, 0x53, 0xEA, 0x87, 0x13,
3519
+ 0x2D, 0x3E, 0x94, 0xD4, 0xB9, 0x6D, 0xAA, 0xC7);
3520
+ const VFromD<decltype(du)> gF2P4InvToGF2P8InvU =
3521
+ Dup128VecFromValues(du, 0x00, 0x1D, 0x44, 0x93, 0x0F, 0x56, 0xD7, 0x12,
3522
+ 0x9C, 0x8E, 0xC5, 0xD8, 0x59, 0x81, 0x4B, 0xCA);
2382
3523
 
2383
3524
  // Apply the inverse affine transformation
2384
3525
  const auto b = Xor(Xor3(Or(ShiftLeft<1>(state), ShiftRight<7>(state)),
@@ -2392,16 +3533,14 @@ HWY_INLINE V InvSubBytes(V state) {
2392
3533
  // - Converting the GF(2^4) multiplicative inverse to the GF(2^8)
2393
3534
  // multiplicative inverse through table lookups using the
2394
3535
  // kGF2P4InvToGF2P8InvL and kGF2P4InvToGF2P8InvU tables
2395
- return SubBytesMulInverseAndAffineLookup(
2396
- b, LoadDup128(du, kGF2P4InvToGF2P8InvL),
2397
- LoadDup128(du, kGF2P4InvToGF2P8InvU));
3536
+ return SubBytesMulInverseAndAffineLookup(b, gF2P4InvToGF2P8InvL,
3537
+ gF2P4InvToGF2P8InvU);
2398
3538
  }
2399
3539
 
2400
3540
  } // namespace detail
2401
3541
 
2402
3542
  #endif // HWY_TARGET != HWY_SCALAR
2403
3543
 
2404
- // "Include guard": skip if native AES instructions are available.
2405
3544
  #if (defined(HWY_NATIVE_AES) == defined(HWY_TARGET_TOGGLE))
2406
3545
  #ifdef HWY_NATIVE_AES
2407
3546
  #undef HWY_NATIVE_AES
@@ -2417,24 +3556,18 @@ namespace detail {
2417
3556
  template <class V> // u8
2418
3557
  HWY_INLINE V ShiftRows(const V state) {
2419
3558
  const DFromV<V> du;
2420
- alignas(16) static constexpr uint8_t kShiftRow[16] = {
2421
- 0, 5, 10, 15, // transposed: state is column major
2422
- 4, 9, 14, 3, //
2423
- 8, 13, 2, 7, //
2424
- 12, 1, 6, 11};
2425
- const auto shift_row = LoadDup128(du, kShiftRow);
3559
+ // transposed: state is column major
3560
+ const VFromD<decltype(du)> shift_row = Dup128VecFromValues(
3561
+ du, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11);
2426
3562
  return TableLookupBytes(state, shift_row);
2427
3563
  }
2428
3564
 
2429
3565
  template <class V> // u8
2430
3566
  HWY_INLINE V InvShiftRows(const V state) {
2431
3567
  const DFromV<V> du;
2432
- alignas(16) static constexpr uint8_t kShiftRow[16] = {
2433
- 0, 13, 10, 7, // transposed: state is column major
2434
- 4, 1, 14, 11, //
2435
- 8, 5, 2, 15, //
2436
- 12, 9, 6, 3};
2437
- const auto shift_row = LoadDup128(du, kShiftRow);
3568
+ // transposed: state is column major
3569
+ const VFromD<decltype(du)> shift_row = Dup128VecFromValues(
3570
+ du, 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3);
2438
3571
  return TableLookupBytes(state, shift_row);
2439
3572
  }
2440
3573
 
@@ -2455,15 +3588,15 @@ HWY_INLINE V MixColumns(const V state) {
2455
3588
  // 1 2 3 1 // d are on diagonal, no permutation needed.
2456
3589
  // 1 1 2 3 // t1230 indicates column indices of threes for the 4 rows.
2457
3590
  // 3 1 1 2 // We also need to compute s2301 and s3012 (=1230 o 2301).
2458
- alignas(16) static constexpr uint8_t k2301[16] = {
2459
- 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13};
2460
- alignas(16) static constexpr uint8_t k1230[16] = {
2461
- 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12};
3591
+ const VFromD<decltype(du)> v2301 = Dup128VecFromValues(
3592
+ du, 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13);
3593
+ const VFromD<decltype(du)> v1230 = Dup128VecFromValues(
3594
+ du, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12);
2462
3595
  const auto d = GF2P8Mod11BMulBy2(state); // = state*2 in GF(2^8).
2463
- const auto s2301 = TableLookupBytes(state, LoadDup128(du, k2301));
3596
+ const auto s2301 = TableLookupBytes(state, v2301);
2464
3597
  const auto d_s2301 = Xor(d, s2301);
2465
3598
  const auto t_s2301 = Xor(state, d_s2301); // t(s*3) = XOR-sum {s, d(s*2)}
2466
- const auto t1230_s3012 = TableLookupBytes(t_s2301, LoadDup128(du, k1230));
3599
+ const auto t1230_s3012 = TableLookupBytes(t_s2301, v1230);
2467
3600
  return Xor(d_s2301, t1230_s3012); // XOR-sum of 4 terms
2468
3601
  }
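// Editorial sketch in plain C++ (not from the package diff above): the GF(2^8)
// doubling that GF2P8Mod11BMulBy2 provides for the MixColumns math above.
// state*3 is then formed as XTime(s) ^ s, and the four matrix terms are
// combined with XOR as described in the comment matrix.
#include <cstdint>

uint8_t XTime(uint8_t s) {  // multiply by 2 modulo the AES polynomial 0x11B
  return static_cast<uint8_t>((s << 1) ^ ((s & 0x80) ? 0x1B : 0x00));
}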
2469
3602
 
@@ -2475,11 +3608,10 @@ HWY_INLINE V InvMixColumns(const V state) {
2475
3608
  // 9 14 11 13
2476
3609
  // 13 9 14 11
2477
3610
  // 11 13 9 14
2478
- alignas(16) static constexpr uint8_t k2301[16] = {
2479
- 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13};
2480
- alignas(16) static constexpr uint8_t k1230[16] = {
2481
- 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12};
2482
- const auto v1230 = LoadDup128(du, k1230);
3611
+ const VFromD<decltype(du)> v2301 = Dup128VecFromValues(
3612
+ du, 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13);
3613
+ const VFromD<decltype(du)> v1230 = Dup128VecFromValues(
3614
+ du, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12);
2483
3615
 
2484
3616
  const auto sx2 = GF2P8Mod11BMulBy2(state); /* = state*2 in GF(2^8) */
2485
3617
  const auto sx4 = GF2P8Mod11BMulBy2(sx2); /* = state*4 in GF(2^8) */
@@ -2491,8 +3623,7 @@ HWY_INLINE V InvMixColumns(const V state) {
2491
3623
 
2492
3624
  const auto sx13_0123_sx9_1230 = Xor(sx13, TableLookupBytes(sx9, v1230));
2493
3625
  const auto sx14_0123_sx11_1230 = Xor(sx14, TableLookupBytes(sx11, v1230));
2494
- const auto sx13_2301_sx9_3012 =
2495
- TableLookupBytes(sx13_0123_sx9_1230, LoadDup128(du, k2301));
3626
+ const auto sx13_2301_sx9_3012 = TableLookupBytes(sx13_0123_sx9_1230, v2301);
2496
3627
  return Xor(sx14_0123_sx11_1230, sx13_2301_sx9_3012);
2497
3628
  }
2498
3629
 
@@ -2543,15 +3674,15 @@ HWY_API V AESLastRoundInv(V state, const V round_key) {
2543
3674
 
2544
3675
  template <uint8_t kRcon, class V, HWY_IF_U8_D(DFromV<V>)>
2545
3676
  HWY_API V AESKeyGenAssist(V v) {
2546
- alignas(16) static constexpr uint8_t kRconXorMask[16] = {
2547
- 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0};
2548
- alignas(16) static constexpr uint8_t kRotWordShuffle[16] = {
2549
- 4, 5, 6, 7, 5, 6, 7, 4, 12, 13, 14, 15, 13, 14, 15, 12};
2550
3677
  const DFromV<decltype(v)> d;
3678
+ const V rconXorMask = Dup128VecFromValues(d, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0,
3679
+ 0, 0, kRcon, 0, 0, 0);
3680
+ const V rotWordShuffle = Dup128VecFromValues(d, 4, 5, 6, 7, 5, 6, 7, 4, 12,
3681
+ 13, 14, 15, 13, 14, 15, 12);
2551
3682
  const auto sub_word_result = detail::SubBytes(v);
2552
3683
  const auto rot_word_result =
2553
- TableLookupBytes(sub_word_result, LoadDup128(d, kRotWordShuffle));
2554
- return Xor(rot_word_result, LoadDup128(d, kRconXorMask));
3684
+ TableLookupBytes(sub_word_result, rotWordShuffle);
3685
+ return Xor(rot_word_result, rconXorMask);
2555
3686
  }
2556
3687
 
2557
3688
  // Constant-time implementation inspired by
@@ -2602,203 +3733,751 @@ HWY_API V CLMulUpper(V a, V b) {
2602
3733
  const auto b2 = And(b, k4);
2603
3734
  const auto b3 = And(b, k8);
2604
3735
 
2605
- auto m0 = Xor(MulOdd(a0, b0), MulOdd(a1, b3));
2606
- auto m1 = Xor(MulOdd(a0, b1), MulOdd(a1, b0));
2607
- auto m2 = Xor(MulOdd(a0, b2), MulOdd(a1, b1));
2608
- auto m3 = Xor(MulOdd(a0, b3), MulOdd(a1, b2));
2609
- m0 = Xor(m0, Xor(MulOdd(a2, b2), MulOdd(a3, b1)));
2610
- m1 = Xor(m1, Xor(MulOdd(a2, b3), MulOdd(a3, b2)));
2611
- m2 = Xor(m2, Xor(MulOdd(a2, b0), MulOdd(a3, b3)));
2612
- m3 = Xor(m3, Xor(MulOdd(a2, b1), MulOdd(a3, b0)));
2613
- return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8)));
3736
+ auto m0 = Xor(MulOdd(a0, b0), MulOdd(a1, b3));
3737
+ auto m1 = Xor(MulOdd(a0, b1), MulOdd(a1, b0));
3738
+ auto m2 = Xor(MulOdd(a0, b2), MulOdd(a1, b1));
3739
+ auto m3 = Xor(MulOdd(a0, b3), MulOdd(a1, b2));
3740
+ m0 = Xor(m0, Xor(MulOdd(a2, b2), MulOdd(a3, b1)));
3741
+ m1 = Xor(m1, Xor(MulOdd(a2, b3), MulOdd(a3, b2)));
3742
+ m2 = Xor(m2, Xor(MulOdd(a2, b0), MulOdd(a3, b3)));
3743
+ m3 = Xor(m3, Xor(MulOdd(a2, b1), MulOdd(a3, b0)));
3744
+ return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8)));
3745
+ }
3746
+
3747
+ #endif // HWY_NATIVE_AES
3748
+ #endif // HWY_TARGET != HWY_SCALAR
3749
+
3750
+ // ------------------------------ PopulationCount
3751
+
3752
+ #if (defined(HWY_NATIVE_POPCNT) == defined(HWY_TARGET_TOGGLE))
3753
+ #ifdef HWY_NATIVE_POPCNT
3754
+ #undef HWY_NATIVE_POPCNT
3755
+ #else
3756
+ #define HWY_NATIVE_POPCNT
3757
+ #endif
3758
+
3759
+ // This overload requires vectors to be at least 16 bytes, which is the case
3760
+ // for LMUL >= 2.
3761
+ #undef HWY_IF_POPCNT
3762
+ #if HWY_TARGET == HWY_RVV
3763
+ #define HWY_IF_POPCNT(D) \
3764
+ hwy::EnableIf<D().Pow2() >= 1 && D().MaxLanes() >= 16>* = nullptr
3765
+ #else
3766
+ // Other targets only have these two overloads which are mutually exclusive, so
3767
+ // no further conditions are required.
3768
+ #define HWY_IF_POPCNT(D) void* = nullptr
3769
+ #endif // HWY_TARGET == HWY_RVV
3770
+
3771
+ template <class V, class D = DFromV<V>, HWY_IF_U8_D(D),
3772
+ HWY_IF_V_SIZE_GT_D(D, 8), HWY_IF_POPCNT(D)>
3773
+ HWY_API V PopulationCount(V v) {
3774
+ const D d;
3775
+ const V lookup =
3776
+ Dup128VecFromValues(d, 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
3777
+ const auto lo = And(v, Set(d, uint8_t{0xF}));
3778
+ const auto hi = ShiftRight<4>(v);
3779
+ return Add(TableLookupBytes(lookup, hi), TableLookupBytes(lookup, lo));
3780
+ }
3781
+
3782
+ // RVV has a specialization that avoids the Set().
3783
+ #if HWY_TARGET != HWY_RVV
3784
+ // Slower fallback for capped vectors.
3785
+ template <class V, class D = DFromV<V>, HWY_IF_U8_D(D),
3786
+ HWY_IF_V_SIZE_LE_D(D, 8)>
3787
+ HWY_API V PopulationCount(V v) {
3788
+ const D d;
3789
+ // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3
3790
+ const V k33 = Set(d, uint8_t{0x33});
3791
+ v = Sub(v, And(ShiftRight<1>(v), Set(d, uint8_t{0x55})));
3792
+ v = Add(And(ShiftRight<2>(v), k33), And(v, k33));
3793
+ return And(Add(v, ShiftRight<4>(v)), Set(d, uint8_t{0x0F}));
3794
+ }
3795
+ #endif // HWY_TARGET != HWY_RVV
3796
+
3797
+ template <class V, class D = DFromV<V>, HWY_IF_U16_D(D)>
3798
+ HWY_API V PopulationCount(V v) {
3799
+ const D d;
3800
+ const Repartition<uint8_t, decltype(d)> d8;
3801
+ const auto vals = BitCast(d, PopulationCount(BitCast(d8, v)));
3802
+ return Add(ShiftRight<8>(vals), And(vals, Set(d, uint16_t{0xFF})));
3803
+ }
3804
+
3805
+ template <class V, class D = DFromV<V>, HWY_IF_U32_D(D)>
3806
+ HWY_API V PopulationCount(V v) {
3807
+ const D d;
3808
+ Repartition<uint16_t, decltype(d)> d16;
3809
+ auto vals = BitCast(d, PopulationCount(BitCast(d16, v)));
3810
+ return Add(ShiftRight<16>(vals), And(vals, Set(d, uint32_t{0xFF})));
3811
+ }
3812
+
3813
+ #if HWY_HAVE_INTEGER64
3814
+ template <class V, class D = DFromV<V>, HWY_IF_U64_D(D)>
3815
+ HWY_API V PopulationCount(V v) {
3816
+ const D d;
3817
+ Repartition<uint32_t, decltype(d)> d32;
3818
+ auto vals = BitCast(d, PopulationCount(BitCast(d32, v)));
3819
+ return Add(ShiftRight<32>(vals), And(vals, Set(d, 0xFFULL)));
3820
+ }
3821
+ #endif
3822
+
3823
+ #endif // HWY_NATIVE_POPCNT
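// Editorial sketch in plain C++ (not from the package diff above): scalar
// counterparts of the two u8 strategies above. The first mirrors the
// TableLookupBytes variant (a 16-entry popcount-of-nibble table applied to each
// nibble); the second mirrors the bit-trick fallback from the cited paper.
#include <cstdint>

uint8_t PopCount8Table(uint8_t v) {
  static const uint8_t kNibblePop[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                         1, 2, 2, 3, 2, 3, 3, 4};
  return static_cast<uint8_t>(kNibblePop[v & 0xF] + kNibblePop[v >> 4]);
}

uint8_t PopCount8Swar(uint8_t v) {
  v = static_cast<uint8_t>(v - ((v >> 1) & 0x55));
  v = static_cast<uint8_t>((v & 0x33) + ((v >> 2) & 0x33));
  return static_cast<uint8_t>((v + (v >> 4)) & 0x0F);
}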
3824
+
3825
+ // ------------------------------ 8-bit multiplication
3826
+
3827
+ #if (defined(HWY_NATIVE_MUL_8) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
3828
+ #ifdef HWY_NATIVE_MUL_8
3829
+ #undef HWY_NATIVE_MUL_8
3830
+ #else
3831
+ #define HWY_NATIVE_MUL_8
3832
+ #endif
3833
+
3834
+ // 8 bit and fits in wider reg: promote
3835
+ template <class V, HWY_IF_T_SIZE_V(V, 1),
3836
+ HWY_IF_V_SIZE_LE_V(V, HWY_MAX_BYTES / 2)>
3837
+ HWY_API V operator*(const V a, const V b) {
3838
+ const DFromV<decltype(a)> d;
3839
+ const Rebind<MakeWide<TFromV<V>>, decltype(d)> dw;
3840
+ const RebindToUnsigned<decltype(d)> du; // TruncateTo result
3841
+ const RebindToUnsigned<decltype(dw)> dwu; // TruncateTo input
3842
+ const VFromD<decltype(dw)> mul = PromoteTo(dw, a) * PromoteTo(dw, b);
3843
+ // TruncateTo is cheaper than ConcatEven.
3844
+ return BitCast(d, TruncateTo(du, BitCast(dwu, mul)));
3845
+ }
3846
+
3847
+ // 8 bit full reg: promote halves
3848
+ template <class V, HWY_IF_T_SIZE_V(V, 1),
3849
+ HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
3850
+ HWY_API V operator*(const V a, const V b) {
3851
+ const DFromV<decltype(a)> d;
3852
+ const Half<decltype(d)> dh;
3853
+ const Twice<RepartitionToWide<decltype(dh)>> dw;
3854
+ const VFromD<decltype(dw)> a0 = PromoteTo(dw, LowerHalf(dh, a));
3855
+ const VFromD<decltype(dw)> a1 = PromoteTo(dw, UpperHalf(dh, a));
3856
+ const VFromD<decltype(dw)> b0 = PromoteTo(dw, LowerHalf(dh, b));
3857
+ const VFromD<decltype(dw)> b1 = PromoteTo(dw, UpperHalf(dh, b));
3858
+ const VFromD<decltype(dw)> m0 = a0 * b0;
3859
+ const VFromD<decltype(dw)> m1 = a1 * b1;
3860
+ return ConcatEven(d, BitCast(d, m1), BitCast(d, m0));
3861
+ }
3862
+
3863
+ #endif // HWY_NATIVE_MUL_8
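// Editorial sketch in plain C++ (not from the package diff above): what the
// promote-multiply-truncate sequence above computes per lane, namely the
// 8-bit product modulo 256.
#include <cstdint>

uint8_t Mul8Lane(uint8_t a, uint8_t b) {
  return static_cast<uint8_t>(static_cast<uint16_t>(a) * b);
}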
3864
+
3865
+ // ------------------------------ 64-bit multiplication
3866
+
3867
+ #if (defined(HWY_NATIVE_MUL_64) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
3868
+ #ifdef HWY_NATIVE_MUL_64
3869
+ #undef HWY_NATIVE_MUL_64
3870
+ #else
3871
+ #define HWY_NATIVE_MUL_64
3872
+ #endif
3873
+
3874
+ // Single-lane i64 or u64
3875
+ template <class V, HWY_IF_T_SIZE_V(V, 8), HWY_IF_V_SIZE_V(V, 8),
3876
+ HWY_IF_NOT_FLOAT_V(V)>
3877
+ HWY_API V operator*(V x, V y) {
3878
+ const DFromV<V> d;
3879
+ using T = TFromD<decltype(d)>;
3880
+ using TU = MakeUnsigned<T>;
3881
+ const TU xu = static_cast<TU>(GetLane(x));
3882
+ const TU yu = static_cast<TU>(GetLane(y));
3883
+ return Set(d, static_cast<T>(xu * yu));
3884
+ }
3885
+
3886
+ template <class V, class D64 = DFromV<V>, HWY_IF_U64_D(D64),
3887
+ HWY_IF_V_SIZE_GT_D(D64, 8)>
3888
+ HWY_API V operator*(V x, V y) {
3889
+ RepartitionToNarrow<D64> d32;
3890
+ auto x32 = BitCast(d32, x);
3891
+ auto y32 = BitCast(d32, y);
3892
+ auto lolo = BitCast(d32, MulEven(x32, y32));
3893
+ auto lohi = BitCast(d32, MulEven(x32, BitCast(d32, ShiftRight<32>(y))));
3894
+ auto hilo = BitCast(d32, MulEven(BitCast(d32, ShiftRight<32>(x)), y32));
3895
+ auto hi = BitCast(d32, ShiftLeft<32>(BitCast(D64{}, lohi + hilo)));
3896
+ return BitCast(D64{}, lolo + hi);
3897
+ }
3898
+ template <class V, class DI64 = DFromV<V>, HWY_IF_I64_D(DI64),
3899
+ HWY_IF_V_SIZE_GT_D(DI64, 8)>
3900
+ HWY_API V operator*(V x, V y) {
3901
+ RebindToUnsigned<DI64> du64;
3902
+ return BitCast(DI64{}, BitCast(du64, x) * BitCast(du64, y));
3903
+ }
3904
+
3905
+ #endif // HWY_NATIVE_MUL_64
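// Editorial sketch in plain C++ (not from the package diff above): the
// schoolbook decomposition behind the u64 overload above. The low 64 bits of
// x*y are (x_lo*y_lo) plus the two 32x32 cross products shifted left by 32;
// the x_hi*y_hi term only affects bits >= 64 and is dropped.
#include <cstdint>

uint64_t Mul64Lo(uint64_t x, uint64_t y) {
  const uint64_t x_lo = x & 0xFFFFFFFFu, x_hi = x >> 32;
  const uint64_t y_lo = y & 0xFFFFFFFFu, y_hi = y >> 32;
  const uint64_t lolo = x_lo * y_lo;
  const uint64_t cross = (x_lo * y_hi + x_hi * y_lo) << 32;  // mod 2^64
  return lolo + cross;
}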
3906
+
3907
+ // ------------------------------ MulAdd / NegMulAdd
3908
+
3909
+ #if (defined(HWY_NATIVE_INT_FMA) == defined(HWY_TARGET_TOGGLE))
3910
+ #ifdef HWY_NATIVE_INT_FMA
3911
+ #undef HWY_NATIVE_INT_FMA
3912
+ #else
3913
+ #define HWY_NATIVE_INT_FMA
3914
+ #endif
3915
+
3916
+ #ifdef HWY_NATIVE_INT_FMSUB
3917
+ #undef HWY_NATIVE_INT_FMSUB
3918
+ #else
3919
+ #define HWY_NATIVE_INT_FMSUB
3920
+ #endif
3921
+
3922
+ template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
3923
+ HWY_API V MulAdd(V mul, V x, V add) {
3924
+ return Add(Mul(mul, x), add);
3925
+ }
3926
+
3927
+ template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
3928
+ HWY_API V NegMulAdd(V mul, V x, V add) {
3929
+ return Sub(add, Mul(mul, x));
3930
+ }
3931
+
3932
+ template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
3933
+ HWY_API V MulSub(V mul, V x, V sub) {
3934
+ return Sub(Mul(mul, x), sub);
3935
+ }
3936
+ #endif // HWY_NATIVE_INT_FMA
3937
+
3938
+ // ------------------------------ Integer MulSub / NegMulSub
3939
+ #if (defined(HWY_NATIVE_INT_FMSUB) == defined(HWY_TARGET_TOGGLE))
3940
+ #ifdef HWY_NATIVE_INT_FMSUB
3941
+ #undef HWY_NATIVE_INT_FMSUB
3942
+ #else
3943
+ #define HWY_NATIVE_INT_FMSUB
3944
+ #endif
3945
+
3946
+ template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
3947
+ HWY_API V MulSub(V mul, V x, V sub) {
3948
+ const DFromV<decltype(mul)> d;
3949
+ const RebindToSigned<decltype(d)> di;
3950
+ return MulAdd(mul, x, BitCast(d, Neg(BitCast(di, sub))));
3951
+ }
3952
+
3953
+ #endif // HWY_NATIVE_INT_FMSUB
3954
+
3955
+ template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
3956
+ HWY_API V NegMulSub(V mul, V x, V sub) {
3957
+ const DFromV<decltype(mul)> d;
3958
+ const RebindToSigned<decltype(d)> di;
3959
+
3960
+ return BitCast(d, Neg(BitCast(di, MulAdd(mul, x, sub))));
3961
+ }
3962
+
3963
+ // ------------------------------ MulAddSub
3964
+
3965
+ // MulAddSub(mul, x, sub_or_add) for a 1-lane vector is equivalent to
3966
+ // MulSub(mul, x, sub_or_add)
3967
+ template <class V, HWY_IF_LANES_D(DFromV<V>, 1)>
3968
+ HWY_API V MulAddSub(V mul, V x, V sub_or_add) {
3969
+ return MulSub(mul, x, sub_or_add);
3970
+ }
3971
+
3972
+ // MulAddSub for F16/F32/F64 vectors with 2 or more lanes on
3973
+ // SSSE3/SSE4/AVX2/AVX3 is implemented in x86_128-inl.h, x86_256-inl.h, and
3974
+ // x86_512-inl.h
3975
+ template <class V, HWY_IF_LANES_GT_D(DFromV<V>, 1),
3976
+ HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | ((HWY_TARGET <= HWY_SSSE3 &&
3977
+ hwy::IsFloat<TFromV<V>>())
3978
+ ? 0
3979
+ : ((1 << 2) | (1 << 4) |
3980
+ (1 << 8))))>
3981
+ HWY_API V MulAddSub(V mul, V x, V sub_or_add) {
3982
+ using D = DFromV<V>;
3983
+ using T = TFromD<D>;
3984
+ using TNegate = If<!IsSigned<T>(), MakeSigned<T>, T>;
3985
+
3986
+ const D d;
3987
+ const Rebind<TNegate, D> d_negate;
3988
+
3989
+ const auto add =
3990
+ OddEven(sub_or_add, BitCast(d, Neg(BitCast(d_negate, sub_or_add))));
3991
+ return MulAdd(mul, x, add);
3992
+ }
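// Editorial sketch in plain C++ (not from the package diff above): the lane
// pattern produced by the OddEven/Neg construction above. Even lanes compute
// mul*x - sub_or_add, odd lanes compute mul*x + sub_or_add.
#include <cstddef>
#include <vector>

std::vector<float> MulAddSubScalar(const std::vector<float>& mul,
                                   const std::vector<float>& x,
                                   const std::vector<float>& sub_or_add) {
  std::vector<float> out(mul.size());
  for (size_t i = 0; i < out.size(); ++i) {
    const float term = (i % 2 == 0) ? -sub_or_add[i] : sub_or_add[i];
    out[i] = mul[i] * x[i] + term;
  }
  return out;
}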
3993
+
3994
+ // ------------------------------ Integer division
3995
+ #if (defined(HWY_NATIVE_INT_DIV) == defined(HWY_TARGET_TOGGLE))
3996
+ #ifdef HWY_NATIVE_INT_DIV
3997
+ #undef HWY_NATIVE_INT_DIV
3998
+ #else
3999
+ #define HWY_NATIVE_INT_DIV
4000
+ #endif
4001
+
4002
+ namespace detail {
4003
+
4004
+ template <class D, class V, HWY_IF_T_SIZE_D(D, sizeof(TFromV<V>))>
4005
+ HWY_INLINE Vec<D> IntDivConvFloatToInt(D di, V vf) {
4006
+ return ConvertTo(di, vf);
4007
+ }
4008
+
4009
+ template <class D, class V, HWY_IF_T_SIZE_D(D, sizeof(TFromV<V>))>
4010
+ HWY_INLINE Vec<D> IntDivConvIntToFloat(D df, V vi) {
4011
+ return ConvertTo(df, vi);
4012
+ }
4013
+
4014
+ #if !HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
4015
+ template <class D, class V, HWY_IF_UI64_D(D), HWY_IF_F32(TFromV<V>)>
4016
+ HWY_INLINE Vec<D> IntDivConvFloatToInt(D df, V vi) {
4017
+ return PromoteTo(df, vi);
4018
+ }
4019
+
4020
+ // If !HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64 is true, then UI64->F32
4021
+ // IntDivConvIntToFloat(df, vi) returns an approximation of
4022
+ // static_cast<float>(v[i]) that is within 4 ULP of static_cast<float>(v[i])
4023
+ template <class D, class V, HWY_IF_F32_D(D), HWY_IF_I64(TFromV<V>)>
4024
+ HWY_INLINE Vec<D> IntDivConvIntToFloat(D df32, V vi) {
4025
+ const Twice<decltype(df32)> dt_f32;
4026
+
4027
+ auto vf32 =
4028
+ ConvertTo(dt_f32, BitCast(RebindToSigned<decltype(dt_f32)>(), vi));
4029
+
4030
+ #if HWY_IS_LITTLE_ENDIAN
4031
+ const auto lo_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32));
4032
+ auto hi_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32));
4033
+ #else
4034
+ const auto lo_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32));
4035
+ auto hi_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32));
4036
+ #endif
4037
+
4038
+ const RebindToSigned<decltype(df32)> di32;
4039
+
4040
+ hi_f32 =
4041
+ Add(hi_f32, And(BitCast(df32, BroadcastSignBit(BitCast(di32, lo_f32))),
4042
+ Set(df32, 1.0f)));
4043
+ return hwy::HWY_NAMESPACE::MulAdd(hi_f32, Set(df32, 4294967296.0f), lo_f32);
4044
+ }
4045
+
4046
+ template <class D, class V, HWY_IF_F32_D(D), HWY_IF_U64(TFromV<V>)>
4047
+ HWY_INLINE Vec<D> IntDivConvIntToFloat(D df32, V vu) {
4048
+ const Twice<decltype(df32)> dt_f32;
4049
+
4050
+ auto vf32 =
4051
+ ConvertTo(dt_f32, BitCast(RebindToUnsigned<decltype(dt_f32)>(), vu));
4052
+
4053
+ #if HWY_IS_LITTLE_ENDIAN
4054
+ const auto lo_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32));
4055
+ const auto hi_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32));
4056
+ #else
4057
+ const auto lo_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32));
4058
+ const auto hi_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32));
4059
+ #endif
4060
+
4061
+ return hwy::HWY_NAMESPACE::MulAdd(hi_f32, Set(df32, 4294967296.0f), lo_f32);
4062
+ }
4063
+ #endif // !HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
4064
+
4065
+ template <size_t kOrigLaneSize, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
4066
+ HWY_IF_T_SIZE_GT(TFromV<V>, kOrigLaneSize)>
4067
+ HWY_INLINE V IntDivUsingFloatDiv(V a, V b) {
4068
+ const DFromV<decltype(a)> d;
4069
+ const RebindToFloat<decltype(d)> df;
4070
+
4071
+ // If kOrigLaneSize < sizeof(T) is true, then a[i] and b[i] are both in the
4072
+ // [LimitsMin<SignedFromSize<kOrigLaneSize>>(),
4073
+ // LimitsMax<UnsignedFromSize<kOrigLaneSize>>()] range.
4074
+
4075
+ // floor(|a[i] / b[i]|) <= |flt_q| < floor(|a[i] / b[i]|) + 1 is also
4076
+ // guaranteed to be true if MakeFloat<T> has at least kOrigLaneSize*8 + 1
4077
+ // mantissa bits (including the implied one bit), where flt_q is equal to
4078
+ // static_cast<MakeFloat<T>>(a[i]) / static_cast<MakeFloat<T>>(b[i]),
4079
+ // even in the case where the magnitude of an inexact floating point division
4080
+ // result is rounded up.
4081
+
4082
+ // In other words, floor(flt_q) < flt_q < ceil(flt_q) is guaranteed to be true
4083
+ // if (a[i] % b[i]) != 0 is true and MakeFloat<T> has at least
4084
+ // kOrigLaneSize*8 + 1 mantissa bits (including the implied one bit), even in
4085
+ // the case where the magnitude of an inexact floating point division result
4086
+ // is rounded up.
4087
+
4088
+ #if (HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES) && \
4089
+ !HWY_HAVE_FLOAT64
4090
+ // On Armv7, do division by multiplying by the ApproximateReciprocal
4091
+ // to avoid unnecessary overhead as F32 Div refines the approximate
4092
+ // reciprocal using 4 Newton-Raphson iterations
4093
+
4094
+ const RebindToSigned<decltype(d)> di;
4095
+ const RebindToUnsigned<decltype(d)> du;
4096
+
4097
+ const auto flt_b = ConvertTo(df, b);
4098
+ auto flt_recip_b = ApproximateReciprocal(flt_b);
4099
+ if (kOrigLaneSize > 1) {
4100
+ flt_recip_b =
4101
+ Mul(flt_recip_b, ReciprocalNewtonRaphsonStep(flt_recip_b, flt_b));
4102
+ }
4103
+
4104
+ auto q0 = ConvertTo(d, Mul(ConvertTo(df, a), flt_recip_b));
4105
+ const auto r0 = BitCast(di, hwy::HWY_NAMESPACE::NegMulAdd(q0, b, a));
4106
+
4107
+ auto r1 = r0;
4108
+
4109
+ // Need to negate r1[i] if a[i] < 0 is true
4110
+ if (IsSigned<TFromV<V>>()) {
4111
+ r1 = IfNegativeThenNegOrUndefIfZero(BitCast(di, a), r1);
4112
+ }
4113
+
4114
+ // r1[i] is now equal to (a[i] < 0) ? (-r0[i]) : r0[i]
4115
+
4116
+ auto abs_b = BitCast(du, b);
4117
+ if (IsSigned<TFromV<V>>()) {
4118
+ abs_b = BitCast(du, Abs(BitCast(di, abs_b)));
4119
+ }
4120
+
4121
+ // If (r1[i] < 0 || r1[i] >= abs_b[i]) is true, then set q1[i] to -1.
4122
+ // Otherwise, set q1[i] to 0.
4123
+
4124
+ // (r1[i] < 0 || r1[i] >= abs_b[i]) can be carried out using a single unsigned
4125
+ // comparison as static_cast<TU>(r1[i]) >= TU(LimitsMax<TI>() + 1) >= abs_b[i]
4126
+ // will be true if r1[i] < 0 is true.
4127
+ auto q1 = BitCast(di, VecFromMask(du, Ge(BitCast(du, r1), abs_b)));
4128
+
4129
+ // q1[i] is now equal to (r1[i] < 0 || r1[i] >= abs_b[i]) ? -1 : 0
4130
+
4131
+ // Need to negate q1[i] if r0[i] and b[i] do not have the same sign
4132
+ auto q1_negate_mask = r0;
4133
+ if (IsSigned<TFromV<V>>()) {
4134
+ q1_negate_mask = Xor(q1_negate_mask, BitCast(di, b));
4135
+ }
4136
+ q1 = IfNegativeThenElse(q1_negate_mask, Neg(q1), q1);
4137
+
4138
+ // q1[i] is now equal to (r1[i] < 0 || r1[i] >= abs_b[i]) ?
4139
+   //   (((r0[i] ^ b[i]) < 0) ? 1 : -1) : 0
4140
+
4141
+ // Need to subtract q1[i] from q0[i] to get the final result
4142
+ return Sub(q0, BitCast(d, q1));
4143
+ #else
4144
+   // On targets other than Armv7 NEON, use F16 or F32 division, as most of
4145
+   // them have native F32 divide instructions
4146
+ return ConvertTo(d, Div(ConvertTo(df, a), ConvertTo(df, b)));
4147
+ #endif
4148
+ }
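The Armv7 branch above tolerates a quotient estimate that is off by one in either direction and repairs it from the remainder. Below is a scalar sketch of that fix-up; the function name and the exact 1.0f / b standing in for the refined NEON reciprocal are my own, and it assumes a and b come from a narrower lane type so the float conversions are exact (it is not part of this diff).

```cpp
#include <cstdint>

// q0 is allowed to be off by +/-1; the remainder test then corrects it.
static int32_t DivViaInexactF32(int32_t a, int32_t b) {
  const float recip = 1.0f / static_cast<float>(b);  // models ApproximateReciprocal + NR step
  const int32_t q0 = static_cast<int32_t>(static_cast<float>(a) * recip);
  const int32_t r0 = a - q0 * b;                     // NegMulAdd(q0, b, a)
  const int32_t r1 = (a < 0) ? -r0 : r0;             // remainder, sign-adjusted by a
  const uint32_t abs_b =
      (b < 0) ? (0u - static_cast<uint32_t>(b)) : static_cast<uint32_t>(b);
  // One unsigned compare covers both r1 < 0 and r1 >= |b|.
  int32_t q1 = (static_cast<uint32_t>(r1) >= abs_b) ? -1 : 0;
  if ((r0 ^ b) < 0) q1 = -q1;                        // direction of the correction
  return q0 - q1;
}
```

For instance, with a = 7, b = 2 and an estimate q0 = 4, r0 is -1, q1 ends up as 1, and the returned quotient is the exact 3.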
4149
+
4150
+ template <size_t kOrigLaneSize, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
4151
+ HWY_IF_T_SIZE(TFromV<V>, kOrigLaneSize),
4152
+ HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))>
4153
+ HWY_INLINE V IntDivUsingFloatDiv(V a, V b) {
4154
+ // If kOrigLaneSize == sizeof(T) is true, at least two reciprocal
4155
+ // multiplication steps are needed as the mantissa of MakeFloat<T> has fewer
4156
+ // than kOrigLaneSize*8 + 1 bits
4157
+
4158
+ using T = TFromV<V>;
4159
+
4160
+ #if HWY_HAVE_FLOAT64
4161
+ using TF = MakeFloat<T>;
4162
+ #else
4163
+ using TF = float;
4164
+ #endif
4165
+
4166
+ const DFromV<decltype(a)> d;
4167
+ const RebindToSigned<decltype(d)> di;
4168
+ const RebindToUnsigned<decltype(d)> du;
4169
+ const Rebind<TF, decltype(d)> df;
4170
+
4171
+ if (!IsSigned<T>()) {
4172
+   // If T is unsigned and b[i] > LimitsMax<MakeSigned<T>>() is true, set a[i]
4173
+   // to (a[i] >= b[i] ? 1 : 0) and set b[i] to 1
4174
+
4175
+ const auto one = Set(di, MakeSigned<T>{1});
4176
+ a = BitCast(
4177
+ d, IfNegativeThenElse(BitCast(di, b),
4178
+ IfThenElseZero(RebindMask(di, Ge(a, b)), one),
4179
+ BitCast(di, a)));
4180
+ b = BitCast(d, IfNegativeThenElse(BitCast(di, b), one, BitCast(di, b)));
4181
+ }
4182
+
4183
+ // LimitsMin<T>() <= b[i] <= LimitsMax<MakeSigned<T>>() is now true
4184
+
4185
+ const auto flt_b = IntDivConvIntToFloat(df, b);
4186
+
4187
+ #if (HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES) && \
4188
+ !HWY_HAVE_FLOAT64
4189
+ auto flt_recip_b = ApproximateReciprocal(flt_b);
4190
+ flt_recip_b =
4191
+ Mul(flt_recip_b, ReciprocalNewtonRaphsonStep(flt_recip_b, flt_b));
4192
+ #else
4193
+ const auto flt_recip_b = Div(Set(df, TF(1.0)), flt_b);
4194
+ #endif
4195
+
4196
+ auto q0 =
4197
+ IntDivConvFloatToInt(d, Mul(IntDivConvIntToFloat(df, a), flt_recip_b));
4198
+ const auto r0 = BitCast(di, hwy::HWY_NAMESPACE::NegMulAdd(q0, b, a));
4199
+
4200
+ auto q1 =
4201
+ IntDivConvFloatToInt(di, Mul(IntDivConvIntToFloat(df, r0), flt_recip_b));
4202
+ const auto r1 = hwy::HWY_NAMESPACE::NegMulAdd(q1, BitCast(di, b), r0);
4203
+
4204
+ auto r3 = r1;
4205
+
4206
+ #if !HWY_HAVE_FLOAT64
4207
+ // Need two additional reciprocal multiplication steps for I64/U64 vectors if
4208
+ // HWY_HAVE_FLOAT64 is 0
4209
+ if (sizeof(T) == 8) {
4210
+ const auto q2 = IntDivConvFloatToInt(
4211
+ di, Mul(IntDivConvIntToFloat(df, r1), flt_recip_b));
4212
+ const auto r2 = hwy::HWY_NAMESPACE::NegMulAdd(q2, BitCast(di, b), r1);
4213
+
4214
+ const auto q3 = IntDivConvFloatToInt(
4215
+ di, Mul(IntDivConvIntToFloat(df, r2), flt_recip_b));
4216
+ r3 = hwy::HWY_NAMESPACE::NegMulAdd(q3, BitCast(di, b), r2);
4217
+
4218
+ q0 = Add(q0, BitCast(d, q2));
4219
+ q1 = Add(q1, q3);
4220
+ }
4221
+ #endif // !HWY_HAVE_FLOAT64
4222
+
4223
+ auto r4 = r3;
4224
+
4225
+ // Need to negate r4[i] if a[i] < 0 is true
4226
+ if (IsSigned<TFromV<V>>()) {
4227
+ r4 = IfNegativeThenNegOrUndefIfZero(BitCast(di, a), r4);
4228
+ }
4229
+
4230
+ // r4[i] is now equal to (a[i] < 0) ? (-r3[i]) : r3[i]
4231
+
4232
+ auto abs_b = BitCast(du, b);
4233
+ if (IsSigned<TFromV<V>>()) {
4234
+ abs_b = BitCast(du, Abs(BitCast(di, abs_b)));
4235
+ }
4236
+
4237
+ // If (r4[i] < 0 || r4[i] >= abs_b[i]) is true, then set q4[i] to -1.
4238
+   // Otherwise, set q4[i] to 0.
4239
+
4240
+ // (r4[i] < 0 || r4[i] >= abs_b[i]) can be carried out using a single unsigned
4241
+ // comparison as static_cast<TU>(r4[i]) >= TU(LimitsMax<TI>() + 1) >= abs_b[i]
4242
+ // will be true if r4[i] < 0 is true.
4243
+ auto q4 = BitCast(di, VecFromMask(du, Ge(BitCast(du, r4), abs_b)));
4244
+
4245
+ // q4[i] is now equal to (r4[i] < 0 || r4[i] >= abs_b[i]) ? -1 : 0
4246
+
4247
+ // Need to negate q4[i] if r3[i] and b[i] do not have the same sign
4248
+ auto q4_negate_mask = r3;
4249
+ if (IsSigned<TFromV<V>>()) {
4250
+ q4_negate_mask = Xor(q4_negate_mask, BitCast(di, b));
4251
+ }
4252
+ q4 = IfNegativeThenElse(q4_negate_mask, Neg(q4), q4);
4253
+
4254
+ // q4[i] is now equal to (r4[i] < 0 || r4[i] >= abs_b[i]) ?
4255
+   //   (((r3[i] ^ b[i]) < 0) ? 1 : -1) : 0
4256
+
4257
+ // The final result is equal to q0[i] + q1[i] - q4[i]
4258
+ return Sub(Add(q0, BitCast(d, q1)), BitCast(d, q4));
2614
4259
  }
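The unsigned pre-step near the top of this overload deserves spelling out: when b[i] lies outside the signed range, the quotient can only be 0 or 1, so the operands are replaced by an equivalent pair whose divisor does fit. A hedged scalar sketch of that normalization (names are mine, not from the diff); only the quotient has to be preserved because IntMod reconstructs the remainder later from the original operands.

```cpp
#include <cstdint>

// Replace (a, b) by a pair with the same truncated quotient whose divisor
// fits in the signed 32-bit range.
static void NormalizeLargeUnsignedDivisor(uint32_t& a, uint32_t& b) {
  if (b > 0x7FFFFFFFu) {     // b > LimitsMax<int32_t>()
    a = (a >= b) ? 1u : 0u;  // the original quotient is exactly this value
    b = 1u;                  // 0/1 or 1/1 reproduces it
  }
}
```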
2615
4260
 
2616
- #endif // HWY_NATIVE_AES
2617
- #endif // HWY_TARGET != HWY_SCALAR
4261
+ template <size_t kOrigLaneSize, class V,
4262
+ HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
4263
+ HWY_IF_V_SIZE_LE_V(
4264
+ V, HWY_MAX_BYTES /
4265
+ ((!HWY_HAVE_FLOAT16 && sizeof(TFromV<V>) == 1) ? 4 : 2))>
4266
+ HWY_INLINE V IntDiv(V a, V b) {
4267
+ using T = TFromV<V>;
2618
4268
 
2619
- // ------------------------------ PopulationCount
4269
+ // If HWY_HAVE_FLOAT16 is 0, need to promote I8 to I32 and U8 to U32
4270
+ using TW = MakeWide<
4271
+ If<(!HWY_HAVE_FLOAT16 && sizeof(TFromV<V>) == 1), MakeWide<T>, T>>;
2620
4272
 
2621
- // "Include guard": skip if native POPCNT-related instructions are available.
2622
- #if (defined(HWY_NATIVE_POPCNT) == defined(HWY_TARGET_TOGGLE))
2623
- #ifdef HWY_NATIVE_POPCNT
2624
- #undef HWY_NATIVE_POPCNT
4273
+ const DFromV<decltype(a)> d;
4274
+ const Rebind<TW, decltype(d)> dw;
4275
+
4276
+ #if HWY_TARGET <= HWY_SSE2
4277
+ // On SSE2/SSSE3/SSE4/AVX2/AVX3, promote to and from MakeSigned<TW> to avoid
4278
+ // unnecessary overhead
4279
+ const RebindToSigned<decltype(dw)> dw_i;
4280
+
4281
+ // On SSE2/SSSE3/SSE4/AVX2/AVX3, demote to MakeSigned<T> if
4282
+ // kOrigLaneSize < sizeof(T) to avoid unnecessary overhead
4283
+ const If<(kOrigLaneSize < sizeof(T)), RebindToSigned<decltype(d)>,
4284
+ decltype(d)>
4285
+ d_demote_to;
2625
4286
  #else
2626
- #define HWY_NATIVE_POPCNT
4287
+ // On other targets, promote to TW and demote to T
4288
+ const decltype(dw) dw_i;
4289
+ const decltype(d) d_demote_to;
2627
4290
  #endif
2628
4291
 
2629
- // This overload requires vectors to be at least 16 bytes, which is the case
2630
- // for LMUL >= 2.
2631
- #undef HWY_IF_POPCNT
2632
- #if HWY_TARGET == HWY_RVV
2633
- #define HWY_IF_POPCNT(D) \
2634
- hwy::EnableIf<D().Pow2() >= 1 && D().MaxLanes() >= 16>* = nullptr
4292
+ return BitCast(
4293
+ d, DemoteTo(d_demote_to, IntDivUsingFloatDiv<kOrigLaneSize>(
4294
+ PromoteTo(dw_i, a), PromoteTo(dw_i, b))));
4295
+ }
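The overload above promotes 8- and 16-bit lanes to a wider type before dividing, so the earlier mantissa argument applies: with at least kOrigLaneSize*8 + 1 mantissa bits, truncating the (possibly inexact) float quotient still yields the exact integer quotient. A one-function scalar illustration of that guarantee (not part of the diff; b must be nonzero):

```cpp
#include <cstdint>

// float has 24 mantissa bits >= 16 + 1, so truncating the rounded F32 quotient
// recovers the exact integer quotient for any 16-bit operands.
static uint16_t Div16ViaF32(uint16_t a, uint16_t b) {
  return static_cast<uint16_t>(static_cast<float>(a) / static_cast<float>(b));
}
```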
4296
+
4297
+ template <size_t kOrigLaneSize, class V,
4298
+ HWY_IF_T_SIZE_ONE_OF_V(V,
4299
+ (HWY_HAVE_FLOAT16 ? (1 << 1) : 0) | (1 << 2)),
4300
+ HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
4301
+ HWY_INLINE V IntDiv(V a, V b) {
4302
+ const DFromV<decltype(a)> d;
4303
+ const RepartitionToWide<decltype(d)> dw;
4304
+
4305
+ #if HWY_TARGET <= HWY_SSE2
4306
+ // On SSE2/SSSE3/SSE4/AVX2/AVX3, promote to and from MakeSigned<TW> to avoid
4307
+ // unnecessary overhead
4308
+ const RebindToSigned<decltype(dw)> dw_i;
4309
+
4310
+ // On SSE2/SSSE3/SSE4/AVX2/AVX3, demote to MakeSigned<TFromV<V>> if
4311
+ // kOrigLaneSize < sizeof(TFromV<V>) to avoid unnecessary overhead
4312
+ const If<(kOrigLaneSize < sizeof(TFromV<V>)), RebindToSigned<decltype(d)>,
4313
+ decltype(d)>
4314
+ d_demote_to;
2635
4315
  #else
2636
- // Other targets only have these two overloads which are mutually exclusive, so
2637
- // no further conditions are required.
2638
- #define HWY_IF_POPCNT(D) void* = nullptr
2639
- #endif // HWY_TARGET == HWY_RVV
4316
+ // On other targets, promote to MakeWide<TFromV<V>> and demote to TFromV<V>
4317
+ const decltype(dw) dw_i;
4318
+ const decltype(d) d_demote_to;
4319
+ #endif
2640
4320
 
2641
- template <class V, class D = DFromV<V>, HWY_IF_U8_D(D),
2642
- HWY_IF_V_SIZE_GT_D(D, 8), HWY_IF_POPCNT(D)>
2643
- HWY_API V PopulationCount(V v) {
2644
- const D d;
2645
- HWY_ALIGN constexpr uint8_t kLookup[16] = {
2646
- 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
2647
- };
2648
- const auto lo = And(v, Set(d, uint8_t{0xF}));
2649
- const auto hi = ShiftRight<4>(v);
2650
- const auto lookup = LoadDup128(d, kLookup);
2651
- return Add(TableLookupBytes(lookup, hi), TableLookupBytes(lookup, lo));
4321
+ return BitCast(d, OrderedDemote2To(
4322
+ d_demote_to,
4323
+ IntDivUsingFloatDiv<kOrigLaneSize>(
4324
+ PromoteLowerTo(dw_i, a), PromoteLowerTo(dw_i, b)),
4325
+ IntDivUsingFloatDiv<kOrigLaneSize>(
4326
+ PromoteUpperTo(dw_i, a), PromoteUpperTo(dw_i, b))));
2652
4327
  }
2653
4328
 
2654
- // RVV has a specialization that avoids the Set().
2655
- #if HWY_TARGET != HWY_RVV
2656
- // Slower fallback for capped vectors.
2657
- template <class V, class D = DFromV<V>, HWY_IF_U8_D(D),
2658
- HWY_IF_V_SIZE_LE_D(D, 8)>
2659
- HWY_API V PopulationCount(V v) {
2660
- const D d;
2661
- // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3
2662
- const V k33 = Set(d, uint8_t{0x33});
2663
- v = Sub(v, And(ShiftRight<1>(v), Set(d, uint8_t{0x55})));
2664
- v = Add(And(ShiftRight<2>(v), k33), And(v, k33));
2665
- return And(Add(v, ShiftRight<4>(v)), Set(d, uint8_t{0x0F}));
2666
- }
2667
- #endif // HWY_TARGET != HWY_RVV
4329
+ #if !HWY_HAVE_FLOAT16
4330
+ template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>),
4331
+ HWY_IF_V_SIZE_V(V, HWY_MAX_BYTES / 2)>
4332
+ HWY_INLINE V IntDiv(V a, V b) {
4333
+ const DFromV<decltype(a)> d;
4334
+ const Rebind<MakeWide<TFromV<V>>, decltype(d)> dw;
2668
4335
 
2669
- template <class V, class D = DFromV<V>, HWY_IF_U16_D(D)>
2670
- HWY_API V PopulationCount(V v) {
2671
- const D d;
2672
- const Repartition<uint8_t, decltype(d)> d8;
2673
- const auto vals = BitCast(d, PopulationCount(BitCast(d8, v)));
2674
- return Add(ShiftRight<8>(vals), And(vals, Set(d, uint16_t{0xFF})));
4336
+ #if HWY_TARGET <= HWY_SSE2
4337
+ // On SSE2/SSSE3, demote from int16_t to TFromV<V> to avoid unnecessary
4338
+ // overhead
4339
+ const RebindToSigned<decltype(dw)> dw_i;
4340
+ #else
4341
+ // On other targets, demote from MakeWide<TFromV<V>> to TFromV<V>
4342
+ const decltype(dw) dw_i;
4343
+ #endif
4344
+
4345
+ return DemoteTo(d,
4346
+ BitCast(dw_i, IntDiv<1>(PromoteTo(dw, a), PromoteTo(dw, b))));
2675
4347
  }
4348
+ template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>),
4349
+ HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
4350
+ HWY_INLINE V IntDiv(V a, V b) {
4351
+ const DFromV<decltype(a)> d;
4352
+ const RepartitionToWide<decltype(d)> dw;
2676
4353
 
2677
- template <class V, class D = DFromV<V>, HWY_IF_U32_D(D)>
2678
- HWY_API V PopulationCount(V v) {
2679
- const D d;
2680
- Repartition<uint16_t, decltype(d)> d16;
2681
- auto vals = BitCast(d, PopulationCount(BitCast(d16, v)));
2682
- return Add(ShiftRight<16>(vals), And(vals, Set(d, uint32_t{0xFF})));
4354
+ #if HWY_TARGET <= HWY_SSE2
4355
+ // On SSE2/SSSE3, demote from int16_t to TFromV<V> to avoid unnecessary
4356
+ // overhead
4357
+ const RebindToSigned<decltype(dw)> dw_i;
4358
+ #else
4359
+ // On other targets, demote from MakeWide<TFromV<V>> to TFromV<V>
4360
+ const decltype(dw) dw_i;
4361
+ #endif
4362
+
4363
+ return OrderedDemote2To(
4364
+ d, BitCast(dw_i, IntDiv<1>(PromoteLowerTo(dw, a), PromoteLowerTo(dw, b))),
4365
+ BitCast(dw_i, IntDiv<1>(PromoteUpperTo(dw, a), PromoteUpperTo(dw, b))));
2683
4366
  }
4367
+ #endif // !HWY_HAVE_FLOAT16
2684
4368
 
2685
- #if HWY_HAVE_INTEGER64
2686
- template <class V, class D = DFromV<V>, HWY_IF_U64_D(D)>
2687
- HWY_API V PopulationCount(V v) {
2688
- const D d;
2689
- Repartition<uint32_t, decltype(d)> d32;
2690
- auto vals = BitCast(d, PopulationCount(BitCast(d32, v)));
2691
- return Add(ShiftRight<32>(vals), And(vals, Set(d, 0xFFULL)));
4369
+ template <size_t kOrigLaneSize, class V,
4370
+ HWY_IF_T_SIZE_ONE_OF_V(V,
4371
+ (HWY_HAVE_FLOAT64 ? 0 : (1 << 4)) | (1 << 8))>
4372
+ HWY_INLINE V IntDiv(V a, V b) {
4373
+ return IntDivUsingFloatDiv<kOrigLaneSize>(a, b);
2692
4374
  }
2693
- #endif
2694
4375
 
2695
- #endif // HWY_NATIVE_POPCNT
4376
+ #if HWY_HAVE_FLOAT64
4377
+ template <size_t kOrigLaneSize, class V, HWY_IF_UI32(TFromV<V>),
4378
+ HWY_IF_V_SIZE_LE_V(V, HWY_MAX_BYTES / 2)>
4379
+ HWY_INLINE V IntDiv(V a, V b) {
4380
+ const DFromV<decltype(a)> d;
4381
+ const Rebind<double, decltype(d)> df64;
2696
4382
 
2697
- // ------------------------------ 8-bit multiplication
4383
+ return DemoteTo(d, Div(PromoteTo(df64, a), PromoteTo(df64, b)));
4384
+ }
4385
+ template <size_t kOrigLaneSize, class V, HWY_IF_UI32(TFromV<V>),
4386
+ HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
4387
+ HWY_INLINE V IntDiv(V a, V b) {
4388
+ const DFromV<decltype(a)> d;
4389
+ const Half<decltype(d)> dh;
4390
+ const Repartition<double, decltype(d)> df64;
2698
4391
 
2699
- // "Include guard": skip if native 8-bit mul instructions are available.
2700
- #if (defined(HWY_NATIVE_MUL_8) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
2701
- #ifdef HWY_NATIVE_MUL_8
2702
- #undef HWY_NATIVE_MUL_8
2703
- #else
2704
- #define HWY_NATIVE_MUL_8
2705
- #endif
4392
+ return Combine(
4393
+ d, DemoteTo(dh, Div(PromoteUpperTo(df64, a), PromoteUpperTo(df64, b))),
4394
+ DemoteTo(dh, Div(PromoteLowerTo(df64, a), PromoteLowerTo(df64, b))));
4395
+ }
4396
+ #endif // HWY_HAVE_FLOAT64
2706
4397
 
2707
- // 8 bit and fits in wider reg: promote
2708
- template <class V, HWY_IF_T_SIZE_V(V, 1),
4398
+ template <size_t kOrigLaneSize, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
4399
+ HWY_IF_T_SIZE_ONE_OF_V(V, ((HWY_TARGET <= HWY_SSE2 ||
4400
+ HWY_TARGET == HWY_WASM ||
4401
+ HWY_TARGET == HWY_WASM_EMU256)
4402
+ ? 0
4403
+ : (1 << 1)) |
4404
+ (1 << 2) | (1 << 4) | (1 << 8))>
4405
+ HWY_INLINE V IntMod(V a, V b) {
4406
+ return hwy::HWY_NAMESPACE::NegMulAdd(IntDiv<kOrigLaneSize>(a, b), b, a);
4407
+ }
4408
+
4409
+ #if HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_WASM || \
4410
+ HWY_TARGET == HWY_WASM_EMU256
4411
+ template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>),
2709
4412
  HWY_IF_V_SIZE_LE_V(V, HWY_MAX_BYTES / 2)>
2710
- HWY_API V operator*(const V a, const V b) {
4413
+ HWY_INLINE V IntMod(V a, V b) {
2711
4414
  const DFromV<decltype(a)> d;
2712
4415
  const Rebind<MakeWide<TFromV<V>>, decltype(d)> dw;
2713
- const RebindToUnsigned<decltype(d)> du; // TruncateTo result
2714
- const RebindToUnsigned<decltype(dw)> dwu; // TruncateTo input
2715
- const VFromD<decltype(dw)> mul = PromoteTo(dw, a) * PromoteTo(dw, b);
2716
- // TruncateTo is cheaper than ConcatEven.
2717
- return BitCast(d, TruncateTo(du, BitCast(dwu, mul)));
4416
+ return DemoteTo(d, IntMod<kOrigLaneSize>(PromoteTo(dw, a), PromoteTo(dw, b)));
2718
4417
  }
2719
4418
 
2720
- // 8 bit full reg: promote halves
2721
- template <class V, HWY_IF_T_SIZE_V(V, 1),
4419
+ template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>),
2722
4420
  HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
2723
- HWY_API V operator*(const V a, const V b) {
4421
+ HWY_INLINE V IntMod(V a, V b) {
2724
4422
  const DFromV<decltype(a)> d;
2725
- const Half<decltype(d)> dh;
2726
- const Twice<RepartitionToWide<decltype(dh)>> dw;
2727
- const VFromD<decltype(dw)> a0 = PromoteTo(dw, LowerHalf(dh, a));
2728
- const VFromD<decltype(dw)> a1 = PromoteTo(dw, UpperHalf(dh, a));
2729
- const VFromD<decltype(dw)> b0 = PromoteTo(dw, LowerHalf(dh, b));
2730
- const VFromD<decltype(dw)> b1 = PromoteTo(dw, UpperHalf(dh, b));
2731
- const VFromD<decltype(dw)> m0 = a0 * b0;
2732
- const VFromD<decltype(dw)> m1 = a1 * b1;
2733
- return ConcatEven(d, BitCast(d, m1), BitCast(d, m0));
4423
+ const RepartitionToWide<decltype(d)> dw;
4424
+ return OrderedDemote2To(
4425
+ d, IntMod<kOrigLaneSize>(PromoteLowerTo(dw, a), PromoteLowerTo(dw, b)),
4426
+ IntMod<kOrigLaneSize>(PromoteUpperTo(dw, a), PromoteUpperTo(dw, b)));
2734
4427
  }
4428
+ #endif // HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_WASM || HWY_TARGET ==
4429
+ // HWY_WASM_EMU256
2735
4430
 
2736
- #endif // HWY_NATIVE_MUL_8
2737
-
2738
- // ------------------------------ 64-bit multiplication
2739
-
2740
- // "Include guard": skip if native 64-bit mul instructions are available.
2741
- #if (defined(HWY_NATIVE_MUL_64) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
2742
- #ifdef HWY_NATIVE_MUL_64
2743
- #undef HWY_NATIVE_MUL_64
2744
- #else
2745
- #define HWY_NATIVE_MUL_64
2746
- #endif
4431
+ } // namespace detail
2747
4432
 
2748
- // Single-lane i64 or u64
2749
- template <class V, HWY_IF_T_SIZE_V(V, 8), HWY_IF_V_SIZE_V(V, 8),
2750
- HWY_IF_NOT_FLOAT_V(V)>
2751
- HWY_API V operator*(V x, V y) {
2752
- const DFromV<V> d;
2753
- using T = TFromD<decltype(d)>;
2754
- using TU = MakeUnsigned<T>;
2755
- const TU xu = static_cast<TU>(GetLane(x));
2756
- const TU yu = static_cast<TU>(GetLane(y));
2757
- return Set(d, static_cast<T>(xu * yu));
2758
- }
4433
+ #if HWY_TARGET == HWY_SCALAR
2759
4434
 
2760
- template <class V, class D64 = DFromV<V>, HWY_IF_U64_D(D64),
2761
- HWY_IF_V_SIZE_GT_D(D64, 8)>
2762
- HWY_API V operator*(V x, V y) {
2763
- RepartitionToNarrow<D64> d32;
2764
- auto x32 = BitCast(d32, x);
2765
- auto y32 = BitCast(d32, y);
2766
- auto lolo = BitCast(d32, MulEven(x32, y32));
2767
- auto lohi = BitCast(d32, MulEven(x32, BitCast(d32, ShiftRight<32>(y))));
2768
- auto hilo = BitCast(d32, MulEven(BitCast(d32, ShiftRight<32>(x)), y32));
2769
- auto hi = BitCast(d32, ShiftLeft<32>(BitCast(D64{}, lohi + hilo)));
2770
- return BitCast(D64{}, lolo + hi);
4435
+ template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
4436
+ HWY_API Vec1<T> operator/(Vec1<T> a, Vec1<T> b) {
4437
+ return detail::IntDiv<sizeof(T)>(a, b);
2771
4438
  }
2772
- template <class V, class DI64 = DFromV<V>, HWY_IF_I64_D(DI64),
2773
- HWY_IF_V_SIZE_GT_D(DI64, 8)>
2774
- HWY_API V operator*(V x, V y) {
2775
- RebindToUnsigned<DI64> du64;
2776
- return BitCast(DI64{}, BitCast(du64, x) * BitCast(du64, y));
4439
+ template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
4440
+ HWY_API Vec1<T> operator%(Vec1<T> a, Vec1<T> b) {
4441
+ return detail::IntMod<sizeof(T)>(a, b);
2777
4442
  }
2778
4443
 
2779
- #endif // HWY_NATIVE_MUL_64
4444
+ #else // HWY_TARGET != HWY_SCALAR
2780
4445
 
2781
- // ------------------------------ MulAdd / NegMulAdd
4446
+ template <class T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
4447
+ HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
4448
+ return detail::IntDiv<sizeof(T)>(a, b);
4449
+ }
2782
4450
 
2783
- // "Include guard": skip if native int MulAdd instructions are available.
2784
- #if (defined(HWY_NATIVE_INT_FMA) == defined(HWY_TARGET_TOGGLE))
2785
- #ifdef HWY_NATIVE_INT_FMA
2786
- #undef HWY_NATIVE_INT_FMA
2787
- #else
2788
- #define HWY_NATIVE_INT_FMA
2789
- #endif
4451
+ template <class T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
4452
+ HWY_API Vec128<T, N> operator%(Vec128<T, N> a, Vec128<T, N> b) {
4453
+ return detail::IntMod<sizeof(T)>(a, b);
4454
+ }
2790
4455
 
2791
- template <class V, HWY_IF_NOT_FLOAT_V(V)>
2792
- HWY_API V MulAdd(V mul, V x, V add) {
2793
- return Add(Mul(mul, x), add);
4456
+ #if HWY_CAP_GE256
4457
+ template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
4458
+ HWY_API Vec256<T> operator/(Vec256<T> a, Vec256<T> b) {
4459
+ return detail::IntDiv<sizeof(T)>(a, b);
2794
4460
  }
4461
+ template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
4462
+ HWY_API Vec256<T> operator%(Vec256<T> a, Vec256<T> b) {
4463
+ return detail::IntMod<sizeof(T)>(a, b);
4464
+ }
4465
+ #endif
2795
4466
 
2796
- template <class V, HWY_IF_NOT_FLOAT_V(V)>
2797
- HWY_API V NegMulAdd(V mul, V x, V add) {
2798
- return Sub(add, Mul(mul, x));
4467
+ #if HWY_CAP_GE512
4468
+ template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
4469
+ HWY_API Vec512<T> operator/(Vec512<T> a, Vec512<T> b) {
4470
+ return detail::IntDiv<sizeof(T)>(a, b);
2799
4471
  }
4472
+ template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
4473
+ HWY_API Vec512<T> operator%(Vec512<T> a, Vec512<T> b) {
4474
+ return detail::IntMod<sizeof(T)>(a, b);
4475
+ }
4476
+ #endif
2800
4477
 
2801
- #endif // HWY_NATIVE_INT_FMA
4478
+ #endif // HWY_TARGET == HWY_SCALAR
4479
+
4480
+ #endif // HWY_NATIVE_INT_DIV
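With these definitions, integer vectors gain element-wise division and modulo on every target (through the Div/Mod wrappers where operators cannot be defined). Below is a hypothetical usage sketch under static dispatch; the kernel name, the assumption that n is a multiple of the vector length, and the requirement that all b[i] are nonzero are mine, and it presumes the Div/Mod wrappers shown in this diff are available on the chosen target.

```cpp
#include <cstddef>
#include <cstdint>

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// q[i] = a[i] / b[i] (truncated), r[i] = a[i] % b[i] (sign of the dividend).
void VectorDivMod(const int32_t* HWY_RESTRICT a, const int32_t* HWY_RESTRICT b,
                  int32_t* HWY_RESTRICT q, int32_t* HWY_RESTRICT r, size_t n) {
  const hn::ScalableTag<int32_t> d;
  // n is assumed to be a multiple of Lanes(d) for brevity.
  for (size_t i = 0; i < n; i += hn::Lanes(d)) {
    const auto va = hn::Load(d, a + i);
    const auto vb = hn::Load(d, b + i);
    hn::Store(hn::Div(va, vb), d, q + i);  // same as va / vb where operators exist
    hn::Store(hn::Mod(va, vb), d, r + i);
  }
}
```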
2802
4481
 
2803
4482
  // ------------------------------ SatWidenMulPairwiseAdd
2804
4483
 
@@ -2819,11 +4498,11 @@ template <class DI16, class VU8, class VI8,
2819
4498
  HWY_API Vec<DI16> SatWidenMulPairwiseAdd(DI16 di16, VU8 a, VI8 b) {
2820
4499
  const RebindToUnsigned<decltype(di16)> du16;
2821
4500
 
2822
- const auto a0 = And(BitCast(di16, a), Set(di16, int16_t{0x00FF}));
2823
- const auto b0 = ShiftRight<8>(ShiftLeft<8>(BitCast(di16, b)));
4501
+ const auto a0 = BitCast(di16, PromoteEvenTo(du16, a));
4502
+ const auto b0 = PromoteEvenTo(di16, b);
2824
4503
 
2825
- const auto a1 = BitCast(di16, ShiftRight<8>(BitCast(du16, a)));
2826
- const auto b1 = ShiftRight<8>(BitCast(di16, b));
4504
+ const auto a1 = BitCast(di16, PromoteOddTo(du16, a));
4505
+ const auto b1 = PromoteOddTo(di16, b);
2827
4506
 
2828
4507
  return SaturatedAdd(Mul(a0, b0), Mul(a1, b1));
2829
4508
  }
@@ -2848,11 +4527,11 @@ HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 di32,
2848
4527
  VFromD<DI32> sum) {
2849
4528
  const Repartition<int16_t, decltype(di32)> di16;
2850
4529
 
2851
- const auto a0 = ShiftRight<8>(ShiftLeft<8>(BitCast(di16, a)));
2852
- const auto b0 = ShiftRight<8>(ShiftLeft<8>(BitCast(di16, b)));
4530
+ const auto a0 = PromoteEvenTo(di16, a);
4531
+ const auto b0 = PromoteEvenTo(di16, b);
2853
4532
 
2854
- const auto a1 = ShiftRight<8>(BitCast(di16, a));
2855
- const auto b1 = ShiftRight<8>(BitCast(di16, b));
4533
+ const auto a1 = PromoteOddTo(di16, a);
4534
+ const auto b1 = PromoteOddTo(di16, b);
2856
4535
 
2857
4536
  return Add(sum, Add(WidenMulPairwiseAdd(di32, a0, b0),
2858
4537
  WidenMulPairwiseAdd(di32, a1, b1)));
@@ -2985,12 +4664,10 @@ HWY_API VFromD<DU64> SumOfMulQuadAccumulate(
2985
4664
  const auto u32_even_prod = MulEven(a, b);
2986
4665
  const auto u32_odd_prod = MulOdd(a, b);
2987
4666
 
2988
- const auto lo32_mask = Set(du64, uint64_t{0xFFFFFFFFu});
2989
-
2990
- const auto p0 = Add(And(BitCast(du64, u32_even_prod), lo32_mask),
2991
- And(BitCast(du64, u32_odd_prod), lo32_mask));
2992
- const auto p1 = Add(ShiftRight<32>(BitCast(du64, u32_even_prod)),
2993
- ShiftRight<32>(BitCast(du64, u32_odd_prod)));
4667
+ const auto p0 = Add(PromoteEvenTo(du64, u32_even_prod),
4668
+ PromoteEvenTo(du64, u32_odd_prod));
4669
+ const auto p1 =
4670
+ Add(PromoteOddTo(du64, u32_even_prod), PromoteOddTo(du64, u32_odd_prod));
2994
4671
 
2995
4672
  return Add(sum, Add(p0, p1));
2996
4673
  }
@@ -3043,7 +4720,6 @@ HWY_API V ApproximateReciprocalSqrt(V v) {
3043
4720
 
3044
4721
  // ------------------------------ Compress*
3045
4722
 
3046
- // "Include guard": skip if native 8-bit compress instructions are available.
3047
4723
  #if (defined(HWY_NATIVE_COMPRESS8) == defined(HWY_TARGET_TOGGLE))
3048
4724
  #ifdef HWY_NATIVE_COMPRESS8
3049
4725
  #undef HWY_NATIVE_COMPRESS8
@@ -3244,7 +4920,6 @@ HWY_API V CompressNot(V v, M mask) {
3244
4920
 
3245
4921
  // ------------------------------ Expand
3246
4922
 
3247
- // "Include guard": skip if native 8/16-bit Expand/LoadExpand are available.
3248
4923
  // Note that this generic implementation assumes <= 128 bit fixed vectors;
3249
4924
  // the SVE and RVV targets provide their own native implementations.
3250
4925
  #if (defined(HWY_NATIVE_EXPAND) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
@@ -3853,7 +5528,9 @@ HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) {
3853
5528
  BitCast(du, InterleaveLower(du8x2, indices8, indices8));
3854
5529
  // TableLookupBytesOr0 operates on bytes. To convert u16 lane indices to byte
3855
5530
  // indices, add 0 to even and 1 to odd byte lanes.
3856
- const Vec128<uint16_t, N> byte_indices = Add(indices16, Set(du, 0x0100));
5531
+ const Vec128<uint16_t, N> byte_indices = Add(
5532
+ indices16,
5533
+ Set(du, static_cast<uint16_t>(HWY_IS_LITTLE_ENDIAN ? 0x0100 : 0x0001)));
3857
5534
  return BitCast(d, TableLookupBytesOr0(v, byte_indices));
3858
5535
  }
3859
5536
 
@@ -3947,9 +5624,9 @@ HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
3947
5624
  const Repartition<uint16_t, decltype(d)> du16;
3948
5625
  return BitCast(d, RotateRight<8>(BitCast(du16, v)));
3949
5626
  #else
3950
- alignas(16) static constexpr TFromD<D> kShuffle[16] = {
3951
- 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
3952
- return TableLookupBytes(v, LoadDup128(d, kShuffle));
5627
+ const VFromD<D> shuffle = Dup128VecFromValues(d, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8,
5628
+ 11, 10, 13, 12, 15, 14);
5629
+ return TableLookupBytes(v, shuffle);
3953
5630
  #endif
3954
5631
  }
3955
5632
 
@@ -3959,10 +5636,10 @@ HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) {
3959
5636
  const Repartition<uint16_t, decltype(d)> du16;
3960
5637
  return BitCast(d, Reverse2(du16, BitCast(du16, Reverse2(d, v))));
3961
5638
  #else
3962
- alignas(16) static constexpr uint8_t kShuffle[16] = {
3963
- 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12};
3964
5639
  const Repartition<uint8_t, decltype(d)> du8;
3965
- return TableLookupBytes(v, BitCast(d, LoadDup128(du8, kShuffle)));
5640
+ const VFromD<decltype(du8)> shuffle = Dup128VecFromValues(
5641
+ du8, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
5642
+ return TableLookupBytes(v, BitCast(d, shuffle));
3966
5643
  #endif
3967
5644
  }
3968
5645
 
@@ -3972,10 +5649,10 @@ HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) {
3972
5649
  const Repartition<uint32_t, D> du32;
3973
5650
  return BitCast(d, Reverse2(du32, BitCast(du32, Reverse4(d, v))));
3974
5651
  #else
3975
- alignas(16) static constexpr uint8_t kShuffle[16] = {
3976
- 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8};
3977
5652
  const Repartition<uint8_t, decltype(d)> du8;
3978
- return TableLookupBytes(v, BitCast(d, LoadDup128(du8, kShuffle)));
5653
+ const VFromD<decltype(du8)> shuffle = Dup128VecFromValues(
5654
+ du8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
5655
+ return TableLookupBytes(v, BitCast(d, shuffle));
3979
5656
  #endif
3980
5657
  }
3981
5658
 
@@ -4111,8 +5788,6 @@ HWY_INLINE Vec<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3,
4111
5788
  const uint32_t x2,
4112
5789
  const uint32_t x1,
4113
5790
  const uint32_t x0) {
4114
- alignas(16) const uint32_t lanes[4] = {x0, x1, x2, x3};
4115
-
4116
5791
  #if HWY_TARGET == HWY_RVV
4117
5792
  constexpr int kPow2 = d.Pow2();
4118
5793
  constexpr int kLoadPow2 = HWY_MAX(kPow2, -1);
@@ -4128,8 +5803,7 @@ HWY_INLINE Vec<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3,
4128
5803
  HWY_MAX(kMaxBytes / sizeof(uint32_t), kMinLanesToLoad);
4129
5804
  const CappedTag<uint32_t, kNumToLoad> d_load;
4130
5805
  #endif
4131
-
4132
- return ResizeBitCast(d, LoadDup128(d_load, lanes));
5806
+ return ResizeBitCast(d, Dup128VecFromValues(d_load, x0, x1, x2, x3));
4133
5807
  }
4134
5808
 
4135
5809
  } // namespace detail
@@ -4291,10 +5965,6 @@ HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3,
4291
5965
  const uint16_t u16_idx1 = static_cast<uint16_t>(idx1);
4292
5966
  const uint16_t u16_idx2 = static_cast<uint16_t>(idx2);
4293
5967
  const uint16_t u16_idx3 = static_cast<uint16_t>(idx3);
4294
- alignas(16)
4295
- const uint16_t indices[8] = {u16_idx0, u16_idx1, u16_idx2, u16_idx3,
4296
- u16_idx0, u16_idx1, u16_idx2, u16_idx3};
4297
-
4298
5968
  #if HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES
4299
5969
  constexpr size_t kMinLanesToLoad = 4;
4300
5970
  #else
@@ -4302,8 +5972,9 @@ HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3,
4302
5972
  #endif
4303
5973
  constexpr size_t kNumToLoad = HWY_MAX(HWY_MAX_LANES_D(D), kMinLanesToLoad);
4304
5974
  const CappedTag<uint16_t, kNumToLoad> d_load;
4305
-
4306
- return ResizeBitCast(d, LoadDup128(d_load, indices));
5975
+ return ResizeBitCast(
5976
+ d, Dup128VecFromValues(d_load, u16_idx0, u16_idx1, u16_idx2, u16_idx3,
5977
+ u16_idx0, u16_idx1, u16_idx2, u16_idx3));
4307
5978
  }
4308
5979
 
4309
5980
  template <class D, HWY_IF_T_SIZE_D(D, 4)>
@@ -4672,6 +6343,202 @@ HWY_API VFromD<D> SlideDownBlocks(D d, VFromD<D> v) {
4672
6343
  }
4673
6344
  #endif
4674
6345
 
6346
+ // ------------------------------ SumsOfAdjQuadAbsDiff
6347
+
6348
+ #if (defined(HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF) == \
6349
+ defined(HWY_TARGET_TOGGLE))
6350
+ #ifdef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
6351
+ #undef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
6352
+ #else
6353
+ #define HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
6354
+ #endif
6355
+
6356
+ #if HWY_TARGET != HWY_SCALAR
6357
+ template <int kAOffset, int kBOffset, class V8, HWY_IF_UI8_D(DFromV<V8>)>
6358
+ HWY_API Vec<RepartitionToWide<DFromV<V8>>> SumsOfAdjQuadAbsDiff(V8 a, V8 b) {
6359
+ static_assert(0 <= kAOffset && kAOffset <= 1,
6360
+ "kAOffset must be between 0 and 1");
6361
+ static_assert(0 <= kBOffset && kBOffset <= 3,
6362
+ "kBOffset must be between 0 and 3");
6363
+ using D8 = DFromV<V8>;
6364
+ const D8 d8;
6365
+ const RebindToUnsigned<decltype(d8)> du8;
6366
+ const RepartitionToWide<decltype(d8)> d16;
6367
+ const RepartitionToWide<decltype(du8)> du16;
6368
+
6369
+ // Ensure that a is resized to a vector that has at least
6370
+ // HWY_MAX(Lanes(d8), size_t{8} << kAOffset) lanes for the interleave and
6371
+ // CombineShiftRightBytes operations below.
6372
+ #if HWY_TARGET == HWY_RVV
6373
+ // On RVV targets, need to ensure that d8_interleave.Pow2() >= 0 is true
6374
+ // to ensure that Lanes(d8_interleave) >= 16 is true.
6375
+
6376
+ // Lanes(d8_interleave) >= Lanes(d8) is guaranteed to be true on RVV
6377
+ // targets as d8_interleave.Pow2() >= d8.Pow2() is true.
6378
+ constexpr int kInterleavePow2 = HWY_MAX(d8.Pow2(), 0);
6379
+ const ScalableTag<TFromD<D8>, kInterleavePow2> d8_interleave;
6380
+ #elif HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || \
6381
+ HWY_TARGET == HWY_SVE2_128
6382
+ // On SVE targets, Lanes(d8_interleave) >= 16 and
6383
+ // Lanes(d8_interleave) >= Lanes(d8) are both already true as d8 is a SIMD
6384
+ // tag for a full u8/i8 vector on SVE.
6385
+ const D8 d8_interleave;
6386
+ #else
6387
+ // On targets that use non-scalable vector types, Lanes(d8_interleave) is
6388
+ // equal to HWY_MAX(Lanes(d8), size_t{8} << kAOffset).
6389
+ constexpr size_t kInterleaveLanes =
6390
+ HWY_MAX(HWY_MAX_LANES_D(D8), size_t{8} << kAOffset);
6391
+ const FixedTag<TFromD<D8>, kInterleaveLanes> d8_interleave;
6392
+ #endif
6393
+
6394
+ // The ResizeBitCast operation below will resize a to a vector that has
6395
+ // at least HWY_MAX(Lanes(d8), size_t{8} << kAOffset) lanes for the
6396
+ // InterleaveLower, InterleaveUpper, and CombineShiftRightBytes operations
6397
+ // below.
6398
+ const auto a_to_interleave = ResizeBitCast(d8_interleave, a);
6399
+
6400
+ const auto a_interleaved_lo =
6401
+ InterleaveLower(d8_interleave, a_to_interleave, a_to_interleave);
6402
+ const auto a_interleaved_hi =
6403
+ InterleaveUpper(d8_interleave, a_to_interleave, a_to_interleave);
6404
+
6405
+ /* a01: { a[kAOffset*4+0], a[kAOffset*4+1], a[kAOffset*4+1], a[kAOffset*4+2],
6406
+ a[kAOffset*4+2], a[kAOffset*4+3], a[kAOffset*4+3], a[kAOffset*4+4],
6407
+ a[kAOffset*4+4], a[kAOffset*4+5], a[kAOffset*4+5], a[kAOffset*4+6],
6408
+ a[kAOffset*4+6], a[kAOffset*4+7], a[kAOffset*4+7], a[kAOffset*4+8] }
6409
+ */
6410
+ /* a23: { a[kAOffset*4+2], a[kAOffset*4+3], a[kAOffset*4+3], a[kAOffset*4+4],
6411
+ a[kAOffset*4+4], a[kAOffset*4+5], a[kAOffset*4+5], a[kAOffset*4+6],
6412
+ a[kAOffset*4+6], a[kAOffset*4+7], a[kAOffset*4+7], a[kAOffset*4+8],
6413
+ a[kAOffset*4+8], a[kAOffset*4+9], a[kAOffset*4+9], a[kAOffset*4+10]
6414
+ } */
6415
+
6416
+ // a01 and a23 are resized back to V8 as only the first Lanes(d8) lanes of
6417
+ // the CombineShiftRightBytes are needed for the subsequent AbsDiff operations
6418
+ // and as a01 and a23 need to be the same vector type as b01 and b23 for the
6419
+ // AbsDiff operations below.
6420
+ const V8 a01 =
6421
+ ResizeBitCast(d8, CombineShiftRightBytes<kAOffset * 8 + 1>(
6422
+ d8_interleave, a_interleaved_hi, a_interleaved_lo));
6423
+ const V8 a23 =
6424
+ ResizeBitCast(d8, CombineShiftRightBytes<kAOffset * 8 + 5>(
6425
+ d8_interleave, a_interleaved_hi, a_interleaved_lo));
6426
+
6427
+ /* b01: { b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1],
6428
+ b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1],
6429
+ b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1],
6430
+ b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1] }
6431
+ */
6432
+ /* b23: { b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3],
6433
+ b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3],
6434
+ b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3],
6435
+ b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3] }
6436
+ */
6437
+ const V8 b01 = BitCast(d8, Broadcast<kBOffset * 2>(BitCast(d16, b)));
6438
+ const V8 b23 = BitCast(d8, Broadcast<kBOffset * 2 + 1>(BitCast(d16, b)));
6439
+
6440
+ const VFromD<decltype(du16)> absdiff_sum_01 =
6441
+ SumsOf2(BitCast(du8, AbsDiff(a01, b01)));
6442
+ const VFromD<decltype(du16)> absdiff_sum_23 =
6443
+ SumsOf2(BitCast(du8, AbsDiff(a23, b23)));
6444
+ return BitCast(d16, Add(absdiff_sum_01, absdiff_sum_23));
6445
+ }
6446
+ #endif // HWY_TARGET != HWY_SCALAR
6447
+
6448
+ #endif // HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
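Read together, the a01/a23 and b01/b23 comments say that 16-bit result lane i of each block sums four absolute differences between a sliding quad of a (starting at kAOffset*4 + i) and a fixed quad of b (starting at kBOffset*4). The following scalar reference for one 16-byte block is my illustration of that reading, not library code:

```cpp
#include <cstdint>

// out[i] = sum over j = 0..3 of |a[kAOffset*4 + i + j] - b[kBOffset*4 + j]|
template <int kAOffset, int kBOffset>
void SumsOfAdjQuadAbsDiffBlock(const uint8_t a[16], const uint8_t b[16],
                               uint16_t out[8]) {
  static_assert(0 <= kAOffset && kAOffset <= 1, "kAOffset must be 0..1");
  static_assert(0 <= kBOffset && kBOffset <= 3, "kBOffset must be 0..3");
  for (int i = 0; i < 8; ++i) {
    uint16_t sum = 0;
    for (int j = 0; j < 4; ++j) {
      const int diff = static_cast<int>(a[kAOffset * 4 + i + j]) -
                       static_cast<int>(b[kBOffset * 4 + j]);
      sum = static_cast<uint16_t>(sum + (diff < 0 ? -diff : diff));
    }
    out[i] = sum;
  }
}
```

This is the per-block behavior of the x86 mpsadbw family, which the generic path here presumably emulates.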
6449
+
6450
+ // ------------------------------ SumsOfShuffledQuadAbsDiff
6451
+
6452
+ #if (defined(HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF) == \
6453
+ defined(HWY_TARGET_TOGGLE))
6454
+ #ifdef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
6455
+ #undef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
6456
+ #else
6457
+ #define HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
6458
+ #endif
6459
+
6460
+ #if HWY_TARGET != HWY_SCALAR
6461
+ template <int kIdx3, int kIdx2, int kIdx1, int kIdx0, class V8,
6462
+ HWY_IF_UI8_D(DFromV<V8>)>
6463
+ HWY_API Vec<RepartitionToWide<DFromV<V8>>> SumsOfShuffledQuadAbsDiff(V8 a,
6464
+ V8 b) {
6465
+ static_assert(0 <= kIdx0 && kIdx0 <= 3, "kIdx0 must be between 0 and 3");
6466
+ static_assert(0 <= kIdx1 && kIdx1 <= 3, "kIdx1 must be between 0 and 3");
6467
+ static_assert(0 <= kIdx2 && kIdx2 <= 3, "kIdx2 must be between 0 and 3");
6468
+ static_assert(0 <= kIdx3 && kIdx3 <= 3, "kIdx3 must be between 0 and 3");
6469
+
6470
+ #if HWY_TARGET == HWY_RVV
6471
+ // On RVV, ensure that both vA and vB have a LMUL of at least 1/2 so that
6472
+ // both vA and vB can be bitcasted to a u32 vector.
6473
+ const detail::AdjustSimdTagToMinVecPow2<
6474
+ RepartitionToWideX2<DFromV<decltype(a)>>>
6475
+ d32;
6476
+ const RepartitionToNarrow<decltype(d32)> d16;
6477
+ const RepartitionToNarrow<decltype(d16)> d8;
6478
+
6479
+ const auto vA = ResizeBitCast(d8, a);
6480
+ const auto vB = ResizeBitCast(d8, b);
6481
+ #else
6482
+ const DFromV<decltype(a)> d8;
6483
+ const RepartitionToWide<decltype(d8)> d16;
6484
+ const RepartitionToWide<decltype(d16)> d32;
6485
+
6486
+ const auto vA = a;
6487
+ const auto vB = b;
6488
+ #endif
6489
+
6490
+ const RebindToUnsigned<decltype(d8)> du8;
6491
+
6492
+ const auto a_shuf =
6493
+ Per4LaneBlockShuffle<kIdx3, kIdx2, kIdx1, kIdx0>(BitCast(d32, vA));
6494
+ /* a0123_2345: { a_shuf[0], a_shuf[1], a_shuf[2], a_shuf[3],
6495
+ a_shuf[2], a_shuf[3], a_shuf[4], a_shuf[5],
6496
+ a_shuf[8], a_shuf[9], a_shuf[10], a_shuf[11],
6497
+ a_shuf[10], a_shuf[11], a_shuf[12], a_shuf[13] } */
6498
+ /* a1234_3456: { a_shuf[1], a_shuf[2], a_shuf[3], a_shuf[4],
6499
+ a_shuf[3], a_shuf[4], a_shuf[5], a_shuf[6],
6500
+ a_shuf[9], a_shuf[10], a_shuf[11], a_shuf[12],
6501
+ a_shuf[11], a_shuf[12], a_shuf[13], a_shuf[14] } */
6502
+ #if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
6503
+ // On RVV/SVE targets, use Slide1Up/Slide1Down instead of
6504
+ // ShiftLeftBytes/ShiftRightBytes to avoid unnecessary zeroing out of any
6505
+ // lanes that are shifted into an adjacent 16-byte block as any lanes that are
6506
+ // shifted into an adjacent 16-byte block by Slide1Up/Slide1Down will be
6507
+ // replaced by the OddEven operation.
6508
+ const auto a_0123_2345 = BitCast(
6509
+ d8, OddEven(BitCast(d32, Slide1Up(d16, BitCast(d16, a_shuf))), a_shuf));
6510
+ const auto a_1234_3456 =
6511
+ BitCast(d8, OddEven(BitCast(d32, Slide1Up(d8, BitCast(d8, a_shuf))),
6512
+ BitCast(d32, Slide1Down(d8, BitCast(d8, a_shuf)))));
6513
+ #else
6514
+ const auto a_0123_2345 =
6515
+ BitCast(d8, OddEven(ShiftLeftBytes<2>(d32, a_shuf), a_shuf));
6516
+ const auto a_1234_3456 = BitCast(
6517
+ d8,
6518
+ OddEven(ShiftLeftBytes<1>(d32, a_shuf), ShiftRightBytes<1>(d32, a_shuf)));
6519
+ #endif
6520
+
6521
+ auto even_sums = SumsOf4(BitCast(du8, AbsDiff(a_0123_2345, vB)));
6522
+ auto odd_sums = SumsOf4(BitCast(du8, AbsDiff(a_1234_3456, vB)));
6523
+
6524
+ #if HWY_IS_LITTLE_ENDIAN
6525
+ odd_sums = ShiftLeft<16>(odd_sums);
6526
+ #else
6527
+ even_sums = ShiftLeft<16>(even_sums);
6528
+ #endif
6529
+
6530
+ const auto sums = OddEven(BitCast(d16, odd_sums), BitCast(d16, even_sums));
6531
+
6532
+ #if HWY_TARGET == HWY_RVV
6533
+ return ResizeBitCast(RepartitionToWide<DFromV<V8>>(), sums);
6534
+ #else
6535
+ return sums;
6536
+ #endif
6537
+ }
6538
+ #endif // HWY_TARGET != HWY_SCALAR
6539
+
6540
+ #endif // HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
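Likewise, the a_0123_2345/a_1234_3456 comments describe which quads of the shuffled a are compared against each quad of b: the even 16-bit result lanes use quads starting at bytes 0, 2, 8, 10 of the shuffled block and the odd lanes use 1, 3, 9, 11. A scalar reference for one 16-byte block, again derived from those comments as an illustration rather than taken from the library:

```cpp
#include <cstdint>

template <int kIdx3, int kIdx2, int kIdx1, int kIdx0>
void SumsOfShuffledQuadAbsDiffBlock(const uint8_t a[16], const uint8_t b[16],
                                    uint16_t out[8]) {
  // Per4LaneBlockShuffle of the four 32-bit lanes of a.
  const int idx[4] = {kIdx0, kIdx1, kIdx2, kIdx3};
  uint8_t s[16];
  for (int i = 0; i < 4; ++i) {
    for (int j = 0; j < 4; ++j) s[4 * i + j] = a[4 * idx[i] + j];
  }
  const int even_start[4] = {0, 2, 8, 10};  // quads feeding even result lanes
  const int odd_start[4] = {1, 3, 9, 11};   // quads feeding odd result lanes
  for (int m = 0; m < 4; ++m) {
    uint16_t even_sum = 0, odd_sum = 0;
    for (int j = 0; j < 4; ++j) {
      const int bj = static_cast<int>(b[4 * m + j]);
      const int e = static_cast<int>(s[even_start[m] + j]) - bj;
      const int o = static_cast<int>(s[odd_start[m] + j]) - bj;
      even_sum = static_cast<uint16_t>(even_sum + (e < 0 ? -e : e));
      odd_sum = static_cast<uint16_t>(odd_sum + (o < 0 ? -o : o));
    }
    out[2 * m] = even_sum;
    out[2 * m + 1] = odd_sum;
  }
}
```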
6541
+
4675
6542
  // ================================================== Operator wrapper
4676
6543
 
4677
6544
  // SVE* and RVV currently cannot define operators and have already defined
@@ -4700,6 +6567,10 @@ template <class V>
4700
6567
  HWY_API V Div(V a, V b) {
4701
6568
  return a / b;
4702
6569
  }
6570
+ template <class V>
6571
+ HWY_API V Mod(V a, V b) {
6572
+ return a % b;
6573
+ }
4703
6574
 
4704
6575
  template <class V>
4705
6576
  V Shl(V a, V b) {