npm - @img/sharp-libvips-dev - Versions diffs - 1.0.1 → 1.0.3 - Mend

@img/sharp-libvips-dev 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (169)

package/README.md +1 -2
package/include/aom/aom_decoder.h +1 -1
package/include/aom/aom_encoder.h +7 -1
package/include/aom/aom_image.h +24 -12
package/include/aom/aom_integer.h +3 -3
package/include/aom/aomcx.h +15 -0
package/include/aom/aomdx.h +5 -2
package/include/archive.h +7 -5
package/include/archive_entry.h +5 -3
package/include/cgif.h +3 -0
package/include/expat.h +21 -10
package/include/expat_config.h +11 -5
package/include/ffi.h +12 -25
package/include/freetype2/freetype/config/ftoption.h +2 -2
package/include/fribidi/fribidi-config.h +2 -2
package/include/fribidi/fribidi-unicode-version.h +3 -3
package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -2
package/include/glib-2.0/gio/gappinfo.h +40 -25
package/include/glib-2.0/gio/gapplication.h +6 -0
package/include/glib-2.0/gio/gasyncresult.h +1 -1
package/include/glib-2.0/gio/gconverter.h +5 -0
package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
package/include/glib-2.0/gio/gfile.h +16 -0
package/include/glib-2.0/gio/gio-visibility.h +34 -0
package/include/glib-2.0/gio/giotypes.h +0 -1
package/include/glib-2.0/gio/gsettings.h +8 -0
package/include/glib-2.0/gio/gvfs.h +2 -2
package/include/glib-2.0/girepository/gi-visibility.h +34 -0
package/include/glib-2.0/girepository/giarginfo.h +23 -6
package/include/glib-2.0/girepository/gibaseinfo.h +44 -18
package/include/glib-2.0/girepository/gicallableinfo.h +26 -16
package/include/glib-2.0/girepository/gicallbackinfo.h +17 -2
package/include/glib-2.0/girepository/giconstantinfo.h +19 -4
package/include/glib-2.0/girepository/gienuminfo.h +20 -21
package/include/glib-2.0/girepository/gifieldinfo.h +22 -7
package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
package/include/glib-2.0/girepository/gifunctioninfo.h +22 -7
package/include/glib-2.0/girepository/giinterfaceinfo.h +33 -18
package/include/glib-2.0/girepository/giobjectinfo.h +41 -26
package/include/glib-2.0/girepository/gipropertyinfo.h +18 -3
package/include/glib-2.0/girepository/giregisteredtypeinfo.h +22 -11
package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
package/include/glib-2.0/girepository/girepository.h +53 -62
package/include/glib-2.0/girepository/girffi.h +8 -7
package/include/glib-2.0/girepository/gisignalinfo.h +18 -3
package/include/glib-2.0/girepository/gistructinfo.h +26 -11
package/include/glib-2.0/girepository/gitypeinfo.h +29 -16
package/include/glib-2.0/girepository/gitypelib.h +9 -13
package/include/glib-2.0/girepository/gitypes.h +52 -104
package/include/glib-2.0/girepository/giunioninfo.h +28 -12
package/include/glib-2.0/girepository/giunresolvedinfo.h +17 -2
package/include/glib-2.0/girepository/givalueinfo.h +65 -0
package/include/glib-2.0/girepository/givfuncinfo.h +23 -8
package/include/glib-2.0/glib/deprecated/gthread.h +9 -5
package/include/glib-2.0/glib/gbitlock.h +31 -0
package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
package/include/glib-2.0/glib/giochannel.h +2 -2
package/include/glib-2.0/glib/glib-visibility.h +34 -0
package/include/glib-2.0/glib/gmacros.h +12 -5
package/include/glib-2.0/glib/gmain.h +93 -7
package/include/glib-2.0/glib/gmessages.h +8 -0
package/include/glib-2.0/glib/gqsort.h +8 -1
package/include/glib-2.0/glib/gslice.h +2 -0
package/include/glib-2.0/glib/gstrfuncs.h +24 -30
package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
package/include/glib-2.0/glib/gthread.h +191 -3
package/include/glib-2.0/glib/gunicode.h +1 -1
package/include/glib-2.0/glib/gversionmacros.h +9 -0
package/include/glib-2.0/glib-unix.h +7 -1
package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
package/include/glib-2.0/gobject/genums.h +6 -6
package/include/glib-2.0/gobject/glib-types.h +11 -0
package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
package/include/glib-2.0/gobject/gsignal.h +16 -6
package/include/glib-2.0/gobject/gtype.h +6 -6
package/include/harfbuzz/hb-buffer.h +6 -0
package/include/harfbuzz/hb-common.h +6 -9
package/include/harfbuzz/hb-cplusplus.hh +8 -11
package/include/harfbuzz/hb-subset.h +17 -4
package/include/harfbuzz/hb-version.h +3 -3
package/include/hwy/abort.h +28 -0
package/include/hwy/aligned_allocator.h +218 -6
package/include/hwy/base.h +1935 -512
package/include/hwy/cache_control.h +24 -6
package/include/hwy/detect_compiler_arch.h +105 -10
package/include/hwy/detect_targets.h +146 -37
package/include/hwy/foreach_target.h +36 -1
package/include/hwy/highway.h +222 -50
package/include/hwy/ops/arm_neon-inl.h +2055 -894
package/include/hwy/ops/arm_sve-inl.h +1476 -348
package/include/hwy/ops/emu128-inl.h +711 -623
package/include/hwy/ops/generic_ops-inl.h +4431 -2157
package/include/hwy/ops/inside-inl.h +691 -0
package/include/hwy/ops/ppc_vsx-inl.h +2186 -673
package/include/hwy/ops/rvv-inl.h +1556 -536
package/include/hwy/ops/scalar-inl.h +353 -233
package/include/hwy/ops/set_macros-inl.h +171 -23
package/include/hwy/ops/shared-inl.h +198 -56
package/include/hwy/ops/wasm_128-inl.h +283 -244
package/include/hwy/ops/x86_128-inl.h +3673 -1357
package/include/hwy/ops/x86_256-inl.h +1737 -663
package/include/hwy/ops/x86_512-inl.h +1697 -500
package/include/hwy/per_target.h +4 -0
package/include/hwy/profiler.h +648 -0
package/include/hwy/robust_statistics.h +2 -2
package/include/hwy/targets.h +40 -32
package/include/hwy/timer-inl.h +3 -3
package/include/hwy/timer.h +16 -1
package/include/libheif/heif.h +170 -15
package/include/libheif/heif_items.h +237 -0
package/include/libheif/heif_properties.h +38 -2
package/include/libheif/heif_regions.h +1 -1
package/include/libheif/heif_version.h +2 -2
package/include/libpng16/png.h +32 -29
package/include/libpng16/pngconf.h +2 -2
package/include/libpng16/pnglibconf.h +8 -3
package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
package/include/libxml2/libxml/HTMLparser.h +12 -19
package/include/libxml2/libxml/c14n.h +1 -12
package/include/libxml2/libxml/debugXML.h +1 -1
package/include/libxml2/libxml/encoding.h +9 -0
package/include/libxml2/libxml/entities.h +12 -1
package/include/libxml2/libxml/hash.h +19 -0
package/include/libxml2/libxml/list.h +2 -2
package/include/libxml2/libxml/nanohttp.h +17 -0
package/include/libxml2/libxml/parser.h +73 -58
package/include/libxml2/libxml/parserInternals.h +9 -1
package/include/libxml2/libxml/pattern.h +6 -0
package/include/libxml2/libxml/tree.h +32 -12
package/include/libxml2/libxml/uri.h +11 -0
package/include/libxml2/libxml/valid.h +29 -2
package/include/libxml2/libxml/xinclude.h +7 -0
package/include/libxml2/libxml/xmlIO.h +21 -5
package/include/libxml2/libxml/xmlerror.h +14 -0
package/include/libxml2/libxml/xmlexports.h +111 -15
package/include/libxml2/libxml/xmlmemory.h +8 -45
package/include/libxml2/libxml/xmlreader.h +2 -0
package/include/libxml2/libxml/xmlsave.h +5 -0
package/include/libxml2/libxml/xmlunicode.h +165 -1
package/include/libxml2/libxml/xmlversion.h +15 -179
package/include/libxml2/libxml/xmlwriter.h +1 -0
package/include/libxml2/libxml/xpath.h +4 -0
package/include/pango-1.0/pango/pango-features.h +2 -2
package/include/pango-1.0/pango/pango-fontmap.h +7 -0
package/include/pango-1.0/pango/pango-item.h +4 -2
package/include/pango-1.0/pango/pango-version-macros.h +25 -0
package/include/pango-1.0/pango/pangofc-font.h +2 -1
package/include/pixman-1/pixman-version.h +2 -2
package/include/png.h +32 -29
package/include/pngconf.h +2 -2
package/include/pnglibconf.h +8 -3
package/include/vips/connection.h +9 -3
package/include/vips/util.h +1 -11
package/include/vips/version.h +4 -4
package/include/webp/decode.h +58 -56
package/include/webp/demux.h +25 -21
package/include/webp/encode.h +44 -39
package/include/webp/mux.h +76 -15
package/include/webp/mux_types.h +2 -1
package/include/webp/sharpyuv/sharpyuv.h +77 -8
package/include/webp/types.h +29 -8
package/include/zconf.h +1 -1
package/include/zlib.h +12 -12
package/package.json +1 -1
package/versions.json +18 -19

package/include/hwy/ops/emu128-inl.h CHANGED

@@ -16,7 +16,11 @@
 // Single-element vectors and operations.
 // External include guard in highway.h - see comment there.
-#include <cmath>  // std::abs, std::isnan
+#include "hwy/base.h"
+#ifndef HWY_NO_LIBCXX
+#include <math.h>  // sqrtf
+#endif
 #include "hwy/ops/shared-inl.h"
@@ -49,6 +53,9 @@ struct Vec128 {
   HWY_INLINE Vec128& operator-=(const Vec128 other) {
     return *this = (*this - other);
   }
+  HWY_INLINE Vec128& operator%=(const Vec128 other) {
+    return *this = (*this % other);
+  }
   HWY_INLINE Vec128& operator&=(const Vec128 other) {
     return *this = (*this & other);
   }
@@ -97,15 +104,12 @@ HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
 template <class D>
 using VFromD = decltype(Zero(D()));
-// ------------------------------ Tuple (VFromD)
-#include "hwy/ops/tuple-inl.h"
 // ------------------------------ BitCast
 template <class D, class VFrom>
 HWY_API VFromD<D> BitCast(D /* tag */, VFrom v) {
   VFromD<D> to;
-  CopySameSize(&v, &to);
+  CopySameSize(&v.raw, &to.raw);
   return to;
 }
@@ -122,7 +126,7 @@ HWY_API VFromD<D> ResizeBitCast(D d, VFrom v) {
   constexpr size_t kCopyByteLen = HWY_MIN(kFromByteLen, kToByteLen);
   VFromD<D> to = Zero(d);
-  CopyBytes<kCopyByteLen>(&v, &to);
+  CopyBytes<kCopyByteLen>(&v.raw, &to.raw);
   return to;
 }
@@ -145,7 +149,7 @@ template <class D, typename T2>
 HWY_API VFromD<D> Set(D d, const T2 t) {
   VFromD<D> v;
   for (size_t i = 0; i < MaxLanes(d); ++i) {
-    v.raw[i] = static_cast<TFromD<D>>(t);
+    v.raw[i] = ConvertScalarTo<TFromD<D>>(t);
   }
   return v;
 }
@@ -156,14 +160,79 @@ HWY_API VFromD<D> Undefined(D d) {
   return Zero(d);
 }
+// ------------------------------ Dup128VecFromValues
+template <class D, HWY_IF_T_SIZE_D(D, 1)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
+                                      TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
+                                      TFromD<D> t11, TFromD<D> t12,
+                                      TFromD<D> t13, TFromD<D> t14,
+                                      TFromD<D> t15) {
+  VFromD<D> result;
+  result.raw[0] = t0;
+  result.raw[1] = t1;
+  result.raw[2] = t2;
+  result.raw[3] = t3;
+  result.raw[4] = t4;
+  result.raw[5] = t5;
+  result.raw[6] = t6;
+  result.raw[7] = t7;
+  result.raw[8] = t8;
+  result.raw[9] = t9;
+  result.raw[10] = t10;
+  result.raw[11] = t11;
+  result.raw[12] = t12;
+  result.raw[13] = t13;
+  result.raw[14] = t14;
+  result.raw[15] = t15;
+  return result;
+}
+template <class D, HWY_IF_T_SIZE_D(D, 2)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6,
+                                      TFromD<D> t7) {
+  VFromD<D> result;
+  result.raw[0] = t0;
+  result.raw[1] = t1;
+  result.raw[2] = t2;
+  result.raw[3] = t3;
+  result.raw[4] = t4;
+  result.raw[5] = t5;
+  result.raw[6] = t6;
+  result.raw[7] = t7;
+  return result;
+}
+template <class D, HWY_IF_T_SIZE_D(D, 4)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3) {
+  VFromD<D> result;
+  result.raw[0] = t0;
+  result.raw[1] = t1;
+  result.raw[2] = t2;
+  result.raw[3] = t3;
+  return result;
+}
+template <class D, HWY_IF_T_SIZE_D(D, 8)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
+  VFromD<D> result;
+  result.raw[0] = t0;
+  result.raw[1] = t1;
+  return result;
+}
 // ------------------------------ Iota
 template <class D, typename T = TFromD<D>, typename T2>
 HWY_API VFromD<D> Iota(D d, T2 first) {
   VFromD<D> v;
   for (size_t i = 0; i < MaxLanes(d); ++i) {
-    v.raw[i] =
-        AddWithWraparound(hwy::IsFloatTag<T>(), static_cast<T>(first), i);
+    v.raw[i] = AddWithWraparound(static_cast<T>(first), i);
   }
   return v;
 }
@@ -284,9 +353,8 @@ HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) {
 // ------------------------------ BroadcastSignBit
 template <typename T, size_t N>
 HWY_API Vec128<T, N> BroadcastSignBit(Vec128<T, N> v) {
-  // This is used inside ShiftRight, so we cannot implement in terms of it.
   for (size_t i = 0; i < N; ++i) {
-    v.raw[i] = v.raw[i] < 0 ? T(-1) : T(0);
+    v.raw[i] = ScalarShr(v.raw[i], sizeof(T) * 8 - 1);
   }
   return v;
 }
@@ -297,7 +365,7 @@ HWY_API Vec128<T, N> BroadcastSignBit(Vec128<T, N> v) {
 template <typename T, size_t N>
 HWY_API Mask128<T, N> MaskFromVec(Vec128<T, N> v) {
   Mask128<T, N> mask;
-  CopySameSize(&v, &mask);
+  CopySameSize(&v.raw, &mask.bits);
   return mask;
 }
@@ -307,20 +375,15 @@ using MFromD = decltype(MaskFromVec(VFromD<D>()));
 template <class DTo, class MFrom>
 HWY_API MFromD<DTo> RebindMask(DTo /* tag */, MFrom mask) {
   MFromD<DTo> to;
-  CopySameSize(&mask, &to);
+  CopySameSize(&mask.bits, &to.bits);
   return to;
 }
-template <typename T, size_t N>
-Vec128<T, N> VecFromMask(Mask128<T, N> mask) {
-  Vec128<T, N> v;
-  CopySameSize(&mask, &v);
-  return v;
-}
 template <class D>
 VFromD<D> VecFromMask(D /* tag */, MFromD<D> mask) {
-  return VecFromMask(mask);
+  VFromD<D> v;
+  CopySameSize(&mask.bits, &v.raw);
+  return v;
 }
 template <class D>
@@ -336,19 +399,20 @@ HWY_API MFromD<D> FirstN(D d, size_t n) {
 template <typename T, size_t N>
 HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
                                 Vec128<T, N> no) {
-  return IfVecThenElse(VecFromMask(mask), yes, no);
+  const DFromV<decltype(yes)> d;
+  return IfVecThenElse(VecFromMask(d, mask), yes, no);
 }
 template <typename T, size_t N>
 HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
   const DFromV<decltype(yes)> d;
-  return IfVecThenElse(VecFromMask(mask), yes, Zero(d));
+  return IfVecThenElse(VecFromMask(d, mask), yes, Zero(d));
 }
 template <typename T, size_t N>
 HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
   const DFromV<decltype(no)> d;
-  return IfVecThenElse(VecFromMask(mask), Zero(d), no);
+  return IfVecThenElse(VecFromMask(d, mask), Zero(d), no);
 }
 template <typename T, size_t N>
@@ -364,17 +428,12 @@ HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
   return v;
 }
-template <typename T, size_t N>
-HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
-  const DFromV<decltype(v)> d;
-  return IfNegativeThenElse(v, Zero(d), v);
-}
 // ------------------------------ Mask logical
 template <typename T, size_t N>
 HWY_API Mask128<T, N> Not(Mask128<T, N> m) {
-  return MaskFromVec(Not(VecFromMask(Simd<T, N, 0>(), m)));
+  const Simd<T, N, 0> d;
+  return MaskFromVec(Not(VecFromMask(d, m)));
 }
 template <typename T, size_t N>
@@ -426,41 +485,26 @@ HWY_API Vec128<T, N> ShiftLeft(Vec128<T, N> v) {
 template <int kBits, typename T, size_t N>
 HWY_API Vec128<T, N> ShiftRight(Vec128<T, N> v) {
   static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
-#if __cplusplus >= 202002L
   // Signed right shift is now guaranteed to be arithmetic (rounding toward
   // negative infinity, i.e. shifting in the sign bit).
   for (size_t i = 0; i < N; ++i) {
-    v.raw[i] = static_cast<T>(v.raw[i] >> kBits);
-  }
-#else
-  if (IsSigned<T>()) {
-    // Emulate arithmetic shift using only logical (unsigned) shifts, because
-    // signed shifts are still implementation-defined.
-    using TU = hwy::MakeUnsigned<T>;
-    for (size_t i = 0; i < N; ++i) {
-      const TU shifted = static_cast<TU>(static_cast<TU>(v.raw[i]) >> kBits);
-      const TU sign = v.raw[i] < 0 ? static_cast<TU>(~TU{0}) : 0;
-      const size_t sign_shift =
-          static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - kBits);
-      const TU upper = static_cast<TU>(sign << sign_shift);
-      v.raw[i] = static_cast<T>(shifted | upper);
-    }
-  } else {  // T is unsigned
-    for (size_t i = 0; i < N; ++i) {
-      v.raw[i] = static_cast<T>(v.raw[i] >> kBits);
-    }
+    v.raw[i] = ScalarShr(v.raw[i], kBits);
   }
-#endif
   return v;
 }
 // ------------------------------ RotateRight (ShiftRight)
-template <int kBits, typename T, size_t N>
+template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
 HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
   constexpr size_t kSizeInBits = sizeof(T) * 8;
   static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
   if (kBits == 0) return v;
-  return Or(ShiftRight<kBits>(v),
+  return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
             ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
 }
@@ -477,31 +521,10 @@ HWY_API Vec128<T, N> ShiftLeftSame(Vec128<T, N> v, int bits) {
 template <typename T, size_t N>
 HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, int bits) {
-#if __cplusplus >= 202002L
-  // Signed right shift is now guaranteed to be arithmetic (rounding toward
-  // negative infinity, i.e. shifting in the sign bit).
   for (size_t i = 0; i < N; ++i) {
-    v.raw[i] = static_cast<T>(v.raw[i] >> bits);
-  }
-#else
-  if (IsSigned<T>()) {
-    // Emulate arithmetic shift using only logical (unsigned) shifts, because
-    // signed shifts are still implementation-defined.
-    using TU = hwy::MakeUnsigned<T>;
-    for (size_t i = 0; i < N; ++i) {
-      const TU shifted = static_cast<TU>(static_cast<TU>(v.raw[i]) >> bits);
-      const TU sign = v.raw[i] < 0 ? static_cast<TU>(~TU{0}) : 0;
-      const size_t sign_shift =
-          static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - bits);
-      const TU upper = static_cast<TU>(sign << sign_shift);
-      v.raw[i] = static_cast<T>(shifted | upper);
-    }
-  } else {
-    for (size_t i = 0; i < N; ++i) {
-      v.raw[i] = static_cast<T>(v.raw[i] >> bits);  // unsigned, logical shift
-    }
+    v.raw[i] = ScalarShr(v.raw[i], bits);
   }
-#endif
   return v;
 }
@@ -519,32 +542,10 @@ HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, Vec128<T, N> bits) {
 template <typename T, size_t N>
 HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, Vec128<T, N> bits) {
-#if __cplusplus >= 202002L
-  // Signed right shift is now guaranteed to be arithmetic (rounding toward
-  // negative infinity, i.e. shifting in the sign bit).
   for (size_t i = 0; i < N; ++i) {
-    v.raw[i] = static_cast<T>(v.raw[i] >> bits.raw[i]);
-  }
-#else
-  if (IsSigned<T>()) {
-    // Emulate arithmetic shift using only logical (unsigned) shifts, because
-    // signed shifts are still implementation-defined.
-    using TU = hwy::MakeUnsigned<T>;
-    for (size_t i = 0; i < N; ++i) {
-      const TU shifted =
-          static_cast<TU>(static_cast<TU>(v.raw[i]) >> bits.raw[i]);
-      const TU sign = v.raw[i] < 0 ? static_cast<TU>(~TU{0}) : 0;
-      const size_t sign_shift = static_cast<size_t>(
-          static_cast<int>(sizeof(TU)) * 8 - 1 - bits.raw[i]);
-      const TU upper = static_cast<TU>(sign << sign_shift);
-      v.raw[i] = static_cast<T>(shifted | upper);
-    }
-  } else {  // T is unsigned
-    for (size_t i = 0; i < N; ++i) {
-      v.raw[i] = static_cast<T>(v.raw[i] >> bits.raw[i]);
-    }
+    v.raw[i] = ScalarShr(v.raw[i], static_cast<int>(bits.raw[i]));
   }
-#endif
   return v;
 }
@@ -614,6 +615,15 @@ HWY_API Vec128<uint64_t, (N + 7) / 8> SumsOf8(Vec128<uint8_t, N> v) {
   return sums;
 }
+template <size_t N>
+HWY_API Vec128<int64_t, (N + 7) / 8> SumsOf8(Vec128<int8_t, N> v) {
+  Vec128<int64_t, (N + 7) / 8> sums;
+  for (size_t i = 0; i < N; ++i) {
+    sums.raw[i / 8] += v.raw[i];
+  }
+  return sums;
+}
 // ------------------------------ SaturatedAdd
 template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
           HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
@@ -652,34 +662,14 @@ HWY_API Vec128<T, N> AverageRound(Vec128<T, N> a, Vec128<T, N> b) {
 // ------------------------------ Abs
-// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
-namespace detail {
 template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> Abs(SignedTag /*tag*/, Vec128<T, N> a) {
+HWY_API Vec128<T, N> Abs(Vec128<T, N> a) {
   for (size_t i = 0; i < N; ++i) {
-    const T s = a.raw[i];
-    const T min = hwy::LimitsMin<T>();
-    a.raw[i] = static_cast<T>((s >= 0 || s == min) ? a.raw[i] : -s);
+    a.raw[i] = ScalarAbs(a.raw[i]);
   }
   return a;
 }
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> Abs(hwy::FloatTag /*tag*/, Vec128<T, N> v) {
-  for (size_t i = 0; i < N; ++i) {
-    v.raw[i] = std::abs(v.raw[i]);
-  }
-  return v;
-}
-}  // namespace detail
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Abs(Vec128<T, N> a) {
-  return detail::Abs(hwy::TypeTag<T>(), a);
-}
 // ------------------------------ Min/Max
 // Tag dispatch instead of SFINAE for MSVC 2017 compatibility
@@ -706,9 +696,9 @@ template <typename T, size_t N>
 HWY_INLINE Vec128<T, N> Min(hwy::FloatTag /*tag*/, Vec128<T, N> a,
                             Vec128<T, N> b) {
   for (size_t i = 0; i < N; ++i) {
-    if (std::isnan(a.raw[i])) {
+    if (ScalarIsNaN(a.raw[i])) {
       a.raw[i] = b.raw[i];
-    } else if (std::isnan(b.raw[i])) {
+    } else if (ScalarIsNaN(b.raw[i])) {
       // no change
     } else {
       a.raw[i] = HWY_MIN(a.raw[i], b.raw[i]);
@@ -720,9 +710,9 @@ template <typename T, size_t N>
 HWY_INLINE Vec128<T, N> Max(hwy::FloatTag /*tag*/, Vec128<T, N> a,
                             Vec128<T, N> b) {
   for (size_t i = 0; i < N; ++i) {
-    if (std::isnan(a.raw[i])) {
+    if (ScalarIsNaN(a.raw[i])) {
       a.raw[i] = b.raw[i];
-    } else if (std::isnan(b.raw[i])) {
+    } else if (ScalarIsNaN(b.raw[i])) {
       // no change
     } else {
       a.raw[i] = HWY_MAX(a.raw[i], b.raw[i]);
@@ -825,7 +815,7 @@ HWY_API Vec128<T, N> operator*(Vec128<T, N> a, Vec128<T, N> b) {
   return detail::Mul(hwy::TypeTag<T>(), a, b);
 }
-template <typename T, size_t N>
+template <typename T, size_t N, HWY_IF_FLOAT(T)>
 HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
   for (size_t i = 0; i < N; ++i) {
     a.raw[i] = (b.raw[i] == T{0}) ? 0 : a.raw[i] / b.raw[i];
@@ -833,26 +823,36 @@ HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
   return a;
 }
-// Returns the upper 16 bits of a * b in each lane.
-template <size_t N>
-HWY_API Vec128<int16_t, N> MulHigh(Vec128<int16_t, N> a, Vec128<int16_t, N> b) {
+// Returns the upper sizeof(T)*8 bits of a * b in each lane.
+template <class T, size_t N,
+          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
+          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
+HWY_API Vec128<T, N> MulHigh(Vec128<T, N> a, Vec128<T, N> b) {
+  using TW = MakeWide<T>;
   for (size_t i = 0; i < N; ++i) {
-    a.raw[i] = static_cast<int16_t>((int32_t{a.raw[i]} * b.raw[i]) >> 16);
+    a.raw[i] = static_cast<T>(
+        (static_cast<TW>(a.raw[i]) * static_cast<TW>(b.raw[i])) >>
+        (sizeof(T) * 8));
   }
   return a;
 }
-template <size_t N>
-HWY_API Vec128<uint16_t, N> MulHigh(Vec128<uint16_t, N> a,
-                                    Vec128<uint16_t, N> b) {
-  for (size_t i = 0; i < N; ++i) {
-    // Cast to uint32_t first to prevent overflow. Otherwise the result of
-    // uint16_t * uint16_t is in "int" which may overflow. In practice the
-    // result is the same but this way it is also defined.
-    a.raw[i] = static_cast<uint16_t>(
-        (static_cast<uint32_t>(a.raw[i]) * static_cast<uint32_t>(b.raw[i])) >>
-        16);
-  }
-  return a;
+template <class T, HWY_IF_UI64(T)>
+HWY_API Vec128<T, 1> MulHigh(Vec128<T, 1> a, Vec128<T, 1> b) {
+  T hi;
+  Mul128(GetLane(a), GetLane(b), &hi);
+  return Set(Full64<T>(), hi);
+}
+template <class T, HWY_IF_UI64(T)>
+HWY_API Vec128<T> MulHigh(Vec128<T> a, Vec128<T> b) {
+  T hi_0;
+  T hi_1;
+  Mul128(GetLane(a), GetLane(b), &hi_0);
+  Mul128(ExtractLane(a, 1), ExtractLane(b, 1), &hi_1);
+  return Dup128VecFromValues(Full128<T>(), hi_0, hi_1);
 }
 template <size_t N>
@@ -900,7 +900,7 @@ HWY_API Vec128<float, N> ApproximateReciprocal(Vec128<float, N> v) {
     // Zero inputs are allowed, but callers are responsible for replacing the
     // return value with something else (typically using IfThenElse). This check
     // avoids a ubsan error. The result is arbitrary.
-    v.raw[i] = (std::abs(v.raw[i]) == 0.0f) ? 0.0f : 1.0f / v.raw[i];
+    v.raw[i] = (ScalarAbs(v.raw[i]) == 0.0f) ? 0.0f : 1.0f / v.raw[i];
   }
   return v;
 }
@@ -913,25 +913,25 @@ HWY_API Vec128<T, N> AbsDiff(Vec128<T, N> a, Vec128<T, N> b) {
 // ------------------------------ Floating-point multiply-add variants
-template <typename T, size_t N>
+template <typename T, size_t N, HWY_IF_FLOAT(T)>
 HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x,
                             Vec128<T, N> add) {
   return mul * x + add;
 }
-template <typename T, size_t N>
+template <typename T, size_t N, HWY_IF_FLOAT(T)>
 HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x,
                                Vec128<T, N> add) {
   return add - mul * x;
 }
-template <typename T, size_t N>
+template <typename T, size_t N, HWY_IF_FLOAT(T)>
 HWY_API Vec128<T, N> MulSub(Vec128<T, N> mul, Vec128<T, N> x,
                             Vec128<T, N> sub) {
   return mul * x - sub;
 }
-template <typename T, size_t N>
+template <typename T, size_t N, HWY_IF_FLOAT(T)>
 HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x,
                                Vec128<T, N> sub) {
   return Neg(mul) * x - sub;
@@ -943,21 +943,52 @@ template <size_t N>
 HWY_API Vec128<float, N> ApproximateReciprocalSqrt(Vec128<float, N> v) {
   for (size_t i = 0; i < N; ++i) {
     const float half = v.raw[i] * 0.5f;
-    uint32_t bits;
-    CopySameSize(&v.raw[i], &bits);
     // Initial guess based on log2(f)
-    bits = 0x5F3759DF - (bits >> 1);
-    CopySameSize(&bits, &v.raw[i]);
+    v.raw[i] = BitCastScalar<float>(static_cast<uint32_t>(
+        0x5F3759DF - (BitCastScalar<uint32_t>(v.raw[i]) >> 1)));
     // One Newton-Raphson iteration
     v.raw[i] = v.raw[i] * (1.5f - (half * v.raw[i] * v.raw[i]));
   }
   return v;
 }
+namespace detail {
+static HWY_INLINE float ScalarSqrt(float v) {
+#if defined(HWY_NO_LIBCXX)
+#if HWY_COMPILER_GCC_ACTUAL
+  return __builtin_sqrt(v);
+#else
+  uint32_t bits = BitCastScalar<uint32_t>(v);
+  // Coarse approximation, letting the exponent LSB leak into the mantissa
+  bits = (1 << 29) + (bits >> 1) - (1 << 22);
+  return BitCastScalar<float>(bits);
+#endif  // !HWY_COMPILER_GCC_ACTUAL
+#else
+  return sqrtf(v);
+#endif  // !HWY_NO_LIBCXX
+}
+static HWY_INLINE double ScalarSqrt(double v) {
+#if defined(HWY_NO_LIBCXX)
+#if HWY_COMPILER_GCC_ACTUAL
+  return __builtin_sqrt(v);
+#else
+  uint64_t bits = BitCastScalar<uint64_t>(v);
+  // Coarse approximation, letting the exponent LSB leak into the mantissa
+  bits = (1ULL << 61) + (bits >> 1) - (1ULL << 51);
+  return BitCastScalar<double>(bits);
+#endif  // !HWY_COMPILER_GCC_ACTUAL
+#else
+  return sqrt(v);
+#endif  // HWY_NO_LIBCXX
+}
+}  // namespace detail
 template <typename T, size_t N>
 HWY_API Vec128<T, N> Sqrt(Vec128<T, N> v) {
   for (size_t i = 0; i < N; ++i) {
-    v.raw[i] = std::sqrt(v.raw[i]);
+    v.raw[i] = detail::ScalarSqrt(v.raw[i]);
   }
   return v;
 }
@@ -967,21 +998,23 @@ HWY_API Vec128<T, N> Sqrt(Vec128<T, N> v) {
 template <typename T, size_t N>
 HWY_API Vec128<T, N> Round(Vec128<T, N> v) {
   using TI = MakeSigned<T>;
+  const T k0 = ConvertScalarTo<T>(0);
   const Vec128<T, N> a = Abs(v);
   for (size_t i = 0; i < N; ++i) {
     if (!(a.raw[i] < MantissaEnd<T>())) {  // Huge or NaN
       continue;
     }
-    const T bias = v.raw[i] < T(0.0) ? T(-0.5) : T(0.5);
-    const TI rounded = static_cast<TI>(v.raw[i] + bias);
+    const T bias = ConvertScalarTo<T>(v.raw[i] < k0 ? -0.5 : 0.5);
+    const TI rounded = ConvertScalarTo<TI>(v.raw[i] + bias);
     if (rounded == 0) {
-      v.raw[i] = v.raw[i] < 0 ? T{-0} : T{0};
+      v.raw[i] = v.raw[i] < 0 ? ConvertScalarTo<T>(-0) : k0;
       continue;
     }
-    const T rounded_f = static_cast<T>(rounded);
+    const T rounded_f = ConvertScalarTo<T>(rounded);
     // Round to even
-    if ((rounded & 1) && std::abs(rounded_f - v.raw[i]) == T(0.5)) {
-      v.raw[i] = static_cast<T>(rounded - (v.raw[i] < T(0) ? -1 : 1));
+    if ((rounded & 1) &&
+        ScalarAbs(rounded_f - v.raw[i]) == ConvertScalarTo<T>(0.5)) {
+      v.raw[i] = ConvertScalarTo<T>(rounded - (v.raw[i] < k0 ? -1 : 1));
       continue;
     }
     v.raw[i] = rounded_f;
@@ -994,30 +1027,32 @@ template <size_t N>
 HWY_API Vec128<int32_t, N> NearestInt(Vec128<float, N> v) {
   using T = float;
   using TI = int32_t;
+  const T k0 = ConvertScalarTo<T>(0);
   const Vec128<float, N> abs = Abs(v);
   Vec128<int32_t, N> ret;
   for (size_t i = 0; i < N; ++i) {
-    const bool signbit = std::signbit(v.raw[i]);
+    const bool signbit = ScalarSignBit(v.raw[i]);
     if (!(abs.raw[i] < MantissaEnd<T>())) {  // Huge or NaN
       // Check if too large to cast or NaN
-      if (!(abs.raw[i] <= static_cast<T>(LimitsMax<TI>()))) {
+      if (!(abs.raw[i] <= ConvertScalarTo<T>(LimitsMax<TI>()))) {
         ret.raw[i] = signbit ? LimitsMin<TI>() : LimitsMax<TI>();
         continue;
       }
       ret.raw[i] = static_cast<TI>(v.raw[i]);
       continue;
     }
-    const T bias = v.raw[i] < T(0.0) ? T(-0.5) : T(0.5);
-    const TI rounded = static_cast<TI>(v.raw[i] + bias);
+    const T bias = ConvertScalarTo<T>(v.raw[i] < k0 ? -0.5 : 0.5);
+    const TI rounded = ConvertScalarTo<TI>(v.raw[i] + bias);
     if (rounded == 0) {
       ret.raw[i] = 0;
       continue;
     }
-    const T rounded_f = static_cast<T>(rounded);
+    const T rounded_f = ConvertScalarTo<T>(rounded);
     // Round to even
-    if ((rounded & 1) && std::abs(rounded_f - v.raw[i]) == T(0.5)) {
+    if ((rounded & 1) &&
+        ScalarAbs(rounded_f - v.raw[i]) == ConvertScalarTo<T>(0.5)) {
       ret.raw[i] = rounded - (signbit ? -1 : 1);
       continue;
     }
@@ -1056,8 +1091,7 @@ Vec128<Float, N> Ceil(Vec128<Float, N> v) {
   for (size_t i = 0; i < N; ++i) {
     const bool positive = v.raw[i] > Float(0.0);
-    Bits bits;
-    CopySameSize(&v.raw[i], &bits);
+    Bits bits = BitCastScalar<Bits>(v.raw[i]);
     const int exponent =
         static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
@@ -1077,7 +1111,7 @@ Vec128<Float, N> Ceil(Vec128<Float, N> v) {
     if (positive) bits += (kMantissaMask + 1) >> exponent;
     bits &= ~mantissa_mask;
-    CopySameSize(&bits, &v.raw[i]);
+    v.raw[i] = BitCastScalar<Float>(bits);
   }
   return v;
 }
@@ -1094,8 +1128,7 @@ Vec128<Float, N> Floor(Vec128<Float, N> v) {
   for (size_t i = 0; i < N; ++i) {
     const bool negative = v.raw[i] < Float(0.0);
-    Bits bits;
-    CopySameSize(&v.raw[i], &bits);
+    Bits bits = BitCastScalar<Bits>(v.raw[i]);
     const int exponent =
         static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
@@ -1115,7 +1148,7 @@ Vec128<Float, N> Floor(Vec128<Float, N> v) {
     if (negative) bits += (kMantissaMask + 1) >> exponent;
     bits &= ~mantissa_mask;
-    CopySameSize(&bits, &v.raw[i]);
+    v.raw[i] = BitCastScalar<Float>(bits);
   }
   return v;
 }
@@ -1127,44 +1160,11 @@ HWY_API Mask128<T, N> IsNaN(Vec128<T, N> v) {
   Mask128<T, N> ret;
   for (size_t i = 0; i < N; ++i) {
     // std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY.
-    MakeUnsigned<T> bits;
-    CopySameSize(&v.raw[i], &bits);
-    bits += bits;
-    bits >>= 1;  // clear sign bit
-    // NaN if all exponent bits are set and the mantissa is not zero.
-    ret.bits[i] = Mask128<T, N>::FromBool(bits > ExponentMask<T>());
+    ret.bits[i] = Mask128<T, N>::FromBool(ScalarIsNaN(v.raw[i]));
   }
   return ret;
 }
-template <typename T, size_t N>
-HWY_API Mask128<T, N> IsInf(Vec128<T, N> v) {
-  static_assert(IsFloat<T>(), "Only for float");
-  const DFromV<decltype(v)> d;
-  const RebindToSigned<decltype(d)> di;
-  const VFromD<decltype(di)> vi = BitCast(di, v);
-  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
-  return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
-}
-// Returns whether normal/subnormal/zero.
-template <typename T, size_t N>
-HWY_API Mask128<T, N> IsFinite(Vec128<T, N> v) {
-  static_assert(IsFloat<T>(), "Only for float");
-  const DFromV<decltype(v)> d;
-  const RebindToUnsigned<decltype(d)> du;
-  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
-  using VI = VFromD<decltype(di)>;
-  using VU = VFromD<decltype(du)>;
-  const VU vu = BitCast(du, v);
-  // 'Shift left' to clear the sign bit, then right so we can compare with the
-  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
-  // negative and non-negative floats would be greater).
-  const VI exp =
-      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
-  return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
-}
 // ================================================== COMPARE
 template <typename T, size_t N>
@@ -1400,177 +1400,387 @@ HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p,
   CopyBytes(v.raw, p, num_of_lanes_to_store * sizeof(TFromD<D>));
 }
-// ------------------------------ LoadInterleaved2/3/4
+// ================================================== COMBINE
-// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2.
-// We implement those here because scalar code is likely faster than emulation
-// via shuffles.
-#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
-#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
-#else
-#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
-#endif
+template <typename T, size_t N>
+HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
+  Vec128<T, N / 2> ret;
+  CopyBytes<N / 2 * sizeof(T)>(v.raw, ret.raw);
+  return ret;
+}
-template <class D, typename T = TFromD<D>>
-HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
-                              VFromD<D>& v0, VFromD<D>& v1) {
-  alignas(16) T buf0[MaxLanes(d)];
-  alignas(16) T buf1[MaxLanes(d)];
-  for (size_t i = 0; i < MaxLanes(d); ++i) {
-    buf0[i] = *unaligned++;
-    buf1[i] = *unaligned++;
-  }
-  v0 = Load(d, buf0);
-  v1 = Load(d, buf1);
+template <class D>
+HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) {
+  return LowerHalf(v);
 }
-template <class D, typename T = TFromD<D>>
-HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
-                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
-  alignas(16) T buf0[MaxLanes(d)];
-  alignas(16) T buf1[MaxLanes(d)];
-  alignas(16) T buf2[MaxLanes(d)];
-  for (size_t i = 0; i < MaxLanes(d); ++i) {
-    buf0[i] = *unaligned++;
-    buf1[i] = *unaligned++;
-    buf2[i] = *unaligned++;
-  }
-  v0 = Load(d, buf0);
-  v1 = Load(d, buf1);
-  v2 = Load(d, buf2);
+template <class D>
+HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
+  VFromD<D> ret;
+  CopyBytes<d.MaxBytes()>(&v.raw[MaxLanes(d)], ret.raw);
+  return ret;
 }
-template <class D, typename T = TFromD<D>>
-HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
-                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
-                              VFromD<D>& v3) {
-  alignas(16) T buf0[MaxLanes(d)];
-  alignas(16) T buf1[MaxLanes(d)];
-  alignas(16) T buf2[MaxLanes(d)];
-  alignas(16) T buf3[MaxLanes(d)];
-  for (size_t i = 0; i < MaxLanes(d); ++i) {
-    buf0[i] = *unaligned++;
-    buf1[i] = *unaligned++;
-    buf2[i] = *unaligned++;
-    buf3[i] = *unaligned++;
-  }
-  v0 = Load(d, buf0);
-  v1 = Load(d, buf1);
-  v2 = Load(d, buf2);
-  v3 = Load(d, buf3);
+template <class D>
+HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> v) {
+  const Half<decltype(d)> dh;
+  VFromD<D> ret;  // zero-initialized
+  CopyBytes<dh.MaxBytes()>(v.raw, ret.raw);
+  return ret;
 }
-// ------------------------------ StoreInterleaved2/3/4
+template <class D, class VH = VFromD<Half<D>>>
+HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) {
+  const Half<decltype(d)> dh;
+  VFromD<D> ret;
+  CopyBytes<dh.MaxBytes()>(lo_half.raw, &ret.raw[0]);
+  CopyBytes<dh.MaxBytes()>(hi_half.raw, &ret.raw[MaxLanes(dh)]);
+  return ret;
+}
 template <class D>
-HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
-                               TFromD<D>* HWY_RESTRICT unaligned) {
-  for (size_t i = 0; i < MaxLanes(d); ++i) {
-    *unaligned++ = v0.raw[i];
-    *unaligned++ = v1.raw[i];
-  }
+HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
+  const Half<decltype(d)> dh;
+  VFromD<D> ret;
+  CopyBytes<dh.MaxBytes()>(lo.raw, &ret.raw[0]);
+  CopyBytes<dh.MaxBytes()>(hi.raw, &ret.raw[MaxLanes(dh)]);
+  return ret;
 }
 template <class D>
-HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
-                               TFromD<D>* HWY_RESTRICT unaligned) {
-  for (size_t i = 0; i < MaxLanes(d); ++i) {
-    *unaligned++ = v0.raw[i];
-    *unaligned++ = v1.raw[i];
-    *unaligned++ = v2.raw[i];
-  }
+HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
+  const Half<decltype(d)> dh;
+  VFromD<D> ret;
+  CopyBytes<dh.MaxBytes()>(&lo.raw[MaxLanes(dh)], &ret.raw[0]);
+  CopyBytes<dh.MaxBytes()>(&hi.raw[MaxLanes(dh)], &ret.raw[MaxLanes(dh)]);
+  return ret;
 }
 template <class D>
-HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
-                               VFromD<D> v3, D d,
-                               TFromD<D>* HWY_RESTRICT unaligned) {
-  for (size_t i = 0; i < MaxLanes(d); ++i) {
-    *unaligned++ = v0.raw[i];
-    *unaligned++ = v1.raw[i];
-    *unaligned++ = v2.raw[i];
-    *unaligned++ = v3.raw[i];
-  }
+HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
+  const Half<decltype(d)> dh;
+  VFromD<D> ret;
+  CopyBytes<dh.MaxBytes()>(&lo.raw[MaxLanes(dh)], &ret.raw[0]);
+  CopyBytes<dh.MaxBytes()>(hi.raw, &ret.raw[MaxLanes(dh)]);
+  return ret;
 }
-// ------------------------------ Stream
 template <class D>
-HWY_API void Stream(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
-  Store(v, d, aligned);
+HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
+  const Half<decltype(d)> dh;
+  VFromD<D> ret;
+  CopyBytes<dh.MaxBytes()>(lo.raw, &ret.raw[0]);
+  CopyBytes<dh.MaxBytes()>(&hi.raw[MaxLanes(dh)], &ret.raw[MaxLanes(dh)]);
+  return ret;
 }
-// ------------------------------ Scatter in generic_ops-inl.h
-// ------------------------------ Gather in generic_ops-inl.h
+template <class D>
+HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
+  const Half<decltype(d)> dh;
+  VFromD<D> ret;
+  for (size_t i = 0; i < MaxLanes(dh); ++i) {
+    ret.raw[i] = lo.raw[2 * i];
+  }
+  for (size_t i = 0; i < MaxLanes(dh); ++i) {
+    ret.raw[MaxLanes(dh) + i] = hi.raw[2 * i];
+  }
+  return ret;
+}
-// ================================================== CONVERT
+// 2023-11-23: workaround for incorrect codegen (reduction_test fails for
+// SumsOf2 because PromoteOddTo, which uses ConcatOdd, returns zero).
+#if HWY_ARCH_RISCV && HWY_TARGET == HWY_EMU128 && HWY_COMPILER_CLANG
+#define HWY_EMU128_CONCAT_INLINE HWY_NOINLINE
+#else
+#define HWY_EMU128_CONCAT_INLINE HWY_API
+#endif
-// ConvertTo and DemoteTo with floating-point input and integer output truncate
-// (rounding toward zero).
+template <class D>
+HWY_EMU128_CONCAT_INLINE VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
+  const Half<decltype(d)> dh;
+  VFromD<D> ret;
+  for (size_t i = 0; i < MaxLanes(dh); ++i) {
+    ret.raw[i] = lo.raw[2 * i + 1];
+  }
+  for (size_t i = 0; i < MaxLanes(dh); ++i) {
+    ret.raw[MaxLanes(dh) + i] = hi.raw[2 * i + 1];
+  }
+  return ret;
+}
-namespace detail {
+// ------------------------------ CombineShiftRightBytes
+template <int kBytes, class D>
+HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
+  VFromD<D> ret;
+  const uint8_t* HWY_RESTRICT lo8 =
+      reinterpret_cast<const uint8_t * HWY_RESTRICT>(lo.raw);
+  uint8_t* HWY_RESTRICT ret8 =
+      reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
+  CopyBytes<d.MaxBytes() - kBytes>(lo8 + kBytes, ret8);
+  CopyBytes<kBytes>(hi.raw, ret8 + d.MaxBytes() - kBytes);
+  return ret;
+}
-template <class ToT, class FromT>
-HWY_INLINE ToT CastValueForF2IConv(hwy::UnsignedTag /* to_type_tag */,
-                                   FromT val) {
-  // Prevent ubsan errors when converting float to narrower integer
+// ------------------------------ ShiftLeftBytes
+template <int kBytes, class D>
+HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) {
+  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
+  VFromD<D> ret;
+  uint8_t* HWY_RESTRICT ret8 =
+      reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
+  ZeroBytes<kBytes>(ret8);
+  CopyBytes<d.MaxBytes() - kBytes>(v.raw, ret8 + kBytes);
+  return ret;
+}
+template <int kBytes, typename T, size_t N>
+HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
+  return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
+}
+// ------------------------------ ShiftLeftLanes
-  // If LimitsMax<ToT>() can be exactly represented in FromT,
-  // kSmallestOutOfToTRangePosVal is equal to LimitsMax<ToT>().
+template <int kLanes, class D, typename T = TFromD<D>>
+HWY_API VFromD<D> ShiftLeftLanes(D d, VFromD<D> v) {
+  const Repartition<uint8_t, decltype(d)> d8;
+  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
+}
+template <int kLanes, typename T, size_t N>
+HWY_API Vec128<T, N> ShiftLeftLanes(Vec128<T, N> v) {
+  return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
+}
+// ------------------------------ ShiftRightBytes
+template <int kBytes, class D>
+HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) {
+  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
+  VFromD<D> ret;
+  const uint8_t* HWY_RESTRICT v8 =
+      reinterpret_cast<const uint8_t * HWY_RESTRICT>(v.raw);
+  uint8_t* HWY_RESTRICT ret8 =
+      reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
+  CopyBytes<d.MaxBytes() - kBytes>(v8 + kBytes, ret8);
+  ZeroBytes<kBytes>(ret8 + d.MaxBytes() - kBytes);
+  return ret;
+}
+// ------------------------------ ShiftRightLanes
+template <int kLanes, class D>
+HWY_API VFromD<D> ShiftRightLanes(D d, VFromD<D> v) {
+  const Repartition<uint8_t, decltype(d)> d8;
+  constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
+  return BitCast(d, ShiftRightBytes<kBytes>(d8, BitCast(d8, v)));
+}
+// ------------------------------ Tuples, PromoteEvenTo/PromoteOddTo
+#include "hwy/ops/inside-inl.h"
+// ------------------------------ LoadInterleaved2/3/4
+// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2.
+// We implement those here because scalar code is likely faster than emulation
+// via shuffles.
+#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
+#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
+#else
+#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
+#endif
+template <class D, typename T = TFromD<D>>
+HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
+                              VFromD<D>& v0, VFromD<D>& v1) {
+  alignas(16) T buf0[MaxLanes(d)];
+  alignas(16) T buf1[MaxLanes(d)];
+  for (size_t i = 0; i < MaxLanes(d); ++i) {
+    buf0[i] = *unaligned++;
+    buf1[i] = *unaligned++;
+  }
+  v0 = Load(d, buf0);
+  v1 = Load(d, buf1);
+}
-  // Otherwise, if LimitsMax<ToT>() cannot be exactly represented in FromT,
-  // kSmallestOutOfToTRangePosVal is equal to LimitsMax<ToT>() + 1, which can
-  // be exactly represented in FromT.
-  constexpr FromT kSmallestOutOfToTRangePosVal =
-      (sizeof(ToT) * 8 <= static_cast<size_t>(MantissaBits<FromT>()) + 1)
-          ? static_cast<FromT>(LimitsMax<ToT>())
-          : static_cast<FromT>(
-                static_cast<FromT>(ToT{1} << (sizeof(ToT) * 8 - 1)) * FromT(2));
+template <class D, typename T = TFromD<D>>
+HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
+                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
+  alignas(16) T buf0[MaxLanes(d)];
+  alignas(16) T buf1[MaxLanes(d)];
+  alignas(16) T buf2[MaxLanes(d)];
+  for (size_t i = 0; i < MaxLanes(d); ++i) {
+    buf0[i] = *unaligned++;
+    buf1[i] = *unaligned++;
+    buf2[i] = *unaligned++;
+  }
+  v0 = Load(d, buf0);
+  v1 = Load(d, buf1);
+  v2 = Load(d, buf2);
+}
-  if (std::signbit(val)) {
-    return ToT{0};
-  } else if (std::isinf(val) || val >= kSmallestOutOfToTRangePosVal) {
-    return LimitsMax<ToT>();
-  } else {
-    return static_cast<ToT>(val);
+template <class D, typename T = TFromD<D>>
+HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
+                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
+                              VFromD<D>& v3) {
+  alignas(16) T buf0[MaxLanes(d)];
+  alignas(16) T buf1[MaxLanes(d)];
+  alignas(16) T buf2[MaxLanes(d)];
+  alignas(16) T buf3[MaxLanes(d)];
+  for (size_t i = 0; i < MaxLanes(d); ++i) {
+    buf0[i] = *unaligned++;
+    buf1[i] = *unaligned++;
+    buf2[i] = *unaligned++;
+    buf3[i] = *unaligned++;
   }
+  v0 = Load(d, buf0);
+  v1 = Load(d, buf1);
+  v2 = Load(d, buf2);
+  v3 = Load(d, buf3);
 }
-template <class ToT, class FromT>
-HWY_INLINE ToT CastValueForF2IConv(hwy::SignedTag /* to_type_tag */,
-                                   FromT val) {
-  // Prevent ubsan errors when converting float to narrower integer
+// ------------------------------ StoreInterleaved2/3/4
-  // If LimitsMax<ToT>() can be exactly represented in FromT,
-  // kSmallestOutOfToTRangePosVal is equal to LimitsMax<ToT>().
+template <class D>
+HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
+                               TFromD<D>* HWY_RESTRICT unaligned) {
+  for (size_t i = 0; i < MaxLanes(d); ++i) {
+    *unaligned++ = v0.raw[i];
+    *unaligned++ = v1.raw[i];
+  }
+}
-  // Otherwise, if LimitsMax<ToT>() cannot be exactly represented in FromT,
-  // kSmallestOutOfToTRangePosVal is equal to -LimitsMin<ToT>(), which can
-  // be exactly represented in FromT.
-  constexpr FromT kSmallestOutOfToTRangePosVal =
-      (sizeof(ToT) * 8 <= static_cast<size_t>(MantissaBits<FromT>()) + 2)
-          ? static_cast<FromT>(LimitsMax<ToT>())
-          : static_cast<FromT>(-static_cast<FromT>(LimitsMin<ToT>()));
+template <class D>
+HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
+                               TFromD<D>* HWY_RESTRICT unaligned) {
+  for (size_t i = 0; i < MaxLanes(d); ++i) {
+    *unaligned++ = v0.raw[i];
+    *unaligned++ = v1.raw[i];
+    *unaligned++ = v2.raw[i];
+  }
+}
-  if (std::isinf(val) || std::fabs(val) >= kSmallestOutOfToTRangePosVal) {
-    return std::signbit(val) ? LimitsMin<ToT>() : LimitsMax<ToT>();
-  } else {
-    return static_cast<ToT>(val);
+template <class D>
+HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
+                               VFromD<D> v3, D d,
+                               TFromD<D>* HWY_RESTRICT unaligned) {
+  for (size_t i = 0; i < MaxLanes(d); ++i) {
+    *unaligned++ = v0.raw[i];
+    *unaligned++ = v1.raw[i];
+    *unaligned++ = v2.raw[i];
+    *unaligned++ = v3.raw[i];
   }
 }
+// ------------------------------ Stream
+template <class D>
+HWY_API void Stream(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
+  Store(v, d, aligned);
+}
+// ------------------------------ Scatter in generic_ops-inl.h
+// ------------------------------ Gather in generic_ops-inl.h
+// ================================================== CONVERT
+// ConvertTo and DemoteTo with floating-point input and integer output truncate
+// (rounding toward zero).
+namespace detail {
+template <class ToT, class FromT>
+HWY_INLINE ToT CastValueForF2IConv(FromT val) {
+  // Prevent ubsan errors when converting float to narrower integer
+  using FromTU = MakeUnsigned<FromT>;
+  using ToTU = MakeUnsigned<ToT>;
+  constexpr unsigned kMaxExpField =
+      static_cast<unsigned>(MaxExponentField<FromT>());
+  constexpr unsigned kExpBias = kMaxExpField >> 1;
+  constexpr unsigned kMinOutOfRangeExpField = static_cast<unsigned>(HWY_MIN(
+      kExpBias + sizeof(ToT) * 8 - static_cast<unsigned>(IsSigned<ToT>()),
+      kMaxExpField));
+  // If ToT is signed, compare only the exponent bits of val against
+  // kMinOutOfRangeExpField.
+  //
+  // Otherwise, if ToT is unsigned, compare the sign bit plus exponent bits of
+  // val against kMinOutOfRangeExpField as a negative value is outside of the
+  // range of an unsigned integer type.
+  const FromT val_to_compare =
+      static_cast<FromT>(IsSigned<ToT>() ? ScalarAbs(val) : val);
+  // val is within the range of ToT if
+  // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is less
+  // than kMinOutOfRangeExpField
+  //
+  // Otherwise, val is either outside of the range of ToT or equal to
+  // LimitsMin<ToT>() if
+  // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is greater
+  // than or equal to kMinOutOfRangeExpField.
+  return (static_cast<unsigned>(BitCastScalar<FromTU>(val_to_compare) >>
+                                MantissaBits<FromT>()) < kMinOutOfRangeExpField)
+             ? static_cast<ToT>(val)
+             : static_cast<ToT>(static_cast<ToTU>(LimitsMax<ToT>()) +
+                                static_cast<ToTU>(ScalarSignBit(val)));
+}
 template <class ToT, class ToTypeTag, class FromT>
 HWY_INLINE ToT CastValueForPromoteTo(ToTypeTag /* to_type_tag */, FromT val) {
-  return static_cast<ToT>(val);
+  return ConvertScalarTo<ToT>(val);
 }
 template <class ToT>
-HWY_INLINE ToT CastValueForPromoteTo(hwy::SignedTag to_type_tag, float val) {
-  return CastValueForF2IConv<ToT>(to_type_tag, val);
+HWY_INLINE ToT CastValueForPromoteTo(hwy::SignedTag /*to_type_tag*/,
+                                     float val) {
+  return CastValueForF2IConv<ToT>(val);
 }
 template <class ToT>
-HWY_INLINE ToT CastValueForPromoteTo(hwy::UnsignedTag to_type_tag, float val) {
-  return CastValueForF2IConv<ToT>(to_type_tag, val);
+HWY_INLINE ToT CastValueForPromoteTo(hwy::UnsignedTag /*to_type_tag*/,
+                                     float val) {
+  return CastValueForF2IConv<ToT>(val);
+}
+// If val is within the range of ToT, CastValueForInRangeF2IConv<ToT>(val)
+// returns static_cast<ToT>(val).
+//
+// Otherwise (val is outside the range of ToT), the result is
+// implementation-defined; this implementation returns LimitsMin<ToT>().
+template <class ToT, class FromT>
+HWY_INLINE ToT CastValueForInRangeF2IConv(FromT val) {
+  // Prevent ubsan errors when converting float to narrower integer
+  using FromTU = MakeUnsigned<FromT>;
+  constexpr unsigned kMaxExpField =
+      static_cast<unsigned>(MaxExponentField<FromT>());
+  constexpr unsigned kExpBias = kMaxExpField >> 1;
+  constexpr unsigned kMinOutOfRangeExpField = static_cast<unsigned>(HWY_MIN(
+      kExpBias + sizeof(ToT) * 8 - static_cast<unsigned>(IsSigned<ToT>()),
+      kMaxExpField));
+  // If ToT is signed, compare only the exponent bits of val against
+  // kMinOutOfRangeExpField.
+  //
+  // Otherwise, if ToT is unsigned, compare the sign bit plus exponent bits of
+  // val against kMinOutOfRangeExpField as a negative value is outside of the
+  // range of an unsigned integer type.
+  const FromT val_to_compare =
+      static_cast<FromT>(IsSigned<ToT>() ? ScalarAbs(val) : val);
+  // val is within the range of ToT if
+  // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is less
+  // than kMinOutOfRangeExpField
+  //
+  // Otherwise, val is either outside of the range of ToT or equal to
+  // LimitsMin<ToT>() if
+  // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is greater
+  // than or equal to kMinOutOfRangeExpField.
+  return (static_cast<unsigned>(BitCastScalar<FromTU>(val_to_compare) >>
+                                MantissaBits<FromT>()) < kMinOutOfRangeExpField)
+             ? static_cast<ToT>(val)
+             : static_cast<ToT>(LimitsMin<ToT>());
 }
 }  // namespace detail
@@ -1587,6 +1797,21 @@ HWY_API VFromD<DTo> PromoteTo(DTo d, Vec128<TFrom, HWY_MAX_LANES_D(DTo)> from) {
   return ret;
 }
+#ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
+#undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
+#else
+#define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
+#endif
+template <class D64, HWY_IF_UI64_D(D64)>
+HWY_API VFromD<D64> PromoteInRangeTo(D64 d64, VFromD<Rebind<float, D64>> v) {
+  VFromD<D64> ret;
+  for (size_t i = 0; i < MaxLanes(d64); ++i) {
+    ret.raw[i] = detail::CastValueForInRangeF2IConv<TFromD<D64>>(v.raw[i]);
+  }
+  return ret;
+}
 // MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(TFrom) is here,
 // so we overload for TFrom=double and ToT={float,int32_t}.
 template <class D, HWY_IF_F32_D(D)>
@@ -1594,10 +1819,10 @@ HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<double, D>> from) {
   VFromD<D> ret;
   for (size_t i = 0; i < MaxLanes(d); ++i) {
     // Prevent ubsan errors when converting float to narrower integer/float
-    if (std::isinf(from.raw[i]) ||
-        std::fabs(from.raw[i]) > static_cast<double>(HighestValue<float>())) {
-      ret.raw[i] = std::signbit(from.raw[i]) ? LowestValue<float>()
-                                             : HighestValue<float>();
+    if (ScalarIsInf(from.raw[i]) ||
+        ScalarAbs(from.raw[i]) > static_cast<double>(HighestValue<float>())) {
+      ret.raw[i] = ScalarSignBit(from.raw[i]) ? LowestValue<float>()
+                                              : HighestValue<float>();
       continue;
     }
     ret.raw[i] = static_cast<float>(from.raw[i]);
@@ -1609,8 +1834,7 @@ HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<double, D>> from) {
   VFromD<D> ret;
   for (size_t i = 0; i < MaxLanes(d); ++i) {
     // Prevent ubsan errors when converting double to narrower integer/int32_t
-    ret.raw[i] = detail::CastValueForF2IConv<TFromD<D>>(
-        hwy::TypeTag<TFromD<D>>(), from.raw[i]);
+    ret.raw[i] = detail::CastValueForF2IConv<TFromD<D>>(from.raw[i]);
   }
   return ret;
 }
@@ -1631,17 +1855,32 @@ HWY_API VFromD<DTo> DemoteTo(DTo /* tag */, Vec128<TFrom, N> from) {
   return ret;
 }
+// Disable the default unsigned to signed DemoteTo/ReorderDemote2To
+// implementations in generic_ops-inl.h on EMU128, because the EMU128 target
+// has its own implementations of the unsigned to signed DemoteTo and
+// ReorderDemote2To ops.
+// NOTE: hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr is used instead of
+// hwy::EnableIf<false>* = nullptr. !hwy::IsSame<V, V>() is always false, but
+// because it depends on the template argument V, substitution fails via
+// SFINAE instead of producing a hard compile error.
+#undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
+#define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) \
+  hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr
 template <class DTo, typename TFrom, size_t N, HWY_IF_UNSIGNED(TFrom),
-          HWY_IF_UNSIGNED_D(DTo)>
+          HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DTo)>
 HWY_API VFromD<DTo> DemoteTo(DTo /* tag */, Vec128<TFrom, N> from) {
   using TTo = TFromD<DTo>;
   static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting");
+  const auto max = static_cast<MakeUnsigned<TTo>>(LimitsMax<TTo>());
   VFromD<DTo> ret;
   for (size_t i = 0; i < N; ++i) {
     // Int to int: choose closest value in ToT to `from` (avoids UB)
-    from.raw[i] = HWY_MIN(from.raw[i], LimitsMax<TTo>());
-    ret.raw[i] = static_cast<TTo>(from.raw[i]);
+    ret.raw[i] = static_cast<TTo>(HWY_MIN(from.raw[i], max));
   }
   return ret;
 }
@@ -1689,14 +1928,15 @@ HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
   return ret;
 }
-template <class DN, HWY_IF_UNSIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V),
-          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
+template <class DN, HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DN), class V,
+          HWY_IF_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
           HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
 HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
   const RepartitionToWide<decltype(dn)> dw;
   const size_t NW = Lanes(dw);
   using TN = TFromD<DN>;
-  const TN max = LimitsMax<TN>();
+  using TN_U = MakeUnsigned<TN>;
+  const TN_U max = static_cast<TN_U>(LimitsMax<TN>());
   VFromD<DN> ret;
   for (size_t i = 0; i < NW; ++i) {
     ret.raw[i] = static_cast<TN>(HWY_MIN(a.raw[i], max));
@@ -1715,23 +1955,20 @@ HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
   return ReorderDemote2To(dn, a, b);
 }
-template <class DN, HWY_IF_BF16_D(DN), class V, HWY_IF_F32_D(DFromV<V>),
+template <class DN, HWY_IF_SPECIAL_FLOAT_D(DN), class V,
+          HWY_IF_F32_D(DFromV<V>),
           HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
 HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
-  const RebindToUnsigned<DFromV<decltype(a)>> du32;
-  const size_t NW = Lanes(du32);
-  VFromD<Repartition<uint16_t, DN>> ret;
-  const auto a_bits = BitCast(du32, a);
-  const auto b_bits = BitCast(du32, b);
+  const size_t NW = Lanes(dn) / 2;
+  using TN = TFromD<DN>;
+  VFromD<DN> ret;
   for (size_t i = 0; i < NW; ++i) {
-    ret.raw[i] = static_cast<uint16_t>(a_bits.raw[i] >> 16);
+    ret.raw[i] = ConvertScalarTo<TN>(a.raw[i]);
   }
   for (size_t i = 0; i < NW; ++i) {
-    ret.raw[NW + i] = static_cast<uint16_t>(b_bits.raw[i] >> 16);
+    ret.raw[NW + i] = ConvertScalarTo<TN>(b.raw[i]);
   }
-  return BitCast(dn, ret);
+  return ret;
 }
 namespace detail {
@@ -1758,6 +1995,12 @@ HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<bfloat16_t, N> v) {
   return ret;
 }
+#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
+#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
+#else
+#define HWY_NATIVE_DEMOTE_F32_TO_BF16
+#endif
 template <class D, HWY_IF_BF16_D(D), size_t N>
 HWY_API VFromD<D> DemoteTo(D /* tag */, Vec128<float, N> v) {
   VFromD<D> ret;
@@ -1767,6 +2010,21 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec128<float, N> v) {
   return ret;
 }
+#ifdef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
+#undef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
+#else
+#define HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
+#endif
+template <class D32, HWY_IF_UI32_D(D32)>
+HWY_API VFromD<D32> DemoteInRangeTo(D32 d32, VFromD<Rebind<double, D32>> v) {
+  VFromD<D32> ret;
+  for (size_t i = 0; i < MaxLanes(d32); ++i) {
+    ret.raw[i] = detail::CastValueForInRangeF2IConv<TFromD<D32>>(v.raw[i]);
+  }
+  return ret;
+}
 // Tag dispatch instead of SFINAE for MSVC 2017 compatibility
 namespace detail {
@@ -1780,7 +2038,7 @@ HWY_API VFromD<DTo> ConvertTo(hwy::FloatTag /*tag*/, DTo /*tag*/,
   for (size_t i = 0; i < N; ++i) {
     // float## -> int##: return closest representable value
-    ret.raw[i] = CastValueForF2IConv<ToT>(hwy::TypeTag<ToT>(), from.raw[i]);
+    ret.raw[i] = CastValueForF2IConv<ToT>(from.raw[i]);
   }
   return ret;
 }
@@ -1806,6 +2064,22 @@ HWY_API VFromD<DTo> ConvertTo(DTo d, Vec128<TFrom, HWY_MAX_LANES_D(DTo)> from) {
   return detail::ConvertTo(hwy::IsFloatTag<TFrom>(), d, from);
 }
+#ifdef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
+#undef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
+#else
+#define HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
+#endif
+template <class DI, HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DI),
+          HWY_IF_T_SIZE_ONE_OF_D(DI, (1 << 4) | (1 << 8))>
+HWY_API VFromD<DI> ConvertInRangeTo(DI di, VFromD<RebindToFloat<DI>> v) {
+  VFromD<DI> ret;
+  for (size_t i = 0; i < MaxLanes(di); i++) {
+    ret.raw[i] = detail::CastValueForInRangeF2IConv<TFromD<DI>>(v.raw[i]);
+  }
+  return ret;
+}
 template <size_t N>
 HWY_API Vec128<uint8_t, N> U8FromU32(Vec128<uint32_t, N> v) {
   return DemoteTo(Simd<uint8_t, N, 0>(), v);
@@ -1893,172 +2167,6 @@ HWY_API VFromD<DN> OrderedTruncate2To(DN dn, V a, V b) {
   return ret;
 }
-// ================================================== COMBINE
-template <typename T, size_t N>
-HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
-  Vec128<T, N / 2> ret;
-  CopyBytes<N / 2 * sizeof(T)>(v.raw, ret.raw);
-  return ret;
-}
-template <class D>
-HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) {
-  return LowerHalf(v);
-}
-template <class D>
-HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
-  VFromD<D> ret;
-  CopyBytes<d.MaxBytes()>(&v.raw[MaxLanes(d)], ret.raw);
-  return ret;
-}
-template <class D>
-HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> v) {
-  const Half<decltype(d)> dh;
-  VFromD<D> ret;  // zero-initialized
-  CopyBytes<dh.MaxBytes()>(v.raw, ret.raw);
-  return ret;
-}
-template <class D, class VH = VFromD<Half<D>>>
-HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) {
-  const Half<decltype(d)> dh;
-  VFromD<D> ret;
-  CopyBytes<dh.MaxBytes()>(lo_half.raw, &ret.raw[0]);
-  CopyBytes<dh.MaxBytes()>(hi_half.raw, &ret.raw[MaxLanes(dh)]);
-  return ret;
-}
-template <class D>
-HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
-  const Half<decltype(d)> dh;
-  VFromD<D> ret;
-  CopyBytes<dh.MaxBytes()>(lo.raw, &ret.raw[0]);
-  CopyBytes<dh.MaxBytes()>(hi.raw, &ret.raw[MaxLanes(dh)]);
-  return ret;
-}
-template <class D>
-HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
-  const Half<decltype(d)> dh;
-  VFromD<D> ret;
-  CopyBytes<dh.MaxBytes()>(&lo.raw[MaxLanes(dh)], &ret.raw[0]);
-  CopyBytes<dh.MaxBytes()>(&hi.raw[MaxLanes(dh)], &ret.raw[MaxLanes(dh)]);
-  return ret;
-}
-template <class D>
-HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
-  const Half<decltype(d)> dh;
-  VFromD<D> ret;
-  CopyBytes<dh.MaxBytes()>(&lo.raw[MaxLanes(dh)], &ret.raw[0]);
-  CopyBytes<dh.MaxBytes()>(hi.raw, &ret.raw[MaxLanes(dh)]);
-  return ret;
-}
-template <class D>
-HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
-  const Half<decltype(d)> dh;
-  VFromD<D> ret;
-  CopyBytes<dh.MaxBytes()>(lo.raw, &ret.raw[0]);
-  CopyBytes<dh.MaxBytes()>(&hi.raw[MaxLanes(dh)], &ret.raw[MaxLanes(dh)]);
-  return ret;
-}
-template <class D>
-HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
-  const Half<decltype(d)> dh;
-  VFromD<D> ret;
-  for (size_t i = 0; i < MaxLanes(dh); ++i) {
-    ret.raw[i] = lo.raw[2 * i];
-  }
-  for (size_t i = 0; i < MaxLanes(dh); ++i) {
-    ret.raw[MaxLanes(dh) + i] = hi.raw[2 * i];
-  }
-  return ret;
-}
-template <class D>
-HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
-  const Half<decltype(d)> dh;
-  VFromD<D> ret;
-  for (size_t i = 0; i < MaxLanes(dh); ++i) {
-    ret.raw[i] = lo.raw[2 * i + 1];
-  }
-  for (size_t i = 0; i < MaxLanes(dh); ++i) {
-    ret.raw[MaxLanes(dh) + i] = hi.raw[2 * i + 1];
-  }
-  return ret;
-}
-// ------------------------------ CombineShiftRightBytes
-template <int kBytes, class D>
-HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
-  VFromD<D> ret;
-  const uint8_t* HWY_RESTRICT lo8 =
-      reinterpret_cast<const uint8_t * HWY_RESTRICT>(lo.raw);
-  uint8_t* HWY_RESTRICT ret8 =
-      reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
-  CopyBytes<d.MaxBytes() - kBytes>(lo8 + kBytes, ret8);
-  CopyBytes<kBytes>(hi.raw, ret8 + d.MaxBytes() - kBytes);
-  return ret;
-}
-// ------------------------------ ShiftLeftBytes
-template <int kBytes, class D>
-HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) {
-  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
-  VFromD<D> ret;
-  uint8_t* HWY_RESTRICT ret8 =
-      reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
-  ZeroBytes<kBytes>(ret8);
-  CopyBytes<d.MaxBytes() - kBytes>(v.raw, ret8 + kBytes);
-  return ret;
-}
-template <int kBytes, typename T, size_t N>
-HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
-  return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
-}
-// ------------------------------ ShiftLeftLanes
-template <int kLanes, class D, typename T = TFromD<D>>
-HWY_API VFromD<D> ShiftLeftLanes(D d, VFromD<D> v) {
-  const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
-}
-template <int kLanes, typename T, size_t N>
-HWY_API Vec128<T, N> ShiftLeftLanes(Vec128<T, N> v) {
-  return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
-}
-// ------------------------------ ShiftRightBytes
-template <int kBytes, class D>
-HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) {
-  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
-  VFromD<D> ret;
-  const uint8_t* HWY_RESTRICT v8 =
-      reinterpret_cast<const uint8_t * HWY_RESTRICT>(v.raw);
-  uint8_t* HWY_RESTRICT ret8 =
-      reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
-  CopyBytes<d.MaxBytes() - kBytes>(v8 + kBytes, ret8);
-  ZeroBytes<kBytes>(ret8 + d.MaxBytes() - kBytes);
-  return ret;
-}
-// ------------------------------ ShiftRightLanes
-template <int kLanes, class D>
-HWY_API VFromD<D> ShiftRightLanes(D d, VFromD<D> v) {
-  const Repartition<uint8_t, decltype(d)> d8;
-  constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
-  return BitCast(d, ShiftRightBytes<kBytes>(d8, BitCast(d8, v)));
-}
 // ================================================== SWIZZLE
 template <typename T, size_t N>
@@ -2101,6 +2209,24 @@ HWY_API Vec128<T, N> OddEven(Vec128<T, N> odd, Vec128<T, N> even) {
   return odd;
 }
+template <class D>
+HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  constexpr size_t N = HWY_MAX_LANES_D(D);
+  for (size_t i = 1; i < N; i += 2) {
+    a.raw[i] = b.raw[i - 1];
+  }
+  return a;
+}
+template <class D>
+HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  constexpr size_t N = HWY_MAX_LANES_D(D);
+  for (size_t i = 1; i < N; i += 2) {
+    b.raw[i - 1] = a.raw[i];
+  }
+  return b;
+}
 template <typename T, size_t N>
 HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
   return even;
@@ -2349,8 +2475,8 @@ HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
 }
 // Additional overload for the optional tag.
-template <class V>
-HWY_API V InterleaveLower(DFromV<V> /* tag */, V a, V b) {
+template <class D>
+HWY_API VFromD<D> InterleaveLower(D /* tag */, VFromD<D> a, VFromD<D> b) {
   return InterleaveLower(a, b);
 }
@@ -2416,6 +2542,15 @@ HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
   return m;
 }
+template <class D>
+HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
+  MFromD<D> m;
+  for (size_t i = 0; i < MaxLanes(d); ++i) {
+    m.bits[i] = MFromD<D>::FromBool(((mask_bits >> i) & 1u) != 0);
+  }
+  return m;
+}
 // `p` points to at least 8 writable bytes.
 template <class D>
 HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
@@ -2517,7 +2652,7 @@ HWY_API Vec128<T, N> Expand(Vec128<T, N> v, const Mask128<T, N> mask) {
     if (mask.bits[i]) {
       ret.raw[i] = v.raw[in_pos++];
     } else {
-      ret.raw[i] = T();  // zero, also works for float16_t
+      ret.raw[i] = ConvertScalarTo<T>(0);
     }
   }
   return ret;
@@ -2662,88 +2797,26 @@ HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) {
 // ------------------------------ WidenMulPairwiseAdd
-template <class D, HWY_IF_F32_D(D), class VBF16>
-HWY_API VFromD<D> WidenMulPairwiseAdd(D df32, VBF16 a, VBF16 b) {
-  const Rebind<uint32_t, decltype(df32)> du32;
-  using VU32 = VFromD<decltype(du32)>;
-  const VU32 odd = Set(du32, 0xFFFF0000u);  // bfloat16 is the upper half of f32
-  // Avoid ZipLower/Upper so this also works on big-endian systems.
-  const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
-  const VU32 ao = And(BitCast(du32, a), odd);
-  const VU32 be = ShiftLeft<16>(BitCast(du32, b));
-  const VU32 bo = And(BitCast(du32, b), odd);
-  return Mul(BitCast(df32, ae), BitCast(df32, be)) +
-         Mul(BitCast(df32, ao), BitCast(df32, bo));
+template <class DF, HWY_IF_F32_D(DF), class VBF>
+HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
+  return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b),
+                Mul(PromoteOddTo(df, a), PromoteOddTo(df, b)));
 }
-template <class D, HWY_IF_I32_D(D), class VI16>
-HWY_API VFromD<D> WidenMulPairwiseAdd(D d32, VI16 a, VI16 b) {
-  using VI32 = VFromD<decltype(d32)>;
-  // Manual sign extension requires two shifts for even lanes.
-  const VI32 ae = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, a)));
-  const VI32 be = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, b)));
-  const VI32 ao = ShiftRight<16>(BitCast(d32, a));
-  const VI32 bo = ShiftRight<16>(BitCast(d32, b));
-  return Add(Mul(ae, be), Mul(ao, bo));
-}
-template <class D, HWY_IF_U32_D(D), class VU16>
-HWY_API VFromD<D> WidenMulPairwiseAdd(D du32, VU16 a, VU16 b) {
-  const auto lo16_mask = Set(du32, 0x0000FFFFu);
-  const auto a0 = And(BitCast(du32, a), lo16_mask);
-  const auto b0 = And(BitCast(du32, b), lo16_mask);
-  const auto a1 = ShiftRight<16>(BitCast(du32, a));
-  const auto b1 = ShiftRight<16>(BitCast(du32, b));
-  return Add(Mul(a0, b0), Mul(a1, b1));
+template <class D, HWY_IF_UI32_D(D), class V16>
+HWY_API VFromD<D> WidenMulPairwiseAdd(D d32, V16 a, V16 b) {
+  return MulAdd(PromoteEvenTo(d32, a), PromoteEvenTo(d32, b),
+                Mul(PromoteOddTo(d32, a), PromoteOddTo(d32, b)));
 }
 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
-template <class D, HWY_IF_F32_D(D), size_t N, class VBF16>
-HWY_API VFromD<D> ReorderWidenMulAccumulate(D df32, VBF16 a, VBF16 b,
-                                            const Vec128<float, N> sum0,
-                                            Vec128<float, N>& sum1) {
-  const Rebind<uint32_t, decltype(df32)> du32;
-  using VU32 = VFromD<decltype(du32)>;
-  const VU32 odd = Set(du32, 0xFFFF0000u);  // bfloat16 is the upper half of f32
-  // Avoid ZipLower/Upper so this also works on big-endian systems.
-  const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
-  const VU32 ao = And(BitCast(du32, a), odd);
-  const VU32 be = ShiftLeft<16>(BitCast(du32, b));
-  const VU32 bo = And(BitCast(du32, b), odd);
-  sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
-  return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
-}
-template <class D, HWY_IF_I32_D(D), size_t N, class VI16>
-HWY_API VFromD<D> ReorderWidenMulAccumulate(D d32, VI16 a, VI16 b,
-                                            const Vec128<int32_t, N> sum0,
-                                            Vec128<int32_t, N>& sum1) {
-  using VI32 = VFromD<decltype(d32)>;
-  // Manual sign extension requires two shifts for even lanes.
-  const VI32 ae = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, a)));
-  const VI32 be = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, b)));
-  const VI32 ao = ShiftRight<16>(BitCast(d32, a));
-  const VI32 bo = ShiftRight<16>(BitCast(d32, b));
-  sum1 = Add(Mul(ao, bo), sum1);
-  return Add(Mul(ae, be), sum0);
-}
-template <class D, HWY_IF_U32_D(D), size_t N, class VU16>
-HWY_API VFromD<D> ReorderWidenMulAccumulate(D du32, VU16 a, VU16 b,
-                                            const Vec128<uint32_t, N> sum0,
-                                            Vec128<uint32_t, N>& sum1) {
-  using VU32 = VFromD<decltype(du32)>;
-  const VU32 lo16_mask = Set(du32, uint32_t{0x0000FFFFu});
-  const VU32 ae = And(BitCast(du32, a), lo16_mask);
-  const VU32 be = And(BitCast(du32, b), lo16_mask);
-  const VU32 ao = ShiftRight<16>(BitCast(du32, a));
-  const VU32 bo = ShiftRight<16>(BitCast(du32, b));
-  sum1 = Add(Mul(ao, bo), sum1);
-  return Add(Mul(ae, be), sum0);
+template <class D, HWY_IF_UI32_D(D), class V16>
+HWY_API VFromD<D> ReorderWidenMulAccumulate(D d32, V16 a, V16 b,
+                                            const VFromD<D> sum0,
+                                            VFromD<D>& sum1) {
+  sum1 = MulAdd(PromoteOddTo(d32, a), PromoteOddTo(d32, b), sum1);
+  return MulAdd(PromoteEvenTo(d32, a), PromoteEvenTo(d32, b), sum0);
 }
 // ------------------------------ RearrangeToOddPlusEven
@@ -2754,15 +2827,13 @@ HWY_API VW RearrangeToOddPlusEven(VW sum0, VW sum1) {
 // ================================================== REDUCTIONS
-template <class D, typename T = TFromD<D>>
-HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
-  T sum = T{0};
-  for (size_t i = 0; i < MaxLanes(d); ++i) {
-    sum += v.raw[i];
-  }
-  return Set(d, sum);
-}
-template <class D, typename T = TFromD<D>>
+#ifdef HWY_NATIVE_REDUCE_SCALAR
+#undef HWY_NATIVE_REDUCE_SCALAR
+#else
+#define HWY_NATIVE_REDUCE_SCALAR
+#endif
+template <class D, typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
 HWY_API T ReduceSum(D d, VFromD<D> v) {
   T sum = T{0};
   for (size_t i = 0; i < MaxLanes(d); ++i) {
@@ -2770,39 +2841,56 @@ HWY_API T ReduceSum(D d, VFromD<D> v) {
   }
   return sum;
 }
-template <class D, typename T = TFromD<D>>
-HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
+template <class D, typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
+HWY_API T ReduceMin(D d, VFromD<D> v) {
   T min = HighestValue<T>();
   for (size_t i = 0; i < MaxLanes(d); ++i) {
     min = HWY_MIN(min, v.raw[i]);
   }
-  return Set(d, min);
+  return min;
 }
-template <class D, typename T = TFromD<D>>
-HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
+template <class D, typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
+HWY_API T ReduceMax(D d, VFromD<D> v) {
   T max = LowestValue<T>();
   for (size_t i = 0; i < MaxLanes(d); ++i) {
     max = HWY_MAX(max, v.raw[i]);
   }
-  return Set(d, max);
+  return max;
+}
+// ------------------------------ SumOfLanes
+template <class D, HWY_IF_LANES_GT_D(D, 1)>
+HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
+  return Set(d, ReduceSum(d, v));
+}
+template <class D, HWY_IF_LANES_GT_D(D, 1)>
+HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
+  return Set(d, ReduceMin(d, v));
+}
+template <class D, HWY_IF_LANES_GT_D(D, 1)>
+HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
+  return Set(d, ReduceMax(d, v));
 }
 // ================================================== OPS WITH DEPENDENCIES
 // ------------------------------ MulEven/Odd 64x64 (UpperHalf)
-HWY_INLINE Vec128<uint64_t> MulEven(Vec128<uint64_t> a, Vec128<uint64_t> b) {
-  alignas(16) uint64_t mul[2];
+template <class T, HWY_IF_UI64(T)>
+HWY_API Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) {
+  alignas(16) T mul[2];
   mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]);
-  return Load(Full128<uint64_t>(), mul);
+  return Load(Full128<T>(), mul);
 }
-HWY_INLINE Vec128<uint64_t> MulOdd(Vec128<uint64_t> a, Vec128<uint64_t> b) {
-  alignas(16) uint64_t mul[2];
-  const Half<Full128<uint64_t>> d2;
+template <class T, HWY_IF_UI64(T)>
+HWY_API Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) {
+  alignas(16) T mul[2];
+  const Half<Full128<T>> d2;
   mul[0] =
       Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]);
-  return Load(Full128<uint64_t>(), mul);
+  return Load(Full128<T>(), mul);
 }
 // NOLINTNEXTLINE(google-readability-namespace-comments)