npm - @img/sharp-libvips-dev - Versions diffs - 1.0.2 → 1.0.3 - Mend

@img/sharp-libvips-dev 1.0.2 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (118) hide show

package/README.md +1 -2
package/include/aom/aom_decoder.h +1 -1
package/include/aom/aom_encoder.h +7 -1
package/include/aom/aom_image.h +24 -12
package/include/aom/aom_integer.h +3 -3
package/include/aom/aomcx.h +15 -0
package/include/aom/aomdx.h +5 -2
package/include/archive.h +7 -5
package/include/archive_entry.h +5 -3
package/include/cgif.h +3 -0
package/include/freetype2/freetype/config/ftoption.h +1 -1
package/include/fribidi/fribidi-config.h +2 -2
package/include/fribidi/fribidi-unicode-version.h +3 -3
package/include/glib-2.0/gio/gappinfo.h +40 -25
package/include/glib-2.0/gio/gasyncresult.h +1 -1
package/include/glib-2.0/gio/gconverter.h +5 -0
package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
package/include/glib-2.0/gio/gfile.h +16 -0
package/include/glib-2.0/gio/gio-visibility.h +34 -0
package/include/glib-2.0/gio/gsettings.h +8 -0
package/include/glib-2.0/gio/gvfs.h +2 -2
package/include/glib-2.0/girepository/gi-visibility.h +34 -0
package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
package/include/glib-2.0/glib/giochannel.h +2 -2
package/include/glib-2.0/glib/glib-visibility.h +34 -0
package/include/glib-2.0/glib/gmacros.h +12 -5
package/include/glib-2.0/glib/gmain.h +93 -7
package/include/glib-2.0/glib/gqsort.h +8 -1
package/include/glib-2.0/glib/gstrfuncs.h +0 -12
package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
package/include/glib-2.0/glib/gunicode.h +1 -1
package/include/glib-2.0/glib/gversionmacros.h +9 -0
package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
package/include/glib-2.0/gobject/gtype.h +6 -6
package/include/harfbuzz/hb-buffer.h +6 -0
package/include/harfbuzz/hb-common.h +6 -9
package/include/harfbuzz/hb-cplusplus.hh +8 -11
package/include/harfbuzz/hb-subset.h +17 -4
package/include/harfbuzz/hb-version.h +3 -3
package/include/hwy/abort.h +28 -0
package/include/hwy/aligned_allocator.h +48 -1
package/include/hwy/base.h +235 -34
package/include/hwy/detect_compiler_arch.h +84 -10
package/include/hwy/detect_targets.h +95 -29
package/include/hwy/foreach_target.h +12 -1
package/include/hwy/highway.h +205 -50
package/include/hwy/ops/arm_neon-inl.h +841 -99
package/include/hwy/ops/arm_sve-inl.h +413 -141
package/include/hwy/ops/emu128-inl.h +373 -360
package/include/hwy/ops/generic_ops-inl.h +804 -401
package/include/hwy/ops/inside-inl.h +691 -0
package/include/hwy/ops/ppc_vsx-inl.h +456 -166
package/include/hwy/ops/rvv-inl.h +537 -249
package/include/hwy/ops/scalar-inl.h +169 -79
package/include/hwy/ops/set_macros-inl.h +106 -18
package/include/hwy/ops/shared-inl.h +23 -0
package/include/hwy/ops/wasm_128-inl.h +130 -108
package/include/hwy/ops/x86_128-inl.h +1892 -577
package/include/hwy/ops/x86_256-inl.h +625 -184
package/include/hwy/ops/x86_512-inl.h +733 -131
package/include/hwy/targets.h +22 -21
package/include/hwy/timer-inl.h +3 -3
package/include/hwy/timer.h +5 -1
package/include/libheif/heif.h +170 -15
package/include/libheif/heif_items.h +237 -0
package/include/libheif/heif_properties.h +38 -2
package/include/libheif/heif_regions.h +1 -1
package/include/libheif/heif_version.h +2 -2
package/include/libpng16/pnglibconf.h +1 -1
package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
package/include/libxml2/libxml/HTMLparser.h +12 -19
package/include/libxml2/libxml/c14n.h +1 -12
package/include/libxml2/libxml/debugXML.h +1 -1
package/include/libxml2/libxml/encoding.h +9 -0
package/include/libxml2/libxml/entities.h +12 -1
package/include/libxml2/libxml/hash.h +19 -0
package/include/libxml2/libxml/list.h +2 -2
package/include/libxml2/libxml/nanohttp.h +17 -0
package/include/libxml2/libxml/parser.h +61 -55
package/include/libxml2/libxml/parserInternals.h +9 -1
package/include/libxml2/libxml/pattern.h +6 -0
package/include/libxml2/libxml/tree.h +32 -12
package/include/libxml2/libxml/uri.h +11 -0
package/include/libxml2/libxml/valid.h +29 -2
package/include/libxml2/libxml/xinclude.h +7 -0
package/include/libxml2/libxml/xmlIO.h +21 -4
package/include/libxml2/libxml/xmlerror.h +14 -0
package/include/libxml2/libxml/xmlexports.h +111 -15
package/include/libxml2/libxml/xmlmemory.h +8 -45
package/include/libxml2/libxml/xmlreader.h +2 -0
package/include/libxml2/libxml/xmlsave.h +5 -0
package/include/libxml2/libxml/xmlunicode.h +165 -1
package/include/libxml2/libxml/xmlversion.h +15 -179
package/include/libxml2/libxml/xmlwriter.h +1 -0
package/include/libxml2/libxml/xpath.h +4 -0
package/include/pango-1.0/pango/pango-features.h +3 -3
package/include/pango-1.0/pango/pango-item.h +4 -2
package/include/pango-1.0/pango/pango-version-macros.h +25 -0
package/include/pango-1.0/pango/pangofc-font.h +2 -1
package/include/pnglibconf.h +1 -1
package/include/vips/util.h +1 -2
package/include/vips/version.h +4 -4
package/include/webp/decode.h +58 -56
package/include/webp/demux.h +25 -21
package/include/webp/encode.h +44 -39
package/include/webp/mux.h +76 -15
package/include/webp/mux_types.h +2 -1
package/include/webp/sharpyuv/sharpyuv.h +77 -8
package/include/webp/types.h +29 -8
package/include/zconf.h +1 -1
package/include/zlib.h +12 -12
package/package.json +1 -1
package/versions.json +14 -15

package/include/hwy/ops/emu128-inl.h CHANGED Viewed

@@ -17,6 +17,7 @@
 // External include guard in highway.h - see comment there.
 #include "hwy/base.h"
 #ifndef HWY_NO_LIBCXX
 #include <math.h>  // sqrtf
 #endif
@@ -103,9 +104,6 @@ HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
 template <class D>
 using VFromD = decltype(Zero(D()));
-// ------------------------------ Tuple (VFromD)
-#include "hwy/ops/tuple-inl.h"
 // ------------------------------ BitCast
 template <class D, class VFrom>
@@ -355,9 +353,8 @@ HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) {
 // ------------------------------ BroadcastSignBit
 template <typename T, size_t N>
 HWY_API Vec128<T, N> BroadcastSignBit(Vec128<T, N> v) {
-  // This is used inside ShiftRight, so we cannot implement in terms of it.
   for (size_t i = 0; i < N; ++i) {
-    v.raw[i] = static_cast<T>(v.raw[i] < 0 ? -1 : 0);
+    v.raw[i] = ScalarShr(v.raw[i], sizeof(T) * 8 - 1);
   }
   return v;
 }
@@ -431,12 +428,6 @@ HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
   return v;
 }
-template <typename T, size_t N>
-HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
-  const DFromV<decltype(v)> d;
-  return IfNegativeThenElse(v, Zero(d), v);
-}
 // ------------------------------ Mask logical
 template <typename T, size_t N>
@@ -494,41 +485,26 @@ HWY_API Vec128<T, N> ShiftLeft(Vec128<T, N> v) {
 template <int kBits, typename T, size_t N>
 HWY_API Vec128<T, N> ShiftRight(Vec128<T, N> v) {
   static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
-#if __cplusplus >= 202002L
   // Signed right shift is now guaranteed to be arithmetic (rounding toward
   // negative infinity, i.e. shifting in the sign bit).
   for (size_t i = 0; i < N; ++i) {
-    v.raw[i] = static_cast<T>(v.raw[i] >> kBits);
+    v.raw[i] = ScalarShr(v.raw[i], kBits);
   }
-#else
-  if (IsSigned<T>()) {
-    // Emulate arithmetic shift using only logical (unsigned) shifts, because
-    // signed shifts are still implementation-defined.
-    using TU = hwy::MakeUnsigned<T>;
-    for (size_t i = 0; i < N; ++i) {
-      const TU shifted = static_cast<TU>(static_cast<TU>(v.raw[i]) >> kBits);
-      const TU sign = v.raw[i] < 0 ? static_cast<TU>(~TU{0}) : 0;
-      const size_t sign_shift =
-          static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - kBits);
-      const TU upper = static_cast<TU>(sign << sign_shift);
-      v.raw[i] = static_cast<T>(shifted | upper);
-    }
-  } else {  // T is unsigned
-    for (size_t i = 0; i < N; ++i) {
-      v.raw[i] = static_cast<T>(v.raw[i] >> kBits);
-    }
-  }
-#endif
   return v;
 }
 // ------------------------------ RotateRight (ShiftRight)
-template <int kBits, typename T, size_t N>
+template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
 HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
   constexpr size_t kSizeInBits = sizeof(T) * 8;
   static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
   if (kBits == 0) return v;
-  return Or(ShiftRight<kBits>(v),
+  return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
             ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
 }
@@ -545,31 +521,10 @@ HWY_API Vec128<T, N> ShiftLeftSame(Vec128<T, N> v, int bits) {
 template <typename T, size_t N>
 HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, int bits) {
-#if __cplusplus >= 202002L
-  // Signed right shift is now guaranteed to be arithmetic (rounding toward
-  // negative infinity, i.e. shifting in the sign bit).
   for (size_t i = 0; i < N; ++i) {
-    v.raw[i] = static_cast<T>(v.raw[i] >> bits);
-  }
-#else
-  if (IsSigned<T>()) {
-    // Emulate arithmetic shift using only logical (unsigned) shifts, because
-    // signed shifts are still implementation-defined.
-    using TU = hwy::MakeUnsigned<T>;
-    for (size_t i = 0; i < N; ++i) {
-      const TU shifted = static_cast<TU>(static_cast<TU>(v.raw[i]) >> bits);
-      const TU sign = v.raw[i] < 0 ? static_cast<TU>(~TU{0}) : 0;
-      const size_t sign_shift =
-          static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - bits);
-      const TU upper = static_cast<TU>(sign << sign_shift);
-      v.raw[i] = static_cast<T>(shifted | upper);
-    }
-  } else {
-    for (size_t i = 0; i < N; ++i) {
-      v.raw[i] = static_cast<T>(v.raw[i] >> bits);  // unsigned, logical shift
-    }
+    v.raw[i] = ScalarShr(v.raw[i], bits);
   }
-#endif
   return v;
 }
@@ -587,32 +542,10 @@ HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, Vec128<T, N> bits) {
 template <typename T, size_t N>
 HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, Vec128<T, N> bits) {
-#if __cplusplus >= 202002L
-  // Signed right shift is now guaranteed to be arithmetic (rounding toward
-  // negative infinity, i.e. shifting in the sign bit).
   for (size_t i = 0; i < N; ++i) {
-    v.raw[i] = static_cast<T>(v.raw[i] >> bits.raw[i]);
-  }
-#else
-  if (IsSigned<T>()) {
-    // Emulate arithmetic shift using only logical (unsigned) shifts, because
-    // signed shifts are still implementation-defined.
-    using TU = hwy::MakeUnsigned<T>;
-    for (size_t i = 0; i < N; ++i) {
-      const TU shifted =
-          static_cast<TU>(static_cast<TU>(v.raw[i]) >> bits.raw[i]);
-      const TU sign = v.raw[i] < 0 ? static_cast<TU>(~TU{0}) : 0;
-      const size_t sign_shift = static_cast<size_t>(
-          static_cast<int>(sizeof(TU)) * 8 - 1 - bits.raw[i]);
-      const TU upper = static_cast<TU>(sign << sign_shift);
-      v.raw[i] = static_cast<T>(shifted | upper);
-    }
-  } else {  // T is unsigned
-    for (size_t i = 0; i < N; ++i) {
-      v.raw[i] = static_cast<T>(v.raw[i] >> bits.raw[i]);
-    }
+    v.raw[i] = ScalarShr(v.raw[i], static_cast<int>(bits.raw[i]));
   }
-#endif
   return v;
 }
@@ -890,26 +823,36 @@ HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
   return a;
 }
-// Returns the upper 16 bits of a * b in each lane.
-template <size_t N>
-HWY_API Vec128<int16_t, N> MulHigh(Vec128<int16_t, N> a, Vec128<int16_t, N> b) {
+// Returns the upper sizeof(T)*8 bits of a * b in each lane.
+template <class T, size_t N,
+          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
+          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
+HWY_API Vec128<T, N> MulHigh(Vec128<T, N> a, Vec128<T, N> b) {
+  using TW = MakeWide<T>;
   for (size_t i = 0; i < N; ++i) {
-    a.raw[i] = static_cast<int16_t>((int32_t{a.raw[i]} * b.raw[i]) >> 16);
+    a.raw[i] = static_cast<T>(
+        (static_cast<TW>(a.raw[i]) * static_cast<TW>(b.raw[i])) >>
+        (sizeof(T) * 8));
   }
   return a;
 }
-template <size_t N>
-HWY_API Vec128<uint16_t, N> MulHigh(Vec128<uint16_t, N> a,
-                                    Vec128<uint16_t, N> b) {
-  for (size_t i = 0; i < N; ++i) {
-    // Cast to uint32_t first to prevent overflow. Otherwise the result of
-    // uint16_t * uint16_t is in "int" which may overflow. In practice the
-    // result is the same but this way it is also defined.
-    a.raw[i] = static_cast<uint16_t>(
-        (static_cast<uint32_t>(a.raw[i]) * static_cast<uint32_t>(b.raw[i])) >>
-        16);
-  }
-  return a;
+template <class T, HWY_IF_UI64(T)>
+HWY_API Vec128<T, 1> MulHigh(Vec128<T, 1> a, Vec128<T, 1> b) {
+  T hi;
+  Mul128(GetLane(a), GetLane(b), &hi);
+  return Set(Full64<T>(), hi);
+}
+template <class T, HWY_IF_UI64(T)>
+HWY_API Vec128<T> MulHigh(Vec128<T> a, Vec128<T> b) {
+  T hi_0;
+  T hi_1;
+  Mul128(GetLane(a), GetLane(b), &hi_0);
+  Mul128(ExtractLane(a, 1), ExtractLane(b, 1), &hi_1);
+  return Dup128VecFromValues(Full128<T>(), hi_0, hi_1);
 }
 template <size_t N>
@@ -1457,6 +1400,183 @@ HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p,
   CopyBytes(v.raw, p, num_of_lanes_to_store * sizeof(TFromD<D>));
 }
+// ================================================== COMBINE
+template <typename T, size_t N>
+HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
+  Vec128<T, N / 2> ret;
+  CopyBytes<N / 2 * sizeof(T)>(v.raw, ret.raw);
+  return ret;
+}
+template <class D>
+HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) {
+  return LowerHalf(v);
+}
+template <class D>
+HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
+  VFromD<D> ret;
+  CopyBytes<d.MaxBytes()>(&v.raw[MaxLanes(d)], ret.raw);
+  return ret;
+}
+template <class D>
+HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> v) {
+  const Half<decltype(d)> dh;
+  VFromD<D> ret;  // zero-initialized
+  CopyBytes<dh.MaxBytes()>(v.raw, ret.raw);
+  return ret;
+}
+template <class D, class VH = VFromD<Half<D>>>
+HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) {
+  const Half<decltype(d)> dh;
+  VFromD<D> ret;
+  CopyBytes<dh.MaxBytes()>(lo_half.raw, &ret.raw[0]);
+  CopyBytes<dh.MaxBytes()>(hi_half.raw, &ret.raw[MaxLanes(dh)]);
+  return ret;
+}
+template <class D>
+HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
+  const Half<decltype(d)> dh;
+  VFromD<D> ret;
+  CopyBytes<dh.MaxBytes()>(lo.raw, &ret.raw[0]);
+  CopyBytes<dh.MaxBytes()>(hi.raw, &ret.raw[MaxLanes(dh)]);
+  return ret;
+}
+template <class D>
+HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
+  const Half<decltype(d)> dh;
+  VFromD<D> ret;
+  CopyBytes<dh.MaxBytes()>(&lo.raw[MaxLanes(dh)], &ret.raw[0]);
+  CopyBytes<dh.MaxBytes()>(&hi.raw[MaxLanes(dh)], &ret.raw[MaxLanes(dh)]);
+  return ret;
+}
+template <class D>
+HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
+  const Half<decltype(d)> dh;
+  VFromD<D> ret;
+  CopyBytes<dh.MaxBytes()>(&lo.raw[MaxLanes(dh)], &ret.raw[0]);
+  CopyBytes<dh.MaxBytes()>(hi.raw, &ret.raw[MaxLanes(dh)]);
+  return ret;
+}
+template <class D>
+HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
+  const Half<decltype(d)> dh;
+  VFromD<D> ret;
+  CopyBytes<dh.MaxBytes()>(lo.raw, &ret.raw[0]);
+  CopyBytes<dh.MaxBytes()>(&hi.raw[MaxLanes(dh)], &ret.raw[MaxLanes(dh)]);
+  return ret;
+}
+template <class D>
+HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
+  const Half<decltype(d)> dh;
+  VFromD<D> ret;
+  for (size_t i = 0; i < MaxLanes(dh); ++i) {
+    ret.raw[i] = lo.raw[2 * i];
+  }
+  for (size_t i = 0; i < MaxLanes(dh); ++i) {
+    ret.raw[MaxLanes(dh) + i] = hi.raw[2 * i];
+  }
+  return ret;
+}
+// 2023-11-23: workaround for incorrect codegen (reduction_test fails for
+// SumsOf2 because PromoteOddTo, which uses ConcatOdd, returns zero).
+#if HWY_ARCH_RISCV && HWY_TARGET == HWY_EMU128 && HWY_COMPILER_CLANG
+#define HWY_EMU128_CONCAT_INLINE HWY_NOINLINE
+#else
+#define HWY_EMU128_CONCAT_INLINE HWY_API
+#endif
+template <class D>
+HWY_EMU128_CONCAT_INLINE VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
+  const Half<decltype(d)> dh;
+  VFromD<D> ret;
+  for (size_t i = 0; i < MaxLanes(dh); ++i) {
+    ret.raw[i] = lo.raw[2 * i + 1];
+  }
+  for (size_t i = 0; i < MaxLanes(dh); ++i) {
+    ret.raw[MaxLanes(dh) + i] = hi.raw[2 * i + 1];
+  }
+  return ret;
+}
+// ------------------------------ CombineShiftRightBytes
+template <int kBytes, class D>
+HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
+  VFromD<D> ret;
+  const uint8_t* HWY_RESTRICT lo8 =
+      reinterpret_cast<const uint8_t * HWY_RESTRICT>(lo.raw);
+  uint8_t* HWY_RESTRICT ret8 =
+      reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
+  CopyBytes<d.MaxBytes() - kBytes>(lo8 + kBytes, ret8);
+  CopyBytes<kBytes>(hi.raw, ret8 + d.MaxBytes() - kBytes);
+  return ret;
+}
+// ------------------------------ ShiftLeftBytes
+template <int kBytes, class D>
+HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) {
+  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
+  VFromD<D> ret;
+  uint8_t* HWY_RESTRICT ret8 =
+      reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
+  ZeroBytes<kBytes>(ret8);
+  CopyBytes<d.MaxBytes() - kBytes>(v.raw, ret8 + kBytes);
+  return ret;
+}
+template <int kBytes, typename T, size_t N>
+HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
+  return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
+}
+// ------------------------------ ShiftLeftLanes
+template <int kLanes, class D, typename T = TFromD<D>>
+HWY_API VFromD<D> ShiftLeftLanes(D d, VFromD<D> v) {
+  const Repartition<uint8_t, decltype(d)> d8;
+  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
+}
+template <int kLanes, typename T, size_t N>
+HWY_API Vec128<T, N> ShiftLeftLanes(Vec128<T, N> v) {
+  return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
+}
+// ------------------------------ ShiftRightBytes
+template <int kBytes, class D>
+HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) {
+  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
+  VFromD<D> ret;
+  const uint8_t* HWY_RESTRICT v8 =
+      reinterpret_cast<const uint8_t * HWY_RESTRICT>(v.raw);
+  uint8_t* HWY_RESTRICT ret8 =
+      reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
+  CopyBytes<d.MaxBytes() - kBytes>(v8 + kBytes, ret8);
+  ZeroBytes<kBytes>(ret8 + d.MaxBytes() - kBytes);
+  return ret;
+}
+// ------------------------------ ShiftRightLanes
+template <int kLanes, class D>
+HWY_API VFromD<D> ShiftRightLanes(D d, VFromD<D> v) {
+  const Repartition<uint8_t, decltype(d)> d8;
+  constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
+  return BitCast(d, ShiftRightBytes<kBytes>(d8, BitCast(d8, v)));
+}
+// ------------------------------ Tuples, PromoteEvenTo/PromoteOddTo
+#include "hwy/ops/inside-inl.h"
 // ------------------------------ LoadInterleaved2/3/4
 // Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2.
@@ -1621,6 +1741,47 @@ HWY_INLINE ToT CastValueForPromoteTo(hwy::UnsignedTag /*to_type_tag*/,
                                      float val) {
   return CastValueForF2IConv<ToT>(val);
 }
+// If val is within the range of ToT, CastValueForInRangeF2IConv<ToT>(val)
+// returns static_cast<ToT>(val)
+//
+// Otherwise, CastValueForInRangeF2IConv<ToT>(val) returns an
+// implementation-defined result if val is not within the range of ToT.
+template <class ToT, class FromT>
+HWY_INLINE ToT CastValueForInRangeF2IConv(FromT val) {
+  // Prevent ubsan errors when converting float to narrower integer
+  using FromTU = MakeUnsigned<FromT>;
+  constexpr unsigned kMaxExpField =
+      static_cast<unsigned>(MaxExponentField<FromT>());
+  constexpr unsigned kExpBias = kMaxExpField >> 1;
+  constexpr unsigned kMinOutOfRangeExpField = static_cast<unsigned>(HWY_MIN(
+      kExpBias + sizeof(ToT) * 8 - static_cast<unsigned>(IsSigned<ToT>()),
+      kMaxExpField));
+  // If ToT is signed, compare only the exponent bits of val against
+  // kMinOutOfRangeExpField.
+  //
+  // Otherwise, if ToT is unsigned, compare the sign bit plus exponent bits of
+  // val against kMinOutOfRangeExpField as a negative value is outside of the
+  // range of an unsigned integer type.
+  const FromT val_to_compare =
+      static_cast<FromT>(IsSigned<ToT>() ? ScalarAbs(val) : val);
+  // val is within the range of ToT if
+  // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is less
+  // than kMinOutOfRangeExpField
+  //
+  // Otherwise, val is either outside of the range of ToT or equal to
+  // LimitsMin<ToT>() if
+  // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is greater
+  // than or equal to kMinOutOfRangeExpField.
+  return (static_cast<unsigned>(BitCastScalar<FromTU>(val_to_compare) >>
+                                MantissaBits<FromT>()) < kMinOutOfRangeExpField)
+             ? static_cast<ToT>(val)
+             : static_cast<ToT>(LimitsMin<ToT>());
+}
 }  // namespace detail
@@ -1636,6 +1797,21 @@ HWY_API VFromD<DTo> PromoteTo(DTo d, Vec128<TFrom, HWY_MAX_LANES_D(DTo)> from) {
   return ret;
 }
+#ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
+#undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
+#else
+#define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
+#endif
+template <class D64, HWY_IF_UI64_D(D64)>
+HWY_API VFromD<D64> PromoteInRangeTo(D64 d64, VFromD<Rebind<float, D64>> v) {
+  VFromD<D64> ret;
+  for (size_t i = 0; i < MaxLanes(d64); ++i) {
+    ret.raw[i] = detail::CastValueForInRangeF2IConv<TFromD<D64>>(v.raw[i]);
+  }
+  return ret;
+}
 // MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(TFrom) is here,
 // so we overload for TFrom=double and ToT={float,int32_t}.
 template <class D, HWY_IF_F32_D(D)>
@@ -1679,17 +1855,32 @@ HWY_API VFromD<DTo> DemoteTo(DTo /* tag */, Vec128<TFrom, N> from) {
   return ret;
 }
+// Disable the default unsigned to signed DemoteTo/ReorderDemote2To
+// implementations in generic_ops-inl.h on EMU128 as the EMU128 target has
+// target-specific implementations of the unsigned to signed DemoteTo and
+// ReorderDemote2To ops
+// NOTE: hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr is used instead of
+// hwy::EnableIf<false>* = nullptr to avoid compiler errors since
+// !hwy::IsSame<V, V>() is always false and as !hwy::IsSame<V, V>() will cause
+// SFINAE to occur instead of a hard error due to a dependency on the V template
+// argument
+#undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
+#define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) \
+  hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr
 template <class DTo, typename TFrom, size_t N, HWY_IF_UNSIGNED(TFrom),
-          HWY_IF_UNSIGNED_D(DTo)>
+          HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DTo)>
 HWY_API VFromD<DTo> DemoteTo(DTo /* tag */, Vec128<TFrom, N> from) {
   using TTo = TFromD<DTo>;
   static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting");
+  const auto max = static_cast<MakeUnsigned<TTo>>(LimitsMax<TTo>());
   VFromD<DTo> ret;
   for (size_t i = 0; i < N; ++i) {
     // Int to int: choose closest value in ToT to `from` (avoids UB)
-    from.raw[i] = HWY_MIN(from.raw[i], LimitsMax<TTo>());
-    ret.raw[i] = static_cast<TTo>(from.raw[i]);
+    ret.raw[i] = static_cast<TTo>(HWY_MIN(from.raw[i], max));
   }
   return ret;
 }
@@ -1737,14 +1928,15 @@ HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
   return ret;
 }
-template <class DN, HWY_IF_UNSIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V),
-          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
+template <class DN, HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DN), class V,
+          HWY_IF_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
           HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
 HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
   const RepartitionToWide<decltype(dn)> dw;
   const size_t NW = Lanes(dw);
   using TN = TFromD<DN>;
-  const TN max = LimitsMax<TN>();
+  using TN_U = MakeUnsigned<TN>;
+  const TN_U max = static_cast<TN_U>(LimitsMax<TN>());
   VFromD<DN> ret;
   for (size_t i = 0; i < NW; ++i) {
     ret.raw[i] = static_cast<TN>(HWY_MIN(a.raw[i], max));
@@ -1803,6 +1995,12 @@ HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<bfloat16_t, N> v) {
   return ret;
 }
+#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
+#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
+#else
+#define HWY_NATIVE_DEMOTE_F32_TO_BF16
+#endif
 template <class D, HWY_IF_BF16_D(D), size_t N>
 HWY_API VFromD<D> DemoteTo(D /* tag */, Vec128<float, N> v) {
   VFromD<D> ret;
@@ -1812,6 +2010,21 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec128<float, N> v) {
   return ret;
 }
+#ifdef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
+#undef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
+#else
+#define HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
+#endif
+template <class D32, HWY_IF_UI32_D(D32)>
+HWY_API VFromD<D32> DemoteInRangeTo(D32 d32, VFromD<Rebind<double, D32>> v) {
+  VFromD<D32> ret;
+  for (size_t i = 0; i < MaxLanes(d32); ++i) {
+    ret.raw[i] = detail::CastValueForInRangeF2IConv<TFromD<D32>>(v.raw[i]);
+  }
+  return ret;
+}
 // Tag dispatch instead of SFINAE for MSVC 2017 compatibility
 namespace detail {
@@ -1851,6 +2064,22 @@ HWY_API VFromD<DTo> ConvertTo(DTo d, Vec128<TFrom, HWY_MAX_LANES_D(DTo)> from) {
   return detail::ConvertTo(hwy::IsFloatTag<TFrom>(), d, from);
 }
+#ifdef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
+#undef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
+#else
+#define HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
+#endif
+template <class DI, HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DI),
+          HWY_IF_T_SIZE_ONE_OF_D(DI, (1 << 4) | (1 << 8))>
+HWY_API VFromD<DI> ConvertInRangeTo(DI di, VFromD<RebindToFloat<DI>> v) {
+  VFromD<DI> ret;
+  for (size_t i = 0; i < MaxLanes(di); i++) {
+    ret.raw[i] = detail::CastValueForInRangeF2IConv<TFromD<DI>>(v.raw[i]);
+  }
+  return ret;
+}
 template <size_t N>
 HWY_API Vec128<uint8_t, N> U8FromU32(Vec128<uint32_t, N> v) {
   return DemoteTo(Simd<uint8_t, N, 0>(), v);
@@ -1938,180 +2167,6 @@ HWY_API VFromD<DN> OrderedTruncate2To(DN dn, V a, V b) {
   return ret;
 }
-// ================================================== COMBINE
-template <typename T, size_t N>
-HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
-  Vec128<T, N / 2> ret;
-  CopyBytes<N / 2 * sizeof(T)>(v.raw, ret.raw);
-  return ret;
-}
-template <class D>
-HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) {
-  return LowerHalf(v);
-}
-template <class D>
-HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
-  VFromD<D> ret;
-  CopyBytes<d.MaxBytes()>(&v.raw[MaxLanes(d)], ret.raw);
-  return ret;
-}
-template <class D>
-HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> v) {
-  const Half<decltype(d)> dh;
-  VFromD<D> ret;  // zero-initialized
-  CopyBytes<dh.MaxBytes()>(v.raw, ret.raw);
-  return ret;
-}
-template <class D, class VH = VFromD<Half<D>>>
-HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) {
-  const Half<decltype(d)> dh;
-  VFromD<D> ret;
-  CopyBytes<dh.MaxBytes()>(lo_half.raw, &ret.raw[0]);
-  CopyBytes<dh.MaxBytes()>(hi_half.raw, &ret.raw[MaxLanes(dh)]);
-  return ret;
-}
-template <class D>
-HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
-  const Half<decltype(d)> dh;
-  VFromD<D> ret;
-  CopyBytes<dh.MaxBytes()>(lo.raw, &ret.raw[0]);
-  CopyBytes<dh.MaxBytes()>(hi.raw, &ret.raw[MaxLanes(dh)]);
-  return ret;
-}
-template <class D>
-HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
-  const Half<decltype(d)> dh;
-  VFromD<D> ret;
-  CopyBytes<dh.MaxBytes()>(&lo.raw[MaxLanes(dh)], &ret.raw[0]);
-  CopyBytes<dh.MaxBytes()>(&hi.raw[MaxLanes(dh)], &ret.raw[MaxLanes(dh)]);
-  return ret;
-}
-template <class D>
-HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
-  const Half<decltype(d)> dh;
-  VFromD<D> ret;
-  CopyBytes<dh.MaxBytes()>(&lo.raw[MaxLanes(dh)], &ret.raw[0]);
-  CopyBytes<dh.MaxBytes()>(hi.raw, &ret.raw[MaxLanes(dh)]);
-  return ret;
-}
-template <class D>
-HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
-  const Half<decltype(d)> dh;
-  VFromD<D> ret;
-  CopyBytes<dh.MaxBytes()>(lo.raw, &ret.raw[0]);
-  CopyBytes<dh.MaxBytes()>(&hi.raw[MaxLanes(dh)], &ret.raw[MaxLanes(dh)]);
-  return ret;
-}
-template <class D>
-HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
-  const Half<decltype(d)> dh;
-  VFromD<D> ret;
-  for (size_t i = 0; i < MaxLanes(dh); ++i) {
-    ret.raw[i] = lo.raw[2 * i];
-  }
-  for (size_t i = 0; i < MaxLanes(dh); ++i) {
-    ret.raw[MaxLanes(dh) + i] = hi.raw[2 * i];
-  }
-  return ret;
-}
-// 2023-11-23: workaround for incorrect codegen (reduction_test fails for
-// SumsOf2 because PromoteOddTo, which uses ConcatOdd, returns zero).
-#if HWY_ARCH_RVV && HWY_TARGET == HWY_EMU128 && HWY_COMPILER_CLANG
-#define HWY_EMU128_CONCAT_INLINE HWY_NOINLINE
-#else
-#define HWY_EMU128_CONCAT_INLINE HWY_API
-#endif
-template <class D>
-HWY_EMU128_CONCAT_INLINE VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
-  const Half<decltype(d)> dh;
-  VFromD<D> ret;
-  for (size_t i = 0; i < MaxLanes(dh); ++i) {
-    ret.raw[i] = lo.raw[2 * i + 1];
-  }
-  for (size_t i = 0; i < MaxLanes(dh); ++i) {
-    ret.raw[MaxLanes(dh) + i] = hi.raw[2 * i + 1];
-  }
-  return ret;
-}
-// ------------------------------ CombineShiftRightBytes
-template <int kBytes, class D>
-HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
-  VFromD<D> ret;
-  const uint8_t* HWY_RESTRICT lo8 =
-      reinterpret_cast<const uint8_t * HWY_RESTRICT>(lo.raw);
-  uint8_t* HWY_RESTRICT ret8 =
-      reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
-  CopyBytes<d.MaxBytes() - kBytes>(lo8 + kBytes, ret8);
-  CopyBytes<kBytes>(hi.raw, ret8 + d.MaxBytes() - kBytes);
-  return ret;
-}
-// ------------------------------ ShiftLeftBytes
-template <int kBytes, class D>
-HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) {
-  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
-  VFromD<D> ret;
-  uint8_t* HWY_RESTRICT ret8 =
-      reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
-  ZeroBytes<kBytes>(ret8);
-  CopyBytes<d.MaxBytes() - kBytes>(v.raw, ret8 + kBytes);
-  return ret;
-}
-template <int kBytes, typename T, size_t N>
-HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
-  return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
-}
-// ------------------------------ ShiftLeftLanes
-template <int kLanes, class D, typename T = TFromD<D>>
-HWY_API VFromD<D> ShiftLeftLanes(D d, VFromD<D> v) {
-  const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
-}
-template <int kLanes, typename T, size_t N>
-HWY_API Vec128<T, N> ShiftLeftLanes(Vec128<T, N> v) {
-  return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
-}
-// ------------------------------ ShiftRightBytes
-template <int kBytes, class D>
-HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) {
-  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
-  VFromD<D> ret;
-  const uint8_t* HWY_RESTRICT v8 =
-      reinterpret_cast<const uint8_t * HWY_RESTRICT>(v.raw);
-  uint8_t* HWY_RESTRICT ret8 =
-      reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
-  CopyBytes<d.MaxBytes() - kBytes>(v8 + kBytes, ret8);
-  ZeroBytes<kBytes>(ret8 + d.MaxBytes() - kBytes);
-  return ret;
-}
-// ------------------------------ ShiftRightLanes
-template <int kLanes, class D>
-HWY_API VFromD<D> ShiftRightLanes(D d, VFromD<D> v) {
-  const Repartition<uint8_t, decltype(d)> d8;
-  constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
-  return BitCast(d, ShiftRightBytes<kBytes>(d8, BitCast(d8, v)));
-}
 // ================================================== SWIZZLE
 template <typename T, size_t N>
@@ -2154,6 +2209,24 @@ HWY_API Vec128<T, N> OddEven(Vec128<T, N> odd, Vec128<T, N> even) {
   return odd;
 }
+template <class D>  // returns a with each odd lane replaced by b's even lane just below it
+HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  constexpr size_t N = HWY_MAX_LANES_D(D);  // loop bound derived from the tag type
+  for (size_t i = 1; i < N; i += 2) {  // i visits odd lane indices only
+    a.raw[i] = b.raw[i - 1];  // odd lane i of the result is b's lane i - 1
+  }
+  return a;  // a is a by-value copy; its even lanes are left as they were
+}
+template <class D>  // returns b with each even lane replaced by a's odd lane just above it
+HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  constexpr size_t N = HWY_MAX_LANES_D(D);  // loop bound derived from the tag type
+  for (size_t i = 1; i < N; i += 2) {  // i visits odd lane indices only
+    b.raw[i - 1] = a.raw[i];  // even lane i - 1 of the result is a's lane i
+  }
+  return b;  // b is a by-value copy; its odd lanes are left as they were
+}
 template <typename T, size_t N>
 HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
   return even;
@@ -2724,88 +2797,26 @@ HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) {
 // ------------------------------ WidenMulPairwiseAdd
-template <class D, HWY_IF_F32_D(D), class VBF16>  // overload selected for f32 tags
-HWY_API VFromD<D> WidenMulPairwiseAdd(D df32, VBF16 a, VBF16 b) {
-  const Rebind<uint32_t, decltype(df32)> du32;  // uint32_t tag used for the bit manipulation
-  using VU32 = VFromD<decltype(du32)>;
-  const VU32 odd = Set(du32, 0xFFFF0000u);  // bfloat16 is the upper half of f32
-  // Avoid ZipLower/Upper so this also works on big-endian systems.
-  const VU32 ae = ShiftLeft<16>(BitCast(du32, a));  // low 16 bits of a moved to the upper half
-  const VU32 ao = And(BitCast(du32, a), odd);  // upper 16 bits of a kept in place
-  const VU32 be = ShiftLeft<16>(BitCast(du32, b));  // same split applied to b
-  const VU32 bo = And(BitCast(du32, b), odd);
-  return Mul(BitCast(df32, ae), BitCast(df32, be)) +  // ae * be, reinterpreted as f32
-         Mul(BitCast(df32, ao), BitCast(df32, bo));  // plus ao * bo
-}
-template <class D, HWY_IF_I32_D(D), class VI16>  // old overload selected for i32 tags
-HWY_API VFromD<D> WidenMulPairwiseAdd(D d32, VI16 a, VI16 b) {
-  using VI32 = VFromD<decltype(d32)>;
-  // Manual sign extension requires two shifts for even lanes.
-  const VI32 ae = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, a)));
-  const VI32 be = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, b)));
-  const VI32 ao = ShiftRight<16>(BitCast(d32, a));  // upper 16 bits shifted down
-  const VI32 bo = ShiftRight<16>(BitCast(d32, b));
-  return Add(Mul(ae, be), Mul(ao, bo));  // ae * be + ao * bo
+template <class DF, HWY_IF_F32_D(DF), class VBF>  // new overload selected for f32 tags
+HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
+  return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b),  // presumably even(a) * even(b) + ...
+                Mul(PromoteOddTo(df, a), PromoteOddTo(df, b)));  // ... odd(a) * odd(b)
 }
-template <class D, HWY_IF_U32_D(D), class VU16>  // old overload selected for u32 tags
-HWY_API VFromD<D> WidenMulPairwiseAdd(D du32, VU16 a, VU16 b) {
-  const auto lo16_mask = Set(du32, 0x0000FFFFu);  // keeps the low 16 bits of each 32-bit lane
-  const auto a0 = And(BitCast(du32, a), lo16_mask);
-  const auto b0 = And(BitCast(du32, b), lo16_mask);
-  const auto a1 = ShiftRight<16>(BitCast(du32, a));  // upper 16 bits shifted down
-  const auto b1 = ShiftRight<16>(BitCast(du32, b));
-  return Add(Mul(a0, b0), Mul(a1, b1));  // a0 * b0 + a1 * b1
+template <class D, HWY_IF_UI32_D(D), class V16>  // one overload now replaces the i32 and u32 versions
+HWY_API VFromD<D> WidenMulPairwiseAdd(D d32, V16 a, V16 b) {
+  return MulAdd(PromoteEvenTo(d32, a), PromoteEvenTo(d32, b),  // presumably even(a) * even(b) + ...
+                Mul(PromoteOddTo(d32, a), PromoteOddTo(d32, b)));  // ... odd(a) * odd(b)
 }
 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
-template <class D, HWY_IF_F32_D(D), size_t N, class VBF16>  // old overload selected for f32 tags
-HWY_API VFromD<D> ReorderWidenMulAccumulate(D df32, VBF16 a, VBF16 b,
-                                            const Vec128<float, N> sum0,
-                                            Vec128<float, N>& sum1) {  // sum1 is updated in place
-  const Rebind<uint32_t, decltype(df32)> du32;
-  using VU32 = VFromD<decltype(du32)>;
-  const VU32 odd = Set(du32, 0xFFFF0000u);  // bfloat16 is the upper half of f32
-  // Avoid ZipLower/Upper so this also works on big-endian systems.
-  const VU32 ae = ShiftLeft<16>(BitCast(du32, a));  // low 16 bits of a moved to the upper half
-  const VU32 ao = And(BitCast(du32, a), odd);  // upper 16 bits of a kept in place
-  const VU32 be = ShiftLeft<16>(BitCast(du32, b));
-  const VU32 bo = And(BitCast(du32, b), odd);
-  sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);  // ao/bo product goes into sum1
-  return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);  // ae/be product goes into the result
-}
-template <class D, HWY_IF_I32_D(D), size_t N, class VI16>  // old overload selected for i32 tags
-HWY_API VFromD<D> ReorderWidenMulAccumulate(D d32, VI16 a, VI16 b,
-                                            const Vec128<int32_t, N> sum0,
-                                            Vec128<int32_t, N>& sum1) {  // sum1 is updated in place
-  using VI32 = VFromD<decltype(d32)>;
-  // Manual sign extension requires two shifts for even lanes.
-  const VI32 ae = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, a)));
-  const VI32 be = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, b)));
-  const VI32 ao = ShiftRight<16>(BitCast(d32, a));  // upper 16 bits shifted down
-  const VI32 bo = ShiftRight<16>(BitCast(d32, b));
-  sum1 = Add(Mul(ao, bo), sum1);  // ao * bo is added to sum1
-  return Add(Mul(ae, be), sum0);  // ae * be is added to sum0 and returned
-}
-template <class D, HWY_IF_U32_D(D), size_t N, class VU16>  // old overload selected for u32 tags
-HWY_API VFromD<D> ReorderWidenMulAccumulate(D du32, VU16 a, VU16 b,
-                                            const Vec128<uint32_t, N> sum0,
-                                            Vec128<uint32_t, N>& sum1) {
-  using VU32 = VFromD<decltype(du32)>;
-  const VU32 lo16_mask = Set(du32, uint32_t{0x0000FFFFu});  // keeps the low 16 bits of each lane
-  const VU32 ae = And(BitCast(du32, a), lo16_mask);
-  const VU32 be = And(BitCast(du32, b), lo16_mask);
-  const VU32 ao = ShiftRight<16>(BitCast(du32, a));  // upper 16 bits shifted down
-  const VU32 bo = ShiftRight<16>(BitCast(du32, b));
-  sum1 = Add(Mul(ao, bo), sum1);
-  return Add(Mul(ae, be), sum0);
+template <class D, HWY_IF_UI32_D(D), class V16>  // one overload now replaces the i32 and u32 versions
+HWY_API VFromD<D> ReorderWidenMulAccumulate(D d32, V16 a, V16 b,
+                                            const VFromD<D> sum0,
+                                            VFromD<D>& sum1) {  // sum1 is updated in place
+  sum1 = MulAdd(PromoteOddTo(d32, a), PromoteOddTo(d32, b), sum1);  // odd-lane products go into sum1
+  return MulAdd(PromoteEvenTo(d32, a), PromoteEvenTo(d32, b), sum0);  // even-lane products go into the result
 }
 // ------------------------------ RearrangeToOddPlusEven
@@ -2866,18 +2877,20 @@ HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
 // ------------------------------ MulEven/Odd 64x64 (UpperHalf)
-HWY_INLINE Vec128<uint64_t> MulEven(Vec128<uint64_t> a, Vec128<uint64_t> b) {
-  alignas(16) uint64_t mul[2];
+template <class T, HWY_IF_UI64(T)>  // templated on T in place of the uint64_t-only signature
+HWY_API Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) {
+  alignas(16) T mul[2];  // two-element buffer later loaded as one full vector
   mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]);  // return value fills mul[0]; mul[1] is written via the pointer
-  return Load(Full128<uint64_t>(), mul);
+  return Load(Full128<T>(), mul);  // both elements become the result vector
 }
-HWY_INLINE Vec128<uint64_t> MulOdd(Vec128<uint64_t> a, Vec128<uint64_t> b) {
-  alignas(16) uint64_t mul[2];
-  const Half<Full128<uint64_t>> d2;
+template <class T, HWY_IF_UI64(T)>  // templated on T in place of the uint64_t-only signature
+HWY_API Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) {
+  alignas(16) T mul[2];  // two-element buffer later loaded as one full vector
+  const Half<Full128<T>> d2;  // half-width tag passed to UpperHalf
   mul[0] =
       Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]);  // operands come from the upper halves of a and b
-  return Load(Full128<uint64_t>(), mul);
+  return Load(Full128<T>(), mul);  // both elements become the result vector
 }
 // NOLINTNEXTLINE(google-readability-namespace-comments)