npm - @img/sharp-libvips-dev - Versions diffs - 1.0.1 → 1.0.2 - Mend

@img/sharp-libvips-dev 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (85) hide show

package/include/expat.h +21 -10
package/include/expat_config.h +11 -5
package/include/ffi.h +12 -25
package/include/freetype2/freetype/config/ftoption.h +1 -1
package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -2
package/include/glib-2.0/gio/gapplication.h +6 -0
package/include/glib-2.0/gio/giotypes.h +0 -1
package/include/glib-2.0/girepository/giarginfo.h +23 -6
package/include/glib-2.0/girepository/gibaseinfo.h +44 -18
package/include/glib-2.0/girepository/gicallableinfo.h +26 -16
package/include/glib-2.0/girepository/gicallbackinfo.h +17 -2
package/include/glib-2.0/girepository/giconstantinfo.h +19 -4
package/include/glib-2.0/girepository/gienuminfo.h +20 -21
package/include/glib-2.0/girepository/gifieldinfo.h +22 -7
package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
package/include/glib-2.0/girepository/gifunctioninfo.h +22 -7
package/include/glib-2.0/girepository/giinterfaceinfo.h +33 -18
package/include/glib-2.0/girepository/giobjectinfo.h +41 -26
package/include/glib-2.0/girepository/gipropertyinfo.h +18 -3
package/include/glib-2.0/girepository/giregisteredtypeinfo.h +22 -11
package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
package/include/glib-2.0/girepository/girepository.h +53 -62
package/include/glib-2.0/girepository/girffi.h +8 -7
package/include/glib-2.0/girepository/gisignalinfo.h +18 -3
package/include/glib-2.0/girepository/gistructinfo.h +26 -11
package/include/glib-2.0/girepository/gitypeinfo.h +29 -16
package/include/glib-2.0/girepository/gitypelib.h +9 -13
package/include/glib-2.0/girepository/gitypes.h +52 -104
package/include/glib-2.0/girepository/giunioninfo.h +28 -12
package/include/glib-2.0/girepository/giunresolvedinfo.h +17 -2
package/include/glib-2.0/girepository/givalueinfo.h +65 -0
package/include/glib-2.0/girepository/givfuncinfo.h +23 -8
package/include/glib-2.0/glib/deprecated/gthread.h +9 -5
package/include/glib-2.0/glib/gbitlock.h +31 -0
package/include/glib-2.0/glib/gmessages.h +8 -0
package/include/glib-2.0/glib/gslice.h +2 -0
package/include/glib-2.0/glib/gstrfuncs.h +24 -18
package/include/glib-2.0/glib/gthread.h +191 -3
package/include/glib-2.0/glib-unix.h +7 -1
package/include/glib-2.0/gobject/genums.h +6 -6
package/include/glib-2.0/gobject/glib-types.h +11 -0
package/include/glib-2.0/gobject/gsignal.h +16 -6
package/include/hwy/aligned_allocator.h +171 -6
package/include/hwy/base.h +1765 -543
package/include/hwy/cache_control.h +24 -6
package/include/hwy/detect_compiler_arch.h +23 -2
package/include/hwy/detect_targets.h +56 -13
package/include/hwy/foreach_target.h +24 -0
package/include/hwy/highway.h +20 -3
package/include/hwy/ops/arm_neon-inl.h +1086 -667
package/include/hwy/ops/arm_sve-inl.h +1091 -235
package/include/hwy/ops/emu128-inl.h +271 -196
package/include/hwy/ops/generic_ops-inl.h +2270 -399
package/include/hwy/ops/ppc_vsx-inl.h +1786 -563
package/include/hwy/ops/rvv-inl.h +1043 -311
package/include/hwy/ops/scalar-inl.h +189 -159
package/include/hwy/ops/set_macros-inl.h +66 -6
package/include/hwy/ops/shared-inl.h +175 -56
package/include/hwy/ops/wasm_128-inl.h +153 -136
package/include/hwy/ops/x86_128-inl.h +1647 -646
package/include/hwy/ops/x86_256-inl.h +1003 -370
package/include/hwy/ops/x86_512-inl.h +948 -353
package/include/hwy/per_target.h +4 -0
package/include/hwy/profiler.h +648 -0
package/include/hwy/robust_statistics.h +2 -2
package/include/hwy/targets.h +18 -11
package/include/hwy/timer.h +11 -0
package/include/libpng16/png.h +32 -29
package/include/libpng16/pngconf.h +2 -2
package/include/libpng16/pnglibconf.h +7 -2
package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
package/include/libxml2/libxml/parser.h +16 -7
package/include/libxml2/libxml/xmlIO.h +0 -1
package/include/libxml2/libxml/xmlversion.h +4 -4
package/include/pango-1.0/pango/pango-features.h +3 -3
package/include/pango-1.0/pango/pango-fontmap.h +7 -0
package/include/pixman-1/pixman-version.h +2 -2
package/include/png.h +32 -29
package/include/pngconf.h +2 -2
package/include/pnglibconf.h +7 -2
package/include/vips/connection.h +9 -3
package/include/vips/util.h +0 -9
package/include/vips/version.h +4 -4
package/package.json +1 -1
package/versions.json +11 -11

package/include/hwy/base.h CHANGED Viewed

@@ -22,16 +22,15 @@
 #include <stddef.h>
 #include <stdint.h>
-// Wrapping this into a HWY_HAS_INCLUDE causes clang-format to fail.
-#if __cplusplus >= 202100L && defined(__has_include)
-#if __has_include(<stdfloat>)
-#include <stdfloat>  // std::float16_t
-#endif
-#endif
 #include "hwy/detect_compiler_arch.h"
 #include "hwy/highway_export.h"
+#if HWY_COMPILER_MSVC && defined(_MSVC_LANG) && _MSVC_LANG > __cplusplus
+#define HWY_CXX_LANG _MSVC_LANG
+#else
+#define HWY_CXX_LANG __cplusplus
+#endif
 // "IWYU pragma: keep" does not work for these includes, so hide from the IDE.
 #if !HWY_IDE
@@ -48,6 +47,15 @@
 #endif  // !HWY_IDE
+#if !defined(HWY_NO_LIBCXX) && HWY_CXX_LANG > 201703L &&                    \
+    __cpp_impl_three_way_comparison >= 201907L && defined(__has_include) && \
+    !defined(HWY_DISABLE_CXX20_THREE_WAY_COMPARE)
+#if __has_include(<compare>)
+#include <compare>
+#define HWY_HAVE_CXX20_THREE_WAY_COMPARE 1
+#endif
+#endif
 // IWYU pragma: end_exports
 #if HWY_COMPILER_MSVC
@@ -131,6 +139,10 @@ namespace hwy {
 #define HWY_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */
 #endif
+// Special case to increase the required alignment
+#define HWY_RCAST_ALIGNED(type, ptr) \
+  reinterpret_cast<type>(HWY_ASSUME_ALIGNED((ptr), alignof(type)))
 // Clang and GCC require attributes on each function into which SIMD intrinsics
 // are inlined. Support both per-function annotation (HWY_ATTR) for lambdas and
 // automatic annotation via pragmas.
@@ -274,6 +286,16 @@ HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
   } while (0)
 #endif
+#if __cpp_constexpr >= 201304L
+#define HWY_CXX14_CONSTEXPR constexpr
+#else
+#define HWY_CXX14_CONSTEXPR
+#endif
+#ifndef HWY_HAVE_CXX20_THREE_WAY_COMPARE
+#define HWY_HAVE_CXX20_THREE_WAY_COMPARE 0
+#endif
 //------------------------------------------------------------------------------
 // CopyBytes / ZeroBytes
@@ -288,8 +310,7 @@ HWY_API void CopyBytes(const From* from, To* to) {
 #if HWY_COMPILER_MSVC
   memcpy(to, from, kBytes);
 #else
-  __builtin_memcpy(static_cast<void*>(to), static_cast<const void*>(from),
-                   kBytes);
+  __builtin_memcpy(to, from, kBytes);
 #endif
 }
@@ -357,349 +378,11 @@ static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
 //------------------------------------------------------------------------------
 // Lane types
-#pragma pack(push, 1)
-// float16_t load/store/conversion intrinsics are always supported on Armv8 and
-// VFPv4 (except with MSVC). On Armv7 Clang requires __ARM_FP & 2; GCC requires
-// -mfp16-format=ieee.
-#if (HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC) ||                    \
-    (HWY_COMPILER_CLANG && defined(__ARM_FP) && (__ARM_FP & 2)) || \
-    (HWY_COMPILER_GCC_ACTUAL && defined(__ARM_FP16_FORMAT_IEEE))
-#define HWY_NEON_HAVE_FLOAT16C 1
-#else
-#define HWY_NEON_HAVE_FLOAT16C 0
-#endif
-// C11 extension ISO/IEC TS 18661-3:2015 but not supported on all targets.
-// Required if HWY_HAVE_FLOAT16, i.e. RVV with zvfh or AVX3_SPR (with
-// sufficiently new compiler supporting avx512fp16). Do not use on clang-cl,
-// which is missing __extendhfsf2.
-#if ((HWY_ARCH_RVV && defined(__riscv_zvfh) && HWY_COMPILER_CLANG) || \
-     (HWY_ARCH_X86 && defined(__SSE2__) &&                            \
-      ((HWY_COMPILER_CLANG >= 1600 && !HWY_COMPILER_CLANGCL) ||       \
-       HWY_COMPILER_GCC_ACTUAL >= 1200)))
-#define HWY_HAVE_C11_FLOAT16 1
-#else
-#define HWY_HAVE_C11_FLOAT16 0
-#endif
-// If 1, both __bf16 and a limited set of *_bf16 SVE intrinsics are available:
-// create/get/set/dup, ld/st, sel, rev, trn, uzp, zip.
-#if HWY_ARCH_ARM_A64 && defined(__ARM_FEATURE_SVE_BF16)
-#define HWY_SVE_HAVE_BFLOAT16 1
-#else
-#define HWY_SVE_HAVE_BFLOAT16 0
-#endif
-// Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
-// by concatenating base type and bits. We use a wrapper class instead of a
-// typedef to the native type to ensure that the same symbols, e.g. for VQSort,
-// are generated regardless of F16 support; see #1684.
-struct float16_t {
-#if HWY_NEON_HAVE_FLOAT16C  // ACLE's __fp16
-  using Raw = __fp16;
-#elif HWY_HAVE_C11_FLOAT16                                    // C11 _Float16
-  using Raw = _Float16;
-#elif __cplusplus > 202002L && defined(__STDCPP_FLOAT16_T__)  // C++23
-  using Raw = std::float16_t;
-#else
-#define HWY_EMULATE_FLOAT16
-  using Raw = uint16_t;
-  Raw bits;
-#endif  // float16_t
-// When backed by a native type, ensure the wrapper behaves like the native
-// type by forwarding all operators. Unfortunately it seems difficult to reuse
-// this code in a base class, so we repeat it in bfloat16_t.
-#ifndef HWY_EMULATE_FLOAT16
-  Raw raw;
-  float16_t() noexcept = default;
-  template <typename T>
-  constexpr float16_t(T arg) noexcept : raw(static_cast<Raw>(arg)) {}
-  float16_t& operator=(Raw arg) noexcept {
-    raw = arg;
-    return *this;
-  }
-  constexpr float16_t(const float16_t&) noexcept = default;
-  float16_t& operator=(const float16_t&) noexcept = default;
-  constexpr operator Raw() const noexcept { return raw; }
-  template <typename T>
-  float16_t& operator+=(T rhs) noexcept {
-    raw = static_cast<Raw>(raw + rhs);
-    return *this;
-  }
-  template <typename T>
-  float16_t& operator-=(T rhs) noexcept {
-    raw = static_cast<Raw>(raw - rhs);
-    return *this;
-  }
-  template <typename T>
-  float16_t& operator*=(T rhs) noexcept {
-    raw = static_cast<Raw>(raw * rhs);
-    return *this;
-  }
-  template <typename T>
-  float16_t& operator/=(T rhs) noexcept {
-    raw = static_cast<Raw>(raw / rhs);
-    return *this;
-  }
-  float16_t operator--() noexcept {
-    raw = static_cast<Raw>(raw - Raw{1});
-    return *this;
-  }
-  float16_t operator--(int) noexcept {
-    raw = static_cast<Raw>(raw - Raw{1});
-    return *this;
-  }
-  float16_t operator++() noexcept {
-    raw = static_cast<Raw>(raw + Raw{1});
-    return *this;
-  }
-  float16_t operator++(int) noexcept {
-    raw = static_cast<Raw>(raw + Raw{1});
-    return *this;
-  }
-  constexpr float16_t operator-() const noexcept {
-    return float16_t(static_cast<Raw>(-raw));
-  }
-  constexpr float16_t operator+() const noexcept { return *this; }
-#endif  // HWY_EMULATE_FLOAT16
-};
-#ifndef HWY_EMULATE_FLOAT16
-constexpr inline bool operator==(float16_t lhs, float16_t rhs) noexcept {
-  return lhs.raw == rhs.raw;
-}
-constexpr inline bool operator!=(float16_t lhs, float16_t rhs) noexcept {
-  return lhs.raw != rhs.raw;
-}
-constexpr inline bool operator<(float16_t lhs, float16_t rhs) noexcept {
-  return lhs.raw < rhs.raw;
-}
-constexpr inline bool operator<=(float16_t lhs, float16_t rhs) noexcept {
-  return lhs.raw <= rhs.raw;
-}
-constexpr inline bool operator>(float16_t lhs, float16_t rhs) noexcept {
-  return lhs.raw > rhs.raw;
-}
-constexpr inline bool operator>=(float16_t lhs, float16_t rhs) noexcept {
-  return lhs.raw >= rhs.raw;
-}
-#endif  // HWY_EMULATE_FLOAT16
-struct bfloat16_t {
-#if HWY_SVE_HAVE_BFLOAT16
-  using Raw = __bf16;
-#elif __cplusplus >= 202100L && defined(__STDCPP_BFLOAT16_T__)  // C++23
-  using Raw = std::bfloat16_t;
-#else
-#define HWY_EMULATE_BFLOAT16
-  using Raw = uint16_t;
-  Raw bits;
-#endif
-#ifndef HWY_EMULATE_BFLOAT16
-  Raw raw;
-  bfloat16_t() noexcept = default;
-  template <typename T>
-  constexpr bfloat16_t(T arg) noexcept : raw(static_cast<Raw>(arg)) {}
-  bfloat16_t& operator=(Raw arg) noexcept {
-    raw = arg;
-    return *this;
-  }
-  constexpr bfloat16_t(const bfloat16_t&) noexcept = default;
-  bfloat16_t& operator=(const bfloat16_t&) noexcept = default;
-  constexpr operator Raw() const noexcept { return raw; }
-  template <typename T>
-  bfloat16_t& operator+=(T rhs) noexcept {
-    raw = static_cast<Raw>(raw + rhs);
-    return *this;
-  }
-  template <typename T>
-  bfloat16_t& operator-=(T rhs) noexcept {
-    raw = static_cast<Raw>(raw - rhs);
-    return *this;
-  }
-  template <typename T>
-  bfloat16_t& operator*=(T rhs) noexcept {
-    raw = static_cast<Raw>(raw * rhs);
-    return *this;
-  }
-  template <typename T>
-  bfloat16_t& operator/=(T rhs) noexcept {
-    raw = static_cast<Raw>(raw / rhs);
-    return *this;
-  }
-  bfloat16_t operator--() noexcept {
-    raw = static_cast<Raw>(raw - Raw{1});
-    return *this;
-  }
-  bfloat16_t operator--(int) noexcept {
-    raw = static_cast<Raw>(raw - Raw{1});
-    return *this;
-  }
-  bfloat16_t operator++() noexcept {
-    raw = static_cast<Raw>(raw + Raw{1});
-    return *this;
-  }
-  bfloat16_t operator++(int) noexcept {
-    raw = static_cast<Raw>(raw + Raw{1});
-    return *this;
-  }
-  constexpr bfloat16_t operator-() const noexcept {
-    return bfloat16_t(static_cast<Raw>(-raw));
-  }
-  constexpr bfloat16_t operator+() const noexcept { return *this; }
-#endif  // HWY_EMULATE_BFLOAT16
-};
-#ifndef HWY_EMULATE_BFLOAT16
-constexpr inline bool operator==(bfloat16_t lhs, bfloat16_t rhs) noexcept {
-  return lhs.raw == rhs.raw;
-}
-constexpr inline bool operator!=(bfloat16_t lhs, bfloat16_t rhs) noexcept {
-  return lhs.raw != rhs.raw;
-}
-constexpr inline bool operator<(bfloat16_t lhs, bfloat16_t rhs) noexcept {
-  return lhs.raw < rhs.raw;
-}
-constexpr inline bool operator<=(bfloat16_t lhs, bfloat16_t rhs) noexcept {
-  return lhs.raw <= rhs.raw;
-}
-constexpr inline bool operator>(bfloat16_t lhs, bfloat16_t rhs) noexcept {
-  return lhs.raw > rhs.raw;
-}
-constexpr inline bool operator>=(bfloat16_t lhs, bfloat16_t rhs) noexcept {
-  return lhs.raw >= rhs.raw;
-}
-#endif  // HWY_EMULATE_BFLOAT16
-#pragma pack(pop)
-HWY_API float F32FromF16(float16_t f16) {
-#ifdef HWY_EMULATE_FLOAT16
-  uint16_t bits16;
-  CopySameSize(&f16, &bits16);
-  const uint32_t sign = static_cast<uint32_t>(bits16 >> 15);
-  const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
-  const uint32_t mantissa = bits16 & 0x3FF;
-  // Subnormal or zero
-  if (biased_exp == 0) {
-    const float subnormal =
-        (1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024));
-    return sign ? -subnormal : subnormal;
-  }
-  // Normalized: convert the representation directly (faster than ldexp/tables).
-  const uint32_t biased_exp32 = biased_exp + (127 - 15);
-  const uint32_t mantissa32 = mantissa << (23 - 10);
-  const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
-  float result;
-  CopySameSize(&bits32, &result);
-  return result;
-#else
-  return static_cast<float>(f16);
-#endif
-}
-HWY_API float16_t F16FromF32(float f32) {
-#ifdef HWY_EMULATE_FLOAT16
-  uint32_t bits32;
-  CopySameSize(&f32, &bits32);
-  const uint32_t sign = bits32 >> 31;
-  const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
-  const uint32_t mantissa32 = bits32 & 0x7FFFFF;
-  const int32_t exp = HWY_MIN(static_cast<int32_t>(biased_exp32) - 127, 15);
-  // Tiny or zero => zero.
-  float16_t out;
-  if (exp < -24) {
-    // restore original sign
-    const uint16_t bits = static_cast<uint16_t>(sign << 15);
-    CopySameSize(&bits, &out);
-    return out;
-  }
-  uint32_t biased_exp16, mantissa16;
-  // exp = [-24, -15] => subnormal
-  if (exp < -14) {
-    biased_exp16 = 0;
-    const uint32_t sub_exp = static_cast<uint32_t>(-14 - exp);
-    HWY_DASSERT(1 <= sub_exp && sub_exp < 11);
-    mantissa16 = static_cast<uint32_t>((1u << (10 - sub_exp)) +
-                                       (mantissa32 >> (13 + sub_exp)));
-  } else {
-    // exp = [-14, 15]
-    biased_exp16 = static_cast<uint32_t>(exp + 15);
-    HWY_DASSERT(1 <= biased_exp16 && biased_exp16 < 31);
-    mantissa16 = mantissa32 >> 13;
-  }
-  HWY_DASSERT(mantissa16 < 1024);
-  const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
-  HWY_DASSERT(bits16 < 0x10000);
-  const uint16_t narrowed = static_cast<uint16_t>(bits16);  // big-endian safe
-  CopySameSize(&narrowed, &out);
-  return out;
-#else
-  return float16_t(static_cast<float16_t::Raw>(f32));
-#endif
-}
-HWY_API float F32FromBF16(bfloat16_t bf) {
-  uint16_t bits16;
-  CopyBytes<2>(&bf, &bits16);
-  uint32_t bits = bits16;
-  bits <<= 16;
-  float f;
-  CopySameSize(&bits, &f);
-  return f;
-}
-HWY_API float F32FromF16Mem(const void* ptr) {
-  float16_t f16;
-  CopyBytes<2>(ptr, &f16);
-  return F32FromF16(f16);
-}
-HWY_API float F32FromBF16Mem(const void* ptr) {
-  bfloat16_t bf;
-  CopyBytes<2>(ptr, &bf);
-  return F32FromBF16(bf);
-}
-HWY_API bfloat16_t BF16FromF32(float f) {
-  uint32_t bits;
-  CopySameSize(&f, &bits);
-  const uint16_t bits16 = static_cast<uint16_t>(bits >> 16);
-  bfloat16_t bf;
-  CopySameSize(&bits16, &bf);
-  return bf;
-}
+// hwy::float16_t and hwy::bfloat16_t are forward-declared here so that
+// BitCastScalar can be implemented before the definitions of the
+// hwy::float16_t and hwy::bfloat16_t types.
+struct float16_t;
+struct bfloat16_t;
 using float32_t = float;
 using float64_t = double;
@@ -729,24 +412,6 @@ struct alignas(8) K32V32 {
 #pragma pack(pop)
-#ifdef HWY_EMULATE_FLOAT16
-static inline HWY_MAYBE_UNUSED bool operator<(const float16_t& a,
-                                              const float16_t& b) {
-  return F32FromF16(a) < F32FromF16(b);
-}
-// Required for std::greater.
-static inline HWY_MAYBE_UNUSED bool operator>(const float16_t& a,
-                                              const float16_t& b) {
-  return F32FromF16(a) > F32FromF16(b);
-}
-static inline HWY_MAYBE_UNUSED bool operator==(const float16_t& a,
-                                               const float16_t& b) {
-  return F32FromF16(a) == F32FromF16(b);
-}
-#endif  // HWY_EMULATE_FLOAT16
 static inline HWY_MAYBE_UNUSED bool operator<(const uint128_t& a,
                                               const uint128_t& b) {
   return (a.hi == b.hi) ? a.lo < b.lo : a.hi < b.hi;
@@ -817,106 +482,1378 @@ HWY_API constexpr bool IsSame() {
   return IsSameT<T, U>::value;
 }
+// Returns whether T matches either of U1 or U2
+template <typename T, typename U1, typename U2>
+HWY_API constexpr bool IsSameEither() {
+  return IsSameT<T, U1>::value || IsSameT<T, U2>::value;
+}
 template <bool Condition, typename Then, typename Else>
 struct IfT {
   using type = Then;
 };
-template <class Then, class Else>
-struct IfT<false, Then, Else> {
-  using type = Else;
-};
+template <class Then, class Else>
+struct IfT<false, Then, Else> {
+  using type = Else;
+};
+template <bool Condition, typename Then, typename Else>
+using If = typename IfT<Condition, Then, Else>::type;
+template <typename T>
+struct IsConstT {
+  enum { value = 0 };
+};
+template <typename T>
+struct IsConstT<const T> {
+  enum { value = 1 };
+};
+template <typename T>
+HWY_API constexpr bool IsConst() {
+  return IsConstT<T>::value;
+}
+template <class T>
+struct RemoveConstT {
+  using type = T;
+};
+template <class T>
+struct RemoveConstT<const T> {
+  using type = T;
+};
+template <class T>
+using RemoveConst = typename RemoveConstT<T>::type;
+template <class T>
+struct RemoveVolatileT {
+  using type = T;
+};
+template <class T>
+struct RemoveVolatileT<volatile T> {
+  using type = T;
+};
+template <class T>
+using RemoveVolatile = typename RemoveVolatileT<T>::type;
+template <class T>
+struct RemoveRefT {
+  using type = T;
+};
+template <class T>
+struct RemoveRefT<T&> {
+  using type = T;
+};
+template <class T>
+struct RemoveRefT<T&&> {
+  using type = T;
+};
+template <class T>
+using RemoveRef = typename RemoveRefT<T>::type;
+template <class T>
+using RemoveCvRef = RemoveConst<RemoveVolatile<RemoveRef<T>>>;
+// Insert into template/function arguments to enable this overload only for
+// vectors of exactly, at most (LE), or more than (GT) this many bytes.
+//
+// As an example, checking for a total size of 16 bytes will match both
+// Simd<uint8_t, 16, 0> and Simd<uint8_t, 8, 1>.
+#define HWY_IF_V_SIZE(T, kN, bytes) \
+  hwy::EnableIf<kN * sizeof(T) == bytes>* = nullptr
+#define HWY_IF_V_SIZE_LE(T, kN, bytes) \
+  hwy::EnableIf<kN * sizeof(T) <= bytes>* = nullptr
+#define HWY_IF_V_SIZE_GT(T, kN, bytes) \
+  hwy::EnableIf<(kN * sizeof(T) > bytes)>* = nullptr
+#define HWY_IF_LANES(kN, lanes) hwy::EnableIf<(kN == lanes)>* = nullptr
+#define HWY_IF_LANES_LE(kN, lanes) hwy::EnableIf<(kN <= lanes)>* = nullptr
+#define HWY_IF_LANES_GT(kN, lanes) hwy::EnableIf<(kN > lanes)>* = nullptr
+#define HWY_IF_UNSIGNED(T) hwy::EnableIf<!hwy::IsSigned<T>()>* = nullptr
+#define HWY_IF_SIGNED(T)                                    \
+  hwy::EnableIf<hwy::IsSigned<T>() && !hwy::IsFloat<T>() && \
+                !hwy::IsSpecialFloat<T>()>* = nullptr
+#define HWY_IF_FLOAT(T) hwy::EnableIf<hwy::IsFloat<T>()>* = nullptr
+#define HWY_IF_NOT_FLOAT(T) hwy::EnableIf<!hwy::IsFloat<T>()>* = nullptr
+#define HWY_IF_FLOAT3264(T) hwy::EnableIf<hwy::IsFloat3264<T>()>* = nullptr
+#define HWY_IF_NOT_FLOAT3264(T) hwy::EnableIf<!hwy::IsFloat3264<T>()>* = nullptr
+#define HWY_IF_SPECIAL_FLOAT(T) \
+  hwy::EnableIf<hwy::IsSpecialFloat<T>()>* = nullptr
+#define HWY_IF_NOT_SPECIAL_FLOAT(T) \
+  hwy::EnableIf<!hwy::IsSpecialFloat<T>()>* = nullptr
+#define HWY_IF_FLOAT_OR_SPECIAL(T) \
+  hwy::EnableIf<hwy::IsFloat<T>() || hwy::IsSpecialFloat<T>()>* = nullptr
+#define HWY_IF_NOT_FLOAT_NOR_SPECIAL(T) \
+  hwy::EnableIf<!hwy::IsFloat<T>() && !hwy::IsSpecialFloat<T>()>* = nullptr
+#define HWY_IF_INTEGER(T) hwy::EnableIf<hwy::IsInteger<T>()>* = nullptr
+#define HWY_IF_T_SIZE(T, bytes) hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
+#define HWY_IF_NOT_T_SIZE(T, bytes) \
+  hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
+// bit_array = 0x102 means 1 or 8 bytes. There is no NONE_OF because it sounds
+// too similar. If you want the opposite of this (2 or 4 bytes), ask for those
+// bits explicitly (0x14) instead of attempting to 'negate' 0x102.
+#define HWY_IF_T_SIZE_ONE_OF(T, bit_array) \
+  hwy::EnableIf<((size_t{1} << sizeof(T)) & (bit_array)) != 0>* = nullptr
+#define HWY_IF_T_SIZE_LE(T, bytes) \
+  hwy::EnableIf<(sizeof(T) <= (bytes))>* = nullptr
+#define HWY_IF_T_SIZE_GT(T, bytes) \
+  hwy::EnableIf<(sizeof(T) > (bytes))>* = nullptr
+#define HWY_IF_SAME(T, expected) \
+  hwy::EnableIf<hwy::IsSame<hwy::RemoveCvRef<T>, expected>()>* = nullptr
+#define HWY_IF_NOT_SAME(T, expected) \
+  hwy::EnableIf<!hwy::IsSame<hwy::RemoveCvRef<T>, expected>()>* = nullptr
+// One of two expected types
+#define HWY_IF_SAME2(T, expected1, expected2)                            \
+  hwy::EnableIf<                                                         \
+      hwy::IsSameEither<hwy::RemoveCvRef<T>, expected1, expected2>()>* = \
+      nullptr
+#define HWY_IF_U8(T) HWY_IF_SAME(T, uint8_t)
+#define HWY_IF_U16(T) HWY_IF_SAME(T, uint16_t)
+#define HWY_IF_U32(T) HWY_IF_SAME(T, uint32_t)
+#define HWY_IF_U64(T) HWY_IF_SAME(T, uint64_t)
+#define HWY_IF_I8(T) HWY_IF_SAME(T, int8_t)
+#define HWY_IF_I16(T) HWY_IF_SAME(T, int16_t)
+#define HWY_IF_I32(T) HWY_IF_SAME(T, int32_t)
+#define HWY_IF_I64(T) HWY_IF_SAME(T, int64_t)
+#define HWY_IF_BF16(T) HWY_IF_SAME(T, hwy::bfloat16_t)
+#define HWY_IF_NOT_BF16(T) HWY_IF_NOT_SAME(T, hwy::bfloat16_t)
+#define HWY_IF_F16(T) HWY_IF_SAME(T, hwy::float16_t)
+#define HWY_IF_NOT_F16(T) HWY_IF_NOT_SAME(T, hwy::float16_t)
+#define HWY_IF_F32(T) HWY_IF_SAME(T, float)
+#define HWY_IF_F64(T) HWY_IF_SAME(T, double)
+// Use instead of HWY_IF_T_SIZE to avoid ambiguity with float16_t/float/double
+// overloads.
+#define HWY_IF_UI8(T) HWY_IF_SAME2(T, uint8_t, int8_t)
+#define HWY_IF_UI16(T) HWY_IF_SAME2(T, uint16_t, int16_t)
+#define HWY_IF_UI32(T) HWY_IF_SAME2(T, uint32_t, int32_t)
+#define HWY_IF_UI64(T) HWY_IF_SAME2(T, uint64_t, int64_t)
+#define HWY_IF_LANES_PER_BLOCK(T, N, LANES) \
+  hwy::EnableIf<HWY_MIN(sizeof(T) * N, 16) / sizeof(T) == (LANES)>* = nullptr
+// Empty struct used as a size tag type.
+template <size_t N>
+struct SizeTag {};
+template <class T>
+class DeclValT {
+ private:
+  template <class U, class URef = U&&>
+  static URef TryAddRValRef(int);
+  template <class U, class Arg>
+  static U TryAddRValRef(Arg);
+ public:
+  using type = decltype(TryAddRValRef<T>(0));
+  enum { kDisableDeclValEvaluation = 1 };
+};
+// hwy::DeclVal<T>() can only be used in unevaluated contexts such as within an
+// expression of a decltype specifier.
+// hwy::DeclVal<T>() does not require that T have a public default constructor
+template <class T>
+HWY_API typename DeclValT<T>::type DeclVal() noexcept {
+  static_assert(!DeclValT<T>::kDisableDeclValEvaluation,
+                "DeclVal() cannot be used in an evaluated context");
+}
+template <class T>
+struct IsArrayT {
+  enum { value = 0 };
+};
+template <class T>
+struct IsArrayT<T[]> {
+  enum { value = 1 };
+};
+template <class T, size_t N>
+struct IsArrayT<T[N]> {
+  enum { value = 1 };
+};
+template <class T>
+static constexpr bool IsArray() {
+  return IsArrayT<T>::value;
+}
+#if HWY_COMPILER_MSVC
+HWY_DIAGNOSTICS(push)
+HWY_DIAGNOSTICS_OFF(disable : 4180, ignored "-Wignored-qualifiers")
+#endif
+template <class From, class To>
+class IsConvertibleT {
+ private:
+  template <class T>
+  static hwy::SizeTag<1> TestFuncWithToArg(T);
+  template <class T, class U>
+  static decltype(IsConvertibleT<T, U>::template TestFuncWithToArg<U>(
+      DeclVal<T>()))
+  TryConvTest(int);
+  template <class T, class U, class Arg>
+  static hwy::SizeTag<0> TryConvTest(Arg);
+ public:
+  enum {
+    value = (IsSame<RemoveConst<RemoveVolatile<From>>, void>() &&
+             IsSame<RemoveConst<RemoveVolatile<To>>, void>()) ||
+            (!IsArray<To>() &&
+             (IsSame<To, decltype(DeclVal<To>())>() ||
+              !IsSame<const RemoveConst<To>, RemoveConst<To>>()) &&
+             IsSame<decltype(TryConvTest<From, To>(0)), hwy::SizeTag<1>>())
+  };
+};
+#if HWY_COMPILER_MSVC
+HWY_DIAGNOSTICS(pop)
+#endif
+template <class From, class To>
+HWY_API constexpr bool IsConvertible() {
+  return IsConvertibleT<From, To>::value;
+}
+template <class From, class To>
+class IsStaticCastableT {
+ private:
+  template <class T, class U, class = decltype(static_cast<U>(DeclVal<T>()))>
+  static hwy::SizeTag<1> TryStaticCastTest(int);
+  template <class T, class U, class Arg>
+  static hwy::SizeTag<0> TryStaticCastTest(Arg);
+ public:
+  enum {
+    value = IsSame<decltype(TryStaticCastTest<From, To>(0)), hwy::SizeTag<1>>()
+  };
+};
+template <class From, class To>
+static constexpr bool IsStaticCastable() {
+  return IsStaticCastableT<From, To>::value;
+}
+#define HWY_IF_CASTABLE(From, To) \
+  hwy::EnableIf<IsStaticCastable<From, To>()>* = nullptr
+#define HWY_IF_OP_CASTABLE(op, T, Native) \
+  HWY_IF_CASTABLE(decltype(DeclVal<Native>() op DeclVal<T>()), Native)
+template <class T, class From>
+class IsAssignableT {
+ private:
+  template <class T1, class T2, class = decltype(DeclVal<T1>() = DeclVal<T2>())>
+  static hwy::SizeTag<1> TryAssignTest(int);
+  template <class T1, class T2, class Arg>
+  static hwy::SizeTag<0> TryAssignTest(Arg);
+ public:
+  enum {
+    value = IsSame<decltype(TryAssignTest<T, From>(0)), hwy::SizeTag<1>>()
+  };
+};
+template <class T, class From>
+static constexpr bool IsAssignable() {
+  return IsAssignableT<T, From>::value;
+}
+#define HWY_IF_ASSIGNABLE(T, From) \
+  hwy::EnableIf<IsAssignable<T, From>()>* = nullptr
+// ----------------------------------------------------------------------------
+// IsSpecialFloat
+// These types are often special-cased and not supported in all ops.
+template <typename T>
+HWY_API constexpr bool IsSpecialFloat() {
+  return IsSameEither<RemoveCvRef<T>, hwy::float16_t, hwy::bfloat16_t>();
+}
+// -----------------------------------------------------------------------------
+// IsIntegerLaneType and IsInteger
+template <class T>
+HWY_API constexpr bool IsIntegerLaneType() {
+  return false;
+}
+template <>
+HWY_INLINE constexpr bool IsIntegerLaneType<int8_t>() {
+  return true;
+}
+template <>
+HWY_INLINE constexpr bool IsIntegerLaneType<uint8_t>() {
+  return true;
+}
+template <>
+HWY_INLINE constexpr bool IsIntegerLaneType<int16_t>() {
+  return true;
+}
+template <>
+HWY_INLINE constexpr bool IsIntegerLaneType<uint16_t>() {
+  return true;
+}
+template <>
+HWY_INLINE constexpr bool IsIntegerLaneType<int32_t>() {
+  return true;
+}
+template <>
+HWY_INLINE constexpr bool IsIntegerLaneType<uint32_t>() {
+  return true;
+}
+template <>
+HWY_INLINE constexpr bool IsIntegerLaneType<int64_t>() {
+  return true;
+}
+template <>
+HWY_INLINE constexpr bool IsIntegerLaneType<uint64_t>() {
+  return true;
+}
+template <class T>
+HWY_API constexpr bool IsInteger() {
+  // NOTE: Do not add an IsInteger<wchar_t>() specialization below as it is
+  // possible for IsSame<wchar_t, uint16_t>() to be true when compiled with MSVC
+  // with the /Zc:wchar_t- option.
+  return IsIntegerLaneType<T>() || IsSame<RemoveCvRef<T>, wchar_t>() ||
+         IsSameEither<RemoveCvRef<T>, size_t, ptrdiff_t>() ||
+         IsSameEither<RemoveCvRef<T>, intptr_t, uintptr_t>();
+}
+template <>
+HWY_INLINE constexpr bool IsInteger<bool>() {
+  return true;
+}
+template <>
+HWY_INLINE constexpr bool IsInteger<char>() {
+  return true;
+}
+template <>
+HWY_INLINE constexpr bool IsInteger<signed char>() {
+  return true;
+}
+template <>
+HWY_INLINE constexpr bool IsInteger<unsigned char>() {
+  return true;
+}
+template <>
+HWY_INLINE constexpr bool IsInteger<short>() {  // NOLINT
+  return true;
+}
+template <>
+HWY_INLINE constexpr bool IsInteger<unsigned short>() {  // NOLINT
+  return true;
+}
+template <>
+HWY_INLINE constexpr bool IsInteger<int>() {
+  return true;
+}
+template <>
+HWY_INLINE constexpr bool IsInteger<unsigned>() {
+  return true;
+}
+template <>
+HWY_INLINE constexpr bool IsInteger<long>() {  // NOLINT
+  return true;
+}
+template <>
+HWY_INLINE constexpr bool IsInteger<unsigned long>() {  // NOLINT
+  return true;
+}
+template <>
+HWY_INLINE constexpr bool IsInteger<long long>() {  // NOLINT
+  return true;
+}
+template <>
+HWY_INLINE constexpr bool IsInteger<unsigned long long>() {  // NOLINT
+  return true;
+}
+#if defined(__cpp_char8_t) && __cpp_char8_t >= 201811L
+template <>
+HWY_INLINE constexpr bool IsInteger<char8_t>() {
+  return true;
+}
+#endif
+template <>
+HWY_INLINE constexpr bool IsInteger<char16_t>() {
+  return true;
+}
+template <>
+HWY_INLINE constexpr bool IsInteger<char32_t>() {
+  return true;
+}
+// -----------------------------------------------------------------------------
+// BitCastScalar
+#if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
+#define HWY_BITCASTSCALAR_CONSTEXPR constexpr
+#else
+#define HWY_BITCASTSCALAR_CONSTEXPR
+#endif
+#if __cpp_constexpr >= 201304L
+#define HWY_BITCASTSCALAR_CXX14_CONSTEXPR HWY_BITCASTSCALAR_CONSTEXPR
+#else
+#define HWY_BITCASTSCALAR_CXX14_CONSTEXPR
+#endif
+#if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
+namespace detail {
+template <class From>
+struct BitCastScalarSrcCastHelper {  // primary template: source is unchanged
+  static HWY_INLINE constexpr const From& CastSrcValRef(const From& val) {
+    return val;
+  }
+};
+#if HWY_COMPILER_CLANG >= 900 && HWY_COMPILER_CLANG < 1000
+// Workaround for Clang 9 constexpr __builtin_bit_cast bug (two overloads below)
+template <class To, class From,
+          hwy::EnableIf<hwy::IsInteger<RemoveCvRef<To>>() &&
+                        hwy::IsInteger<RemoveCvRef<From>>()>* = nullptr>
+static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR To
+BuiltinBitCastScalar(const From& val) {  // both To and From are integers
+  static_assert(sizeof(To) == sizeof(From),
+                "sizeof(To) == sizeof(From) must be true");
+  return static_cast<To>(val);  // same-size integer cast instead of the builtin
+}
+template <class To, class From,
+          hwy::EnableIf<!(hwy::IsInteger<RemoveCvRef<To>>() &&
+                          hwy::IsInteger<RemoveCvRef<From>>())>* = nullptr>
+static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR To
+BuiltinBitCastScalar(const From& val) {  // To or From is not an integer
+  return __builtin_bit_cast(To, val);
+}
+#endif  // HWY_COMPILER_CLANG >= 900 && HWY_COMPILER_CLANG < 1000
+}  // namespace detail
+template <class To, class From, HWY_IF_NOT_SPECIAL_FLOAT(To)>
+HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From& val) {
+  // The source is first passed through
+  // detail::BitCastScalarSrcCastHelper<RemoveCvRef<From>>::CastSrcValRef. If
+  // From is hwy::float16_t or hwy::bfloat16_t, that yields a reference to
+  // const typename From::Native& or const uint16_t&, which allows BitCastScalar
+  // from those types to be constexpr if To is not a pointer type, union type,
+  // or a struct/class containing a pointer, union, or reference subobject.
+#if HWY_COMPILER_CLANG >= 900 && HWY_COMPILER_CLANG < 1000
+  return detail::BuiltinBitCastScalar<To>(
+      detail::BitCastScalarSrcCastHelper<RemoveCvRef<From>>::CastSrcValRef(
+          val));
+#else  // not Clang 9: call the builtin directly
+  return __builtin_bit_cast(
+      To, detail::BitCastScalarSrcCastHelper<RemoveCvRef<From>>::CastSrcValRef(
+              val));
+#endif
+}
+template <class To, class From, HWY_IF_SPECIAL_FLOAT(To)>
+HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From& val) {
+  // If To is hwy::float16_t or hwy::bfloat16_t: first BitCastScalar val to
+  // uint16_t, then turn the uint16_t value into To using To::FromBits, because
+  // hwy::float16_t::FromBits and hwy::bfloat16_t::FromBits are guaranteed to
+  // be constexpr if the __builtin_bit_cast intrinsic is available.
+  return To::FromBits(BitCastScalar<uint16_t>(val));
+}
+#else
+template <class To, class From>  // fallback when no bit-cast builtin exists
+HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From& val) {
+  To result;  // default-initialized; passed to CopySameSize as destination
+  CopySameSize(&val, &result);  // presumably a same-size byte copy -- confirm
+  return result;
+}
+#endif
+//------------------------------------------------------------------------------
+// F16 lane type
+#pragma pack(push, 1)  // pack members of the structs below on 1-byte bounds
+// Compiler supports __fp16 and load/store/conversion NEON intrinsics, which are
+// included in Armv8 and VFPv4 (except with MSVC). On Armv7 Clang requires
+// __ARM_FP & 2 whereas Armv7 GCC requires -mfp16-format=ieee.
+#if (HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC) ||                    \
+    (HWY_COMPILER_CLANG && defined(__ARM_FP) && (__ARM_FP & 2)) || \
+    (HWY_COMPILER_GCC_ACTUAL && defined(__ARM_FP16_FORMAT_IEEE))
+#define HWY_NEON_HAVE_F16C 1
+#else  // none of the Arm conditions above hold
+#define HWY_NEON_HAVE_F16C 0
+#endif
+// RVV with f16 extension supports _Float16 and f16 vector ops. If set, implies
+// HWY_HAVE_FLOAT16.
+#if HWY_ARCH_RVV && defined(__riscv_zvfh) && HWY_COMPILER_CLANG >= 1600
+#define HWY_RVV_HAVE_F16_VEC 1
+#else  // not RVV with zvfh on Clang >= 16
+#define HWY_RVV_HAVE_F16_VEC 0
+#endif
+// x86 compiler supports _Float16, not necessarily with operators.
+// Avoid clang-cl because it lacks __extendhfsf2.
+#if HWY_ARCH_X86 && defined(__SSE2__) && defined(__FLT16_MAX__) && \
+    ((HWY_COMPILER_CLANG >= 1500 && !HWY_COMPILER_CLANGCL) ||      \
+     HWY_COMPILER_GCC_ACTUAL >= 1200)
+#define HWY_SSE2_HAVE_F16_TYPE 1
+#else  // no usable _Float16 on this x86 toolchain
+#define HWY_SSE2_HAVE_F16_TYPE 0
+#endif
+#ifndef HWY_HAVE_SCALAR_F16_TYPE  // a pre-existing definition takes precedence
+// Compiler supports _Float16 or __fp16, not necessarily with operators.
+#if HWY_NEON_HAVE_F16C || HWY_RVV_HAVE_F16_VEC || HWY_SSE2_HAVE_F16_TYPE
+#define HWY_HAVE_SCALAR_F16_TYPE 1
+#else  // float16_t will only carry uint16_t bits
+#define HWY_HAVE_SCALAR_F16_TYPE 0
+#endif
+#endif  // HWY_HAVE_SCALAR_F16_TYPE
+#ifndef HWY_HAVE_SCALAR_F16_OPERATORS  // a pre-existing definition wins
+// A recent enough compiler also has arithmetic operators on the native type.
+#if HWY_HAVE_SCALAR_F16_TYPE &&                                       \
+    (HWY_COMPILER_CLANG >= 1800 || HWY_COMPILER_GCC_ACTUAL >= 1200 || \
+     (HWY_COMPILER_CLANG >= 1500 && !HWY_COMPILER_CLANGCL &&          \
+      !defined(_WIN32)) ||                                            \
+     (HWY_ARCH_ARM &&                                                 \
+      (HWY_COMPILER_CLANG >= 900 || HWY_COMPILER_GCC_ACTUAL >= 800)))
+#define HWY_HAVE_SCALAR_F16_OPERATORS 1
+#else  // conversions and comparisons go through the bit-level code paths
+#define HWY_HAVE_SCALAR_F16_OPERATORS 0
+#endif
+#endif  // HWY_HAVE_SCALAR_F16_OPERATORS
+namespace detail {
+template <class T, class TVal = RemoveCvRef<T>, bool = IsSpecialFloat<TVal>()>
+struct SpecialFloatUnwrapArithOpOperandT {};  // no `type` for special floats
+template <class T, class TVal>
+struct SpecialFloatUnwrapArithOpOperandT<T, TVal, false> {  // not special
+  using type = T;  // operand is used as-is
+};
+template <class T>
+using SpecialFloatUnwrapArithOpOperand =
+    typename SpecialFloatUnwrapArithOpOperandT<T>::type;
+template <class T, class TVal = RemoveCvRef<T>>
+struct NativeSpecialFloatToWrapperT {  // primary template: identity mapping
+  using type = T;
+};
+template <class T>
+using NativeSpecialFloatToWrapper =
+    typename NativeSpecialFloatToWrapperT<T>::type;
+}  // namespace detail
+// Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
+// by concatenating base type and bits. We use a wrapper class instead of a
+// typedef to the native type to ensure that the same symbols, e.g. for VQSort,
+// are generated regardless of F16 support; see #1684.
+struct alignas(2) float16_t {
+#if HWY_HAVE_SCALAR_F16_TYPE
+#if HWY_RVV_HAVE_F16_VEC || HWY_SSE2_HAVE_F16_TYPE
+  using Native = _Float16;
+#elif HWY_NEON_HAVE_F16C
+  using Native = __fp16;
+#else
+#error "Logic error: condition should be 'all but NEON_HAVE_F16C'"
+#endif
+#endif  // HWY_HAVE_SCALAR_F16_TYPE
+  union {  // anonymous: `native` and `bits` share the same two bytes
+#if HWY_HAVE_SCALAR_F16_TYPE
+    // Accessed via NativeLaneType, and used directly if
+    // HWY_HAVE_SCALAR_F16_OPERATORS.
+    Native native;
+#endif
+    // Only accessed via NativeLaneType or U16LaneType.
+    uint16_t bits;
+  };
+  // Default init and copying are all compiler-generated (= default).
+  float16_t() noexcept = default;
+  constexpr float16_t(const float16_t&) noexcept = default;
+  constexpr float16_t(float16_t&&) noexcept = default;
+  float16_t& operator=(const float16_t&) noexcept = default;
+  float16_t& operator=(float16_t&&) noexcept = default;
+#if HWY_HAVE_SCALAR_F16_TYPE
+  // NEON vget/set_lane intrinsics and SVE `svaddv` could use explicit
+  // float16_t(intrinsic()), but user code expects implicit conversions.
+  constexpr float16_t(Native arg) noexcept : native(arg) {}
+  constexpr operator Native() const noexcept { return native; }
+#endif
+#if HWY_HAVE_SCALAR_F16_TYPE
+  static HWY_BITCASTSCALAR_CONSTEXPR float16_t FromBits(uint16_t bits) {
+    return float16_t(BitCastScalar<Native>(bits));  // reinterpret as Native
+  }
+#else  // no native type: store the raw bits through a private constructor
+ private:
+  struct F16FromU16BitsTag {};  // tag selecting the bits-taking constructor
+  constexpr float16_t(F16FromU16BitsTag /*tag*/, uint16_t u16_bits)
+      : bits(u16_bits) {}
+ public:
+  static constexpr float16_t FromBits(uint16_t bits) {
+    return float16_t(F16FromU16BitsTag(), bits);
+  }
+#endif
+  // When backed by a native type, ensure the wrapper behaves like the native
+  // type by forwarding all operators. Unfortunately it seems difficult to reuse
+  // this code in a base class, so it is duplicated here and in bfloat16_t.
+#if HWY_HAVE_SCALAR_F16_OPERATORS || HWY_IDE
+  template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, float16_t>() &&
+                                      IsConvertible<T, Native>()>* = nullptr>
+  constexpr float16_t(T&& arg) noexcept  // implicit: T converts to Native
+      : native(static_cast<Native>(static_cast<T&&>(arg))) {}
+  template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, float16_t>() &&
+                                      !IsConvertible<T, Native>() &&
+                                      IsStaticCastable<T, Native>()>* = nullptr>
+  explicit constexpr float16_t(T&& arg) noexcept  // static_cast-only sources
+      : native(static_cast<Native>(static_cast<T&&>(arg))) {}
+  // pre-decrement operator (--x)
+  HWY_CXX14_CONSTEXPR float16_t& operator--() noexcept {
+    native = static_cast<Native>(native - Native{1});
+    return *this;
+  }
+  // post-decrement operator (x--)
+  HWY_CXX14_CONSTEXPR float16_t operator--(int) noexcept {
+    float16_t result = *this;  // value before the decrement
+    native = static_cast<Native>(native - Native{1});
+    return result;
+  }
+  // pre-increment operator (++x)
+  HWY_CXX14_CONSTEXPR float16_t& operator++() noexcept {
+    native = static_cast<Native>(native + Native{1});
+    return *this;
+  }
+  // post-increment operator (x++)
+  HWY_CXX14_CONSTEXPR float16_t operator++(int) noexcept {
+    float16_t result = *this;  // value before the increment
+    native = static_cast<Native>(native + Native{1});
+    return result;
+  }
+  constexpr float16_t operator-() const noexcept {  // unary minus
+    return float16_t(static_cast<Native>(-native));
+  }
+  constexpr float16_t operator+() const noexcept { return *this; }  // unary +
+  // Reduce clutter by generating `operator+` and `operator+=` etc. Note that
+  // we cannot token-paste `operator` and `+`, so pass it in as `op_func`.
+#define HWY_FLOAT16_BINARY_OP(op, op_func, assign_func)                      \
+  constexpr float16_t op_func(const float16_t& rhs) const noexcept {         \
+    return float16_t(static_cast<Native>(native op rhs.native));             \
+  }                                                                          \
+  template <typename T, HWY_IF_NOT_F16(T),                                   \
+            typename UnwrappedT =                                            \
+                detail::SpecialFloatUnwrapArithOpOperand<const T&>,          \
+            typename RawResultT =                                            \
+                decltype(DeclVal<Native>() op DeclVal<UnwrappedT>()),        \
+            typename ResultT =                                               \
+                detail::NativeSpecialFloatToWrapper<RawResultT>,             \
+            HWY_IF_CASTABLE(RawResultT, ResultT)>                            \
+  constexpr ResultT op_func(const T& rhs) const noexcept(noexcept(           \
+      static_cast<ResultT>(DeclVal<Native>() op DeclVal<UnwrappedT>()))) {   \
+    return static_cast<ResultT>(native op static_cast<UnwrappedT>(rhs));     \
+  }                                                                          \
+  HWY_CXX14_CONSTEXPR hwy::float16_t& assign_func(                           \
+      const hwy::float16_t& rhs) noexcept {                                  \
+    native = static_cast<Native>(native op rhs.native);                      \
+    return *this;                                                            \
+  }                                                                          \
+  template <typename T, HWY_IF_NOT_F16(T),                                   \
+            HWY_IF_OP_CASTABLE(op, const T&, Native),                        \
+            HWY_IF_ASSIGNABLE(                                               \
+                Native, decltype(DeclVal<Native>() op DeclVal<const T&>()))> \
+  HWY_CXX14_CONSTEXPR hwy::float16_t& assign_func(const T& rhs) noexcept(    \
+      noexcept(                                                              \
+          static_cast<Native>(DeclVal<Native>() op DeclVal<const T&>()))) {  \
+    native = static_cast<Native>(native op rhs);                             \
+    return *this;                                                            \
+  }
+  HWY_FLOAT16_BINARY_OP(+, operator+, operator+=)
+  HWY_FLOAT16_BINARY_OP(-, operator-, operator-=)
+  HWY_FLOAT16_BINARY_OP(*, operator*, operator*=)
+  HWY_FLOAT16_BINARY_OP(/, operator/, operator/=)
+#undef HWY_FLOAT16_BINARY_OP
+#endif  // HWY_HAVE_SCALAR_F16_OPERATORS
+};
+static_assert(sizeof(hwy::float16_t) == 2, "Wrong size of float16_t");
+#if HWY_HAVE_SCALAR_F16_TYPE
+namespace detail {
+#if HWY_HAVE_SCALAR_F16_OPERATORS
+template <class T>  // float16_t operands unwrap to the native type
+struct SpecialFloatUnwrapArithOpOperandT<T, hwy::float16_t, true> {
+  using type = hwy::float16_t::Native;
+};
+#endif
+template <class T>  // native results are mapped back to the float16_t wrapper
+struct NativeSpecialFloatToWrapperT<T, hwy::float16_t::Native> {
+  using type = hwy::float16_t;
+};
+}  // namespace detail
+#endif  // HWY_HAVE_SCALAR_F16_TYPE
+#if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
+namespace detail {
+template <>  // float16_t sources expose a member of the wrapper's union
+struct BitCastScalarSrcCastHelper<hwy::float16_t> {
+#if HWY_HAVE_SCALAR_F16_TYPE
+  static HWY_INLINE constexpr const hwy::float16_t::Native& CastSrcValRef(
+      const hwy::float16_t& val) {
+    return val.native;  // native member exists in this configuration
+  }
+#else  // no native type: expose the raw 16-bit representation
+  static HWY_INLINE constexpr const uint16_t& CastSrcValRef(
+      const hwy::float16_t& val) {
+    return val.bits;
+  }
+#endif
+};
+}  // namespace detail
+#endif  // HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
+#if HWY_HAVE_SCALAR_F16_OPERATORS  // native operators: always constexpr
+#define HWY_F16_CONSTEXPR constexpr
+#else  // bit-level paths: constexpr only with C++14 and a bit-cast builtin
+#define HWY_F16_CONSTEXPR HWY_BITCASTSCALAR_CXX14_CONSTEXPR
+#endif  // HWY_HAVE_SCALAR_F16_OPERATORS
+HWY_API HWY_F16_CONSTEXPR float F32FromF16(float16_t f16) {
+#if HWY_HAVE_SCALAR_F16_OPERATORS && !HWY_IDE  // native conversion path
+  return static_cast<float>(f16);
+#endif
+#if !HWY_HAVE_SCALAR_F16_OPERATORS || HWY_IDE  // bit-level conversion path
+  const uint16_t bits16 = BitCastScalar<uint16_t>(f16);
+  const uint32_t sign = static_cast<uint32_t>(bits16 >> 15);  // bit 15
+  const uint32_t biased_exp = (bits16 >> 10) & 0x1F;  // bits 14..10
+  const uint32_t mantissa = bits16 & 0x3FF;  // bits 9..0
+  // Subnormal or zero
+  if (biased_exp == 0) {
+    const float subnormal =
+        (1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024));  // 2^-14 * m / 2^10
+    return sign ? -subnormal : subnormal;
+  }
+  // Normalized, infinity or NaN: convert the representation directly
+  // (faster than ldexp/tables).
+  const uint32_t biased_exp32 =
+      biased_exp == 31 ? 0xFF : biased_exp + (127 - 15);  // 31 -> all-ones, else rebias
+  const uint32_t mantissa32 = mantissa << (23 - 10);  // widen 10 -> 23 bits
+  const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
+  return BitCastScalar<float>(bits32);
+#endif  // !HWY_HAVE_SCALAR_F16_OPERATORS
+}
+#if HWY_IS_DEBUG_BUILD && \
+    (HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926)
+#if defined(__cpp_if_consteval) && __cpp_if_consteval >= 202106L
+// If C++23 `if !consteval` support is available, only execute
+// HWY_DASSERT(condition) if F16FromF32 is not called from a constant-evaluated
+// context to avoid compilation errors.
+#define HWY_F16_FROM_F32_DASSERT(condition) \
+  do {                                      \
+    if !consteval {                         \
+      HWY_DASSERT(condition);               \
+    }                                       \
+  } while (0)
+#elif HWY_HAS_BUILTIN(__builtin_is_constant_evaluated) || \
+    HWY_COMPILER_MSVC >= 1926
+// If the __builtin_is_constant_evaluated() intrinsic is available,
+// only do HWY_DASSERT(condition) if __builtin_is_constant_evaluated() returns
+// false to avoid compilation errors if F16FromF32 is called from a
+// constant-evaluated context.
+#define HWY_F16_FROM_F32_DASSERT(condition)   \
+  do {                                        \
+    if (!__builtin_is_constant_evaluated()) { \
+      HWY_DASSERT(condition);                 \
+    }                                         \
+  } while (0)
+#else
+// Debug build with a bit-cast builtin, but neither C++23 `if !consteval` nor
+// the __builtin_is_constant_evaluated() intrinsic is available: there is no
+// way to detect constant evaluation, so the check expands to an empty
+// statement to avoid compilation errors if F16FromF32 is called from a
+// constant-evaluated context.
+#define HWY_F16_FROM_F32_DASSERT(condition) \
+  do {                                      \
+  } while (0)
+#endif  // defined(__cpp_if_consteval) && __cpp_if_consteval >= 202106L
+#else
+// Otherwise (HWY_IS_DEBUG_BUILD is 0, or the __builtin_bit_cast intrinsic is
+// not available), HWY_F16_FROM_F32_DASSERT(condition) simply forwards to
+// HWY_DASSERT(condition).
+#define HWY_F16_FROM_F32_DASSERT(condition) HWY_DASSERT(condition)
+#endif  // HWY_IS_DEBUG_BUILD && (HWY_HAS_BUILTIN(__builtin_bit_cast) ||
+        // HWY_COMPILER_MSVC >= 1926)
+HWY_API HWY_F16_CONSTEXPR float16_t F16FromF32(float f32) {
+#if HWY_HAVE_SCALAR_F16_OPERATORS && !HWY_IDE  // native conversion path
+  return float16_t(static_cast<float16_t::Native>(f32));
+#endif
+#if !HWY_HAVE_SCALAR_F16_OPERATORS || HWY_IDE  // bit-level conversion path
+  const uint32_t bits32 = BitCastScalar<uint32_t>(f32);
+  const uint32_t sign = bits32 >> 31;  // bit 31
+  const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;  // bits 30..23
+  constexpr uint32_t kMantissaMask = 0x7FFFFF;  // low 23 bits
+  const uint32_t mantissa32 = bits32 & kMantissaMask;
+  // Before shifting (truncation), round to nearest even to reduce bias. If
+  // the lowest remaining mantissa bit is odd, increase the offset. Example
+  // with the lowest remaining bit (left) and next lower two bits; the
+  // latter, plus two more, will be truncated.
+  // 0[00] +  1 =  0[01]
+  // 0[01] +  1 =  0[10]
+  // 0[10] +  1 =  0[11]  (round down toward even)
+  // 0[11] +  1 =  1[00]  (round up)
+  // 1[00] + 10 =  1[10]
+  // 1[01] + 10 =  1[11]
+  // 1[10] + 10 = C0[00]  (round up toward even with C=1 carry out)
+  // 1[11] + 10 = C0[01]  (round up toward even with C=1 carry out)
+  const uint32_t odd_bit = (mantissa32 >> 13) & 1;  // lowest bit that is kept
+  const uint32_t rounded = mantissa32 + odd_bit + 0xFFF;
+  const bool carry = rounded >= (1u << 23);  // rounding overflowed 23 bits
+  const int32_t exp = static_cast<int32_t>(biased_exp32) - 127 + carry;
+  // Tiny or zero => zero.
+  if (exp < -24) {
+    // restore original sign
+    return float16_t::FromBits(static_cast<uint16_t>(sign << 15));
+  }
+  // If biased_exp16 would be >= 31, first check whether the input was NaN so we
+  // can set the mantissa to nonzero.
+  const bool is_nan = (biased_exp32 == 255) && mantissa32 != 0;
+  const bool overflowed = exp >= 16;  // biased_exp16 clamps to 31 below
+  const uint32_t biased_exp16 =
+      static_cast<uint32_t>(HWY_MIN(HWY_MAX(0, exp + 15), 31));  // clamp to [0, 31]
+  // exp = [-24, -15] => subnormal, shift the mantissa.
+  const uint32_t sub_exp = static_cast<uint32_t>(HWY_MAX(-14 - exp, 0));
+  HWY_F16_FROM_F32_DASSERT(sub_exp < 11);
+  const uint32_t shifted_mantissa =
+      (rounded & kMantissaMask) >> (23 - 10 + sub_exp);  // keep top 10 bits, fewer if subnormal
+  const uint32_t leading = sub_exp == 0u ? 0u : (1024u >> sub_exp);  // 2^10 shifted down for subnormals
+  const uint32_t mantissa16 = is_nan       ? 0x3FF  // NaN: nonzero mantissa
+                              : overflowed ? 0u  // overflow: zero mantissa
+                                           : (leading + shifted_mantissa);
+#if HWY_IS_DEBUG_BUILD  // consistency checks on the exponent ranges
+  if (exp < -14) {
+    HWY_F16_FROM_F32_DASSERT(biased_exp16 == 0);
+    HWY_F16_FROM_F32_DASSERT(sub_exp >= 1);
+  } else if (exp <= 15) {
+    HWY_F16_FROM_F32_DASSERT(1 <= biased_exp16 && biased_exp16 < 31);
+    HWY_F16_FROM_F32_DASSERT(sub_exp == 0);
+  }
+#endif
+  HWY_F16_FROM_F32_DASSERT(mantissa16 < 1024);
+  const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
+  HWY_F16_FROM_F32_DASSERT(bits16 < 0x10000);
+  const uint16_t narrowed = static_cast<uint16_t>(bits16);  // big-endian safe
+  return float16_t::FromBits(narrowed);
+#endif  // !HWY_HAVE_SCALAR_F16_OPERATORS
+}
+HWY_API HWY_F16_CONSTEXPR float16_t F16FromF64(double f64) {
+#if HWY_HAVE_SCALAR_F16_OPERATORS  // native conversion path
+  return float16_t(static_cast<float16_t::Native>(f64));
+#else
+  // The mantissa bits of f64 are first rounded using round-to-odd rounding
+  // to the nearest f64 value that has the lower 29 bits zeroed out to
+  // ensure that the result is correctly rounded to a F16.
+  // The F64 round-to-odd operation below will round a normal F64 value
+  // (using round-to-odd rounding) to a F64 value that has 24 bits of precision.
+  // It is okay if the magnitude of a denormal F64 value is rounded up in the
+  // F64 round-to-odd step below as the magnitude of a denormal F64 value is
+  // much smaller than 2^(-24) (the smallest positive denormal F16 value).
+  // It is also okay if bit 29 of a NaN F64 value is changed by the F64
+  // round-to-odd step below as the lower 13 bits of a F32 NaN value are usually
+  // discarded or ignored by the conversion of a F32 NaN value to a F16.
+  // If f64 is a NaN value, the result of the F64 round-to-odd step will be a
+  // NaN value as the result of the F64 round-to-odd step will have at least one
+  // mantissa bit set if f64 is a NaN value.
+  // The F64 round-to-odd step will ensure that the F64 to F32 conversion is
+  // exact if the magnitude of the rounded F64 value (using round-to-odd
+  // rounding) is between 2^(-126) (the smallest normal F32 value) and
+  // HighestValue<float>() (the largest finite F32 value).
+  // It is okay if the F64 to F32 conversion is inexact for F64 values that have
+  // a magnitude that is less than 2^(-126) as the magnitude of a denormal F32
+  // value is much smaller than 2^(-24) (the smallest positive denormal F16
+  // value).
+  return F16FromF32(
+      static_cast<float>(BitCastScalar<double>(static_cast<uint64_t>(
+          (BitCastScalar<uint64_t>(f64) & 0xFFFFFFFFE0000000ULL) |  // clear the low 29 bits
+          ((BitCastScalar<uint64_t>(f64) + 0x000000001FFFFFFFULL) &
+           0x0000000020000000ULL)))));  // set bit 29 if any cleared bit was set
+#endif
+}
+// Comparison operators. More convenient to define outside float16_t because
+// these may use F32FromF16, which is defined after the struct.
+HWY_F16_CONSTEXPR inline bool operator==(float16_t lhs,
+                                         float16_t rhs) noexcept {
+#if HWY_HAVE_SCALAR_F16_OPERATORS
+  return lhs.native == rhs.native;
+#else  // no native operators: compare after widening both sides to float
+  return F32FromF16(lhs) == F32FromF16(rhs);
+#endif
+}
+HWY_F16_CONSTEXPR inline bool operator!=(float16_t lhs,
+                                         float16_t rhs) noexcept {
+#if HWY_HAVE_SCALAR_F16_OPERATORS
+  return lhs.native != rhs.native;
+#else  // compare as float
+  return F32FromF16(lhs) != F32FromF16(rhs);
+#endif
+}
+HWY_F16_CONSTEXPR inline bool operator<(float16_t lhs, float16_t rhs) noexcept {
+#if HWY_HAVE_SCALAR_F16_OPERATORS
+  return lhs.native < rhs.native;
+#else  // compare as float
+  return F32FromF16(lhs) < F32FromF16(rhs);
+#endif
+}
+HWY_F16_CONSTEXPR inline bool operator<=(float16_t lhs,
+                                         float16_t rhs) noexcept {
+#if HWY_HAVE_SCALAR_F16_OPERATORS
+  return lhs.native <= rhs.native;
+#else  // compare as float
+  return F32FromF16(lhs) <= F32FromF16(rhs);
+#endif
+}
+HWY_F16_CONSTEXPR inline bool operator>(float16_t lhs, float16_t rhs) noexcept {
+#if HWY_HAVE_SCALAR_F16_OPERATORS
+  return lhs.native > rhs.native;
+#else  // compare as float
+  return F32FromF16(lhs) > F32FromF16(rhs);
+#endif
+}
+HWY_F16_CONSTEXPR inline bool operator>=(float16_t lhs,
+                                         float16_t rhs) noexcept {
+#if HWY_HAVE_SCALAR_F16_OPERATORS
+  return lhs.native >= rhs.native;
+#else  // compare as float
+  return F32FromF16(lhs) >= F32FromF16(rhs);
+#endif
+}
+#if HWY_HAVE_CXX20_THREE_WAY_COMPARE
+HWY_F16_CONSTEXPR inline std::partial_ordering operator<=>(
+    float16_t lhs, float16_t rhs) noexcept {
+#if HWY_HAVE_SCALAR_F16_OPERATORS
+  return lhs.native <=> rhs.native;
+#else  // no native operators: three-way compare after widening to float
+  return F32FromF16(lhs) <=> F32FromF16(rhs);
+#endif
+}
+#endif  // HWY_HAVE_CXX20_THREE_WAY_COMPARE
+//------------------------------------------------------------------------------
+// BF16 lane type
+// Compiler supports ACLE __bf16, not necessarily with operators.
+// Disable the __bf16 type on AArch64 with GCC 13 or earlier: those versions
+// have a bug that sometimes causes BF16 constant values to be loaded
+// incorrectly on AArch64. The GCC bug is described at
+// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111867.
+#if HWY_ARCH_ARM_A64 && \
+    (HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400)
+#define HWY_ARM_HAVE_SCALAR_BF16_TYPE 1
+#else  // not AArch64, or the compiler is too old
+#define HWY_ARM_HAVE_SCALAR_BF16_TYPE 0
+#endif
+// x86 compiler supports __bf16, not necessarily with operators.
+#ifndef HWY_SSE2_HAVE_SCALAR_BF16_TYPE  // a pre-existing definition wins
+#if HWY_ARCH_X86 && defined(__SSE2__) &&                      \
+    ((HWY_COMPILER_CLANG >= 1700 && !HWY_COMPILER_CLANGCL) || \
+     HWY_COMPILER_GCC_ACTUAL >= 1300)
+#define HWY_SSE2_HAVE_SCALAR_BF16_TYPE 1
+#else  // no usable __bf16 on this x86 toolchain
+#define HWY_SSE2_HAVE_SCALAR_BF16_TYPE 0
+#endif
+#endif  // HWY_SSE2_HAVE_SCALAR_BF16_TYPE
+// Compiler supports __bf16, not necessarily with operators.
+#if HWY_ARM_HAVE_SCALAR_BF16_TYPE || HWY_SSE2_HAVE_SCALAR_BF16_TYPE
+#define HWY_HAVE_SCALAR_BF16_TYPE 1
+#else  // bfloat16_t will only carry uint16_t bits
+#define HWY_HAVE_SCALAR_BF16_TYPE 0
+#endif
+#ifndef HWY_HAVE_SCALAR_BF16_OPERATORS  // a pre-existing definition wins
+// Recent enough compiler also has operators. aarch64 clang 18 hits internal
+// compiler errors on bf16 ToString, hence only enable on GCC for now.
+#if HWY_HAVE_SCALAR_BF16_TYPE && (HWY_COMPILER_GCC_ACTUAL >= 1300)
+#define HWY_HAVE_SCALAR_BF16_OPERATORS 1
+#else  // Clang, or GCC older than 13
+#define HWY_HAVE_SCALAR_BF16_OPERATORS 0
+#endif
+#endif  // HWY_HAVE_SCALAR_BF16_OPERATORS
+#if HWY_HAVE_SCALAR_BF16_OPERATORS  // native operators: always constexpr
+#define HWY_BF16_CONSTEXPR constexpr
+#else  // constexpr only when a bit-cast builtin is available
+#define HWY_BF16_CONSTEXPR HWY_BITCASTSCALAR_CONSTEXPR
+#endif
+struct alignas(2) bfloat16_t {
+#if HWY_HAVE_SCALAR_BF16_TYPE
+  using Native = __bf16;
+#endif
+  union {
+#if HWY_HAVE_SCALAR_BF16_TYPE
+    // Accessed via NativeLaneType, and used directly if
+    // HWY_HAVE_SCALAR_BF16_OPERATORS.
+    Native native;
+#endif
+    // Only accessed via NativeLaneType or U16LaneType.
+    uint16_t bits;
+  };
+  // Default init and copying
+  bfloat16_t() noexcept = default;
+  constexpr bfloat16_t(bfloat16_t&&) noexcept = default;
+  constexpr bfloat16_t(const bfloat16_t&) noexcept = default;
+  bfloat16_t& operator=(bfloat16_t&& arg) noexcept = default;
+  bfloat16_t& operator=(const bfloat16_t& arg) noexcept = default;
+// Only enable implicit conversions if we have a native type.
+#if HWY_HAVE_SCALAR_BF16_TYPE
+  constexpr bfloat16_t(Native arg) noexcept : native(arg) {}
+  constexpr operator Native() const noexcept { return native; }
+#endif
+#if HWY_HAVE_SCALAR_BF16_TYPE
+  static HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t FromBits(uint16_t bits) {
+    return bfloat16_t(BitCastScalar<Native>(bits));
+  }
+#else
+ private:
+  struct BF16FromU16BitsTag {};
+  constexpr bfloat16_t(BF16FromU16BitsTag /*tag*/, uint16_t u16_bits)
+      : bits(u16_bits) {}
-template <bool Condition, typename Then, typename Else>
-using If = typename IfT<Condition, Then, Else>::type;
+ public:
+  static constexpr bfloat16_t FromBits(uint16_t bits) {
+    return bfloat16_t(BF16FromU16BitsTag(), bits);
+  }
+#endif
-// Insert into template/function arguments to enable this overload only for
-// vectors of exactly, at most (LE), or more than (GT) this many bytes.
-//
-// As an example, checking for a total size of 16 bytes will match both
-// Simd<uint8_t, 16, 0> and Simd<uint8_t, 8, 1>.
-#define HWY_IF_V_SIZE(T, kN, bytes) \
-  hwy::EnableIf<kN * sizeof(T) == bytes>* = nullptr
-#define HWY_IF_V_SIZE_LE(T, kN, bytes) \
-  hwy::EnableIf<kN * sizeof(T) <= bytes>* = nullptr
-#define HWY_IF_V_SIZE_GT(T, kN, bytes) \
-  hwy::EnableIf<(kN * sizeof(T) > bytes)>* = nullptr
+  // When backed by a native type, ensure the wrapper behaves like the native
+  // type by forwarding all operators. Unfortunately it seems difficult to reuse
+  // this code in a base class, so it is duplicated here and in float16_t.
+#if HWY_HAVE_SCALAR_BF16_OPERATORS || HWY_IDE
+  template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, Native>() &&
+                                      !IsSame<RemoveCvRef<T>, bfloat16_t>() &&
+                                      IsConvertible<T, Native>()>* = nullptr>
+  constexpr bfloat16_t(T&& arg) noexcept(
+      noexcept(static_cast<Native>(DeclVal<T>())))
+      : native(static_cast<Native>(static_cast<T&&>(arg))) {}
+  template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, Native>() &&
+                                      !IsSame<RemoveCvRef<T>, bfloat16_t>() &&
+                                      !IsConvertible<T, Native>() &&
+                                      IsStaticCastable<T, Native>()>* = nullptr>
+  explicit constexpr bfloat16_t(T&& arg) noexcept(
+      noexcept(static_cast<Native>(DeclVal<T>())))
+      : native(static_cast<Native>(static_cast<T&&>(arg))) {}
+  HWY_CXX14_CONSTEXPR bfloat16_t& operator=(Native arg) noexcept {
+    native = arg;
+    return *this;
+  }
-#define HWY_IF_LANES(kN, lanes) hwy::EnableIf<(kN == lanes)>* = nullptr
-#define HWY_IF_LANES_LE(kN, lanes) hwy::EnableIf<(kN <= lanes)>* = nullptr
-#define HWY_IF_LANES_GT(kN, lanes) hwy::EnableIf<(kN > lanes)>* = nullptr
+  // pre-decrement operator (--x)
+  HWY_CXX14_CONSTEXPR bfloat16_t& operator--() noexcept {
+    native = static_cast<Native>(native - Native{1});
+    return *this;
+  }
-#define HWY_IF_UNSIGNED(T) hwy::EnableIf<!IsSigned<T>()>* = nullptr
-#define HWY_IF_SIGNED(T)                                                   \
-  hwy::EnableIf<IsSigned<T>() && !IsFloat<T>() && !IsSpecialFloat<T>()>* = \
-      nullptr
-#define HWY_IF_FLOAT(T) hwy::EnableIf<hwy::IsFloat<T>()>* = nullptr
-#define HWY_IF_NOT_FLOAT(T) hwy::EnableIf<!hwy::IsFloat<T>()>* = nullptr
-#define HWY_IF_FLOAT3264(T) hwy::EnableIf<hwy::IsFloat3264<T>()>* = nullptr
-#define HWY_IF_NOT_FLOAT3264(T) hwy::EnableIf<!hwy::IsFloat3264<T>()>* = nullptr
-#define HWY_IF_SPECIAL_FLOAT(T) \
-  hwy::EnableIf<hwy::IsSpecialFloat<T>()>* = nullptr
-#define HWY_IF_NOT_SPECIAL_FLOAT(T) \
-  hwy::EnableIf<!hwy::IsSpecialFloat<T>()>* = nullptr
-#define HWY_IF_FLOAT_OR_SPECIAL(T) \
-  hwy::EnableIf<hwy::IsFloat<T>() || hwy::IsSpecialFloat<T>()>* = nullptr
-#define HWY_IF_NOT_FLOAT_NOR_SPECIAL(T) \
-  hwy::EnableIf<!hwy::IsFloat<T>() && !hwy::IsSpecialFloat<T>()>* = nullptr
+  // post-decrement operator (x--)
+  HWY_CXX14_CONSTEXPR bfloat16_t operator--(int) noexcept {
+    bfloat16_t result = *this;
+    native = static_cast<Native>(native - Native{1});
+    return result;
+  }
-#define HWY_IF_T_SIZE(T, bytes) hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
-#define HWY_IF_NOT_T_SIZE(T, bytes) \
-  hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
-// bit_array = 0x102 means 1 or 8 bytes. There is no NONE_OF because it sounds
-// too similar. If you want the opposite of this (2 or 4 bytes), ask for those
-// bits explicitly (0x14) instead of attempting to 'negate' 0x102.
-#define HWY_IF_T_SIZE_ONE_OF(T, bit_array) \
-  hwy::EnableIf<((size_t{1} << sizeof(T)) & (bit_array)) != 0>* = nullptr
+  // pre-increment operator (++x)
+  HWY_CXX14_CONSTEXPR bfloat16_t& operator++() noexcept {
+    native = static_cast<Native>(native + Native{1});
+    return *this;
+  }
-// Use instead of HWY_IF_T_SIZE to avoid ambiguity with float16_t/float/double
-// overloads.
-#define HWY_IF_UI16(T) \
-  hwy::EnableIf<IsSame<T, uint16_t>() || IsSame<T, int16_t>()>* = nullptr
-#define HWY_IF_UI32(T) \
-  hwy::EnableIf<IsSame<T, uint32_t>() || IsSame<T, int32_t>()>* = nullptr
-#define HWY_IF_UI64(T) \
-  hwy::EnableIf<IsSame<T, uint64_t>() || IsSame<T, int64_t>()>* = nullptr
-#define HWY_IF_BF16(T) hwy::EnableIf<IsSame<T, hwy::bfloat16_t>()>* = nullptr
-#define HWY_IF_F16(T) hwy::EnableIf<IsSame<T, hwy::float16_t>()>* = nullptr
+  // post-increment operator (x++)
+  HWY_CXX14_CONSTEXPR bfloat16_t operator++(int) noexcept {
+    bfloat16_t result = *this;
+    native = static_cast<Native>(native + Native{1});
+    return result;
+  }
-#define HWY_IF_LANES_PER_BLOCK(T, N, LANES) \
-  hwy::EnableIf<HWY_MIN(sizeof(T) * N, 16) / sizeof(T) == (LANES)>* = nullptr
+  constexpr bfloat16_t operator-() const noexcept {
+    return bfloat16_t(static_cast<Native>(-native));
+  }
+  constexpr bfloat16_t operator+() const noexcept { return *this; }
-// Empty struct used as a size tag type.
-template <size_t N>
-struct SizeTag {};
+  // Reduce clutter by generating `operator+` and `operator+=` etc. Note that
+  // we cannot token-paste `operator` and `+`, so pass it in as `op_func`.
+#define HWY_BFLOAT16_BINARY_OP(op, op_func, assign_func)                     \
+  constexpr bfloat16_t op_func(const bfloat16_t& rhs) const noexcept {       \
+    return bfloat16_t(static_cast<Native>(native op rhs.native));            \
+  }                                                                          \
+  template <typename T, HWY_IF_NOT_BF16(T),                                  \
+            typename UnwrappedT =                                            \
+                detail::SpecialFloatUnwrapArithOpOperand<const T&>,          \
+            typename RawResultT =                                            \
+                decltype(DeclVal<Native>() op DeclVal<UnwrappedT>()),        \
+            typename ResultT =                                               \
+                detail::NativeSpecialFloatToWrapper<RawResultT>,             \
+            HWY_IF_CASTABLE(RawResultT, ResultT)>                            \
+  constexpr ResultT op_func(const T& rhs) const noexcept(noexcept(           \
+      static_cast<ResultT>(DeclVal<Native>() op DeclVal<UnwrappedT>()))) {   \
+    return static_cast<ResultT>(native op static_cast<UnwrappedT>(rhs));     \
+  }                                                                          \
+  HWY_CXX14_CONSTEXPR hwy::bfloat16_t& assign_func(                          \
+      const hwy::bfloat16_t& rhs) noexcept {                                 \
+    native = static_cast<Native>(native op rhs.native);                      \
+    return *this;                                                            \
+  }                                                                          \
+  template <typename T, HWY_IF_NOT_BF16(T),                                  \
+            HWY_IF_OP_CASTABLE(op, const T&, Native),                        \
+            HWY_IF_ASSIGNABLE(                                               \
+                Native, decltype(DeclVal<Native>() op DeclVal<const T&>()))> \
+  HWY_CXX14_CONSTEXPR hwy::bfloat16_t& assign_func(const T& rhs) noexcept(   \
+      noexcept(                                                              \
+          static_cast<Native>(DeclVal<Native>() op DeclVal<const T&>()))) {  \
+    native = static_cast<Native>(native op rhs);                             \
+    return *this;                                                            \
+  }
+  HWY_BFLOAT16_BINARY_OP(+, operator+, operator+=)
+  HWY_BFLOAT16_BINARY_OP(-, operator-, operator-=)
+  HWY_BFLOAT16_BINARY_OP(*, operator*, operator*=)
+  HWY_BFLOAT16_BINARY_OP(/, operator/, operator/=)
+#undef HWY_BFLOAT16_BINARY_OP
-template <class T>
-struct RemoveConstT {
-  using type = T;
-};
-template <class T>
-struct RemoveConstT<const T> {
-  using type = T;
+#endif  // HWY_HAVE_SCALAR_BF16_OPERATORS
 };
+static_assert(sizeof(hwy::bfloat16_t) == 2, "Wrong size of bfloat16_t");
-template <class T>
-using RemoveConst = typename RemoveConstT<T>::type;
+#pragma pack(pop)
+#if HWY_HAVE_SCALAR_BF16_TYPE
+namespace detail {
+#if HWY_HAVE_SCALAR_BF16_OPERATORS
 template <class T>
-struct RemoveRefT {
-  using type = T;
+struct SpecialFloatUnwrapArithOpOperandT<T, hwy::bfloat16_t, true> {
+  using type = hwy::bfloat16_t::Native;
 };
+#endif
 template <class T>
-struct RemoveRefT<T&> {
-  using type = T;
+struct NativeSpecialFloatToWrapperT<T, hwy::bfloat16_t::Native> {
+  using type = hwy::bfloat16_t;
 };
-template <class T>
-struct RemoveRefT<T&&> {
-  using type = T;
+}  // namespace detail
+#endif  // HWY_HAVE_SCALAR_BF16_TYPE
+#if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
+namespace detail {
+template <>
+struct BitCastScalarSrcCastHelper<hwy::bfloat16_t> {
+#if HWY_HAVE_SCALAR_BF16_TYPE
+  static HWY_INLINE constexpr const hwy::bfloat16_t::Native& CastSrcValRef(
+      const hwy::bfloat16_t& val) {
+    return val.native;
+  }
+#else
+  static HWY_INLINE constexpr const uint16_t& CastSrcValRef(
+      const hwy::bfloat16_t& val) {
+    return val.bits;
+  }
+#endif
 };
-template <class T>
-using RemoveRef = typename RemoveRefT<T>::type;
+}  // namespace detail
+#endif  // HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
+// Converts a bfloat16_t to float. With native scalar BF16 operators this is a
+// plain static_cast; otherwise the 16 BF16 bits are shifted into the upper half
+// of a 32-bit pattern (lower 16 bits zero) and reinterpreted as float.
+HWY_API HWY_BF16_CONSTEXPR float F32FromBF16(bfloat16_t bf) {
+#if HWY_HAVE_SCALAR_BF16_OPERATORS
+  return static_cast<float>(bf);
+#else
+  return BitCastScalar<float>(static_cast<uint32_t>(
+      static_cast<uint32_t>(BitCastScalar<uint16_t>(bf)) << 16));
+#endif
+}
+// Converts a float to bfloat16_t. With native scalar BF16 operators this is a
+// plain static_cast; otherwise only the upper 16 bits of the float's bit
+// pattern are kept (the lower 16 bits are dropped without rounding).
+HWY_API HWY_BF16_CONSTEXPR bfloat16_t BF16FromF32(float f) {
+#if HWY_HAVE_SCALAR_BF16_OPERATORS
+  return static_cast<bfloat16_t>(f);
+#else
+  return bfloat16_t::FromBits(
+      static_cast<uint16_t>(BitCastScalar<uint32_t>(f) >> 16));
+#endif
+}
+// Converts a double to bfloat16_t. With native scalar BF16 operators this is a
+// plain static_cast. Otherwise the f64 bits are first rounded to odd at bit 38
+// (rationale below), narrowed to float, and then passed to BF16FromF32.
+HWY_API HWY_BF16_CONSTEXPR bfloat16_t BF16FromF64(double f64) {
+#if HWY_HAVE_SCALAR_BF16_OPERATORS
+  return static_cast<bfloat16_t>(f64);
+#else
+  // The mantissa bits of f64 are first rounded using round-to-odd rounding
+  // to the nearest f64 value that has the lower 38 bits zeroed out to
+  // ensure that the result is correctly rounded to a BF16.
+  // The F64 round-to-odd operation below will round a normal F64 value
+  // (using round-to-odd rounding) to a F64 value that has 15 bits of precision.
+  // It is okay if the magnitude of a denormal F64 value is rounded up in the
+  // F64 round-to-odd step below as the magnitude of a denormal F64 value is
+  // much smaller than 2^(-133) (the smallest positive denormal BF16 value).
+  // It is also okay if bit 38 of a NaN F64 value is changed by the F64
+  // round-to-odd step below as the lower 16 bits of a F32 NaN value are usually
+  // discarded or ignored by the conversion of a F32 NaN value to a BF16.
+  // If f64 is a NaN value, the result of the F64 round-to-odd step will be a
+  // NaN value as the result of the F64 round-to-odd step will have at least one
+  // mantissa bit if f64 is a NaN value.
+  // The F64 round-to-odd step below will ensure that the F64 to F32 conversion
+  // is exact if the magnitude of the rounded F64 value (using round-to-odd
+  // rounding) is between 2^(-135) (one-fourth of the smallest positive denormal
+  // BF16 value) and HighestValue<float>() (the largest finite F32 value).
+  // If |f64| is less than 2^(-135), the magnitude of the result of the F64 to
+  // F32 conversion is guaranteed to be less than or equal to 2^(-135), which
+  // ensures that the F32 to BF16 conversion is correctly rounded, even if the
+  // conversion of a rounded F64 value whose magnitude is less than 2^(-135)
+  // to a F32 is inexact.
+  // How the expression implements round-to-odd: the first mask clears the
+  // lower 38 bits. Adding 0x3FFFFFFFFF carries into bit 38 exactly when any of
+  // those 38 bits was nonzero, so the second term contributes bit 38 in that
+  // case; the OR also keeps bit 38 set if it was already set in f64.
+  return BF16FromF32(
+      static_cast<float>(BitCastScalar<double>(static_cast<uint64_t>(
+          (BitCastScalar<uint64_t>(f64) & 0xFFFFFFC000000000ULL) |
+          ((BitCastScalar<uint64_t>(f64) + 0x0000003FFFFFFFFFULL) &
+           0x0000004000000000ULL)))));
+#endif
+}
+// Comparison operators for two bfloat16_t operands.
+// More convenient to define outside bfloat16_t because these may use
+// F32FromBF16, which is defined after the struct.
+// With native scalar BF16 operators the `native` members are compared
+// directly; otherwise both operands are widened to float via F32FromBF16 and
+// the result of the float comparison is returned.
+HWY_BF16_CONSTEXPR inline bool operator==(bfloat16_t lhs,
+                                          bfloat16_t rhs) noexcept {
+#if HWY_HAVE_SCALAR_BF16_OPERATORS
+  return lhs.native == rhs.native;
+#else
+  return F32FromBF16(lhs) == F32FromBF16(rhs);
+#endif
+}
+HWY_BF16_CONSTEXPR inline bool operator!=(bfloat16_t lhs,
+                                          bfloat16_t rhs) noexcept {
+#if HWY_HAVE_SCALAR_BF16_OPERATORS
+  return lhs.native != rhs.native;
+#else
+  return F32FromBF16(lhs) != F32FromBF16(rhs);
+#endif
+}
+HWY_BF16_CONSTEXPR inline bool operator<(bfloat16_t lhs,
+                                         bfloat16_t rhs) noexcept {
+#if HWY_HAVE_SCALAR_BF16_OPERATORS
+  return lhs.native < rhs.native;
+#else
+  return F32FromBF16(lhs) < F32FromBF16(rhs);
+#endif
+}
+HWY_BF16_CONSTEXPR inline bool operator<=(bfloat16_t lhs,
+                                          bfloat16_t rhs) noexcept {
+#if HWY_HAVE_SCALAR_BF16_OPERATORS
+  return lhs.native <= rhs.native;
+#else
+  return F32FromBF16(lhs) <= F32FromBF16(rhs);
+#endif
+}
+HWY_BF16_CONSTEXPR inline bool operator>(bfloat16_t lhs,
+                                         bfloat16_t rhs) noexcept {
+#if HWY_HAVE_SCALAR_BF16_OPERATORS
+  return lhs.native > rhs.native;
+#else
+  return F32FromBF16(lhs) > F32FromBF16(rhs);
+#endif
+}
+HWY_BF16_CONSTEXPR inline bool operator>=(bfloat16_t lhs,
+                                          bfloat16_t rhs) noexcept {
+#if HWY_HAVE_SCALAR_BF16_OPERATORS
+  return lhs.native >= rhs.native;
+#else
+  return F32FromBF16(lhs) >= F32FromBF16(rhs);
+#endif
+}
+#if HWY_HAVE_CXX20_THREE_WAY_COMPARE
+HWY_BF16_CONSTEXPR inline std::partial_ordering operator<=>(
+    bfloat16_t lhs, bfloat16_t rhs) noexcept {
+#if HWY_HAVE_SCALAR_BF16_OPERATORS
+  return lhs.native <=> rhs.native;
+#else
+  return F32FromBF16(lhs) <=> F32FromBF16(rhs);
+#endif
+}
+#endif  // HWY_HAVE_CXX20_THREE_WAY_COMPARE
 //------------------------------------------------------------------------------
 // Type relations
@@ -1110,25 +2047,19 @@ constexpr auto IsFloatTag() -> hwy::SizeTag<(R::is_float ? 0x200 : 0x400)> {
 template <typename T>
 HWY_API constexpr bool IsFloat3264() {
-  return IsSame<T, float>() || IsSame<T, double>();
+  return IsSameEither<RemoveCvRef<T>, float, double>();
 }
 template <typename T>
 HWY_API constexpr bool IsFloat() {
   // Cannot use T(1.25) != T(1) for float16_t, which can only be converted to or
   // from a float, not compared. Include float16_t in case HWY_HAVE_FLOAT16=1.
-  return IsSame<T, float16_t>() || IsFloat3264<T>();
-}
-// These types are often special-cased and not supported in all ops.
-template <typename T>
-HWY_API constexpr bool IsSpecialFloat() {
-  return IsSame<T, float16_t>() || IsSame<T, bfloat16_t>();
+  return IsSame<RemoveCvRef<T>, float16_t>() || IsFloat3264<T>();
 }
 template <typename T>
 HWY_API constexpr bool IsSigned() {
-  return T(0) > T(-1);
+  return static_cast<T>(0) > static_cast<T>(-1);
 }
 template <>
 constexpr bool IsSigned<float16_t>() {
@@ -1138,104 +2069,113 @@ template <>
 constexpr bool IsSigned<bfloat16_t>() {
   return true;
 }
+template <>
+constexpr bool IsSigned<hwy::uint128_t>() {
+  return false;
+}
+template <>
+constexpr bool IsSigned<hwy::K64V64>() {
+  return false;
+}
+template <>
+constexpr bool IsSigned<hwy::K32V32>() {
+  return false;
+}
+template <typename T, bool = IsInteger<T>() && !IsIntegerLaneType<T>()>
+struct MakeLaneTypeIfIntegerT {
+  using type = T;
+};
+template <typename T>
+struct MakeLaneTypeIfIntegerT<T, true> {
+  using type = hwy::If<IsSigned<T>(), SignedFromSize<sizeof(T)>,
+                       UnsignedFromSize<sizeof(T)>>;
+};
+template <typename T>
+using MakeLaneTypeIfInteger = typename MakeLaneTypeIfIntegerT<T>::type;
 // Largest/smallest representable integer values.
 template <typename T>
 HWY_API constexpr T LimitsMax() {
-  static_assert(!IsFloat<T>(), "Only for integer types");
-  using TU = MakeUnsigned<T>;
-  return static_cast<T>(IsSigned<T>() ? (static_cast<TU>(~0ull) >> 1)
-                                      : static_cast<TU>(~0ull));
+  static_assert(IsInteger<T>(), "Only for integer types");
+  using TU = UnsignedFromSize<sizeof(T)>;
+  return static_cast<T>(IsSigned<T>() ? (static_cast<TU>(~TU(0)) >> 1)
+                                      : static_cast<TU>(~TU(0)));
 }
 template <typename T>
 HWY_API constexpr T LimitsMin() {
-  static_assert(!IsFloat<T>(), "Only for integer types");
-  return IsSigned<T>() ? T(-1) - LimitsMax<T>() : T(0);
+  static_assert(IsInteger<T>(), "Only for integer types");
+  return IsSigned<T>() ? static_cast<T>(-1) - LimitsMax<T>()
+                       : static_cast<T>(0);
 }
 // Largest/smallest representable value (integer or float). This naming avoids
 // confusion with numeric_limits<float>::min() (the smallest positive value).
 // Marked HWY_BITCASTSCALAR_CONSTEXPR; the [b]float16_t specializations
 // construct their result via FromBits rather than CopySameSize.
 template <typename T>
-HWY_API T LowestValue() {
+HWY_API HWY_BITCASTSCALAR_CONSTEXPR T LowestValue() {
   return LimitsMin<T>();
 }
 template <>
-HWY_INLINE bfloat16_t LowestValue<bfloat16_t>() {
-  const uint16_t kBits = 0xFF7F;  // -1.1111111 x 2^127
-  bfloat16_t ret;
-  CopySameSize(&kBits, &ret);
-  return ret;
+HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t LowestValue<bfloat16_t>() {
+  return bfloat16_t::FromBits(uint16_t{0xFF7Fu});  // -1.1111111 x 2^127
 }
 template <>
-HWY_INLINE float16_t LowestValue<float16_t>() {
-  const uint16_t kBits = 0xFBFF;  // -1.1111111111 x 2^15
-  float16_t ret;
-  CopySameSize(&kBits, &ret);
-  return ret;
+HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t LowestValue<float16_t>() {
+  return float16_t::FromBits(uint16_t{0xFBFFu});  // -1.1111111111 x 2^15
 }
 template <>
-HWY_INLINE float LowestValue<float>() {
+HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float LowestValue<float>() {
   return -3.402823466e+38F;
 }
 template <>
-HWY_INLINE double LowestValue<double>() {
+HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double LowestValue<double>() {
   return -1.7976931348623158e+308;
 }
 template <typename T>
-HWY_API T HighestValue() {
+HWY_API HWY_BITCASTSCALAR_CONSTEXPR T HighestValue() {
   return LimitsMax<T>();
 }
 template <>
-HWY_INLINE bfloat16_t HighestValue<bfloat16_t>() {
-  const uint16_t kBits = 0x7F7F;  // 1.1111111 x 2^127
-  bfloat16_t ret;
-  CopySameSize(&kBits, &ret);
-  return ret;
+HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t HighestValue<bfloat16_t>() {
+  return bfloat16_t::FromBits(uint16_t{0x7F7Fu});  // 1.1111111 x 2^127
 }
 template <>
-HWY_INLINE float16_t HighestValue<float16_t>() {
-  const uint16_t kBits = 0x7BFF;  // 1.1111111111 x 2^15
-  float16_t ret;
-  CopySameSize(&kBits, &ret);
-  return ret;
+HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t HighestValue<float16_t>() {
+  return float16_t::FromBits(uint16_t{0x7BFFu});  // 1.1111111111 x 2^15
 }
 template <>
-HWY_INLINE float HighestValue<float>() {
+HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float HighestValue<float>() {
   return 3.402823466e+38F;
 }
 template <>
-HWY_INLINE double HighestValue<double>() {
+HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double HighestValue<double>() {
   return 1.7976931348623158e+308;
 }
 // Difference between 1.0 and the next representable value. Equal to
 // 1 / (1ULL << MantissaBits<T>()), but hard-coding ensures precision.
 template <typename T>
-HWY_API T Epsilon() {
+HWY_API HWY_BITCASTSCALAR_CONSTEXPR T Epsilon() {
   return 1;
 }
 template <>
-HWY_INLINE bfloat16_t Epsilon<bfloat16_t>() {
-  const uint16_t kBits = 0x3C00;  // 0.0078125
-  bfloat16_t ret;
-  CopySameSize(&kBits, &ret);
-  return ret;
+HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t Epsilon<bfloat16_t>() {
+  return bfloat16_t::FromBits(uint16_t{0x3C00u});  // 0.0078125
 }
 template <>
-HWY_INLINE float16_t Epsilon<float16_t>() {
-  const uint16_t kBits = 0x1400;  // 0.0009765625
-  float16_t ret;
-  CopySameSize(&kBits, &ret);
-  return ret;
+HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t Epsilon<float16_t>() {
+  return float16_t::FromBits(uint16_t{0x1400u});  // 0.0009765625
 }
 template <>
-HWY_INLINE float Epsilon<float>() {
+HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float Epsilon<float>() {
   return 1.192092896e-7f;
 }
 template <>
-HWY_INLINE double Epsilon<double>() {
+HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double Epsilon<double>() {
   return 2.2204460492503131e-16;
 }
@@ -1278,7 +2218,8 @@ constexpr MakeUnsigned<T> SignMask() {
 // Returns bitmask of the exponent field in IEEE binary16/32/64.
 template <typename T>
 constexpr MakeUnsigned<T> ExponentMask() {
-  return (~(MakeUnsigned<T>{1} << MantissaBits<T>()) + 1) & ~SignMask<T>();
+  return (~(MakeUnsigned<T>{1} << MantissaBits<T>()) + 1) &
+         static_cast<MakeUnsigned<T>>(~SignMask<T>());
 }
 // Returns bitmask of the mantissa field in IEEE binary16/32/64.
@@ -1290,30 +2231,24 @@ constexpr MakeUnsigned<T> MantissaMask() {
 // Returns 1 << mantissa_bits as a floating-point number. All integers whose
 // absolute value is less than this can be represented exactly.
 template <typename T>
-HWY_INLINE T MantissaEnd() {
+HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T MantissaEnd() {
   static_assert(sizeof(T) == 0, "Only instantiate the specializations");
   return 0;
 }
 template <>
-HWY_INLINE bfloat16_t MantissaEnd<bfloat16_t>() {
-  const uint16_t kBits = 0x4300;  // 1.0 x 2^7
-  bfloat16_t ret;
-  CopySameSize(&kBits, &ret);
-  return ret;
+HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t MantissaEnd<bfloat16_t>() {
+  return bfloat16_t::FromBits(uint16_t{0x4300u});  // 1.0 x 2^7
 }
 template <>
-HWY_INLINE float16_t MantissaEnd<float16_t>() {
-  const uint16_t kBits = 0x6400;  // 1.0 x 2^10
-  float16_t ret;
-  CopySameSize(&kBits, &ret);
-  return ret;
+HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t MantissaEnd<float16_t>() {
+  return float16_t::FromBits(uint16_t{0x6400u});  // 1.0 x 2^10
 }
 template <>
-HWY_INLINE float MantissaEnd<float>() {
+HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float MantissaEnd<float>() {
   return 8388608.0f;  // 1 << 23
 }
 template <>
-HWY_INLINE double MantissaEnd<double>() {
+HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double MantissaEnd<double>() {
   // floating point literal with p52 requires C++17.
   return 4503599627370496.0;  // 1 << 52
 }
@@ -1333,6 +2268,143 @@ constexpr MakeSigned<T> MaxExponentField() {
   return (MakeSigned<T>{1} << ExponentBits<T>()) - 1;
 }
+//------------------------------------------------------------------------------
+// Additional F16/BF16 operators
+#if HWY_HAVE_SCALAR_F16_OPERATORS || HWY_HAVE_SCALAR_BF16_OPERATORS
+// Generates a free function `op_func(T1 a, T2 b)` for the case where the
+// special float type T2 is the right-hand operand and the left-hand operand
+// T1 is an integer, float or double. The operation is applied to `a` and
+// `b.native`; the raw result type is mapped through
+// detail::NativeSpecialFloatToWrapper to obtain the return type.
+#define HWY_RHS_SPECIAL_FLOAT_ARITH_OP(op, op_func, T2)                       \
+  template <                                                                  \
+      typename T1,                                                            \
+      hwy::EnableIf<hwy::IsInteger<RemoveCvRef<T1>>() ||                      \
+                    hwy::IsFloat3264<RemoveCvRef<T1>>()>* = nullptr,          \
+      typename RawResultT = decltype(DeclVal<T1>() op DeclVal<T2::Native>()), \
+      typename ResultT = detail::NativeSpecialFloatToWrapper<RawResultT>,     \
+      HWY_IF_CASTABLE(RawResultT, ResultT)>                                   \
+  static HWY_INLINE constexpr ResultT op_func(T1 a, T2 b) noexcept {          \
+    return static_cast<ResultT>(a op b.native);                               \
+  }
+// Generates both operand orders for `op` between the special float type T1
+// and an integer, float or double: first the overload with T1 on the right
+// (via HWY_RHS_SPECIAL_FLOAT_ARITH_OP), then the mirrored overload with T1 on
+// the left, which applies the operation to `a.native` and `b`.
+#define HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(op, op_func, T1)         \
+  HWY_RHS_SPECIAL_FLOAT_ARITH_OP(op, op_func, T1)                             \
+  template <                                                                  \
+      typename T2,                                                            \
+      hwy::EnableIf<hwy::IsInteger<RemoveCvRef<T2>>() ||                      \
+                    hwy::IsFloat3264<RemoveCvRef<T2>>()>* = nullptr,          \
+      typename RawResultT = decltype(DeclVal<T1::Native>() op DeclVal<T2>()), \
+      typename ResultT = detail::NativeSpecialFloatToWrapper<RawResultT>,     \
+      HWY_IF_CASTABLE(RawResultT, ResultT)>                                   \
+  static HWY_INLINE constexpr ResultT op_func(T1 a, T2 b) noexcept {          \
+    return static_cast<ResultT>(a.native op b);                               \
+  }
+#if HWY_HAVE_SCALAR_F16_OPERATORS
+HWY_RHS_SPECIAL_FLOAT_ARITH_OP(+, operator+, float16_t)
+HWY_RHS_SPECIAL_FLOAT_ARITH_OP(-, operator-, float16_t)
+HWY_RHS_SPECIAL_FLOAT_ARITH_OP(*, operator*, float16_t)
+HWY_RHS_SPECIAL_FLOAT_ARITH_OP(/, operator/, float16_t)
+HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(==, operator==, float16_t)
+HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(!=, operator!=, float16_t)
+HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<, operator<, float16_t)
+HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=, operator<=, float16_t)
+HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>, operator>, float16_t)
+HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>=, operator>=, float16_t)
+#if HWY_HAVE_CXX20_THREE_WAY_COMPARE
+HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=>, operator<=>, float16_t)
+#endif
+#endif  // HWY_HAVE_SCALAR_F16_OPERATORS
+#if HWY_HAVE_SCALAR_BF16_OPERATORS
+HWY_RHS_SPECIAL_FLOAT_ARITH_OP(+, operator+, bfloat16_t)
+HWY_RHS_SPECIAL_FLOAT_ARITH_OP(-, operator-, bfloat16_t)
+HWY_RHS_SPECIAL_FLOAT_ARITH_OP(*, operator*, bfloat16_t)
+HWY_RHS_SPECIAL_FLOAT_ARITH_OP(/, operator/, bfloat16_t)
+HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(==, operator==, bfloat16_t)
+HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(!=, operator!=, bfloat16_t)
+HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<, operator<, bfloat16_t)
+HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=, operator<=, bfloat16_t)
+HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>, operator>, bfloat16_t)
+HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>=, operator>=, bfloat16_t)
+#if HWY_HAVE_CXX20_THREE_WAY_COMPARE
+HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=>, operator<=>, bfloat16_t)
+#endif
+#endif  // HWY_HAVE_SCALAR_BF16_OPERATORS
+#undef HWY_RHS_SPECIAL_FLOAT_ARITH_OP
+#undef HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP
+#endif  // HWY_HAVE_SCALAR_F16_OPERATORS || HWY_HAVE_SCALAR_BF16_OPERATORS
+//------------------------------------------------------------------------------
+// Type conversions (after IsSpecialFloat)
+// Reads a 2-byte float16_t from `ptr` via CopyBytes and returns it converted
+// to float. NOTE(review): HWY_ASSUME_ALIGNED(ptr, 2) presumably requires `ptr`
+// to be 2-byte aligned - confirm against its definition.
+HWY_API float F32FromF16Mem(const void* ptr) {
+  float16_t f16;
+  CopyBytes<2>(HWY_ASSUME_ALIGNED(ptr, 2), &f16);
+  return F32FromF16(f16);
+}
+// Reads a 2-byte bfloat16_t from `ptr` via CopyBytes and returns it converted
+// to float. NOTE(review): HWY_ASSUME_ALIGNED(ptr, 2) presumably requires `ptr`
+// to be 2-byte aligned - confirm against its definition.
+HWY_API float F32FromBF16Mem(const void* ptr) {
+  bfloat16_t bf;
+  CopyBytes<2>(HWY_ASSUME_ALIGNED(ptr, 2), &bf);
+  return F32FromBF16(bf);
+}
+#if HWY_HAVE_SCALAR_F16_OPERATORS
+#define HWY_BF16_TO_F16_CONSTEXPR HWY_BF16_CONSTEXPR
+#else
+#define HWY_BF16_TO_F16_CONSTEXPR HWY_F16_CONSTEXPR
+#endif
+// For casting from TFrom to TTo
+// Overload set: exactly one overload is intended to match each (TTo, TFrom)
+// pair; float16_t/bfloat16_t conversions are routed through float (or through
+// the dedicated F64 helpers when the source is double).
+
+// Neither type is a special float and the types differ: plain static_cast.
+template <typename TTo, typename TFrom, HWY_IF_NOT_SPECIAL_FLOAT(TTo),
+          HWY_IF_NOT_SPECIAL_FLOAT(TFrom), HWY_IF_NOT_SAME(TTo, TFrom)>
+HWY_API constexpr TTo ConvertScalarTo(const TFrom in) {
+  return static_cast<TTo>(in);
+}
+// To float16_t from a non-special type other than double: via float.
+template <typename TTo, typename TFrom, HWY_IF_F16(TTo),
+          HWY_IF_NOT_SPECIAL_FLOAT(TFrom), HWY_IF_NOT_SAME(TFrom, double)>
+HWY_API constexpr TTo ConvertScalarTo(const TFrom in) {
+  return F16FromF32(static_cast<float>(in));
+}
+// bfloat16_t to float16_t: widen to float, then narrow.
+template <typename TTo, HWY_IF_F16(TTo)>
+HWY_API HWY_BF16_TO_F16_CONSTEXPR TTo
+ConvertScalarTo(const hwy::bfloat16_t in) {
+  return F16FromF32(F32FromBF16(in));
+}
+// double to float16_t: uses F16FromF64 rather than going through float.
+template <typename TTo, HWY_IF_F16(TTo)>
+HWY_API HWY_F16_CONSTEXPR TTo ConvertScalarTo(const double in) {
+  return F16FromF64(in);
+}
+// To bfloat16_t from a non-special type other than double: via float.
+template <typename TTo, typename TFrom, HWY_IF_BF16(TTo),
+          HWY_IF_NOT_SPECIAL_FLOAT(TFrom), HWY_IF_NOT_SAME(TFrom, double)>
+HWY_API HWY_BF16_CONSTEXPR TTo ConvertScalarTo(const TFrom in) {
+  return BF16FromF32(static_cast<float>(in));
+}
+// float16_t to bfloat16_t: widen to float, then narrow.
+template <typename TTo, HWY_IF_BF16(TTo)>
+HWY_API HWY_BF16_TO_F16_CONSTEXPR TTo ConvertScalarTo(const hwy::float16_t in) {
+  return BF16FromF32(F32FromF16(in));
+}
+// double to bfloat16_t: uses BF16FromF64 rather than going through float.
+template <typename TTo, HWY_IF_BF16(TTo)>
+HWY_API HWY_BF16_CONSTEXPR TTo ConvertScalarTo(const double in) {
+  return BF16FromF64(in);
+}
+// float16_t to a non-special type: widen to float, then static_cast.
+template <typename TTo, typename TFrom, HWY_IF_F16(TFrom),
+          HWY_IF_NOT_SPECIAL_FLOAT(TTo)>
+HWY_API HWY_F16_CONSTEXPR TTo ConvertScalarTo(const TFrom in) {
+  return static_cast<TTo>(F32FromF16(in));
+}
+// bfloat16_t to a non-special type: widen to float, then static_cast.
+template <typename TTo, typename TFrom, HWY_IF_BF16(TFrom),
+          HWY_IF_NOT_SPECIAL_FLOAT(TTo)>
+HWY_API HWY_BF16_CONSTEXPR TTo ConvertScalarTo(TFrom in) {
+  return static_cast<TTo>(F32FromBF16(in));
+}
+// Same: return unchanged
+template <typename TTo>
+HWY_API constexpr TTo ConvertScalarTo(TTo in) {
+  return in;
+}
 //------------------------------------------------------------------------------
 // Helper functions
@@ -1348,6 +2420,7 @@ constexpr inline size_t RoundUpTo(size_t what, size_t align) {
 // Undefined results for x == 0.
 HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
+  HWY_DASSERT(x != 0);
 #if HWY_COMPILER_MSVC
   unsigned long index;  // NOLINT
   _BitScanForward(&index, x);
@@ -1358,6 +2431,7 @@ HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
 }
 HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) {
+  HWY_DASSERT(x != 0);
 #if HWY_COMPILER_MSVC
 #if HWY_ARCH_X86_64
   unsigned long index;  // NOLINT
@@ -1383,6 +2457,7 @@ HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) {
 // Undefined results for x == 0.
 HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x) {
+  HWY_DASSERT(x != 0);
 #if HWY_COMPILER_MSVC
   unsigned long index;  // NOLINT
   _BitScanReverse(&index, x);
@@ -1393,6 +2468,7 @@ HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x) {
 }
 HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
+  HWY_DASSERT(x != 0);
 #if HWY_COMPILER_MSVC
 #if HWY_ARCH_X86_64
   unsigned long index;  // NOLINT
@@ -1416,26 +2492,48 @@ HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
 #endif  // HWY_COMPILER_MSVC
 }
-HWY_API size_t PopCount(uint64_t x) {
-#if HWY_COMPILER_GCC  // includes clang
-  return static_cast<size_t>(__builtin_popcountll(x));
-  // This instruction has a separate feature flag, but is often called from
-  // non-SIMD code, so we don't want to require dynamic dispatch. It was first
-  // supported by Intel in Nehalem (SSE4.2), but MSVC only predefines a macro
-  // for AVX, so check for that.
+// Returns the number of set bits in `x`. This overload is for integer types
+// selected by the size mask (1 << 1) | (1 << 2) | (1 << 4), presumably
+// sizeof(T) of 1, 2 or 4 - confirm against HWY_IF_T_SIZE_ONE_OF.
+template <class T, HWY_IF_INTEGER(RemoveCvRef<T>),
+          HWY_IF_T_SIZE_ONE_OF(RemoveCvRef<T>, (1 << 1) | (1 << 2) | (1 << 4))>
+HWY_API size_t PopCount(T x) {
+  // Cast to the same-size unsigned type first so that negative inputs are
+  // zero-extended (not sign-extended) when widened to uint32_t.
+  uint32_t u32_x = static_cast<uint32_t>(
+      static_cast<UnsignedFromSize<sizeof(RemoveCvRef<T>)>>(x));
+#if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
+  return static_cast<size_t>(__builtin_popcountl(u32_x));
+#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
+  return static_cast<size_t>(_mm_popcnt_u32(u32_x));
+#else
+  // Portable fallback: sum bits in 2-, 4- and 8-bit groups, then fold the
+  // per-byte counts together. The total is at most 32, hence the 0x3F mask.
+  u32_x -= ((u32_x >> 1) & 0x55555555u);
+  u32_x = (((u32_x >> 2) & 0x33333333u) + (u32_x & 0x33333333u));
+  u32_x = (((u32_x >> 4) + u32_x) & 0x0F0F0F0Fu);
+  u32_x += (u32_x >> 8);
+  u32_x += (u32_x >> 16);
+  return static_cast<size_t>(u32_x & 0x3Fu);
+#endif
+}
+template <class T, HWY_IF_INTEGER(RemoveCvRef<T>),
+          HWY_IF_T_SIZE(RemoveCvRef<T>, 8)>
+HWY_API size_t PopCount(T x) {
+  uint64_t u64_x = static_cast<uint64_t>(
+      static_cast<UnsignedFromSize<sizeof(RemoveCvRef<T>)>>(x));
+#if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
+  return static_cast<size_t>(__builtin_popcountll(u64_x));
 #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 && defined(__AVX__)
-  return _mm_popcnt_u64(x);
+  return _mm_popcnt_u64(u64_x);
 #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
-  return _mm_popcnt_u32(static_cast<uint32_t>(x & 0xFFFFFFFFu)) +
-         _mm_popcnt_u32(static_cast<uint32_t>(x >> 32));
+  return _mm_popcnt_u32(static_cast<uint32_t>(u64_x & 0xFFFFFFFFu)) +
+         _mm_popcnt_u32(static_cast<uint32_t>(u64_x >> 32));
 #else
-  x -= ((x >> 1) & 0x5555555555555555ULL);
-  x = (((x >> 2) & 0x3333333333333333ULL) + (x & 0x3333333333333333ULL));
-  x = (((x >> 4) + x) & 0x0F0F0F0F0F0F0F0FULL);
-  x += (x >> 8);
-  x += (x >> 16);
-  x += (x >> 32);
-  return static_cast<size_t>(x & 0x7Fu);
+  u64_x -= ((u64_x >> 1) & 0x5555555555555555ULL);
+  u64_x = (((u64_x >> 2) & 0x3333333333333333ULL) +
+           (u64_x & 0x3333333333333333ULL));
+  u64_x = (((u64_x >> 4) + u64_x) & 0x0F0F0F0F0F0F0F0FULL);
+  u64_x += (u64_x >> 8);
+  u64_x += (u64_x >> 16);
+  u64_x += (u64_x >> 32);
+  return static_cast<size_t>(u64_x & 0x7Fu);
 #endif
 }
@@ -1456,18 +2554,28 @@ template <typename TI>
              : static_cast<size_t>(FloorLog2(static_cast<TI>(x - 1)) + 1);
 }
-template <typename T>
-HWY_INLINE constexpr T AddWithWraparound(hwy::FloatTag /*tag*/, T t, size_t n) {
-  return t + static_cast<T>(n);
+template <typename T, typename T2, HWY_IF_FLOAT(T), HWY_IF_NOT_SPECIAL_FLOAT(T)>
+HWY_INLINE constexpr T AddWithWraparound(T t, T2 increment) {
+  return t + static_cast<T>(increment);
 }
-template <typename T>
-HWY_INLINE constexpr T AddWithWraparound(hwy::NonFloatTag /*tag*/, T t,
-                                         size_t n) {
+template <typename T, typename T2, HWY_IF_SPECIAL_FLOAT(T)>
+HWY_INLINE constexpr T AddWithWraparound(T t, T2 increment) {
+  return ConvertScalarTo<T>(ConvertScalarTo<float>(t) +
+                            ConvertScalarTo<float>(increment));
+}
+template <typename T, typename T2, HWY_IF_NOT_FLOAT(T)>
+HWY_INLINE constexpr T AddWithWraparound(T t, T2 n) {
   using TU = MakeUnsigned<T>;
-  return static_cast<T>(
-      static_cast<TU>(static_cast<TU>(t) + static_cast<TU>(n)) &
-      hwy::LimitsMax<TU>());
+  // Sub-int types would promote to int, not unsigned, which would trigger
+  // warnings, so first promote to the largest unsigned type. Due to
+  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87519, which affected GCC 8
+  // until fixed in 9.3, we use built-in types rather than uint64_t.
+  return static_cast<T>(static_cast<TU>(
+      static_cast<unsigned long long>(static_cast<unsigned long long>(t) +
+                                      static_cast<unsigned long long>(n)) &
+      uint64_t{hwy::LimitsMax<TU>()}));
 }
 #if HWY_COMPILER_MSVC && HWY_ARCH_X86_64
@@ -1494,7 +2602,120 @@ HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) {
 #endif
 }
+// Tag-dispatched implementations of ScalarAbs, one overload per type category.
+namespace detail {
+// Floating-point: clears the sign bit of the value's bit pattern.
+template <typename T>
+static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T ScalarAbs(hwy::FloatTag /*tag*/,
+                                                          T val) {
+  using TU = MakeUnsigned<T>;
+  return BitCastScalar<T>(
+      static_cast<TU>(BitCastScalar<TU>(val) & (~SignMask<T>())));
+}
+// Special-tagged types: forwards to the FloatTag overload (same bit trick).
+template <typename T>
+static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T
+ScalarAbs(hwy::SpecialTag /*tag*/, T val) {
+  return ScalarAbs(hwy::FloatTag(), val);
+}
+// Signed integers: negative values are negated in the unsigned type TU, so the
+// subtraction itself cannot overflow a signed type.
+template <typename T>
+static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T
+ScalarAbs(hwy::SignedTag /*tag*/, T val) {
+  using TU = MakeUnsigned<T>;
+  return (val < T{0}) ? static_cast<T>(TU{0} - static_cast<TU>(val)) : val;
+}
+// Unsigned integers: already non-negative, returned unchanged.
+template <typename T>
+static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T
+ScalarAbs(hwy::UnsignedTag /*tag*/, T val) {
+  return val;
+}
+}  // namespace detail
+// Returns the absolute value of `val`. The argument is first converted to
+// TVal (the wrapper type for native special floats, or the same-size lane type
+// for non-lane integer types) and then dispatched on hwy::TypeTag<TVal>.
+template <typename T>
+HWY_API HWY_BITCASTSCALAR_CONSTEXPR RemoveCvRef<T> ScalarAbs(T val) {
+  using TVal = MakeLaneTypeIfInteger<
+      detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>>;
+  return detail::ScalarAbs(hwy::TypeTag<TVal>(), static_cast<TVal>(val));
+}
+// Returns true if `val` is NaN: with the sign bit cleared (ScalarAbs), the bit
+// pattern compares greater than the exponent-field mask, i.e. the exponent
+// field is all ones and the mantissa is nonzero.
+template <typename T>
+HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsNaN(T val) {
+  using TF = detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>;
+  using TU = MakeUnsigned<TF>;
+  return (BitCastScalar<TU>(ScalarAbs(val)) > ExponentMask<TF>());
+}
+// Returns true if `val` is +/- infinity. The bit pattern is shifted left by
+// one (discarding the sign bit) and compared against MaxExponentTimes2<TF>(),
+// presumably the all-ones exponent field shifted by one - confirm against its
+// definition.
+template <typename T>
+HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsInf(T val) {
+  using TF = detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>;
+  using TU = MakeUnsigned<TF>;
+  return static_cast<TU>(BitCastScalar<TU>(static_cast<TF>(val)) << 1) ==
+         static_cast<TU>(MaxExponentTimes2<TF>());
+}
+// Tag-dispatched implementations of ScalarIsFinite.
+namespace detail {
+// Floating-point: finite iff the sign-cleared bit pattern is below the
+// exponent-field mask, i.e. the exponent field is not all ones.
+template <typename T>
+static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsFinite(
+    hwy::FloatTag /*tag*/, T val) {
+  using TU = MakeUnsigned<T>;
+  return (BitCastScalar<TU>(hwy::ScalarAbs(val)) < ExponentMask<T>());
+}
+template <typename T>
+static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsFinite(
+    hwy::NonFloatTag /*tag*/, T /*val*/) {
+  // Integer values are always finite
+  return true;
+}
+}  // namespace detail
+// Returns true if `val` is neither infinite nor NaN. The argument is converted
+// to TVal (wrapper type for native special floats, lane type for non-lane
+// integers) and dispatched on hwy::IsFloatTag<TVal>; non-float types always
+// yield true.
+template <typename T>
+HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsFinite(T val) {
+  using TVal = MakeLaneTypeIfInteger<
+      detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>>;
+  return detail::ScalarIsFinite(hwy::IsFloatTag<TVal>(),
+                                static_cast<TVal>(val));
+}
+// Returns a value whose non-sign bits come from `magn` and whose sign bit
+// comes from `sign`, combined directly on the bit patterns.
+template <typename T>
+HWY_API HWY_BITCASTSCALAR_CONSTEXPR RemoveCvRef<T> ScalarCopySign(T magn,
+                                                                  T sign) {
+  using TF = RemoveCvRef<detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>>;
+  using TU = MakeUnsigned<TF>;
+  return BitCastScalar<TF>(static_cast<TU>(
+      (BitCastScalar<TU>(static_cast<TF>(magn)) & (~SignMask<TF>())) |
+      (BitCastScalar<TU>(static_cast<TF>(sign)) & SignMask<TF>())));
+}
+// Returns true if the bit selected by SignMask<TVal>() is set in the bit
+// pattern of `val`, after converting `val` to TVal (wrapper type for native
+// special floats, lane type for non-lane integers).
+template <typename T>
+HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarSignBit(T val) {
+  using TVal = MakeLaneTypeIfInteger<
+      detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>>;
+  using TU = MakeUnsigned<TVal>;
+  return ((BitCastScalar<TU>(static_cast<TVal>(val)) & SignMask<TVal>()) != 0);
+}
 // Prevents the compiler from eliding the computations that led to "output".
+#if HWY_ARCH_PPC && (HWY_COMPILER_GCC || HWY_COMPILER_CLANG) && \
+    !defined(_SOFT_FLOAT)
+// Workaround to avoid test failures on PPC if compiled with Clang
+template <class T, HWY_IF_F32(T)>
+HWY_API void PreventElision(T&& output) {
+  asm volatile("" : "+f"(output)::"memory");
+}
+template <class T, HWY_IF_F64(T)>
+HWY_API void PreventElision(T&& output) {
+  asm volatile("" : "+d"(output)::"memory");
+}
+template <class T, HWY_IF_NOT_FLOAT3264(T)>
+HWY_API void PreventElision(T&& output) {
+  asm volatile("" : "+r"(output)::"memory");
+}
+#else
 template <class T>
 HWY_API void PreventElision(T&& output) {
 #if HWY_COMPILER_MSVC
@@ -1502,8 +2723,8 @@ HWY_API void PreventElision(T&& output) {
   // RTL constraints). Self-assignment with #pragma optimize("off") might be
   // expected to prevent elision, but it does not with MSVC 2015. Type-punning
   // with volatile pointers generates inefficient code on MSVC 2017.
-  static std::atomic<RemoveRef<T>> dummy;
-  dummy.store(output, std::memory_order_relaxed);
+  static std::atomic<RemoveCvRef<T>> sink;
+  sink.store(output, std::memory_order_relaxed);
 #else
   // Works by indicating to the compiler that "output" is being read and
   // modified. The +r constraint avoids unnecessary writes to memory, but only
@@ -1511,6 +2732,7 @@ HWY_API void PreventElision(T&& output) {
   asm volatile("" : "+r"(output) : : "memory");
 #endif
 }
+#endif
 }  // namespace hwy