@img/sharp-libvips-dev 0.0.1 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109)
  1. package/README.md +3 -3
  2. package/cplusplus/VConnection.cpp +54 -54
  3. package/cplusplus/VError.cpp +20 -18
  4. package/cplusplus/VImage.cpp +636 -589
  5. package/cplusplus/VInterpolate.cpp +22 -22
  6. package/cplusplus/VRegion.cpp +4 -4
  7. package/cplusplus/vips-operators.cpp +2326 -2301
  8. package/include/aom/aom_codec.h +10 -6
  9. package/include/aom/aom_decoder.h +1 -1
  10. package/include/aom/aom_encoder.h +9 -2
  11. package/include/aom/aomcx.h +72 -3
  12. package/include/cairo/cairo-ft.h +1 -1
  13. package/include/cairo/cairo-gobject.h +8 -0
  14. package/include/cairo/cairo-svg.h +3 -3
  15. package/include/cairo/cairo-version.h +2 -2
  16. package/include/cairo/cairo.h +91 -24
  17. package/include/glib-2.0/glib/gmacros.h +1 -1
  18. package/include/glib-2.0/glib/gtestutils.h +1 -1
  19. package/include/glib-2.0/gobject/gtype.h +7 -7
  20. package/include/harfbuzz/hb-version.h +2 -2
  21. package/include/hwy/aligned_allocator.h +211 -0
  22. package/include/hwy/base.h +1517 -0
  23. package/include/hwy/cache_control.h +108 -0
  24. package/include/hwy/detect_compiler_arch.h +281 -0
  25. package/include/hwy/detect_targets.h +644 -0
  26. package/include/hwy/foreach_target.h +340 -0
  27. package/include/hwy/highway.h +435 -0
  28. package/include/hwy/highway_export.h +74 -0
  29. package/include/hwy/nanobenchmark.h +171 -0
  30. package/include/hwy/ops/arm_neon-inl.h +8913 -0
  31. package/include/hwy/ops/arm_sve-inl.h +5105 -0
  32. package/include/hwy/ops/emu128-inl.h +2811 -0
  33. package/include/hwy/ops/generic_ops-inl.h +4745 -0
  34. package/include/hwy/ops/ppc_vsx-inl.h +5716 -0
  35. package/include/hwy/ops/rvv-inl.h +5070 -0
  36. package/include/hwy/ops/scalar-inl.h +1995 -0
  37. package/include/hwy/ops/set_macros-inl.h +578 -0
  38. package/include/hwy/ops/shared-inl.h +539 -0
  39. package/include/hwy/ops/tuple-inl.h +125 -0
  40. package/include/hwy/ops/wasm_128-inl.h +5917 -0
  41. package/include/hwy/ops/x86_128-inl.h +11173 -0
  42. package/include/hwy/ops/x86_256-inl.h +7529 -0
  43. package/include/hwy/ops/x86_512-inl.h +6849 -0
  44. package/include/hwy/per_target.h +44 -0
  45. package/include/hwy/print-inl.h +62 -0
  46. package/include/hwy/print.h +75 -0
  47. package/include/hwy/robust_statistics.h +148 -0
  48. package/include/hwy/targets.h +338 -0
  49. package/include/hwy/timer-inl.h +200 -0
  50. package/include/hwy/timer.h +55 -0
  51. package/include/jconfig.h +2 -2
  52. package/include/jpeglib.h +3 -2
  53. package/include/libheif/heif.h +461 -384
  54. package/include/libheif/heif_cxx.h +4 -1
  55. package/include/libheif/heif_plugin.h +1 -1
  56. package/include/libheif/heif_properties.h +138 -0
  57. package/include/libheif/heif_regions.h +866 -0
  58. package/include/libheif/heif_version.h +3 -3
  59. package/include/libpng16/pnglibconf.h +1 -1
  60. package/include/pnglibconf.h +1 -1
  61. package/include/vips/VConnection8.h +43 -49
  62. package/include/vips/VError8.h +27 -24
  63. package/include/vips/VImage8.h +4861 -4597
  64. package/include/vips/VInterpolate8.h +24 -27
  65. package/include/vips/VRegion8.h +32 -33
  66. package/include/vips/arithmetic.h +169 -169
  67. package/include/vips/basic.h +33 -33
  68. package/include/vips/buf.h +56 -54
  69. package/include/vips/colour.h +95 -95
  70. package/include/vips/connection.h +190 -193
  71. package/include/vips/conversion.h +91 -91
  72. package/include/vips/convolution.h +36 -30
  73. package/include/vips/create.h +63 -63
  74. package/include/vips/dbuf.h +35 -37
  75. package/include/vips/debug.h +65 -33
  76. package/include/vips/draw.h +41 -41
  77. package/include/vips/enumtypes.h +54 -51
  78. package/include/vips/error.h +63 -63
  79. package/include/vips/foreign.h +263 -223
  80. package/include/vips/format.h +48 -48
  81. package/include/vips/freqfilt.h +22 -22
  82. package/include/vips/gate.h +55 -47
  83. package/include/vips/generate.h +34 -34
  84. package/include/vips/header.h +111 -101
  85. package/include/vips/histogram.h +28 -28
  86. package/include/vips/image.h +213 -213
  87. package/include/vips/interpolate.h +40 -41
  88. package/include/vips/memory.h +61 -52
  89. package/include/vips/morphology.h +24 -24
  90. package/include/vips/mosaicing.h +32 -33
  91. package/include/vips/object.h +371 -357
  92. package/include/vips/operation.h +68 -67
  93. package/include/vips/private.h +76 -76
  94. package/include/vips/rect.h +26 -26
  95. package/include/vips/region.h +92 -92
  96. package/include/vips/resample.h +38 -38
  97. package/include/vips/sbuf.h +53 -54
  98. package/include/vips/semaphore.h +24 -24
  99. package/include/vips/thread.h +30 -27
  100. package/include/vips/threadpool.h +48 -49
  101. package/include/vips/transform.h +39 -39
  102. package/include/vips/type.h +90 -85
  103. package/include/vips/util.h +274 -229
  104. package/include/vips/vector.h +24 -144
  105. package/include/vips/version.h +9 -9
  106. package/include/vips/vips.h +41 -40
  107. package/include/zlib.h +23 -19
  108. package/package.json +1 -1
  109. package/versions.json +9 -9
package/include/hwy/base.h (new file)
@@ -0,0 +1,1517 @@
1
+ // Copyright 2020 Google LLC
2
+ // SPDX-License-Identifier: Apache-2.0
3
+ //
4
+ // Licensed under the Apache License, Version 2.0 (the "License");
5
+ // you may not use this file except in compliance with the License.
6
+ // You may obtain a copy of the License at
7
+ //
8
+ // http://www.apache.org/licenses/LICENSE-2.0
9
+ //
10
+ // Unless required by applicable law or agreed to in writing, software
11
+ // distributed under the License is distributed on an "AS IS" BASIS,
12
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ // See the License for the specific language governing permissions and
14
+ // limitations under the License.
15
+
16
+ #ifndef HIGHWAY_HWY_BASE_H_
17
+ #define HIGHWAY_HWY_BASE_H_
18
+
19
+ // For SIMD module implementations and their callers, target-independent.
20
+
21
+ // IWYU pragma: begin_exports
22
+ #include <stddef.h>
23
+ #include <stdint.h>
24
+
25
+ // Wrapping this into a HWY_HAS_INCLUDE causes clang-format to fail.
26
+ #if __cplusplus >= 202100L && defined(__has_include)
27
+ #if __has_include(<stdfloat>)
28
+ #include <stdfloat> // std::float16_t
29
+ #endif
30
+ #endif
31
+
32
+ #include "hwy/detect_compiler_arch.h"
33
+ #include "hwy/highway_export.h"
34
+
35
+ // "IWYU pragma: keep" does not work for these includes, so hide from the IDE.
36
+ #if !HWY_IDE
37
+
38
+ #if !defined(HWY_NO_LIBCXX)
39
+ #ifndef __STDC_FORMAT_MACROS
40
+ #define __STDC_FORMAT_MACROS // before inttypes.h
41
+ #endif
42
+ #include <inttypes.h>
43
+ #endif
44
+
45
+ #if (HWY_ARCH_X86 && !defined(HWY_NO_LIBCXX)) || HWY_COMPILER_MSVC
46
+ #include <atomic>
47
+ #endif
48
+
49
+ #endif // !HWY_IDE
50
+
51
+ // IWYU pragma: end_exports
52
+
53
+ #if HWY_COMPILER_MSVC
54
+ #include <string.h> // memcpy
55
+ #endif
56
+
57
+ //------------------------------------------------------------------------------
58
+ // Compiler-specific definitions
59
+
60
+ #define HWY_STR_IMPL(macro) #macro
61
+ #define HWY_STR(macro) HWY_STR_IMPL(macro)
62
+
63
+ #if HWY_COMPILER_MSVC
64
+
65
+ #include <intrin.h>
66
+
67
+ #define HWY_RESTRICT __restrict
68
+ #define HWY_INLINE __forceinline
69
+ #define HWY_NOINLINE __declspec(noinline)
70
+ #define HWY_FLATTEN
71
+ #define HWY_NORETURN __declspec(noreturn)
72
+ #define HWY_LIKELY(expr) (expr)
73
+ #define HWY_UNLIKELY(expr) (expr)
74
+ #define HWY_PRAGMA(tokens) __pragma(tokens)
75
+ #define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(warning(tokens))
76
+ #define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(msc)
77
+ #define HWY_MAYBE_UNUSED
78
+ #define HWY_HAS_ASSUME_ALIGNED 0
79
+ #if (_MSC_VER >= 1700)
80
+ #define HWY_MUST_USE_RESULT _Check_return_
81
+ #else
82
+ #define HWY_MUST_USE_RESULT
83
+ #endif
84
+
85
+ #else
86
+
87
+ #define HWY_RESTRICT __restrict__
88
+ // force inlining without optimization enabled creates very inefficient code
89
+ // that can cause compiler timeout
90
+ #ifdef __OPTIMIZE__
91
+ #define HWY_INLINE inline __attribute__((always_inline))
92
+ #else
93
+ #define HWY_INLINE inline
94
+ #endif
95
+ #define HWY_NOINLINE __attribute__((noinline))
96
+ #define HWY_FLATTEN __attribute__((flatten))
97
+ #define HWY_NORETURN __attribute__((noreturn))
98
+ #define HWY_LIKELY(expr) __builtin_expect(!!(expr), 1)
99
+ #define HWY_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
100
+ #define HWY_PRAGMA(tokens) _Pragma(#tokens)
101
+ #define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(GCC diagnostic tokens)
102
+ #define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(gcc)
103
+ // Encountered "attribute list cannot appear here" when using the C++17
104
+ // [[maybe_unused]], so only use the old style attribute for now.
105
+ #define HWY_MAYBE_UNUSED __attribute__((unused))
106
+ #define HWY_MUST_USE_RESULT __attribute__((warn_unused_result))
107
+
108
+ #endif // !HWY_COMPILER_MSVC
109
+
110
+ //------------------------------------------------------------------------------
111
+ // Builtin/attributes (no more #include after this point due to namespace!)
112
+
113
+ namespace hwy {
114
+
115
+ // Enables error-checking of format strings.
116
+ #if HWY_HAS_ATTRIBUTE(__format__)
117
+ #define HWY_FORMAT(idx_fmt, idx_arg) \
118
+ __attribute__((__format__(__printf__, idx_fmt, idx_arg)))
119
+ #else
120
+ #define HWY_FORMAT(idx_fmt, idx_arg)
121
+ #endif
122
+
123
+ // Returns a void* pointer which the compiler then assumes is N-byte aligned.
124
+ // Example: float* HWY_RESTRICT aligned = (float*)HWY_ASSUME_ALIGNED(in, 32);
125
+ //
126
+ // The assignment semantics are required by GCC/Clang. ICC provides an in-place
127
+ // __assume_aligned, whereas MSVC's __assume appears unsuitable.
128
+ #if HWY_HAS_BUILTIN(__builtin_assume_aligned)
129
+ #define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align))
130
+ #else
131
+ #define HWY_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */
132
+ #endif
133
+
134
+ // Clang and GCC require attributes on each function into which SIMD intrinsics
135
+ // are inlined. Support both per-function annotation (HWY_ATTR) for lambdas and
136
+ // automatic annotation via pragmas.
137
+ #if HWY_COMPILER_ICC
138
+ // As of ICC 2021.{1-9} the pragma is neither implemented nor required.
139
+ #define HWY_PUSH_ATTRIBUTES(targets_str)
140
+ #define HWY_POP_ATTRIBUTES
141
+ #elif HWY_COMPILER_CLANG
142
+ #define HWY_PUSH_ATTRIBUTES(targets_str) \
143
+ HWY_PRAGMA(clang attribute push(__attribute__((target(targets_str))), \
144
+ apply_to = function))
145
+ #define HWY_POP_ATTRIBUTES HWY_PRAGMA(clang attribute pop)
146
+ #elif HWY_COMPILER_GCC_ACTUAL
147
+ #define HWY_PUSH_ATTRIBUTES(targets_str) \
148
+ HWY_PRAGMA(GCC push_options) HWY_PRAGMA(GCC target targets_str)
149
+ #define HWY_POP_ATTRIBUTES HWY_PRAGMA(GCC pop_options)
150
+ #else
151
+ #define HWY_PUSH_ATTRIBUTES(targets_str)
152
+ #define HWY_POP_ATTRIBUTES
153
+ #endif
154
+
155
+ //------------------------------------------------------------------------------
156
+ // Macros
157
+
158
+ #define HWY_API static HWY_INLINE HWY_FLATTEN HWY_MAYBE_UNUSED
159
+
160
+ #define HWY_CONCAT_IMPL(a, b) a##b
161
+ #define HWY_CONCAT(a, b) HWY_CONCAT_IMPL(a, b)
162
+
163
+ #define HWY_MIN(a, b) ((a) < (b) ? (a) : (b))
164
+ #define HWY_MAX(a, b) ((a) > (b) ? (a) : (b))
165
+
166
+ #if HWY_COMPILER_GCC_ACTUAL
167
+ // nielskm: GCC does not support '#pragma GCC unroll' without the factor.
168
+ #define HWY_UNROLL(factor) HWY_PRAGMA(GCC unroll factor)
169
+ #define HWY_DEFAULT_UNROLL HWY_UNROLL(4)
170
+ #elif HWY_COMPILER_CLANG || HWY_COMPILER_ICC || HWY_COMPILER_ICX
171
+ #define HWY_UNROLL(factor) HWY_PRAGMA(unroll factor)
172
+ #define HWY_DEFAULT_UNROLL HWY_UNROLL()
173
+ #else
174
+ #define HWY_UNROLL(factor)
175
+ #define HWY_DEFAULT_UNROLL
176
+ #endif
177
+
178
+ // Tell a compiler that the expression always evaluates to true.
179
+ // The expression should be free from any side effects.
180
+ // Some older compilers may have trouble with complex expressions, therefore
181
+ // it is advisable to split multiple conditions into separate assume statements,
182
+ // and manually check the generated code.
183
+ // OK but could fail:
184
+ // HWY_ASSUME(x == 2 && y == 3);
185
+ // Better:
186
+ // HWY_ASSUME(x == 2);
187
+ // HWY_ASSUME(y == 3);
188
+ #if HWY_HAS_CPP_ATTRIBUTE(assume)
189
+ #define HWY_ASSUME(expr) [[assume(expr)]]
190
+ #elif HWY_COMPILER_MSVC || HWY_COMPILER_ICC
191
+ #define HWY_ASSUME(expr) __assume(expr)
192
+ // __builtin_assume() was added in clang 3.6.
193
+ #elif HWY_COMPILER_CLANG && HWY_HAS_BUILTIN(__builtin_assume)
194
+ #define HWY_ASSUME(expr) __builtin_assume(expr)
195
+ // __builtin_unreachable() was added in GCC 4.5, but __has_builtin() was added
196
+ // later, so check for the compiler version directly.
197
+ #elif HWY_COMPILER_GCC_ACTUAL >= 405
198
+ #define HWY_ASSUME(expr) \
199
+ ((expr) ? static_cast<void>(0) : __builtin_unreachable())
200
+ #else
201
+ #define HWY_ASSUME(expr) static_cast<void>(0)
202
+ #endif
203
+
204
+ // Compile-time fence to prevent undesirable code reordering. On Clang x86, the
205
+ // typical asm volatile("" : : : "memory") has no effect, whereas atomic fence
206
+ // does, without generating code.
207
+ #if HWY_ARCH_X86 && !defined(HWY_NO_LIBCXX)
208
+ #define HWY_FENCE std::atomic_thread_fence(std::memory_order_acq_rel)
209
+ #else
210
+ // TODO(janwas): investigate alternatives. On Arm, the above generates barriers.
211
+ #define HWY_FENCE
212
+ #endif
213
+
214
+ // 4 instances of a given literal value, useful as input to LoadDup128.
215
+ #define HWY_REP4(literal) literal, literal, literal, literal
216
+
217
+ HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
218
+ Abort(const char* file, int line, const char* format, ...);
219
+
220
+ #define HWY_ABORT(format, ...) \
221
+ ::hwy::Abort(__FILE__, __LINE__, format, ##__VA_ARGS__)
222
+
223
+ // Always enabled.
224
+ #define HWY_ASSERT(condition) \
225
+ do { \
226
+ if (!(condition)) { \
227
+ HWY_ABORT("Assert %s", #condition); \
228
+ } \
229
+ } while (0)
230
+
231
+ #if HWY_HAS_FEATURE(memory_sanitizer) || defined(MEMORY_SANITIZER)
232
+ #define HWY_IS_MSAN 1
233
+ #else
234
+ #define HWY_IS_MSAN 0
235
+ #endif
236
+
237
+ #if HWY_HAS_FEATURE(address_sanitizer) || defined(ADDRESS_SANITIZER)
238
+ #define HWY_IS_ASAN 1
239
+ #else
240
+ #define HWY_IS_ASAN 0
241
+ #endif
242
+
243
+ #if HWY_HAS_FEATURE(thread_sanitizer) || defined(THREAD_SANITIZER)
244
+ #define HWY_IS_TSAN 1
245
+ #else
246
+ #define HWY_IS_TSAN 0
247
+ #endif
248
+
249
+ // MSAN may cause lengthy build times or false positives e.g. in AVX3 DemoteTo.
250
+ // You can disable MSAN by adding this attribute to the function that fails.
251
+ #if HWY_IS_MSAN
252
+ #define HWY_ATTR_NO_MSAN __attribute__((no_sanitize_memory))
253
+ #else
254
+ #define HWY_ATTR_NO_MSAN
255
+ #endif
256
+
257
+ // For enabling HWY_DASSERT and shortening tests in slower debug builds
258
+ #if !defined(HWY_IS_DEBUG_BUILD)
259
+ // Clang does not define NDEBUG, but it and GCC define __OPTIMIZE__, and recent
260
+ // MSVC defines NDEBUG (if not, could instead check _DEBUG).
261
+ #if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) || HWY_IS_ASAN || \
262
+ HWY_IS_MSAN || HWY_IS_TSAN || defined(__clang_analyzer__)
263
+ #define HWY_IS_DEBUG_BUILD 1
264
+ #else
265
+ #define HWY_IS_DEBUG_BUILD 0
266
+ #endif
267
+ #endif // HWY_IS_DEBUG_BUILD
268
+
269
+ #if HWY_IS_DEBUG_BUILD
270
+ #define HWY_DASSERT(condition) HWY_ASSERT(condition)
271
+ #else
272
+ #define HWY_DASSERT(condition) \
273
+ do { \
274
+ } while (0)
275
+ #endif
276
+
277
+ //------------------------------------------------------------------------------
278
+ // CopyBytes / ZeroBytes
279
+
280
+ #if HWY_COMPILER_MSVC
281
+ #pragma intrinsic(memcpy)
282
+ #pragma intrinsic(memset)
283
+ #endif
284
+
285
+ // The source/destination must not overlap/alias.
286
+ template <size_t kBytes, typename From, typename To>
287
+ HWY_API void CopyBytes(const From* from, To* to) {
288
+ #if HWY_COMPILER_MSVC
289
+ memcpy(to, from, kBytes);
290
+ #else
291
+ __builtin_memcpy(static_cast<void*>(to), static_cast<const void*>(from),
292
+ kBytes);
293
+ #endif
294
+ }
295
+
296
+ HWY_API void CopyBytes(const void* HWY_RESTRICT from, void* HWY_RESTRICT to,
297
+ size_t num_of_bytes_to_copy) {
298
+ #if HWY_COMPILER_MSVC
299
+ memcpy(to, from, num_of_bytes_to_copy);
300
+ #else
301
+ __builtin_memcpy(to, from, num_of_bytes_to_copy);
302
+ #endif
303
+ }
304
+
305
+ // Same as CopyBytes, but for same-sized objects; avoids a size argument.
306
+ template <typename From, typename To>
307
+ HWY_API void CopySameSize(const From* HWY_RESTRICT from, To* HWY_RESTRICT to) {
308
+ static_assert(sizeof(From) == sizeof(To), "");
309
+ CopyBytes<sizeof(From)>(from, to);
310
+ }
311
+
312
+ template <size_t kBytes, typename To>
313
+ HWY_API void ZeroBytes(To* to) {
314
+ #if HWY_COMPILER_MSVC
315
+ memset(to, 0, kBytes);
316
+ #else
317
+ __builtin_memset(to, 0, kBytes);
318
+ #endif
319
+ }
320
+
321
+ HWY_API void ZeroBytes(void* to, size_t num_bytes) {
322
+ #if HWY_COMPILER_MSVC
323
+ memset(to, 0, num_bytes);
324
+ #else
325
+ __builtin_memset(to, 0, num_bytes);
326
+ #endif
327
+ }
328
+
329
+ //------------------------------------------------------------------------------
330
+ // kMaxVectorSize (undocumented, pending removal)
331
+
332
+ #if HWY_ARCH_X86
333
+ static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 64; // AVX-512
334
+ #elif HWY_ARCH_RVV && defined(__riscv_v_intrinsic) && \
335
+ __riscv_v_intrinsic >= 11000
336
+ // Not actually an upper bound on the size.
337
+ static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 4096;
338
+ #else
339
+ static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
340
+ #endif
341
+
342
+ //------------------------------------------------------------------------------
343
+ // Alignment
344
+
345
+ // Potentially useful for LoadDup128 and capped vectors. In other cases, arrays
346
+ // should be allocated dynamically via aligned_allocator.h because Lanes() may
347
+ // exceed the stack size.
348
+ #if HWY_ARCH_X86
349
+ #define HWY_ALIGN_MAX alignas(64)
350
+ #elif HWY_ARCH_RVV && defined(__riscv_v_intrinsic) && \
351
+ __riscv_v_intrinsic >= 11000
352
+ #define HWY_ALIGN_MAX alignas(8) // only elements need be aligned
353
+ #else
354
+ #define HWY_ALIGN_MAX alignas(16)
355
+ #endif
356
+
357
+ //------------------------------------------------------------------------------
358
+ // Lane types
359
+
360
+ #pragma pack(push, 1)
361
+
362
+ // float16_t load/store/conversion intrinsics are always supported on Armv8 and
363
+ // VFPv4 (except with MSVC). On Armv7 Clang requires __ARM_FP & 2; GCC requires
364
+ // -mfp16-format=ieee.
365
+ #if (HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC) || \
366
+ (HWY_COMPILER_CLANG && defined(__ARM_FP) && (__ARM_FP & 2)) || \
367
+ (HWY_COMPILER_GCC_ACTUAL && defined(__ARM_FP16_FORMAT_IEEE))
368
+ #define HWY_NEON_HAVE_FLOAT16C 1
369
+ #else
370
+ #define HWY_NEON_HAVE_FLOAT16C 0
371
+ #endif
372
+
373
+ // C11 extension ISO/IEC TS 18661-3:2015 but not supported on all targets.
374
+ // Required if HWY_HAVE_FLOAT16, i.e. RVV with zvfh or AVX3_SPR (with
375
+ // sufficiently new compiler supporting avx512fp16). Do not use on clang-cl,
376
+ // which is missing __extendhfsf2.
377
+ #if ((HWY_ARCH_RVV && defined(__riscv_zvfh) && HWY_COMPILER_CLANG) || \
378
+ (HWY_ARCH_X86 && defined(__SSE2__) && \
379
+ ((HWY_COMPILER_CLANG >= 1600 && !HWY_COMPILER_CLANGCL) || \
380
+ HWY_COMPILER_GCC_ACTUAL >= 1200)))
381
+ #define HWY_HAVE_C11_FLOAT16 1
382
+ #else
383
+ #define HWY_HAVE_C11_FLOAT16 0
384
+ #endif
385
+
386
+ // If 1, both __bf16 and a limited set of *_bf16 SVE intrinsics are available:
387
+ // create/get/set/dup, ld/st, sel, rev, trn, uzp, zip.
388
+ #if HWY_ARCH_ARM_A64 && defined(__ARM_FEATURE_SVE_BF16)
389
+ #define HWY_SVE_HAVE_BFLOAT16 1
390
+ #else
391
+ #define HWY_SVE_HAVE_BFLOAT16 0
392
+ #endif
393
+
394
+ // Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
395
+ // by concatenating base type and bits. We use a wrapper class instead of a
396
+ // typedef to the native type to ensure that the same symbols, e.g. for VQSort,
397
+ // are generated regardless of F16 support; see #1684.
398
+ struct float16_t {
399
+ #if HWY_NEON_HAVE_FLOAT16C // ACLE's __fp16
400
+ using Raw = __fp16;
401
+ #elif HWY_HAVE_C11_FLOAT16 // C11 _Float16
402
+ using Raw = _Float16;
403
+ #elif __cplusplus > 202002L && defined(__STDCPP_FLOAT16_T__) // C++23
404
+ using Raw = std::float16_t;
405
+ #else
406
+ #define HWY_EMULATE_FLOAT16
407
+ using Raw = uint16_t;
408
+ Raw bits;
409
+ #endif // float16_t
410
+
411
+ // When backed by a native type, ensure the wrapper behaves like the native
412
+ // type by forwarding all operators. Unfortunately it seems difficult to reuse
413
+ // this code in a base class, so we repeat it in bfloat16_t.
414
+ #ifndef HWY_EMULATE_FLOAT16
415
+ Raw raw;
416
+
417
+ float16_t() noexcept = default;
418
+ template <typename T>
419
+ constexpr float16_t(T arg) noexcept : raw(static_cast<Raw>(arg)) {}
420
+ float16_t& operator=(Raw arg) noexcept {
421
+ raw = arg;
422
+ return *this;
423
+ }
424
+ constexpr float16_t(const float16_t&) noexcept = default;
425
+ float16_t& operator=(const float16_t&) noexcept = default;
426
+ constexpr operator Raw() const noexcept { return raw; }
427
+
428
+ template <typename T>
429
+ float16_t& operator+=(T rhs) noexcept {
430
+ raw = static_cast<Raw>(raw + rhs);
431
+ return *this;
432
+ }
433
+
434
+ template <typename T>
435
+ float16_t& operator-=(T rhs) noexcept {
436
+ raw = static_cast<Raw>(raw - rhs);
437
+ return *this;
438
+ }
439
+
440
+ template <typename T>
441
+ float16_t& operator*=(T rhs) noexcept {
442
+ raw = static_cast<Raw>(raw * rhs);
443
+ return *this;
444
+ }
445
+
446
+ template <typename T>
447
+ float16_t& operator/=(T rhs) noexcept {
448
+ raw = static_cast<Raw>(raw / rhs);
449
+ return *this;
450
+ }
451
+
452
+ float16_t operator--() noexcept {
453
+ raw = static_cast<Raw>(raw - Raw{1});
454
+ return *this;
455
+ }
456
+
457
+ float16_t operator--(int) noexcept {
458
+ raw = static_cast<Raw>(raw - Raw{1});
459
+ return *this;
460
+ }
461
+
462
+ float16_t operator++() noexcept {
463
+ raw = static_cast<Raw>(raw + Raw{1});
464
+ return *this;
465
+ }
466
+
467
+ float16_t operator++(int) noexcept {
468
+ raw = static_cast<Raw>(raw + Raw{1});
469
+ return *this;
470
+ }
471
+
472
+ constexpr float16_t operator-() const noexcept {
473
+ return float16_t(static_cast<Raw>(-raw));
474
+ }
475
+ constexpr float16_t operator+() const noexcept { return *this; }
476
+ #endif // HWY_EMULATE_FLOAT16
477
+ };
478
+
479
+ #ifndef HWY_EMULATE_FLOAT16
480
+ constexpr inline bool operator==(float16_t lhs, float16_t rhs) noexcept {
481
+ return lhs.raw == rhs.raw;
482
+ }
483
+ constexpr inline bool operator!=(float16_t lhs, float16_t rhs) noexcept {
484
+ return lhs.raw != rhs.raw;
485
+ }
486
+ constexpr inline bool operator<(float16_t lhs, float16_t rhs) noexcept {
487
+ return lhs.raw < rhs.raw;
488
+ }
489
+ constexpr inline bool operator<=(float16_t lhs, float16_t rhs) noexcept {
490
+ return lhs.raw <= rhs.raw;
491
+ }
492
+ constexpr inline bool operator>(float16_t lhs, float16_t rhs) noexcept {
493
+ return lhs.raw > rhs.raw;
494
+ }
495
+ constexpr inline bool operator>=(float16_t lhs, float16_t rhs) noexcept {
496
+ return lhs.raw >= rhs.raw;
497
+ }
498
+ #endif // HWY_EMULATE_FLOAT16
499
+
500
+ struct bfloat16_t {
501
+ #if HWY_SVE_HAVE_BFLOAT16
502
+ using Raw = __bf16;
503
+ #elif __cplusplus >= 202100L && defined(__STDCPP_BFLOAT16_T__) // C++23
504
+ using Raw = std::bfloat16_t;
505
+ #else
506
+ #define HWY_EMULATE_BFLOAT16
507
+ using Raw = uint16_t;
508
+ Raw bits;
509
+ #endif
510
+
511
+ #ifndef HWY_EMULATE_BFLOAT16
512
+ Raw raw;
513
+
514
+ bfloat16_t() noexcept = default;
515
+ template <typename T>
516
+ constexpr bfloat16_t(T arg) noexcept : raw(static_cast<Raw>(arg)) {}
517
+ bfloat16_t& operator=(Raw arg) noexcept {
518
+ raw = arg;
519
+ return *this;
520
+ }
521
+ constexpr bfloat16_t(const bfloat16_t&) noexcept = default;
522
+ bfloat16_t& operator=(const bfloat16_t&) noexcept = default;
523
+ constexpr operator Raw() const noexcept { return raw; }
524
+
525
+ template <typename T>
526
+ bfloat16_t& operator+=(T rhs) noexcept {
527
+ raw = static_cast<Raw>(raw + rhs);
528
+ return *this;
529
+ }
530
+
531
+ template <typename T>
532
+ bfloat16_t& operator-=(T rhs) noexcept {
533
+ raw = static_cast<Raw>(raw - rhs);
534
+ return *this;
535
+ }
536
+
537
+ template <typename T>
538
+ bfloat16_t& operator*=(T rhs) noexcept {
539
+ raw = static_cast<Raw>(raw * rhs);
540
+ return *this;
541
+ }
542
+
543
+ template <typename T>
544
+ bfloat16_t& operator/=(T rhs) noexcept {
545
+ raw = static_cast<Raw>(raw / rhs);
546
+ return *this;
547
+ }
548
+
549
+ bfloat16_t operator--() noexcept {
550
+ raw = static_cast<Raw>(raw - Raw{1});
551
+ return *this;
552
+ }
553
+
554
+ bfloat16_t operator--(int) noexcept {
555
+ raw = static_cast<Raw>(raw - Raw{1});
556
+ return *this;
557
+ }
558
+
559
+ bfloat16_t operator++() noexcept {
560
+ raw = static_cast<Raw>(raw + Raw{1});
561
+ return *this;
562
+ }
563
+
564
+ bfloat16_t operator++(int) noexcept {
565
+ raw = static_cast<Raw>(raw + Raw{1});
566
+ return *this;
567
+ }
568
+
569
+ constexpr bfloat16_t operator-() const noexcept {
570
+ return bfloat16_t(static_cast<Raw>(-raw));
571
+ }
572
+ constexpr bfloat16_t operator+() const noexcept { return *this; }
573
+ #endif // HWY_EMULATE_BFLOAT16
574
+ };
575
+
576
+ #ifndef HWY_EMULATE_BFLOAT16
577
+ constexpr inline bool operator==(bfloat16_t lhs, bfloat16_t rhs) noexcept {
578
+ return lhs.raw == rhs.raw;
579
+ }
580
+ constexpr inline bool operator!=(bfloat16_t lhs, bfloat16_t rhs) noexcept {
581
+ return lhs.raw != rhs.raw;
582
+ }
583
+ constexpr inline bool operator<(bfloat16_t lhs, bfloat16_t rhs) noexcept {
584
+ return lhs.raw < rhs.raw;
585
+ }
586
+ constexpr inline bool operator<=(bfloat16_t lhs, bfloat16_t rhs) noexcept {
587
+ return lhs.raw <= rhs.raw;
588
+ }
589
+ constexpr inline bool operator>(bfloat16_t lhs, bfloat16_t rhs) noexcept {
590
+ return lhs.raw > rhs.raw;
591
+ }
592
+ constexpr inline bool operator>=(bfloat16_t lhs, bfloat16_t rhs) noexcept {
593
+ return lhs.raw >= rhs.raw;
594
+ }
595
+ #endif // HWY_EMULATE_BFLOAT16
596
+
597
+ #pragma pack(pop)
598
+
599
+ HWY_API float F32FromF16(float16_t f16) {
600
+ #ifdef HWY_EMULATE_FLOAT16
601
+ uint16_t bits16;
602
+ CopySameSize(&f16, &bits16);
603
+ const uint32_t sign = static_cast<uint32_t>(bits16 >> 15);
604
+ const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
605
+ const uint32_t mantissa = bits16 & 0x3FF;
606
+
607
+ // Subnormal or zero
608
+ if (biased_exp == 0) {
609
+ const float subnormal =
610
+ (1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024));
611
+ return sign ? -subnormal : subnormal;
612
+ }
613
+
614
+ // Normalized: convert the representation directly (faster than ldexp/tables).
615
+ const uint32_t biased_exp32 = biased_exp + (127 - 15);
616
+ const uint32_t mantissa32 = mantissa << (23 - 10);
617
+ const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
618
+
619
+ float result;
620
+ CopySameSize(&bits32, &result);
621
+ return result;
622
+ #else
623
+ return static_cast<float>(f16);
624
+ #endif
625
+ }
626
+
627
+ HWY_API float16_t F16FromF32(float f32) {
628
+ #ifdef HWY_EMULATE_FLOAT16
629
+ uint32_t bits32;
630
+ CopySameSize(&f32, &bits32);
631
+ const uint32_t sign = bits32 >> 31;
632
+ const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
633
+ const uint32_t mantissa32 = bits32 & 0x7FFFFF;
634
+
635
+ const int32_t exp = HWY_MIN(static_cast<int32_t>(biased_exp32) - 127, 15);
636
+
637
+ // Tiny or zero => zero.
638
+ float16_t out;
639
+ if (exp < -24) {
640
+ // restore original sign
641
+ const uint16_t bits = static_cast<uint16_t>(sign << 15);
642
+ CopySameSize(&bits, &out);
643
+ return out;
644
+ }
645
+
646
+ uint32_t biased_exp16, mantissa16;
647
+
648
+ // exp = [-24, -15] => subnormal
649
+ if (exp < -14) {
650
+ biased_exp16 = 0;
651
+ const uint32_t sub_exp = static_cast<uint32_t>(-14 - exp);
652
+ HWY_DASSERT(1 <= sub_exp && sub_exp < 11);
653
+ mantissa16 = static_cast<uint32_t>((1u << (10 - sub_exp)) +
654
+ (mantissa32 >> (13 + sub_exp)));
655
+ } else {
656
+ // exp = [-14, 15]
657
+ biased_exp16 = static_cast<uint32_t>(exp + 15);
658
+ HWY_DASSERT(1 <= biased_exp16 && biased_exp16 < 31);
659
+ mantissa16 = mantissa32 >> 13;
660
+ }
661
+
662
+ HWY_DASSERT(mantissa16 < 1024);
663
+ const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
664
+ HWY_DASSERT(bits16 < 0x10000);
665
+ const uint16_t narrowed = static_cast<uint16_t>(bits16); // big-endian safe
666
+ CopySameSize(&narrowed, &out);
667
+ return out;
668
+ #else
669
+ return float16_t(static_cast<float16_t::Raw>(f32));
670
+ #endif
671
+ }
672
+
673
+ HWY_API float F32FromBF16(bfloat16_t bf) {
674
+ uint16_t bits16;
675
+ CopyBytes<2>(&bf, &bits16);
676
+ uint32_t bits = bits16;
677
+ bits <<= 16;
678
+ float f;
679
+ CopySameSize(&bits, &f);
680
+ return f;
681
+ }
682
+
683
+ HWY_API float F32FromF16Mem(const void* ptr) {
684
+ float16_t f16;
685
+ CopyBytes<2>(ptr, &f16);
686
+ return F32FromF16(f16);
687
+ }
688
+
689
+ HWY_API float F32FromBF16Mem(const void* ptr) {
690
+ bfloat16_t bf;
691
+ CopyBytes<2>(ptr, &bf);
692
+ return F32FromBF16(bf);
693
+ }
694
+
695
+ HWY_API bfloat16_t BF16FromF32(float f) {
696
+ uint32_t bits;
697
+ CopySameSize(&f, &bits);
698
+ const uint16_t bits16 = static_cast<uint16_t>(bits >> 16);
699
+ bfloat16_t bf;
700
+ CopySameSize(&bits16, &bf);
701
+ return bf;
702
+ }
703
+
704
+ using float32_t = float;
705
+ using float64_t = double;
706
+
707
+ #pragma pack(push, 1)
708
+
709
+ // Aligned 128-bit type. Cannot use __int128 because clang doesn't yet align it:
710
+ // https://reviews.llvm.org/D86310
711
+ struct alignas(16) uint128_t {
712
+ uint64_t lo; // little-endian layout
713
+ uint64_t hi;
714
+ };
715
+
716
+ // 64 bit key plus 64 bit value. Faster than using uint128_t when only the key
717
+ // field is to be compared (Lt128Upper instead of Lt128).
718
+ struct alignas(16) K64V64 {
719
+ uint64_t value; // little-endian layout
720
+ uint64_t key;
721
+ };
722
+
723
+ // 32 bit key plus 32 bit value. Allows vqsort recursions to terminate earlier
724
+ // than when considering both to be a 64-bit key.
725
+ struct alignas(8) K32V32 {
726
+ uint32_t value; // little-endian layout
727
+ uint32_t key;
728
+ };
729
+
730
+ #pragma pack(pop)
731
+
732
+ #ifdef HWY_EMULATE_FLOAT16
733
+
734
+ static inline HWY_MAYBE_UNUSED bool operator<(const float16_t& a,
735
+ const float16_t& b) {
736
+ return F32FromF16(a) < F32FromF16(b);
737
+ }
738
+ // Required for std::greater.
739
+ static inline HWY_MAYBE_UNUSED bool operator>(const float16_t& a,
740
+ const float16_t& b) {
741
+ return F32FromF16(a) > F32FromF16(b);
742
+ }
743
+ static inline HWY_MAYBE_UNUSED bool operator==(const float16_t& a,
744
+ const float16_t& b) {
745
+ return F32FromF16(a) == F32FromF16(b);
746
+ }
747
+
748
+ #endif // HWY_EMULATE_FLOAT16
749
+
750
+ static inline HWY_MAYBE_UNUSED bool operator<(const uint128_t& a,
751
+ const uint128_t& b) {
752
+ return (a.hi == b.hi) ? a.lo < b.lo : a.hi < b.hi;
753
+ }
754
+ // Required for std::greater.
755
+ static inline HWY_MAYBE_UNUSED bool operator>(const uint128_t& a,
756
+ const uint128_t& b) {
757
+ return b < a;
758
+ }
759
+ static inline HWY_MAYBE_UNUSED bool operator==(const uint128_t& a,
760
+ const uint128_t& b) {
761
+ return a.lo == b.lo && a.hi == b.hi;
762
+ }
763
+
764
// K64V64 comparisons consider only the key field; the value is ignored.
static inline HWY_MAYBE_UNUSED bool operator<(const K64V64& a,
                                              const K64V64& b) {
  return a.key < b.key;
}
// Required for std::greater.
static inline HWY_MAYBE_UNUSED bool operator>(const K64V64& a,
                                              const K64V64& b) {
  return b < a;
}
static inline HWY_MAYBE_UNUSED bool operator==(const K64V64& a,
                                               const K64V64& b) {
  return a.key == b.key;
}
777
+
778
// K32V32 comparisons consider only the key field; the value is ignored.
static inline HWY_MAYBE_UNUSED bool operator<(const K32V32& a,
                                              const K32V32& b) {
  return a.key < b.key;
}
// Required for std::greater.
static inline HWY_MAYBE_UNUSED bool operator>(const K32V32& a,
                                              const K32V32& b) {
  return b < a;
}
static inline HWY_MAYBE_UNUSED bool operator==(const K32V32& a,
                                               const K32V32& b) {
  return a.key == b.key;
}
791
+
792
//------------------------------------------------------------------------------
// Controlling overload resolution (SFINAE)

// Minimal equivalent of std::enable_if, avoiding a <type_traits> dependency.
template <bool Condition>
struct EnableIfT {};
template <>
struct EnableIfT<true> {
  using type = void;
};

template <bool Condition>
using EnableIf = typename EnableIfT<Condition>::type;

// Equivalent of std::is_same: value is 1 iff T and U are the same type.
template <typename T, typename U>
struct IsSameT {
  enum { value = 0 };
};

template <typename T>
struct IsSameT<T, T> {
  enum { value = 1 };
};

template <typename T, typename U>
HWY_API constexpr bool IsSame() {
  return IsSameT<T, U>::value;
}

// Equivalent of std::conditional: selects Then if Condition, else Else.
template <bool Condition, typename Then, typename Else>
struct IfT {
  using type = Then;
};

template <class Then, class Else>
struct IfT<false, Then, Else> {
  using type = Else;
};

template <bool Condition, typename Then, typename Else>
using If = typename IfT<Condition, Then, Else>::type;
832
+
833
// Insert into template/function arguments to enable this overload only for
// vectors of exactly, at most (LE), or more than (GT) this many bytes.
//
// As an example, checking for a total size of 16 bytes will match both
// Simd<uint8_t, 16, 0> and Simd<uint8_t, 8, 1>.
#define HWY_IF_V_SIZE(T, kN, bytes) \
  hwy::EnableIf<kN * sizeof(T) == bytes>* = nullptr
#define HWY_IF_V_SIZE_LE(T, kN, bytes) \
  hwy::EnableIf<kN * sizeof(T) <= bytes>* = nullptr
#define HWY_IF_V_SIZE_GT(T, kN, bytes) \
  hwy::EnableIf<(kN * sizeof(T) > bytes)>* = nullptr

// Enable only for the given number of lanes (exact, at most, more than).
#define HWY_IF_LANES(kN, lanes) hwy::EnableIf<(kN == lanes)>* = nullptr
#define HWY_IF_LANES_LE(kN, lanes) hwy::EnableIf<(kN <= lanes)>* = nullptr
#define HWY_IF_LANES_GT(kN, lanes) hwy::EnableIf<(kN > lanes)>* = nullptr

// Enable based on the lane type's category. Note that HWY_IF_SIGNED matches
// only signed *integers* (floats and special floats are excluded).
#define HWY_IF_UNSIGNED(T) hwy::EnableIf<!IsSigned<T>()>* = nullptr
#define HWY_IF_SIGNED(T)                                                   \
  hwy::EnableIf<IsSigned<T>() && !IsFloat<T>() && !IsSpecialFloat<T>()>* = \
      nullptr
#define HWY_IF_FLOAT(T) hwy::EnableIf<hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_NOT_FLOAT(T) hwy::EnableIf<!hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_FLOAT3264(T) hwy::EnableIf<hwy::IsFloat3264<T>()>* = nullptr
#define HWY_IF_NOT_FLOAT3264(T) hwy::EnableIf<!hwy::IsFloat3264<T>()>* = nullptr
#define HWY_IF_SPECIAL_FLOAT(T) \
  hwy::EnableIf<hwy::IsSpecialFloat<T>()>* = nullptr
#define HWY_IF_NOT_SPECIAL_FLOAT(T) \
  hwy::EnableIf<!hwy::IsSpecialFloat<T>()>* = nullptr
#define HWY_IF_FLOAT_OR_SPECIAL(T) \
  hwy::EnableIf<hwy::IsFloat<T>() || hwy::IsSpecialFloat<T>()>* = nullptr
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL(T) \
  hwy::EnableIf<!hwy::IsFloat<T>() && !hwy::IsSpecialFloat<T>()>* = nullptr

// Enable based on the lane type's size in bytes.
#define HWY_IF_T_SIZE(T, bytes) hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
#define HWY_IF_NOT_T_SIZE(T, bytes) \
  hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
// bit_array = 0x102 means 1 or 8 bytes. There is no NONE_OF because it sounds
// too similar. If you want the opposite of this (2 or 4 bytes), ask for those
// bits explicitly (0x14) instead of attempting to 'negate' 0x102.
#define HWY_IF_T_SIZE_ONE_OF(T, bit_array) \
  hwy::EnableIf<((size_t{1} << sizeof(T)) & (bit_array)) != 0>* = nullptr

// Use instead of HWY_IF_T_SIZE to avoid ambiguity with float16_t/float/double
// overloads.
#define HWY_IF_UI16(T) \
  hwy::EnableIf<IsSame<T, uint16_t>() || IsSame<T, int16_t>()>* = nullptr
#define HWY_IF_UI32(T) \
  hwy::EnableIf<IsSame<T, uint32_t>() || IsSame<T, int32_t>()>* = nullptr
#define HWY_IF_UI64(T) \
  hwy::EnableIf<IsSame<T, uint64_t>() || IsSame<T, int64_t>()>* = nullptr
#define HWY_IF_BF16(T) hwy::EnableIf<IsSame<T, hwy::bfloat16_t>()>* = nullptr
#define HWY_IF_F16(T) hwy::EnableIf<IsSame<T, hwy::float16_t>()>* = nullptr

// Enable only if one (<=16-byte) block of N lanes of T holds exactly LANES.
#define HWY_IF_LANES_PER_BLOCK(T, N, LANES) \
  hwy::EnableIf<HWY_MIN(sizeof(T) * N, 16) / sizeof(T) == (LANES)>* = nullptr
888
+
889
// Empty struct used as a size tag type.
template <size_t N>
struct SizeTag {};

// Minimal equivalents of std::remove_const / std::remove_reference, avoiding a
// <type_traits> dependency.
template <class T>
struct RemoveConstT {
  using type = T;
};
template <class T>
struct RemoveConstT<const T> {
  using type = T;
};

template <class T>
using RemoveConst = typename RemoveConstT<T>::type;

template <class T>
struct RemoveRefT {
  using type = T;
};
template <class T>
struct RemoveRefT<T&> {
  using type = T;
};
template <class T>
struct RemoveRefT<T&&> {
  using type = T;
};

template <class T>
using RemoveRef = typename RemoveRefT<T>::type;
920
+
921
//------------------------------------------------------------------------------
// Type relations

namespace detail {

// Per-lane-type metadata: same-size types of the other categories (Unsigned/
// Signed/Float, where such a type exists), the same-category type with double/
// half the bits (Wide/Narrow, where representable), plus category flags used
// by TypeTag. Only explicit specializations are provided.
template <typename T>
struct Relations;
template <>
struct Relations<uint8_t> {
  using Unsigned = uint8_t;
  using Signed = int8_t;
  using Wide = uint16_t;
  enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<int8_t> {
  using Unsigned = uint8_t;
  using Signed = int8_t;
  using Wide = int16_t;
  enum { is_signed = 1, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<uint16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Float = float16_t;
  using Wide = uint32_t;
  using Narrow = uint8_t;
  enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<int16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Float = float16_t;
  using Wide = int32_t;
  using Narrow = int8_t;
  enum { is_signed = 1, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<uint32_t> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
  using Wide = uint64_t;
  using Narrow = uint16_t;
  enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<int32_t> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
  using Wide = int64_t;
  using Narrow = int16_t;
  enum { is_signed = 1, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<uint64_t> {
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
  using Wide = uint128_t;
  using Narrow = uint32_t;
  enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<int64_t> {
  // No Wide: no signed 128-bit type is defined here.
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
  using Narrow = int32_t;
  enum { is_signed = 1, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<uint128_t> {
  using Unsigned = uint128_t;
  using Narrow = uint64_t;
  enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<float16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Float = float16_t;
  using Wide = float;
  enum { is_signed = 1, is_float = 1, is_bf16 = 0 };
};
template <>
struct Relations<bfloat16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Wide = float;
  enum { is_signed = 1, is_float = 1, is_bf16 = 1 };
};
template <>
struct Relations<float> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
  using Wide = double;
  using Narrow = float16_t;
  enum { is_signed = 1, is_float = 1, is_bf16 = 0 };
};
template <>
struct Relations<double> {
  // No Wide: no 128-bit floating-point type is defined here.
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
  using Narrow = float;
  enum { is_signed = 1, is_float = 1, is_bf16 = 0 };
};

// Inverse mapping: the unsigned/signed/float types of a given size in bytes
// (members are only defined where such a type exists).
template <size_t N>
struct TypeFromSize;
template <>
struct TypeFromSize<1> {
  using Unsigned = uint8_t;
  using Signed = int8_t;
};
template <>
struct TypeFromSize<2> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Float = float16_t;
};
template <>
struct TypeFromSize<4> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
};
template <>
struct TypeFromSize<8> {
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
};
template <>
struct TypeFromSize<16> {
  using Unsigned = uint128_t;
};

}  // namespace detail
1065
+
1066
// Aliases for types of a different category, but the same size.
template <typename T>
using MakeUnsigned = typename detail::Relations<T>::Unsigned;
template <typename T>
using MakeSigned = typename detail::Relations<T>::Signed;
template <typename T>
using MakeFloat = typename detail::Relations<T>::Float;

// Aliases for types of the same category, but different size.
template <typename T>
using MakeWide = typename detail::Relations<T>::Wide;
template <typename T>
using MakeNarrow = typename detail::Relations<T>::Narrow;

// Obtain type from its size [bytes].
template <size_t N>
using UnsignedFromSize = typename detail::TypeFromSize<N>::Unsigned;
template <size_t N>
using SignedFromSize = typename detail::TypeFromSize<N>::Signed;
template <size_t N>
using FloatFromSize = typename detail::TypeFromSize<N>::Float;

// Category tags for tag dispatch; values are in the upper byte.
// Avoid confusion with SizeTag where the parameter is a lane size.
using UnsignedTag = SizeTag<0>;
using SignedTag = SizeTag<0x100>;  // integer
using FloatTag = SizeTag<0x200>;
using SpecialTag = SizeTag<0x300>;
1093
+
1094
// Returns the category tag of T (Unsigned/Signed/Float/Special): the flags in
// detail::Relations sum to 0..3, shifted into the tags' 0x000..0x300 range.
template <typename T, class R = detail::Relations<T>>
constexpr auto TypeTag()
    -> hwy::SizeTag<((R::is_signed + R::is_float + R::is_bf16) << 8)> {
  return hwy::SizeTag<((R::is_signed + R::is_float + R::is_bf16) << 8)>();
}

// For when we only want to distinguish FloatTag from everything else.
using NonFloatTag = SizeTag<0x400>;

// Returns FloatTag (0x200) for float types, else NonFloatTag (0x400).
template <typename T, class R = detail::Relations<T>>
constexpr auto IsFloatTag() -> hwy::SizeTag<(R::is_float ? 0x200 : 0x400)> {
  return hwy::SizeTag<(R::is_float ? 0x200 : 0x400)>();
}
1107
+
1108
//------------------------------------------------------------------------------
// Type traits

// True only for the native binary32/binary64 types.
template <typename T>
HWY_API constexpr bool IsFloat3264() {
  return IsSame<T, float>() || IsSame<T, double>();
}

template <typename T>
HWY_API constexpr bool IsFloat() {
  // Cannot use T(1.25) != T(1) for float16_t, which can only be converted to or
  // from a float, not compared. Include float16_t in case HWY_HAVE_FLOAT16=1.
  return IsSame<T, float16_t>() || IsFloat3264<T>();
}

// These types are often special-cased and not supported in all ops.
template <typename T>
HWY_API constexpr bool IsSpecialFloat() {
  return IsSame<T, float16_t>() || IsSame<T, bfloat16_t>();
}

template <typename T>
HWY_API constexpr bool IsSigned() {
  return T(0) > T(-1);
}
// The generic comparison above is not usable for the 16-bit float wrappers
// (see the comment in IsFloat), so specialize: both formats are signed.
template <>
constexpr bool IsSigned<float16_t>() {
  return true;
}
template <>
constexpr bool IsSigned<bfloat16_t>() {
  return true;
}
1141
+
1142
// Largest/smallest representable integer values.
template <typename T>
HWY_API constexpr T LimitsMax() {
  static_assert(!IsFloat<T>(), "Only for integer types");
  using TU = MakeUnsigned<T>;
  // All bits set for unsigned; all but the sign bit for signed.
  return static_cast<T>(IsSigned<T>() ? (static_cast<TU>(~0ull) >> 1)
                                      : static_cast<TU>(~0ull));
}
template <typename T>
HWY_API constexpr T LimitsMin() {
  static_assert(!IsFloat<T>(), "Only for integer types");
  // For signed T this is -(max + 1), computed without overflowing.
  return IsSigned<T>() ? T(-1) - LimitsMax<T>() : T(0);
}
1155
+
1156
// Largest/smallest representable value (integer or float). This naming avoids
// confusion with numeric_limits<float>::min() (the smallest positive value).
// Cannot be constexpr because we use CopySameSize for [b]float16_t.
template <typename T>
HWY_API T LowestValue() {
  return LimitsMin<T>();
}
template <>
HWY_INLINE bfloat16_t LowestValue<bfloat16_t>() {
  const uint16_t kBits = 0xFF7F;  // -1.1111111 x 2^127
  bfloat16_t ret;
  CopySameSize(&kBits, &ret);
  return ret;
}
template <>
HWY_INLINE float16_t LowestValue<float16_t>() {
  const uint16_t kBits = 0xFBFF;  // -1.1111111111 x 2^15
  float16_t ret;
  CopySameSize(&kBits, &ret);
  return ret;
}
template <>
HWY_INLINE float LowestValue<float>() {
  return -3.402823466e+38F;
}
template <>
HWY_INLINE double LowestValue<double>() {
  return -1.7976931348623158e+308;
}

template <typename T>
HWY_API T HighestValue() {
  return LimitsMax<T>();
}
template <>
HWY_INLINE bfloat16_t HighestValue<bfloat16_t>() {
  const uint16_t kBits = 0x7F7F;  // 1.1111111 x 2^127
  bfloat16_t ret;
  CopySameSize(&kBits, &ret);
  return ret;
}
template <>
HWY_INLINE float16_t HighestValue<float16_t>() {
  const uint16_t kBits = 0x7BFF;  // 1.1111111111 x 2^15
  float16_t ret;
  CopySameSize(&kBits, &ret);
  return ret;
}
template <>
HWY_INLINE float HighestValue<float>() {
  return 3.402823466e+38F;
}
template <>
HWY_INLINE double HighestValue<double>() {
  return 1.7976931348623158e+308;
}
1212
+
1213
// Difference between 1.0 and the next representable value. Equal to
// 1 / (1ULL << MantissaBits<T>()), but hard-coding ensures precision.
// Integer fallback returns 1 (the smallest positive difference).
template <typename T>
HWY_API T Epsilon() {
  return 1;
}
template <>
HWY_INLINE bfloat16_t Epsilon<bfloat16_t>() {
  const uint16_t kBits = 0x3C00;  // 0.0078125
  bfloat16_t ret;
  CopySameSize(&kBits, &ret);
  return ret;
}
template <>
HWY_INLINE float16_t Epsilon<float16_t>() {
  const uint16_t kBits = 0x1400;  // 0.0009765625
  float16_t ret;
  CopySameSize(&kBits, &ret);
  return ret;
}
template <>
HWY_INLINE float Epsilon<float>() {
  return 1.192092896e-7f;
}
template <>
HWY_INLINE double Epsilon<double>() {
  return 2.2204460492503131e-16;
}
1241
+
1242
// Returns width in bits of the mantissa field in IEEE binary16/32/64.
// The unspecialized template is deliberately not instantiable.
template <typename T>
constexpr int MantissaBits() {
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
  return 0;
}
template <>
constexpr int MantissaBits<bfloat16_t>() {
  return 7;
}
template <>
constexpr int MantissaBits<float16_t>() {
  return 10;
}
template <>
constexpr int MantissaBits<float>() {
  return 23;
}
template <>
constexpr int MantissaBits<double>() {
  return 52;
}
1264
+
1265
// Returns the (left-shifted by one bit) IEEE binary16/32/64 representation with
// the largest possible (biased) exponent field. Used by IsInf.
template <typename T>
constexpr MakeSigned<T> MaxExponentTimes2() {
  return -(MakeSigned<T>{1} << (MantissaBits<T>() + 1));
}

// Returns bitmask of the sign bit in IEEE binary16/32/64.
template <typename T>
constexpr MakeUnsigned<T> SignMask() {
  return MakeUnsigned<T>{1} << (sizeof(T) * 8 - 1);
}

// Returns bitmask of the exponent field in IEEE binary16/32/64.
// ~(1 << M) + 1 negates (1 << M), setting all bits at and above the mantissa;
// clearing the sign bit then leaves exactly the exponent bits.
template <typename T>
constexpr MakeUnsigned<T> ExponentMask() {
  return (~(MakeUnsigned<T>{1} << MantissaBits<T>()) + 1) & ~SignMask<T>();
}

// Returns bitmask of the mantissa field in IEEE binary16/32/64.
template <typename T>
constexpr MakeUnsigned<T> MantissaMask() {
  return (MakeUnsigned<T>{1} << MantissaBits<T>()) - 1;
}
1289
+
1290
// Returns 1 << mantissa_bits as a floating-point number. All integers whose
// absolute value are less than this can be represented exactly.
// The unspecialized template is deliberately not instantiable.
template <typename T>
HWY_INLINE T MantissaEnd() {
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
  return 0;
}
template <>
HWY_INLINE bfloat16_t MantissaEnd<bfloat16_t>() {
  const uint16_t kBits = 0x4300;  // 1.0 x 2^7
  bfloat16_t ret;
  CopySameSize(&kBits, &ret);
  return ret;
}
template <>
HWY_INLINE float16_t MantissaEnd<float16_t>() {
  const uint16_t kBits = 0x6400;  // 1.0 x 2^10
  float16_t ret;
  CopySameSize(&kBits, &ret);
  return ret;
}
template <>
HWY_INLINE float MantissaEnd<float>() {
  return 8388608.0f;  // 1 << 23
}
template <>
HWY_INLINE double MantissaEnd<double>() {
  // floating point literal with p52 requires C++17.
  return 4503599627370496.0;  // 1 << 52
}
1320
+
1321
// Returns width in bits of the exponent field in IEEE binary16/32/64.
template <typename T>
constexpr int ExponentBits() {
  // Exponent := remaining bits after deducting sign and mantissa.
  return 8 * sizeof(T) - 1 - MantissaBits<T>();
}

// Returns largest value of the biased exponent field in IEEE binary16/32/64
// (all exponent bits set),
// right-shifted so that the LSB is bit zero. Example: 0xFF for float.
// This is expressed as a signed integer for more efficient comparison.
template <typename T>
constexpr MakeSigned<T> MaxExponentField() {
  return (MakeSigned<T>{1} << ExponentBits<T>()) - 1;
}
1335
+
1336
//------------------------------------------------------------------------------
// Helper functions

// Integer division, rounding the quotient up toward +infinity.
template <typename T1, typename T2>
constexpr inline T1 DivCeil(T1 dividend, T2 divisor) {
  return (dividend + divisor - 1) / divisor;
}

// Works for any `align`; if a power of two, compiler emits ADD+AND.
constexpr inline size_t RoundUpTo(size_t amount, size_t alignment) {
  return DivCeil(amount, alignment) * alignment;
}
1348
+
1349
// Returns the number of trailing zero bits, i.e. the index of the lowest set
// bit. Undefined results for x == 0.
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
#if HWY_COMPILER_MSVC
  unsigned long index;  // NOLINT
  _BitScanForward(&index, x);
  return index;
#else   // HWY_COMPILER_MSVC
  return static_cast<size_t>(__builtin_ctz(x));
#endif  // HWY_COMPILER_MSVC
}

HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) {
#if HWY_COMPILER_MSVC
#if HWY_ARCH_X86_64
  unsigned long index;  // NOLINT
  _BitScanForward64(&index, x);
  return index;
#else   // HWY_ARCH_X86_64
  // _BitScanForward64 not available: scan the two 32-bit halves, lower first.
  uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
  unsigned long index;  // NOLINT
  if (lsb == 0) {
    uint32_t msb = static_cast<uint32_t>(x >> 32u);
    _BitScanForward(&index, msb);
    return 32 + index;
  } else {
    _BitScanForward(&index, lsb);
    return index;
  }
#endif  // HWY_ARCH_X86_64
#else   // HWY_COMPILER_MSVC
  return static_cast<size_t>(__builtin_ctzll(x));
#endif  // HWY_COMPILER_MSVC
}
1383
+
1384
// Returns the number of leading zero bits above the highest set bit.
// Undefined results for x == 0.
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x) {
#if HWY_COMPILER_MSVC
  unsigned long index;  // NOLINT
  // _BitScanReverse returns the bit index; convert to a leading-zero count.
  _BitScanReverse(&index, x);
  return 31 - index;
#else   // HWY_COMPILER_MSVC
  return static_cast<size_t>(__builtin_clz(x));
#endif  // HWY_COMPILER_MSVC
}

HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
#if HWY_COMPILER_MSVC
#if HWY_ARCH_X86_64
  unsigned long index;  // NOLINT
  _BitScanReverse64(&index, x);
  return 63 - index;
#else   // HWY_ARCH_X86_64
  // _BitScanReverse64 not available: scan the two 32-bit halves, upper first.
  const uint32_t msb = static_cast<uint32_t>(x >> 32u);
  unsigned long index;  // NOLINT
  if (msb == 0) {
    const uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
    _BitScanReverse(&index, lsb);
    return 63 - index;
  } else {
    _BitScanReverse(&index, msb);
    return 31 - index;
  }
#endif  // HWY_ARCH_X86_64
#else   // HWY_COMPILER_MSVC
  return static_cast<size_t>(__builtin_clzll(x));
#endif  // HWY_COMPILER_MSVC
}
1418
+
1419
// Returns the number of set bits in x.
HWY_API size_t PopCount(uint64_t x) {
#if HWY_COMPILER_GCC  // includes clang
  return static_cast<size_t>(__builtin_popcountll(x));
  // This instruction has a separate feature flag, but is often called from
  // non-SIMD code, so we don't want to require dynamic dispatch. It was first
  // supported by Intel in Nehalem (SSE4.2), but MSVC only predefines a macro
  // for AVX, so check for that.
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 && defined(__AVX__)
  return _mm_popcnt_u64(x);
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
  return _mm_popcnt_u32(static_cast<uint32_t>(x & 0xFFFFFFFFu)) +
         _mm_popcnt_u32(static_cast<uint32_t>(x >> 32));
#else
  // Portable bit-parallel fallback: sum bits in progressively wider fields.
  x -= ((x >> 1) & 0x5555555555555555ULL);
  x = (((x >> 2) & 0x3333333333333333ULL) + (x & 0x3333333333333333ULL));
  x = (((x >> 4) + x) & 0x0F0F0F0F0F0F0F0FULL);
  x += (x >> 8);
  x += (x >> 16);
  x += (x >> 32);
  return static_cast<size_t>(x & 0x7Fu);
#endif
}
1441
+
1442
// Skip HWY_API due to GCC "function not considered for inlining". Previously
// such errors were caused by underlying type mismatches, but it's not clear
// what is still mismatched despite all the casts.
// Returns floor(log2(x)). The recursion only terminates for x >= 1.
template <typename TI>
/*HWY_API*/ constexpr size_t FloorLog2(TI x) {
  return x == TI{1}
             ? 0
             : static_cast<size_t>(FloorLog2(static_cast<TI>(x >> 1)) + 1);
}

// Returns ceil(log2(x)), via FloorLog2(x - 1) + 1 for x > 1.
template <typename TI>
/*HWY_API*/ constexpr size_t CeilLog2(TI x) {
  return x == TI{1}
             ? 0
             : static_cast<size_t>(FloorLog2(static_cast<TI>(x - 1)) + 1);
}
1458
+
1459
// Adds n to t; for floating-point T, a plain addition suffices.
template <typename T>
HWY_INLINE constexpr T AddWithWraparound(hwy::FloatTag /*tag*/, T t, size_t n) {
  return t + static_cast<T>(n);
}

// Adds n to t with modular wraparound: computing in the unsigned type avoids
// signed-overflow undefined behavior.
template <typename T>
HWY_INLINE constexpr T AddWithWraparound(hwy::NonFloatTag /*tag*/, T t,
                                         size_t n) {
  using TU = MakeUnsigned<T>;
  return static_cast<T>(
      static_cast<TU>(static_cast<TU>(t) + static_cast<TU>(n)) &
      hwy::LimitsMax<TU>());
}
1472
+
1473
#if HWY_COMPILER_MSVC && HWY_ARCH_X86_64
#pragma intrinsic(_umul128)
#endif

// 64 x 64 = 128 bit multiplication.
// Returns the lower 64 bits of a * b and stores the upper 64 bits in *upper.
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) {
#if defined(__SIZEOF_INT128__)
  __uint128_t product = (__uint128_t)a * (__uint128_t)b;
  *upper = (uint64_t)(product >> 64);
  return (uint64_t)(product & 0xFFFFFFFFFFFFFFFFULL);
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
  return _umul128(a, b, upper);
#else
  // Portable fallback: four 32x32->64 partial products, summed with carries.
  constexpr uint64_t kLo32 = 0xFFFFFFFFU;
  const uint64_t lo_lo = (a & kLo32) * (b & kLo32);
  const uint64_t hi_lo = (a >> 32) * (b & kLo32);
  const uint64_t lo_hi = (a & kLo32) * (b >> 32);
  const uint64_t hi_hi = (a >> 32) * (b >> 32);
  const uint64_t t = (lo_lo >> 32) + (hi_lo & kLo32) + lo_hi;
  *upper = (hi_lo >> 32) + (t >> 32) + hi_hi;
  return (t << 32) | (lo_lo & kLo32);
#endif
}
1496
+
1497
// Prevents the compiler from eliding the computations that led to "output".
template <class T>
HWY_API void PreventElision(T&& output) {
#if HWY_COMPILER_MSVC
  // MSVC does not support inline assembly anymore (and never supported GCC's
  // RTL constraints). Self-assignment with #pragma optimize("off") might be
  // expected to prevent elision, but it does not with MSVC 2015. Type-punning
  // with volatile pointers generates inefficient code on MSVC 2017.
  // Storing into a static atomic is an observable side effect the optimizer
  // must preserve.
  static std::atomic<RemoveRef<T>> dummy;
  dummy.store(output, std::memory_order_relaxed);
#else
  // Works by indicating to the compiler that "output" is being read and
  // modified. The +r constraint avoids unnecessary writes to memory, but only
  // works for built-in types (typically FuncOutput).
  asm volatile("" : "+r"(output) : : "memory");
#endif
}
1514
+
1515
+ } // namespace hwy
1516
+
1517
+ #endif // HIGHWAY_HWY_BASE_H_