@img/sharp-libvips-dev 1.2.1 → 1.2.2-rc.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. package/include/aom/aom_decoder.h +1 -1
  2. package/include/aom/aom_encoder.h +2 -0
  3. package/include/aom/aomcx.h +106 -25
  4. package/include/ffi.h +3 -3
  5. package/include/freetype2/freetype/config/ftconfig.h +1 -1
  6. package/include/freetype2/freetype/config/ftheader.h +1 -1
  7. package/include/freetype2/freetype/config/ftoption.h +37 -12
  8. package/include/freetype2/freetype/config/ftstdlib.h +1 -1
  9. package/include/freetype2/freetype/config/integer-types.h +29 -2
  10. package/include/freetype2/freetype/config/mac-support.h +1 -1
  11. package/include/freetype2/freetype/config/public-macros.h +3 -3
  12. package/include/freetype2/freetype/freetype.h +51 -47
  13. package/include/freetype2/freetype/ftadvanc.h +1 -1
  14. package/include/freetype2/freetype/ftbbox.h +1 -1
  15. package/include/freetype2/freetype/ftbdf.h +1 -1
  16. package/include/freetype2/freetype/ftbitmap.h +1 -1
  17. package/include/freetype2/freetype/ftbzip2.h +1 -1
  18. package/include/freetype2/freetype/ftcache.h +1 -1
  19. package/include/freetype2/freetype/ftcid.h +1 -1
  20. package/include/freetype2/freetype/ftcolor.h +13 -4
  21. package/include/freetype2/freetype/ftdriver.h +3 -3
  22. package/include/freetype2/freetype/fterrdef.h +1 -1
  23. package/include/freetype2/freetype/fterrors.h +1 -1
  24. package/include/freetype2/freetype/ftfntfmt.h +1 -1
  25. package/include/freetype2/freetype/ftgasp.h +1 -1
  26. package/include/freetype2/freetype/ftglyph.h +1 -1
  27. package/include/freetype2/freetype/ftgxval.h +1 -1
  28. package/include/freetype2/freetype/ftgzip.h +1 -1
  29. package/include/freetype2/freetype/ftimage.h +6 -2
  30. package/include/freetype2/freetype/ftincrem.h +1 -1
  31. package/include/freetype2/freetype/ftlcdfil.h +1 -1
  32. package/include/freetype2/freetype/ftlist.h +1 -1
  33. package/include/freetype2/freetype/ftlogging.h +184 -0
  34. package/include/freetype2/freetype/ftlzw.h +1 -1
  35. package/include/freetype2/freetype/ftmac.h +1 -1
  36. package/include/freetype2/freetype/ftmm.h +159 -103
  37. package/include/freetype2/freetype/ftmodapi.h +1 -1
  38. package/include/freetype2/freetype/ftmoderr.h +1 -1
  39. package/include/freetype2/freetype/ftotval.h +1 -1
  40. package/include/freetype2/freetype/ftoutln.h +1 -1
  41. package/include/freetype2/freetype/ftparams.h +1 -1
  42. package/include/freetype2/freetype/ftpfr.h +1 -1
  43. package/include/freetype2/freetype/ftrender.h +1 -1
  44. package/include/freetype2/freetype/ftsizes.h +1 -1
  45. package/include/freetype2/freetype/ftsnames.h +1 -1
  46. package/include/freetype2/freetype/ftstroke.h +1 -1
  47. package/include/freetype2/freetype/ftsynth.h +1 -1
  48. package/include/freetype2/freetype/ftsystem.h +1 -1
  49. package/include/freetype2/freetype/fttrigon.h +1 -1
  50. package/include/freetype2/freetype/fttypes.h +1 -1
  51. package/include/freetype2/freetype/ftwinfnt.h +2 -3
  52. package/include/freetype2/freetype/otsvg.h +1 -1
  53. package/include/freetype2/freetype/t1tables.h +1 -1
  54. package/include/freetype2/freetype/ttnameid.h +129 -129
  55. package/include/freetype2/freetype/tttables.h +8 -5
  56. package/include/freetype2/freetype/tttags.h +1 -1
  57. package/include/freetype2/ft2build.h +1 -1
  58. package/include/glib-2.0/gio/gdbuserror.h +9 -8
  59. package/include/glib-2.0/gio/ginetaddress.h +12 -0
  60. package/include/glib-2.0/gio/gioenums.h +9 -2
  61. package/include/glib-2.0/glib/gstring.h +2 -2
  62. package/include/glib-2.0/glib/gunicode.h +1 -1
  63. package/include/glib-2.0/gobject/glib-types.h +1 -1
  64. package/include/glib-2.0/gobject/gparam.h +1 -1
  65. package/include/glib-2.0/gobject/gvalue.h +78 -35
  66. package/include/harfbuzz/hb-script-list.h +12 -0
  67. package/include/harfbuzz/hb-version.h +3 -3
  68. package/include/hwy/abort.h +2 -19
  69. package/include/hwy/aligned_allocator.h +11 -7
  70. package/include/hwy/auto_tune.h +504 -0
  71. package/include/hwy/base.h +425 -104
  72. package/include/hwy/cache_control.h +16 -0
  73. package/include/hwy/detect_compiler_arch.h +32 -1
  74. package/include/hwy/detect_targets.h +251 -67
  75. package/include/hwy/foreach_target.h +35 -0
  76. package/include/hwy/highway.h +185 -76
  77. package/include/hwy/nanobenchmark.h +1 -19
  78. package/include/hwy/ops/arm_neon-inl.h +969 -458
  79. package/include/hwy/ops/arm_sve-inl.h +1137 -359
  80. package/include/hwy/ops/emu128-inl.h +97 -11
  81. package/include/hwy/ops/generic_ops-inl.h +1222 -34
  82. package/include/hwy/ops/loongarch_lasx-inl.h +4664 -0
  83. package/include/hwy/ops/loongarch_lsx-inl.h +5933 -0
  84. package/include/hwy/ops/ppc_vsx-inl.h +306 -126
  85. package/include/hwy/ops/rvv-inl.h +546 -51
  86. package/include/hwy/ops/scalar-inl.h +77 -22
  87. package/include/hwy/ops/set_macros-inl.h +138 -17
  88. package/include/hwy/ops/shared-inl.h +50 -10
  89. package/include/hwy/ops/wasm_128-inl.h +137 -92
  90. package/include/hwy/ops/x86_128-inl.h +773 -214
  91. package/include/hwy/ops/x86_256-inl.h +712 -255
  92. package/include/hwy/ops/x86_512-inl.h +429 -753
  93. package/include/hwy/ops/x86_avx3-inl.h +501 -0
  94. package/include/hwy/per_target.h +2 -1
  95. package/include/hwy/profiler.h +622 -486
  96. package/include/hwy/targets.h +62 -20
  97. package/include/hwy/timer-inl.h +8 -160
  98. package/include/hwy/timer.h +170 -3
  99. package/include/hwy/x86_cpuid.h +81 -0
  100. package/include/libheif/heif_cxx.h +25 -5
  101. package/include/libheif/heif_regions.h +5 -5
  102. package/include/libheif/heif_version.h +2 -2
  103. package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
  104. package/include/libxml2/libxml/valid.h +0 -3
  105. package/include/libxml2/libxml/xmlerror.h +1 -1
  106. package/include/libxml2/libxml/xmlversion.h +4 -4
  107. package/include/pango-1.0/pango/pango-enum-types.h +3 -0
  108. package/include/pango-1.0/pango/pango-features.h +3 -3
  109. package/include/pango-1.0/pango/pango-font.h +30 -0
  110. package/include/pango-1.0/pango/pango-version-macros.h +26 -0
  111. package/include/vips/connection.h +4 -4
  112. package/include/vips/version.h +4 -4
  113. package/include/zlib.h +3 -3
  114. package/package.json +1 -1
  115. package/versions.json +13 -13
@@ -21,13 +21,21 @@
21
21
  // IWYU pragma: begin_exports
22
22
  #include <stddef.h>
23
23
  #include <stdint.h>
24
+ #if defined(HWY_HEADER_ONLY)
25
+ #include <cstdarg>
26
+ #include <cstdio>
27
+ #endif
28
+
29
+ #if !defined(HWY_NO_LIBCXX)
30
+ #include <ostream>
31
+ #endif
24
32
 
25
33
  #include "hwy/detect_compiler_arch.h"
26
34
  #include "hwy/highway_export.h"
27
35
 
28
36
  // API version (https://semver.org/); keep in sync with CMakeLists.txt.
29
37
  #define HWY_MAJOR 1
30
- #define HWY_MINOR 2
38
+ #define HWY_MINOR 3
31
39
  #define HWY_PATCH 0
32
40
 
33
41
  // True if the Highway version >= major.minor.0. Added in 1.2.0.
@@ -47,12 +55,12 @@
47
55
  #include <inttypes.h>
48
56
  #endif
49
57
 
58
+ #endif // !HWY_IDE
59
+
50
60
  #if (HWY_ARCH_X86 && !defined(HWY_NO_LIBCXX)) || HWY_COMPILER_MSVC
51
61
  #include <atomic>
52
62
  #endif
53
63
 
54
- #endif // !HWY_IDE
55
-
56
64
  #ifndef HWY_HAVE_COMPARE_HEADER // allow override
57
65
  #define HWY_HAVE_COMPARE_HEADER 0
58
66
  #if defined(__has_include) // note: wrapper macro fails on Clang ~17
@@ -97,6 +105,7 @@
97
105
  #define HWY_NORETURN __declspec(noreturn)
98
106
  #define HWY_LIKELY(expr) (expr)
99
107
  #define HWY_UNLIKELY(expr) (expr)
108
+ #define HWY_UNREACHABLE __assume(false)
100
109
  #define HWY_PRAGMA(tokens) __pragma(tokens)
101
110
  #define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(warning(tokens))
102
111
  #define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(msc)
@@ -124,6 +133,11 @@
124
133
  #define HWY_NORETURN __attribute__((noreturn))
125
134
  #define HWY_LIKELY(expr) __builtin_expect(!!(expr), 1)
126
135
  #define HWY_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
136
+ #if HWY_COMPILER_GCC || HWY_HAS_BUILTIN(__builtin_unreachable)
137
+ #define HWY_UNREACHABLE __builtin_unreachable()
138
+ #else
139
+ #define HWY_UNREACHABLE
140
+ #endif
127
141
  #define HWY_PRAGMA(tokens) _Pragma(#tokens)
128
142
  #define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(GCC diagnostic tokens)
129
143
  #define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(gcc)
@@ -161,7 +175,8 @@ namespace hwy {
161
175
  // Returns a pointer whose type is `type` (T*), while allowing the compiler to
162
176
  // assume that the untyped pointer `ptr` is aligned to a multiple of alignof(T).
163
177
  #define HWY_RCAST_ALIGNED(type, ptr) \
164
- reinterpret_cast<type>(HWY_ASSUME_ALIGNED((ptr), alignof(RemovePtr<type>)))
178
+ reinterpret_cast<type>( \
179
+ HWY_ASSUME_ALIGNED((ptr), alignof(hwy::RemovePtr<type>)))
165
180
 
166
181
  // Clang and GCC require attributes on each function into which SIMD intrinsics
167
182
  // are inlined. Support both per-function annotation (HWY_ATTR) for lambdas and
@@ -217,7 +232,7 @@ namespace hwy {
217
232
  // Better:
218
233
  // HWY_ASSUME(x == 2);
219
234
  // HWY_ASSUME(y == 3);
220
- #if HWY_HAS_CPP_ATTRIBUTE(assume)
235
+ #if (HWY_CXX_LANG >= 202302L) && HWY_HAS_CPP_ATTRIBUTE(assume)
221
236
  #define HWY_ASSUME(expr) [[assume(expr)]]
222
237
  #elif HWY_COMPILER_MSVC || HWY_COMPILER_ICC
223
238
  #define HWY_ASSUME(expr) __assume(expr)
@@ -233,32 +248,106 @@ namespace hwy {
233
248
  #define HWY_ASSUME(expr) static_cast<void>(0)
234
249
  #endif
235
250
 
236
- // Compile-time fence to prevent undesirable code reordering. On Clang x86, the
237
- // typical asm volatile("" : : : "memory") has no effect, whereas atomic fence
238
- // does, without generating code.
239
- #if HWY_ARCH_X86 && !defined(HWY_NO_LIBCXX)
240
- #define HWY_FENCE std::atomic_thread_fence(std::memory_order_acq_rel)
251
+ // Compile-time fence to prevent undesirable code reordering. On Clang, the
252
+ // typical `asm volatile("" : : : "memory")` seems to be ignored. Note that
253
+ // `std::atomic_thread_fence` affects other threads, hence might generate a
254
+ // barrier instruction, whereas `std::atomic_signal_fence` does not.
255
+ #if !defined(HWY_NO_LIBCXX)
256
+ #define HWY_FENCE std::atomic_signal_fence(std::memory_order_seq_cst)
257
+ #elif HWY_COMPILER_GCC
258
+ #define HWY_FENCE asm volatile("" : : : "memory")
241
259
  #else
242
- // TODO(janwas): investigate alternatives. On Arm, the above generates barriers.
243
260
  #define HWY_FENCE
244
261
  #endif
245
262
 
246
263
  // 4 instances of a given literal value, useful as input to LoadDup128.
247
264
  #define HWY_REP4(literal) literal, literal, literal, literal
248
265
 
266
+ //------------------------------------------------------------------------------
267
+ // Abort / Warn
268
+
269
+ #if defined(HWY_HEADER_ONLY)
270
+ HWY_DLLEXPORT inline void HWY_FORMAT(3, 4)
271
+ Warn(const char* file, int line, const char* format, ...) {
272
+ char buf[800];
273
+ va_list args;
274
+ va_start(args, format);
275
+ vsnprintf(buf, sizeof(buf), format, args);
276
+ va_end(args);
277
+
278
+ fprintf(stderr, "Warn at %s:%d: %s\n", file, line, buf);
279
+ }
280
+
281
+ HWY_DLLEXPORT HWY_NORETURN inline void HWY_FORMAT(3, 4)
282
+ Abort(const char* file, int line, const char* format, ...) {
283
+ char buf[800];
284
+ va_list args;
285
+ va_start(args, format);
286
+ vsnprintf(buf, sizeof(buf), format, args);
287
+ va_end(args);
288
+
289
+ fprintf(stderr, "Abort at %s:%d: %s\n", file, line, buf);
290
+
291
+ fflush(stderr);
292
+
293
+ // Now terminate the program:
294
+ #if HWY_ARCH_RISCV
295
+ exit(1); // trap/abort just freeze Spike.
296
+ #else
297
+ abort(); // Compile error without this due to HWY_NORETURN.
298
+ #endif
299
+ }
300
+ #else // !HWY_HEADER_ONLY
301
+ // Interfaces for custom Warn/Abort handlers.
302
+ typedef void (*WarnFunc)(const char* file, int line, const char* message);
303
+
304
+ typedef void (*AbortFunc)(const char* file, int line, const char* message);
305
+
306
+ // Returns current Warn() handler, or nullptr if no handler was yet registered,
307
+ // indicating Highway should print to stderr.
308
+ // DEPRECATED because this is thread-hostile and prone to misuse (modifying the
309
+ // underlying pointer through the reference).
310
+ HWY_DLLEXPORT WarnFunc& GetWarnFunc();
311
+
312
+ // Returns current Abort() handler, or nullptr if no handler was yet registered,
313
+ // indicating Highway should print to stderr and abort.
314
+ // DEPRECATED because this is thread-hostile and prone to misuse (modifying the
315
+ // underlying pointer through the reference).
316
+ HWY_DLLEXPORT AbortFunc& GetAbortFunc();
317
+
318
+ // Sets a new Warn() handler and returns the previous handler, which is nullptr
319
+ // if no previous handler was registered, and should otherwise be called from
320
+ // the new handler. Thread-safe.
321
+ HWY_DLLEXPORT WarnFunc SetWarnFunc(WarnFunc func);
322
+
323
+ // Sets a new Abort() handler and returns the previous handler, which is nullptr
324
+ // if no previous handler was registered, and should otherwise be called from
325
+ // the new handler. If all handlers return, then Highway will terminate the app.
326
+ // Thread-safe.
327
+ HWY_DLLEXPORT AbortFunc SetAbortFunc(AbortFunc func);
328
+
329
+ HWY_DLLEXPORT void HWY_FORMAT(3, 4)
330
+ Warn(const char* file, int line, const char* format, ...);
331
+
249
332
  HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
250
333
  Abort(const char* file, int line, const char* format, ...);
251
334
 
335
+ #endif // HWY_HEADER_ONLY
336
+
337
+ #define HWY_WARN(format, ...) \
338
+ ::hwy::Warn(__FILE__, __LINE__, format, ##__VA_ARGS__)
339
+
252
340
  #define HWY_ABORT(format, ...) \
253
341
  ::hwy::Abort(__FILE__, __LINE__, format, ##__VA_ARGS__)
254
342
 
255
343
  // Always enabled.
256
- #define HWY_ASSERT(condition) \
257
- do { \
258
- if (!(condition)) { \
259
- HWY_ABORT("Assert %s", #condition); \
260
- } \
344
+ #define HWY_ASSERT_M(condition, msg) \
345
+ do { \
346
+ if (!(condition)) { \
347
+ HWY_ABORT("Assert %s: %s", #condition, msg); \
348
+ } \
261
349
  } while (0)
350
+ #define HWY_ASSERT(condition) HWY_ASSERT_M(condition, "")
262
351
 
263
352
  #if HWY_HAS_FEATURE(memory_sanitizer) || defined(MEMORY_SANITIZER) || \
264
353
  defined(__SANITIZE_MEMORY__)
@@ -303,12 +392,17 @@ HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
303
392
  #define HWY_ATTR_NO_MSAN
304
393
  #endif
305
394
 
395
+ #if HWY_IS_ASAN || HWY_IS_HWASAN || HWY_IS_MSAN || HWY_IS_TSAN || HWY_IS_UBSAN
396
+ #define HWY_IS_SANITIZER 1
397
+ #else
398
+ #define HWY_IS_SANITIZER 0
399
+ #endif
400
+
306
401
  // For enabling HWY_DASSERT and shortening tests in slower debug builds
307
402
  #if !defined(HWY_IS_DEBUG_BUILD)
308
403
  // Clang does not define NDEBUG, but it and GCC define __OPTIMIZE__, and recent
309
404
  // MSVC defines NDEBUG (if not, could instead check _DEBUG).
310
- #if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) || HWY_IS_ASAN || \
311
- HWY_IS_HWASAN || HWY_IS_MSAN || HWY_IS_TSAN || HWY_IS_UBSAN || \
405
+ #if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) || HWY_IS_SANITIZER || \
312
406
  defined(__clang_analyzer__)
313
407
  #define HWY_IS_DEBUG_BUILD 1
314
408
  #else
@@ -317,8 +411,12 @@ HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
317
411
  #endif // HWY_IS_DEBUG_BUILD
318
412
 
319
413
  #if HWY_IS_DEBUG_BUILD
320
- #define HWY_DASSERT(condition) HWY_ASSERT(condition)
414
+ #define HWY_DASSERT_M(condition, msg) HWY_ASSERT_M(condition, msg)
415
+ #define HWY_DASSERT(condition) HWY_ASSERT_M(condition, "")
321
416
  #else
417
+ #define HWY_DASSERT_M(condition, msg) \
418
+ do { \
419
+ } while (0)
322
420
  #define HWY_DASSERT(condition) \
323
421
  do { \
324
422
  } while (0)
@@ -453,6 +551,13 @@ static inline HWY_MAYBE_UNUSED bool operator==(const uint128_t& a,
453
551
  return a.lo == b.lo && a.hi == b.hi;
454
552
  }
455
553
 
554
+ #if !defined(HWY_NO_LIBCXX)
555
+ static inline HWY_MAYBE_UNUSED std::ostream& operator<<(std::ostream& os,
556
+ const uint128_t& n) {
557
+ return os << "[hi=" << n.hi << ",lo=" << n.lo << "]";
558
+ }
559
+ #endif
560
+
456
561
  static inline HWY_MAYBE_UNUSED bool operator<(const K64V64& a,
457
562
  const K64V64& b) {
458
563
  return a.key < b.key;
@@ -467,6 +572,13 @@ static inline HWY_MAYBE_UNUSED bool operator==(const K64V64& a,
467
572
  return a.key == b.key;
468
573
  }
469
574
 
575
+ #if !defined(HWY_NO_LIBCXX)
576
+ static inline HWY_MAYBE_UNUSED std::ostream& operator<<(std::ostream& os,
577
+ const K64V64& n) {
578
+ return os << "[k=" << n.key << ",v=" << n.value << "]";
579
+ }
580
+ #endif
581
+
470
582
  static inline HWY_MAYBE_UNUSED bool operator<(const K32V32& a,
471
583
  const K32V32& b) {
472
584
  return a.key < b.key;
@@ -481,6 +593,13 @@ static inline HWY_MAYBE_UNUSED bool operator==(const K32V32& a,
481
593
  return a.key == b.key;
482
594
  }
483
595
 
596
+ #if !defined(HWY_NO_LIBCXX)
597
+ static inline HWY_MAYBE_UNUSED std::ostream& operator<<(std::ostream& os,
598
+ const K32V32& n) {
599
+ return os << "[k=" << n.key << ",v=" << n.value << "]";
600
+ }
601
+ #endif
602
+
484
603
  //------------------------------------------------------------------------------
485
604
  // Controlling overload resolution (SFINAE)
486
605
 
@@ -882,78 +1001,87 @@ HWY_INLINE constexpr bool IsIntegerLaneType<uint64_t>() {
882
1001
  return true;
883
1002
  }
884
1003
 
1004
+ namespace detail {
1005
+
885
1006
  template <class T>
886
- HWY_API constexpr bool IsInteger() {
887
- // NOTE: Do not add a IsInteger<wchar_t>() specialization below as it is
1007
+ static HWY_INLINE constexpr bool IsNonCvInteger() {
1008
+ // NOTE: Do not add an IsNonCvInteger<wchar_t>() specialization below as it is
888
1009
  // possible for IsSame<wchar_t, uint16_t>() to be true when compiled with MSVC
889
1010
  // with the /Zc:wchar_t- option.
890
- return IsIntegerLaneType<T>() || IsSame<RemoveCvRef<T>, wchar_t>() ||
891
- IsSameEither<RemoveCvRef<T>, size_t, ptrdiff_t>() ||
892
- IsSameEither<RemoveCvRef<T>, intptr_t, uintptr_t>();
1011
+ return IsIntegerLaneType<T>() || IsSame<T, wchar_t>() ||
1012
+ IsSameEither<T, size_t, ptrdiff_t>() ||
1013
+ IsSameEither<T, intptr_t, uintptr_t>();
893
1014
  }
894
1015
  template <>
895
- HWY_INLINE constexpr bool IsInteger<bool>() {
1016
+ HWY_INLINE constexpr bool IsNonCvInteger<bool>() {
896
1017
  return true;
897
1018
  }
898
1019
  template <>
899
- HWY_INLINE constexpr bool IsInteger<char>() {
1020
+ HWY_INLINE constexpr bool IsNonCvInteger<char>() {
900
1021
  return true;
901
1022
  }
902
1023
  template <>
903
- HWY_INLINE constexpr bool IsInteger<signed char>() {
1024
+ HWY_INLINE constexpr bool IsNonCvInteger<signed char>() {
904
1025
  return true;
905
1026
  }
906
1027
  template <>
907
- HWY_INLINE constexpr bool IsInteger<unsigned char>() {
1028
+ HWY_INLINE constexpr bool IsNonCvInteger<unsigned char>() {
908
1029
  return true;
909
1030
  }
910
1031
  template <>
911
- HWY_INLINE constexpr bool IsInteger<short>() { // NOLINT
1032
+ HWY_INLINE constexpr bool IsNonCvInteger<short>() { // NOLINT
912
1033
  return true;
913
1034
  }
914
1035
  template <>
915
- HWY_INLINE constexpr bool IsInteger<unsigned short>() { // NOLINT
1036
+ HWY_INLINE constexpr bool IsNonCvInteger<unsigned short>() { // NOLINT
916
1037
  return true;
917
1038
  }
918
1039
  template <>
919
- HWY_INLINE constexpr bool IsInteger<int>() {
1040
+ HWY_INLINE constexpr bool IsNonCvInteger<int>() {
920
1041
  return true;
921
1042
  }
922
1043
  template <>
923
- HWY_INLINE constexpr bool IsInteger<unsigned>() {
1044
+ HWY_INLINE constexpr bool IsNonCvInteger<unsigned>() {
924
1045
  return true;
925
1046
  }
926
1047
  template <>
927
- HWY_INLINE constexpr bool IsInteger<long>() { // NOLINT
1048
+ HWY_INLINE constexpr bool IsNonCvInteger<long>() { // NOLINT
928
1049
  return true;
929
1050
  }
930
1051
  template <>
931
- HWY_INLINE constexpr bool IsInteger<unsigned long>() { // NOLINT
1052
+ HWY_INLINE constexpr bool IsNonCvInteger<unsigned long>() { // NOLINT
932
1053
  return true;
933
1054
  }
934
1055
  template <>
935
- HWY_INLINE constexpr bool IsInteger<long long>() { // NOLINT
1056
+ HWY_INLINE constexpr bool IsNonCvInteger<long long>() { // NOLINT
936
1057
  return true;
937
1058
  }
938
1059
  template <>
939
- HWY_INLINE constexpr bool IsInteger<unsigned long long>() { // NOLINT
1060
+ HWY_INLINE constexpr bool IsNonCvInteger<unsigned long long>() { // NOLINT
940
1061
  return true;
941
1062
  }
942
1063
  #if defined(__cpp_char8_t) && __cpp_char8_t >= 201811L
943
1064
  template <>
944
- HWY_INLINE constexpr bool IsInteger<char8_t>() {
1065
+ HWY_INLINE constexpr bool IsNonCvInteger<char8_t>() {
945
1066
  return true;
946
1067
  }
947
1068
  #endif
948
1069
  template <>
949
- HWY_INLINE constexpr bool IsInteger<char16_t>() {
1070
+ HWY_INLINE constexpr bool IsNonCvInteger<char16_t>() {
950
1071
  return true;
951
1072
  }
952
1073
  template <>
953
- HWY_INLINE constexpr bool IsInteger<char32_t>() {
1074
+ HWY_INLINE constexpr bool IsNonCvInteger<char32_t>() {
954
1075
  return true;
955
1076
  }
956
1077
 
1078
+ } // namespace detail
1079
+
1080
+ template <class T>
1081
+ HWY_API constexpr bool IsInteger() {
1082
+ return detail::IsNonCvInteger<RemoveCvRef<T>>();
1083
+ }
1084
+
957
1085
  // -----------------------------------------------------------------------------
958
1086
  // BitCastScalar
959
1087
 
@@ -1042,6 +1170,7 @@ HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From& val) {
1042
1170
 
1043
1171
  #pragma pack(push, 1)
1044
1172
 
1173
+ #ifndef HWY_NEON_HAVE_F16C // allow override
1045
1174
  // Compiler supports __fp16 and load/store/conversion NEON intrinsics, which are
1046
1175
  // included in Armv8 and VFPv4 (except with MSVC). On Armv7 Clang requires
1047
1176
  // __ARM_FP & 2 whereas Armv7 GCC requires -mfp16-format=ieee.
@@ -1052,6 +1181,7 @@ HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From& val) {
1052
1181
  #else
1053
1182
  #define HWY_NEON_HAVE_F16C 0
1054
1183
  #endif
1184
+ #endif // HWY_NEON_HAVE_F16C
1055
1185
 
1056
1186
  // RVV with f16 extension supports _Float16 and f16 vector ops. If set, implies
1057
1187
  // HWY_HAVE_FLOAT16.
@@ -1071,9 +1201,10 @@ HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From& val) {
1071
1201
  #define HWY_SSE2_HAVE_F16_TYPE 0
1072
1202
  #endif
1073
1203
 
1074
- #ifndef HWY_HAVE_SCALAR_F16_TYPE
1204
+ #ifndef HWY_HAVE_SCALAR_F16_TYPE // allow override
1075
1205
  // Compiler supports _Float16, not necessarily with operators.
1076
- #if HWY_NEON_HAVE_F16C || HWY_RVV_HAVE_F16_VEC || HWY_SSE2_HAVE_F16_TYPE
1206
+ #if HWY_NEON_HAVE_F16C || HWY_RVV_HAVE_F16_VEC || HWY_SSE2_HAVE_F16_TYPE || \
1207
+ __SPIRV_DEVICE__
1077
1208
  #define HWY_HAVE_SCALAR_F16_TYPE 1
1078
1209
  #else
1079
1210
  #define HWY_HAVE_SCALAR_F16_TYPE 0
@@ -1125,17 +1256,19 @@ using NativeSpecialFloatToWrapper =
1125
1256
  // are generated regardless of F16 support; see #1684.
1126
1257
  struct alignas(2) float16_t {
1127
1258
  #if HWY_HAVE_SCALAR_F16_TYPE
1128
- #if HWY_RVV_HAVE_F16_VEC || HWY_SSE2_HAVE_F16_TYPE
1259
+ #if HWY_RVV_HAVE_F16_VEC || HWY_SSE2_HAVE_F16_TYPE || __SPIRV_DEVICE__
1129
1260
  using Native = _Float16;
1130
1261
  #elif HWY_NEON_HAVE_F16C
1131
1262
  using Native = __fp16;
1132
1263
  #else
1133
1264
  #error "Logic error: condition should be 'all but NEON_HAVE_F16C'"
1134
1265
  #endif
1266
+ #elif HWY_IDE
1267
+ using Native = uint16_t;
1135
1268
  #endif // HWY_HAVE_SCALAR_F16_TYPE
1136
1269
 
1137
1270
  union {
1138
- #if HWY_HAVE_SCALAR_F16_TYPE
1271
+ #if HWY_HAVE_SCALAR_F16_TYPE || HWY_IDE
1139
1272
  // Accessed via NativeLaneType, and used directly if
1140
1273
  // HWY_HAVE_SCALAR_F16_OPERATORS.
1141
1274
  Native native;
@@ -1581,9 +1714,13 @@ HWY_F16_CONSTEXPR inline std::partial_ordering operator<=>(
1581
1714
  #endif
1582
1715
 
1583
1716
  // x86 compiler supports __bf16, not necessarily with operators.
1717
+ // Disable in debug builds due to clang miscompiles as of 2025-07-22: casting
1718
+ // bf16 <-> f32 in convert_test results in 0x2525 for 1.0 instead of 0x3f80.
1719
+ // Reported at https://github.com/llvm/llvm-project/issues/151692.
1584
1720
  #ifndef HWY_SSE2_HAVE_SCALAR_BF16_TYPE
1585
- #if HWY_ARCH_X86 && defined(__SSE2__) && \
1586
- ((HWY_COMPILER_CLANG >= 1700 && !HWY_COMPILER_CLANGCL) || \
1721
+ #if HWY_ARCH_X86 && defined(__SSE2__) && \
1722
+ ((HWY_COMPILER_CLANG >= 1700 && !HWY_COMPILER_CLANGCL && \
1723
+ !HWY_IS_DEBUG_BUILD) || \
1587
1724
  HWY_COMPILER_GCC_ACTUAL >= 1300)
1588
1725
  #define HWY_SSE2_HAVE_SCALAR_BF16_TYPE 1
1589
1726
  #else
@@ -1617,10 +1754,12 @@ HWY_F16_CONSTEXPR inline std::partial_ordering operator<=>(
1617
1754
  struct alignas(2) bfloat16_t {
1618
1755
  #if HWY_HAVE_SCALAR_BF16_TYPE
1619
1756
  using Native = __bf16;
1757
+ #elif HWY_IDE
1758
+ using Native = uint16_t;
1620
1759
  #endif
1621
1760
 
1622
1761
  union {
1623
- #if HWY_HAVE_SCALAR_BF16_TYPE
1762
+ #if HWY_HAVE_SCALAR_BF16_TYPE || HWY_IDE
1624
1763
  // Accessed via NativeLaneType, and used directly if
1625
1764
  // HWY_HAVE_SCALAR_BF16_OPERATORS.
1626
1765
  Native native;
@@ -1637,7 +1776,7 @@ struct alignas(2) bfloat16_t {
1637
1776
  bfloat16_t& operator=(const bfloat16_t& arg) noexcept = default;
1638
1777
 
1639
1778
  // Only enable implicit conversions if we have a native type.
1640
- #if HWY_HAVE_SCALAR_BF16_TYPE
1779
+ #if HWY_HAVE_SCALAR_BF16_TYPE || HWY_IDE
1641
1780
  constexpr bfloat16_t(Native arg) noexcept : native(arg) {}
1642
1781
  constexpr operator Native() const noexcept { return native; }
1643
1782
  #endif
@@ -1818,38 +1957,33 @@ static HWY_INLINE HWY_MAYBE_UNUSED constexpr uint32_t F32BitsToBF16RoundIncr(
1818
1957
  : 0u);
1819
1958
  }
1820
1959
 
1960
+ // If f32_bits is the bit representation of a NaN F32 value, make sure that
1961
+ // bit 6 of the BF16 result is set to convert SNaN F32 values to QNaN BF16
1962
+ // values and to prevent NaN F32 values from being converted to an infinite
1963
+ // BF16 value.
1964
+ static HWY_INLINE constexpr uint32_t BF16BitsIfSNAN(uint32_t f32_bits) {
1965
+ return ((f32_bits & 0x7FFFFFFFu) > 0x7F800000u) ? (uint32_t{1} << 6) : 0;
1966
+ }
1967
+
1821
1968
  // Converts f32_bits (which is the bits of a F32 value) to BF16 bits,
1822
1969
  // rounded to the nearest BF16 value
1823
1970
  static HWY_INLINE HWY_MAYBE_UNUSED constexpr uint16_t F32BitsToBF16Bits(
1824
1971
  const uint32_t f32_bits) {
1825
- // Round f32_bits to the nearest BF16 by first adding
1826
- // F32BitsToBF16RoundIncr(f32_bits) to f32_bits and then right shifting
1827
- // f32_bits + F32BitsToBF16RoundIncr(f32_bits) by 16
1828
-
1829
- // If f32_bits is the bit representation of a NaN F32 value, make sure that
1830
- // bit 6 of the BF16 result is set to convert SNaN F32 values to QNaN BF16
1831
- // values and to prevent NaN F32 values from being converted to an infinite
1832
- // BF16 value
1833
1972
  return static_cast<uint16_t>(
1834
- ((f32_bits + F32BitsToBF16RoundIncr(f32_bits)) >> 16) |
1835
- (static_cast<uint32_t>((f32_bits & 0x7FFFFFFFu) > 0x7F800000u) << 6));
1973
+ BF16BitsIfSNAN(f32_bits) |
1974
+ ((f32_bits + F32BitsToBF16RoundIncr(f32_bits)) >> 16));
1836
1975
  }
1837
1976
 
1838
1977
  } // namespace detail
1839
1978
 
1840
1979
  HWY_API HWY_BF16_CONSTEXPR bfloat16_t BF16FromF32(float f) {
1841
- #if HWY_HAVE_SCALAR_BF16_OPERATORS
1842
- return static_cast<bfloat16_t>(f);
1843
- #else
1980
+ // The rounding mode is not specified in the C++ standard, so ignore
1981
+ // `HWY_HAVE_SCALAR_BF16_OPERATORS` and only use our round to nearest.
1844
1982
  return bfloat16_t::FromBits(
1845
1983
  detail::F32BitsToBF16Bits(BitCastScalar<uint32_t>(f)));
1846
- #endif
1847
1984
  }
1848
1985
 
1849
1986
  HWY_API HWY_BF16_CONSTEXPR bfloat16_t BF16FromF64(double f64) {
1850
- #if HWY_HAVE_SCALAR_BF16_OPERATORS
1851
- return static_cast<bfloat16_t>(f64);
1852
- #else
1853
1987
  // The mantissa bits of f64 are first rounded using round-to-odd rounding
1854
1988
  // to the nearest f64 value that has the lower 38 bits zeroed out to
1855
1989
  // ensure that the result is correctly rounded to a BF16.
@@ -1885,7 +2019,6 @@ HWY_API HWY_BF16_CONSTEXPR bfloat16_t BF16FromF64(double f64) {
1885
2019
  (BitCastScalar<uint64_t>(f64) & 0xFFFFFFC000000000ULL) |
1886
2020
  ((BitCastScalar<uint64_t>(f64) + 0x0000003FFFFFFFFFULL) &
1887
2021
  0x0000004000000000ULL)))));
1888
- #endif
1889
2022
  }
1890
2023
 
1891
2024
  // More convenient to define outside bfloat16_t because these may use
@@ -2178,6 +2311,11 @@ constexpr bool IsSigned<hwy::K32V32>() {
2178
2311
  return false;
2179
2312
  }
2180
2313
 
2314
+ template <typename T>
2315
+ HWY_API constexpr bool IsUnsigned() {
2316
+ return IsInteger<T>() && !IsSigned<T>();
2317
+ }
2318
+
2181
2319
  template <typename T, bool = IsInteger<T>() && !IsIntegerLaneType<T>()>
2182
2320
  struct MakeLaneTypeIfIntegerT {
2183
2321
  using type = T;
@@ -2364,6 +2502,45 @@ constexpr MakeSigned<T> MaxExponentField() {
2364
2502
  return (MakeSigned<T>{1} << ExponentBits<T>()) - 1;
2365
2503
  }
2366
2504
 
2505
+ namespace detail {
2506
+
2507
+ template <typename T>
2508
+ static HWY_INLINE HWY_MAYBE_UNUSED HWY_BITCASTSCALAR_CONSTEXPR T
2509
+ NegativeInfOrLowestValue(hwy::FloatTag /* tag */) {
2510
+ return BitCastScalar<T>(
2511
+ static_cast<MakeUnsigned<T>>(SignMask<T>() | ExponentMask<T>()));
2512
+ }
2513
+
2514
+ template <typename T>
2515
+ static HWY_INLINE HWY_MAYBE_UNUSED HWY_BITCASTSCALAR_CONSTEXPR T
2516
+ NegativeInfOrLowestValue(hwy::NonFloatTag /* tag */) {
2517
+ return LowestValue<T>();
2518
+ }
2519
+
2520
+ template <typename T>
2521
+ static HWY_INLINE HWY_MAYBE_UNUSED HWY_BITCASTSCALAR_CONSTEXPR T
2522
+ PositiveInfOrHighestValue(hwy::FloatTag /* tag */) {
2523
+ return BitCastScalar<T>(ExponentMask<T>());
2524
+ }
2525
+
2526
+ template <typename T>
2527
+ static HWY_INLINE HWY_MAYBE_UNUSED HWY_BITCASTSCALAR_CONSTEXPR T
2528
+ PositiveInfOrHighestValue(hwy::NonFloatTag /* tag */) {
2529
+ return HighestValue<T>();
2530
+ }
2531
+
2532
+ } // namespace detail
2533
+
2534
+ template <typename T>
2535
+ HWY_API HWY_BITCASTSCALAR_CONSTEXPR T NegativeInfOrLowestValue() {
2536
+ return detail::NegativeInfOrLowestValue<T>(IsFloatTag<T>());
2537
+ }
2538
+
2539
+ template <typename T>
2540
+ HWY_API HWY_BITCASTSCALAR_CONSTEXPR T PositiveInfOrHighestValue() {
2541
+ return detail::PositiveInfOrHighestValue<T>(IsFloatTag<T>());
2542
+ }
2543
+
2367
2544
  //------------------------------------------------------------------------------
2368
2545
  // Additional F16/BF16 operators
2369
2546
 
@@ -2381,6 +2558,17 @@ constexpr MakeSigned<T> MaxExponentField() {
2381
2558
  return static_cast<ResultT>(a op b.native); \
2382
2559
  }
2383
2560
 
2561
+ #define HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(op, assign_op, T2) \
2562
+ template <typename T1, \
2563
+ hwy::EnableIf<hwy::IsInteger<RemoveCvRef<T1>>() || \
2564
+ hwy::IsFloat3264<RemoveCvRef<T1>>()>* = nullptr, \
2565
+ typename ResultT = \
2566
+ decltype(DeclVal<T1&>() assign_op DeclVal<T2::Native>())> \
2567
+ static HWY_INLINE constexpr ResultT operator assign_op(T1& a, \
2568
+ T2 b) noexcept { \
2569
+ return (a assign_op b.native); \
2570
+ }
2571
+
2384
2572
  #define HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(op, op_func, T1) \
2385
2573
  HWY_RHS_SPECIAL_FLOAT_ARITH_OP(op, op_func, T1) \
2386
2574
  template < \
@@ -2399,6 +2587,10 @@ HWY_RHS_SPECIAL_FLOAT_ARITH_OP(+, operator+, float16_t)
2399
2587
  HWY_RHS_SPECIAL_FLOAT_ARITH_OP(-, operator-, float16_t)
2400
2588
  HWY_RHS_SPECIAL_FLOAT_ARITH_OP(*, operator*, float16_t)
2401
2589
  HWY_RHS_SPECIAL_FLOAT_ARITH_OP(/, operator/, float16_t)
2590
+ HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(+, +=, float16_t)
2591
+ HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(-, -=, float16_t)
2592
+ HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(*, *=, float16_t)
2593
+ HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(/, /=, float16_t)
2402
2594
  HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(==, operator==, float16_t)
2403
2595
  HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(!=, operator!=, float16_t)
2404
2596
  HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<, operator<, float16_t)
@@ -2415,6 +2607,10 @@ HWY_RHS_SPECIAL_FLOAT_ARITH_OP(+, operator+, bfloat16_t)
2415
2607
  HWY_RHS_SPECIAL_FLOAT_ARITH_OP(-, operator-, bfloat16_t)
2416
2608
  HWY_RHS_SPECIAL_FLOAT_ARITH_OP(*, operator*, bfloat16_t)
2417
2609
  HWY_RHS_SPECIAL_FLOAT_ARITH_OP(/, operator/, bfloat16_t)
2610
+ HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(+, +=, bfloat16_t)
2611
+ HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(-, -=, bfloat16_t)
2612
+ HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(*, *=, bfloat16_t)
2613
+ HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(/, /=, bfloat16_t)
2418
2614
  HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(==, operator==, bfloat16_t)
2419
2615
  HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(!=, operator!=, bfloat16_t)
2420
2616
  HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<, operator<, bfloat16_t)
@@ -2427,6 +2623,7 @@ HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=>, operator<=>, bfloat16_t)
2427
2623
  #endif // HWY_HAVE_SCALAR_BF16_OPERATORS
2428
2624
 
2429
2625
  #undef HWY_RHS_SPECIAL_FLOAT_ARITH_OP
2626
+ #undef HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP
2430
2627
  #undef HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP
2431
2628
 
2432
2629
  #endif // HWY_HAVE_SCALAR_F16_OPERATORS || HWY_HAVE_SCALAR_BF16_OPERATORS
@@ -2452,53 +2649,83 @@ HWY_API float F32FromBF16Mem(const void* ptr) {
2452
2649
  #define HWY_BF16_TO_F16_CONSTEXPR HWY_F16_CONSTEXPR
2453
2650
  #endif
2454
2651
 
2455
- // For casting from TFrom to TTo
2456
- template <typename TTo, typename TFrom, HWY_IF_NOT_SPECIAL_FLOAT(TTo),
2457
- HWY_IF_NOT_SPECIAL_FLOAT(TFrom), HWY_IF_NOT_SAME(TTo, TFrom)>
2458
- HWY_API constexpr TTo ConvertScalarTo(const TFrom in) {
2459
- return static_cast<TTo>(in);
2460
- }
2461
- template <typename TTo, typename TFrom, HWY_IF_F16(TTo),
2462
- HWY_IF_NOT_SPECIAL_FLOAT(TFrom), HWY_IF_NOT_SAME(TFrom, double)>
2463
- HWY_API constexpr TTo ConvertScalarTo(const TFrom in) {
2464
- return F16FromF32(static_cast<float>(in));
2652
+ namespace detail {
2653
+
2654
+ template <class TTo, class TFrom>
2655
+ static HWY_INLINE HWY_MAYBE_UNUSED constexpr TTo ConvertScalarToResult(
2656
+ hwy::SizeTag<0> /*conv_to_tag*/, TFrom in) {
2657
+ return static_cast<TTo>(static_cast<TFrom>(in));
2465
2658
  }
2466
- template <typename TTo, HWY_IF_F16(TTo)>
2467
- HWY_API HWY_BF16_TO_F16_CONSTEXPR TTo
2468
- ConvertScalarTo(const hwy::bfloat16_t in) {
2469
- return F16FromF32(F32FromBF16(in));
2659
+
2660
+ template <class TTo>
2661
+ static HWY_INLINE HWY_MAYBE_UNUSED HWY_F16_CONSTEXPR TTo
2662
+ ConvertScalarToResult(hwy::FloatTag /*conv_to_tag*/, float in) {
2663
+ return F16FromF32(in);
2470
2664
  }
2471
- template <typename TTo, HWY_IF_F16(TTo)>
2472
- HWY_API HWY_F16_CONSTEXPR TTo ConvertScalarTo(const double in) {
2665
+
2666
+ template <class TTo>
2667
+ static HWY_INLINE HWY_MAYBE_UNUSED HWY_F16_CONSTEXPR TTo
2668
+ ConvertScalarToResult(hwy::FloatTag /*conv_to_tag*/, double in) {
2473
2669
  return F16FromF64(in);
2474
2670
  }
2475
- template <typename TTo, typename TFrom, HWY_IF_BF16(TTo),
2476
- HWY_IF_NOT_SPECIAL_FLOAT(TFrom), HWY_IF_NOT_SAME(TFrom, double)>
2477
- HWY_API HWY_BF16_CONSTEXPR TTo ConvertScalarTo(const TFrom in) {
2478
- return BF16FromF32(static_cast<float>(in));
2479
- }
2480
- template <typename TTo, HWY_IF_BF16(TTo)>
2481
- HWY_API HWY_BF16_TO_F16_CONSTEXPR TTo ConvertScalarTo(const hwy::float16_t in) {
2482
- return BF16FromF32(F32FromF16(in));
2671
+
2672
+ template <class TTo>
2673
+ static HWY_INLINE HWY_MAYBE_UNUSED HWY_BF16_CONSTEXPR TTo
2674
+ ConvertScalarToResult(hwy::SpecialTag /*conv_to_tag*/, float in) {
2675
+ return BF16FromF32(in);
2483
2676
  }
2484
- template <typename TTo, HWY_IF_BF16(TTo)>
2485
- HWY_API HWY_BF16_CONSTEXPR TTo ConvertScalarTo(const double in) {
2677
+
2678
+ template <class TTo>
2679
+ static HWY_INLINE HWY_MAYBE_UNUSED HWY_BF16_CONSTEXPR TTo
2680
+ ConvertScalarToResult(hwy::SpecialTag /*conv_to_tag*/, double in) {
2486
2681
  return BF16FromF64(in);
2487
2682
  }
2488
- template <typename TTo, typename TFrom, HWY_IF_F16(TFrom),
2489
- HWY_IF_NOT_SPECIAL_FLOAT(TTo)>
2490
- HWY_API HWY_F16_CONSTEXPR TTo ConvertScalarTo(const TFrom in) {
2491
- return static_cast<TTo>(F32FromF16(in));
2683
+
2684
+ template <class TFrom, HWY_IF_BF16(TFrom)>
2685
+ static HWY_INLINE HWY_MAYBE_UNUSED HWY_BF16_CONSTEXPR float
2686
+ ConvertScalarSpecialFloatToF32(hwy::SpecialTag /*conv_from_tag*/, TFrom in) {
2687
+ return F32FromBF16(in);
2492
2688
  }
2493
- template <typename TTo, typename TFrom, HWY_IF_BF16(TFrom),
2494
- HWY_IF_NOT_SPECIAL_FLOAT(TTo)>
2495
- HWY_API HWY_BF16_CONSTEXPR TTo ConvertScalarTo(TFrom in) {
2496
- return static_cast<TTo>(F32FromBF16(in));
2689
+
2690
+ template <class TFrom, HWY_IF_F16(TFrom)>
2691
+ static HWY_INLINE HWY_MAYBE_UNUSED HWY_F16_CONSTEXPR float
2692
+ ConvertScalarSpecialFloatToF32(hwy::SpecialTag /*conv_from_tag*/, TFrom in) {
2693
+ return F32FromF16(in);
2497
2694
  }
2498
- // Same: return unchanged
2499
- template <typename TTo>
2500
- HWY_API constexpr TTo ConvertScalarTo(TTo in) {
2501
- return in;
2695
+
2696
+ template <class TFrom>
2697
+ static HWY_INLINE HWY_MAYBE_UNUSED constexpr auto
2698
+ ConvertScalarSpecialFloatToF32(hwy::FloatTag /*conv_from_tag*/, TFrom in)
2699
+ -> hwy::If<hwy::IsSame<hwy::RemoveCvRef<TFrom>, double>(), double, float> {
2700
+ return static_cast<
2701
+ hwy::If<hwy::IsSame<hwy::RemoveCvRef<TFrom>, double>(), double, float>>(
2702
+ in);
2703
+ }
2704
+
2705
+ template <class TFrom>
2706
+ static HWY_INLINE HWY_MAYBE_UNUSED constexpr TFrom
2707
+ ConvertScalarSpecialFloatToF32(hwy::SizeTag<0> /*conv_from_tag*/, TFrom in) {
2708
+ return static_cast<TFrom>(in);
2709
+ }
2710
+
2711
+ } // namespace detail
2712
+
2713
+ template <typename TTo, typename TFrom>
2714
+ HWY_API constexpr TTo ConvertScalarTo(TFrom in) {
2715
+ return detail::ConvertScalarToResult<TTo>(
2716
+ hwy::SizeTag<
2717
+ (!hwy::IsSame<hwy::RemoveCvRef<TFrom>, hwy::RemoveCvRef<TTo>>() &&
2718
+ hwy::IsSpecialFloat<TTo>())
2719
+ ? (hwy::IsSame<RemoveCvRef<TTo>, hwy::bfloat16_t>() ? 0x300
2720
+ : 0x200)
2721
+ : 0>(),
2722
+ detail::ConvertScalarSpecialFloatToF32(
2723
+ hwy::SizeTag<
2724
+ (!hwy::IsSame<hwy::RemoveCvRef<TFrom>, hwy::RemoveCvRef<TTo>>() &&
2725
+ (hwy::IsSpecialFloat<TFrom>() || hwy::IsSpecialFloat<TTo>()))
2726
+ ? (hwy::IsSpecialFloat<TFrom>() ? 0x300 : 0x200)
2727
+ : 0>(),
2728
+ static_cast<TFrom&&>(in)));
2502
2729
  }
2503
2730
 
2504
2731
  //------------------------------------------------------------------------------
@@ -2506,10 +2733,13 @@ HWY_API constexpr TTo ConvertScalarTo(TTo in) {
2506
2733
 
2507
2734
  template <typename T1, typename T2>
2508
2735
  constexpr inline T1 DivCeil(T1 a, T2 b) {
2736
+ #if HWY_CXX_LANG >= 201703L
2737
+ HWY_DASSERT(b != 0);
2738
+ #endif
2509
2739
  return (a + b - 1) / b;
2510
2740
  }
2511
2741
 
2512
- // Works for any `align`; if a power of two, compiler emits ADD+AND.
2742
+ // Works for any non-zero `align`; if a power of two, compiler emits ADD+AND.
2513
2743
  constexpr inline size_t RoundUpTo(size_t what, size_t align) {
2514
2744
  return DivCeil(what, align) * align;
2515
2745
  }
@@ -2803,6 +3033,97 @@ class Divisor {
2803
3033
  uint32_t shift2_ = 0;
2804
3034
  };
2805
3035
 
3036
+ #ifndef HWY_HAVE_DIV128 // allow override
3037
+ // Exclude clang-cl because it calls __divti3 from clang_rt.builtins-x86_64,
3038
+ // which is not linked in.
3039
+ #if (HWY_COMPILER_MSVC >= 1920 && HWY_ARCH_X86_64) || \
3040
+ (defined(__SIZEOF_INT128__) && !HWY_COMPILER_CLANGCL)
3041
+ #define HWY_HAVE_DIV128 1
3042
+ #else
3043
+ #define HWY_HAVE_DIV128 0
3044
+ #endif
3045
+ #endif // HWY_HAVE_DIV128
3046
+
3047
+ // Divisor64 can precompute the multiplicative inverse.
3048
+ #if HWY_HAVE_DIV128
3049
+
3050
+ #if HWY_COMPILER_MSVC >= 1920 && HWY_ARCH_X86_64
3051
+ #pragma intrinsic(_udiv128)
3052
+ #pragma intrinsic(__umulh)
3053
+ #endif
3054
+
3055
+ // As above, but for 64-bit divisors: more expensive to compute and initialize.
3056
+ class Divisor64 {
3057
+ public:
3058
+ explicit Divisor64(uint64_t divisor) : divisor_(divisor) {
3059
+ if (divisor <= 1) return;
3060
+
3061
+ const uint64_t len =
3062
+ static_cast<uint64_t>(63 - Num0BitsAboveMS1Bit_Nonzero64(divisor - 1));
3063
+ const uint64_t u_hi = (2ULL << len) - divisor;
3064
+ const uint64_t q = Div128(u_hi, divisor);
3065
+
3066
+ mul_ = q + 1;
3067
+ shift1_ = 1;
3068
+ shift2_ = len;
3069
+ }
3070
+
3071
+ uint64_t GetDivisor() const { return divisor_; }
3072
+
3073
+ // Returns n / divisor_.
3074
+ uint64_t Divide(uint64_t n) const {
3075
+ const uint64_t t = MulHigh(mul_, n);
3076
+ return (t + ((n - t) >> shift1_)) >> shift2_;
3077
+ }
3078
+
3079
+ // Returns n % divisor_.
3080
+ uint64_t Remainder(uint64_t n) const { return n - (Divide(n) * divisor_); }
3081
+
3082
+ private:
3083
+ uint64_t divisor_;
3084
+
3085
+ static uint64_t Div128(uint64_t hi, uint64_t div) {
3086
+ #if HWY_COMPILER_MSVC >= 1920 && HWY_ARCH_X86_64
3087
+ unsigned __int64 remainder; // unused
3088
+ return _udiv128(hi, uint64_t{0}, div, &remainder);
3089
+ #else
3090
+ using u128 = unsigned __int128;
3091
+ const u128 hi128 = static_cast<u128>(hi) << 64;
3092
+ return static_cast<uint64_t>(hi128 / static_cast<u128>(div));
3093
+ #endif
3094
+ }
3095
+
3096
+ static uint64_t MulHigh(uint64_t a, uint64_t b) {
3097
+ #if HWY_COMPILER_MSVC >= 1920 && HWY_ARCH_X86_64
3098
+ return __umulh(a, b);
3099
+ #else
3100
+ using u128 = unsigned __int128;
3101
+ const u128 a128 = static_cast<u128>(a);
3102
+ const u128 b128 = static_cast<u128>(b);
3103
+ return static_cast<uint64_t>((a128 * b128) >> 64);
3104
+ #endif
3105
+ }
3106
+
3107
+ uint64_t mul_ = 1;
3108
+ uint64_t shift1_ = 0;
3109
+ uint64_t shift2_ = 0;
3110
+ };
3111
+ #else
3112
+ // No Div128 available, use built-in 64-bit division on each call.
3113
+ class Divisor64 {
3114
+ public:
3115
+ explicit Divisor64(uint64_t divisor) : divisor_(divisor) {}
3116
+
3117
+ uint64_t GetDivisor() const { return divisor_; }
3118
+
3119
+ uint64_t Divide(uint64_t n) const { return n / divisor_; }
3120
+ uint64_t Remainder(uint64_t n) const { return n % divisor_; }
3121
+
3122
+ private:
3123
+ uint64_t divisor_;
3124
+ };
3125
+ #endif // HWY_HAVE_DIV128
3126
+
2806
3127
  namespace detail {
2807
3128
 
2808
3129
  template <typename T>