@img/sharp-libvips-dev 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169) hide show
  1. package/README.md +1 -2
  2. package/include/aom/aom_decoder.h +1 -1
  3. package/include/aom/aom_encoder.h +7 -1
  4. package/include/aom/aom_image.h +24 -12
  5. package/include/aom/aom_integer.h +3 -3
  6. package/include/aom/aomcx.h +15 -0
  7. package/include/aom/aomdx.h +5 -2
  8. package/include/archive.h +7 -5
  9. package/include/archive_entry.h +5 -3
  10. package/include/cgif.h +3 -0
  11. package/include/expat.h +21 -10
  12. package/include/expat_config.h +11 -5
  13. package/include/ffi.h +12 -25
  14. package/include/freetype2/freetype/config/ftoption.h +2 -2
  15. package/include/fribidi/fribidi-config.h +2 -2
  16. package/include/fribidi/fribidi-unicode-version.h +3 -3
  17. package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -2
  18. package/include/glib-2.0/gio/gappinfo.h +40 -25
  19. package/include/glib-2.0/gio/gapplication.h +6 -0
  20. package/include/glib-2.0/gio/gasyncresult.h +1 -1
  21. package/include/glib-2.0/gio/gconverter.h +5 -0
  22. package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
  23. package/include/glib-2.0/gio/gfile.h +16 -0
  24. package/include/glib-2.0/gio/gio-visibility.h +34 -0
  25. package/include/glib-2.0/gio/giotypes.h +0 -1
  26. package/include/glib-2.0/gio/gsettings.h +8 -0
  27. package/include/glib-2.0/gio/gvfs.h +2 -2
  28. package/include/glib-2.0/girepository/gi-visibility.h +34 -0
  29. package/include/glib-2.0/girepository/giarginfo.h +23 -6
  30. package/include/glib-2.0/girepository/gibaseinfo.h +44 -18
  31. package/include/glib-2.0/girepository/gicallableinfo.h +26 -16
  32. package/include/glib-2.0/girepository/gicallbackinfo.h +17 -2
  33. package/include/glib-2.0/girepository/giconstantinfo.h +19 -4
  34. package/include/glib-2.0/girepository/gienuminfo.h +20 -21
  35. package/include/glib-2.0/girepository/gifieldinfo.h +22 -7
  36. package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
  37. package/include/glib-2.0/girepository/gifunctioninfo.h +22 -7
  38. package/include/glib-2.0/girepository/giinterfaceinfo.h +33 -18
  39. package/include/glib-2.0/girepository/giobjectinfo.h +41 -26
  40. package/include/glib-2.0/girepository/gipropertyinfo.h +18 -3
  41. package/include/glib-2.0/girepository/giregisteredtypeinfo.h +22 -11
  42. package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
  43. package/include/glib-2.0/girepository/girepository.h +53 -62
  44. package/include/glib-2.0/girepository/girffi.h +8 -7
  45. package/include/glib-2.0/girepository/gisignalinfo.h +18 -3
  46. package/include/glib-2.0/girepository/gistructinfo.h +26 -11
  47. package/include/glib-2.0/girepository/gitypeinfo.h +29 -16
  48. package/include/glib-2.0/girepository/gitypelib.h +9 -13
  49. package/include/glib-2.0/girepository/gitypes.h +52 -104
  50. package/include/glib-2.0/girepository/giunioninfo.h +28 -12
  51. package/include/glib-2.0/girepository/giunresolvedinfo.h +17 -2
  52. package/include/glib-2.0/girepository/givalueinfo.h +65 -0
  53. package/include/glib-2.0/girepository/givfuncinfo.h +23 -8
  54. package/include/glib-2.0/glib/deprecated/gthread.h +9 -5
  55. package/include/glib-2.0/glib/gbitlock.h +31 -0
  56. package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
  57. package/include/glib-2.0/glib/giochannel.h +2 -2
  58. package/include/glib-2.0/glib/glib-visibility.h +34 -0
  59. package/include/glib-2.0/glib/gmacros.h +12 -5
  60. package/include/glib-2.0/glib/gmain.h +93 -7
  61. package/include/glib-2.0/glib/gmessages.h +8 -0
  62. package/include/glib-2.0/glib/gqsort.h +8 -1
  63. package/include/glib-2.0/glib/gslice.h +2 -0
  64. package/include/glib-2.0/glib/gstrfuncs.h +24 -30
  65. package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
  66. package/include/glib-2.0/glib/gthread.h +191 -3
  67. package/include/glib-2.0/glib/gunicode.h +1 -1
  68. package/include/glib-2.0/glib/gversionmacros.h +9 -0
  69. package/include/glib-2.0/glib-unix.h +7 -1
  70. package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
  71. package/include/glib-2.0/gobject/genums.h +6 -6
  72. package/include/glib-2.0/gobject/glib-types.h +11 -0
  73. package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
  74. package/include/glib-2.0/gobject/gsignal.h +16 -6
  75. package/include/glib-2.0/gobject/gtype.h +6 -6
  76. package/include/harfbuzz/hb-buffer.h +6 -0
  77. package/include/harfbuzz/hb-common.h +6 -9
  78. package/include/harfbuzz/hb-cplusplus.hh +8 -11
  79. package/include/harfbuzz/hb-subset.h +17 -4
  80. package/include/harfbuzz/hb-version.h +3 -3
  81. package/include/hwy/abort.h +28 -0
  82. package/include/hwy/aligned_allocator.h +218 -6
  83. package/include/hwy/base.h +1935 -512
  84. package/include/hwy/cache_control.h +24 -6
  85. package/include/hwy/detect_compiler_arch.h +105 -10
  86. package/include/hwy/detect_targets.h +146 -37
  87. package/include/hwy/foreach_target.h +36 -1
  88. package/include/hwy/highway.h +222 -50
  89. package/include/hwy/ops/arm_neon-inl.h +2055 -894
  90. package/include/hwy/ops/arm_sve-inl.h +1476 -348
  91. package/include/hwy/ops/emu128-inl.h +711 -623
  92. package/include/hwy/ops/generic_ops-inl.h +4431 -2157
  93. package/include/hwy/ops/inside-inl.h +691 -0
  94. package/include/hwy/ops/ppc_vsx-inl.h +2186 -673
  95. package/include/hwy/ops/rvv-inl.h +1556 -536
  96. package/include/hwy/ops/scalar-inl.h +353 -233
  97. package/include/hwy/ops/set_macros-inl.h +171 -23
  98. package/include/hwy/ops/shared-inl.h +198 -56
  99. package/include/hwy/ops/wasm_128-inl.h +283 -244
  100. package/include/hwy/ops/x86_128-inl.h +3673 -1357
  101. package/include/hwy/ops/x86_256-inl.h +1737 -663
  102. package/include/hwy/ops/x86_512-inl.h +1697 -500
  103. package/include/hwy/per_target.h +4 -0
  104. package/include/hwy/profiler.h +648 -0
  105. package/include/hwy/robust_statistics.h +2 -2
  106. package/include/hwy/targets.h +40 -32
  107. package/include/hwy/timer-inl.h +3 -3
  108. package/include/hwy/timer.h +16 -1
  109. package/include/libheif/heif.h +170 -15
  110. package/include/libheif/heif_items.h +237 -0
  111. package/include/libheif/heif_properties.h +38 -2
  112. package/include/libheif/heif_regions.h +1 -1
  113. package/include/libheif/heif_version.h +2 -2
  114. package/include/libpng16/png.h +32 -29
  115. package/include/libpng16/pngconf.h +2 -2
  116. package/include/libpng16/pnglibconf.h +8 -3
  117. package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
  118. package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
  119. package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
  120. package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
  121. package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
  122. package/include/libxml2/libxml/HTMLparser.h +12 -19
  123. package/include/libxml2/libxml/c14n.h +1 -12
  124. package/include/libxml2/libxml/debugXML.h +1 -1
  125. package/include/libxml2/libxml/encoding.h +9 -0
  126. package/include/libxml2/libxml/entities.h +12 -1
  127. package/include/libxml2/libxml/hash.h +19 -0
  128. package/include/libxml2/libxml/list.h +2 -2
  129. package/include/libxml2/libxml/nanohttp.h +17 -0
  130. package/include/libxml2/libxml/parser.h +73 -58
  131. package/include/libxml2/libxml/parserInternals.h +9 -1
  132. package/include/libxml2/libxml/pattern.h +6 -0
  133. package/include/libxml2/libxml/tree.h +32 -12
  134. package/include/libxml2/libxml/uri.h +11 -0
  135. package/include/libxml2/libxml/valid.h +29 -2
  136. package/include/libxml2/libxml/xinclude.h +7 -0
  137. package/include/libxml2/libxml/xmlIO.h +21 -5
  138. package/include/libxml2/libxml/xmlerror.h +14 -0
  139. package/include/libxml2/libxml/xmlexports.h +111 -15
  140. package/include/libxml2/libxml/xmlmemory.h +8 -45
  141. package/include/libxml2/libxml/xmlreader.h +2 -0
  142. package/include/libxml2/libxml/xmlsave.h +5 -0
  143. package/include/libxml2/libxml/xmlunicode.h +165 -1
  144. package/include/libxml2/libxml/xmlversion.h +15 -179
  145. package/include/libxml2/libxml/xmlwriter.h +1 -0
  146. package/include/libxml2/libxml/xpath.h +4 -0
  147. package/include/pango-1.0/pango/pango-features.h +2 -2
  148. package/include/pango-1.0/pango/pango-fontmap.h +7 -0
  149. package/include/pango-1.0/pango/pango-item.h +4 -2
  150. package/include/pango-1.0/pango/pango-version-macros.h +25 -0
  151. package/include/pango-1.0/pango/pangofc-font.h +2 -1
  152. package/include/pixman-1/pixman-version.h +2 -2
  153. package/include/png.h +32 -29
  154. package/include/pngconf.h +2 -2
  155. package/include/pnglibconf.h +8 -3
  156. package/include/vips/connection.h +9 -3
  157. package/include/vips/util.h +1 -11
  158. package/include/vips/version.h +4 -4
  159. package/include/webp/decode.h +58 -56
  160. package/include/webp/demux.h +25 -21
  161. package/include/webp/encode.h +44 -39
  162. package/include/webp/mux.h +76 -15
  163. package/include/webp/mux_types.h +2 -1
  164. package/include/webp/sharpyuv/sharpyuv.h +77 -8
  165. package/include/webp/types.h +29 -8
  166. package/include/zconf.h +1 -1
  167. package/include/zlib.h +12 -12
  168. package/package.json +1 -1
  169. package/versions.json +18 -19
@@ -16,22 +16,27 @@
16
16
  #ifndef HIGHWAY_HWY_BASE_H_
17
17
  #define HIGHWAY_HWY_BASE_H_
18
18
 
19
- // For SIMD module implementations and their callers, target-independent.
19
+ // Target-independent definitions.
20
20
 
21
21
  // IWYU pragma: begin_exports
22
22
  #include <stddef.h>
23
23
  #include <stdint.h>
24
24
 
25
- // Wrapping this into a HWY_HAS_INCLUDE causes clang-format to fail.
26
- #if __cplusplus >= 202100L && defined(__has_include)
27
- #if __has_include(<stdfloat>)
28
- #include <stdfloat> // std::float16_t
29
- #endif
30
- #endif
31
-
32
25
  #include "hwy/detect_compiler_arch.h"
33
26
  #include "hwy/highway_export.h"
34
27
 
28
+ // API version (https://semver.org/); keep in sync with CMakeLists.txt.
29
+ #define HWY_MAJOR 1
30
+ #define HWY_MINOR 2
31
+ #define HWY_PATCH 0
32
+
33
+ // True if the Highway version >= major.minor.0. Added in 1.2.0.
34
+ #define HWY_VERSION_GE(major, minor) \
35
+ (HWY_MAJOR > (major) || (HWY_MAJOR == (major) && HWY_MINOR >= (minor)))
36
+ // True if the Highway version < major.minor.0. Added in 1.2.0.
37
+ #define HWY_VERSION_LT(major, minor) \
38
+ (HWY_MAJOR < (major) || (HWY_MAJOR == (major) && HWY_MINOR < (minor)))
39
+
35
40
  // "IWYU pragma: keep" does not work for these includes, so hide from the IDE.
36
41
  #if !HWY_IDE
37
42
 
@@ -48,6 +53,26 @@
48
53
 
49
54
  #endif // !HWY_IDE
50
55
 
56
+ #ifndef HWY_HAVE_COMPARE_HEADER // allow override
57
+ #define HWY_HAVE_COMPARE_HEADER 0
58
+ #if defined(__has_include) // note: wrapper macro fails on Clang ~17
59
+ #if __has_include(<compare>)
60
+ #undef HWY_HAVE_COMPARE_HEADER
61
+ #define HWY_HAVE_COMPARE_HEADER 1
62
+ #endif // __has_include
63
+ #endif // defined(__has_include)
64
+ #endif // HWY_HAVE_COMPARE_HEADER
65
+
66
+ #ifndef HWY_HAVE_CXX20_THREE_WAY_COMPARE // allow override
67
+ #if !defined(HWY_NO_LIBCXX) && defined(__cpp_impl_three_way_comparison) && \
68
+ __cpp_impl_three_way_comparison >= 201907L && HWY_HAVE_COMPARE_HEADER
69
+ #include <compare>
70
+ #define HWY_HAVE_CXX20_THREE_WAY_COMPARE 1
71
+ #else
72
+ #define HWY_HAVE_CXX20_THREE_WAY_COMPARE 0
73
+ #endif
74
+ #endif // HWY_HAVE_CXX20_THREE_WAY_COMPARE
75
+
51
76
  // IWYU pragma: end_exports
52
77
 
53
78
  #if HWY_COMPILER_MSVC
@@ -64,6 +89,7 @@
64
89
 
65
90
  #include <intrin.h>
66
91
 
92
+ #define HWY_FUNCTION __FUNCSIG__ // function name + template args
67
93
  #define HWY_RESTRICT __restrict
68
94
  #define HWY_INLINE __forceinline
69
95
  #define HWY_NOINLINE __declspec(noinline)
@@ -84,6 +110,7 @@
84
110
 
85
111
  #else
86
112
 
113
+ #define HWY_FUNCTION __PRETTY_FUNCTION__ // function name + template args
87
114
  #define HWY_RESTRICT __restrict__
88
115
  // force inlining without optimization enabled creates very inefficient code
89
116
  // that can cause compiler timeout
@@ -131,6 +158,11 @@ namespace hwy {
131
158
  #define HWY_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */
132
159
  #endif
133
160
 
161
+ // Returns a pointer whose type is `type` (T*), while allowing the compiler to
162
+ // assume that the untyped pointer `ptr` is aligned to a multiple of sizeof(T).
163
+ #define HWY_RCAST_ALIGNED(type, ptr) \
164
+ reinterpret_cast<type>(HWY_ASSUME_ALIGNED((ptr), alignof(RemovePtr<type>)))
165
+
134
166
  // Clang and GCC require attributes on each function into which SIMD intrinsics
135
167
  // are inlined. Support both per-function annotation (HWY_ATTR) for lambdas and
136
168
  // automatic annotation via pragmas.
@@ -228,24 +260,41 @@ HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
228
260
  } \
229
261
  } while (0)
230
262
 
231
- #if HWY_HAS_FEATURE(memory_sanitizer) || defined(MEMORY_SANITIZER)
263
+ #if HWY_HAS_FEATURE(memory_sanitizer) || defined(MEMORY_SANITIZER) || \
264
+ defined(__SANITIZE_MEMORY__)
232
265
  #define HWY_IS_MSAN 1
233
266
  #else
234
267
  #define HWY_IS_MSAN 0
235
268
  #endif
236
269
 
237
- #if HWY_HAS_FEATURE(address_sanitizer) || defined(ADDRESS_SANITIZER)
270
+ #if HWY_HAS_FEATURE(address_sanitizer) || defined(ADDRESS_SANITIZER) || \
271
+ defined(__SANITIZE_ADDRESS__)
238
272
  #define HWY_IS_ASAN 1
239
273
  #else
240
274
  #define HWY_IS_ASAN 0
241
275
  #endif
242
276
 
243
- #if HWY_HAS_FEATURE(thread_sanitizer) || defined(THREAD_SANITIZER)
277
+ #if HWY_HAS_FEATURE(hwaddress_sanitizer) || defined(HWADDRESS_SANITIZER) || \
278
+ defined(__SANITIZE_HWADDRESS__)
279
+ #define HWY_IS_HWASAN 1
280
+ #else
281
+ #define HWY_IS_HWASAN 0
282
+ #endif
283
+
284
+ #if HWY_HAS_FEATURE(thread_sanitizer) || defined(THREAD_SANITIZER) || \
285
+ defined(__SANITIZE_THREAD__)
244
286
  #define HWY_IS_TSAN 1
245
287
  #else
246
288
  #define HWY_IS_TSAN 0
247
289
  #endif
248
290
 
291
+ #if HWY_HAS_FEATURE(undefined_behavior_sanitizer) || \
292
+ defined(UNDEFINED_BEHAVIOR_SANITIZER)
293
+ #define HWY_IS_UBSAN 1
294
+ #else
295
+ #define HWY_IS_UBSAN 0
296
+ #endif
297
+
249
298
  // MSAN may cause lengthy build times or false positives e.g. in AVX3 DemoteTo.
250
299
  // You can disable MSAN by adding this attribute to the function that fails.
251
300
  #if HWY_IS_MSAN
@@ -259,7 +308,8 @@ HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
259
308
  // Clang does not define NDEBUG, but it and GCC define __OPTIMIZE__, and recent
260
309
  // MSVC defines NDEBUG (if not, could instead check _DEBUG).
261
310
  #if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) || HWY_IS_ASAN || \
262
- HWY_IS_MSAN || HWY_IS_TSAN || defined(__clang_analyzer__)
311
+ HWY_IS_HWASAN || HWY_IS_MSAN || HWY_IS_TSAN || HWY_IS_UBSAN || \
312
+ defined(__clang_analyzer__)
263
313
  #define HWY_IS_DEBUG_BUILD 1
264
314
  #else
265
315
  #define HWY_IS_DEBUG_BUILD 0
@@ -282,14 +332,12 @@ HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
282
332
  #pragma intrinsic(memset)
283
333
  #endif
284
334
 
285
- // The source/destination must not overlap/alias.
286
335
  template <size_t kBytes, typename From, typename To>
287
- HWY_API void CopyBytes(const From* from, To* to) {
336
+ HWY_API void CopyBytes(const From* HWY_RESTRICT from, To* HWY_RESTRICT to) {
288
337
  #if HWY_COMPILER_MSVC
289
338
  memcpy(to, from, kBytes);
290
339
  #else
291
- __builtin_memcpy(static_cast<void*>(to), static_cast<const void*>(from),
292
- kBytes);
340
+ __builtin_memcpy(to, from, kBytes);
293
341
  #endif
294
342
  }
295
343
 
@@ -331,7 +379,7 @@ HWY_API void ZeroBytes(void* to, size_t num_bytes) {
331
379
 
332
380
  #if HWY_ARCH_X86
333
381
  static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 64; // AVX-512
334
- #elif HWY_ARCH_RVV && defined(__riscv_v_intrinsic) && \
382
+ #elif HWY_ARCH_RISCV && defined(__riscv_v_intrinsic) && \
335
383
  __riscv_v_intrinsic >= 11000
336
384
  // Not actually an upper bound on the size.
337
385
  static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 4096;
@@ -347,7 +395,7 @@ static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
347
395
  // exceed the stack size.
348
396
  #if HWY_ARCH_X86
349
397
  #define HWY_ALIGN_MAX alignas(64)
350
- #elif HWY_ARCH_RVV && defined(__riscv_v_intrinsic) && \
398
+ #elif HWY_ARCH_RISCV && defined(__riscv_v_intrinsic) && \
351
399
  __riscv_v_intrinsic >= 11000
352
400
  #define HWY_ALIGN_MAX alignas(8) // only elements need be aligned
353
401
  #else
@@ -357,349 +405,11 @@ static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
357
405
  //------------------------------------------------------------------------------
358
406
  // Lane types
359
407
 
360
- #pragma pack(push, 1)
361
-
362
- // float16_t load/store/conversion intrinsics are always supported on Armv8 and
363
- // VFPv4 (except with MSVC). On Armv7 Clang requires __ARM_FP & 2; GCC requires
364
- // -mfp16-format=ieee.
365
- #if (HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC) || \
366
- (HWY_COMPILER_CLANG && defined(__ARM_FP) && (__ARM_FP & 2)) || \
367
- (HWY_COMPILER_GCC_ACTUAL && defined(__ARM_FP16_FORMAT_IEEE))
368
- #define HWY_NEON_HAVE_FLOAT16C 1
369
- #else
370
- #define HWY_NEON_HAVE_FLOAT16C 0
371
- #endif
372
-
373
- // C11 extension ISO/IEC TS 18661-3:2015 but not supported on all targets.
374
- // Required if HWY_HAVE_FLOAT16, i.e. RVV with zvfh or AVX3_SPR (with
375
- // sufficiently new compiler supporting avx512fp16). Do not use on clang-cl,
376
- // which is missing __extendhfsf2.
377
- #if ((HWY_ARCH_RVV && defined(__riscv_zvfh) && HWY_COMPILER_CLANG) || \
378
- (HWY_ARCH_X86 && defined(__SSE2__) && \
379
- ((HWY_COMPILER_CLANG >= 1600 && !HWY_COMPILER_CLANGCL) || \
380
- HWY_COMPILER_GCC_ACTUAL >= 1200)))
381
- #define HWY_HAVE_C11_FLOAT16 1
382
- #else
383
- #define HWY_HAVE_C11_FLOAT16 0
384
- #endif
385
-
386
- // If 1, both __bf16 and a limited set of *_bf16 SVE intrinsics are available:
387
- // create/get/set/dup, ld/st, sel, rev, trn, uzp, zip.
388
- #if HWY_ARCH_ARM_A64 && defined(__ARM_FEATURE_SVE_BF16)
389
- #define HWY_SVE_HAVE_BFLOAT16 1
390
- #else
391
- #define HWY_SVE_HAVE_BFLOAT16 0
392
- #endif
393
-
394
- // Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
395
- // by concatenating base type and bits. We use a wrapper class instead of a
396
- // typedef to the native type to ensure that the same symbols, e.g. for VQSort,
397
- // are generated regardless of F16 support; see #1684.
398
- struct float16_t {
399
- #if HWY_NEON_HAVE_FLOAT16C // ACLE's __fp16
400
- using Raw = __fp16;
401
- #elif HWY_HAVE_C11_FLOAT16 // C11 _Float16
402
- using Raw = _Float16;
403
- #elif __cplusplus > 202002L && defined(__STDCPP_FLOAT16_T__) // C++23
404
- using Raw = std::float16_t;
405
- #else
406
- #define HWY_EMULATE_FLOAT16
407
- using Raw = uint16_t;
408
- Raw bits;
409
- #endif // float16_t
410
-
411
- // When backed by a native type, ensure the wrapper behaves like the native
412
- // type by forwarding all operators. Unfortunately it seems difficult to reuse
413
- // this code in a base class, so we repeat it in bfloat16_t.
414
- #ifndef HWY_EMULATE_FLOAT16
415
- Raw raw;
416
-
417
- float16_t() noexcept = default;
418
- template <typename T>
419
- constexpr float16_t(T arg) noexcept : raw(static_cast<Raw>(arg)) {}
420
- float16_t& operator=(Raw arg) noexcept {
421
- raw = arg;
422
- return *this;
423
- }
424
- constexpr float16_t(const float16_t&) noexcept = default;
425
- float16_t& operator=(const float16_t&) noexcept = default;
426
- constexpr operator Raw() const noexcept { return raw; }
427
-
428
- template <typename T>
429
- float16_t& operator+=(T rhs) noexcept {
430
- raw = static_cast<Raw>(raw + rhs);
431
- return *this;
432
- }
433
-
434
- template <typename T>
435
- float16_t& operator-=(T rhs) noexcept {
436
- raw = static_cast<Raw>(raw - rhs);
437
- return *this;
438
- }
439
-
440
- template <typename T>
441
- float16_t& operator*=(T rhs) noexcept {
442
- raw = static_cast<Raw>(raw * rhs);
443
- return *this;
444
- }
445
-
446
- template <typename T>
447
- float16_t& operator/=(T rhs) noexcept {
448
- raw = static_cast<Raw>(raw / rhs);
449
- return *this;
450
- }
451
-
452
- float16_t operator--() noexcept {
453
- raw = static_cast<Raw>(raw - Raw{1});
454
- return *this;
455
- }
456
-
457
- float16_t operator--(int) noexcept {
458
- raw = static_cast<Raw>(raw - Raw{1});
459
- return *this;
460
- }
461
-
462
- float16_t operator++() noexcept {
463
- raw = static_cast<Raw>(raw + Raw{1});
464
- return *this;
465
- }
466
-
467
- float16_t operator++(int) noexcept {
468
- raw = static_cast<Raw>(raw + Raw{1});
469
- return *this;
470
- }
471
-
472
- constexpr float16_t operator-() const noexcept {
473
- return float16_t(static_cast<Raw>(-raw));
474
- }
475
- constexpr float16_t operator+() const noexcept { return *this; }
476
- #endif // HWY_EMULATE_FLOAT16
477
- };
478
-
479
- #ifndef HWY_EMULATE_FLOAT16
480
- constexpr inline bool operator==(float16_t lhs, float16_t rhs) noexcept {
481
- return lhs.raw == rhs.raw;
482
- }
483
- constexpr inline bool operator!=(float16_t lhs, float16_t rhs) noexcept {
484
- return lhs.raw != rhs.raw;
485
- }
486
- constexpr inline bool operator<(float16_t lhs, float16_t rhs) noexcept {
487
- return lhs.raw < rhs.raw;
488
- }
489
- constexpr inline bool operator<=(float16_t lhs, float16_t rhs) noexcept {
490
- return lhs.raw <= rhs.raw;
491
- }
492
- constexpr inline bool operator>(float16_t lhs, float16_t rhs) noexcept {
493
- return lhs.raw > rhs.raw;
494
- }
495
- constexpr inline bool operator>=(float16_t lhs, float16_t rhs) noexcept {
496
- return lhs.raw >= rhs.raw;
497
- }
498
- #endif // HWY_EMULATE_FLOAT16
499
-
500
- struct bfloat16_t {
501
- #if HWY_SVE_HAVE_BFLOAT16
502
- using Raw = __bf16;
503
- #elif __cplusplus >= 202100L && defined(__STDCPP_BFLOAT16_T__) // C++23
504
- using Raw = std::bfloat16_t;
505
- #else
506
- #define HWY_EMULATE_BFLOAT16
507
- using Raw = uint16_t;
508
- Raw bits;
509
- #endif
510
-
511
- #ifndef HWY_EMULATE_BFLOAT16
512
- Raw raw;
513
-
514
- bfloat16_t() noexcept = default;
515
- template <typename T>
516
- constexpr bfloat16_t(T arg) noexcept : raw(static_cast<Raw>(arg)) {}
517
- bfloat16_t& operator=(Raw arg) noexcept {
518
- raw = arg;
519
- return *this;
520
- }
521
- constexpr bfloat16_t(const bfloat16_t&) noexcept = default;
522
- bfloat16_t& operator=(const bfloat16_t&) noexcept = default;
523
- constexpr operator Raw() const noexcept { return raw; }
524
-
525
- template <typename T>
526
- bfloat16_t& operator+=(T rhs) noexcept {
527
- raw = static_cast<Raw>(raw + rhs);
528
- return *this;
529
- }
530
-
531
- template <typename T>
532
- bfloat16_t& operator-=(T rhs) noexcept {
533
- raw = static_cast<Raw>(raw - rhs);
534
- return *this;
535
- }
536
-
537
- template <typename T>
538
- bfloat16_t& operator*=(T rhs) noexcept {
539
- raw = static_cast<Raw>(raw * rhs);
540
- return *this;
541
- }
542
-
543
- template <typename T>
544
- bfloat16_t& operator/=(T rhs) noexcept {
545
- raw = static_cast<Raw>(raw / rhs);
546
- return *this;
547
- }
548
-
549
- bfloat16_t operator--() noexcept {
550
- raw = static_cast<Raw>(raw - Raw{1});
551
- return *this;
552
- }
553
-
554
- bfloat16_t operator--(int) noexcept {
555
- raw = static_cast<Raw>(raw - Raw{1});
556
- return *this;
557
- }
558
-
559
- bfloat16_t operator++() noexcept {
560
- raw = static_cast<Raw>(raw + Raw{1});
561
- return *this;
562
- }
563
-
564
- bfloat16_t operator++(int) noexcept {
565
- raw = static_cast<Raw>(raw + Raw{1});
566
- return *this;
567
- }
568
-
569
- constexpr bfloat16_t operator-() const noexcept {
570
- return bfloat16_t(static_cast<Raw>(-raw));
571
- }
572
- constexpr bfloat16_t operator+() const noexcept { return *this; }
573
- #endif // HWY_EMULATE_BFLOAT16
574
- };
575
-
576
- #ifndef HWY_EMULATE_BFLOAT16
577
- constexpr inline bool operator==(bfloat16_t lhs, bfloat16_t rhs) noexcept {
578
- return lhs.raw == rhs.raw;
579
- }
580
- constexpr inline bool operator!=(bfloat16_t lhs, bfloat16_t rhs) noexcept {
581
- return lhs.raw != rhs.raw;
582
- }
583
- constexpr inline bool operator<(bfloat16_t lhs, bfloat16_t rhs) noexcept {
584
- return lhs.raw < rhs.raw;
585
- }
586
- constexpr inline bool operator<=(bfloat16_t lhs, bfloat16_t rhs) noexcept {
587
- return lhs.raw <= rhs.raw;
588
- }
589
- constexpr inline bool operator>(bfloat16_t lhs, bfloat16_t rhs) noexcept {
590
- return lhs.raw > rhs.raw;
591
- }
592
- constexpr inline bool operator>=(bfloat16_t lhs, bfloat16_t rhs) noexcept {
593
- return lhs.raw >= rhs.raw;
594
- }
595
- #endif // HWY_EMULATE_BFLOAT16
596
-
597
- #pragma pack(pop)
598
-
599
- HWY_API float F32FromF16(float16_t f16) {
600
- #ifdef HWY_EMULATE_FLOAT16
601
- uint16_t bits16;
602
- CopySameSize(&f16, &bits16);
603
- const uint32_t sign = static_cast<uint32_t>(bits16 >> 15);
604
- const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
605
- const uint32_t mantissa = bits16 & 0x3FF;
606
-
607
- // Subnormal or zero
608
- if (biased_exp == 0) {
609
- const float subnormal =
610
- (1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024));
611
- return sign ? -subnormal : subnormal;
612
- }
613
-
614
- // Normalized: convert the representation directly (faster than ldexp/tables).
615
- const uint32_t biased_exp32 = biased_exp + (127 - 15);
616
- const uint32_t mantissa32 = mantissa << (23 - 10);
617
- const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
618
-
619
- float result;
620
- CopySameSize(&bits32, &result);
621
- return result;
622
- #else
623
- return static_cast<float>(f16);
624
- #endif
625
- }
626
-
627
- HWY_API float16_t F16FromF32(float f32) {
628
- #ifdef HWY_EMULATE_FLOAT16
629
- uint32_t bits32;
630
- CopySameSize(&f32, &bits32);
631
- const uint32_t sign = bits32 >> 31;
632
- const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
633
- const uint32_t mantissa32 = bits32 & 0x7FFFFF;
634
-
635
- const int32_t exp = HWY_MIN(static_cast<int32_t>(biased_exp32) - 127, 15);
636
-
637
- // Tiny or zero => zero.
638
- float16_t out;
639
- if (exp < -24) {
640
- // restore original sign
641
- const uint16_t bits = static_cast<uint16_t>(sign << 15);
642
- CopySameSize(&bits, &out);
643
- return out;
644
- }
645
-
646
- uint32_t biased_exp16, mantissa16;
647
-
648
- // exp = [-24, -15] => subnormal
649
- if (exp < -14) {
650
- biased_exp16 = 0;
651
- const uint32_t sub_exp = static_cast<uint32_t>(-14 - exp);
652
- HWY_DASSERT(1 <= sub_exp && sub_exp < 11);
653
- mantissa16 = static_cast<uint32_t>((1u << (10 - sub_exp)) +
654
- (mantissa32 >> (13 + sub_exp)));
655
- } else {
656
- // exp = [-14, 15]
657
- biased_exp16 = static_cast<uint32_t>(exp + 15);
658
- HWY_DASSERT(1 <= biased_exp16 && biased_exp16 < 31);
659
- mantissa16 = mantissa32 >> 13;
660
- }
661
-
662
- HWY_DASSERT(mantissa16 < 1024);
663
- const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
664
- HWY_DASSERT(bits16 < 0x10000);
665
- const uint16_t narrowed = static_cast<uint16_t>(bits16); // big-endian safe
666
- CopySameSize(&narrowed, &out);
667
- return out;
668
- #else
669
- return float16_t(static_cast<float16_t::Raw>(f32));
670
- #endif
671
- }
672
-
673
- HWY_API float F32FromBF16(bfloat16_t bf) {
674
- uint16_t bits16;
675
- CopyBytes<2>(&bf, &bits16);
676
- uint32_t bits = bits16;
677
- bits <<= 16;
678
- float f;
679
- CopySameSize(&bits, &f);
680
- return f;
681
- }
682
-
683
- HWY_API float F32FromF16Mem(const void* ptr) {
684
- float16_t f16;
685
- CopyBytes<2>(ptr, &f16);
686
- return F32FromF16(f16);
687
- }
688
-
689
- HWY_API float F32FromBF16Mem(const void* ptr) {
690
- bfloat16_t bf;
691
- CopyBytes<2>(ptr, &bf);
692
- return F32FromBF16(bf);
693
- }
694
-
695
- HWY_API bfloat16_t BF16FromF32(float f) {
696
- uint32_t bits;
697
- CopySameSize(&f, &bits);
698
- const uint16_t bits16 = static_cast<uint16_t>(bits >> 16);
699
- bfloat16_t bf;
700
- CopySameSize(&bits16, &bf);
701
- return bf;
702
- }
408
+ // hwy::float16_t and hwy::bfloat16_t are forward declared here to allow
409
+ // BitCastScalar to be implemented before the implementations of the
410
+ // hwy::float16_t and hwy::bfloat16_t types
411
+ struct float16_t;
412
+ struct bfloat16_t;
703
413
 
704
414
  using float32_t = float;
705
415
  using float64_t = double;
@@ -729,24 +439,6 @@ struct alignas(8) K32V32 {
729
439
 
730
440
  #pragma pack(pop)
731
441
 
732
- #ifdef HWY_EMULATE_FLOAT16
733
-
734
- static inline HWY_MAYBE_UNUSED bool operator<(const float16_t& a,
735
- const float16_t& b) {
736
- return F32FromF16(a) < F32FromF16(b);
737
- }
738
- // Required for std::greater.
739
- static inline HWY_MAYBE_UNUSED bool operator>(const float16_t& a,
740
- const float16_t& b) {
741
- return F32FromF16(a) > F32FromF16(b);
742
- }
743
- static inline HWY_MAYBE_UNUSED bool operator==(const float16_t& a,
744
- const float16_t& b) {
745
- return F32FromF16(a) == F32FromF16(b);
746
- }
747
-
748
- #endif // HWY_EMULATE_FLOAT16
749
-
750
442
  static inline HWY_MAYBE_UNUSED bool operator<(const uint128_t& a,
751
443
  const uint128_t& b) {
752
444
  return (a.hi == b.hi) ? a.lo < b.lo : a.hi < b.hi;
@@ -817,6 +509,12 @@ HWY_API constexpr bool IsSame() {
817
509
  return IsSameT<T, U>::value;
818
510
  }
819
511
 
512
+ // Returns whether T matches either of U1 or U2
513
+ template <typename T, typename U1, typename U2>
514
+ HWY_API constexpr bool IsSameEither() {
515
+ return IsSameT<T, U1>::value || IsSameT<T, U2>::value;
516
+ }
517
+
820
518
  template <bool Condition, typename Then, typename Else>
821
519
  struct IfT {
822
520
  using type = Then;
@@ -830,6 +528,88 @@ struct IfT<false, Then, Else> {
830
528
  template <bool Condition, typename Then, typename Else>
831
529
  using If = typename IfT<Condition, Then, Else>::type;
832
530
 
531
+ template <typename T>
532
+ struct IsConstT {
533
+ enum { value = 0 };
534
+ };
535
+
536
+ template <typename T>
537
+ struct IsConstT<const T> {
538
+ enum { value = 1 };
539
+ };
540
+
541
+ template <typename T>
542
+ HWY_API constexpr bool IsConst() {
543
+ return IsConstT<T>::value;
544
+ }
545
+
546
+ template <class T>
547
+ struct RemoveConstT {
548
+ using type = T;
549
+ };
550
+ template <class T>
551
+ struct RemoveConstT<const T> {
552
+ using type = T;
553
+ };
554
+
555
+ template <class T>
556
+ using RemoveConst = typename RemoveConstT<T>::type;
557
+
558
+ template <class T>
559
+ struct RemoveVolatileT {
560
+ using type = T;
561
+ };
562
+ template <class T>
563
+ struct RemoveVolatileT<volatile T> {
564
+ using type = T;
565
+ };
566
+
567
+ template <class T>
568
+ using RemoveVolatile = typename RemoveVolatileT<T>::type;
569
+
570
+ template <class T>
571
+ struct RemoveRefT {
572
+ using type = T;
573
+ };
574
+ template <class T>
575
+ struct RemoveRefT<T&> {
576
+ using type = T;
577
+ };
578
+ template <class T>
579
+ struct RemoveRefT<T&&> {
580
+ using type = T;
581
+ };
582
+
583
+ template <class T>
584
+ using RemoveRef = typename RemoveRefT<T>::type;
585
+
586
+ template <class T>
587
+ using RemoveCvRef = RemoveConst<RemoveVolatile<RemoveRef<T>>>;
588
+
589
+ template <class T>
590
+ struct RemovePtrT {
591
+ using type = T;
592
+ };
593
+ template <class T>
594
+ struct RemovePtrT<T*> {
595
+ using type = T;
596
+ };
597
+ template <class T>
598
+ struct RemovePtrT<const T*> {
599
+ using type = T;
600
+ };
601
+ template <class T>
602
+ struct RemovePtrT<volatile T*> {
603
+ using type = T;
604
+ };
605
+ template <class T>
606
+ struct RemovePtrT<const volatile T*> {
607
+ using type = T;
608
+ };
609
+
610
+ template <class T>
611
+ using RemovePtr = typename RemovePtrT<T>::type;
612
+
833
613
  // Insert into template/function arguments to enable this overload only for
834
614
  // vectors of exactly, at most (LE), or more than (GT) this many bytes.
835
615
  //
@@ -846,10 +626,11 @@ using If = typename IfT<Condition, Then, Else>::type;
846
626
  #define HWY_IF_LANES_LE(kN, lanes) hwy::EnableIf<(kN <= lanes)>* = nullptr
847
627
  #define HWY_IF_LANES_GT(kN, lanes) hwy::EnableIf<(kN > lanes)>* = nullptr
848
628
 
849
- #define HWY_IF_UNSIGNED(T) hwy::EnableIf<!IsSigned<T>()>* = nullptr
850
- #define HWY_IF_SIGNED(T) \
851
- hwy::EnableIf<IsSigned<T>() && !IsFloat<T>() && !IsSpecialFloat<T>()>* = \
852
- nullptr
629
// SFINAE helpers; insert into a template parameter list.
// Enabled only if hwy::IsSigned<T>() is false.
#define HWY_IF_UNSIGNED(T) hwy::EnableIf<!hwy::IsSigned<T>()>* = nullptr
// Enabled only if hwy::IsSigned<T>() is true. Unlike HWY_IF_SIGNED, this does
// not additionally exclude floating-point types.
#define HWY_IF_NOT_UNSIGNED(T) hwy::EnableIf<hwy::IsSigned<T>()>* = nullptr
// Enabled only if hwy::IsSigned<T>() is true and T is neither IsFloat nor
// IsSpecialFloat.
#define HWY_IF_SIGNED(T)                                    \
  hwy::EnableIf<hwy::IsSigned<T>() && !hwy::IsFloat<T>() && \
                !hwy::IsSpecialFloat<T>()>* = nullptr
853
634
  #define HWY_IF_FLOAT(T) hwy::EnableIf<hwy::IsFloat<T>()>* = nullptr
854
635
  #define HWY_IF_NOT_FLOAT(T) hwy::EnableIf<!hwy::IsFloat<T>()>* = nullptr
855
636
  #define HWY_IF_FLOAT3264(T) hwy::EnableIf<hwy::IsFloat3264<T>()>* = nullptr
@@ -862,6 +643,7 @@ using If = typename IfT<Condition, Then, Else>::type;
862
643
  hwy::EnableIf<hwy::IsFloat<T>() || hwy::IsSpecialFloat<T>()>* = nullptr
863
644
  #define HWY_IF_NOT_FLOAT_NOR_SPECIAL(T) \
864
645
  hwy::EnableIf<!hwy::IsFloat<T>() && !hwy::IsSpecialFloat<T>()>* = nullptr
646
// Enabled only if hwy::IsInteger<T>() is true.
#define HWY_IF_INTEGER(T) hwy::EnableIf<hwy::IsInteger<T>()>* = nullptr
865
647
 
866
648
  #define HWY_IF_T_SIZE(T, bytes) hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
867
649
  #define HWY_IF_NOT_T_SIZE(T, bytes) \
@@ -871,52 +653,1303 @@ using If = typename IfT<Condition, Then, Else>::type;
871
653
  // bits explicitly (0x14) instead of attempting to 'negate' 0x102.
872
654
  #define HWY_IF_T_SIZE_ONE_OF(T, bit_array) \
873
655
  hwy::EnableIf<((size_t{1} << sizeof(T)) & (bit_array)) != 0>* = nullptr
656
// Enabled only if sizeof(T) is at most `bytes`.
#define HWY_IF_T_SIZE_LE(T, bytes) \
  hwy::EnableIf<(sizeof(T) <= (bytes))>* = nullptr
// Enabled only if sizeof(T) is greater than `bytes`.
#define HWY_IF_T_SIZE_GT(T, bytes) \
  hwy::EnableIf<(sizeof(T) > (bytes))>* = nullptr
660
+
661
// Enabled only if T, after removing reference and const/volatile qualifiers,
// is (HWY_IF_SAME) or is not (HWY_IF_NOT_SAME) exactly `expected`.
#define HWY_IF_SAME(T, expected) \
  hwy::EnableIf<hwy::IsSame<hwy::RemoveCvRef<T>, expected>()>* = nullptr
#define HWY_IF_NOT_SAME(T, expected) \
  hwy::EnableIf<!hwy::IsSame<hwy::RemoveCvRef<T>, expected>()>* = nullptr

// One of two expected types
#define HWY_IF_SAME2(T, expected1, expected2)                            \
  hwy::EnableIf<                                                         \
      hwy::IsSameEither<hwy::RemoveCvRef<T>, expected1, expected2>()>* = \
      nullptr

// Exact-type shorthands for unsigned integers.
#define HWY_IF_U8(T) HWY_IF_SAME(T, uint8_t)
#define HWY_IF_U16(T) HWY_IF_SAME(T, uint16_t)
#define HWY_IF_U32(T) HWY_IF_SAME(T, uint32_t)
#define HWY_IF_U64(T) HWY_IF_SAME(T, uint64_t)

// Exact-type shorthands for signed integers.
#define HWY_IF_I8(T) HWY_IF_SAME(T, int8_t)
#define HWY_IF_I16(T) HWY_IF_SAME(T, int16_t)
#define HWY_IF_I32(T) HWY_IF_SAME(T, int32_t)
#define HWY_IF_I64(T) HWY_IF_SAME(T, int64_t)

// Exact-type shorthands (and their negations) for the 16-bit float wrappers.
#define HWY_IF_BF16(T) HWY_IF_SAME(T, hwy::bfloat16_t)
#define HWY_IF_NOT_BF16(T) HWY_IF_NOT_SAME(T, hwy::bfloat16_t)

#define HWY_IF_F16(T) HWY_IF_SAME(T, hwy::float16_t)
#define HWY_IF_NOT_F16(T) HWY_IF_NOT_SAME(T, hwy::float16_t)

// Exact-type shorthands for float and double.
#define HWY_IF_F32(T) HWY_IF_SAME(T, float)
#define HWY_IF_F64(T) HWY_IF_SAME(T, double)

// Use instead of HWY_IF_T_SIZE to avoid ambiguity with float16_t/float/double
// overloads.
#define HWY_IF_UI8(T) HWY_IF_SAME2(T, uint8_t, int8_t)
#define HWY_IF_UI16(T) HWY_IF_SAME2(T, uint16_t, int16_t)
#define HWY_IF_UI32(T) HWY_IF_SAME2(T, uint32_t, int32_t)
#define HWY_IF_UI64(T) HWY_IF_SAME2(T, uint64_t, int64_t)

// Enabled only if min(sizeof(T) * N, 16) / sizeof(T) equals LANES, i.e. the
// number of T lanes that fit in at most 16 bytes of an N-lane vector.
#define HWY_IF_LANES_PER_BLOCK(T, N, LANES) \
  hwy::EnableIf<HWY_MIN(sizeof(T) * N, 16) / sizeof(T) == (LANES)>* = nullptr
700
+
701
// Tag type carrying a size as a template argument; it has no members.
template <size_t kSize>
struct SizeTag {};
704
+
705
+ template <class T>
706
+ class DeclValT {
707
+ private:
708
+ template <class U, class URef = U&&>
709
+ static URef TryAddRValRef(int);
710
+ template <class U, class Arg>
711
+ static U TryAddRValRef(Arg);
712
+
713
+ public:
714
+ using type = decltype(TryAddRValRef<T>(0));
715
+ enum { kDisableDeclValEvaluation = 1 };
716
+ };
717
+
718
+ // hwy::DeclVal<T>() can only be used in unevaluated contexts such as within an
719
+ // expression of a decltype specifier.
720
+
721
+ // hwy::DeclVal<T>() does not require that T have a public default constructor
722
+ template <class T>
723
+ HWY_API typename DeclValT<T>::type DeclVal() noexcept {
724
+ static_assert(!DeclValT<T>::kDisableDeclValEvaluation,
725
+ "DeclVal() cannot be used in an evaluated context");
726
+ }
727
+
728
// `value` is 1 for array types of unknown bound (`E[]`) or known bound
// (`E[kLen]`), and 0 for every other type.
template <typename T>
struct IsArrayT {
  enum { value = 0 };
};

template <typename E>
struct IsArrayT<E[]> {
  enum { value = 1 };
};

template <typename E, size_t kLen>
struct IsArrayT<E[kLen]> {
  enum { value = 1 };
};

// Function form of IsArrayT<T>::value.
template <typename T>
static constexpr bool IsArray() {
  return IsArrayT<T>::value != 0;
}
747
+
748
+ #if HWY_COMPILER_MSVC
749
+ HWY_DIAGNOSTICS(push)
750
+ HWY_DIAGNOSTICS_OFF(disable : 4180, ignored "-Wignored-qualifiers")
751
+ #endif
752
+
753
+ template <class From, class To>
754
+ class IsConvertibleT {
755
+ private:
756
+ template <class T>
757
+ static hwy::SizeTag<1> TestFuncWithToArg(T);
758
+
759
+ template <class T, class U>
760
+ static decltype(IsConvertibleT<T, U>::template TestFuncWithToArg<U>(
761
+ DeclVal<T>()))
762
+ TryConvTest(int);
763
+
764
+ template <class T, class U, class Arg>
765
+ static hwy::SizeTag<0> TryConvTest(Arg);
766
+
767
+ public:
768
+ enum {
769
+ value = (IsSame<RemoveConst<RemoveVolatile<From>>, void>() &&
770
+ IsSame<RemoveConst<RemoveVolatile<To>>, void>()) ||
771
+ (!IsArray<To>() &&
772
+ (IsSame<To, decltype(DeclVal<To>())>() ||
773
+ !IsSame<const RemoveConst<To>, RemoveConst<To>>()) &&
774
+ IsSame<decltype(TryConvTest<From, To>(0)), hwy::SizeTag<1>>())
775
+ };
776
+ };
777
+
778
+ #if HWY_COMPILER_MSVC
779
+ HWY_DIAGNOSTICS(pop)
780
+ #endif
781
+
782
+ template <class From, class To>
783
+ HWY_API constexpr bool IsConvertible() {
784
+ return IsConvertibleT<From, To>::value;
785
+ }
786
+
787
+ template <class From, class To>
788
+ class IsStaticCastableT {
789
+ private:
790
+ template <class T, class U, class = decltype(static_cast<U>(DeclVal<T>()))>
791
+ static hwy::SizeTag<1> TryStaticCastTest(int);
792
+
793
+ template <class T, class U, class Arg>
794
+ static hwy::SizeTag<0> TryStaticCastTest(Arg);
795
+
796
+ public:
797
+ enum {
798
+ value = IsSame<decltype(TryStaticCastTest<From, To>(0)), hwy::SizeTag<1>>()
799
+ };
800
+ };
801
+
802
+ template <class From, class To>
803
+ static constexpr bool IsStaticCastable() {
804
+ return IsStaticCastableT<From, To>::value;
805
+ }
806
+
807
// Enabled only if a value of type `From` can be static_cast to `To`.
// IsStaticCastable is fully qualified, like the names in the other HWY_IF_*
// macros, so that the expansion does not depend on being inside namespace hwy.
#define HWY_IF_CASTABLE(From, To) \
  hwy::EnableIf<hwy::IsStaticCastable<From, To>()>* = nullptr

// Enabled only if the result of `Native op T` can be static_cast back to
// `Native`. `op` is a binary operator token such as `+`.
#define HWY_IF_OP_CASTABLE(op, T, Native)                                  \
  HWY_IF_CASTABLE(decltype(hwy::DeclVal<Native>() op hwy::DeclVal<T>()), \
                  Native)
812
+
813
+ template <class T, class From>
814
+ class IsAssignableT {
815
+ private:
816
+ template <class T1, class T2, class = decltype(DeclVal<T1>() = DeclVal<T2>())>
817
+ static hwy::SizeTag<1> TryAssignTest(int);
818
+
819
+ template <class T1, class T2, class Arg>
820
+ static hwy::SizeTag<0> TryAssignTest(Arg);
821
+
822
+ public:
823
+ enum {
824
+ value = IsSame<decltype(TryAssignTest<T, From>(0)), hwy::SizeTag<1>>()
825
+ };
826
+ };
827
+
828
+ template <class T, class From>
829
+ static constexpr bool IsAssignable() {
830
+ return IsAssignableT<T, From>::value;
831
+ }
832
+
833
// Enabled only if a value of type `From` can be assigned to an expression of
// type `T`. IsAssignable is fully qualified, like the names in the other
// HWY_IF_* macros, so that the expansion does not depend on being inside
// namespace hwy.
#define HWY_IF_ASSIGNABLE(T, From) \
  hwy::EnableIf<hwy::IsAssignable<T, From>()>* = nullptr
835
+
836
+ // ----------------------------------------------------------------------------
837
+ // IsSpecialFloat
838
+
839
+ // These types are often special-cased and not supported in all ops.
840
+ template <typename T>
841
+ HWY_API constexpr bool IsSpecialFloat() {
842
+ return IsSameEither<RemoveCvRef<T>, hwy::float16_t, hwy::bfloat16_t>();
843
+ }
844
+
845
+ // -----------------------------------------------------------------------------
846
+ // IsIntegerLaneType and IsInteger
847
+
848
+ template <class T>
849
+ HWY_API constexpr bool IsIntegerLaneType() {
850
+ return false;
851
+ }
852
+ template <>
853
+ HWY_INLINE constexpr bool IsIntegerLaneType<int8_t>() {
854
+ return true;
855
+ }
856
+ template <>
857
+ HWY_INLINE constexpr bool IsIntegerLaneType<uint8_t>() {
858
+ return true;
859
+ }
860
+ template <>
861
+ HWY_INLINE constexpr bool IsIntegerLaneType<int16_t>() {
862
+ return true;
863
+ }
864
+ template <>
865
+ HWY_INLINE constexpr bool IsIntegerLaneType<uint16_t>() {
866
+ return true;
867
+ }
868
+ template <>
869
+ HWY_INLINE constexpr bool IsIntegerLaneType<int32_t>() {
870
+ return true;
871
+ }
872
+ template <>
873
+ HWY_INLINE constexpr bool IsIntegerLaneType<uint32_t>() {
874
+ return true;
875
+ }
876
+ template <>
877
+ HWY_INLINE constexpr bool IsIntegerLaneType<int64_t>() {
878
+ return true;
879
+ }
880
+ template <>
881
+ HWY_INLINE constexpr bool IsIntegerLaneType<uint64_t>() {
882
+ return true;
883
+ }
884
+
885
+ template <class T>
886
+ HWY_API constexpr bool IsInteger() {
887
+ // NOTE: Do not add a IsInteger<wchar_t>() specialization below as it is
888
+ // possible for IsSame<wchar_t, uint16_t>() to be true when compiled with MSVC
889
+ // with the /Zc:wchar_t- option.
890
+ return IsIntegerLaneType<T>() || IsSame<RemoveCvRef<T>, wchar_t>() ||
891
+ IsSameEither<RemoveCvRef<T>, size_t, ptrdiff_t>() ||
892
+ IsSameEither<RemoveCvRef<T>, intptr_t, uintptr_t>();
893
+ }
894
+ template <>
895
+ HWY_INLINE constexpr bool IsInteger<bool>() {
896
+ return true;
897
+ }
898
+ template <>
899
+ HWY_INLINE constexpr bool IsInteger<char>() {
900
+ return true;
901
+ }
902
+ template <>
903
+ HWY_INLINE constexpr bool IsInteger<signed char>() {
904
+ return true;
905
+ }
906
+ template <>
907
+ HWY_INLINE constexpr bool IsInteger<unsigned char>() {
908
+ return true;
909
+ }
910
+ template <>
911
+ HWY_INLINE constexpr bool IsInteger<short>() { // NOLINT
912
+ return true;
913
+ }
914
+ template <>
915
+ HWY_INLINE constexpr bool IsInteger<unsigned short>() { // NOLINT
916
+ return true;
917
+ }
918
+ template <>
919
+ HWY_INLINE constexpr bool IsInteger<int>() {
920
+ return true;
921
+ }
922
+ template <>
923
+ HWY_INLINE constexpr bool IsInteger<unsigned>() {
924
+ return true;
925
+ }
926
+ template <>
927
+ HWY_INLINE constexpr bool IsInteger<long>() { // NOLINT
928
+ return true;
929
+ }
930
+ template <>
931
+ HWY_INLINE constexpr bool IsInteger<unsigned long>() { // NOLINT
932
+ return true;
933
+ }
934
+ template <>
935
+ HWY_INLINE constexpr bool IsInteger<long long>() { // NOLINT
936
+ return true;
937
+ }
938
+ template <>
939
+ HWY_INLINE constexpr bool IsInteger<unsigned long long>() { // NOLINT
940
+ return true;
941
+ }
942
+ #if defined(__cpp_char8_t) && __cpp_char8_t >= 201811L
943
+ template <>
944
+ HWY_INLINE constexpr bool IsInteger<char8_t>() {
945
+ return true;
946
+ }
947
+ #endif
948
+ template <>
949
+ HWY_INLINE constexpr bool IsInteger<char16_t>() {
950
+ return true;
951
+ }
952
+ template <>
953
+ HWY_INLINE constexpr bool IsInteger<char32_t>() {
954
+ return true;
955
+ }
956
+
957
+ // -----------------------------------------------------------------------------
958
+ // BitCastScalar
959
+
960
// HWY_BITCASTSCALAR_CONSTEXPR expands to `constexpr` when the
// __builtin_bit_cast builtin is available (or MSVC >= 1926), and to nothing
// otherwise.
#if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
#define HWY_BITCASTSCALAR_CONSTEXPR constexpr
#else
#define HWY_BITCASTSCALAR_CONSTEXPR
#endif

// Same as HWY_BITCASTSCALAR_CONSTEXPR, but only when the compiler reports
// relaxed (C++14) constexpr support via __cpp_constexpr; otherwise empty.
#if __cpp_constexpr >= 201304L
#define HWY_BITCASTSCALAR_CXX14_CONSTEXPR HWY_BITCASTSCALAR_CONSTEXPR
#else
#define HWY_BITCASTSCALAR_CXX14_CONSTEXPR
#endif
971
+
972
+ #if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
973
+ namespace detail {
974
+
975
+ template <class From>
976
+ struct BitCastScalarSrcCastHelper {
977
+ static HWY_INLINE constexpr const From& CastSrcValRef(const From& val) {
978
+ return val;
979
+ }
980
+ };
981
+
982
+ #if HWY_COMPILER_CLANG >= 900 && HWY_COMPILER_CLANG < 1000
983
+ // Workaround for Clang 9 constexpr __builtin_bit_cast bug
984
+ template <class To, class From,
985
+ hwy::EnableIf<hwy::IsInteger<RemoveCvRef<To>>() &&
986
+ hwy::IsInteger<RemoveCvRef<From>>()>* = nullptr>
987
+ static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR To
988
+ BuiltinBitCastScalar(const From& val) {
989
+ static_assert(sizeof(To) == sizeof(From),
990
+ "sizeof(To) == sizeof(From) must be true");
991
+ return static_cast<To>(val);
992
+ }
993
+
994
+ template <class To, class From,
995
+ hwy::EnableIf<!(hwy::IsInteger<RemoveCvRef<To>>() &&
996
+ hwy::IsInteger<RemoveCvRef<From>>())>* = nullptr>
997
+ static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR To
998
+ BuiltinBitCastScalar(const From& val) {
999
+ return __builtin_bit_cast(To, val);
1000
+ }
1001
+ #endif // HWY_COMPILER_CLANG >= 900 && HWY_COMPILER_CLANG < 1000
1002
+
1003
+ } // namespace detail
1004
+
1005
+ template <class To, class From, HWY_IF_NOT_SPECIAL_FLOAT(To)>
1006
+ HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From& val) {
1007
+ // If From is hwy::float16_t or hwy::bfloat16_t, first cast val to either
1008
+ // const typename From::Native& or const uint16_t& using
1009
+ // detail::BitCastScalarSrcCastHelper<RemoveCvRef<From>>::CastSrcValRef to
1010
+ // allow BitCastScalar from hwy::float16_t or hwy::bfloat16_t to be constexpr
1011
+ // if To is not a pointer type, union type, or a struct/class containing a
1012
+ // pointer, union, or reference subobject
1013
+ #if HWY_COMPILER_CLANG >= 900 && HWY_COMPILER_CLANG < 1000
1014
+ return detail::BuiltinBitCastScalar<To>(
1015
+ detail::BitCastScalarSrcCastHelper<RemoveCvRef<From>>::CastSrcValRef(
1016
+ val));
1017
+ #else
1018
+ return __builtin_bit_cast(
1019
+ To, detail::BitCastScalarSrcCastHelper<RemoveCvRef<From>>::CastSrcValRef(
1020
+ val));
1021
+ #endif
1022
+ }
1023
+ template <class To, class From, HWY_IF_SPECIAL_FLOAT(To)>
1024
+ HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From& val) {
1025
+ // If To is hwy::float16_t or hwy::bfloat16_t, first do a BitCastScalar of val
1026
+ // to uint16_t, and then bit cast the uint16_t value to To using To::FromBits
1027
+ // as hwy::float16_t::FromBits and hwy::bfloat16_t::FromBits are guaranteed to
1028
+ // be constexpr if the __builtin_bit_cast intrinsic is available.
1029
+ return To::FromBits(BitCastScalar<uint16_t>(val));
1030
+ }
1031
+ #else
1032
+ template <class To, class From>
1033
+ HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From& val) {
1034
+ To result;
1035
+ CopySameSize(&val, &result);
1036
+ return result;
1037
+ }
1038
+ #endif
1039
+
1040
+ //------------------------------------------------------------------------------
1041
+ // F16 lane type
1042
+
1043
// NOTE(review): the matching `#pragma pack(pop)` is not visible in this part
// of the file - confirm it follows the lane type definitions.
#pragma pack(push, 1)

// Compiler supports __fp16 and load/store/conversion NEON intrinsics, which are
// included in Armv8 and VFPv4 (except with MSVC). On Armv7 Clang requires
// __ARM_FP & 2 whereas Armv7 GCC requires -mfp16-format=ieee.
#if (HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC) ||                    \
    (HWY_COMPILER_CLANG && defined(__ARM_FP) && (__ARM_FP & 2)) || \
    (HWY_COMPILER_GCC_ACTUAL && defined(__ARM_FP16_FORMAT_IEEE))
#define HWY_NEON_HAVE_F16C 1
#else
#define HWY_NEON_HAVE_F16C 0
#endif

// RVV with f16 extension supports _Float16 and f16 vector ops. If set, implies
// HWY_HAVE_FLOAT16.
#if HWY_ARCH_RISCV && defined(__riscv_zvfh) && HWY_COMPILER_CLANG >= 1600
#define HWY_RVV_HAVE_F16_VEC 1
#else
#define HWY_RVV_HAVE_F16_VEC 0
#endif

// x86 compiler supports _Float16, not necessarily with operators.
// Avoid clang-cl because it lacks __extendhfsf2.
#if HWY_ARCH_X86 && defined(__SSE2__) && defined(__FLT16_MAX__) && \
    ((HWY_COMPILER_CLANG >= 1500 && !HWY_COMPILER_CLANGCL) ||      \
     HWY_COMPILER_GCC_ACTUAL >= 1200)
#define HWY_SSE2_HAVE_F16_TYPE 1
#else
#define HWY_SSE2_HAVE_F16_TYPE 0
#endif

// The #ifndef allows HWY_HAVE_SCALAR_F16_TYPE to be predefined before this
// point, in which case the detection below is skipped.
#ifndef HWY_HAVE_SCALAR_F16_TYPE
// Compiler supports _Float16, not necessarily with operators.
#if HWY_NEON_HAVE_F16C || HWY_RVV_HAVE_F16_VEC || HWY_SSE2_HAVE_F16_TYPE
#define HWY_HAVE_SCALAR_F16_TYPE 1
#else
#define HWY_HAVE_SCALAR_F16_TYPE 0
#endif
#endif  // HWY_HAVE_SCALAR_F16_TYPE

// Likewise only defined here if not already predefined.
#ifndef HWY_HAVE_SCALAR_F16_OPERATORS
// Recent enough compiler also has operators.
#if HWY_HAVE_SCALAR_F16_TYPE &&                                       \
    (HWY_COMPILER_CLANG >= 1800 || HWY_COMPILER_GCC_ACTUAL >= 1200 || \
     (HWY_COMPILER_CLANG >= 1500 && !HWY_COMPILER_CLANGCL &&          \
      !defined(_WIN32)) ||                                            \
     (HWY_ARCH_ARM &&                                                 \
      (HWY_COMPILER_CLANG >= 900 || HWY_COMPILER_GCC_ACTUAL >= 800)))
#define HWY_HAVE_SCALAR_F16_OPERATORS 1
#else
#define HWY_HAVE_SCALAR_F16_OPERATORS 0
#endif
#endif  // HWY_HAVE_SCALAR_F16_OPERATORS
1096
+
1097
+ namespace detail {
1098
+
1099
+ template <class T, class TVal = RemoveCvRef<T>, bool = IsSpecialFloat<TVal>()>
1100
+ struct SpecialFloatUnwrapArithOpOperandT {};
1101
+
1102
+ template <class T, class TVal>
1103
+ struct SpecialFloatUnwrapArithOpOperandT<T, TVal, false> {
1104
+ using type = T;
1105
+ };
1106
+
1107
+ template <class T>
1108
+ using SpecialFloatUnwrapArithOpOperand =
1109
+ typename SpecialFloatUnwrapArithOpOperandT<T>::type;
1110
+
1111
+ template <class T, class TVal = RemoveCvRef<T>>
1112
+ struct NativeSpecialFloatToWrapperT {
1113
+ using type = T;
1114
+ };
1115
+
1116
+ template <class T>
1117
+ using NativeSpecialFloatToWrapper =
1118
+ typename NativeSpecialFloatToWrapperT<T>::type;
1119
+
1120
+ } // namespace detail
1121
+
1122
// 16-bit floating-point lane type, stored in 2 bytes with 2-byte alignment.
//
// Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
// by concatenating base type and bits. We use a wrapper class instead of a
// typedef to the native type to ensure that the same symbols, e.g. for VQSort,
// are generated regardless of F16 support; see #1684.
struct alignas(2) float16_t {
#if HWY_HAVE_SCALAR_F16_TYPE
  // `Native` is the compiler-provided 16-bit float type backing this wrapper.
#if HWY_RVV_HAVE_F16_VEC || HWY_SSE2_HAVE_F16_TYPE
  using Native = _Float16;
#elif HWY_NEON_HAVE_F16C
  using Native = __fp16;
#else
#error "Logic error: condition should be 'all but NEON_HAVE_F16C'"
#endif
#endif  // HWY_HAVE_SCALAR_F16_TYPE

  // Storage: the same two bytes viewed as the native type (if any) or as raw
  // bits.
  union {
#if HWY_HAVE_SCALAR_F16_TYPE
    // Accessed via NativeLaneType, and used directly if
    // HWY_HAVE_SCALAR_F16_OPERATORS.
    Native native;
#endif
    // Only accessed via NativeLaneType or U16LaneType.
    uint16_t bits;
  };

  // Default init and copying.
  float16_t() noexcept = default;
  constexpr float16_t(const float16_t&) noexcept = default;
  constexpr float16_t(float16_t&&) noexcept = default;
  float16_t& operator=(const float16_t&) noexcept = default;
  float16_t& operator=(float16_t&&) noexcept = default;

#if HWY_HAVE_SCALAR_F16_TYPE
  // NEON vget/set_lane intrinsics and SVE `svaddv` could use explicit
  // float16_t(intrinsic()), but user code expects implicit conversions.
  constexpr float16_t(Native arg) noexcept : native(arg) {}
  constexpr operator Native() const noexcept { return native; }
#endif

  // FromBits constructs a float16_t from its 16-bit representation.
#if HWY_HAVE_SCALAR_F16_TYPE
  static HWY_BITCASTSCALAR_CONSTEXPR float16_t FromBits(uint16_t bits) {
    return float16_t(BitCastScalar<Native>(bits));
  }
#else

 private:
  // Tag that selects the private bits-initializing constructor.
  struct F16FromU16BitsTag {};
  constexpr float16_t(F16FromU16BitsTag /*tag*/, uint16_t u16_bits)
      : bits(u16_bits) {}

 public:
  static constexpr float16_t FromBits(uint16_t bits) {
    return float16_t(F16FromU16BitsTag(), bits);
  }
#endif

  // When backed by a native type, ensure the wrapper behaves like the native
  // type by forwarding all operators. Unfortunately it seems difficult to reuse
  // this code in a base class, so we repeat it in float16_t.
#if HWY_HAVE_SCALAR_F16_OPERATORS || HWY_IDE
  // Implicit construction from any non-float16_t type that is implicitly
  // convertible to Native.
  template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, float16_t>() &&
                                      IsConvertible<T, Native>()>* = nullptr>
  constexpr float16_t(T&& arg) noexcept
      : native(static_cast<Native>(static_cast<T&&>(arg))) {}

  // Explicit construction from types that are only static_cast-able to Native.
  template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, float16_t>() &&
                                      !IsConvertible<T, Native>() &&
                                      IsStaticCastable<T, Native>()>* = nullptr>
  explicit constexpr float16_t(T&& arg) noexcept
      : native(static_cast<Native>(static_cast<T&&>(arg))) {}

  // pre-decrement operator (--x)
  HWY_CXX14_CONSTEXPR float16_t& operator--() noexcept {
    native = static_cast<Native>(native - Native{1});
    return *this;
  }

  // post-decrement operator (x--)
  HWY_CXX14_CONSTEXPR float16_t operator--(int) noexcept {
    float16_t result = *this;
    native = static_cast<Native>(native - Native{1});
    return result;
  }

  // pre-increment operator (++x)
  HWY_CXX14_CONSTEXPR float16_t& operator++() noexcept {
    native = static_cast<Native>(native + Native{1});
    return *this;
  }

  // post-increment operator (x++)
  HWY_CXX14_CONSTEXPR float16_t operator++(int) noexcept {
    float16_t result = *this;
    native = static_cast<Native>(native + Native{1});
    return result;
  }

  // Unary minus and plus.
  constexpr float16_t operator-() const noexcept {
    return float16_t(static_cast<Native>(-native));
  }
  constexpr float16_t operator+() const noexcept { return *this; }

  // Reduce clutter by generating `operator+` and `operator+=` etc. Note that
  // we cannot token-paste `operator` and `+`, so pass it in as `op_func`.
  // Each expansion defines four members: op_func(float16_t),
  // op_func(const T&) for other operand types, assign_func(float16_t) and
  // assign_func(const T&).
#define HWY_FLOAT16_BINARY_OP(op, op_func, assign_func)                      \
  constexpr float16_t op_func(const float16_t& rhs) const noexcept {         \
    return float16_t(static_cast<Native>(native op rhs.native));             \
  }                                                                          \
  template <typename T, HWY_IF_NOT_F16(T),                                   \
            typename UnwrappedT =                                            \
                detail::SpecialFloatUnwrapArithOpOperand<const T&>,          \
            typename RawResultT =                                            \
                decltype(DeclVal<Native>() op DeclVal<UnwrappedT>()),        \
            typename ResultT =                                               \
                detail::NativeSpecialFloatToWrapper<RawResultT>,             \
            HWY_IF_CASTABLE(RawResultT, ResultT)>                            \
  constexpr ResultT op_func(const T& rhs) const noexcept(noexcept(           \
      static_cast<ResultT>(DeclVal<Native>() op DeclVal<UnwrappedT>()))) {   \
    return static_cast<ResultT>(native op static_cast<UnwrappedT>(rhs));     \
  }                                                                          \
  HWY_CXX14_CONSTEXPR hwy::float16_t& assign_func(                           \
      const hwy::float16_t& rhs) noexcept {                                  \
    native = static_cast<Native>(native op rhs.native);                      \
    return *this;                                                            \
  }                                                                          \
  template <typename T, HWY_IF_NOT_F16(T),                                   \
            HWY_IF_OP_CASTABLE(op, const T&, Native),                        \
            HWY_IF_ASSIGNABLE(                                               \
                Native, decltype(DeclVal<Native>() op DeclVal<const T&>()))> \
  HWY_CXX14_CONSTEXPR hwy::float16_t& assign_func(const T& rhs) noexcept(    \
      noexcept(                                                              \
          static_cast<Native>(DeclVal<Native>() op DeclVal<const T&>()))) {  \
    native = static_cast<Native>(native op rhs);                             \
    return *this;                                                            \
  }

  HWY_FLOAT16_BINARY_OP(+, operator+, operator+=)
  HWY_FLOAT16_BINARY_OP(-, operator-, operator-=)
  HWY_FLOAT16_BINARY_OP(*, operator*, operator*=)
  HWY_FLOAT16_BINARY_OP(/, operator/, operator/=)
#undef HWY_FLOAT16_BINARY_OP

#endif  // HWY_HAVE_SCALAR_F16_OPERATORS
};
static_assert(sizeof(hwy::float16_t) == 2, "Wrong size of float16_t");
1267
+
1268
#if HWY_HAVE_SCALAR_F16_TYPE
namespace detail {

#if HWY_HAVE_SCALAR_F16_OPERATORS
// With native operators available, a float16_t operand of an arithmetic
// operator is unwrapped to its Native type.
template <class T>
struct SpecialFloatUnwrapArithOpOperandT<T, hwy::float16_t, true> {
  typedef hwy::float16_t::Native type;
};
#endif

// A raw result of type float16_t::Native is mapped to the float16_t wrapper.
template <class T>
struct NativeSpecialFloatToWrapperT<T, hwy::float16_t::Native> {
  typedef hwy::float16_t type;
};

}  // namespace detail
#endif  // HWY_HAVE_SCALAR_F16_TYPE
1285
+
1286
+ #if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
1287
+ namespace detail {
1288
+
1289
+ template <>
1290
+ struct BitCastScalarSrcCastHelper<hwy::float16_t> {
1291
+ #if HWY_HAVE_SCALAR_F16_TYPE
1292
+ static HWY_INLINE constexpr const hwy::float16_t::Native& CastSrcValRef(
1293
+ const hwy::float16_t& val) {
1294
+ return val.native;
1295
+ }
1296
+ #else
1297
+ static HWY_INLINE constexpr const uint16_t& CastSrcValRef(
1298
+ const hwy::float16_t& val) {
1299
+ return val.bits;
1300
+ }
1301
+ #endif
1302
+ };
1303
+
1304
+ } // namespace detail
1305
+ #endif // HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
1306
+
1307
// Specifier for the F16 conversion functions below: `constexpr` when native
// F16 operators are available; otherwise whatever
// HWY_BITCASTSCALAR_CXX14_CONSTEXPR expands to.
#if HWY_HAVE_SCALAR_F16_OPERATORS
#define HWY_F16_CONSTEXPR constexpr
#else
#define HWY_F16_CONSTEXPR HWY_BITCASTSCALAR_CXX14_CONSTEXPR
#endif  // HWY_HAVE_SCALAR_F16_OPERATORS
1312
+
1313
+ HWY_API HWY_F16_CONSTEXPR float F32FromF16(float16_t f16) {
1314
+ #if HWY_HAVE_SCALAR_F16_OPERATORS && !HWY_IDE
1315
+ return static_cast<float>(f16);
1316
+ #endif
1317
+ #if !HWY_HAVE_SCALAR_F16_OPERATORS || HWY_IDE
1318
+ const uint16_t bits16 = BitCastScalar<uint16_t>(f16);
1319
+ const uint32_t sign = static_cast<uint32_t>(bits16 >> 15);
1320
+ const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
1321
+ const uint32_t mantissa = bits16 & 0x3FF;
1322
+
1323
+ // Subnormal or zero
1324
+ if (biased_exp == 0) {
1325
+ const float subnormal =
1326
+ (1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024));
1327
+ return sign ? -subnormal : subnormal;
1328
+ }
1329
+
1330
+ // Normalized, infinity or NaN: convert the representation directly
1331
+ // (faster than ldexp/tables).
1332
+ const uint32_t biased_exp32 =
1333
+ biased_exp == 31 ? 0xFF : biased_exp + (127 - 15);
1334
+ const uint32_t mantissa32 = mantissa << (23 - 10);
1335
+ const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
1336
+
1337
+ return BitCastScalar<float>(bits32);
1338
+ #endif // !HWY_HAVE_SCALAR_F16_OPERATORS
1339
+ }
1340
+
1341
// HWY_F16_FROM_F32_DASSERT(condition): debug assertion for use inside
// F16FromF32. Depending on compiler support it only asserts outside constant
// evaluation, expands to nothing, or is plain HWY_DASSERT.
#if HWY_IS_DEBUG_BUILD && \
    (HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926)
#if defined(__cpp_if_consteval) && __cpp_if_consteval >= 202106L
// If C++23 if !consteval support is available, only execute
// HWY_DASSERT(condition) if F16FromF32 is not called from a constant-evaluated
// context to avoid compilation errors.
#define HWY_F16_FROM_F32_DASSERT(condition) \
  do {                                      \
    if !consteval {                         \
      HWY_DASSERT(condition);               \
    }                                       \
  } while (0)
#elif HWY_HAS_BUILTIN(__builtin_is_constant_evaluated) || \
    HWY_COMPILER_MSVC >= 1926
// If the __builtin_is_constant_evaluated() intrinsic is available,
// only do HWY_DASSERT(condition) if __builtin_is_constant_evaluated() returns
// false to avoid compilation errors if F16FromF32 is called from a
// constant-evaluated context.
#define HWY_F16_FROM_F32_DASSERT(condition)   \
  do {                                        \
    if (!__builtin_is_constant_evaluated()) { \
      HWY_DASSERT(condition);                 \
    }                                         \
  } while (0)
#else
// If C++23 if !consteval support is not available,
// the __builtin_is_constant_evaluated() intrinsic is not available,
// HWY_IS_DEBUG_BUILD is 1, and the __builtin_bit_cast intrinsic is available,
// do not do a HWY_DASSERT to avoid compilation errors if F16FromF32 is
// called from a constant-evaluated context.
#define HWY_F16_FROM_F32_DASSERT(condition) \
  do {                                      \
  } while (0)
#endif  // defined(__cpp_if_consteval) && __cpp_if_consteval >= 202106L
#else
// If HWY_IS_DEBUG_BUILD is 0 or the __builtin_bit_cast intrinsic is not
// available, define HWY_F16_FROM_F32_DASSERT(condition) as
// HWY_DASSERT(condition)
#define HWY_F16_FROM_F32_DASSERT(condition) HWY_DASSERT(condition)
#endif  // HWY_IS_DEBUG_BUILD && (HWY_HAS_BUILTIN(__builtin_bit_cast) ||
        // HWY_COMPILER_MSVC >= 1926)
1382
+
1383
// Converts a float to float16_t. Uses the native conversion when F16
// operators are available; otherwise rounds the F32 mantissa to nearest even
// and assembles the 16-bit representation manually (too-small inputs become
// signed zero, overflow becomes infinity, NaN stays NaN).
HWY_API HWY_F16_CONSTEXPR float16_t F16FromF32(float f32) {
#if HWY_HAVE_SCALAR_F16_OPERATORS && !HWY_IDE
  return float16_t(static_cast<float16_t::Native>(f32));
#endif
#if !HWY_HAVE_SCALAR_F16_OPERATORS || HWY_IDE
  const uint32_t bits32 = BitCastScalar<uint32_t>(f32);
  const uint32_t sign = bits32 >> 31;
  const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
  constexpr uint32_t kMantissaMask = 0x7FFFFF;
  const uint32_t mantissa32 = bits32 & kMantissaMask;

  // Before shifting (truncation), round to nearest even to reduce bias. If
  // the lowest remaining mantissa bit is odd, increase the offset. Example
  // with the lowest remaining bit (left) and next lower two bits; the
  // latter, plus two more, will be truncated.
  // 0[00] +  1 =  0[01]
  // 0[01] +  1 =  0[10]
  // 0[10] +  1 =  0[11]  (round down toward even)
  // 0[11] +  1 =  1[00]  (round up)
  // 1[00] + 10 =  1[10]
  // 1[01] + 10 =  1[11]
  // 1[10] + 10 = C0[00]  (round up toward even with C=1 carry out)
  // 1[11] + 10 = C0[01]  (round up toward even with C=1 carry out)

  // If |f32| >= 2^-24, f16_ulp_bit_idx is the index of the F32 mantissa bit
  // that will be shifted down into the ULP bit of the rounded down F16 result

  // The biased F32 exponent of 2^-14 (the smallest positive normal F16 value)
  // is 113, and bit 13 of the F32 mantissa will be shifted down into the ULP
  // bit of the rounded down F16 result if |f32| >= 2^-14

  // If |f32| < 2^-24, f16_ulp_bit_idx is equal to 24 as there are 24 mantissa
  // bits (including the implied 1 bit) in the mantissa of a normal F32 value
  // and as we want to round up the mantissa if |f32| > 2^-25 && |f32| < 2^-24
  const int32_t f16_ulp_bit_idx =
      HWY_MIN(HWY_MAX(126 - static_cast<int32_t>(biased_exp32), 13), 24);
  // 0x800000u restores the implied leading 1 bit before extracting the bit
  // that becomes the F16 ULP.
  const uint32_t odd_bit = ((mantissa32 | 0x800000u) >> f16_ulp_bit_idx) & 1;
  // Adds half an F16 ULP minus one, plus one more if the ULP bit is odd.
  const uint32_t rounded =
      mantissa32 + odd_bit + (uint32_t{1} << (f16_ulp_bit_idx - 1)) - 1u;
  // Rounding overflowed the 23-bit mantissa: the exponent increases by one.
  const bool carry = rounded >= (1u << 23);

  const int32_t exp = static_cast<int32_t>(biased_exp32) - 127 + carry;

  // Tiny or zero => zero.
  if (exp < -24) {
    // restore original sign
    return float16_t::FromBits(static_cast<uint16_t>(sign << 15));
  }

  // If biased_exp16 would be >= 31, first check whether the input was NaN so we
  // can set the mantissa to nonzero.
  const bool is_nan = (biased_exp32 == 255) && mantissa32 != 0;
  const bool overflowed = exp >= 16;
  const uint32_t biased_exp16 =
      static_cast<uint32_t>(HWY_MIN(HWY_MAX(0, exp + 15), 31));
  // exp = [-24, -15] => subnormal, shift the mantissa.
  const uint32_t sub_exp = static_cast<uint32_t>(HWY_MAX(-14 - exp, 0));
  HWY_F16_FROM_F32_DASSERT(sub_exp < 11);
  const uint32_t shifted_mantissa =
      (rounded & kMantissaMask) >> (23 - 10 + sub_exp);
  // For subnormal results, the implied leading 1 becomes an explicit mantissa
  // bit.
  const uint32_t leading = sub_exp == 0u ? 0u : (1024u >> sub_exp);
  // NaN => all-ones mantissa; overflow => zero mantissa (infinity).
  const uint32_t mantissa16 = is_nan       ? 0x3FF
                              : overflowed ? 0u
                                           : (leading + shifted_mantissa);

#if HWY_IS_DEBUG_BUILD
  if (exp < -14) {
    HWY_F16_FROM_F32_DASSERT(biased_exp16 == 0);
    HWY_F16_FROM_F32_DASSERT(sub_exp >= 1);
  } else if (exp <= 15) {
    HWY_F16_FROM_F32_DASSERT(1 <= biased_exp16 && biased_exp16 < 31);
    HWY_F16_FROM_F32_DASSERT(sub_exp == 0);
  }
#endif

  HWY_F16_FROM_F32_DASSERT(mantissa16 < 1024);
  const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
  HWY_F16_FROM_F32_DASSERT(bits16 < 0x10000);
  const uint16_t narrowed = static_cast<uint16_t>(bits16);  // big-endian safe
  return float16_t::FromBits(narrowed);
#endif  // !HWY_HAVE_SCALAR_F16_OPERATORS
}
1465
+
1466
+ HWY_API HWY_F16_CONSTEXPR float16_t F16FromF64(double f64) {
1467
+ #if HWY_HAVE_SCALAR_F16_OPERATORS
1468
+ return float16_t(static_cast<float16_t::Native>(f64));
1469
+ #else
1470
+ // The mantissa bits of f64 are first rounded using round-to-odd rounding
1471
+ // to the nearest f64 value that has the lower 29 bits zeroed out to
1472
+ // ensure that the result is correctly rounded to a F16.
1473
+
1474
+ // The F64 round-to-odd operation below will round a normal F64 value
1475
+ // (using round-to-odd rounding) to a F64 value that has 24 bits of precision.
1476
+
1477
+ // It is okay if the magnitude of a denormal F64 value is rounded up in the
1478
+ // F64 round-to-odd step below as the magnitude of a denormal F64 value is
1479
+ // much smaller than 2^(-24) (the smallest positive denormal F16 value).
1480
+
1481
+ // It is also okay if bit 29 of a NaN F64 value is changed by the F64
1482
+ // round-to-odd step below as the lower 13 bits of a F32 NaN value are usually
1483
+ // discarded or ignored by the conversion of a F32 NaN value to a F16.
1484
+
1485
+ // If f64 is a NaN value, the result of the F64 round-to-odd step will be a
1486
+ // NaN value as the result of the F64 round-to-odd step will have at least one
1487
+ // mantissa bit if f64 is a NaN value.
1488
+
1489
+ // The F64 round-to-odd step will ensure that the F64 to F32 conversion is
1490
+ // exact if the magnitude of the rounded F64 value (using round-to-odd
1491
+ // rounding) is between 2^(-126) (the smallest normal F32 value) and
1492
+ // HighestValue<float>() (the largest finite F32 value)
1493
+
1494
+ // It is okay if the F64 to F32 conversion is inexact for F64 values that have
1495
+ // a magnitude that is less than 2^(-126) as the magnitude of a denormal F32
1496
+ // value is much smaller than 2^(-24) (the smallest positive denormal F16
1497
+ // value).
1498
+
1499
+ return F16FromF32(
1500
+ static_cast<float>(BitCastScalar<double>(static_cast<uint64_t>(
1501
+ (BitCastScalar<uint64_t>(f64) & 0xFFFFFFFFE0000000ULL) |
1502
+ ((BitCastScalar<uint64_t>(f64) + 0x000000001FFFFFFFULL) &
1503
+ 0x0000000020000000ULL)))));
1504
+ #endif
1505
+ }
1506
+
1507
+ // More convenient to define outside float16_t because these may use
1508
+ // F32FromF16, which is defined after the struct.
1509
+ HWY_F16_CONSTEXPR inline bool operator==(float16_t lhs,
1510
+ float16_t rhs) noexcept {
1511
+ #if HWY_HAVE_SCALAR_F16_OPERATORS
1512
+ return lhs.native == rhs.native;
1513
+ #else
1514
+ return F32FromF16(lhs) == F32FromF16(rhs);
1515
+ #endif
1516
+ }
1517
+ HWY_F16_CONSTEXPR inline bool operator!=(float16_t lhs,
1518
+ float16_t rhs) noexcept {
1519
+ #if HWY_HAVE_SCALAR_F16_OPERATORS
1520
+ return lhs.native != rhs.native;
1521
+ #else
1522
+ return F32FromF16(lhs) != F32FromF16(rhs);
1523
+ #endif
1524
+ }
1525
+ HWY_F16_CONSTEXPR inline bool operator<(float16_t lhs, float16_t rhs) noexcept {
1526
+ #if HWY_HAVE_SCALAR_F16_OPERATORS
1527
+ return lhs.native < rhs.native;
1528
+ #else
1529
+ return F32FromF16(lhs) < F32FromF16(rhs);
1530
+ #endif
1531
+ }
1532
+ HWY_F16_CONSTEXPR inline bool operator<=(float16_t lhs,
1533
+ float16_t rhs) noexcept {
1534
+ #if HWY_HAVE_SCALAR_F16_OPERATORS
1535
+ return lhs.native <= rhs.native;
1536
+ #else
1537
+ return F32FromF16(lhs) <= F32FromF16(rhs);
1538
+ #endif
1539
+ }
1540
+ HWY_F16_CONSTEXPR inline bool operator>(float16_t lhs, float16_t rhs) noexcept {
1541
+ #if HWY_HAVE_SCALAR_F16_OPERATORS
1542
+ return lhs.native > rhs.native;
1543
+ #else
1544
+ return F32FromF16(lhs) > F32FromF16(rhs);
1545
+ #endif
1546
+ }
1547
+ HWY_F16_CONSTEXPR inline bool operator>=(float16_t lhs,
1548
+ float16_t rhs) noexcept {
1549
+ #if HWY_HAVE_SCALAR_F16_OPERATORS
1550
+ return lhs.native >= rhs.native;
1551
+ #else
1552
+ return F32FromF16(lhs) >= F32FromF16(rhs);
1553
+ #endif
1554
+ }
1555
+ #if HWY_HAVE_CXX20_THREE_WAY_COMPARE
1556
+ HWY_F16_CONSTEXPR inline std::partial_ordering operator<=>(
1557
+ float16_t lhs, float16_t rhs) noexcept {
1558
+ #if HWY_HAVE_SCALAR_F16_OPERATORS
1559
+ return lhs.native <=> rhs.native;
1560
+ #else
1561
+ return F32FromF16(lhs) <=> F32FromF16(rhs);
1562
+ #endif
1563
+ }
1564
+ #endif // HWY_HAVE_CXX20_THREE_WAY_COMPARE
1565
+
1566
+ //------------------------------------------------------------------------------
1567
+ // BF16 lane type
1568
+
1569
+ // Compiler supports ACLE __bf16, not necessarily with operators.
1570
+
1571
+ // Disable the __bf16 type on AArch64 with GCC 13 or earlier as there is a bug
1572
+ // in GCC 13 and earlier that sometimes causes BF16 constant values to be
1573
+ // incorrectly loaded on AArch64, and this GCC bug on AArch64 is
1574
+ // described at https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111867.
1575
+
1576
+ #if HWY_ARCH_ARM_A64 && \
1577
+ (HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400)
1578
+ #define HWY_ARM_HAVE_SCALAR_BF16_TYPE 1
1579
+ #else
1580
+ #define HWY_ARM_HAVE_SCALAR_BF16_TYPE 0
1581
+ #endif
1582
+
1583
+ // x86 compiler supports __bf16, not necessarily with operators.
1584
+ #ifndef HWY_SSE2_HAVE_SCALAR_BF16_TYPE
1585
+ #if HWY_ARCH_X86 && defined(__SSE2__) && \
1586
+ ((HWY_COMPILER_CLANG >= 1700 && !HWY_COMPILER_CLANGCL) || \
1587
+ HWY_COMPILER_GCC_ACTUAL >= 1300)
1588
+ #define HWY_SSE2_HAVE_SCALAR_BF16_TYPE 1
1589
+ #else
1590
+ #define HWY_SSE2_HAVE_SCALAR_BF16_TYPE 0
1591
+ #endif
1592
+ #endif // HWY_SSE2_HAVE_SCALAR_BF16_TYPE
1593
+
1594
+ // Compiler supports __bf16, not necessarily with operators.
1595
+ #if HWY_ARM_HAVE_SCALAR_BF16_TYPE || HWY_SSE2_HAVE_SCALAR_BF16_TYPE
1596
+ #define HWY_HAVE_SCALAR_BF16_TYPE 1
1597
+ #else
1598
+ #define HWY_HAVE_SCALAR_BF16_TYPE 0
1599
+ #endif
1600
+
1601
+ #ifndef HWY_HAVE_SCALAR_BF16_OPERATORS
1602
+ // Recent enough compiler also has operators. aarch64 clang 18 hits internal
1603
+ // compiler errors on bf16 ToString, hence only enable on GCC for now.
1604
+ #if HWY_HAVE_SCALAR_BF16_TYPE && (HWY_COMPILER_GCC_ACTUAL >= 1300)
1605
+ #define HWY_HAVE_SCALAR_BF16_OPERATORS 1
1606
+ #else
1607
+ #define HWY_HAVE_SCALAR_BF16_OPERATORS 0
1608
+ #endif
1609
+ #endif // HWY_HAVE_SCALAR_BF16_OPERATORS
1610
+
1611
+ #if HWY_HAVE_SCALAR_BF16_OPERATORS
1612
+ #define HWY_BF16_CONSTEXPR constexpr
1613
+ #else
1614
+ #define HWY_BF16_CONSTEXPR HWY_BITCASTSCALAR_CONSTEXPR
1615
+ #endif
1616
+
1617
+ struct alignas(2) bfloat16_t {
1618
+ #if HWY_HAVE_SCALAR_BF16_TYPE
1619
+ using Native = __bf16;
1620
+ #endif
1621
+
1622
+ union {
1623
+ #if HWY_HAVE_SCALAR_BF16_TYPE
1624
+ // Accessed via NativeLaneType, and used directly if
1625
+ // HWY_HAVE_SCALAR_BF16_OPERATORS.
1626
+ Native native;
1627
+ #endif
1628
+ // Only accessed via NativeLaneType or U16LaneType.
1629
+ uint16_t bits;
1630
+ };
1631
+
1632
+ // Default init and copying
1633
+ bfloat16_t() noexcept = default;
1634
+ constexpr bfloat16_t(bfloat16_t&&) noexcept = default;
1635
+ constexpr bfloat16_t(const bfloat16_t&) noexcept = default;
1636
+ bfloat16_t& operator=(bfloat16_t&& arg) noexcept = default;
1637
+ bfloat16_t& operator=(const bfloat16_t& arg) noexcept = default;
1638
+
1639
+ // Only enable implicit conversions if we have a native type.
1640
+ #if HWY_HAVE_SCALAR_BF16_TYPE
1641
+ constexpr bfloat16_t(Native arg) noexcept : native(arg) {}
1642
+ constexpr operator Native() const noexcept { return native; }
1643
+ #endif
1644
+
1645
+ #if HWY_HAVE_SCALAR_BF16_TYPE
1646
+ static HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t FromBits(uint16_t bits) {
1647
+ return bfloat16_t(BitCastScalar<Native>(bits));
1648
+ }
1649
+ #else
1650
+
1651
+ private:
1652
+ struct BF16FromU16BitsTag {};
1653
+ constexpr bfloat16_t(BF16FromU16BitsTag /*tag*/, uint16_t u16_bits)
1654
+ : bits(u16_bits) {}
1655
+
1656
+ public:
1657
+ static constexpr bfloat16_t FromBits(uint16_t bits) {
1658
+ return bfloat16_t(BF16FromU16BitsTag(), bits);
1659
+ }
1660
+ #endif
1661
+
1662
+ // When backed by a native type, ensure the wrapper behaves like the native
1663
+ // type by forwarding all operators. Unfortunately it seems difficult to reuse
1664
+ // this code in a base class, so we repeat it in float16_t.
1665
+ #if HWY_HAVE_SCALAR_BF16_OPERATORS || HWY_IDE
1666
+ template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, Native>() &&
1667
+ !IsSame<RemoveCvRef<T>, bfloat16_t>() &&
1668
+ IsConvertible<T, Native>()>* = nullptr>
1669
+ constexpr bfloat16_t(T&& arg) noexcept(
1670
+ noexcept(static_cast<Native>(DeclVal<T>())))
1671
+ : native(static_cast<Native>(static_cast<T&&>(arg))) {}
1672
+
1673
+ template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, Native>() &&
1674
+ !IsSame<RemoveCvRef<T>, bfloat16_t>() &&
1675
+ !IsConvertible<T, Native>() &&
1676
+ IsStaticCastable<T, Native>()>* = nullptr>
1677
+ explicit constexpr bfloat16_t(T&& arg) noexcept(
1678
+ noexcept(static_cast<Native>(DeclVal<T>())))
1679
+ : native(static_cast<Native>(static_cast<T&&>(arg))) {}
1680
+
1681
+ HWY_CXX14_CONSTEXPR bfloat16_t& operator=(Native arg) noexcept {
1682
+ native = arg;
1683
+ return *this;
1684
+ }
1685
+
1686
+ // pre-decrement operator (--x)
1687
+ HWY_CXX14_CONSTEXPR bfloat16_t& operator--() noexcept {
1688
+ native = static_cast<Native>(native - Native{1});
1689
+ return *this;
1690
+ }
874
1691
 
875
- // Use instead of HWY_IF_T_SIZE to avoid ambiguity with float16_t/float/double
876
- // overloads.
877
- #define HWY_IF_UI16(T) \
878
- hwy::EnableIf<IsSame<T, uint16_t>() || IsSame<T, int16_t>()>* = nullptr
879
- #define HWY_IF_UI32(T) \
880
- hwy::EnableIf<IsSame<T, uint32_t>() || IsSame<T, int32_t>()>* = nullptr
881
- #define HWY_IF_UI64(T) \
882
- hwy::EnableIf<IsSame<T, uint64_t>() || IsSame<T, int64_t>()>* = nullptr
883
- #define HWY_IF_BF16(T) hwy::EnableIf<IsSame<T, hwy::bfloat16_t>()>* = nullptr
884
- #define HWY_IF_F16(T) hwy::EnableIf<IsSame<T, hwy::float16_t>()>* = nullptr
1692
+ // post-decrement operator (x--)
1693
+ HWY_CXX14_CONSTEXPR bfloat16_t operator--(int) noexcept {
1694
+ bfloat16_t result = *this;
1695
+ native = static_cast<Native>(native - Native{1});
1696
+ return result;
1697
+ }
885
1698
 
886
- #define HWY_IF_LANES_PER_BLOCK(T, N, LANES) \
887
- hwy::EnableIf<HWY_MIN(sizeof(T) * N, 16) / sizeof(T) == (LANES)>* = nullptr
1699
+ // pre-increment operator (++x)
1700
+ HWY_CXX14_CONSTEXPR bfloat16_t& operator++() noexcept {
1701
+ native = static_cast<Native>(native + Native{1});
1702
+ return *this;
1703
+ }
888
1704
 
889
- // Empty struct used as a size tag type.
890
- template <size_t N>
891
- struct SizeTag {};
1705
+ // post-increment operator (x++)
1706
+ HWY_CXX14_CONSTEXPR bfloat16_t operator++(int) noexcept {
1707
+ bfloat16_t result = *this;
1708
+ native = static_cast<Native>(native + Native{1});
1709
+ return result;
1710
+ }
892
1711
 
893
- template <class T>
894
- struct RemoveConstT {
895
- using type = T;
896
- };
897
- template <class T>
898
- struct RemoveConstT<const T> {
899
- using type = T;
1712
+ constexpr bfloat16_t operator-() const noexcept {
1713
+ return bfloat16_t(static_cast<Native>(-native));
1714
+ }
1715
+ constexpr bfloat16_t operator+() const noexcept { return *this; }
1716
+
1717
+ // Reduce clutter by generating `operator+` and `operator+=` etc. Note that
1718
+ // we cannot token-paste `operator` and `+`, so pass it in as `op_func`.
1719
+ #define HWY_BFLOAT16_BINARY_OP(op, op_func, assign_func) \
1720
+ constexpr bfloat16_t op_func(const bfloat16_t& rhs) const noexcept { \
1721
+ return bfloat16_t(static_cast<Native>(native op rhs.native)); \
1722
+ } \
1723
+ template <typename T, HWY_IF_NOT_BF16(T), \
1724
+ typename UnwrappedT = \
1725
+ detail::SpecialFloatUnwrapArithOpOperand<const T&>, \
1726
+ typename RawResultT = \
1727
+ decltype(DeclVal<Native>() op DeclVal<UnwrappedT>()), \
1728
+ typename ResultT = \
1729
+ detail::NativeSpecialFloatToWrapper<RawResultT>, \
1730
+ HWY_IF_CASTABLE(RawResultT, ResultT)> \
1731
+ constexpr ResultT op_func(const T& rhs) const noexcept(noexcept( \
1732
+ static_cast<ResultT>(DeclVal<Native>() op DeclVal<UnwrappedT>()))) { \
1733
+ return static_cast<ResultT>(native op static_cast<UnwrappedT>(rhs)); \
1734
+ } \
1735
+ HWY_CXX14_CONSTEXPR hwy::bfloat16_t& assign_func( \
1736
+ const hwy::bfloat16_t& rhs) noexcept { \
1737
+ native = static_cast<Native>(native op rhs.native); \
1738
+ return *this; \
1739
+ } \
1740
+ template <typename T, HWY_IF_NOT_BF16(T), \
1741
+ HWY_IF_OP_CASTABLE(op, const T&, Native), \
1742
+ HWY_IF_ASSIGNABLE( \
1743
+ Native, decltype(DeclVal<Native>() op DeclVal<const T&>()))> \
1744
+ HWY_CXX14_CONSTEXPR hwy::bfloat16_t& assign_func(const T& rhs) noexcept( \
1745
+ noexcept( \
1746
+ static_cast<Native>(DeclVal<Native>() op DeclVal<const T&>()))) { \
1747
+ native = static_cast<Native>(native op rhs); \
1748
+ return *this; \
1749
+ }
1750
+ HWY_BFLOAT16_BINARY_OP(+, operator+, operator+=)
1751
+ HWY_BFLOAT16_BINARY_OP(-, operator-, operator-=)
1752
+ HWY_BFLOAT16_BINARY_OP(*, operator*, operator*=)
1753
+ HWY_BFLOAT16_BINARY_OP(/, operator/, operator/=)
1754
+ #undef HWY_BFLOAT16_BINARY_OP
1755
+
1756
+ #endif // HWY_HAVE_SCALAR_BF16_OPERATORS
900
1757
  };
1758
+ static_assert(sizeof(hwy::bfloat16_t) == 2, "Wrong size of bfloat16_t");
901
1759
 
902
- template <class T>
903
- using RemoveConst = typename RemoveConstT<T>::type;
1760
+ #pragma pack(pop)
1761
+
1762
+ #if HWY_HAVE_SCALAR_BF16_TYPE
1763
+ namespace detail {
904
1764
 
1765
+ #if HWY_HAVE_SCALAR_BF16_OPERATORS
905
1766
  template <class T>
906
- struct RemoveRefT {
907
- using type = T;
1767
+ struct SpecialFloatUnwrapArithOpOperandT<T, hwy::bfloat16_t, true> {
1768
+ using type = hwy::bfloat16_t::Native;
908
1769
  };
1770
+ #endif
1771
+
909
1772
  template <class T>
910
- struct RemoveRefT<T&> {
911
- using type = T;
1773
+ struct NativeSpecialFloatToWrapperT<T, hwy::bfloat16_t::Native> {
1774
+ using type = hwy::bfloat16_t;
912
1775
  };
913
- template <class T>
914
- struct RemoveRefT<T&&> {
915
- using type = T;
1776
+
1777
+ } // namespace detail
1778
+ #endif // HWY_HAVE_SCALAR_BF16_TYPE
1779
+
1780
+ #if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
1781
+ namespace detail {
1782
+
1783
+ template <>
1784
+ struct BitCastScalarSrcCastHelper<hwy::bfloat16_t> {
1785
+ #if HWY_HAVE_SCALAR_BF16_TYPE
1786
+ static HWY_INLINE constexpr const hwy::bfloat16_t::Native& CastSrcValRef(
1787
+ const hwy::bfloat16_t& val) {
1788
+ return val.native;
1789
+ }
1790
+ #else
1791
+ static HWY_INLINE constexpr const uint16_t& CastSrcValRef(
1792
+ const hwy::bfloat16_t& val) {
1793
+ return val.bits;
1794
+ }
1795
+ #endif
916
1796
  };
917
1797
 
918
- template <class T>
919
- using RemoveRef = typename RemoveRefT<T>::type;
1798
+ } // namespace detail
1799
+ #endif // HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
1800
+
1801
+ HWY_API HWY_BF16_CONSTEXPR float F32FromBF16(bfloat16_t bf) {
1802
+ #if HWY_HAVE_SCALAR_BF16_OPERATORS
1803
+ return static_cast<float>(bf);
1804
+ #else
1805
+ return BitCastScalar<float>(static_cast<uint32_t>(
1806
+ static_cast<uint32_t>(BitCastScalar<uint16_t>(bf)) << 16));
1807
+ #endif
1808
+ }
1809
+
1810
+ namespace detail {
1811
+
1812
+ // Returns the increment to add to the bits of a finite F32 value to round a
1813
+ // finite F32 to the nearest BF16 value
1814
+ static HWY_INLINE HWY_MAYBE_UNUSED constexpr uint32_t F32BitsToBF16RoundIncr(
1815
+ const uint32_t f32_bits) {
1816
+ return static_cast<uint32_t>(((f32_bits & 0x7FFFFFFFu) < 0x7F800000u)
1817
+ ? (0x7FFFu + ((f32_bits >> 16) & 1u))
1818
+ : 0u);
1819
+ }
1820
+
1821
+ // Converts f32_bits (which is the bits of a F32 value) to BF16 bits,
1822
+ // rounded to the nearest BF16 value
1823
+ static HWY_INLINE HWY_MAYBE_UNUSED constexpr uint16_t F32BitsToBF16Bits(
1824
+ const uint32_t f32_bits) {
1825
+ // Round f32_bits to the nearest BF16 by first adding
1826
+ // F32BitsToBF16RoundIncr(f32_bits) to f32_bits and then right shifting
1827
+ // f32_bits + F32BitsToBF16RoundIncr(f32_bits) by 16
1828
+
1829
+ // If f32_bits is the bit representation of a NaN F32 value, make sure that
1830
+ // bit 6 of the BF16 result is set to convert SNaN F32 values to QNaN BF16
1831
+ // values and to prevent NaN F32 values from being converted to an infinite
1832
+ // BF16 value
1833
+ return static_cast<uint16_t>(
1834
+ ((f32_bits + F32BitsToBF16RoundIncr(f32_bits)) >> 16) |
1835
+ (static_cast<uint32_t>((f32_bits & 0x7FFFFFFFu) > 0x7F800000u) << 6));
1836
+ }
1837
+
1838
+ } // namespace detail
1839
+
1840
+ HWY_API HWY_BF16_CONSTEXPR bfloat16_t BF16FromF32(float f) {
1841
+ #if HWY_HAVE_SCALAR_BF16_OPERATORS
1842
+ return static_cast<bfloat16_t>(f);
1843
+ #else
1844
+ return bfloat16_t::FromBits(
1845
+ detail::F32BitsToBF16Bits(BitCastScalar<uint32_t>(f)));
1846
+ #endif
1847
+ }
1848
+
1849
+ HWY_API HWY_BF16_CONSTEXPR bfloat16_t BF16FromF64(double f64) {
1850
+ #if HWY_HAVE_SCALAR_BF16_OPERATORS
1851
+ return static_cast<bfloat16_t>(f64);
1852
+ #else
1853
+ // The mantissa bits of f64 are first rounded using round-to-odd rounding
1854
+ // to the nearest f64 value that has the lower 38 bits zeroed out to
1855
+ // ensure that the result is correctly rounded to a BF16.
1856
+
1857
+ // The F64 round-to-odd operation below will round a normal F64 value
1858
+ // (using round-to-odd rounding) to a F64 value that has 15 bits of precision.
1859
+
1860
+ // It is okay if the magnitude of a denormal F64 value is rounded up in the
1861
+ // F64 round-to-odd step below as the magnitude of a denormal F64 value is
1862
+ // much smaller than 2^(-133) (the smallest positive denormal BF16 value).
1863
+
1864
+ // It is also okay if bit 38 of a NaN F64 value is changed by the F64
1865
+ // round-to-odd step below as the lower 16 bits of a F32 NaN value are usually
1866
+ // discarded or ignored by the conversion of a F32 NaN value to a BF16.
1867
+
1868
+ // If f64 is a NaN value, the result of the F64 round-to-odd step will be a
1869
+ // NaN value as the result of the F64 round-to-odd step will have at least one
1870
+ // mantissa bit if f64 is a NaN value.
1871
+
1872
+ // The F64 round-to-odd step below will ensure that the F64 to F32 conversion
1873
+ // is exact if the magnitude of the rounded F64 value (using round-to-odd
1874
+ // rounding) is between 2^(-135) (one-fourth of the smallest positive denormal
1875
+ // BF16 value) and HighestValue<float>() (the largest finite F32 value).
1876
+
1877
+ // If |f64| is less than 2^(-135), the magnitude of the result of the F64 to
1878
+ // F32 conversion is guaranteed to be less than or equal to 2^(-135), which
1879
+ // ensures that the F32 to BF16 conversion is correctly rounded, even if the
1880
+ // conversion of a rounded F64 value whose magnitude is less than 2^(-135)
1881
+ // to a F32 is inexact.
1882
+
1883
+ return BF16FromF32(
1884
+ static_cast<float>(BitCastScalar<double>(static_cast<uint64_t>(
1885
+ (BitCastScalar<uint64_t>(f64) & 0xFFFFFFC000000000ULL) |
1886
+ ((BitCastScalar<uint64_t>(f64) + 0x0000003FFFFFFFFFULL) &
1887
+ 0x0000004000000000ULL)))));
1888
+ #endif
1889
+ }
1890
+
1891
+ // More convenient to define outside bfloat16_t because these may use
1892
+ // F32FromBF16, which is defined after the struct.
1893
+
1894
+ HWY_BF16_CONSTEXPR inline bool operator==(bfloat16_t lhs,
1895
+ bfloat16_t rhs) noexcept {
1896
+ #if HWY_HAVE_SCALAR_BF16_OPERATORS
1897
+ return lhs.native == rhs.native;
1898
+ #else
1899
+ return F32FromBF16(lhs) == F32FromBF16(rhs);
1900
+ #endif
1901
+ }
1902
+
1903
+ HWY_BF16_CONSTEXPR inline bool operator!=(bfloat16_t lhs,
1904
+ bfloat16_t rhs) noexcept {
1905
+ #if HWY_HAVE_SCALAR_BF16_OPERATORS
1906
+ return lhs.native != rhs.native;
1907
+ #else
1908
+ return F32FromBF16(lhs) != F32FromBF16(rhs);
1909
+ #endif
1910
+ }
1911
+ HWY_BF16_CONSTEXPR inline bool operator<(bfloat16_t lhs,
1912
+ bfloat16_t rhs) noexcept {
1913
+ #if HWY_HAVE_SCALAR_BF16_OPERATORS
1914
+ return lhs.native < rhs.native;
1915
+ #else
1916
+ return F32FromBF16(lhs) < F32FromBF16(rhs);
1917
+ #endif
1918
+ }
1919
+ HWY_BF16_CONSTEXPR inline bool operator<=(bfloat16_t lhs,
1920
+ bfloat16_t rhs) noexcept {
1921
+ #if HWY_HAVE_SCALAR_BF16_OPERATORS
1922
+ return lhs.native <= rhs.native;
1923
+ #else
1924
+ return F32FromBF16(lhs) <= F32FromBF16(rhs);
1925
+ #endif
1926
+ }
1927
+ HWY_BF16_CONSTEXPR inline bool operator>(bfloat16_t lhs,
1928
+ bfloat16_t rhs) noexcept {
1929
+ #if HWY_HAVE_SCALAR_BF16_OPERATORS
1930
+ return lhs.native > rhs.native;
1931
+ #else
1932
+ return F32FromBF16(lhs) > F32FromBF16(rhs);
1933
+ #endif
1934
+ }
1935
+ HWY_BF16_CONSTEXPR inline bool operator>=(bfloat16_t lhs,
1936
+ bfloat16_t rhs) noexcept {
1937
+ #if HWY_HAVE_SCALAR_BF16_OPERATORS
1938
+ return lhs.native >= rhs.native;
1939
+ #else
1940
+ return F32FromBF16(lhs) >= F32FromBF16(rhs);
1941
+ #endif
1942
+ }
1943
+ #if HWY_HAVE_CXX20_THREE_WAY_COMPARE
1944
+ HWY_BF16_CONSTEXPR inline std::partial_ordering operator<=>(
1945
+ bfloat16_t lhs, bfloat16_t rhs) noexcept {
1946
+ #if HWY_HAVE_SCALAR_BF16_OPERATORS
1947
+ return lhs.native <=> rhs.native;
1948
+ #else
1949
+ return F32FromBF16(lhs) <=> F32FromBF16(rhs);
1950
+ #endif
1951
+ }
1952
+ #endif // HWY_HAVE_CXX20_THREE_WAY_COMPARE
920
1953
 
921
1954
  //------------------------------------------------------------------------------
922
1955
  // Type relations
@@ -1110,25 +2143,19 @@ constexpr auto IsFloatTag() -> hwy::SizeTag<(R::is_float ? 0x200 : 0x400)> {
1110
2143
 
1111
2144
  template <typename T>
1112
2145
  HWY_API constexpr bool IsFloat3264() {
1113
- return IsSame<T, float>() || IsSame<T, double>();
2146
+ return IsSameEither<RemoveCvRef<T>, float, double>();
1114
2147
  }
1115
2148
 
1116
2149
  template <typename T>
1117
2150
  HWY_API constexpr bool IsFloat() {
1118
2151
  // Cannot use T(1.25) != T(1) for float16_t, which can only be converted to or
1119
2152
  // from a float, not compared. Include float16_t in case HWY_HAVE_FLOAT16=1.
1120
- return IsSame<T, float16_t>() || IsFloat3264<T>();
1121
- }
1122
-
1123
- // These types are often special-cased and not supported in all ops.
1124
- template <typename T>
1125
- HWY_API constexpr bool IsSpecialFloat() {
1126
- return IsSame<T, float16_t>() || IsSame<T, bfloat16_t>();
2153
+ return IsSame<RemoveCvRef<T>, float16_t>() || IsFloat3264<T>();
1127
2154
  }
1128
2155
 
1129
2156
  template <typename T>
1130
2157
  HWY_API constexpr bool IsSigned() {
1131
- return T(0) > T(-1);
2158
+ return static_cast<T>(0) > static_cast<T>(-1);
1132
2159
  }
1133
2160
  template <>
1134
2161
  constexpr bool IsSigned<float16_t>() {
@@ -1138,104 +2165,113 @@ template <>
1138
2165
  constexpr bool IsSigned<bfloat16_t>() {
1139
2166
  return true;
1140
2167
  }
2168
+ template <>
2169
+ constexpr bool IsSigned<hwy::uint128_t>() {
2170
+ return false;
2171
+ }
2172
+ template <>
2173
+ constexpr bool IsSigned<hwy::K64V64>() {
2174
+ return false;
2175
+ }
2176
+ template <>
2177
+ constexpr bool IsSigned<hwy::K32V32>() {
2178
+ return false;
2179
+ }
2180
+
2181
+ template <typename T, bool = IsInteger<T>() && !IsIntegerLaneType<T>()>
2182
+ struct MakeLaneTypeIfIntegerT {
2183
+ using type = T;
2184
+ };
2185
+
2186
+ template <typename T>
2187
+ struct MakeLaneTypeIfIntegerT<T, true> {
2188
+ using type = hwy::If<IsSigned<T>(), SignedFromSize<sizeof(T)>,
2189
+ UnsignedFromSize<sizeof(T)>>;
2190
+ };
2191
+
2192
+ template <typename T>
2193
+ using MakeLaneTypeIfInteger = typename MakeLaneTypeIfIntegerT<T>::type;
1141
2194
 
1142
2195
  // Largest/smallest representable integer values.
1143
2196
  template <typename T>
1144
2197
  HWY_API constexpr T LimitsMax() {
1145
- static_assert(!IsFloat<T>(), "Only for integer types");
1146
- using TU = MakeUnsigned<T>;
1147
- return static_cast<T>(IsSigned<T>() ? (static_cast<TU>(~0ull) >> 1)
1148
- : static_cast<TU>(~0ull));
2198
+ static_assert(IsInteger<T>(), "Only for integer types");
2199
+ using TU = UnsignedFromSize<sizeof(T)>;
2200
+ return static_cast<T>(IsSigned<T>() ? (static_cast<TU>(~TU(0)) >> 1)
2201
+ : static_cast<TU>(~TU(0)));
1149
2202
  }
1150
2203
  template <typename T>
1151
2204
  HWY_API constexpr T LimitsMin() {
1152
- static_assert(!IsFloat<T>(), "Only for integer types");
1153
- return IsSigned<T>() ? T(-1) - LimitsMax<T>() : T(0);
2205
+ static_assert(IsInteger<T>(), "Only for integer types");
2206
+ return IsSigned<T>() ? static_cast<T>(-1) - LimitsMax<T>()
2207
+ : static_cast<T>(0);
1154
2208
  }
1155
2209
 
1156
2210
  // Largest/smallest representable value (integer or float). This naming avoids
1157
2211
  // confusion with numeric_limits<float>::min() (the smallest positive value).
1158
2212
  // Cannot be constexpr because we use CopySameSize for [b]float16_t.
1159
2213
  template <typename T>
1160
- HWY_API T LowestValue() {
2214
+ HWY_API HWY_BITCASTSCALAR_CONSTEXPR T LowestValue() {
1161
2215
  return LimitsMin<T>();
1162
2216
  }
1163
2217
  template <>
1164
- HWY_INLINE bfloat16_t LowestValue<bfloat16_t>() {
1165
- const uint16_t kBits = 0xFF7F; // -1.1111111 x 2^127
1166
- bfloat16_t ret;
1167
- CopySameSize(&kBits, &ret);
1168
- return ret;
2218
+ HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t LowestValue<bfloat16_t>() {
2219
+ return bfloat16_t::FromBits(uint16_t{0xFF7Fu}); // -1.1111111 x 2^127
1169
2220
  }
1170
2221
  template <>
1171
- HWY_INLINE float16_t LowestValue<float16_t>() {
1172
- const uint16_t kBits = 0xFBFF; // -1.1111111111 x 2^15
1173
- float16_t ret;
1174
- CopySameSize(&kBits, &ret);
1175
- return ret;
2222
+ HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t LowestValue<float16_t>() {
2223
+ return float16_t::FromBits(uint16_t{0xFBFFu}); // -1.1111111111 x 2^15
1176
2224
  }
1177
2225
  template <>
1178
- HWY_INLINE float LowestValue<float>() {
2226
+ HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float LowestValue<float>() {
1179
2227
  return -3.402823466e+38F;
1180
2228
  }
1181
2229
  template <>
1182
- HWY_INLINE double LowestValue<double>() {
2230
+ HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double LowestValue<double>() {
1183
2231
  return -1.7976931348623158e+308;
1184
2232
  }
1185
2233
 
1186
2234
  template <typename T>
1187
- HWY_API T HighestValue() {
2235
+ HWY_API HWY_BITCASTSCALAR_CONSTEXPR T HighestValue() {
1188
2236
  return LimitsMax<T>();
1189
2237
  }
1190
2238
  template <>
1191
- HWY_INLINE bfloat16_t HighestValue<bfloat16_t>() {
1192
- const uint16_t kBits = 0x7F7F; // 1.1111111 x 2^127
1193
- bfloat16_t ret;
1194
- CopySameSize(&kBits, &ret);
1195
- return ret;
2239
+ HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t HighestValue<bfloat16_t>() {
2240
+ return bfloat16_t::FromBits(uint16_t{0x7F7Fu}); // 1.1111111 x 2^127
1196
2241
  }
1197
2242
  template <>
1198
- HWY_INLINE float16_t HighestValue<float16_t>() {
1199
- const uint16_t kBits = 0x7BFF; // 1.1111111111 x 2^15
1200
- float16_t ret;
1201
- CopySameSize(&kBits, &ret);
1202
- return ret;
2243
+ HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t HighestValue<float16_t>() {
2244
+ return float16_t::FromBits(uint16_t{0x7BFFu}); // 1.1111111111 x 2^15
1203
2245
  }
1204
2246
  template <>
1205
- HWY_INLINE float HighestValue<float>() {
2247
+ HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float HighestValue<float>() {
1206
2248
  return 3.402823466e+38F;
1207
2249
  }
1208
2250
  template <>
1209
- HWY_INLINE double HighestValue<double>() {
2251
+ HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double HighestValue<double>() {
1210
2252
  return 1.7976931348623158e+308;
1211
2253
  }
1212
2254
 
1213
2255
  // Difference between 1.0 and the next representable value. Equal to
1214
2256
  // 1 / (1ULL << MantissaBits<T>()), but hard-coding ensures precision.
1215
2257
  template <typename T>
1216
- HWY_API T Epsilon() {
2258
+ HWY_API HWY_BITCASTSCALAR_CONSTEXPR T Epsilon() {
1217
2259
  return 1;
1218
2260
  }
1219
2261
  template <>
1220
- HWY_INLINE bfloat16_t Epsilon<bfloat16_t>() {
1221
- const uint16_t kBits = 0x3C00; // 0.0078125
1222
- bfloat16_t ret;
1223
- CopySameSize(&kBits, &ret);
1224
- return ret;
2262
+ HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t Epsilon<bfloat16_t>() {
2263
+ return bfloat16_t::FromBits(uint16_t{0x3C00u}); // 0.0078125
1225
2264
  }
1226
2265
  template <>
1227
- HWY_INLINE float16_t Epsilon<float16_t>() {
1228
- const uint16_t kBits = 0x1400; // 0.0009765625
1229
- float16_t ret;
1230
- CopySameSize(&kBits, &ret);
1231
- return ret;
2266
+ HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t Epsilon<float16_t>() {
2267
+ return float16_t::FromBits(uint16_t{0x1400u}); // 0.0009765625
1232
2268
  }
1233
2269
  template <>
1234
- HWY_INLINE float Epsilon<float>() {
2270
+ HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float Epsilon<float>() {
1235
2271
  return 1.192092896e-7f;
1236
2272
  }
1237
2273
  template <>
1238
- HWY_INLINE double Epsilon<double>() {
2274
+ HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double Epsilon<double>() {
1239
2275
  return 2.2204460492503131e-16;
1240
2276
  }
1241
2277
 
@@ -1278,7 +2314,8 @@ constexpr MakeUnsigned<T> SignMask() {
1278
2314
  // Returns bitmask of the exponent field in IEEE binary16/32/64.
1279
2315
  template <typename T>
1280
2316
  constexpr MakeUnsigned<T> ExponentMask() {
1281
- return (~(MakeUnsigned<T>{1} << MantissaBits<T>()) + 1) & ~SignMask<T>();
2317
+ return (~(MakeUnsigned<T>{1} << MantissaBits<T>()) + 1) &
2318
+ static_cast<MakeUnsigned<T>>(~SignMask<T>());
1282
2319
  }
1283
2320
 
1284
2321
  // Returns bitmask of the mantissa field in IEEE binary16/32/64.
@@ -1290,30 +2327,24 @@ constexpr MakeUnsigned<T> MantissaMask() {
1290
2327
  // Returns 1 << mantissa_bits as a floating-point number. All integers whose
1291
2328
  // absolute value are less than this can be represented exactly.
1292
2329
  template <typename T>
1293
- HWY_INLINE T MantissaEnd() {
2330
+ HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T MantissaEnd() {
1294
2331
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
1295
2332
  return 0;
1296
2333
  }
1297
2334
  template <>
1298
- HWY_INLINE bfloat16_t MantissaEnd<bfloat16_t>() {
1299
- const uint16_t kBits = 0x4300; // 1.0 x 2^7
1300
- bfloat16_t ret;
1301
- CopySameSize(&kBits, &ret);
1302
- return ret;
2335
+ HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t MantissaEnd<bfloat16_t>() {
2336
+ return bfloat16_t::FromBits(uint16_t{0x4300u}); // 1.0 x 2^7
1303
2337
  }
1304
2338
  template <>
1305
- HWY_INLINE float16_t MantissaEnd<float16_t>() {
1306
- const uint16_t kBits = 0x6400; // 1.0 x 2^10
1307
- float16_t ret;
1308
- CopySameSize(&kBits, &ret);
1309
- return ret;
2339
+ HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t MantissaEnd<float16_t>() {
2340
+ return float16_t::FromBits(uint16_t{0x6400u}); // 1.0 x 2^10
1310
2341
  }
1311
2342
  template <>
1312
- HWY_INLINE float MantissaEnd<float>() {
2343
+ HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float MantissaEnd<float>() {
1313
2344
  return 8388608.0f; // 1 << 23
1314
2345
  }
1315
2346
  template <>
1316
- HWY_INLINE double MantissaEnd<double>() {
2347
+ HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double MantissaEnd<double>() {
1317
2348
  // floating point literal with p52 requires C++17.
1318
2349
  return 4503599627370496.0; // 1 << 52
1319
2350
  }
@@ -1333,6 +2364,143 @@ constexpr MakeSigned<T> MaxExponentField() {
1333
2364
  return (MakeSigned<T>{1} << ExponentBits<T>()) - 1;
1334
2365
  }
1335
2366
 
2367
+ //------------------------------------------------------------------------------
2368
+ // Additional F16/BF16 operators
2369
+
2370
+ #if HWY_HAVE_SCALAR_F16_OPERATORS || HWY_HAVE_SCALAR_BF16_OPERATORS
2371
+
2372
+ #define HWY_RHS_SPECIAL_FLOAT_ARITH_OP(op, op_func, T2) \
2373
+ template < \
2374
+ typename T1, \
2375
+ hwy::EnableIf<hwy::IsInteger<RemoveCvRef<T1>>() || \
2376
+ hwy::IsFloat3264<RemoveCvRef<T1>>()>* = nullptr, \
2377
+ typename RawResultT = decltype(DeclVal<T1>() op DeclVal<T2::Native>()), \
2378
+ typename ResultT = detail::NativeSpecialFloatToWrapper<RawResultT>, \
2379
+ HWY_IF_CASTABLE(RawResultT, ResultT)> \
2380
+ static HWY_INLINE constexpr ResultT op_func(T1 a, T2 b) noexcept { \
2381
+ return static_cast<ResultT>(a op b.native); \
2382
+ }
2383
+
2384
+ #define HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(op, op_func, T1) \
2385
+ HWY_RHS_SPECIAL_FLOAT_ARITH_OP(op, op_func, T1) \
2386
+ template < \
2387
+ typename T2, \
2388
+ hwy::EnableIf<hwy::IsInteger<RemoveCvRef<T2>>() || \
2389
+ hwy::IsFloat3264<RemoveCvRef<T2>>()>* = nullptr, \
2390
+ typename RawResultT = decltype(DeclVal<T1::Native>() op DeclVal<T2>()), \
2391
+ typename ResultT = detail::NativeSpecialFloatToWrapper<RawResultT>, \
2392
+ HWY_IF_CASTABLE(RawResultT, ResultT)> \
2393
+ static HWY_INLINE constexpr ResultT op_func(T1 a, T2 b) noexcept { \
2394
+ return static_cast<ResultT>(a.native op b); \
2395
+ }
2396
+
2397
+ #if HWY_HAVE_SCALAR_F16_OPERATORS
2398
+ HWY_RHS_SPECIAL_FLOAT_ARITH_OP(+, operator+, float16_t)
2399
+ HWY_RHS_SPECIAL_FLOAT_ARITH_OP(-, operator-, float16_t)
2400
+ HWY_RHS_SPECIAL_FLOAT_ARITH_OP(*, operator*, float16_t)
2401
+ HWY_RHS_SPECIAL_FLOAT_ARITH_OP(/, operator/, float16_t)
2402
+ HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(==, operator==, float16_t)
2403
+ HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(!=, operator!=, float16_t)
2404
+ HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<, operator<, float16_t)
2405
+ HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=, operator<=, float16_t)
2406
+ HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>, operator>, float16_t)
2407
+ HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>=, operator>=, float16_t)
2408
+ #if HWY_HAVE_CXX20_THREE_WAY_COMPARE
2409
+ HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=>, operator<=>, float16_t)
2410
+ #endif
2411
+ #endif // HWY_HAVE_SCALAR_F16_OPERATORS
2412
+
2413
+ #if HWY_HAVE_SCALAR_BF16_OPERATORS
2414
+ HWY_RHS_SPECIAL_FLOAT_ARITH_OP(+, operator+, bfloat16_t)
2415
+ HWY_RHS_SPECIAL_FLOAT_ARITH_OP(-, operator-, bfloat16_t)
2416
+ HWY_RHS_SPECIAL_FLOAT_ARITH_OP(*, operator*, bfloat16_t)
2417
+ HWY_RHS_SPECIAL_FLOAT_ARITH_OP(/, operator/, bfloat16_t)
2418
+ HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(==, operator==, bfloat16_t)
2419
+ HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(!=, operator!=, bfloat16_t)
2420
+ HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<, operator<, bfloat16_t)
2421
+ HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=, operator<=, bfloat16_t)
2422
+ HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>, operator>, bfloat16_t)
2423
+ HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>=, operator>=, bfloat16_t)
2424
+ #if HWY_HAVE_CXX20_THREE_WAY_COMPARE
2425
+ HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=>, operator<=>, bfloat16_t)
2426
+ #endif
2427
+ #endif // HWY_HAVE_SCALAR_BF16_OPERATORS
2428
+
2429
+ #undef HWY_RHS_SPECIAL_FLOAT_ARITH_OP
2430
+ #undef HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP
2431
+
2432
+ #endif // HWY_HAVE_SCALAR_F16_OPERATORS || HWY_HAVE_SCALAR_BF16_OPERATORS
2433
+
2434
+ //------------------------------------------------------------------------------
2435
+ // Type conversions (after IsSpecialFloat)
2436
+
2437
+ HWY_API float F32FromF16Mem(const void* ptr) {
2438
+ float16_t f16;
2439
+ CopyBytes<2>(HWY_ASSUME_ALIGNED(ptr, 2), &f16);
2440
+ return F32FromF16(f16);
2441
+ }
2442
+
2443
+ HWY_API float F32FromBF16Mem(const void* ptr) {
2444
+ bfloat16_t bf;
2445
+ CopyBytes<2>(HWY_ASSUME_ALIGNED(ptr, 2), &bf);
2446
+ return F32FromBF16(bf);
2447
+ }
2448
+
2449
+ #if HWY_HAVE_SCALAR_F16_OPERATORS
2450
+ #define HWY_BF16_TO_F16_CONSTEXPR HWY_BF16_CONSTEXPR
2451
+ #else
2452
+ #define HWY_BF16_TO_F16_CONSTEXPR HWY_F16_CONSTEXPR
2453
+ #endif
2454
+
2455
+ // For casting from TFrom to TTo
2456
+ template <typename TTo, typename TFrom, HWY_IF_NOT_SPECIAL_FLOAT(TTo),
2457
+ HWY_IF_NOT_SPECIAL_FLOAT(TFrom), HWY_IF_NOT_SAME(TTo, TFrom)>
2458
+ HWY_API constexpr TTo ConvertScalarTo(const TFrom in) {
2459
+ return static_cast<TTo>(in);
2460
+ }
2461
+ template <typename TTo, typename TFrom, HWY_IF_F16(TTo),
2462
+ HWY_IF_NOT_SPECIAL_FLOAT(TFrom), HWY_IF_NOT_SAME(TFrom, double)>
2463
+ HWY_API constexpr TTo ConvertScalarTo(const TFrom in) {
2464
+ return F16FromF32(static_cast<float>(in));
2465
+ }
2466
+ template <typename TTo, HWY_IF_F16(TTo)>
2467
+ HWY_API HWY_BF16_TO_F16_CONSTEXPR TTo
2468
+ ConvertScalarTo(const hwy::bfloat16_t in) {
2469
+ return F16FromF32(F32FromBF16(in));
2470
+ }
2471
+ template <typename TTo, HWY_IF_F16(TTo)>
2472
+ HWY_API HWY_F16_CONSTEXPR TTo ConvertScalarTo(const double in) {
2473
+ return F16FromF64(in);
2474
+ }
2475
+ template <typename TTo, typename TFrom, HWY_IF_BF16(TTo),
2476
+ HWY_IF_NOT_SPECIAL_FLOAT(TFrom), HWY_IF_NOT_SAME(TFrom, double)>
2477
+ HWY_API HWY_BF16_CONSTEXPR TTo ConvertScalarTo(const TFrom in) {
2478
+ return BF16FromF32(static_cast<float>(in));
2479
+ }
2480
+ template <typename TTo, HWY_IF_BF16(TTo)>
2481
+ HWY_API HWY_BF16_TO_F16_CONSTEXPR TTo ConvertScalarTo(const hwy::float16_t in) {
2482
+ return BF16FromF32(F32FromF16(in));
2483
+ }
2484
+ template <typename TTo, HWY_IF_BF16(TTo)>
2485
+ HWY_API HWY_BF16_CONSTEXPR TTo ConvertScalarTo(const double in) {
2486
+ return BF16FromF64(in);
2487
+ }
2488
+ template <typename TTo, typename TFrom, HWY_IF_F16(TFrom),
2489
+ HWY_IF_NOT_SPECIAL_FLOAT(TTo)>
2490
+ HWY_API HWY_F16_CONSTEXPR TTo ConvertScalarTo(const TFrom in) {
2491
+ return static_cast<TTo>(F32FromF16(in));
2492
+ }
2493
+ template <typename TTo, typename TFrom, HWY_IF_BF16(TFrom),
2494
+ HWY_IF_NOT_SPECIAL_FLOAT(TTo)>
2495
+ HWY_API HWY_BF16_CONSTEXPR TTo ConvertScalarTo(TFrom in) {
2496
+ return static_cast<TTo>(F32FromBF16(in));
2497
+ }
2498
+ // Same: return unchanged
2499
+ template <typename TTo>
2500
+ HWY_API constexpr TTo ConvertScalarTo(TTo in) {
2501
+ return in;
2502
+ }
2503
+
1336
2504
  //------------------------------------------------------------------------------
1337
2505
  // Helper functions
1338
2506
 
@@ -1346,8 +2514,54 @@ constexpr inline size_t RoundUpTo(size_t what, size_t align) {
1346
2514
  return DivCeil(what, align) * align;
1347
2515
  }
1348
2516
 
2517
+ // Works for any `align`; if a power of two, compiler emits AND.
2518
+ constexpr inline size_t RoundDownTo(size_t what, size_t align) {
2519
+ return what - (what % align);
2520
+ }
2521
+
2522
+ namespace detail {
2523
+
2524
+ // T is unsigned or T is signed and (val >> shift_amt) is an arithmetic right
2525
+ // shift
2526
+ template <class T>
2527
+ static HWY_INLINE constexpr T ScalarShr(hwy::UnsignedTag /*type_tag*/, T val,
2528
+ int shift_amt) {
2529
+ return static_cast<T>(val >> shift_amt);
2530
+ }
2531
+
2532
+ // T is signed and (val >> shift_amt) is a non-arithmetic right shift
2533
+ template <class T>
2534
+ static HWY_INLINE constexpr T ScalarShr(hwy::SignedTag /*type_tag*/, T val,
2535
+ int shift_amt) {
2536
+ using TU = MakeUnsigned<MakeLaneTypeIfInteger<T>>;
2537
+ return static_cast<T>(
2538
+ (val < 0) ? static_cast<TU>(
2539
+ ~(static_cast<TU>(~static_cast<TU>(val)) >> shift_amt))
2540
+ : static_cast<TU>(static_cast<TU>(val) >> shift_amt));
2541
+ }
2542
+
2543
+ } // namespace detail
2544
+
2545
+ // If T is a signed integer type, ScalarShr is guaranteed to perform an
2546
+ // arithmetic right shift
2547
+
2548
+ // Otherwise, if T is an unsigned integer type, ScalarShr is guaranteed to
2549
+ // perform a logical right shift
2550
+ template <class T, HWY_IF_INTEGER(RemoveCvRef<T>)>
2551
+ HWY_API constexpr RemoveCvRef<T> ScalarShr(T val, int shift_amt) {
2552
+ using NonCvRefT = RemoveCvRef<T>;
2553
+ return detail::ScalarShr(
2554
+ hwy::SizeTag<((IsSigned<NonCvRefT>() &&
2555
+ (LimitsMin<NonCvRefT>() >> (sizeof(T) * 8 - 1)) !=
2556
+ static_cast<NonCvRefT>(-1))
2557
+ ? 0x100
2558
+ : 0)>(),
2559
+ static_cast<NonCvRefT>(val), shift_amt);
2560
+ }
2561
+
1349
2562
  // Undefined results for x == 0.
1350
2563
  HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
2564
+ HWY_DASSERT(x != 0);
1351
2565
  #if HWY_COMPILER_MSVC
1352
2566
  unsigned long index; // NOLINT
1353
2567
  _BitScanForward(&index, x);
@@ -1358,6 +2572,7 @@ HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
1358
2572
  }
1359
2573
 
1360
2574
  HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) {
2575
+ HWY_DASSERT(x != 0);
1361
2576
  #if HWY_COMPILER_MSVC
1362
2577
  #if HWY_ARCH_X86_64
1363
2578
  unsigned long index; // NOLINT
@@ -1383,6 +2598,7 @@ HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) {
1383
2598
 
1384
2599
  // Undefined results for x == 0.
1385
2600
  HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x) {
2601
+ HWY_DASSERT(x != 0);
1386
2602
  #if HWY_COMPILER_MSVC
1387
2603
  unsigned long index; // NOLINT
1388
2604
  _BitScanReverse(&index, x);
@@ -1393,6 +2609,7 @@ HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x) {
1393
2609
  }
1394
2610
 
1395
2611
  HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
2612
+ HWY_DASSERT(x != 0);
1396
2613
  #if HWY_COMPILER_MSVC
1397
2614
  #if HWY_ARCH_X86_64
1398
2615
  unsigned long index; // NOLINT
@@ -1416,26 +2633,48 @@ HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
1416
2633
  #endif // HWY_COMPILER_MSVC
1417
2634
  }
1418
2635
 
1419
- HWY_API size_t PopCount(uint64_t x) {
1420
- #if HWY_COMPILER_GCC // includes clang
1421
- return static_cast<size_t>(__builtin_popcountll(x));
1422
- // This instruction has a separate feature flag, but is often called from
1423
- // non-SIMD code, so we don't want to require dynamic dispatch. It was first
1424
- // supported by Intel in Nehalem (SSE4.2), but MSVC only predefines a macro
1425
- // for AVX, so check for that.
2636
+ template <class T, HWY_IF_INTEGER(RemoveCvRef<T>),
2637
+ HWY_IF_T_SIZE_ONE_OF(RemoveCvRef<T>, (1 << 1) | (1 << 2) | (1 << 4))>
2638
+ HWY_API size_t PopCount(T x) {
2639
+ uint32_t u32_x = static_cast<uint32_t>(
2640
+ static_cast<UnsignedFromSize<sizeof(RemoveCvRef<T>)>>(x));
2641
+
2642
+ #if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
2643
+ return static_cast<size_t>(__builtin_popcountl(u32_x));
2644
+ #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
2645
+ return static_cast<size_t>(_mm_popcnt_u32(u32_x));
2646
+ #else
2647
+ u32_x -= ((u32_x >> 1) & 0x55555555u);
2648
+ u32_x = (((u32_x >> 2) & 0x33333333u) + (u32_x & 0x33333333u));
2649
+ u32_x = (((u32_x >> 4) + u32_x) & 0x0F0F0F0Fu);
2650
+ u32_x += (u32_x >> 8);
2651
+ u32_x += (u32_x >> 16);
2652
+ return static_cast<size_t>(u32_x & 0x3Fu);
2653
+ #endif
2654
+ }
2655
+
2656
+ template <class T, HWY_IF_INTEGER(RemoveCvRef<T>),
2657
+ HWY_IF_T_SIZE(RemoveCvRef<T>, 8)>
2658
+ HWY_API size_t PopCount(T x) {
2659
+ uint64_t u64_x = static_cast<uint64_t>(
2660
+ static_cast<UnsignedFromSize<sizeof(RemoveCvRef<T>)>>(x));
2661
+
2662
+ #if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
2663
+ return static_cast<size_t>(__builtin_popcountll(u64_x));
1426
2664
  #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 && defined(__AVX__)
1427
- return _mm_popcnt_u64(x);
2665
+ return _mm_popcnt_u64(u64_x);
1428
2666
  #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
1429
- return _mm_popcnt_u32(static_cast<uint32_t>(x & 0xFFFFFFFFu)) +
1430
- _mm_popcnt_u32(static_cast<uint32_t>(x >> 32));
2667
+ return _mm_popcnt_u32(static_cast<uint32_t>(u64_x & 0xFFFFFFFFu)) +
2668
+ _mm_popcnt_u32(static_cast<uint32_t>(u64_x >> 32));
1431
2669
  #else
1432
- x -= ((x >> 1) & 0x5555555555555555ULL);
1433
- x = (((x >> 2) & 0x3333333333333333ULL) + (x & 0x3333333333333333ULL));
1434
- x = (((x >> 4) + x) & 0x0F0F0F0F0F0F0F0FULL);
1435
- x += (x >> 8);
1436
- x += (x >> 16);
1437
- x += (x >> 32);
1438
- return static_cast<size_t>(x & 0x7Fu);
2670
+ u64_x -= ((u64_x >> 1) & 0x5555555555555555ULL);
2671
+ u64_x = (((u64_x >> 2) & 0x3333333333333333ULL) +
2672
+ (u64_x & 0x3333333333333333ULL));
2673
+ u64_x = (((u64_x >> 4) + u64_x) & 0x0F0F0F0F0F0F0F0FULL);
2674
+ u64_x += (u64_x >> 8);
2675
+ u64_x += (u64_x >> 16);
2676
+ u64_x += (u64_x >> 32);
2677
+ return static_cast<size_t>(u64_x & 0x7Fu);
1439
2678
  #endif
1440
2679
  }
1441
2680
 
@@ -1456,21 +2695,32 @@ template <typename TI>
1456
2695
  : static_cast<size_t>(FloorLog2(static_cast<TI>(x - 1)) + 1);
1457
2696
  }
1458
2697
 
1459
- template <typename T>
1460
- HWY_INLINE constexpr T AddWithWraparound(hwy::FloatTag /*tag*/, T t, size_t n) {
1461
- return t + static_cast<T>(n);
2698
+ template <typename T, typename T2, HWY_IF_FLOAT(T), HWY_IF_NOT_SPECIAL_FLOAT(T)>
2699
+ HWY_INLINE constexpr T AddWithWraparound(T t, T2 increment) {
2700
+ return t + static_cast<T>(increment);
1462
2701
  }
1463
2702
 
1464
- template <typename T>
1465
- HWY_INLINE constexpr T AddWithWraparound(hwy::NonFloatTag /*tag*/, T t,
1466
- size_t n) {
2703
+ template <typename T, typename T2, HWY_IF_SPECIAL_FLOAT(T)>
2704
+ HWY_INLINE constexpr T AddWithWraparound(T t, T2 increment) {
2705
+ return ConvertScalarTo<T>(ConvertScalarTo<float>(t) +
2706
+ ConvertScalarTo<float>(increment));
2707
+ }
2708
+
2709
+ template <typename T, typename T2, HWY_IF_NOT_FLOAT(T)>
2710
+ HWY_INLINE constexpr T AddWithWraparound(T t, T2 n) {
1467
2711
  using TU = MakeUnsigned<T>;
1468
- return static_cast<T>(
1469
- static_cast<TU>(static_cast<TU>(t) + static_cast<TU>(n)) &
1470
- hwy::LimitsMax<TU>());
2712
+ // Sub-int types would promote to int, not unsigned, which would trigger
2713
+ // warnings, so first promote to the largest unsigned type. Due to
2714
+ // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87519, which affected GCC 8
2715
+ // until fixed in 9.3, we use built-in types rather than uint64_t.
2716
+ return static_cast<T>(static_cast<TU>(
2717
+ static_cast<unsigned long long>(static_cast<unsigned long long>(t) +
2718
+ static_cast<unsigned long long>(n)) &
2719
+ uint64_t{hwy::LimitsMax<TU>()}));
1471
2720
  }
1472
2721
 
1473
2722
  #if HWY_COMPILER_MSVC && HWY_ARCH_X86_64
2723
+ #pragma intrinsic(_mul128)
1474
2724
  #pragma intrinsic(_umul128)
1475
2725
  #endif
1476
2726
 
@@ -1494,7 +2744,179 @@ HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) {
1494
2744
  #endif
1495
2745
  }
1496
2746
 
2747
+ HWY_API int64_t Mul128(int64_t a, int64_t b, int64_t* HWY_RESTRICT upper) {
2748
+ #if defined(__SIZEOF_INT128__)
2749
+ __int128_t product = (__int128_t)a * (__int128_t)b;
2750
+ *upper = (int64_t)(product >> 64);
2751
+ return (int64_t)(product & 0xFFFFFFFFFFFFFFFFULL);
2752
+ #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
2753
+ return _mul128(a, b, upper);
2754
+ #else
2755
+ uint64_t unsigned_upper;
2756
+ const int64_t lower = static_cast<int64_t>(Mul128(
2757
+ static_cast<uint64_t>(a), static_cast<uint64_t>(b), &unsigned_upper));
2758
+ *upper = static_cast<int64_t>(
2759
+ unsigned_upper -
2760
+ (static_cast<uint64_t>(ScalarShr(a, 63)) & static_cast<uint64_t>(b)) -
2761
+ (static_cast<uint64_t>(ScalarShr(b, 63)) & static_cast<uint64_t>(a)));
2762
+ return lower;
2763
+ #endif
2764
+ }
2765
+
2766
+ // Precomputation for fast n / divisor and n % divisor, where n is a variable
2767
+ // and divisor is unchanging but unknown at compile-time.
2768
+ class Divisor {
2769
+ public:
2770
+ explicit Divisor(uint32_t divisor) : divisor_(divisor) {
2771
+ if (divisor <= 1) return;
2772
+
2773
+ const uint32_t len =
2774
+ static_cast<uint32_t>(31 - Num0BitsAboveMS1Bit_Nonzero32(divisor - 1));
2775
+ const uint64_t u_hi = (2ULL << len) - divisor;
2776
+ const uint32_t q = Truncate((u_hi << 32) / divisor);
2777
+
2778
+ mul_ = q + 1;
2779
+ shift1_ = 1;
2780
+ shift2_ = len;
2781
+ }
2782
+
2783
+ uint32_t GetDivisor() const { return divisor_; }
2784
+
2785
+ // Returns n / divisor_.
2786
+ uint32_t Divide(uint32_t n) const {
2787
+ const uint64_t mul = mul_;
2788
+ const uint32_t t = Truncate((mul * n) >> 32);
2789
+ return (t + ((n - t) >> shift1_)) >> shift2_;
2790
+ }
2791
+
2792
+ // Returns n % divisor_.
2793
+ uint32_t Remainder(uint32_t n) const { return n - (Divide(n) * divisor_); }
2794
+
2795
+ private:
2796
+ static uint32_t Truncate(uint64_t x) {
2797
+ return static_cast<uint32_t>(x & 0xFFFFFFFFu);
2798
+ }
2799
+
2800
+ uint32_t divisor_;
2801
+ uint32_t mul_ = 1;
2802
+ uint32_t shift1_ = 0;
2803
+ uint32_t shift2_ = 0;
2804
+ };
2805
+
2806
+ namespace detail {
2807
+
2808
+ template <typename T>
2809
+ static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T ScalarAbs(hwy::FloatTag /*tag*/,
2810
+ T val) {
2811
+ using TU = MakeUnsigned<T>;
2812
+ return BitCastScalar<T>(
2813
+ static_cast<TU>(BitCastScalar<TU>(val) & (~SignMask<T>())));
2814
+ }
2815
+
2816
+ template <typename T>
2817
+ static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T
2818
+ ScalarAbs(hwy::SpecialTag /*tag*/, T val) {
2819
+ return ScalarAbs(hwy::FloatTag(), val);
2820
+ }
2821
+
2822
+ template <typename T>
2823
+ static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T
2824
+ ScalarAbs(hwy::SignedTag /*tag*/, T val) {
2825
+ using TU = MakeUnsigned<T>;
2826
+ return (val < T{0}) ? static_cast<T>(TU{0} - static_cast<TU>(val)) : val;
2827
+ }
2828
+
2829
+ template <typename T>
2830
+ static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T
2831
+ ScalarAbs(hwy::UnsignedTag /*tag*/, T val) {
2832
+ return val;
2833
+ }
2834
+
2835
+ } // namespace detail
2836
+
2837
+ template <typename T>
2838
+ HWY_API HWY_BITCASTSCALAR_CONSTEXPR RemoveCvRef<T> ScalarAbs(T val) {
2839
+ using TVal = MakeLaneTypeIfInteger<
2840
+ detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>>;
2841
+ return detail::ScalarAbs(hwy::TypeTag<TVal>(), static_cast<TVal>(val));
2842
+ }
2843
+
2844
+ template <typename T>
2845
+ HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsNaN(T val) {
2846
+ using TF = detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>;
2847
+ using TU = MakeUnsigned<TF>;
2848
+ return (BitCastScalar<TU>(ScalarAbs(val)) > ExponentMask<TF>());
2849
+ }
2850
+
2851
+ template <typename T>
2852
+ HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsInf(T val) {
2853
+ using TF = detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>;
2854
+ using TU = MakeUnsigned<TF>;
2855
+ return static_cast<TU>(BitCastScalar<TU>(static_cast<TF>(val)) << 1) ==
2856
+ static_cast<TU>(MaxExponentTimes2<TF>());
2857
+ }
2858
+
2859
+ namespace detail {
2860
+
2861
+ template <typename T>
2862
+ static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsFinite(
2863
+ hwy::FloatTag /*tag*/, T val) {
2864
+ using TU = MakeUnsigned<T>;
2865
+ return (BitCastScalar<TU>(hwy::ScalarAbs(val)) < ExponentMask<T>());
2866
+ }
2867
+
2868
+ template <typename T>
2869
+ static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsFinite(
2870
+ hwy::NonFloatTag /*tag*/, T /*val*/) {
2871
+ // Integer values are always finite
2872
+ return true;
2873
+ }
2874
+
2875
+ } // namespace detail
2876
+
2877
+ template <typename T>
2878
+ HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsFinite(T val) {
2879
+ using TVal = MakeLaneTypeIfInteger<
2880
+ detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>>;
2881
+ return detail::ScalarIsFinite(hwy::IsFloatTag<TVal>(),
2882
+ static_cast<TVal>(val));
2883
+ }
2884
+
2885
+ template <typename T>
2886
+ HWY_API HWY_BITCASTSCALAR_CONSTEXPR RemoveCvRef<T> ScalarCopySign(T magn,
2887
+ T sign) {
2888
+ using TF = RemoveCvRef<detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>>;
2889
+ using TU = MakeUnsigned<TF>;
2890
+ return BitCastScalar<TF>(static_cast<TU>(
2891
+ (BitCastScalar<TU>(static_cast<TF>(magn)) & (~SignMask<TF>())) |
2892
+ (BitCastScalar<TU>(static_cast<TF>(sign)) & SignMask<TF>())));
2893
+ }
2894
+
2895
+ template <typename T>
2896
+ HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarSignBit(T val) {
2897
+ using TVal = MakeLaneTypeIfInteger<
2898
+ detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>>;
2899
+ using TU = MakeUnsigned<TVal>;
2900
+ return ((BitCastScalar<TU>(static_cast<TVal>(val)) & SignMask<TVal>()) != 0);
2901
+ }
2902
+
1497
2903
  // Prevents the compiler from eliding the computations that led to "output".
2904
+ #if HWY_ARCH_PPC && (HWY_COMPILER_GCC || HWY_COMPILER_CLANG) && \
2905
+ !defined(_SOFT_FLOAT)
2906
+ // Workaround to avoid test failures on PPC if compiled with Clang
2907
+ template <class T, HWY_IF_F32(T)>
2908
+ HWY_API void PreventElision(T&& output) {
2909
+ asm volatile("" : "+f"(output)::"memory");
2910
+ }
2911
+ template <class T, HWY_IF_F64(T)>
2912
+ HWY_API void PreventElision(T&& output) {
2913
+ asm volatile("" : "+d"(output)::"memory");
2914
+ }
2915
+ template <class T, HWY_IF_NOT_FLOAT3264(T)>
2916
+ HWY_API void PreventElision(T&& output) {
2917
+ asm volatile("" : "+r"(output)::"memory");
2918
+ }
2919
+ #else
1498
2920
  template <class T>
1499
2921
  HWY_API void PreventElision(T&& output) {
1500
2922
  #if HWY_COMPILER_MSVC
@@ -1502,8 +2924,8 @@ HWY_API void PreventElision(T&& output) {
1502
2924
  // RTL constraints). Self-assignment with #pragma optimize("off") might be
1503
2925
  // expected to prevent elision, but it does not with MSVC 2015. Type-punning
1504
2926
  // with volatile pointers generates inefficient code on MSVC 2017.
1505
- static std::atomic<RemoveRef<T>> dummy;
1506
- dummy.store(output, std::memory_order_relaxed);
2927
+ static std::atomic<RemoveCvRef<T>> sink;
2928
+ sink.store(output, std::memory_order_relaxed);
1507
2929
  #else
1508
2930
  // Works by indicating to the compiler that "output" is being read and
1509
2931
  // modified. The +r constraint avoids unnecessary writes to memory, but only
@@ -1511,6 +2933,7 @@ HWY_API void PreventElision(T&& output) {
1511
2933
  asm volatile("" : "+r"(output) : : "memory");
1512
2934
  #endif
1513
2935
  }
2936
+ #endif
1514
2937
 
1515
2938
  } // namespace hwy
1516
2939