@img/sharp-libvips-dev 1.0.0 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (230) hide show
  1. package/include/aom/aom_encoder.h +3 -3
  2. package/include/aom/aomcx.h +17 -8
  3. package/include/expat.h +21 -10
  4. package/include/expat_config.h +11 -5
  5. package/include/ffi.h +12 -25
  6. package/include/fontconfig/fontconfig.h +5 -3
  7. package/include/freetype2/freetype/config/ftoption.h +1 -1
  8. package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -7
  9. package/include/gio-unix-2.0/gio/gunixinputstream.h +0 -5
  10. package/include/gio-unix-2.0/gio/gunixoutputstream.h +0 -5
  11. package/include/glib-2.0/gio/gappinfo.h +0 -7
  12. package/include/glib-2.0/gio/gapplication.h +6 -0
  13. package/include/glib-2.0/gio/gapplicationcommandline.h +12 -1
  14. package/include/glib-2.0/gio/gasyncinitable.h +0 -7
  15. package/include/glib-2.0/gio/gasyncresult.h +0 -6
  16. package/include/glib-2.0/gio/gbufferedinputstream.h +0 -5
  17. package/include/glib-2.0/gio/gbufferedoutputstream.h +0 -5
  18. package/include/glib-2.0/gio/gbytesicon.h +0 -5
  19. package/include/glib-2.0/gio/gcancellable.h +0 -5
  20. package/include/glib-2.0/gio/gconverter.h +0 -7
  21. package/include/glib-2.0/gio/gconverterinputstream.h +0 -6
  22. package/include/glib-2.0/gio/gconverteroutputstream.h +0 -6
  23. package/include/glib-2.0/gio/gdatagrambased.h +0 -7
  24. package/include/glib-2.0/gio/gdatainputstream.h +0 -6
  25. package/include/glib-2.0/gio/gdataoutputstream.h +0 -6
  26. package/include/glib-2.0/gio/gdbusinterface.h +0 -8
  27. package/include/glib-2.0/gio/gdbusinterfaceskeleton.h +0 -8
  28. package/include/glib-2.0/gio/gdbusmessage.h +2 -1
  29. package/include/glib-2.0/gio/gdbusobjectmanagerclient.h +0 -8
  30. package/include/glib-2.0/gio/gdbusobjectmanagerserver.h +0 -8
  31. package/include/glib-2.0/gio/gdbusobjectproxy.h +0 -8
  32. package/include/glib-2.0/gio/gdbusobjectskeleton.h +0 -8
  33. package/include/glib-2.0/gio/gdbusproxy.h +0 -8
  34. package/include/glib-2.0/gio/gdebugcontroller.h +0 -8
  35. package/include/glib-2.0/gio/gdebugcontrollerdbus.h +0 -7
  36. package/include/glib-2.0/gio/gdtlsserverconnection.h +0 -8
  37. package/include/glib-2.0/gio/gemblem.h +0 -5
  38. package/include/glib-2.0/gio/gemblemedicon.h +0 -5
  39. package/include/glib-2.0/gio/gfile.h +0 -10
  40. package/include/glib-2.0/gio/gfileenumerator.h +0 -5
  41. package/include/glib-2.0/gio/gfileicon.h +0 -5
  42. package/include/glib-2.0/gio/gfileinfo.h +0 -5
  43. package/include/glib-2.0/gio/gfileinputstream.h +0 -8
  44. package/include/glib-2.0/gio/gfileiostream.h +0 -8
  45. package/include/glib-2.0/gio/gfilemonitor.h +0 -5
  46. package/include/glib-2.0/gio/gfilenamecompleter.h +0 -5
  47. package/include/glib-2.0/gio/gfileoutputstream.h +0 -8
  48. package/include/glib-2.0/gio/gfilterinputstream.h +0 -5
  49. package/include/glib-2.0/gio/gfilteroutputstream.h +0 -5
  50. package/include/glib-2.0/gio/gicon.h +0 -5
  51. package/include/glib-2.0/gio/ginitable.h +0 -7
  52. package/include/glib-2.0/gio/ginputstream.h +0 -5
  53. package/include/glib-2.0/gio/gio-autocleanups.h +4 -0
  54. package/include/glib-2.0/gio/gio-visibility.h +34 -0
  55. package/include/glib-2.0/gio/gioenums.h +6 -1
  56. package/include/glib-2.0/gio/giomodule.h +0 -5
  57. package/include/glib-2.0/gio/giostream.h +0 -5
  58. package/include/glib-2.0/gio/giotypes.h +5 -108
  59. package/include/glib-2.0/gio/gloadableicon.h +0 -6
  60. package/include/glib-2.0/gio/gmemoryinputstream.h +0 -5
  61. package/include/glib-2.0/gio/gmemoryoutputstream.h +0 -5
  62. package/include/glib-2.0/gio/gmountoperation.h +0 -6
  63. package/include/glib-2.0/gio/gnetworking.h +4 -0
  64. package/include/glib-2.0/gio/goutputstream.h +0 -9
  65. package/include/glib-2.0/gio/gpollableinputstream.h +0 -7
  66. package/include/glib-2.0/gio/gpollableoutputstream.h +0 -7
  67. package/include/glib-2.0/gio/gproxy.h +0 -7
  68. package/include/glib-2.0/gio/gproxyaddressenumerator.h +0 -8
  69. package/include/glib-2.0/gio/gseekable.h +0 -5
  70. package/include/glib-2.0/gio/gsettingsbackend.h +0 -5
  71. package/include/glib-2.0/gio/gsimpleactiongroup.h +0 -7
  72. package/include/glib-2.0/gio/gsimpleasyncresult.h +0 -5
  73. package/include/glib-2.0/gio/gsimpleproxyresolver.h +0 -5
  74. package/include/glib-2.0/gio/gsocket.h +13 -0
  75. package/include/glib-2.0/gio/gsocketaddressenumerator.h +0 -6
  76. package/include/glib-2.0/gio/gsocketconnectable.h +0 -5
  77. package/include/glib-2.0/gio/gtask.h +12 -0
  78. package/include/glib-2.0/gio/gthemedicon.h +0 -5
  79. package/include/glib-2.0/gio/gtlsserverconnection.h +0 -8
  80. package/include/glib-2.0/gio/gunixcredentialsmessage.h +0 -8
  81. package/include/glib-2.0/gio/gvfs.h +0 -5
  82. package/include/glib-2.0/gio/gvolume.h +2 -2
  83. package/include/glib-2.0/gio/gvolumemonitor.h +0 -5
  84. package/include/glib-2.0/girepository/gi-visibility.h +986 -0
  85. package/include/glib-2.0/girepository/giarginfo.h +100 -0
  86. package/include/glib-2.0/girepository/gibaseinfo.h +129 -0
  87. package/include/glib-2.0/girepository/gicallableinfo.h +119 -0
  88. package/include/glib-2.0/girepository/gicallbackinfo.h +60 -0
  89. package/include/glib-2.0/girepository/giconstantinfo.h +72 -0
  90. package/include/glib-2.0/girepository/gienuminfo.h +82 -0
  91. package/include/glib-2.0/girepository/gifieldinfo.h +84 -0
  92. package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
  93. package/include/glib-2.0/girepository/gifunctioninfo.h +117 -0
  94. package/include/glib-2.0/girepository/giinterfaceinfo.h +120 -0
  95. package/include/glib-2.0/girepository/giobjectinfo.h +230 -0
  96. package/include/glib-2.0/girepository/gipropertyinfo.h +77 -0
  97. package/include/glib-2.0/girepository/giregisteredtypeinfo.h +75 -0
  98. package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
  99. package/include/glib-2.0/girepository/girepository.h +247 -0
  100. package/include/glib-2.0/girepository/girffi.h +129 -0
  101. package/include/glib-2.0/girepository/gisignalinfo.h +72 -0
  102. package/include/glib-2.0/girepository/gistructinfo.h +102 -0
  103. package/include/glib-2.0/girepository/gitypeinfo.h +144 -0
  104. package/include/glib-2.0/girepository/gitypelib.h +61 -0
  105. package/include/glib-2.0/girepository/gitypes.h +421 -0
  106. package/include/glib-2.0/girepository/giunioninfo.h +105 -0
  107. package/include/glib-2.0/girepository/giunresolvedinfo.h +60 -0
  108. package/include/glib-2.0/girepository/givalueinfo.h +65 -0
  109. package/include/glib-2.0/girepository/givfuncinfo.h +88 -0
  110. package/include/glib-2.0/glib/deprecated/gcompletion.h +1 -1
  111. package/include/glib-2.0/glib/deprecated/grel.h +0 -23
  112. package/include/glib-2.0/glib/deprecated/gthread.h +10 -6
  113. package/include/glib-2.0/glib/gatomic.h +20 -20
  114. package/include/glib-2.0/glib/gbitlock.h +31 -0
  115. package/include/glib-2.0/glib/gbookmarkfile.h +39 -1
  116. package/include/glib-2.0/glib/gchecksum.h +0 -10
  117. package/include/glib-2.0/glib/gdate.h +0 -9
  118. package/include/glib-2.0/glib/gdatetime.h +33 -1
  119. package/include/glib-2.0/glib/gdir.h +5 -0
  120. package/include/glib-2.0/glib/ghmac.h +0 -9
  121. package/include/glib-2.0/glib/glib-autocleanups.h +4 -0
  122. package/include/glib-2.0/glib/glib-visibility.h +34 -0
  123. package/include/glib-2.0/glib/gmacros.h +1 -0
  124. package/include/glib-2.0/glib/gmessages.h +11 -0
  125. package/include/glib-2.0/glib/gpathbuf.h +0 -7
  126. package/include/glib-2.0/glib/gslice.h +2 -0
  127. package/include/glib-2.0/glib/gstdio.h +1 -1
  128. package/include/glib-2.0/glib/gstrfuncs.h +24 -18
  129. package/include/glib-2.0/glib/gstrvbuilder.h +4 -8
  130. package/include/glib-2.0/glib/gtestutils.h +5 -0
  131. package/include/glib-2.0/glib/gthread.h +216 -3
  132. package/include/glib-2.0/glib/gunicode.h +12 -2
  133. package/include/glib-2.0/glib/gvarianttype.h +1 -10
  134. package/include/glib-2.0/glib/gversionmacros.h +9 -0
  135. package/include/glib-2.0/glib/gwin32.h +4 -4
  136. package/include/glib-2.0/glib-unix.h +214 -0
  137. package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
  138. package/include/glib-2.0/gobject/gbinding.h +0 -8
  139. package/include/glib-2.0/gobject/gbindinggroup.h +0 -8
  140. package/include/glib-2.0/gobject/gclosure.h +1 -9
  141. package/include/glib-2.0/gobject/genums.h +6 -6
  142. package/include/glib-2.0/gobject/glib-types.h +44 -0
  143. package/include/glib-2.0/gobject/gobject-autocleanups.h +4 -0
  144. package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
  145. package/include/glib-2.0/gobject/gobject.h +1 -16
  146. package/include/glib-2.0/gobject/gparam.h +3 -12
  147. package/include/glib-2.0/gobject/gsignal.h +16 -6
  148. package/include/glib-2.0/gobject/gsignalgroup.h +0 -8
  149. package/include/glib-2.0/gobject/gtype.h +53 -20
  150. package/include/glib-2.0/gobject/gtypemodule.h +0 -7
  151. package/include/glib-2.0/gobject/gtypeplugin.h +0 -6
  152. package/include/glib-2.0/gobject/gvaluearray.h +0 -7
  153. package/include/glib-2.0/gobject/gvaluecollector.h +1 -11
  154. package/include/glib-2.0/gobject/gvaluetypes.h +2 -0
  155. package/include/hwy/aligned_allocator.h +171 -6
  156. package/include/hwy/base.h +1765 -543
  157. package/include/hwy/cache_control.h +24 -6
  158. package/include/hwy/detect_compiler_arch.h +23 -2
  159. package/include/hwy/detect_targets.h +56 -13
  160. package/include/hwy/foreach_target.h +24 -0
  161. package/include/hwy/highway.h +20 -3
  162. package/include/hwy/ops/arm_neon-inl.h +1086 -667
  163. package/include/hwy/ops/arm_sve-inl.h +1091 -235
  164. package/include/hwy/ops/emu128-inl.h +271 -196
  165. package/include/hwy/ops/generic_ops-inl.h +2270 -399
  166. package/include/hwy/ops/ppc_vsx-inl.h +1786 -563
  167. package/include/hwy/ops/rvv-inl.h +1043 -311
  168. package/include/hwy/ops/scalar-inl.h +189 -159
  169. package/include/hwy/ops/set_macros-inl.h +66 -6
  170. package/include/hwy/ops/shared-inl.h +175 -56
  171. package/include/hwy/ops/wasm_128-inl.h +153 -136
  172. package/include/hwy/ops/x86_128-inl.h +1647 -646
  173. package/include/hwy/ops/x86_256-inl.h +1003 -370
  174. package/include/hwy/ops/x86_512-inl.h +948 -353
  175. package/include/hwy/per_target.h +4 -0
  176. package/include/hwy/profiler.h +648 -0
  177. package/include/hwy/robust_statistics.h +2 -2
  178. package/include/hwy/targets.h +18 -11
  179. package/include/hwy/timer.h +11 -0
  180. package/include/lcms2.h +46 -7
  181. package/include/lcms2_plugin.h +4 -4
  182. package/include/libheif/heif_version.h +2 -2
  183. package/include/libpng16/png.h +32 -29
  184. package/include/libpng16/pngconf.h +2 -2
  185. package/include/libpng16/pnglibconf.h +7 -2
  186. package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
  187. package/include/libxml2/libxml/HTMLparser.h +23 -0
  188. package/include/libxml2/libxml/SAX.h +0 -2
  189. package/include/libxml2/libxml/SAX2.h +0 -2
  190. package/include/libxml2/libxml/c14n.h +0 -2
  191. package/include/libxml2/libxml/dict.h +1 -0
  192. package/include/libxml2/libxml/encoding.h +16 -14
  193. package/include/libxml2/libxml/entities.h +4 -0
  194. package/include/libxml2/libxml/globals.h +15 -503
  195. package/include/libxml2/libxml/hash.h +57 -61
  196. package/include/libxml2/libxml/nanoftp.h +2 -2
  197. package/include/libxml2/libxml/parser.h +137 -18
  198. package/include/libxml2/libxml/parserInternals.h +1 -0
  199. package/include/libxml2/libxml/relaxng.h +2 -1
  200. package/include/libxml2/libxml/schemasInternals.h +1 -0
  201. package/include/libxml2/libxml/schematron.h +1 -0
  202. package/include/libxml2/libxml/threads.h +4 -11
  203. package/include/libxml2/libxml/tree.h +68 -20
  204. package/include/libxml2/libxml/uri.h +2 -1
  205. package/include/libxml2/libxml/valid.h +2 -0
  206. package/include/libxml2/libxml/xmlIO.h +65 -13
  207. package/include/libxml2/libxml/xmlerror.h +37 -8
  208. package/include/libxml2/libxml/xmlmemory.h +37 -40
  209. package/include/libxml2/libxml/xmlreader.h +6 -0
  210. package/include/libxml2/libxml/xmlregexp.h +2 -9
  211. package/include/libxml2/libxml/xmlsave.h +9 -0
  212. package/include/libxml2/libxml/xmlschemas.h +3 -0
  213. package/include/libxml2/libxml/xmlversion.h +28 -43
  214. package/include/libxml2/libxml/xpath.h +1 -1
  215. package/include/libxml2/libxml/xpathInternals.h +2 -1
  216. package/include/libxml2/libxml/xpointer.h +5 -4
  217. package/include/pango-1.0/pango/pango-features.h +3 -3
  218. package/include/pango-1.0/pango/pango-fontmap.h +7 -0
  219. package/include/pixman-1/pixman-version.h +3 -3
  220. package/include/pixman-1/pixman.h +9 -2
  221. package/include/png.h +32 -29
  222. package/include/pngconf.h +2 -2
  223. package/include/pnglibconf.h +7 -2
  224. package/include/vips/connection.h +9 -3
  225. package/include/vips/util.h +0 -9
  226. package/include/vips/version.h +4 -4
  227. package/include/zconf.h +3 -0
  228. package/include/zlib.h +3 -3
  229. package/package.json +1 -1
  230. package/versions.json +15 -15
@@ -22,16 +22,15 @@
22
22
  #include <stddef.h>
23
23
  #include <stdint.h>
24
24
 
25
- // Wrapping this into a HWY_HAS_INCLUDE causes clang-format to fail.
26
- #if __cplusplus >= 202100L && defined(__has_include)
27
- #if __has_include(<stdfloat>)
28
- #include <stdfloat> // std::float16_t
29
- #endif
30
- #endif
31
-
32
25
  #include "hwy/detect_compiler_arch.h"
33
26
  #include "hwy/highway_export.h"
34
27
 
28
+ #if HWY_COMPILER_MSVC && defined(_MSVC_LANG) && _MSVC_LANG > __cplusplus
29
+ #define HWY_CXX_LANG _MSVC_LANG
30
+ #else
31
+ #define HWY_CXX_LANG __cplusplus
32
+ #endif
33
+
35
34
  // "IWYU pragma: keep" does not work for these includes, so hide from the IDE.
36
35
  #if !HWY_IDE
37
36
 
@@ -48,6 +47,15 @@
48
47
 
49
48
  #endif // !HWY_IDE
50
49
 
50
+ #if !defined(HWY_NO_LIBCXX) && HWY_CXX_LANG > 201703L && \
51
+ __cpp_impl_three_way_comparison >= 201907L && defined(__has_include) && \
52
+ !defined(HWY_DISABLE_CXX20_THREE_WAY_COMPARE)
53
+ #if __has_include(<compare>)
54
+ #include <compare>
55
+ #define HWY_HAVE_CXX20_THREE_WAY_COMPARE 1
56
+ #endif
57
+ #endif
58
+
51
59
  // IWYU pragma: end_exports
52
60
 
53
61
  #if HWY_COMPILER_MSVC
@@ -131,6 +139,10 @@ namespace hwy {
131
139
  #define HWY_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */
132
140
  #endif
133
141
 
142
+ // Special case to increases required alignment
143
+ #define HWY_RCAST_ALIGNED(type, ptr) \
144
+ reinterpret_cast<type>(HWY_ASSUME_ALIGNED((ptr), alignof(type)))
145
+
134
146
  // Clang and GCC require attributes on each function into which SIMD intrinsics
135
147
  // are inlined. Support both per-function annotation (HWY_ATTR) for lambdas and
136
148
  // automatic annotation via pragmas.
@@ -274,6 +286,16 @@ HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
274
286
  } while (0)
275
287
  #endif
276
288
 
289
+ #if __cpp_constexpr >= 201304L
290
+ #define HWY_CXX14_CONSTEXPR constexpr
291
+ #else
292
+ #define HWY_CXX14_CONSTEXPR
293
+ #endif
294
+
295
+ #ifndef HWY_HAVE_CXX20_THREE_WAY_COMPARE
296
+ #define HWY_HAVE_CXX20_THREE_WAY_COMPARE 0
297
+ #endif
298
+
277
299
  //------------------------------------------------------------------------------
278
300
  // CopyBytes / ZeroBytes
279
301
 
@@ -288,8 +310,7 @@ HWY_API void CopyBytes(const From* from, To* to) {
288
310
  #if HWY_COMPILER_MSVC
289
311
  memcpy(to, from, kBytes);
290
312
  #else
291
- __builtin_memcpy(static_cast<void*>(to), static_cast<const void*>(from),
292
- kBytes);
313
+ __builtin_memcpy(to, from, kBytes);
293
314
  #endif
294
315
  }
295
316
 
@@ -357,349 +378,11 @@ static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
357
378
  //------------------------------------------------------------------------------
358
379
  // Lane types
359
380
 
360
- #pragma pack(push, 1)
361
-
362
- // float16_t load/store/conversion intrinsics are always supported on Armv8 and
363
- // VFPv4 (except with MSVC). On Armv7 Clang requires __ARM_FP & 2; GCC requires
364
- // -mfp16-format=ieee.
365
- #if (HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC) || \
366
- (HWY_COMPILER_CLANG && defined(__ARM_FP) && (__ARM_FP & 2)) || \
367
- (HWY_COMPILER_GCC_ACTUAL && defined(__ARM_FP16_FORMAT_IEEE))
368
- #define HWY_NEON_HAVE_FLOAT16C 1
369
- #else
370
- #define HWY_NEON_HAVE_FLOAT16C 0
371
- #endif
372
-
373
- // C11 extension ISO/IEC TS 18661-3:2015 but not supported on all targets.
374
- // Required if HWY_HAVE_FLOAT16, i.e. RVV with zvfh or AVX3_SPR (with
375
- // sufficiently new compiler supporting avx512fp16). Do not use on clang-cl,
376
- // which is missing __extendhfsf2.
377
- #if ((HWY_ARCH_RVV && defined(__riscv_zvfh) && HWY_COMPILER_CLANG) || \
378
- (HWY_ARCH_X86 && defined(__SSE2__) && \
379
- ((HWY_COMPILER_CLANG >= 1600 && !HWY_COMPILER_CLANGCL) || \
380
- HWY_COMPILER_GCC_ACTUAL >= 1200)))
381
- #define HWY_HAVE_C11_FLOAT16 1
382
- #else
383
- #define HWY_HAVE_C11_FLOAT16 0
384
- #endif
385
-
386
- // If 1, both __bf16 and a limited set of *_bf16 SVE intrinsics are available:
387
- // create/get/set/dup, ld/st, sel, rev, trn, uzp, zip.
388
- #if HWY_ARCH_ARM_A64 && defined(__ARM_FEATURE_SVE_BF16)
389
- #define HWY_SVE_HAVE_BFLOAT16 1
390
- #else
391
- #define HWY_SVE_HAVE_BFLOAT16 0
392
- #endif
393
-
394
- // Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
395
- // by concatenating base type and bits. We use a wrapper class instead of a
396
- // typedef to the native type to ensure that the same symbols, e.g. for VQSort,
397
- // are generated regardless of F16 support; see #1684.
398
- struct float16_t {
399
- #if HWY_NEON_HAVE_FLOAT16C // ACLE's __fp16
400
- using Raw = __fp16;
401
- #elif HWY_HAVE_C11_FLOAT16 // C11 _Float16
402
- using Raw = _Float16;
403
- #elif __cplusplus > 202002L && defined(__STDCPP_FLOAT16_T__) // C++23
404
- using Raw = std::float16_t;
405
- #else
406
- #define HWY_EMULATE_FLOAT16
407
- using Raw = uint16_t;
408
- Raw bits;
409
- #endif // float16_t
410
-
411
- // When backed by a native type, ensure the wrapper behaves like the native
412
- // type by forwarding all operators. Unfortunately it seems difficult to reuse
413
- // this code in a base class, so we repeat it in bfloat16_t.
414
- #ifndef HWY_EMULATE_FLOAT16
415
- Raw raw;
416
-
417
- float16_t() noexcept = default;
418
- template <typename T>
419
- constexpr float16_t(T arg) noexcept : raw(static_cast<Raw>(arg)) {}
420
- float16_t& operator=(Raw arg) noexcept {
421
- raw = arg;
422
- return *this;
423
- }
424
- constexpr float16_t(const float16_t&) noexcept = default;
425
- float16_t& operator=(const float16_t&) noexcept = default;
426
- constexpr operator Raw() const noexcept { return raw; }
427
-
428
- template <typename T>
429
- float16_t& operator+=(T rhs) noexcept {
430
- raw = static_cast<Raw>(raw + rhs);
431
- return *this;
432
- }
433
-
434
- template <typename T>
435
- float16_t& operator-=(T rhs) noexcept {
436
- raw = static_cast<Raw>(raw - rhs);
437
- return *this;
438
- }
439
-
440
- template <typename T>
441
- float16_t& operator*=(T rhs) noexcept {
442
- raw = static_cast<Raw>(raw * rhs);
443
- return *this;
444
- }
445
-
446
- template <typename T>
447
- float16_t& operator/=(T rhs) noexcept {
448
- raw = static_cast<Raw>(raw / rhs);
449
- return *this;
450
- }
451
-
452
- float16_t operator--() noexcept {
453
- raw = static_cast<Raw>(raw - Raw{1});
454
- return *this;
455
- }
456
-
457
- float16_t operator--(int) noexcept {
458
- raw = static_cast<Raw>(raw - Raw{1});
459
- return *this;
460
- }
461
-
462
- float16_t operator++() noexcept {
463
- raw = static_cast<Raw>(raw + Raw{1});
464
- return *this;
465
- }
466
-
467
- float16_t operator++(int) noexcept {
468
- raw = static_cast<Raw>(raw + Raw{1});
469
- return *this;
470
- }
471
-
472
- constexpr float16_t operator-() const noexcept {
473
- return float16_t(static_cast<Raw>(-raw));
474
- }
475
- constexpr float16_t operator+() const noexcept { return *this; }
476
- #endif // HWY_EMULATE_FLOAT16
477
- };
478
-
479
- #ifndef HWY_EMULATE_FLOAT16
480
- constexpr inline bool operator==(float16_t lhs, float16_t rhs) noexcept {
481
- return lhs.raw == rhs.raw;
482
- }
483
- constexpr inline bool operator!=(float16_t lhs, float16_t rhs) noexcept {
484
- return lhs.raw != rhs.raw;
485
- }
486
- constexpr inline bool operator<(float16_t lhs, float16_t rhs) noexcept {
487
- return lhs.raw < rhs.raw;
488
- }
489
- constexpr inline bool operator<=(float16_t lhs, float16_t rhs) noexcept {
490
- return lhs.raw <= rhs.raw;
491
- }
492
- constexpr inline bool operator>(float16_t lhs, float16_t rhs) noexcept {
493
- return lhs.raw > rhs.raw;
494
- }
495
- constexpr inline bool operator>=(float16_t lhs, float16_t rhs) noexcept {
496
- return lhs.raw >= rhs.raw;
497
- }
498
- #endif // HWY_EMULATE_FLOAT16
499
-
500
- struct bfloat16_t {
501
- #if HWY_SVE_HAVE_BFLOAT16
502
- using Raw = __bf16;
503
- #elif __cplusplus >= 202100L && defined(__STDCPP_BFLOAT16_T__) // C++23
504
- using Raw = std::bfloat16_t;
505
- #else
506
- #define HWY_EMULATE_BFLOAT16
507
- using Raw = uint16_t;
508
- Raw bits;
509
- #endif
510
-
511
- #ifndef HWY_EMULATE_BFLOAT16
512
- Raw raw;
513
-
514
- bfloat16_t() noexcept = default;
515
- template <typename T>
516
- constexpr bfloat16_t(T arg) noexcept : raw(static_cast<Raw>(arg)) {}
517
- bfloat16_t& operator=(Raw arg) noexcept {
518
- raw = arg;
519
- return *this;
520
- }
521
- constexpr bfloat16_t(const bfloat16_t&) noexcept = default;
522
- bfloat16_t& operator=(const bfloat16_t&) noexcept = default;
523
- constexpr operator Raw() const noexcept { return raw; }
524
-
525
- template <typename T>
526
- bfloat16_t& operator+=(T rhs) noexcept {
527
- raw = static_cast<Raw>(raw + rhs);
528
- return *this;
529
- }
530
-
531
- template <typename T>
532
- bfloat16_t& operator-=(T rhs) noexcept {
533
- raw = static_cast<Raw>(raw - rhs);
534
- return *this;
535
- }
536
-
537
- template <typename T>
538
- bfloat16_t& operator*=(T rhs) noexcept {
539
- raw = static_cast<Raw>(raw * rhs);
540
- return *this;
541
- }
542
-
543
- template <typename T>
544
- bfloat16_t& operator/=(T rhs) noexcept {
545
- raw = static_cast<Raw>(raw / rhs);
546
- return *this;
547
- }
548
-
549
- bfloat16_t operator--() noexcept {
550
- raw = static_cast<Raw>(raw - Raw{1});
551
- return *this;
552
- }
553
-
554
- bfloat16_t operator--(int) noexcept {
555
- raw = static_cast<Raw>(raw - Raw{1});
556
- return *this;
557
- }
558
-
559
- bfloat16_t operator++() noexcept {
560
- raw = static_cast<Raw>(raw + Raw{1});
561
- return *this;
562
- }
563
-
564
- bfloat16_t operator++(int) noexcept {
565
- raw = static_cast<Raw>(raw + Raw{1});
566
- return *this;
567
- }
568
-
569
- constexpr bfloat16_t operator-() const noexcept {
570
- return bfloat16_t(static_cast<Raw>(-raw));
571
- }
572
- constexpr bfloat16_t operator+() const noexcept { return *this; }
573
- #endif // HWY_EMULATE_BFLOAT16
574
- };
575
-
576
- #ifndef HWY_EMULATE_BFLOAT16
577
- constexpr inline bool operator==(bfloat16_t lhs, bfloat16_t rhs) noexcept {
578
- return lhs.raw == rhs.raw;
579
- }
580
- constexpr inline bool operator!=(bfloat16_t lhs, bfloat16_t rhs) noexcept {
581
- return lhs.raw != rhs.raw;
582
- }
583
- constexpr inline bool operator<(bfloat16_t lhs, bfloat16_t rhs) noexcept {
584
- return lhs.raw < rhs.raw;
585
- }
586
- constexpr inline bool operator<=(bfloat16_t lhs, bfloat16_t rhs) noexcept {
587
- return lhs.raw <= rhs.raw;
588
- }
589
- constexpr inline bool operator>(bfloat16_t lhs, bfloat16_t rhs) noexcept {
590
- return lhs.raw > rhs.raw;
591
- }
592
- constexpr inline bool operator>=(bfloat16_t lhs, bfloat16_t rhs) noexcept {
593
- return lhs.raw >= rhs.raw;
594
- }
595
- #endif // HWY_EMULATE_BFLOAT16
596
-
597
- #pragma pack(pop)
598
-
599
- HWY_API float F32FromF16(float16_t f16) {
600
- #ifdef HWY_EMULATE_FLOAT16
601
- uint16_t bits16;
602
- CopySameSize(&f16, &bits16);
603
- const uint32_t sign = static_cast<uint32_t>(bits16 >> 15);
604
- const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
605
- const uint32_t mantissa = bits16 & 0x3FF;
606
-
607
- // Subnormal or zero
608
- if (biased_exp == 0) {
609
- const float subnormal =
610
- (1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024));
611
- return sign ? -subnormal : subnormal;
612
- }
613
-
614
- // Normalized: convert the representation directly (faster than ldexp/tables).
615
- const uint32_t biased_exp32 = biased_exp + (127 - 15);
616
- const uint32_t mantissa32 = mantissa << (23 - 10);
617
- const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
618
-
619
- float result;
620
- CopySameSize(&bits32, &result);
621
- return result;
622
- #else
623
- return static_cast<float>(f16);
624
- #endif
625
- }
626
-
627
- HWY_API float16_t F16FromF32(float f32) {
628
- #ifdef HWY_EMULATE_FLOAT16
629
- uint32_t bits32;
630
- CopySameSize(&f32, &bits32);
631
- const uint32_t sign = bits32 >> 31;
632
- const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
633
- const uint32_t mantissa32 = bits32 & 0x7FFFFF;
634
-
635
- const int32_t exp = HWY_MIN(static_cast<int32_t>(biased_exp32) - 127, 15);
636
-
637
- // Tiny or zero => zero.
638
- float16_t out;
639
- if (exp < -24) {
640
- // restore original sign
641
- const uint16_t bits = static_cast<uint16_t>(sign << 15);
642
- CopySameSize(&bits, &out);
643
- return out;
644
- }
645
-
646
- uint32_t biased_exp16, mantissa16;
647
-
648
- // exp = [-24, -15] => subnormal
649
- if (exp < -14) {
650
- biased_exp16 = 0;
651
- const uint32_t sub_exp = static_cast<uint32_t>(-14 - exp);
652
- HWY_DASSERT(1 <= sub_exp && sub_exp < 11);
653
- mantissa16 = static_cast<uint32_t>((1u << (10 - sub_exp)) +
654
- (mantissa32 >> (13 + sub_exp)));
655
- } else {
656
- // exp = [-14, 15]
657
- biased_exp16 = static_cast<uint32_t>(exp + 15);
658
- HWY_DASSERT(1 <= biased_exp16 && biased_exp16 < 31);
659
- mantissa16 = mantissa32 >> 13;
660
- }
661
-
662
- HWY_DASSERT(mantissa16 < 1024);
663
- const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
664
- HWY_DASSERT(bits16 < 0x10000);
665
- const uint16_t narrowed = static_cast<uint16_t>(bits16); // big-endian safe
666
- CopySameSize(&narrowed, &out);
667
- return out;
668
- #else
669
- return float16_t(static_cast<float16_t::Raw>(f32));
670
- #endif
671
- }
672
-
673
- HWY_API float F32FromBF16(bfloat16_t bf) {
674
- uint16_t bits16;
675
- CopyBytes<2>(&bf, &bits16);
676
- uint32_t bits = bits16;
677
- bits <<= 16;
678
- float f;
679
- CopySameSize(&bits, &f);
680
- return f;
681
- }
682
-
683
- HWY_API float F32FromF16Mem(const void* ptr) {
684
- float16_t f16;
685
- CopyBytes<2>(ptr, &f16);
686
- return F32FromF16(f16);
687
- }
688
-
689
- HWY_API float F32FromBF16Mem(const void* ptr) {
690
- bfloat16_t bf;
691
- CopyBytes<2>(ptr, &bf);
692
- return F32FromBF16(bf);
693
- }
694
-
695
- HWY_API bfloat16_t BF16FromF32(float f) {
696
- uint32_t bits;
697
- CopySameSize(&f, &bits);
698
- const uint16_t bits16 = static_cast<uint16_t>(bits >> 16);
699
- bfloat16_t bf;
700
- CopySameSize(&bits16, &bf);
701
- return bf;
702
- }
381
+ // hwy::float16_t and hwy::bfloat16_t are forward declared here to allow
382
+ // BitCastScalar to be implemented before the implementations of the
383
+ // hwy::float16_t and hwy::bfloat16_t types
384
+ struct float16_t;
385
+ struct bfloat16_t;
703
386
 
704
387
  using float32_t = float;
705
388
  using float64_t = double;
@@ -729,24 +412,6 @@ struct alignas(8) K32V32 {
729
412
 
730
413
  #pragma pack(pop)
731
414
 
732
- #ifdef HWY_EMULATE_FLOAT16
733
-
734
- static inline HWY_MAYBE_UNUSED bool operator<(const float16_t& a,
735
- const float16_t& b) {
736
- return F32FromF16(a) < F32FromF16(b);
737
- }
738
- // Required for std::greater.
739
- static inline HWY_MAYBE_UNUSED bool operator>(const float16_t& a,
740
- const float16_t& b) {
741
- return F32FromF16(a) > F32FromF16(b);
742
- }
743
- static inline HWY_MAYBE_UNUSED bool operator==(const float16_t& a,
744
- const float16_t& b) {
745
- return F32FromF16(a) == F32FromF16(b);
746
- }
747
-
748
- #endif // HWY_EMULATE_FLOAT16
749
-
750
415
  static inline HWY_MAYBE_UNUSED bool operator<(const uint128_t& a,
751
416
  const uint128_t& b) {
752
417
  return (a.hi == b.hi) ? a.lo < b.lo : a.hi < b.hi;
@@ -817,106 +482,1378 @@ HWY_API constexpr bool IsSame() {
817
482
  return IsSameT<T, U>::value;
818
483
  }
819
484
 
485
+ // Returns whether T matches either of U1 or U2
486
+ template <typename T, typename U1, typename U2>
487
+ HWY_API constexpr bool IsSameEither() {
488
+ return IsSameT<T, U1>::value || IsSameT<T, U2>::value;
489
+ }
490
+
820
491
  template <bool Condition, typename Then, typename Else>
821
492
  struct IfT {
822
493
  using type = Then;
823
494
  };
824
495
 
825
- template <class Then, class Else>
826
- struct IfT<false, Then, Else> {
827
- using type = Else;
828
- };
496
+ template <class Then, class Else>
497
+ struct IfT<false, Then, Else> {
498
+ using type = Else;
499
+ };
500
+
501
+ template <bool Condition, typename Then, typename Else>
502
+ using If = typename IfT<Condition, Then, Else>::type;
503
+
504
+ template <typename T>
505
+ struct IsConstT {
506
+ enum { value = 0 };
507
+ };
508
+
509
+ template <typename T>
510
+ struct IsConstT<const T> {
511
+ enum { value = 1 };
512
+ };
513
+
514
+ template <typename T>
515
+ HWY_API constexpr bool IsConst() {
516
+ return IsConstT<T>::value;
517
+ }
518
+
519
+ template <class T>
520
+ struct RemoveConstT {
521
+ using type = T;
522
+ };
523
+ template <class T>
524
+ struct RemoveConstT<const T> {
525
+ using type = T;
526
+ };
527
+
528
+ template <class T>
529
+ using RemoveConst = typename RemoveConstT<T>::type;
530
+
531
+ template <class T>
532
+ struct RemoveVolatileT {
533
+ using type = T;
534
+ };
535
+ template <class T>
536
+ struct RemoveVolatileT<volatile T> {
537
+ using type = T;
538
+ };
539
+
540
+ template <class T>
541
+ using RemoveVolatile = typename RemoveVolatileT<T>::type;
542
+
543
+ template <class T>
544
+ struct RemoveRefT {
545
+ using type = T;
546
+ };
547
+ template <class T>
548
+ struct RemoveRefT<T&> {
549
+ using type = T;
550
+ };
551
+ template <class T>
552
+ struct RemoveRefT<T&&> {
553
+ using type = T;
554
+ };
555
+
556
+ template <class T>
557
+ using RemoveRef = typename RemoveRefT<T>::type;
558
+
559
+ template <class T>
560
+ using RemoveCvRef = RemoveConst<RemoveVolatile<RemoveRef<T>>>;
561
+
562
+ // Insert into template/function arguments to enable this overload only for
563
+ // vectors of exactly, at most (LE), or more than (GT) this many bytes.
564
+ //
565
+ // As an example, checking for a total size of 16 bytes will match both
566
+ // Simd<uint8_t, 16, 0> and Simd<uint8_t, 8, 1>.
567
+ #define HWY_IF_V_SIZE(T, kN, bytes) \
568
+ hwy::EnableIf<kN * sizeof(T) == bytes>* = nullptr
569
+ #define HWY_IF_V_SIZE_LE(T, kN, bytes) \
570
+ hwy::EnableIf<kN * sizeof(T) <= bytes>* = nullptr
571
+ #define HWY_IF_V_SIZE_GT(T, kN, bytes) \
572
+ hwy::EnableIf<(kN * sizeof(T) > bytes)>* = nullptr
573
+
574
+ #define HWY_IF_LANES(kN, lanes) hwy::EnableIf<(kN == lanes)>* = nullptr
575
+ #define HWY_IF_LANES_LE(kN, lanes) hwy::EnableIf<(kN <= lanes)>* = nullptr
576
+ #define HWY_IF_LANES_GT(kN, lanes) hwy::EnableIf<(kN > lanes)>* = nullptr
577
+
578
+ #define HWY_IF_UNSIGNED(T) hwy::EnableIf<!hwy::IsSigned<T>()>* = nullptr
579
+ #define HWY_IF_SIGNED(T) \
580
+ hwy::EnableIf<hwy::IsSigned<T>() && !hwy::IsFloat<T>() && \
581
+ !hwy::IsSpecialFloat<T>()>* = nullptr
582
+ #define HWY_IF_FLOAT(T) hwy::EnableIf<hwy::IsFloat<T>()>* = nullptr
583
+ #define HWY_IF_NOT_FLOAT(T) hwy::EnableIf<!hwy::IsFloat<T>()>* = nullptr
584
+ #define HWY_IF_FLOAT3264(T) hwy::EnableIf<hwy::IsFloat3264<T>()>* = nullptr
585
+ #define HWY_IF_NOT_FLOAT3264(T) hwy::EnableIf<!hwy::IsFloat3264<T>()>* = nullptr
586
+ #define HWY_IF_SPECIAL_FLOAT(T) \
587
+ hwy::EnableIf<hwy::IsSpecialFloat<T>()>* = nullptr
588
+ #define HWY_IF_NOT_SPECIAL_FLOAT(T) \
589
+ hwy::EnableIf<!hwy::IsSpecialFloat<T>()>* = nullptr
590
+ #define HWY_IF_FLOAT_OR_SPECIAL(T) \
591
+ hwy::EnableIf<hwy::IsFloat<T>() || hwy::IsSpecialFloat<T>()>* = nullptr
592
+ #define HWY_IF_NOT_FLOAT_NOR_SPECIAL(T) \
593
+ hwy::EnableIf<!hwy::IsFloat<T>() && !hwy::IsSpecialFloat<T>()>* = nullptr
594
+ #define HWY_IF_INTEGER(T) hwy::EnableIf<hwy::IsInteger<T>()>* = nullptr
595
+
596
+ #define HWY_IF_T_SIZE(T, bytes) hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
597
+ #define HWY_IF_NOT_T_SIZE(T, bytes) \
598
+ hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
599
+ // bit_array = 0x102 means 1 or 8 bytes. There is no NONE_OF because it sounds
600
+ // too similar. If you want the opposite of this (2 or 4 bytes), ask for those
601
+ // bits explicitly (0x14) instead of attempting to 'negate' 0x102.
602
+ #define HWY_IF_T_SIZE_ONE_OF(T, bit_array) \
603
+ hwy::EnableIf<((size_t{1} << sizeof(T)) & (bit_array)) != 0>* = nullptr
604
+ #define HWY_IF_T_SIZE_LE(T, bytes) \
605
+ hwy::EnableIf<(sizeof(T) <= (bytes))>* = nullptr
606
+ #define HWY_IF_T_SIZE_GT(T, bytes) \
607
+ hwy::EnableIf<(sizeof(T) > (bytes))>* = nullptr
608
+
609
+ #define HWY_IF_SAME(T, expected) \
610
+ hwy::EnableIf<hwy::IsSame<hwy::RemoveCvRef<T>, expected>()>* = nullptr
611
+ #define HWY_IF_NOT_SAME(T, expected) \
612
+ hwy::EnableIf<!hwy::IsSame<hwy::RemoveCvRef<T>, expected>()>* = nullptr
613
+
614
+ // One of two expected types
615
+ #define HWY_IF_SAME2(T, expected1, expected2) \
616
+ hwy::EnableIf< \
617
+ hwy::IsSameEither<hwy::RemoveCvRef<T>, expected1, expected2>()>* = \
618
+ nullptr
619
+
620
+ #define HWY_IF_U8(T) HWY_IF_SAME(T, uint8_t)
621
+ #define HWY_IF_U16(T) HWY_IF_SAME(T, uint16_t)
622
+ #define HWY_IF_U32(T) HWY_IF_SAME(T, uint32_t)
623
+ #define HWY_IF_U64(T) HWY_IF_SAME(T, uint64_t)
624
+
625
+ #define HWY_IF_I8(T) HWY_IF_SAME(T, int8_t)
626
+ #define HWY_IF_I16(T) HWY_IF_SAME(T, int16_t)
627
+ #define HWY_IF_I32(T) HWY_IF_SAME(T, int32_t)
628
+ #define HWY_IF_I64(T) HWY_IF_SAME(T, int64_t)
629
+
630
+ #define HWY_IF_BF16(T) HWY_IF_SAME(T, hwy::bfloat16_t)
631
+ #define HWY_IF_NOT_BF16(T) HWY_IF_NOT_SAME(T, hwy::bfloat16_t)
632
+
633
+ #define HWY_IF_F16(T) HWY_IF_SAME(T, hwy::float16_t)
634
+ #define HWY_IF_NOT_F16(T) HWY_IF_NOT_SAME(T, hwy::float16_t)
635
+
636
+ #define HWY_IF_F32(T) HWY_IF_SAME(T, float)
637
+ #define HWY_IF_F64(T) HWY_IF_SAME(T, double)
638
+
639
+ // Use instead of HWY_IF_T_SIZE to avoid ambiguity with float16_t/float/double
640
+ // overloads.
641
+ #define HWY_IF_UI8(T) HWY_IF_SAME2(T, uint8_t, int8_t)
642
+ #define HWY_IF_UI16(T) HWY_IF_SAME2(T, uint16_t, int16_t)
643
+ #define HWY_IF_UI32(T) HWY_IF_SAME2(T, uint32_t, int32_t)
644
+ #define HWY_IF_UI64(T) HWY_IF_SAME2(T, uint64_t, int64_t)
645
+
646
+ #define HWY_IF_LANES_PER_BLOCK(T, N, LANES) \
647
+ hwy::EnableIf<HWY_MIN(sizeof(T) * N, 16) / sizeof(T) == (LANES)>* = nullptr
648
+
649
+ // Empty struct used as a size tag type.
650
+ template <size_t N>
651
+ struct SizeTag {};
652
+
653
+ template <class T>
654
+ class DeclValT {
655
+ private:
656
+ template <class U, class URef = U&&>
657
+ static URef TryAddRValRef(int);
658
+ template <class U, class Arg>
659
+ static U TryAddRValRef(Arg);
660
+
661
+ public:
662
+ using type = decltype(TryAddRValRef<T>(0));
663
+ enum { kDisableDeclValEvaluation = 1 };
664
+ };
665
+
666
+ // hwy::DeclVal<T>() can only be used in unevaluated contexts such as within an
667
+ // expression of a decltype specifier.
668
+
669
+ // hwy::DeclVal<T>() does not require that T have a public default constructor
670
+ template <class T>
671
+ HWY_API typename DeclValT<T>::type DeclVal() noexcept {
672
+ static_assert(!DeclValT<T>::kDisableDeclValEvaluation,
673
+ "DeclVal() cannot be used in an evaluated context");
674
+ }
675
+
676
+ template <class T>
677
+ struct IsArrayT {
678
+ enum { value = 0 };
679
+ };
680
+
681
+ template <class T>
682
+ struct IsArrayT<T[]> {
683
+ enum { value = 1 };
684
+ };
685
+
686
+ template <class T, size_t N>
687
+ struct IsArrayT<T[N]> {
688
+ enum { value = 1 };
689
+ };
690
+
691
+ template <class T>
692
+ static constexpr bool IsArray() {
693
+ return IsArrayT<T>::value;
694
+ }
695
+
696
+ #if HWY_COMPILER_MSVC
697
+ HWY_DIAGNOSTICS(push)
698
+ HWY_DIAGNOSTICS_OFF(disable : 4180, ignored "-Wignored-qualifiers")
699
+ #endif
700
+
701
+ template <class From, class To>
702
+ class IsConvertibleT {
703
+ private:
704
+ template <class T>
705
+ static hwy::SizeTag<1> TestFuncWithToArg(T);
706
+
707
+ template <class T, class U>
708
+ static decltype(IsConvertibleT<T, U>::template TestFuncWithToArg<U>(
709
+ DeclVal<T>()))
710
+ TryConvTest(int);
711
+
712
+ template <class T, class U, class Arg>
713
+ static hwy::SizeTag<0> TryConvTest(Arg);
714
+
715
+ public:
716
+ enum {
717
+ value = (IsSame<RemoveConst<RemoveVolatile<From>>, void>() &&
718
+ IsSame<RemoveConst<RemoveVolatile<To>>, void>()) ||
719
+ (!IsArray<To>() &&
720
+ (IsSame<To, decltype(DeclVal<To>())>() ||
721
+ !IsSame<const RemoveConst<To>, RemoveConst<To>>()) &&
722
+ IsSame<decltype(TryConvTest<From, To>(0)), hwy::SizeTag<1>>())
723
+ };
724
+ };
725
+
726
+ #if HWY_COMPILER_MSVC
727
+ HWY_DIAGNOSTICS(pop)
728
+ #endif
729
+
730
+ template <class From, class To>
731
+ HWY_API constexpr bool IsConvertible() {
732
+ return IsConvertibleT<From, To>::value;
733
+ }
734
+
735
+ template <class From, class To>
736
+ class IsStaticCastableT {
737
+ private:
738
+ template <class T, class U, class = decltype(static_cast<U>(DeclVal<T>()))>
739
+ static hwy::SizeTag<1> TryStaticCastTest(int);
740
+
741
+ template <class T, class U, class Arg>
742
+ static hwy::SizeTag<0> TryStaticCastTest(Arg);
743
+
744
+ public:
745
+ enum {
746
+ value = IsSame<decltype(TryStaticCastTest<From, To>(0)), hwy::SizeTag<1>>()
747
+ };
748
+ };
749
+
750
+ template <class From, class To>
751
+ static constexpr bool IsStaticCastable() {
752
+ return IsStaticCastableT<From, To>::value;
753
+ }
754
+
755
+ #define HWY_IF_CASTABLE(From, To) \
756
+ hwy::EnableIf<IsStaticCastable<From, To>()>* = nullptr
757
+
758
+ #define HWY_IF_OP_CASTABLE(op, T, Native) \
759
+ HWY_IF_CASTABLE(decltype(DeclVal<Native>() op DeclVal<T>()), Native)
760
+
761
+ template <class T, class From>
762
+ class IsAssignableT {
763
+ private:
764
+ template <class T1, class T2, class = decltype(DeclVal<T1>() = DeclVal<T2>())>
765
+ static hwy::SizeTag<1> TryAssignTest(int);
766
+
767
+ template <class T1, class T2, class Arg>
768
+ static hwy::SizeTag<0> TryAssignTest(Arg);
769
+
770
+ public:
771
+ enum {
772
+ value = IsSame<decltype(TryAssignTest<T, From>(0)), hwy::SizeTag<1>>()
773
+ };
774
+ };
775
+
776
+ template <class T, class From>
777
+ static constexpr bool IsAssignable() {
778
+ return IsAssignableT<T, From>::value;
779
+ }
780
+
781
+ #define HWY_IF_ASSIGNABLE(T, From) \
782
+ hwy::EnableIf<IsAssignable<T, From>()>* = nullptr
783
+
784
+ // ----------------------------------------------------------------------------
785
+ // IsSpecialFloat
786
+
787
+ // These types are often special-cased and not supported in all ops.
788
+ template <typename T>
789
+ HWY_API constexpr bool IsSpecialFloat() {
790
+ return IsSameEither<RemoveCvRef<T>, hwy::float16_t, hwy::bfloat16_t>();
791
+ }
792
+
793
+ // -----------------------------------------------------------------------------
794
+ // IsIntegerLaneType and IsInteger
795
+
796
+ template <class T>
797
+ HWY_API constexpr bool IsIntegerLaneType() {
798
+ return false;
799
+ }
800
+ template <>
801
+ HWY_INLINE constexpr bool IsIntegerLaneType<int8_t>() {
802
+ return true;
803
+ }
804
+ template <>
805
+ HWY_INLINE constexpr bool IsIntegerLaneType<uint8_t>() {
806
+ return true;
807
+ }
808
+ template <>
809
+ HWY_INLINE constexpr bool IsIntegerLaneType<int16_t>() {
810
+ return true;
811
+ }
812
+ template <>
813
+ HWY_INLINE constexpr bool IsIntegerLaneType<uint16_t>() {
814
+ return true;
815
+ }
816
+ template <>
817
+ HWY_INLINE constexpr bool IsIntegerLaneType<int32_t>() {
818
+ return true;
819
+ }
820
+ template <>
821
+ HWY_INLINE constexpr bool IsIntegerLaneType<uint32_t>() {
822
+ return true;
823
+ }
824
+ template <>
825
+ HWY_INLINE constexpr bool IsIntegerLaneType<int64_t>() {
826
+ return true;
827
+ }
828
+ template <>
829
+ HWY_INLINE constexpr bool IsIntegerLaneType<uint64_t>() {
830
+ return true;
831
+ }
832
+
833
+ template <class T>
834
+ HWY_API constexpr bool IsInteger() {
835
+ // NOTE: Do not add a IsInteger<wchar_t>() specialization below as it is
836
+ // possible for IsSame<wchar_t, uint16_t>() to be true when compiled with MSVC
837
+ // with the /Zc:wchar_t- option.
838
+ return IsIntegerLaneType<T>() || IsSame<RemoveCvRef<T>, wchar_t>() ||
839
+ IsSameEither<RemoveCvRef<T>, size_t, ptrdiff_t>() ||
840
+ IsSameEither<RemoveCvRef<T>, intptr_t, uintptr_t>();
841
+ }
842
+ template <>
843
+ HWY_INLINE constexpr bool IsInteger<bool>() {
844
+ return true;
845
+ }
846
+ template <>
847
+ HWY_INLINE constexpr bool IsInteger<char>() {
848
+ return true;
849
+ }
850
+ template <>
851
+ HWY_INLINE constexpr bool IsInteger<signed char>() {
852
+ return true;
853
+ }
854
+ template <>
855
+ HWY_INLINE constexpr bool IsInteger<unsigned char>() {
856
+ return true;
857
+ }
858
+ template <>
859
+ HWY_INLINE constexpr bool IsInteger<short>() { // NOLINT
860
+ return true;
861
+ }
862
+ template <>
863
+ HWY_INLINE constexpr bool IsInteger<unsigned short>() { // NOLINT
864
+ return true;
865
+ }
866
+ template <>
867
+ HWY_INLINE constexpr bool IsInteger<int>() {
868
+ return true;
869
+ }
870
+ template <>
871
+ HWY_INLINE constexpr bool IsInteger<unsigned>() {
872
+ return true;
873
+ }
874
+ template <>
875
+ HWY_INLINE constexpr bool IsInteger<long>() { // NOLINT
876
+ return true;
877
+ }
878
+ template <>
879
+ HWY_INLINE constexpr bool IsInteger<unsigned long>() { // NOLINT
880
+ return true;
881
+ }
882
+ template <>
883
+ HWY_INLINE constexpr bool IsInteger<long long>() { // NOLINT
884
+ return true;
885
+ }
886
+ template <>
887
+ HWY_INLINE constexpr bool IsInteger<unsigned long long>() { // NOLINT
888
+ return true;
889
+ }
890
+ #if defined(__cpp_char8_t) && __cpp_char8_t >= 201811L
891
+ template <>
892
+ HWY_INLINE constexpr bool IsInteger<char8_t>() {
893
+ return true;
894
+ }
895
+ #endif
896
+ template <>
897
+ HWY_INLINE constexpr bool IsInteger<char16_t>() {
898
+ return true;
899
+ }
900
+ template <>
901
+ HWY_INLINE constexpr bool IsInteger<char32_t>() {
902
+ return true;
903
+ }
904
+
905
+ // -----------------------------------------------------------------------------
906
+ // BitCastScalar
907
+
908
+ #if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
909
+ #define HWY_BITCASTSCALAR_CONSTEXPR constexpr
910
+ #else
911
+ #define HWY_BITCASTSCALAR_CONSTEXPR
912
+ #endif
913
+
914
+ #if __cpp_constexpr >= 201304L
915
+ #define HWY_BITCASTSCALAR_CXX14_CONSTEXPR HWY_BITCASTSCALAR_CONSTEXPR
916
+ #else
917
+ #define HWY_BITCASTSCALAR_CXX14_CONSTEXPR
918
+ #endif
919
+
920
+ #if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
921
+ namespace detail {
922
+
923
+ template <class From>
924
+ struct BitCastScalarSrcCastHelper {
925
+ static HWY_INLINE constexpr const From& CastSrcValRef(const From& val) {
926
+ return val;
927
+ }
928
+ };
929
+
930
+ #if HWY_COMPILER_CLANG >= 900 && HWY_COMPILER_CLANG < 1000
931
+ // Workaround for Clang 9 constexpr __builtin_bit_cast bug
932
+ template <class To, class From,
933
+ hwy::EnableIf<hwy::IsInteger<RemoveCvRef<To>>() &&
934
+ hwy::IsInteger<RemoveCvRef<From>>()>* = nullptr>
935
+ static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR To
936
+ BuiltinBitCastScalar(const From& val) {
937
+ static_assert(sizeof(To) == sizeof(From),
938
+ "sizeof(To) == sizeof(From) must be true");
939
+ return static_cast<To>(val);
940
+ }
941
+
942
+ template <class To, class From,
943
+ hwy::EnableIf<!(hwy::IsInteger<RemoveCvRef<To>>() &&
944
+ hwy::IsInteger<RemoveCvRef<From>>())>* = nullptr>
945
+ static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR To
946
+ BuiltinBitCastScalar(const From& val) {
947
+ return __builtin_bit_cast(To, val);
948
+ }
949
+ #endif // HWY_COMPILER_CLANG >= 900 && HWY_COMPILER_CLANG < 1000
950
+
951
+ } // namespace detail
952
+
953
+ template <class To, class From, HWY_IF_NOT_SPECIAL_FLOAT(To)>
954
+ HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From& val) {
955
+ // If From is hwy::float16_t or hwy::bfloat16_t, first cast val to either
956
+ // const typename From::Native& or const uint16_t& using
957
+ // detail::BitCastScalarSrcCastHelper<RemoveCvRef<From>>::CastSrcValRef to
958
+ // allow BitCastScalar from hwy::float16_t or hwy::bfloat16_t to be constexpr
959
+ // if To is not a pointer type, union type, or a struct/class containing a
960
+ // pointer, union, or reference subobject
961
+ #if HWY_COMPILER_CLANG >= 900 && HWY_COMPILER_CLANG < 1000
962
+ return detail::BuiltinBitCastScalar<To>(
963
+ detail::BitCastScalarSrcCastHelper<RemoveCvRef<From>>::CastSrcValRef(
964
+ val));
965
+ #else
966
+ return __builtin_bit_cast(
967
+ To, detail::BitCastScalarSrcCastHelper<RemoveCvRef<From>>::CastSrcValRef(
968
+ val));
969
+ #endif
970
+ }
971
+ template <class To, class From, HWY_IF_SPECIAL_FLOAT(To)>
972
+ HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From& val) {
973
+ // If To is hwy::float16_t or hwy::bfloat16_t, first do a BitCastScalar of val
974
+ // to uint16_t, and then bit cast the uint16_t value to To using To::FromBits
975
+ // as hwy::float16_t::FromBits and hwy::bfloat16_t::FromBits are guaranteed to
976
+ // be constexpr if the __builtin_bit_cast intrinsic is available.
977
+ return To::FromBits(BitCastScalar<uint16_t>(val));
978
+ }
979
+ #else
980
+ template <class To, class From>
981
+ HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From& val) {
982
+ To result;
983
+ CopySameSize(&val, &result);
984
+ return result;
985
+ }
986
+ #endif
987
+
988
+ //------------------------------------------------------------------------------
989
+ // F16 lane type
990
+
991
+ #pragma pack(push, 1)
992
+
993
+ // Compiler supports __fp16 and load/store/conversion NEON intrinsics, which are
994
+ // included in Armv8 and VFPv4 (except with MSVC). On Armv7 Clang requires
995
+ // __ARM_FP & 2 whereas Armv7 GCC requires -mfp16-format=ieee.
996
+ #if (HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC) || \
997
+ (HWY_COMPILER_CLANG && defined(__ARM_FP) && (__ARM_FP & 2)) || \
998
+ (HWY_COMPILER_GCC_ACTUAL && defined(__ARM_FP16_FORMAT_IEEE))
999
+ #define HWY_NEON_HAVE_F16C 1
1000
+ #else
1001
+ #define HWY_NEON_HAVE_F16C 0
1002
+ #endif
1003
+
1004
+ // RVV with f16 extension supports _Float16 and f16 vector ops. If set, implies
1005
+ // HWY_HAVE_FLOAT16.
1006
+ #if HWY_ARCH_RVV && defined(__riscv_zvfh) && HWY_COMPILER_CLANG >= 1600
1007
+ #define HWY_RVV_HAVE_F16_VEC 1
1008
+ #else
1009
+ #define HWY_RVV_HAVE_F16_VEC 0
1010
+ #endif
1011
+
1012
+ // x86 compiler supports _Float16, not necessarily with operators.
1013
+ // Avoid clang-cl because it lacks __extendhfsf2.
1014
+ #if HWY_ARCH_X86 && defined(__SSE2__) && defined(__FLT16_MAX__) && \
1015
+ ((HWY_COMPILER_CLANG >= 1500 && !HWY_COMPILER_CLANGCL) || \
1016
+ HWY_COMPILER_GCC_ACTUAL >= 1200)
1017
+ #define HWY_SSE2_HAVE_F16_TYPE 1
1018
+ #else
1019
+ #define HWY_SSE2_HAVE_F16_TYPE 0
1020
+ #endif
1021
+
1022
+ #ifndef HWY_HAVE_SCALAR_F16_TYPE
1023
+ // Compiler supports _Float16, not necessarily with operators.
1024
+ #if HWY_NEON_HAVE_F16C || HWY_RVV_HAVE_F16_VEC || HWY_SSE2_HAVE_F16_TYPE
1025
+ #define HWY_HAVE_SCALAR_F16_TYPE 1
1026
+ #else
1027
+ #define HWY_HAVE_SCALAR_F16_TYPE 0
1028
+ #endif
1029
+ #endif // HWY_HAVE_SCALAR_F16_TYPE
1030
+
1031
+ #ifndef HWY_HAVE_SCALAR_F16_OPERATORS
1032
+ // Recent enough compiler also has operators.
1033
+ #if HWY_HAVE_SCALAR_F16_TYPE && \
1034
+ (HWY_COMPILER_CLANG >= 1800 || HWY_COMPILER_GCC_ACTUAL >= 1200 || \
1035
+ (HWY_COMPILER_CLANG >= 1500 && !HWY_COMPILER_CLANGCL && \
1036
+ !defined(_WIN32)) || \
1037
+ (HWY_ARCH_ARM && \
1038
+ (HWY_COMPILER_CLANG >= 900 || HWY_COMPILER_GCC_ACTUAL >= 800)))
1039
+ #define HWY_HAVE_SCALAR_F16_OPERATORS 1
1040
+ #else
1041
+ #define HWY_HAVE_SCALAR_F16_OPERATORS 0
1042
+ #endif
1043
+ #endif // HWY_HAVE_SCALAR_F16_OPERATORS
1044
+
1045
+ namespace detail {
1046
+
1047
+ template <class T, class TVal = RemoveCvRef<T>, bool = IsSpecialFloat<TVal>()>
1048
+ struct SpecialFloatUnwrapArithOpOperandT {};
1049
+
1050
+ template <class T, class TVal>
1051
+ struct SpecialFloatUnwrapArithOpOperandT<T, TVal, false> {
1052
+ using type = T;
1053
+ };
1054
+
1055
+ template <class T>
1056
+ using SpecialFloatUnwrapArithOpOperand =
1057
+ typename SpecialFloatUnwrapArithOpOperandT<T>::type;
1058
+
1059
+ template <class T, class TVal = RemoveCvRef<T>>
1060
+ struct NativeSpecialFloatToWrapperT {
1061
+ using type = T;
1062
+ };
1063
+
1064
+ template <class T>
1065
+ using NativeSpecialFloatToWrapper =
1066
+ typename NativeSpecialFloatToWrapperT<T>::type;
1067
+
1068
+ } // namespace detail
1069
+
1070
+ // Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
1071
+ // by concatenating base type and bits. We use a wrapper class instead of a
1072
+ // typedef to the native type to ensure that the same symbols, e.g. for VQSort,
1073
+ // are generated regardless of F16 support; see #1684.
1074
+ struct alignas(2) float16_t {
1075
+ #if HWY_HAVE_SCALAR_F16_TYPE
1076
+ #if HWY_RVV_HAVE_F16_VEC || HWY_SSE2_HAVE_F16_TYPE
1077
+ using Native = _Float16;
1078
+ #elif HWY_NEON_HAVE_F16C
1079
+ using Native = __fp16;
1080
+ #else
1081
+ #error "Logic error: condition should be 'all but NEON_HAVE_F16C'"
1082
+ #endif
1083
+ #endif // HWY_HAVE_SCALAR_F16_TYPE
1084
+
1085
+ union {
1086
+ #if HWY_HAVE_SCALAR_F16_TYPE
1087
+ // Accessed via NativeLaneType, and used directly if
1088
+ // HWY_HAVE_SCALAR_F16_OPERATORS.
1089
+ Native native;
1090
+ #endif
1091
+ // Only accessed via NativeLaneType or U16LaneType.
1092
+ uint16_t bits;
1093
+ };
1094
+
1095
+ // Default init and copying.
1096
+ float16_t() noexcept = default;
1097
+ constexpr float16_t(const float16_t&) noexcept = default;
1098
+ constexpr float16_t(float16_t&&) noexcept = default;
1099
+ float16_t& operator=(const float16_t&) noexcept = default;
1100
+ float16_t& operator=(float16_t&&) noexcept = default;
1101
+
1102
+ #if HWY_HAVE_SCALAR_F16_TYPE
1103
+ // NEON vget/set_lane intrinsics and SVE `svaddv` could use explicit
1104
+ // float16_t(intrinsic()), but user code expects implicit conversions.
1105
+ constexpr float16_t(Native arg) noexcept : native(arg) {}
1106
+ constexpr operator Native() const noexcept { return native; }
1107
+ #endif
1108
+
1109
+ #if HWY_HAVE_SCALAR_F16_TYPE
1110
+ static HWY_BITCASTSCALAR_CONSTEXPR float16_t FromBits(uint16_t bits) {
1111
+ return float16_t(BitCastScalar<Native>(bits));
1112
+ }
1113
+ #else
1114
+
1115
+ private:
1116
+ struct F16FromU16BitsTag {};
1117
+ constexpr float16_t(F16FromU16BitsTag /*tag*/, uint16_t u16_bits)
1118
+ : bits(u16_bits) {}
1119
+
1120
+ public:
1121
+ static constexpr float16_t FromBits(uint16_t bits) {
1122
+ return float16_t(F16FromU16BitsTag(), bits);
1123
+ }
1124
+ #endif
1125
+
1126
+ // When backed by a native type, ensure the wrapper behaves like the native
1127
+ // type by forwarding all operators. Unfortunately it seems difficult to reuse
1128
+ // this code in a base class, so we repeat it in float16_t.
1129
+ #if HWY_HAVE_SCALAR_F16_OPERATORS || HWY_IDE
1130
+ template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, float16_t>() &&
1131
+ IsConvertible<T, Native>()>* = nullptr>
1132
+ constexpr float16_t(T&& arg) noexcept
1133
+ : native(static_cast<Native>(static_cast<T&&>(arg))) {}
1134
+
1135
+ template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, float16_t>() &&
1136
+ !IsConvertible<T, Native>() &&
1137
+ IsStaticCastable<T, Native>()>* = nullptr>
1138
+ explicit constexpr float16_t(T&& arg) noexcept
1139
+ : native(static_cast<Native>(static_cast<T&&>(arg))) {}
1140
+
1141
+ // pre-decrement operator (--x)
1142
+ HWY_CXX14_CONSTEXPR float16_t& operator--() noexcept {
1143
+ native = static_cast<Native>(native - Native{1});
1144
+ return *this;
1145
+ }
1146
+
1147
+ // post-decrement operator (x--)
1148
+ HWY_CXX14_CONSTEXPR float16_t operator--(int) noexcept {
1149
+ float16_t result = *this;
1150
+ native = static_cast<Native>(native - Native{1});
1151
+ return result;
1152
+ }
1153
+
1154
+ // pre-increment operator (++x)
1155
+ HWY_CXX14_CONSTEXPR float16_t& operator++() noexcept {
1156
+ native = static_cast<Native>(native + Native{1});
1157
+ return *this;
1158
+ }
1159
+
1160
+ // post-increment operator (x++)
1161
+ HWY_CXX14_CONSTEXPR float16_t operator++(int) noexcept {
1162
+ float16_t result = *this;
1163
+ native = static_cast<Native>(native + Native{1});
1164
+ return result;
1165
+ }
1166
+
1167
+ constexpr float16_t operator-() const noexcept {
1168
+ return float16_t(static_cast<Native>(-native));
1169
+ }
1170
+ constexpr float16_t operator+() const noexcept { return *this; }
1171
+
1172
+ // Reduce clutter by generating `operator+` and `operator+=` etc. Note that
1173
+ // we cannot token-paste `operator` and `+`, so pass it in as `op_func`.
1174
+ #define HWY_FLOAT16_BINARY_OP(op, op_func, assign_func) \
1175
+ constexpr float16_t op_func(const float16_t& rhs) const noexcept { \
1176
+ return float16_t(static_cast<Native>(native op rhs.native)); \
1177
+ } \
1178
+ template <typename T, HWY_IF_NOT_F16(T), \
1179
+ typename UnwrappedT = \
1180
+ detail::SpecialFloatUnwrapArithOpOperand<const T&>, \
1181
+ typename RawResultT = \
1182
+ decltype(DeclVal<Native>() op DeclVal<UnwrappedT>()), \
1183
+ typename ResultT = \
1184
+ detail::NativeSpecialFloatToWrapper<RawResultT>, \
1185
+ HWY_IF_CASTABLE(RawResultT, ResultT)> \
1186
+ constexpr ResultT op_func(const T& rhs) const noexcept(noexcept( \
1187
+ static_cast<ResultT>(DeclVal<Native>() op DeclVal<UnwrappedT>()))) { \
1188
+ return static_cast<ResultT>(native op static_cast<UnwrappedT>(rhs)); \
1189
+ } \
1190
+ HWY_CXX14_CONSTEXPR hwy::float16_t& assign_func( \
1191
+ const hwy::float16_t& rhs) noexcept { \
1192
+ native = static_cast<Native>(native op rhs.native); \
1193
+ return *this; \
1194
+ } \
1195
+ template <typename T, HWY_IF_NOT_F16(T), \
1196
+ HWY_IF_OP_CASTABLE(op, const T&, Native), \
1197
+ HWY_IF_ASSIGNABLE( \
1198
+ Native, decltype(DeclVal<Native>() op DeclVal<const T&>()))> \
1199
+ HWY_CXX14_CONSTEXPR hwy::float16_t& assign_func(const T& rhs) noexcept( \
1200
+ noexcept( \
1201
+ static_cast<Native>(DeclVal<Native>() op DeclVal<const T&>()))) { \
1202
+ native = static_cast<Native>(native op rhs); \
1203
+ return *this; \
1204
+ }
1205
+
1206
+ HWY_FLOAT16_BINARY_OP(+, operator+, operator+=)
1207
+ HWY_FLOAT16_BINARY_OP(-, operator-, operator-=)
1208
+ HWY_FLOAT16_BINARY_OP(*, operator*, operator*=)
1209
+ HWY_FLOAT16_BINARY_OP(/, operator/, operator/=)
1210
+ #undef HWY_FLOAT16_BINARY_OP
1211
+
1212
+ #endif // HWY_HAVE_SCALAR_F16_OPERATORS
1213
+ };
1214
+ static_assert(sizeof(hwy::float16_t) == 2, "Wrong size of float16_t");
1215
+
1216
+ #if HWY_HAVE_SCALAR_F16_TYPE
1217
+ namespace detail {
1218
+
1219
+ #if HWY_HAVE_SCALAR_F16_OPERATORS
1220
+ template <class T>
1221
+ struct SpecialFloatUnwrapArithOpOperandT<T, hwy::float16_t, true> {
1222
+ using type = hwy::float16_t::Native;
1223
+ };
1224
+ #endif
1225
+
1226
+ template <class T>
1227
+ struct NativeSpecialFloatToWrapperT<T, hwy::float16_t::Native> {
1228
+ using type = hwy::float16_t;
1229
+ };
1230
+
1231
+ } // namespace detail
1232
+ #endif // HWY_HAVE_SCALAR_F16_TYPE
1233
+
1234
+ #if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
1235
+ namespace detail {
1236
+
1237
+ template <>
1238
+ struct BitCastScalarSrcCastHelper<hwy::float16_t> {
1239
+ #if HWY_HAVE_SCALAR_F16_TYPE
1240
+ static HWY_INLINE constexpr const hwy::float16_t::Native& CastSrcValRef(
1241
+ const hwy::float16_t& val) {
1242
+ return val.native;
1243
+ }
1244
+ #else
1245
+ static HWY_INLINE constexpr const uint16_t& CastSrcValRef(
1246
+ const hwy::float16_t& val) {
1247
+ return val.bits;
1248
+ }
1249
+ #endif
1250
+ };
1251
+
1252
+ } // namespace detail
1253
+ #endif // HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
1254
+
1255
+ #if HWY_HAVE_SCALAR_F16_OPERATORS
1256
+ #define HWY_F16_CONSTEXPR constexpr
1257
+ #else
1258
+ #define HWY_F16_CONSTEXPR HWY_BITCASTSCALAR_CXX14_CONSTEXPR
1259
+ #endif // HWY_HAVE_SCALAR_F16_OPERATORS
1260
+
1261
+ HWY_API HWY_F16_CONSTEXPR float F32FromF16(float16_t f16) {
1262
+ #if HWY_HAVE_SCALAR_F16_OPERATORS && !HWY_IDE
1263
+ return static_cast<float>(f16);
1264
+ #endif
1265
+ #if !HWY_HAVE_SCALAR_F16_OPERATORS || HWY_IDE
1266
+ const uint16_t bits16 = BitCastScalar<uint16_t>(f16);
1267
+ const uint32_t sign = static_cast<uint32_t>(bits16 >> 15);
1268
+ const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
1269
+ const uint32_t mantissa = bits16 & 0x3FF;
1270
+
1271
+ // Subnormal or zero
1272
+ if (biased_exp == 0) {
1273
+ const float subnormal =
1274
+ (1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024));
1275
+ return sign ? -subnormal : subnormal;
1276
+ }
1277
+
1278
+ // Normalized, infinity or NaN: convert the representation directly
1279
+ // (faster than ldexp/tables).
1280
+ const uint32_t biased_exp32 =
1281
+ biased_exp == 31 ? 0xFF : biased_exp + (127 - 15);
1282
+ const uint32_t mantissa32 = mantissa << (23 - 10);
1283
+ const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
1284
+
1285
+ return BitCastScalar<float>(bits32);
1286
+ #endif // !HWY_HAVE_SCALAR_F16_OPERATORS
1287
+ }
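
As a reading aid for the fallback branch above, here is a standalone sketch of the same binary16-to-float widening using only the standard library. The function name HalfBitsToFloat and the test values are illustrative and not part of the header; the logic mirrors the zero/subnormal and normal/inf/NaN cases shown in the diff.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Widen an IEEE binary16 bit pattern to float: zero/subnormal first, then
// normal/inf/NaN by re-biasing the exponent, as in the F32FromF16 fallback.
static float HalfBitsToFloat(uint16_t bits16) {
  const uint32_t sign = bits16 >> 15;
  const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
  const uint32_t mantissa = bits16 & 0x3FF;

  if (biased_exp == 0) {  // zero or subnormal: value is mantissa * 2^-24
    const float subnormal =
        (1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024));
    return sign ? -subnormal : subnormal;
  }

  // Re-bias the exponent (15 -> 127); the all-ones exponent maps to inf/NaN.
  const uint32_t biased_exp32 =
      biased_exp == 31 ? 0xFF : biased_exp + (127 - 15);
  const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | (mantissa << 13);
  float f;
  std::memcpy(&f, &bits32, sizeof(f));  // portable bit cast
  return f;
}

int main() {
  // 0x3C00 encodes 1.0, 0xC000 encodes -2.0, 0x7BFF is the largest finite f16.
  std::printf("%g %g %g\n", HalfBitsToFloat(0x3C00), HalfBitsToFloat(0xC000),
              HalfBitsToFloat(0x7BFF));  // 1 -2 65504
  return 0;
}
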
1288
+
1289
+ #if HWY_IS_DEBUG_BUILD && \
1290
+ (HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926)
1291
+ #if defined(__cpp_if_consteval) && __cpp_if_consteval >= 202106L
1292
+ // If C++23 if !consteval support is available, only execute
1293
+ // HWY_DASSERT(condition) if F16FromF32 is not called from a constant-evaluated
1294
+ // context to avoid compilation errors.
1295
+ #define HWY_F16_FROM_F32_DASSERT(condition) \
1296
+ do { \
1297
+ if !consteval { \
1298
+ HWY_DASSERT(condition); \
1299
+ } \
1300
+ } while (0)
1301
+ #elif HWY_HAS_BUILTIN(__builtin_is_constant_evaluated) || \
1302
+ HWY_COMPILER_MSVC >= 1926
1303
+ // If the __builtin_is_constant_evaluated() intrinsic is available,
1304
+ // only do HWY_DASSERT(condition) if __builtin_is_constant_evaluated() returns
1305
+ // false to avoid compilation errors if F16FromF32 is called from a
1306
+ // constant-evaluated context.
1307
+ #define HWY_F16_FROM_F32_DASSERT(condition) \
1308
+ do { \
1309
+ if (!__builtin_is_constant_evaluated()) { \
1310
+ HWY_DASSERT(condition); \
1311
+ } \
1312
+ } while (0)
1313
+ #else
1314
+ // If neither C++23 `if !consteval` nor the __builtin_is_constant_evaluated()
1315
+ // intrinsic is available, but HWY_IS_DEBUG_BUILD is 1 and the
1316
+ // __builtin_bit_cast intrinsic is available, skip the HWY_DASSERT entirely
1317
+ // to avoid compilation errors if F16FromF32 is called from a
1318
+ // constant-evaluated context.
1319
+ #define HWY_F16_FROM_F32_DASSERT(condition) \
1320
+ do { \
1321
+ } while (0)
1322
+ #endif // defined(__cpp_if_consteval) && __cpp_if_consteval >= 202106L
1323
+ #else
1324
+ // If HWY_IS_DEBUG_BUILD is 0 or the __builtin_bit_cast intrinsic is not
1325
+ // available, define HWY_F16_FROM_F32_DASSERT(condition) as
1326
+ // HWY_DASSERT(condition)
1327
+ #define HWY_F16_FROM_F32_DASSERT(condition) HWY_DASSERT(condition)
1328
+ #endif // HWY_IS_DEBUG_BUILD && (HWY_HAS_BUILTIN(__builtin_bit_cast) ||
1329
+ // HWY_COMPILER_MSVC >= 1926)
1330
+
1331
+ HWY_API HWY_F16_CONSTEXPR float16_t F16FromF32(float f32) {
1332
+ #if HWY_HAVE_SCALAR_F16_OPERATORS && !HWY_IDE
1333
+ return float16_t(static_cast<float16_t::Native>(f32));
1334
+ #endif
1335
+ #if !HWY_HAVE_SCALAR_F16_OPERATORS || HWY_IDE
1336
+ const uint32_t bits32 = BitCastScalar<uint32_t>(f32);
1337
+ const uint32_t sign = bits32 >> 31;
1338
+ const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
1339
+ constexpr uint32_t kMantissaMask = 0x7FFFFF;
1340
+ const uint32_t mantissa32 = bits32 & kMantissaMask;
1341
+
1342
+ // Before shifting (truncation), round to nearest even to reduce bias. If
1343
+ // the lowest remaining mantissa bit is odd, increase the offset. Example
1344
+ // with the lowest remaining bit (left) and next lower two bits; the
1345
+ // latter, plus two more, will be truncated.
1346
+ // 0[00] + 1 = 0[01]
1347
+ // 0[01] + 1 = 0[10]
1348
+ // 0[10] + 1 = 0[11] (round down toward even)
1349
+ // 0[11] + 1 = 1[00] (round up)
1350
+ // 1[00] + 10 = 1[10]
1351
+ // 1[01] + 10 = 1[11]
1352
+ // 1[10] + 10 = C0[00] (round up toward even with C=1 carry out)
1353
+ // 1[11] + 10 = C0[01] (round up toward even with C=1 carry out)
1354
+ const uint32_t odd_bit = (mantissa32 >> 13) & 1;
1355
+ const uint32_t rounded = mantissa32 + odd_bit + 0xFFF;
1356
+ const bool carry = rounded >= (1u << 23);
1357
+
1358
+ const int32_t exp = static_cast<int32_t>(biased_exp32) - 127 + carry;
1359
+
1360
+ // Tiny or zero => zero.
1361
+ if (exp < -24) {
1362
+ // restore original sign
1363
+ return float16_t::FromBits(static_cast<uint16_t>(sign << 15));
1364
+ }
1365
+
1366
+ // If biased_exp16 would be >= 31, first check whether the input was NaN so we
1367
+ // can set the mantissa to nonzero.
1368
+ const bool is_nan = (biased_exp32 == 255) && mantissa32 != 0;
1369
+ const bool overflowed = exp >= 16;
1370
+ const uint32_t biased_exp16 =
1371
+ static_cast<uint32_t>(HWY_MIN(HWY_MAX(0, exp + 15), 31));
1372
+ // exp = [-24, -15] => subnormal, shift the mantissa.
1373
+ const uint32_t sub_exp = static_cast<uint32_t>(HWY_MAX(-14 - exp, 0));
1374
+ HWY_F16_FROM_F32_DASSERT(sub_exp < 11);
1375
+ const uint32_t shifted_mantissa =
1376
+ (rounded & kMantissaMask) >> (23 - 10 + sub_exp);
1377
+ const uint32_t leading = sub_exp == 0u ? 0u : (1024u >> sub_exp);
1378
+ const uint32_t mantissa16 = is_nan ? 0x3FF
1379
+ : overflowed ? 0u
1380
+ : (leading + shifted_mantissa);
1381
+
1382
+ #if HWY_IS_DEBUG_BUILD
1383
+ if (exp < -14) {
1384
+ HWY_F16_FROM_F32_DASSERT(biased_exp16 == 0);
1385
+ HWY_F16_FROM_F32_DASSERT(sub_exp >= 1);
1386
+ } else if (exp <= 15) {
1387
+ HWY_F16_FROM_F32_DASSERT(1 <= biased_exp16 && biased_exp16 < 31);
1388
+ HWY_F16_FROM_F32_DASSERT(sub_exp == 0);
1389
+ }
1390
+ #endif
1391
+
1392
+ HWY_F16_FROM_F32_DASSERT(mantissa16 < 1024);
1393
+ const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
1394
+ HWY_F16_FROM_F32_DASSERT(bits16 < 0x10000);
1395
+ const uint16_t narrowed = static_cast<uint16_t>(bits16); // big-endian safe
1396
+ return float16_t::FromBits(narrowed);
1397
+ #endif // !HWY_HAVE_SCALAR_F16_OPERATORS
1398
+ }
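
The `odd_bit + 0xFFF` addition above implements round-to-nearest-even for the 13 mantissa bits that are about to be truncated. The following standalone sketch (names are illustrative, not from the header) isolates just that rounding step so the two halfway cases from the comment table can be checked directly.

#include <cstdint>
#include <cstdio>

// Round a 23-bit mantissa down to 10 bits with round-to-nearest-even, using
// the same "add (odd_bit + 0xFFF), then truncate" trick as F16FromF32 above.
static void RoundMantissa23To10(uint32_t mantissa32, uint32_t* rounded10,
                                bool* carry) {
  const uint32_t odd_bit = (mantissa32 >> 13) & 1;  // lowest surviving bit
  const uint32_t rounded = mantissa32 + odd_bit + 0xFFF;
  *carry = rounded >= (1u << 23);  // mantissa overflowed into the exponent
  *rounded10 = (rounded & 0x7FFFFF) >> 13;
}

int main() {
  uint32_t r;
  bool c;
  // Exactly halfway (discarded 13 bits are 0x1000), surviving bit even: down.
  RoundMantissa23To10(0x001000, &r, &c);
  std::printf("%u %d\n", r, c);  // 0 0
  // Exactly halfway, surviving bit odd: rounds up to the next even value.
  RoundMantissa23To10(0x003000, &r, &c);
  std::printf("%u %d\n", r, c);  // 2 0
  return 0;
}
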
1399
+
1400
+ HWY_API HWY_F16_CONSTEXPR float16_t F16FromF64(double f64) {
1401
+ #if HWY_HAVE_SCALAR_F16_OPERATORS
1402
+ return float16_t(static_cast<float16_t::Native>(f64));
1403
+ #else
1404
+ // The mantissa bits of f64 are first rounded using round-to-odd rounding
1405
+ // to the nearest f64 value that has the lower 29 bits zeroed out to
1406
+ // ensure that the result is correctly rounded to a F16.
1407
+
1408
+ // The F64 round-to-odd operation below will round a normal F64 value
1409
+ // (using round-to-odd rounding) to a F64 value that has 24 bits of precision.
1410
+
1411
+ // It is okay if the magnitude of a denormal F64 value is rounded up in the
1412
+ // F64 round-to-odd step below as the magnitude of a denormal F64 value is
1413
+ // much smaller than 2^(-24) (the smallest positive denormal F16 value).
1414
+
1415
+ // It is also okay if bit 29 of a NaN F64 value is changed by the F64
1416
+ // round-to-odd step below as the lower 13 bits of a F32 NaN value are usually
1417
+ // discarded or ignored by the conversion of a F32 NaN value to a F16.
1418
+
1419
+ // If f64 is a NaN, the result of the F64 round-to-odd step is still a NaN,
1420
+ // because the rounded value keeps at least one nonzero mantissa bit
1421
+ // whenever f64 is a NaN.

1422
+
1423
+ // The F64 round-to-odd step will ensure that the F64 to F32 conversion is
1424
+ // exact if the magnitude of the rounded F64 value (using round-to-odd
1425
+ // rounding) is between 2^(-126) (the smallest normal F32 value) and
1426
+ // HighestValue<float>() (the largest finite F32 value).
1427
+
1428
+ // It is okay if the F64 to F32 conversion is inexact for F64 values that have
1429
+ // a magnitude that is less than 2^(-126) as the magnitude of a denormal F32
1430
+ // value is much smaller than 2^(-24) (the smallest positive denormal F16
1431
+ // value).
1432
+
1433
+ return F16FromF32(
1434
+ static_cast<float>(BitCastScalar<double>(static_cast<uint64_t>(
1435
+ (BitCastScalar<uint64_t>(f64) & 0xFFFFFFFFE0000000ULL) |
1436
+ ((BitCastScalar<uint64_t>(f64) + 0x000000001FFFFFFFULL) &
1437
+ 0x0000000020000000ULL)))));
1438
+ #endif
1439
+ }
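
The single masked-add expression above performs the round-to-odd prepass on the raw f64 bits. An equivalent standalone formulation with the masks spelled out (the helper name is illustrative, not from the header):

#include <cstdint>
#include <cstdio>

// Round-to-odd on the low 29 bits of a double's representation: clear them
// and leave bit 29 set whenever bit 29 or any lower bit was set, so the
// "inexact" information survives the later double -> float -> f16 narrowing.
static uint64_t RoundToOdd29(uint64_t bits64) {
  const uint64_t kept = bits64 & 0xFFFFFFFFE0000000ULL;  // bits 29..63
  const uint64_t carried_sticky =
      (bits64 + 0x000000001FFFFFFFULL) & 0x0000000020000000ULL;
  // kept already contains the original bit 29; OR-ing in the carried bit makes
  // bit 29 "sticky" with respect to all of the discarded low bits.
  return kept | carried_sticky;
}

int main() {
  // 1.0 + 2^-40 only sets a low mantissa bit; after round-to-odd the value is
  // 1.0 + 2^-23 rather than collapsing back to exactly 1.0.
  const uint64_t one = 0x3FF0000000000000ULL;
  std::printf("%d\n", RoundToOdd29(one | (1ULL << 12)) == (one | (1ULL << 29)));
  return 0;  // prints 1
}
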
1440
+
1441
+ // More convenient to define outside float16_t because these may use
1442
+ // F32FromF16, which is defined after the struct.
1443
+ HWY_F16_CONSTEXPR inline bool operator==(float16_t lhs,
1444
+ float16_t rhs) noexcept {
1445
+ #if HWY_HAVE_SCALAR_F16_OPERATORS
1446
+ return lhs.native == rhs.native;
1447
+ #else
1448
+ return F32FromF16(lhs) == F32FromF16(rhs);
1449
+ #endif
1450
+ }
1451
+ HWY_F16_CONSTEXPR inline bool operator!=(float16_t lhs,
1452
+ float16_t rhs) noexcept {
1453
+ #if HWY_HAVE_SCALAR_F16_OPERATORS
1454
+ return lhs.native != rhs.native;
1455
+ #else
1456
+ return F32FromF16(lhs) != F32FromF16(rhs);
1457
+ #endif
1458
+ }
1459
+ HWY_F16_CONSTEXPR inline bool operator<(float16_t lhs, float16_t rhs) noexcept {
1460
+ #if HWY_HAVE_SCALAR_F16_OPERATORS
1461
+ return lhs.native < rhs.native;
1462
+ #else
1463
+ return F32FromF16(lhs) < F32FromF16(rhs);
1464
+ #endif
1465
+ }
1466
+ HWY_F16_CONSTEXPR inline bool operator<=(float16_t lhs,
1467
+ float16_t rhs) noexcept {
1468
+ #if HWY_HAVE_SCALAR_F16_OPERATORS
1469
+ return lhs.native <= rhs.native;
1470
+ #else
1471
+ return F32FromF16(lhs) <= F32FromF16(rhs);
1472
+ #endif
1473
+ }
1474
+ HWY_F16_CONSTEXPR inline bool operator>(float16_t lhs, float16_t rhs) noexcept {
1475
+ #if HWY_HAVE_SCALAR_F16_OPERATORS
1476
+ return lhs.native > rhs.native;
1477
+ #else
1478
+ return F32FromF16(lhs) > F32FromF16(rhs);
1479
+ #endif
1480
+ }
1481
+ HWY_F16_CONSTEXPR inline bool operator>=(float16_t lhs,
1482
+ float16_t rhs) noexcept {
1483
+ #if HWY_HAVE_SCALAR_F16_OPERATORS
1484
+ return lhs.native >= rhs.native;
1485
+ #else
1486
+ return F32FromF16(lhs) >= F32FromF16(rhs);
1487
+ #endif
1488
+ }
1489
+ #if HWY_HAVE_CXX20_THREE_WAY_COMPARE
1490
+ HWY_F16_CONSTEXPR inline std::partial_ordering operator<=>(
1491
+ float16_t lhs, float16_t rhs) noexcept {
1492
+ #if HWY_HAVE_SCALAR_F16_OPERATORS
1493
+ return lhs.native <=> rhs.native;
1494
+ #else
1495
+ return F32FromF16(lhs) <=> F32FromF16(rhs);
1496
+ #endif
1497
+ }
1498
+ #endif // HWY_HAVE_CXX20_THREE_WAY_COMPARE
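
A brief usage sketch of the comparisons defined above. It assumes the header is installed as hwy/base.h (the include path is an assumption of this sketch) and uses only F16FromF32 and the operators shown in this diff.

#include <cstdio>
#include <limits>
#include "hwy/base.h"  // assumed install path for the header in this diff

int main() {
  const hwy::float16_t a = hwy::F16FromF32(1.5f);
  const hwy::float16_t b = hwy::F16FromF32(2.0f);
  // With or without native __fp16 operators, these compare via F32FromF16.
  std::printf("%d %d\n", a < b, a == a);  // 1 1
  // NaN is unordered: both < and >= yield false.
  const hwy::float16_t nan =
      hwy::F16FromF32(std::numeric_limits<float>::quiet_NaN());
  std::printf("%d %d\n", nan < b, nan >= b);  // 0 0
  return 0;
}
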
1499
+
1500
+ //------------------------------------------------------------------------------
1501
+ // BF16 lane type
1502
+
1503
+ // Compiler supports ACLE __bf16, not necessarily with operators.
1504
+
1505
+ // Disable the __bf16 type on AArch64 with GCC 13 or earlier: those versions
1506
+ // have a bug that sometimes causes BF16 constant values to be loaded
1507
+ // incorrectly on AArch64, described at
1508
+ // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111867.
1509
+
1510
+ #if HWY_ARCH_ARM_A64 && \
1511
+ (HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400)
1512
+ #define HWY_ARM_HAVE_SCALAR_BF16_TYPE 1
1513
+ #else
1514
+ #define HWY_ARM_HAVE_SCALAR_BF16_TYPE 0
1515
+ #endif
1516
+
1517
+ // x86 compiler supports __bf16, not necessarily with operators.
1518
+ #ifndef HWY_SSE2_HAVE_SCALAR_BF16_TYPE
1519
+ #if HWY_ARCH_X86 && defined(__SSE2__) && \
1520
+ ((HWY_COMPILER_CLANG >= 1700 && !HWY_COMPILER_CLANGCL) || \
1521
+ HWY_COMPILER_GCC_ACTUAL >= 1300)
1522
+ #define HWY_SSE2_HAVE_SCALAR_BF16_TYPE 1
1523
+ #else
1524
+ #define HWY_SSE2_HAVE_SCALAR_BF16_TYPE 0
1525
+ #endif
1526
+ #endif // HWY_SSE2_HAVE_SCALAR_BF16_TYPE
1527
+
1528
+ // Compiler supports __bf16, not necessarily with operators.
1529
+ #if HWY_ARM_HAVE_SCALAR_BF16_TYPE || HWY_SSE2_HAVE_SCALAR_BF16_TYPE
1530
+ #define HWY_HAVE_SCALAR_BF16_TYPE 1
1531
+ #else
1532
+ #define HWY_HAVE_SCALAR_BF16_TYPE 0
1533
+ #endif
1534
+
1535
+ #ifndef HWY_HAVE_SCALAR_BF16_OPERATORS
1536
+ // A recent enough compiler also has the operators. AArch64 Clang 18 hits
1537
+ // internal compiler errors on bf16 ToString, hence only enable them on GCC for now.
1538
+ #if HWY_HAVE_SCALAR_BF16_TYPE && (HWY_COMPILER_GCC_ACTUAL >= 1300)
1539
+ #define HWY_HAVE_SCALAR_BF16_OPERATORS 1
1540
+ #else
1541
+ #define HWY_HAVE_SCALAR_BF16_OPERATORS 0
1542
+ #endif
1543
+ #endif // HWY_HAVE_SCALAR_BF16_OPERATORS
1544
+
1545
+ #if HWY_HAVE_SCALAR_BF16_OPERATORS
1546
+ #define HWY_BF16_CONSTEXPR constexpr
1547
+ #else
1548
+ #define HWY_BF16_CONSTEXPR HWY_BITCASTSCALAR_CONSTEXPR
1549
+ #endif
1550
+
1551
+ struct alignas(2) bfloat16_t {
1552
+ #if HWY_HAVE_SCALAR_BF16_TYPE
1553
+ using Native = __bf16;
1554
+ #endif
1555
+
1556
+ union {
1557
+ #if HWY_HAVE_SCALAR_BF16_TYPE
1558
+ // Accessed via NativeLaneType, and used directly if
1559
+ // HWY_HAVE_SCALAR_BF16_OPERATORS.
1560
+ Native native;
1561
+ #endif
1562
+ // Only accessed via NativeLaneType or U16LaneType.
1563
+ uint16_t bits;
1564
+ };
1565
+
1566
+ // Default init and copying
1567
+ bfloat16_t() noexcept = default;
1568
+ constexpr bfloat16_t(bfloat16_t&&) noexcept = default;
1569
+ constexpr bfloat16_t(const bfloat16_t&) noexcept = default;
1570
+ bfloat16_t& operator=(bfloat16_t&& arg) noexcept = default;
1571
+ bfloat16_t& operator=(const bfloat16_t& arg) noexcept = default;
1572
+
1573
+ // Only enable implicit conversions if we have a native type.
1574
+ #if HWY_HAVE_SCALAR_BF16_TYPE
1575
+ constexpr bfloat16_t(Native arg) noexcept : native(arg) {}
1576
+ constexpr operator Native() const noexcept { return native; }
1577
+ #endif
1578
+
1579
+ #if HWY_HAVE_SCALAR_BF16_TYPE
1580
+ static HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t FromBits(uint16_t bits) {
1581
+ return bfloat16_t(BitCastScalar<Native>(bits));
1582
+ }
1583
+ #else
1584
+
1585
+ private:
1586
+ struct BF16FromU16BitsTag {};
1587
+ constexpr bfloat16_t(BF16FromU16BitsTag /*tag*/, uint16_t u16_bits)
1588
+ : bits(u16_bits) {}
829
1589
 
830
- template <bool Condition, typename Then, typename Else>
831
- using If = typename IfT<Condition, Then, Else>::type;
1590
+ public:
1591
+ static constexpr bfloat16_t FromBits(uint16_t bits) {
1592
+ return bfloat16_t(BF16FromU16BitsTag(), bits);
1593
+ }
1594
+ #endif
832
1595
 
833
- // Insert into template/function arguments to enable this overload only for
834
- // vectors of exactly, at most (LE), or more than (GT) this many bytes.
835
- //
836
- // As an example, checking for a total size of 16 bytes will match both
837
- // Simd<uint8_t, 16, 0> and Simd<uint8_t, 8, 1>.
838
- #define HWY_IF_V_SIZE(T, kN, bytes) \
839
- hwy::EnableIf<kN * sizeof(T) == bytes>* = nullptr
840
- #define HWY_IF_V_SIZE_LE(T, kN, bytes) \
841
- hwy::EnableIf<kN * sizeof(T) <= bytes>* = nullptr
842
- #define HWY_IF_V_SIZE_GT(T, kN, bytes) \
843
- hwy::EnableIf<(kN * sizeof(T) > bytes)>* = nullptr
1596
+ // When backed by a native type, ensure the wrapper behaves like the native
1597
+ // type by forwarding all operators. Unfortunately it seems difficult to reuse
1598
+ // this code in a base class, so we repeat it in float16_t.
1599
+ #if HWY_HAVE_SCALAR_BF16_OPERATORS || HWY_IDE
1600
+ template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, Native>() &&
1601
+ !IsSame<RemoveCvRef<T>, bfloat16_t>() &&
1602
+ IsConvertible<T, Native>()>* = nullptr>
1603
+ constexpr bfloat16_t(T&& arg) noexcept(
1604
+ noexcept(static_cast<Native>(DeclVal<T>())))
1605
+ : native(static_cast<Native>(static_cast<T&&>(arg))) {}
1606
+
1607
+ template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, Native>() &&
1608
+ !IsSame<RemoveCvRef<T>, bfloat16_t>() &&
1609
+ !IsConvertible<T, Native>() &&
1610
+ IsStaticCastable<T, Native>()>* = nullptr>
1611
+ explicit constexpr bfloat16_t(T&& arg) noexcept(
1612
+ noexcept(static_cast<Native>(DeclVal<T>())))
1613
+ : native(static_cast<Native>(static_cast<T&&>(arg))) {}
1614
+
1615
+ HWY_CXX14_CONSTEXPR bfloat16_t& operator=(Native arg) noexcept {
1616
+ native = arg;
1617
+ return *this;
1618
+ }
844
1619
 
845
- #define HWY_IF_LANES(kN, lanes) hwy::EnableIf<(kN == lanes)>* = nullptr
846
- #define HWY_IF_LANES_LE(kN, lanes) hwy::EnableIf<(kN <= lanes)>* = nullptr
847
- #define HWY_IF_LANES_GT(kN, lanes) hwy::EnableIf<(kN > lanes)>* = nullptr
1620
+ // pre-decrement operator (--x)
1621
+ HWY_CXX14_CONSTEXPR bfloat16_t& operator--() noexcept {
1622
+ native = static_cast<Native>(native - Native{1});
1623
+ return *this;
1624
+ }
848
1625
 
849
- #define HWY_IF_UNSIGNED(T) hwy::EnableIf<!IsSigned<T>()>* = nullptr
850
- #define HWY_IF_SIGNED(T) \
851
- hwy::EnableIf<IsSigned<T>() && !IsFloat<T>() && !IsSpecialFloat<T>()>* = \
852
- nullptr
853
- #define HWY_IF_FLOAT(T) hwy::EnableIf<hwy::IsFloat<T>()>* = nullptr
854
- #define HWY_IF_NOT_FLOAT(T) hwy::EnableIf<!hwy::IsFloat<T>()>* = nullptr
855
- #define HWY_IF_FLOAT3264(T) hwy::EnableIf<hwy::IsFloat3264<T>()>* = nullptr
856
- #define HWY_IF_NOT_FLOAT3264(T) hwy::EnableIf<!hwy::IsFloat3264<T>()>* = nullptr
857
- #define HWY_IF_SPECIAL_FLOAT(T) \
858
- hwy::EnableIf<hwy::IsSpecialFloat<T>()>* = nullptr
859
- #define HWY_IF_NOT_SPECIAL_FLOAT(T) \
860
- hwy::EnableIf<!hwy::IsSpecialFloat<T>()>* = nullptr
861
- #define HWY_IF_FLOAT_OR_SPECIAL(T) \
862
- hwy::EnableIf<hwy::IsFloat<T>() || hwy::IsSpecialFloat<T>()>* = nullptr
863
- #define HWY_IF_NOT_FLOAT_NOR_SPECIAL(T) \
864
- hwy::EnableIf<!hwy::IsFloat<T>() && !hwy::IsSpecialFloat<T>()>* = nullptr
1626
+ // post-decrement operator (x--)
1627
+ HWY_CXX14_CONSTEXPR bfloat16_t operator--(int) noexcept {
1628
+ bfloat16_t result = *this;
1629
+ native = static_cast<Native>(native - Native{1});
1630
+ return result;
1631
+ }
865
1632
 
866
- #define HWY_IF_T_SIZE(T, bytes) hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
867
- #define HWY_IF_NOT_T_SIZE(T, bytes) \
868
- hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
869
- // bit_array = 0x102 means 1 or 8 bytes. There is no NONE_OF because it sounds
870
- // too similar. If you want the opposite of this (2 or 4 bytes), ask for those
871
- // bits explicitly (0x14) instead of attempting to 'negate' 0x102.
872
- #define HWY_IF_T_SIZE_ONE_OF(T, bit_array) \
873
- hwy::EnableIf<((size_t{1} << sizeof(T)) & (bit_array)) != 0>* = nullptr
1633
+ // pre-increment operator (++x)
1634
+ HWY_CXX14_CONSTEXPR bfloat16_t& operator++() noexcept {
1635
+ native = static_cast<Native>(native + Native{1});
1636
+ return *this;
1637
+ }
874
1638
 
875
- // Use instead of HWY_IF_T_SIZE to avoid ambiguity with float16_t/float/double
876
- // overloads.
877
- #define HWY_IF_UI16(T) \
878
- hwy::EnableIf<IsSame<T, uint16_t>() || IsSame<T, int16_t>()>* = nullptr
879
- #define HWY_IF_UI32(T) \
880
- hwy::EnableIf<IsSame<T, uint32_t>() || IsSame<T, int32_t>()>* = nullptr
881
- #define HWY_IF_UI64(T) \
882
- hwy::EnableIf<IsSame<T, uint64_t>() || IsSame<T, int64_t>()>* = nullptr
883
- #define HWY_IF_BF16(T) hwy::EnableIf<IsSame<T, hwy::bfloat16_t>()>* = nullptr
884
- #define HWY_IF_F16(T) hwy::EnableIf<IsSame<T, hwy::float16_t>()>* = nullptr
1639
+ // post-increment operator (x++)
1640
+ HWY_CXX14_CONSTEXPR bfloat16_t operator++(int) noexcept {
1641
+ bfloat16_t result = *this;
1642
+ native = static_cast<Native>(native + Native{1});
1643
+ return result;
1644
+ }
885
1645
 
886
- #define HWY_IF_LANES_PER_BLOCK(T, N, LANES) \
887
- hwy::EnableIf<HWY_MIN(sizeof(T) * N, 16) / sizeof(T) == (LANES)>* = nullptr
1646
+ constexpr bfloat16_t operator-() const noexcept {
1647
+ return bfloat16_t(static_cast<Native>(-native));
1648
+ }
1649
+ constexpr bfloat16_t operator+() const noexcept { return *this; }
888
1650
 
889
- // Empty struct used as a size tag type.
890
- template <size_t N>
891
- struct SizeTag {};
1651
+ // Reduce clutter by generating `operator+` and `operator+=` etc. Note that
1652
+ // we cannot token-paste `operator` and `+`, so pass it in as `op_func`.
1653
+ #define HWY_BFLOAT16_BINARY_OP(op, op_func, assign_func) \
1654
+ constexpr bfloat16_t op_func(const bfloat16_t& rhs) const noexcept { \
1655
+ return bfloat16_t(static_cast<Native>(native op rhs.native)); \
1656
+ } \
1657
+ template <typename T, HWY_IF_NOT_BF16(T), \
1658
+ typename UnwrappedT = \
1659
+ detail::SpecialFloatUnwrapArithOpOperand<const T&>, \
1660
+ typename RawResultT = \
1661
+ decltype(DeclVal<Native>() op DeclVal<UnwrappedT>()), \
1662
+ typename ResultT = \
1663
+ detail::NativeSpecialFloatToWrapper<RawResultT>, \
1664
+ HWY_IF_CASTABLE(RawResultT, ResultT)> \
1665
+ constexpr ResultT op_func(const T& rhs) const noexcept(noexcept( \
1666
+ static_cast<ResultT>(DeclVal<Native>() op DeclVal<UnwrappedT>()))) { \
1667
+ return static_cast<ResultT>(native op static_cast<UnwrappedT>(rhs)); \
1668
+ } \
1669
+ HWY_CXX14_CONSTEXPR hwy::bfloat16_t& assign_func( \
1670
+ const hwy::bfloat16_t& rhs) noexcept { \
1671
+ native = static_cast<Native>(native op rhs.native); \
1672
+ return *this; \
1673
+ } \
1674
+ template <typename T, HWY_IF_NOT_BF16(T), \
1675
+ HWY_IF_OP_CASTABLE(op, const T&, Native), \
1676
+ HWY_IF_ASSIGNABLE( \
1677
+ Native, decltype(DeclVal<Native>() op DeclVal<const T&>()))> \
1678
+ HWY_CXX14_CONSTEXPR hwy::bfloat16_t& assign_func(const T& rhs) noexcept( \
1679
+ noexcept( \
1680
+ static_cast<Native>(DeclVal<Native>() op DeclVal<const T&>()))) { \
1681
+ native = static_cast<Native>(native op rhs); \
1682
+ return *this; \
1683
+ }
1684
+ HWY_BFLOAT16_BINARY_OP(+, operator+, operator+=)
1685
+ HWY_BFLOAT16_BINARY_OP(-, operator-, operator-=)
1686
+ HWY_BFLOAT16_BINARY_OP(*, operator*, operator*=)
1687
+ HWY_BFLOAT16_BINARY_OP(/, operator/, operator/=)
1688
+ #undef HWY_BFLOAT16_BINARY_OP
892
1689
 
893
- template <class T>
894
- struct RemoveConstT {
895
- using type = T;
896
- };
897
- template <class T>
898
- struct RemoveConstT<const T> {
899
- using type = T;
1690
+ #endif // HWY_HAVE_SCALAR_BF16_OPERATORS
900
1691
  };
1692
+ static_assert(sizeof(hwy::bfloat16_t) == 2, "Wrong size of bfloat16_t");
901
1693
 
902
- template <class T>
903
- using RemoveConst = typename RemoveConstT<T>::type;
1694
+ #pragma pack(pop)
1695
+
1696
+ #if HWY_HAVE_SCALAR_BF16_TYPE
1697
+ namespace detail {
904
1698
 
1699
+ #if HWY_HAVE_SCALAR_BF16_OPERATORS
905
1700
  template <class T>
906
- struct RemoveRefT {
907
- using type = T;
1701
+ struct SpecialFloatUnwrapArithOpOperandT<T, hwy::bfloat16_t, true> {
1702
+ using type = hwy::bfloat16_t::Native;
908
1703
  };
1704
+ #endif
1705
+
909
1706
  template <class T>
910
- struct RemoveRefT<T&> {
911
- using type = T;
1707
+ struct NativeSpecialFloatToWrapperT<T, hwy::bfloat16_t::Native> {
1708
+ using type = hwy::bfloat16_t;
912
1709
  };
913
- template <class T>
914
- struct RemoveRefT<T&&> {
915
- using type = T;
1710
+
1711
+ } // namespace detail
1712
+ #endif // HWY_HAVE_SCALAR_BF16_TYPE
1713
+
1714
+ #if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
1715
+ namespace detail {
1716
+
1717
+ template <>
1718
+ struct BitCastScalarSrcCastHelper<hwy::bfloat16_t> {
1719
+ #if HWY_HAVE_SCALAR_BF16_TYPE
1720
+ static HWY_INLINE constexpr const hwy::bfloat16_t::Native& CastSrcValRef(
1721
+ const hwy::bfloat16_t& val) {
1722
+ return val.native;
1723
+ }
1724
+ #else
1725
+ static HWY_INLINE constexpr const uint16_t& CastSrcValRef(
1726
+ const hwy::bfloat16_t& val) {
1727
+ return val.bits;
1728
+ }
1729
+ #endif
916
1730
  };
917
1731
 
918
- template <class T>
919
- using RemoveRef = typename RemoveRefT<T>::type;
1732
+ } // namespace detail
1733
+ #endif // HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
1734
+
1735
+ HWY_API HWY_BF16_CONSTEXPR float F32FromBF16(bfloat16_t bf) {
1736
+ #if HWY_HAVE_SCALAR_BF16_OPERATORS
1737
+ return static_cast<float>(bf);
1738
+ #else
1739
+ return BitCastScalar<float>(static_cast<uint32_t>(
1740
+ static_cast<uint32_t>(BitCastScalar<uint16_t>(bf)) << 16));
1741
+ #endif
1742
+ }
1743
+
1744
+ HWY_API HWY_BF16_CONSTEXPR bfloat16_t BF16FromF32(float f) {
1745
+ #if HWY_HAVE_SCALAR_BF16_OPERATORS
1746
+ return static_cast<bfloat16_t>(f);
1747
+ #else
1748
+ return bfloat16_t::FromBits(
1749
+ static_cast<uint16_t>(BitCastScalar<uint32_t>(f) >> 16));
1750
+ #endif
1751
+ }
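
Note that the non-native branch above simply keeps the upper 16 bits of the float representation, i.e. it truncates rather than rounds. A standalone sketch of that fallback (the function name is illustrative):

#include <cstdint>
#include <cstdio>
#include <cstring>

// Standalone equivalent of the non-native BF16FromF32 fallback above: keep the
// upper 16 bits of the float's bit pattern (truncation, no rounding).
static uint16_t BF16BitsFromF32(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  return static_cast<uint16_t>(bits >> 16);
}

int main() {
  // 1.0f is 0x3F800000, so its bf16 encoding is 0x3F80. 1.00390625f
  // (0x3F808000) lies exactly between two bf16 values and also truncates
  // to 0x3F80 on this path.
  std::printf("%04X %04X\n", static_cast<unsigned>(BF16BitsFromF32(1.0f)),
              static_cast<unsigned>(BF16BitsFromF32(1.00390625f)));
  return 0;
}
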
1752
+
1753
+ HWY_API HWY_BF16_CONSTEXPR bfloat16_t BF16FromF64(double f64) {
1754
+ #if HWY_HAVE_SCALAR_BF16_OPERATORS
1755
+ return static_cast<bfloat16_t>(f64);
1756
+ #else
1757
+ // The mantissa bits of f64 are first rounded using round-to-odd rounding
1758
+ // to the nearest f64 value that has the lower 38 bits zeroed out to
1759
+ // ensure that the result is correctly rounded to a BF16.
1760
+
1761
+ // The F64 round-to-odd operation below will round a normal F64 value
1762
+ // (using round-to-odd rounding) to a F64 value that has 15 bits of precision.
1763
+
1764
+ // It is okay if the magnitude of a denormal F64 value is rounded up in the
1765
+ // F64 round-to-odd step below as the magnitude of a denormal F64 value is
1766
+ // much smaller than 2^(-133) (the smallest positive denormal BF16 value).
1767
+
1768
+ // It is also okay if bit 38 of a NaN F64 value is changed by the F64
1769
+ // round-to-odd step below as the lower 16 bits of a F32 NaN value are usually
1770
+ // discarded or ignored by the conversion of a F32 NaN value to a BF16.
1771
+
1772
+ // If f64 is a NaN, the result of the F64 round-to-odd step is still a NaN,
1773
+ // because the rounded value keeps at least one nonzero mantissa bit
1774
+ // whenever f64 is a NaN.
1775
+
1776
+ // The F64 round-to-odd step below will ensure that the F64 to F32 conversion
1777
+ // is exact if the magnitude of the rounded F64 value (using round-to-odd
1778
+ // rounding) is between 2^(-135) (one-fourth of the smallest positive denormal
1779
+ // BF16 value) and HighestValue<float>() (the largest finite F32 value).
1780
+
1781
+ // If |f64| is less than 2^(-135), the magnitude of the result of the F64 to
1782
+ // F32 conversion is guaranteed to be less than or equal to 2^(-135), which
1783
+ // ensures that the F32 to BF16 conversion is correctly rounded, even if the
1784
+ // conversion of a rounded F64 value whose magnitude is less than 2^(-135)
1785
+ // to a F32 is inexact.
1786
+
1787
+ return BF16FromF32(
1788
+ static_cast<float>(BitCastScalar<double>(static_cast<uint64_t>(
1789
+ (BitCastScalar<uint64_t>(f64) & 0xFFFFFFC000000000ULL) |
1790
+ ((BitCastScalar<uint64_t>(f64) + 0x0000003FFFFFFFFFULL) &
1791
+ 0x0000004000000000ULL)))));
1792
+ #endif
1793
+ }
1794
+
1795
+ // More convenient to define outside bfloat16_t because these may use
1796
+ // F32FromBF16, which is defined after the struct.
1797
+
1798
+ HWY_BF16_CONSTEXPR inline bool operator==(bfloat16_t lhs,
1799
+ bfloat16_t rhs) noexcept {
1800
+ #if HWY_HAVE_SCALAR_BF16_OPERATORS
1801
+ return lhs.native == rhs.native;
1802
+ #else
1803
+ return F32FromBF16(lhs) == F32FromBF16(rhs);
1804
+ #endif
1805
+ }
1806
+
1807
+ HWY_BF16_CONSTEXPR inline bool operator!=(bfloat16_t lhs,
1808
+ bfloat16_t rhs) noexcept {
1809
+ #if HWY_HAVE_SCALAR_BF16_OPERATORS
1810
+ return lhs.native != rhs.native;
1811
+ #else
1812
+ return F32FromBF16(lhs) != F32FromBF16(rhs);
1813
+ #endif
1814
+ }
1815
+ HWY_BF16_CONSTEXPR inline bool operator<(bfloat16_t lhs,
1816
+ bfloat16_t rhs) noexcept {
1817
+ #if HWY_HAVE_SCALAR_BF16_OPERATORS
1818
+ return lhs.native < rhs.native;
1819
+ #else
1820
+ return F32FromBF16(lhs) < F32FromBF16(rhs);
1821
+ #endif
1822
+ }
1823
+ HWY_BF16_CONSTEXPR inline bool operator<=(bfloat16_t lhs,
1824
+ bfloat16_t rhs) noexcept {
1825
+ #if HWY_HAVE_SCALAR_BF16_OPERATORS
1826
+ return lhs.native <= rhs.native;
1827
+ #else
1828
+ return F32FromBF16(lhs) <= F32FromBF16(rhs);
1829
+ #endif
1830
+ }
1831
+ HWY_BF16_CONSTEXPR inline bool operator>(bfloat16_t lhs,
1832
+ bfloat16_t rhs) noexcept {
1833
+ #if HWY_HAVE_SCALAR_BF16_OPERATORS
1834
+ return lhs.native > rhs.native;
1835
+ #else
1836
+ return F32FromBF16(lhs) > F32FromBF16(rhs);
1837
+ #endif
1838
+ }
1839
+ HWY_BF16_CONSTEXPR inline bool operator>=(bfloat16_t lhs,
1840
+ bfloat16_t rhs) noexcept {
1841
+ #if HWY_HAVE_SCALAR_BF16_OPERATORS
1842
+ return lhs.native >= rhs.native;
1843
+ #else
1844
+ return F32FromBF16(lhs) >= F32FromBF16(rhs);
1845
+ #endif
1846
+ }
1847
+ #if HWY_HAVE_CXX20_THREE_WAY_COMPARE
1848
+ HWY_BF16_CONSTEXPR inline std::partial_ordering operator<=>(
1849
+ bfloat16_t lhs, bfloat16_t rhs) noexcept {
1850
+ #if HWY_HAVE_SCALAR_BF16_OPERATORS
1851
+ return lhs.native <=> rhs.native;
1852
+ #else
1853
+ return F32FromBF16(lhs) <=> F32FromBF16(rhs);
1854
+ #endif
1855
+ }
1856
+ #endif // HWY_HAVE_CXX20_THREE_WAY_COMPARE
920
1857
 
921
1858
  //------------------------------------------------------------------------------
922
1859
  // Type relations
@@ -1110,25 +2047,19 @@ constexpr auto IsFloatTag() -> hwy::SizeTag<(R::is_float ? 0x200 : 0x400)> {
1110
2047
 
1111
2048
  template <typename T>
1112
2049
  HWY_API constexpr bool IsFloat3264() {
1113
- return IsSame<T, float>() || IsSame<T, double>();
2050
+ return IsSameEither<RemoveCvRef<T>, float, double>();
1114
2051
  }
1115
2052
 
1116
2053
  template <typename T>
1117
2054
  HWY_API constexpr bool IsFloat() {
1118
2055
  // Cannot use T(1.25) != T(1) for float16_t, which can only be converted to or
1119
2056
  // from a float, not compared. Include float16_t in case HWY_HAVE_FLOAT16=1.
1120
- return IsSame<T, float16_t>() || IsFloat3264<T>();
1121
- }
1122
-
1123
- // These types are often special-cased and not supported in all ops.
1124
- template <typename T>
1125
- HWY_API constexpr bool IsSpecialFloat() {
1126
- return IsSame<T, float16_t>() || IsSame<T, bfloat16_t>();
2057
+ return IsSame<RemoveCvRef<T>, float16_t>() || IsFloat3264<T>();
1127
2058
  }
1128
2059
 
1129
2060
  template <typename T>
1130
2061
  HWY_API constexpr bool IsSigned() {
1131
- return T(0) > T(-1);
2062
+ return static_cast<T>(0) > static_cast<T>(-1);
1132
2063
  }
1133
2064
  template <>
1134
2065
  constexpr bool IsSigned<float16_t>() {
@@ -1138,104 +2069,113 @@ template <>
1138
2069
  constexpr bool IsSigned<bfloat16_t>() {
1139
2070
  return true;
1140
2071
  }
2072
+ template <>
2073
+ constexpr bool IsSigned<hwy::uint128_t>() {
2074
+ return false;
2075
+ }
2076
+ template <>
2077
+ constexpr bool IsSigned<hwy::K64V64>() {
2078
+ return false;
2079
+ }
2080
+ template <>
2081
+ constexpr bool IsSigned<hwy::K32V32>() {
2082
+ return false;
2083
+ }
2084
+
2085
+ template <typename T, bool = IsInteger<T>() && !IsIntegerLaneType<T>()>
2086
+ struct MakeLaneTypeIfIntegerT {
2087
+ using type = T;
2088
+ };
2089
+
2090
+ template <typename T>
2091
+ struct MakeLaneTypeIfIntegerT<T, true> {
2092
+ using type = hwy::If<IsSigned<T>(), SignedFromSize<sizeof(T)>,
2093
+ UnsignedFromSize<sizeof(T)>>;
2094
+ };
2095
+
2096
+ template <typename T>
2097
+ using MakeLaneTypeIfInteger = typename MakeLaneTypeIfIntegerT<T>::type;
1141
2098
 
1142
2099
  // Largest/smallest representable integer values.
1143
2100
  template <typename T>
1144
2101
  HWY_API constexpr T LimitsMax() {
1145
- static_assert(!IsFloat<T>(), "Only for integer types");
1146
- using TU = MakeUnsigned<T>;
1147
- return static_cast<T>(IsSigned<T>() ? (static_cast<TU>(~0ull) >> 1)
1148
- : static_cast<TU>(~0ull));
2102
+ static_assert(IsInteger<T>(), "Only for integer types");
2103
+ using TU = UnsignedFromSize<sizeof(T)>;
2104
+ return static_cast<T>(IsSigned<T>() ? (static_cast<TU>(~TU(0)) >> 1)
2105
+ : static_cast<TU>(~TU(0)));
1149
2106
  }
1150
2107
  template <typename T>
1151
2108
  HWY_API constexpr T LimitsMin() {
1152
- static_assert(!IsFloat<T>(), "Only for integer types");
1153
- return IsSigned<T>() ? T(-1) - LimitsMax<T>() : T(0);
2109
+ static_assert(IsInteger<T>(), "Only for integer types");
2110
+ return IsSigned<T>() ? static_cast<T>(-1) - LimitsMax<T>()
2111
+ : static_cast<T>(0);
1154
2112
  }
1155
2113
 
1156
2114
  // Largest/smallest representable value (integer or float). This naming avoids
1157
2115
  // confusion with numeric_limits<float>::min() (the smallest positive value).
1158
2116
  // Cannot be constexpr because we use CopySameSize for [b]float16_t.
1159
2117
  template <typename T>
1160
- HWY_API T LowestValue() {
2118
+ HWY_API HWY_BITCASTSCALAR_CONSTEXPR T LowestValue() {
1161
2119
  return LimitsMin<T>();
1162
2120
  }
1163
2121
  template <>
1164
- HWY_INLINE bfloat16_t LowestValue<bfloat16_t>() {
1165
- const uint16_t kBits = 0xFF7F; // -1.1111111 x 2^127
1166
- bfloat16_t ret;
1167
- CopySameSize(&kBits, &ret);
1168
- return ret;
2122
+ HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t LowestValue<bfloat16_t>() {
2123
+ return bfloat16_t::FromBits(uint16_t{0xFF7Fu}); // -1.1111111 x 2^127
1169
2124
  }
1170
2125
  template <>
1171
- HWY_INLINE float16_t LowestValue<float16_t>() {
1172
- const uint16_t kBits = 0xFBFF; // -1.1111111111 x 2^15
1173
- float16_t ret;
1174
- CopySameSize(&kBits, &ret);
1175
- return ret;
2126
+ HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t LowestValue<float16_t>() {
2127
+ return float16_t::FromBits(uint16_t{0xFBFFu}); // -1.1111111111 x 2^15
1176
2128
  }
1177
2129
  template <>
1178
- HWY_INLINE float LowestValue<float>() {
2130
+ HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float LowestValue<float>() {
1179
2131
  return -3.402823466e+38F;
1180
2132
  }
1181
2133
  template <>
1182
- HWY_INLINE double LowestValue<double>() {
2134
+ HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double LowestValue<double>() {
1183
2135
  return -1.7976931348623158e+308;
1184
2136
  }
1185
2137
 
1186
2138
  template <typename T>
1187
- HWY_API T HighestValue() {
2139
+ HWY_API HWY_BITCASTSCALAR_CONSTEXPR T HighestValue() {
1188
2140
  return LimitsMax<T>();
1189
2141
  }
1190
2142
  template <>
1191
- HWY_INLINE bfloat16_t HighestValue<bfloat16_t>() {
1192
- const uint16_t kBits = 0x7F7F; // 1.1111111 x 2^127
1193
- bfloat16_t ret;
1194
- CopySameSize(&kBits, &ret);
1195
- return ret;
2143
+ HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t HighestValue<bfloat16_t>() {
2144
+ return bfloat16_t::FromBits(uint16_t{0x7F7Fu}); // 1.1111111 x 2^127
1196
2145
  }
1197
2146
  template <>
1198
- HWY_INLINE float16_t HighestValue<float16_t>() {
1199
- const uint16_t kBits = 0x7BFF; // 1.1111111111 x 2^15
1200
- float16_t ret;
1201
- CopySameSize(&kBits, &ret);
1202
- return ret;
2147
+ HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t HighestValue<float16_t>() {
2148
+ return float16_t::FromBits(uint16_t{0x7BFFu}); // 1.1111111111 x 2^15
1203
2149
  }
1204
2150
  template <>
1205
- HWY_INLINE float HighestValue<float>() {
2151
+ HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float HighestValue<float>() {
1206
2152
  return 3.402823466e+38F;
1207
2153
  }
1208
2154
  template <>
1209
- HWY_INLINE double HighestValue<double>() {
2155
+ HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double HighestValue<double>() {
1210
2156
  return 1.7976931348623158e+308;
1211
2157
  }
1212
2158
 
1213
2159
  // Difference between 1.0 and the next representable value. Equal to
1214
2160
  // 1 / (1ULL << MantissaBits<T>()), but hard-coding ensures precision.
1215
2161
  template <typename T>
1216
- HWY_API T Epsilon() {
2162
+ HWY_API HWY_BITCASTSCALAR_CONSTEXPR T Epsilon() {
1217
2163
  return 1;
1218
2164
  }
1219
2165
  template <>
1220
- HWY_INLINE bfloat16_t Epsilon<bfloat16_t>() {
1221
- const uint16_t kBits = 0x3C00; // 0.0078125
1222
- bfloat16_t ret;
1223
- CopySameSize(&kBits, &ret);
1224
- return ret;
2166
+ HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t Epsilon<bfloat16_t>() {
2167
+ return bfloat16_t::FromBits(uint16_t{0x3C00u}); // 0.0078125
1225
2168
  }
1226
2169
  template <>
1227
- HWY_INLINE float16_t Epsilon<float16_t>() {
1228
- const uint16_t kBits = 0x1400; // 0.0009765625
1229
- float16_t ret;
1230
- CopySameSize(&kBits, &ret);
1231
- return ret;
2170
+ HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t Epsilon<float16_t>() {
2171
+ return float16_t::FromBits(uint16_t{0x1400u}); // 0.0009765625
1232
2172
  }
1233
2173
  template <>
1234
- HWY_INLINE float Epsilon<float>() {
2174
+ HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float Epsilon<float>() {
1235
2175
  return 1.192092896e-7f;
1236
2176
  }
1237
2177
  template <>
1238
- HWY_INLINE double Epsilon<double>() {
2178
+ HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double Epsilon<double>() {
1239
2179
  return 2.2204460492503131e-16;
1240
2180
  }
1241
2181
 
@@ -1278,7 +2218,8 @@ constexpr MakeUnsigned<T> SignMask() {
1278
2218
  // Returns bitmask of the exponent field in IEEE binary16/32/64.
1279
2219
  template <typename T>
1280
2220
  constexpr MakeUnsigned<T> ExponentMask() {
1281
- return (~(MakeUnsigned<T>{1} << MantissaBits<T>()) + 1) & ~SignMask<T>();
2221
+ return (~(MakeUnsigned<T>{1} << MantissaBits<T>()) + 1) &
2222
+ static_cast<MakeUnsigned<T>>(~SignMask<T>());
1282
2223
  }
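
For binary32, the masks produced by SignMask, ExponentMask and MantissaMask partition the 32 bits. The following standalone check writes the float constants out explicitly (illustrative only, independent of the generic helpers above):

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t kSignMask = 0x80000000u;
  const uint32_t kExponentMask = 0x7F800000u;  // 8 exponent bits
  const uint32_t kMantissaMask = 0x007FFFFFu;  // 23 mantissa bits
  // The three fields are disjoint and together cover all 32 bits.
  assert((kSignMask | kExponentMask | kMantissaMask) == 0xFFFFFFFFu);
  assert((kSignMask & kExponentMask) == 0);
  assert((kExponentMask & kMantissaMask) == 0);
  // ExponentMask's formula: negate (1 << MantissaBits) to obtain all bits at
  // and above the exponent, then clear the sign bit.
  assert(((~(uint32_t{1} << 23) + 1) & ~kSignMask) == kExponentMask);
  return 0;
}
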
1283
2224
 
1284
2225
  // Returns bitmask of the mantissa field in IEEE binary16/32/64.
@@ -1290,30 +2231,24 @@ constexpr MakeUnsigned<T> MantissaMask() {
1290
2231
  // Returns 1 << mantissa_bits as a floating-point number. All integers whose
1291
2232
  // absolute value are less than this can be represented exactly.
1292
2233
  template <typename T>
1293
- HWY_INLINE T MantissaEnd() {
2234
+ HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T MantissaEnd() {
1294
2235
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
1295
2236
  return 0;
1296
2237
  }
1297
2238
  template <>
1298
- HWY_INLINE bfloat16_t MantissaEnd<bfloat16_t>() {
1299
- const uint16_t kBits = 0x4300; // 1.0 x 2^7
1300
- bfloat16_t ret;
1301
- CopySameSize(&kBits, &ret);
1302
- return ret;
2239
+ HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t MantissaEnd<bfloat16_t>() {
2240
+ return bfloat16_t::FromBits(uint16_t{0x4300u}); // 1.0 x 2^7
1303
2241
  }
1304
2242
  template <>
1305
- HWY_INLINE float16_t MantissaEnd<float16_t>() {
1306
- const uint16_t kBits = 0x6400; // 1.0 x 2^10
1307
- float16_t ret;
1308
- CopySameSize(&kBits, &ret);
1309
- return ret;
2243
+ HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t MantissaEnd<float16_t>() {
2244
+ return float16_t::FromBits(uint16_t{0x6400u}); // 1.0 x 2^10
1310
2245
  }
1311
2246
  template <>
1312
- HWY_INLINE float MantissaEnd<float>() {
2247
+ HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float MantissaEnd<float>() {
1313
2248
  return 8388608.0f; // 1 << 23
1314
2249
  }
1315
2250
  template <>
1316
- HWY_INLINE double MantissaEnd<double>() {
2251
+ HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double MantissaEnd<double>() {
1317
2252
  // floating point literal with p52 requires C++17.
1318
2253
  return 4503599627370496.0; // 1 << 52
1319
2254
  }
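
MantissaEnd<T>() is the first power of two at which the spacing between consecutive values of T exceeds 1, so every integer of smaller magnitude is exactly representable. A standalone illustration for float, where MantissaEnd is 8388608 (2^23):

#include <cstdio>

int main() {
  // Below 2^23, consecutive integers are distinct floats.
  const float below = 8388607.0f;
  std::printf("%d\n", below + 1.0f != below);  // 1
  // At 2^23 the spacing becomes 2, so adding 1 rounds back down (ties to even).
  std::printf("%d\n", 8388608.0f + 1.0f == 8388608.0f);  // 1
  return 0;
}
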
@@ -1333,6 +2268,143 @@ constexpr MakeSigned<T> MaxExponentField() {
1333
2268
  return (MakeSigned<T>{1} << ExponentBits<T>()) - 1;
1334
2269
  }
1335
2270
 
2271
+ //------------------------------------------------------------------------------
2272
+ // Additional F16/BF16 operators
2273
+
2274
+ #if HWY_HAVE_SCALAR_F16_OPERATORS || HWY_HAVE_SCALAR_BF16_OPERATORS
2275
+
2276
+ #define HWY_RHS_SPECIAL_FLOAT_ARITH_OP(op, op_func, T2) \
2277
+ template < \
2278
+ typename T1, \
2279
+ hwy::EnableIf<hwy::IsInteger<RemoveCvRef<T1>>() || \
2280
+ hwy::IsFloat3264<RemoveCvRef<T1>>()>* = nullptr, \
2281
+ typename RawResultT = decltype(DeclVal<T1>() op DeclVal<T2::Native>()), \
2282
+ typename ResultT = detail::NativeSpecialFloatToWrapper<RawResultT>, \
2283
+ HWY_IF_CASTABLE(RawResultT, ResultT)> \
2284
+ static HWY_INLINE constexpr ResultT op_func(T1 a, T2 b) noexcept { \
2285
+ return static_cast<ResultT>(a op b.native); \
2286
+ }
2287
+
2288
+ #define HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(op, op_func, T1) \
2289
+ HWY_RHS_SPECIAL_FLOAT_ARITH_OP(op, op_func, T1) \
2290
+ template < \
2291
+ typename T2, \
2292
+ hwy::EnableIf<hwy::IsInteger<RemoveCvRef<T2>>() || \
2293
+ hwy::IsFloat3264<RemoveCvRef<T2>>()>* = nullptr, \
2294
+ typename RawResultT = decltype(DeclVal<T1::Native>() op DeclVal<T2>()), \
2295
+ typename ResultT = detail::NativeSpecialFloatToWrapper<RawResultT>, \
2296
+ HWY_IF_CASTABLE(RawResultT, ResultT)> \
2297
+ static HWY_INLINE constexpr ResultT op_func(T1 a, T2 b) noexcept { \
2298
+ return static_cast<ResultT>(a.native op b); \
2299
+ }
2300
+
2301
+ #if HWY_HAVE_SCALAR_F16_OPERATORS
2302
+ HWY_RHS_SPECIAL_FLOAT_ARITH_OP(+, operator+, float16_t)
2303
+ HWY_RHS_SPECIAL_FLOAT_ARITH_OP(-, operator-, float16_t)
2304
+ HWY_RHS_SPECIAL_FLOAT_ARITH_OP(*, operator*, float16_t)
2305
+ HWY_RHS_SPECIAL_FLOAT_ARITH_OP(/, operator/, float16_t)
2306
+ HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(==, operator==, float16_t)
2307
+ HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(!=, operator!=, float16_t)
2308
+ HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<, operator<, float16_t)
2309
+ HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=, operator<=, float16_t)
2310
+ HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>, operator>, float16_t)
2311
+ HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>=, operator>=, float16_t)
2312
+ #if HWY_HAVE_CXX20_THREE_WAY_COMPARE
2313
+ HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=>, operator<=>, float16_t)
2314
+ #endif
2315
+ #endif // HWY_HAVE_SCALAR_F16_OPERATORS
2316
+
2317
+ #if HWY_HAVE_SCALAR_BF16_OPERATORS
2318
+ HWY_RHS_SPECIAL_FLOAT_ARITH_OP(+, operator+, bfloat16_t)
2319
+ HWY_RHS_SPECIAL_FLOAT_ARITH_OP(-, operator-, bfloat16_t)
2320
+ HWY_RHS_SPECIAL_FLOAT_ARITH_OP(*, operator*, bfloat16_t)
2321
+ HWY_RHS_SPECIAL_FLOAT_ARITH_OP(/, operator/, bfloat16_t)
2322
+ HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(==, operator==, bfloat16_t)
2323
+ HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(!=, operator!=, bfloat16_t)
2324
+ HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<, operator<, bfloat16_t)
2325
+ HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=, operator<=, bfloat16_t)
2326
+ HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>, operator>, bfloat16_t)
2327
+ HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>=, operator>=, bfloat16_t)
2328
+ #if HWY_HAVE_CXX20_THREE_WAY_COMPARE
2329
+ HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=>, operator<=>, bfloat16_t)
2330
+ #endif
2331
+ #endif // HWY_HAVE_SCALAR_BF16_OPERATORS
2332
+
2333
+ #undef HWY_RHS_SPECIAL_FLOAT_ARITH_OP
2334
+ #undef HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP
2335
+
2336
+ #endif // HWY_HAVE_SCALAR_F16_OPERATORS || HWY_HAVE_SCALAR_BF16_OPERATORS
2337
+
2338
+ //------------------------------------------------------------------------------
2339
+ // Type conversions (after IsSpecialFloat)
2340
+
2341
+ HWY_API float F32FromF16Mem(const void* ptr) {
2342
+ float16_t f16;
2343
+ CopyBytes<2>(HWY_ASSUME_ALIGNED(ptr, 2), &f16);
2344
+ return F32FromF16(f16);
2345
+ }
2346
+
2347
+ HWY_API float F32FromBF16Mem(const void* ptr) {
2348
+ bfloat16_t bf;
2349
+ CopyBytes<2>(HWY_ASSUME_ALIGNED(ptr, 2), &bf);
2350
+ return F32FromBF16(bf);
2351
+ }
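
A short usage sketch for the memory helpers above, assuming the header is installed as hwy/base.h (the include path is an assumption of this sketch). The raw values are the f16 encodings of 1.0 and -2.0, stored as native-endian 16-bit integers so their bit patterns match what F32FromF16Mem reads.

#include <cstdint>
#include <cstdio>
#include "hwy/base.h"  // assumed install path for the header in this diff

int main() {
  const uint16_t raw[2] = {0x3C00u, 0xC000u};  // f16 bit patterns: 1.0, -2.0
  std::printf("%g %g\n", hwy::F32FromF16Mem(&raw[0]),
              hwy::F32FromF16Mem(&raw[1]));  // 1 -2
  return 0;
}
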
2352
+
2353
+ #if HWY_HAVE_SCALAR_F16_OPERATORS
2354
+ #define HWY_BF16_TO_F16_CONSTEXPR HWY_BF16_CONSTEXPR
2355
+ #else
2356
+ #define HWY_BF16_TO_F16_CONSTEXPR HWY_F16_CONSTEXPR
2357
+ #endif
2358
+
2359
+ // For casting from TFrom to TTo
2360
+ template <typename TTo, typename TFrom, HWY_IF_NOT_SPECIAL_FLOAT(TTo),
2361
+ HWY_IF_NOT_SPECIAL_FLOAT(TFrom), HWY_IF_NOT_SAME(TTo, TFrom)>
2362
+ HWY_API constexpr TTo ConvertScalarTo(const TFrom in) {
2363
+ return static_cast<TTo>(in);
2364
+ }
2365
+ template <typename TTo, typename TFrom, HWY_IF_F16(TTo),
2366
+ HWY_IF_NOT_SPECIAL_FLOAT(TFrom), HWY_IF_NOT_SAME(TFrom, double)>
2367
+ HWY_API constexpr TTo ConvertScalarTo(const TFrom in) {
2368
+ return F16FromF32(static_cast<float>(in));
2369
+ }
2370
+ template <typename TTo, HWY_IF_F16(TTo)>
2371
+ HWY_API HWY_BF16_TO_F16_CONSTEXPR TTo
2372
+ ConvertScalarTo(const hwy::bfloat16_t in) {
2373
+ return F16FromF32(F32FromBF16(in));
2374
+ }
2375
+ template <typename TTo, HWY_IF_F16(TTo)>
2376
+ HWY_API HWY_F16_CONSTEXPR TTo ConvertScalarTo(const double in) {
2377
+ return F16FromF64(in);
2378
+ }
2379
+ template <typename TTo, typename TFrom, HWY_IF_BF16(TTo),
2380
+ HWY_IF_NOT_SPECIAL_FLOAT(TFrom), HWY_IF_NOT_SAME(TFrom, double)>
2381
+ HWY_API HWY_BF16_CONSTEXPR TTo ConvertScalarTo(const TFrom in) {
2382
+ return BF16FromF32(static_cast<float>(in));
2383
+ }
2384
+ template <typename TTo, HWY_IF_BF16(TTo)>
2385
+ HWY_API HWY_BF16_TO_F16_CONSTEXPR TTo ConvertScalarTo(const hwy::float16_t in) {
2386
+ return BF16FromF32(F32FromF16(in));
2387
+ }
2388
+ template <typename TTo, HWY_IF_BF16(TTo)>
2389
+ HWY_API HWY_BF16_CONSTEXPR TTo ConvertScalarTo(const double in) {
2390
+ return BF16FromF64(in);
2391
+ }
2392
+ template <typename TTo, typename TFrom, HWY_IF_F16(TFrom),
2393
+ HWY_IF_NOT_SPECIAL_FLOAT(TTo)>
2394
+ HWY_API HWY_F16_CONSTEXPR TTo ConvertScalarTo(const TFrom in) {
2395
+ return static_cast<TTo>(F32FromF16(in));
2396
+ }
2397
+ template <typename TTo, typename TFrom, HWY_IF_BF16(TFrom),
2398
+ HWY_IF_NOT_SPECIAL_FLOAT(TTo)>
2399
+ HWY_API HWY_BF16_CONSTEXPR TTo ConvertScalarTo(TFrom in) {
2400
+ return static_cast<TTo>(F32FromBF16(in));
2401
+ }
2402
+ // Same: return unchanged
2403
+ template <typename TTo>
2404
+ HWY_API constexpr TTo ConvertScalarTo(TTo in) {
2405
+ return in;
2406
+ }
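
A brief usage sketch of ConvertScalarTo, again assuming an hwy/base.h install path (an assumption of this sketch). The double-to-f16 direction goes through F16FromF64; widening back goes through F32FromF16.

#include <cstdio>
#include "hwy/base.h"  // assumed install path for the header in this diff

int main() {
  const hwy::float16_t h = hwy::ConvertScalarTo<hwy::float16_t>(0.1);
  const double back = hwy::ConvertScalarTo<double>(h);
  std::printf("%.6f\n", back);  // approximately 0.099976 (an f16 near 0.1)
  // Same-type calls return the input unchanged.
  std::printf("%d\n", hwy::ConvertScalarTo<int>(42));  // 42
  return 0;
}
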
2407
+
1336
2408
  //------------------------------------------------------------------------------
1337
2409
  // Helper functions
1338
2410
 
@@ -1348,6 +2420,7 @@ constexpr inline size_t RoundUpTo(size_t what, size_t align) {
1348
2420
 
1349
2421
  // Undefined results for x == 0.
1350
2422
  HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
2423
+ HWY_DASSERT(x != 0);
1351
2424
  #if HWY_COMPILER_MSVC
1352
2425
  unsigned long index; // NOLINT
1353
2426
  _BitScanForward(&index, x);
@@ -1358,6 +2431,7 @@ HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
1358
2431
  }
1359
2432
 
1360
2433
  HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) {
2434
+ HWY_DASSERT(x != 0);
1361
2435
  #if HWY_COMPILER_MSVC
1362
2436
  #if HWY_ARCH_X86_64
1363
2437
  unsigned long index; // NOLINT
@@ -1383,6 +2457,7 @@ HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) {
1383
2457
 
1384
2458
  // Undefined results for x == 0.
1385
2459
  HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x) {
2460
+ HWY_DASSERT(x != 0);
1386
2461
  #if HWY_COMPILER_MSVC
1387
2462
  unsigned long index; // NOLINT
1388
2463
  _BitScanReverse(&index, x);
@@ -1393,6 +2468,7 @@ HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x) {
1393
2468
  }
1394
2469
 
1395
2470
  HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
2471
+ HWY_DASSERT(x != 0);
1396
2472
  #if HWY_COMPILER_MSVC
1397
2473
  #if HWY_ARCH_X86_64
1398
2474
  unsigned long index; // NOLINT
@@ -1416,26 +2492,48 @@ HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
1416
2492
  #endif // HWY_COMPILER_MSVC
1417
2493
  }
1418
2494
 
1419
- HWY_API size_t PopCount(uint64_t x) {
1420
- #if HWY_COMPILER_GCC // includes clang
1421
- return static_cast<size_t>(__builtin_popcountll(x));
1422
- // This instruction has a separate feature flag, but is often called from
1423
- // non-SIMD code, so we don't want to require dynamic dispatch. It was first
1424
- // supported by Intel in Nehalem (SSE4.2), but MSVC only predefines a macro
1425
- // for AVX, so check for that.
2495
+ template <class T, HWY_IF_INTEGER(RemoveCvRef<T>),
2496
+ HWY_IF_T_SIZE_ONE_OF(RemoveCvRef<T>, (1 << 1) | (1 << 2) | (1 << 4))>
2497
+ HWY_API size_t PopCount(T x) {
2498
+ uint32_t u32_x = static_cast<uint32_t>(
2499
+ static_cast<UnsignedFromSize<sizeof(RemoveCvRef<T>)>>(x));
2500
+
2501
+ #if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
2502
+ return static_cast<size_t>(__builtin_popcountl(u32_x));
2503
+ #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
2504
+ return static_cast<size_t>(_mm_popcnt_u32(u32_x));
2505
+ #else
2506
+ u32_x -= ((u32_x >> 1) & 0x55555555u);
2507
+ u32_x = (((u32_x >> 2) & 0x33333333u) + (u32_x & 0x33333333u));
2508
+ u32_x = (((u32_x >> 4) + u32_x) & 0x0F0F0F0Fu);
2509
+ u32_x += (u32_x >> 8);
2510
+ u32_x += (u32_x >> 16);
2511
+ return static_cast<size_t>(u32_x & 0x3Fu);
2512
+ #endif
2513
+ }
2514
+
2515
+ template <class T, HWY_IF_INTEGER(RemoveCvRef<T>),
2516
+ HWY_IF_T_SIZE(RemoveCvRef<T>, 8)>
2517
+ HWY_API size_t PopCount(T x) {
2518
+ uint64_t u64_x = static_cast<uint64_t>(
2519
+ static_cast<UnsignedFromSize<sizeof(RemoveCvRef<T>)>>(x));
2520
+
2521
+ #if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
2522
+ return static_cast<size_t>(__builtin_popcountll(u64_x));
1426
2523
  #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 && defined(__AVX__)
1427
- return _mm_popcnt_u64(x);
2524
+ return _mm_popcnt_u64(u64_x);
1428
2525
  #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
1429
- return _mm_popcnt_u32(static_cast<uint32_t>(x & 0xFFFFFFFFu)) +
1430
- _mm_popcnt_u32(static_cast<uint32_t>(x >> 32));
2526
+ return _mm_popcnt_u32(static_cast<uint32_t>(u64_x & 0xFFFFFFFFu)) +
2527
+ _mm_popcnt_u32(static_cast<uint32_t>(u64_x >> 32));
1431
2528
  #else
1432
- x -= ((x >> 1) & 0x5555555555555555ULL);
1433
- x = (((x >> 2) & 0x3333333333333333ULL) + (x & 0x3333333333333333ULL));
1434
- x = (((x >> 4) + x) & 0x0F0F0F0F0F0F0F0FULL);
1435
- x += (x >> 8);
1436
- x += (x >> 16);
1437
- x += (x >> 32);
1438
- return static_cast<size_t>(x & 0x7Fu);
2529
+ u64_x -= ((u64_x >> 1) & 0x5555555555555555ULL);
2530
+ u64_x = (((u64_x >> 2) & 0x3333333333333333ULL) +
2531
+ (u64_x & 0x3333333333333333ULL));
2532
+ u64_x = (((u64_x >> 4) + u64_x) & 0x0F0F0F0F0F0F0F0FULL);
2533
+ u64_x += (u64_x >> 8);
2534
+ u64_x += (u64_x >> 16);
2535
+ u64_x += (u64_x >> 32);
2536
+ return static_cast<size_t>(u64_x & 0x7Fu);
1439
2537
  #endif
1440
2538
  }
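
The portable #else branch above is the classic SWAR (SIMD-within-a-register) bit count. Here is a standalone copy with a few sanity checks, which can also be used to cross-check the intrinsic paths (the function name is illustrative):

#include <cassert>
#include <cstdint>

// Standalone copy of the portable 64-bit fallback above: sum bits in pairs,
// then nibbles, then fold the byte sums together.
static size_t PopCount64Portable(uint64_t x) {
  x -= (x >> 1) & 0x5555555555555555ULL;                        // 2-bit sums
  x = ((x >> 2) & 0x3333333333333333ULL) + (x & 0x3333333333333333ULL);
  x = ((x >> 4) + x) & 0x0F0F0F0F0F0F0F0FULL;                   // 8-bit sums
  x += x >> 8;
  x += x >> 16;
  x += x >> 32;
  return static_cast<size_t>(x & 0x7Fu);
}

int main() {
  assert(PopCount64Portable(0) == 0);
  assert(PopCount64Portable(0xFFFFFFFFFFFFFFFFULL) == 64);
  assert(PopCount64Portable(0x8000000000000001ULL) == 2);
  return 0;
}
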
1441
2539
 
@@ -1456,18 +2554,28 @@ template <typename TI>
1456
2554
  : static_cast<size_t>(FloorLog2(static_cast<TI>(x - 1)) + 1);
1457
2555
  }
1458
2556
 
1459
- template <typename T>
1460
- HWY_INLINE constexpr T AddWithWraparound(hwy::FloatTag /*tag*/, T t, size_t n) {
1461
- return t + static_cast<T>(n);
2557
+ template <typename T, typename T2, HWY_IF_FLOAT(T), HWY_IF_NOT_SPECIAL_FLOAT(T)>
2558
+ HWY_INLINE constexpr T AddWithWraparound(T t, T2 increment) {
2559
+ return t + static_cast<T>(increment);
1462
2560
  }
1463
2561
 
1464
- template <typename T>
1465
- HWY_INLINE constexpr T AddWithWraparound(hwy::NonFloatTag /*tag*/, T t,
1466
- size_t n) {
2562
+ template <typename T, typename T2, HWY_IF_SPECIAL_FLOAT(T)>
2563
+ HWY_INLINE constexpr T AddWithWraparound(T t, T2 increment) {
2564
+ return ConvertScalarTo<T>(ConvertScalarTo<float>(t) +
2565
+ ConvertScalarTo<float>(increment));
2566
+ }
2567
+
2568
+ template <typename T, typename T2, HWY_IF_NOT_FLOAT(T)>
2569
+ HWY_INLINE constexpr T AddWithWraparound(T t, T2 n) {
1467
2570
  using TU = MakeUnsigned<T>;
1468
- return static_cast<T>(
1469
- static_cast<TU>(static_cast<TU>(t) + static_cast<TU>(n)) &
1470
- hwy::LimitsMax<TU>());
2571
+ // Sub-int types would promote to int, not unsigned, which would trigger
2572
+ // warnings, so first promote to the largest unsigned type. Due to
2573
+ // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87519, which affected GCC 8
2574
+ // until fixed in 9.3, we use built-in types rather than uint64_t.
2575
+ return static_cast<T>(static_cast<TU>(
2576
+ static_cast<unsigned long long>(static_cast<unsigned long long>(t) +
2577
+ static_cast<unsigned long long>(n)) &
2578
+ uint64_t{hwy::LimitsMax<TU>()}));
1471
2579
  }
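
The unsigned-long-long detour above exists because a uint8_t or uint16_t operand would first promote to (signed) int. A standalone sketch of the same wraparound idea for uint8_t (names are illustrative):

#include <cstdint>
#include <cstdio>

// Modular addition for a narrow unsigned type, widened first so the
// arithmetic never happens in signed int after integer promotion.
static uint8_t AddWrapU8(uint8_t t, size_t n) {
  const unsigned long long sum =
      static_cast<unsigned long long>(t) + static_cast<unsigned long long>(n);
  return static_cast<uint8_t>(sum & 0xFFu);  // wrap to 8 bits
}

int main() {
  std::printf("%u\n", AddWrapU8(250, 10));  // 4, i.e. 260 mod 256
  return 0;
}
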
1472
2580
 
1473
2581
  #if HWY_COMPILER_MSVC && HWY_ARCH_X86_64
@@ -1494,7 +2602,120 @@ HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) {
1494
2602
  #endif
1495
2603
  }
1496
2604
 
2605
+ namespace detail {
2606
+
2607
+ template <typename T>
2608
+ static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T ScalarAbs(hwy::FloatTag /*tag*/,
2609
+ T val) {
2610
+ using TU = MakeUnsigned<T>;
2611
+ return BitCastScalar<T>(
2612
+ static_cast<TU>(BitCastScalar<TU>(val) & (~SignMask<T>())));
2613
+ }
2614
+
2615
+ template <typename T>
2616
+ static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T
2617
+ ScalarAbs(hwy::SpecialTag /*tag*/, T val) {
2618
+ return ScalarAbs(hwy::FloatTag(), val);
2619
+ }
2620
+
2621
+ template <typename T>
2622
+ static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T
2623
+ ScalarAbs(hwy::SignedTag /*tag*/, T val) {
2624
+ using TU = MakeUnsigned<T>;
2625
+ return (val < T{0}) ? static_cast<T>(TU{0} - static_cast<TU>(val)) : val;
2626
+ }
2627
+
2628
+ template <typename T>
2629
+ static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T
2630
+ ScalarAbs(hwy::UnsignedTag /*tag*/, T val) {
2631
+ return val;
2632
+ }
2633
+
2634
+ } // namespace detail
2635
+
2636
+ template <typename T>
2637
+ HWY_API HWY_BITCASTSCALAR_CONSTEXPR RemoveCvRef<T> ScalarAbs(T val) {
2638
+ using TVal = MakeLaneTypeIfInteger<
2639
+ detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>>;
2640
+ return detail::ScalarAbs(hwy::TypeTag<TVal>(), static_cast<TVal>(val));
2641
+ }
2642
+
2643
+ template <typename T>
2644
+ HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsNaN(T val) {
2645
+ using TF = detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>;
2646
+ using TU = MakeUnsigned<TF>;
2647
+ return (BitCastScalar<TU>(ScalarAbs(val)) > ExponentMask<TF>());
2648
+ }
2649
+
2650
+ template <typename T>
2651
+ HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsInf(T val) {
2652
+ using TF = detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>;
2653
+ using TU = MakeUnsigned<TF>;
2654
+ return static_cast<TU>(BitCastScalar<TU>(static_cast<TF>(val)) << 1) ==
2655
+ static_cast<TU>(MaxExponentTimes2<TF>());
2656
+ }
2657
+
2658
+ namespace detail {
2659
+
2660
+ template <typename T>
2661
+ static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsFinite(
2662
+ hwy::FloatTag /*tag*/, T val) {
2663
+ using TU = MakeUnsigned<T>;
2664
+ return (BitCastScalar<TU>(hwy::ScalarAbs(val)) < ExponentMask<T>());
2665
+ }
2666
+
2667
+ template <typename T>
2668
+ static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsFinite(
2669
+ hwy::NonFloatTag /*tag*/, T /*val*/) {
2670
+ // Integer values are always finite
2671
+ return true;
2672
+ }
2673
+
2674
+ } // namespace detail
2675
+
2676
+ template <typename T>
2677
+ HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsFinite(T val) {
2678
+ using TVal = MakeLaneTypeIfInteger<
2679
+ detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>>;
2680
+ return detail::ScalarIsFinite(hwy::IsFloatTag<TVal>(),
2681
+ static_cast<TVal>(val));
2682
+ }
2683
+
2684
+ template <typename T>
2685
+ HWY_API HWY_BITCASTSCALAR_CONSTEXPR RemoveCvRef<T> ScalarCopySign(T magn,
2686
+ T sign) {
2687
+ using TF = RemoveCvRef<detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>>;
2688
+ using TU = MakeUnsigned<TF>;
2689
+ return BitCastScalar<TF>(static_cast<TU>(
2690
+ (BitCastScalar<TU>(static_cast<TF>(magn)) & (~SignMask<TF>())) |
2691
+ (BitCastScalar<TU>(static_cast<TF>(sign)) & SignMask<TF>())));
2692
+ }
2693
+
2694
+ template <typename T>
2695
+ HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarSignBit(T val) {
2696
+ using TVal = MakeLaneTypeIfInteger<
2697
+ detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>>;
2698
+ using TU = MakeUnsigned<TVal>;
2699
+ return ((BitCastScalar<TU>(static_cast<TVal>(val)) & SignMask<TVal>()) != 0);
2700
+ }
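
The Scalar* helpers above classify values purely by bit pattern, which is what lets them also cover float16_t and bfloat16_t. A standalone illustration of the same masks for float, with the constants written out (illustrative only):

#include <cassert>
#include <cstdint>
#include <cstring>
#include <limits>

// Bit-pattern classification for float, mirroring the ScalarIsNaN /
// ScalarIsInf / ScalarSignBit logic above.
static uint32_t BitsOf(float f) {
  uint32_t b;
  std::memcpy(&b, &f, sizeof(b));
  return b;
}

int main() {
  const uint32_t kSignMask = 0x80000000u;
  const uint32_t kExponentMask = 0x7F800000u;
  const float inf = std::numeric_limits<float>::infinity();
  const float nan = std::numeric_limits<float>::quiet_NaN();

  // NaN: the magnitude bits are strictly greater than the exponent mask.
  assert((BitsOf(nan) & ~kSignMask) > kExponentMask);
  // Inf: shifting out the sign leaves exactly the exponent field set.
  assert((BitsOf(inf) << 1) == (kExponentMask << 1));
  // Sign bit: set for -0.0f even though -0.0f == 0.0f compares equal.
  assert((BitsOf(-0.0f) & kSignMask) != 0);
  return 0;
}
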
2701
+
1497
2702
  // Prevents the compiler from eliding the computations that led to "output".
2703
+ #if HWY_ARCH_PPC && (HWY_COMPILER_GCC || HWY_COMPILER_CLANG) && \
2704
+ !defined(_SOFT_FLOAT)
2705
+ // Workaround to avoid test failures on PPC if compiled with Clang
2706
+ template <class T, HWY_IF_F32(T)>
2707
+ HWY_API void PreventElision(T&& output) {
2708
+ asm volatile("" : "+f"(output)::"memory");
2709
+ }
2710
+ template <class T, HWY_IF_F64(T)>
2711
+ HWY_API void PreventElision(T&& output) {
2712
+ asm volatile("" : "+d"(output)::"memory");
2713
+ }
2714
+ template <class T, HWY_IF_NOT_FLOAT3264(T)>
2715
+ HWY_API void PreventElision(T&& output) {
2716
+ asm volatile("" : "+r"(output)::"memory");
2717
+ }
2718
+ #else
1498
2719
  template <class T>
1499
2720
  HWY_API void PreventElision(T&& output) {
1500
2721
  #if HWY_COMPILER_MSVC
@@ -1502,8 +2723,8 @@ HWY_API void PreventElision(T&& output) {
1502
2723
  // RTL constraints). Self-assignment with #pragma optimize("off") might be
1503
2724
  // expected to prevent elision, but it does not with MSVC 2015. Type-punning
1504
2725
  // with volatile pointers generates inefficient code on MSVC 2017.
1505
- static std::atomic<RemoveRef<T>> dummy;
1506
- dummy.store(output, std::memory_order_relaxed);
2726
+ static std::atomic<RemoveCvRef<T>> sink;
2727
+ sink.store(output, std::memory_order_relaxed);
1507
2728
  #else
1508
2729
  // Works by indicating to the compiler that "output" is being read and
1509
2730
  // modified. The +r constraint avoids unnecessary writes to memory, but only
@@ -1511,6 +2732,7 @@ HWY_API void PreventElision(T&& output) {
1511
2732
  asm volatile("" : "+r"(output) : : "memory");
1512
2733
  #endif
1513
2734
  }
2735
+ #endif
1514
2736
 
1515
2737
  } // namespace hwy
1516
2738