@img/sharp-libvips-dev 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/include/expat.h +21 -10
- package/include/expat_config.h +11 -5
- package/include/ffi.h +12 -25
- package/include/freetype2/freetype/config/ftoption.h +1 -1
- package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -2
- package/include/glib-2.0/gio/gapplication.h +6 -0
- package/include/glib-2.0/gio/giotypes.h +0 -1
- package/include/glib-2.0/girepository/giarginfo.h +23 -6
- package/include/glib-2.0/girepository/gibaseinfo.h +44 -18
- package/include/glib-2.0/girepository/gicallableinfo.h +26 -16
- package/include/glib-2.0/girepository/gicallbackinfo.h +17 -2
- package/include/glib-2.0/girepository/giconstantinfo.h +19 -4
- package/include/glib-2.0/girepository/gienuminfo.h +20 -21
- package/include/glib-2.0/girepository/gifieldinfo.h +22 -7
- package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
- package/include/glib-2.0/girepository/gifunctioninfo.h +22 -7
- package/include/glib-2.0/girepository/giinterfaceinfo.h +33 -18
- package/include/glib-2.0/girepository/giobjectinfo.h +41 -26
- package/include/glib-2.0/girepository/gipropertyinfo.h +18 -3
- package/include/glib-2.0/girepository/giregisteredtypeinfo.h +22 -11
- package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
- package/include/glib-2.0/girepository/girepository.h +53 -62
- package/include/glib-2.0/girepository/girffi.h +8 -7
- package/include/glib-2.0/girepository/gisignalinfo.h +18 -3
- package/include/glib-2.0/girepository/gistructinfo.h +26 -11
- package/include/glib-2.0/girepository/gitypeinfo.h +29 -16
- package/include/glib-2.0/girepository/gitypelib.h +9 -13
- package/include/glib-2.0/girepository/gitypes.h +52 -104
- package/include/glib-2.0/girepository/giunioninfo.h +28 -12
- package/include/glib-2.0/girepository/giunresolvedinfo.h +17 -2
- package/include/glib-2.0/girepository/givalueinfo.h +65 -0
- package/include/glib-2.0/girepository/givfuncinfo.h +23 -8
- package/include/glib-2.0/glib/deprecated/gthread.h +9 -5
- package/include/glib-2.0/glib/gbitlock.h +31 -0
- package/include/glib-2.0/glib/gmessages.h +8 -0
- package/include/glib-2.0/glib/gslice.h +2 -0
- package/include/glib-2.0/glib/gstrfuncs.h +24 -18
- package/include/glib-2.0/glib/gthread.h +191 -3
- package/include/glib-2.0/glib-unix.h +7 -1
- package/include/glib-2.0/gobject/genums.h +6 -6
- package/include/glib-2.0/gobject/glib-types.h +11 -0
- package/include/glib-2.0/gobject/gsignal.h +16 -6
- package/include/hwy/aligned_allocator.h +171 -6
- package/include/hwy/base.h +1765 -543
- package/include/hwy/cache_control.h +24 -6
- package/include/hwy/detect_compiler_arch.h +23 -2
- package/include/hwy/detect_targets.h +56 -13
- package/include/hwy/foreach_target.h +24 -0
- package/include/hwy/highway.h +20 -3
- package/include/hwy/ops/arm_neon-inl.h +1086 -667
- package/include/hwy/ops/arm_sve-inl.h +1091 -235
- package/include/hwy/ops/emu128-inl.h +271 -196
- package/include/hwy/ops/generic_ops-inl.h +2270 -399
- package/include/hwy/ops/ppc_vsx-inl.h +1786 -563
- package/include/hwy/ops/rvv-inl.h +1043 -311
- package/include/hwy/ops/scalar-inl.h +189 -159
- package/include/hwy/ops/set_macros-inl.h +66 -6
- package/include/hwy/ops/shared-inl.h +175 -56
- package/include/hwy/ops/wasm_128-inl.h +153 -136
- package/include/hwy/ops/x86_128-inl.h +1647 -646
- package/include/hwy/ops/x86_256-inl.h +1003 -370
- package/include/hwy/ops/x86_512-inl.h +948 -353
- package/include/hwy/per_target.h +4 -0
- package/include/hwy/profiler.h +648 -0
- package/include/hwy/robust_statistics.h +2 -2
- package/include/hwy/targets.h +18 -11
- package/include/hwy/timer.h +11 -0
- package/include/libpng16/png.h +32 -29
- package/include/libpng16/pngconf.h +2 -2
- package/include/libpng16/pnglibconf.h +7 -2
- package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
- package/include/libxml2/libxml/parser.h +16 -7
- package/include/libxml2/libxml/xmlIO.h +0 -1
- package/include/libxml2/libxml/xmlversion.h +4 -4
- package/include/pango-1.0/pango/pango-features.h +3 -3
- package/include/pango-1.0/pango/pango-fontmap.h +7 -0
- package/include/pixman-1/pixman-version.h +2 -2
- package/include/png.h +32 -29
- package/include/pngconf.h +2 -2
- package/include/pnglibconf.h +7 -2
- package/include/vips/connection.h +9 -3
- package/include/vips/util.h +0 -9
- package/include/vips/version.h +4 -4
- package/package.json +1 -1
- package/versions.json +11 -11
package/include/hwy/base.h
CHANGED
|
@@ -22,16 +22,15 @@
|
|
|
22
22
|
#include <stddef.h>
|
|
23
23
|
#include <stdint.h>
|
|
24
24
|
|
|
25
|
-
// Wrapping this into a HWY_HAS_INCLUDE causes clang-format to fail.
|
|
26
|
-
#if __cplusplus >= 202100L && defined(__has_include)
|
|
27
|
-
#if __has_include(<stdfloat>)
|
|
28
|
-
#include <stdfloat> // std::float16_t
|
|
29
|
-
#endif
|
|
30
|
-
#endif
|
|
31
|
-
|
|
32
25
|
#include "hwy/detect_compiler_arch.h"
|
|
33
26
|
#include "hwy/highway_export.h"
|
|
34
27
|
|
|
28
|
+
#if HWY_COMPILER_MSVC && defined(_MSVC_LANG) && _MSVC_LANG > __cplusplus
|
|
29
|
+
#define HWY_CXX_LANG _MSVC_LANG
|
|
30
|
+
#else
|
|
31
|
+
#define HWY_CXX_LANG __cplusplus
|
|
32
|
+
#endif
|
|
33
|
+
|
|
35
34
|
// "IWYU pragma: keep" does not work for these includes, so hide from the IDE.
|
|
36
35
|
#if !HWY_IDE
|
|
37
36
|
|
|
@@ -48,6 +47,15 @@
|
|
|
48
47
|
|
|
49
48
|
#endif // !HWY_IDE
|
|
50
49
|
|
|
50
|
+
#if !defined(HWY_NO_LIBCXX) && HWY_CXX_LANG > 201703L && \
|
|
51
|
+
__cpp_impl_three_way_comparison >= 201907L && defined(__has_include) && \
|
|
52
|
+
!defined(HWY_DISABLE_CXX20_THREE_WAY_COMPARE)
|
|
53
|
+
#if __has_include(<compare>)
|
|
54
|
+
#include <compare>
|
|
55
|
+
#define HWY_HAVE_CXX20_THREE_WAY_COMPARE 1
|
|
56
|
+
#endif
|
|
57
|
+
#endif
|
|
58
|
+
|
|
51
59
|
// IWYU pragma: end_exports
|
|
52
60
|
|
|
53
61
|
#if HWY_COMPILER_MSVC
|
|
@@ -131,6 +139,10 @@ namespace hwy {
|
|
|
131
139
|
#define HWY_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */
|
|
132
140
|
#endif
|
|
133
141
|
|
|
142
|
+
// Special case to increase the required alignment
|
|
143
|
+
#define HWY_RCAST_ALIGNED(type, ptr) \
|
|
144
|
+
reinterpret_cast<type>(HWY_ASSUME_ALIGNED((ptr), alignof(type)))
|
|
145
|
+
|
|
134
146
|
// Clang and GCC require attributes on each function into which SIMD intrinsics
|
|
135
147
|
// are inlined. Support both per-function annotation (HWY_ATTR) for lambdas and
|
|
136
148
|
// automatic annotation via pragmas.
|
|
@@ -274,6 +286,16 @@ HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
|
|
|
274
286
|
} while (0)
|
|
275
287
|
#endif
|
|
276
288
|
|
|
289
|
+
#if __cpp_constexpr >= 201304L
|
|
290
|
+
#define HWY_CXX14_CONSTEXPR constexpr
|
|
291
|
+
#else
|
|
292
|
+
#define HWY_CXX14_CONSTEXPR
|
|
293
|
+
#endif
|
|
294
|
+
|
|
295
|
+
#ifndef HWY_HAVE_CXX20_THREE_WAY_COMPARE
|
|
296
|
+
#define HWY_HAVE_CXX20_THREE_WAY_COMPARE 0
|
|
297
|
+
#endif
|
|
298
|
+
|
|
277
299
|
//------------------------------------------------------------------------------
|
|
278
300
|
// CopyBytes / ZeroBytes
|
|
279
301
|
|
|
@@ -288,8 +310,7 @@ HWY_API void CopyBytes(const From* from, To* to) {
|
|
|
288
310
|
#if HWY_COMPILER_MSVC
|
|
289
311
|
memcpy(to, from, kBytes);
|
|
290
312
|
#else
|
|
291
|
-
__builtin_memcpy(
|
|
292
|
-
kBytes);
|
|
313
|
+
__builtin_memcpy(to, from, kBytes);
|
|
293
314
|
#endif
|
|
294
315
|
}
|
|
295
316
|
|
|
@@ -357,349 +378,11 @@ static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
|
|
|
357
378
|
//------------------------------------------------------------------------------
|
|
358
379
|
// Lane types
|
|
359
380
|
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
// float16_t
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
#if (HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC) || \
|
|
366
|
-
(HWY_COMPILER_CLANG && defined(__ARM_FP) && (__ARM_FP & 2)) || \
|
|
367
|
-
(HWY_COMPILER_GCC_ACTUAL && defined(__ARM_FP16_FORMAT_IEEE))
|
|
368
|
-
#define HWY_NEON_HAVE_FLOAT16C 1
|
|
369
|
-
#else
|
|
370
|
-
#define HWY_NEON_HAVE_FLOAT16C 0
|
|
371
|
-
#endif
|
|
372
|
-
|
|
373
|
-
// C11 extension ISO/IEC TS 18661-3:2015 but not supported on all targets.
|
|
374
|
-
// Required if HWY_HAVE_FLOAT16, i.e. RVV with zvfh or AVX3_SPR (with
|
|
375
|
-
// sufficiently new compiler supporting avx512fp16). Do not use on clang-cl,
|
|
376
|
-
// which is missing __extendhfsf2.
|
|
377
|
-
#if ((HWY_ARCH_RVV && defined(__riscv_zvfh) && HWY_COMPILER_CLANG) || \
|
|
378
|
-
(HWY_ARCH_X86 && defined(__SSE2__) && \
|
|
379
|
-
((HWY_COMPILER_CLANG >= 1600 && !HWY_COMPILER_CLANGCL) || \
|
|
380
|
-
HWY_COMPILER_GCC_ACTUAL >= 1200)))
|
|
381
|
-
#define HWY_HAVE_C11_FLOAT16 1
|
|
382
|
-
#else
|
|
383
|
-
#define HWY_HAVE_C11_FLOAT16 0
|
|
384
|
-
#endif
|
|
385
|
-
|
|
386
|
-
// If 1, both __bf16 and a limited set of *_bf16 SVE intrinsics are available:
|
|
387
|
-
// create/get/set/dup, ld/st, sel, rev, trn, uzp, zip.
|
|
388
|
-
#if HWY_ARCH_ARM_A64 && defined(__ARM_FEATURE_SVE_BF16)
|
|
389
|
-
#define HWY_SVE_HAVE_BFLOAT16 1
|
|
390
|
-
#else
|
|
391
|
-
#define HWY_SVE_HAVE_BFLOAT16 0
|
|
392
|
-
#endif
|
|
393
|
-
|
|
394
|
-
// Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
|
|
395
|
-
// by concatenating base type and bits. We use a wrapper class instead of a
|
|
396
|
-
// typedef to the native type to ensure that the same symbols, e.g. for VQSort,
|
|
397
|
-
// are generated regardless of F16 support; see #1684.
|
|
398
|
-
struct float16_t {
|
|
399
|
-
#if HWY_NEON_HAVE_FLOAT16C // ACLE's __fp16
|
|
400
|
-
using Raw = __fp16;
|
|
401
|
-
#elif HWY_HAVE_C11_FLOAT16 // C11 _Float16
|
|
402
|
-
using Raw = _Float16;
|
|
403
|
-
#elif __cplusplus > 202002L && defined(__STDCPP_FLOAT16_T__) // C++23
|
|
404
|
-
using Raw = std::float16_t;
|
|
405
|
-
#else
|
|
406
|
-
#define HWY_EMULATE_FLOAT16
|
|
407
|
-
using Raw = uint16_t;
|
|
408
|
-
Raw bits;
|
|
409
|
-
#endif // float16_t
|
|
410
|
-
|
|
411
|
-
// When backed by a native type, ensure the wrapper behaves like the native
|
|
412
|
-
// type by forwarding all operators. Unfortunately it seems difficult to reuse
|
|
413
|
-
// this code in a base class, so we repeat it in bfloat16_t.
|
|
414
|
-
#ifndef HWY_EMULATE_FLOAT16
|
|
415
|
-
Raw raw;
|
|
416
|
-
|
|
417
|
-
float16_t() noexcept = default;
|
|
418
|
-
template <typename T>
|
|
419
|
-
constexpr float16_t(T arg) noexcept : raw(static_cast<Raw>(arg)) {}
|
|
420
|
-
float16_t& operator=(Raw arg) noexcept {
|
|
421
|
-
raw = arg;
|
|
422
|
-
return *this;
|
|
423
|
-
}
|
|
424
|
-
constexpr float16_t(const float16_t&) noexcept = default;
|
|
425
|
-
float16_t& operator=(const float16_t&) noexcept = default;
|
|
426
|
-
constexpr operator Raw() const noexcept { return raw; }
|
|
427
|
-
|
|
428
|
-
template <typename T>
|
|
429
|
-
float16_t& operator+=(T rhs) noexcept {
|
|
430
|
-
raw = static_cast<Raw>(raw + rhs);
|
|
431
|
-
return *this;
|
|
432
|
-
}
|
|
433
|
-
|
|
434
|
-
template <typename T>
|
|
435
|
-
float16_t& operator-=(T rhs) noexcept {
|
|
436
|
-
raw = static_cast<Raw>(raw - rhs);
|
|
437
|
-
return *this;
|
|
438
|
-
}
|
|
439
|
-
|
|
440
|
-
template <typename T>
|
|
441
|
-
float16_t& operator*=(T rhs) noexcept {
|
|
442
|
-
raw = static_cast<Raw>(raw * rhs);
|
|
443
|
-
return *this;
|
|
444
|
-
}
|
|
445
|
-
|
|
446
|
-
template <typename T>
|
|
447
|
-
float16_t& operator/=(T rhs) noexcept {
|
|
448
|
-
raw = static_cast<Raw>(raw / rhs);
|
|
449
|
-
return *this;
|
|
450
|
-
}
|
|
451
|
-
|
|
452
|
-
float16_t operator--() noexcept {
|
|
453
|
-
raw = static_cast<Raw>(raw - Raw{1});
|
|
454
|
-
return *this;
|
|
455
|
-
}
|
|
456
|
-
|
|
457
|
-
float16_t operator--(int) noexcept {
|
|
458
|
-
raw = static_cast<Raw>(raw - Raw{1});
|
|
459
|
-
return *this;
|
|
460
|
-
}
|
|
461
|
-
|
|
462
|
-
float16_t operator++() noexcept {
|
|
463
|
-
raw = static_cast<Raw>(raw + Raw{1});
|
|
464
|
-
return *this;
|
|
465
|
-
}
|
|
466
|
-
|
|
467
|
-
float16_t operator++(int) noexcept {
|
|
468
|
-
raw = static_cast<Raw>(raw + Raw{1});
|
|
469
|
-
return *this;
|
|
470
|
-
}
|
|
471
|
-
|
|
472
|
-
constexpr float16_t operator-() const noexcept {
|
|
473
|
-
return float16_t(static_cast<Raw>(-raw));
|
|
474
|
-
}
|
|
475
|
-
constexpr float16_t operator+() const noexcept { return *this; }
|
|
476
|
-
#endif // HWY_EMULATE_FLOAT16
|
|
477
|
-
};
|
|
478
|
-
|
|
479
|
-
#ifndef HWY_EMULATE_FLOAT16
|
|
480
|
-
constexpr inline bool operator==(float16_t lhs, float16_t rhs) noexcept {
|
|
481
|
-
return lhs.raw == rhs.raw;
|
|
482
|
-
}
|
|
483
|
-
constexpr inline bool operator!=(float16_t lhs, float16_t rhs) noexcept {
|
|
484
|
-
return lhs.raw != rhs.raw;
|
|
485
|
-
}
|
|
486
|
-
constexpr inline bool operator<(float16_t lhs, float16_t rhs) noexcept {
|
|
487
|
-
return lhs.raw < rhs.raw;
|
|
488
|
-
}
|
|
489
|
-
constexpr inline bool operator<=(float16_t lhs, float16_t rhs) noexcept {
|
|
490
|
-
return lhs.raw <= rhs.raw;
|
|
491
|
-
}
|
|
492
|
-
constexpr inline bool operator>(float16_t lhs, float16_t rhs) noexcept {
|
|
493
|
-
return lhs.raw > rhs.raw;
|
|
494
|
-
}
|
|
495
|
-
constexpr inline bool operator>=(float16_t lhs, float16_t rhs) noexcept {
|
|
496
|
-
return lhs.raw >= rhs.raw;
|
|
497
|
-
}
|
|
498
|
-
#endif // HWY_EMULATE_FLOAT16
|
|
499
|
-
|
|
500
|
-
struct bfloat16_t {
|
|
501
|
-
#if HWY_SVE_HAVE_BFLOAT16
|
|
502
|
-
using Raw = __bf16;
|
|
503
|
-
#elif __cplusplus >= 202100L && defined(__STDCPP_BFLOAT16_T__) // C++23
|
|
504
|
-
using Raw = std::bfloat16_t;
|
|
505
|
-
#else
|
|
506
|
-
#define HWY_EMULATE_BFLOAT16
|
|
507
|
-
using Raw = uint16_t;
|
|
508
|
-
Raw bits;
|
|
509
|
-
#endif
|
|
510
|
-
|
|
511
|
-
#ifndef HWY_EMULATE_BFLOAT16
|
|
512
|
-
Raw raw;
|
|
513
|
-
|
|
514
|
-
bfloat16_t() noexcept = default;
|
|
515
|
-
template <typename T>
|
|
516
|
-
constexpr bfloat16_t(T arg) noexcept : raw(static_cast<Raw>(arg)) {}
|
|
517
|
-
bfloat16_t& operator=(Raw arg) noexcept {
|
|
518
|
-
raw = arg;
|
|
519
|
-
return *this;
|
|
520
|
-
}
|
|
521
|
-
constexpr bfloat16_t(const bfloat16_t&) noexcept = default;
|
|
522
|
-
bfloat16_t& operator=(const bfloat16_t&) noexcept = default;
|
|
523
|
-
constexpr operator Raw() const noexcept { return raw; }
|
|
524
|
-
|
|
525
|
-
template <typename T>
|
|
526
|
-
bfloat16_t& operator+=(T rhs) noexcept {
|
|
527
|
-
raw = static_cast<Raw>(raw + rhs);
|
|
528
|
-
return *this;
|
|
529
|
-
}
|
|
530
|
-
|
|
531
|
-
template <typename T>
|
|
532
|
-
bfloat16_t& operator-=(T rhs) noexcept {
|
|
533
|
-
raw = static_cast<Raw>(raw - rhs);
|
|
534
|
-
return *this;
|
|
535
|
-
}
|
|
536
|
-
|
|
537
|
-
template <typename T>
|
|
538
|
-
bfloat16_t& operator*=(T rhs) noexcept {
|
|
539
|
-
raw = static_cast<Raw>(raw * rhs);
|
|
540
|
-
return *this;
|
|
541
|
-
}
|
|
542
|
-
|
|
543
|
-
template <typename T>
|
|
544
|
-
bfloat16_t& operator/=(T rhs) noexcept {
|
|
545
|
-
raw = static_cast<Raw>(raw / rhs);
|
|
546
|
-
return *this;
|
|
547
|
-
}
|
|
548
|
-
|
|
549
|
-
bfloat16_t operator--() noexcept {
|
|
550
|
-
raw = static_cast<Raw>(raw - Raw{1});
|
|
551
|
-
return *this;
|
|
552
|
-
}
|
|
553
|
-
|
|
554
|
-
bfloat16_t operator--(int) noexcept {
|
|
555
|
-
raw = static_cast<Raw>(raw - Raw{1});
|
|
556
|
-
return *this;
|
|
557
|
-
}
|
|
558
|
-
|
|
559
|
-
bfloat16_t operator++() noexcept {
|
|
560
|
-
raw = static_cast<Raw>(raw + Raw{1});
|
|
561
|
-
return *this;
|
|
562
|
-
}
|
|
563
|
-
|
|
564
|
-
bfloat16_t operator++(int) noexcept {
|
|
565
|
-
raw = static_cast<Raw>(raw + Raw{1});
|
|
566
|
-
return *this;
|
|
567
|
-
}
|
|
568
|
-
|
|
569
|
-
constexpr bfloat16_t operator-() const noexcept {
|
|
570
|
-
return bfloat16_t(static_cast<Raw>(-raw));
|
|
571
|
-
}
|
|
572
|
-
constexpr bfloat16_t operator+() const noexcept { return *this; }
|
|
573
|
-
#endif // HWY_EMULATE_BFLOAT16
|
|
574
|
-
};
|
|
575
|
-
|
|
576
|
-
#ifndef HWY_EMULATE_BFLOAT16
|
|
577
|
-
constexpr inline bool operator==(bfloat16_t lhs, bfloat16_t rhs) noexcept {
|
|
578
|
-
return lhs.raw == rhs.raw;
|
|
579
|
-
}
|
|
580
|
-
constexpr inline bool operator!=(bfloat16_t lhs, bfloat16_t rhs) noexcept {
|
|
581
|
-
return lhs.raw != rhs.raw;
|
|
582
|
-
}
|
|
583
|
-
constexpr inline bool operator<(bfloat16_t lhs, bfloat16_t rhs) noexcept {
|
|
584
|
-
return lhs.raw < rhs.raw;
|
|
585
|
-
}
|
|
586
|
-
constexpr inline bool operator<=(bfloat16_t lhs, bfloat16_t rhs) noexcept {
|
|
587
|
-
return lhs.raw <= rhs.raw;
|
|
588
|
-
}
|
|
589
|
-
constexpr inline bool operator>(bfloat16_t lhs, bfloat16_t rhs) noexcept {
|
|
590
|
-
return lhs.raw > rhs.raw;
|
|
591
|
-
}
|
|
592
|
-
constexpr inline bool operator>=(bfloat16_t lhs, bfloat16_t rhs) noexcept {
|
|
593
|
-
return lhs.raw >= rhs.raw;
|
|
594
|
-
}
|
|
595
|
-
#endif // HWY_EMULATE_BFLOAT16
|
|
596
|
-
|
|
597
|
-
#pragma pack(pop)
|
|
598
|
-
|
|
599
|
-
HWY_API float F32FromF16(float16_t f16) {
|
|
600
|
-
#ifdef HWY_EMULATE_FLOAT16
|
|
601
|
-
uint16_t bits16;
|
|
602
|
-
CopySameSize(&f16, &bits16);
|
|
603
|
-
const uint32_t sign = static_cast<uint32_t>(bits16 >> 15);
|
|
604
|
-
const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
|
|
605
|
-
const uint32_t mantissa = bits16 & 0x3FF;
|
|
606
|
-
|
|
607
|
-
// Subnormal or zero
|
|
608
|
-
if (biased_exp == 0) {
|
|
609
|
-
const float subnormal =
|
|
610
|
-
(1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024));
|
|
611
|
-
return sign ? -subnormal : subnormal;
|
|
612
|
-
}
|
|
613
|
-
|
|
614
|
-
// Normalized: convert the representation directly (faster than ldexp/tables).
|
|
615
|
-
const uint32_t biased_exp32 = biased_exp + (127 - 15);
|
|
616
|
-
const uint32_t mantissa32 = mantissa << (23 - 10);
|
|
617
|
-
const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
|
|
618
|
-
|
|
619
|
-
float result;
|
|
620
|
-
CopySameSize(&bits32, &result);
|
|
621
|
-
return result;
|
|
622
|
-
#else
|
|
623
|
-
return static_cast<float>(f16);
|
|
624
|
-
#endif
|
|
625
|
-
}
|
|
626
|
-
|
|
627
|
-
HWY_API float16_t F16FromF32(float f32) {
|
|
628
|
-
#ifdef HWY_EMULATE_FLOAT16
|
|
629
|
-
uint32_t bits32;
|
|
630
|
-
CopySameSize(&f32, &bits32);
|
|
631
|
-
const uint32_t sign = bits32 >> 31;
|
|
632
|
-
const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
|
|
633
|
-
const uint32_t mantissa32 = bits32 & 0x7FFFFF;
|
|
634
|
-
|
|
635
|
-
const int32_t exp = HWY_MIN(static_cast<int32_t>(biased_exp32) - 127, 15);
|
|
636
|
-
|
|
637
|
-
// Tiny or zero => zero.
|
|
638
|
-
float16_t out;
|
|
639
|
-
if (exp < -24) {
|
|
640
|
-
// restore original sign
|
|
641
|
-
const uint16_t bits = static_cast<uint16_t>(sign << 15);
|
|
642
|
-
CopySameSize(&bits, &out);
|
|
643
|
-
return out;
|
|
644
|
-
}
|
|
645
|
-
|
|
646
|
-
uint32_t biased_exp16, mantissa16;
|
|
647
|
-
|
|
648
|
-
// exp = [-24, -15] => subnormal
|
|
649
|
-
if (exp < -14) {
|
|
650
|
-
biased_exp16 = 0;
|
|
651
|
-
const uint32_t sub_exp = static_cast<uint32_t>(-14 - exp);
|
|
652
|
-
HWY_DASSERT(1 <= sub_exp && sub_exp < 11);
|
|
653
|
-
mantissa16 = static_cast<uint32_t>((1u << (10 - sub_exp)) +
|
|
654
|
-
(mantissa32 >> (13 + sub_exp)));
|
|
655
|
-
} else {
|
|
656
|
-
// exp = [-14, 15]
|
|
657
|
-
biased_exp16 = static_cast<uint32_t>(exp + 15);
|
|
658
|
-
HWY_DASSERT(1 <= biased_exp16 && biased_exp16 < 31);
|
|
659
|
-
mantissa16 = mantissa32 >> 13;
|
|
660
|
-
}
|
|
661
|
-
|
|
662
|
-
HWY_DASSERT(mantissa16 < 1024);
|
|
663
|
-
const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
|
|
664
|
-
HWY_DASSERT(bits16 < 0x10000);
|
|
665
|
-
const uint16_t narrowed = static_cast<uint16_t>(bits16); // big-endian safe
|
|
666
|
-
CopySameSize(&narrowed, &out);
|
|
667
|
-
return out;
|
|
668
|
-
#else
|
|
669
|
-
return float16_t(static_cast<float16_t::Raw>(f32));
|
|
670
|
-
#endif
|
|
671
|
-
}
|
|
672
|
-
|
|
673
|
-
HWY_API float F32FromBF16(bfloat16_t bf) {
|
|
674
|
-
uint16_t bits16;
|
|
675
|
-
CopyBytes<2>(&bf, &bits16);
|
|
676
|
-
uint32_t bits = bits16;
|
|
677
|
-
bits <<= 16;
|
|
678
|
-
float f;
|
|
679
|
-
CopySameSize(&bits, &f);
|
|
680
|
-
return f;
|
|
681
|
-
}
|
|
682
|
-
|
|
683
|
-
HWY_API float F32FromF16Mem(const void* ptr) {
|
|
684
|
-
float16_t f16;
|
|
685
|
-
CopyBytes<2>(ptr, &f16);
|
|
686
|
-
return F32FromF16(f16);
|
|
687
|
-
}
|
|
688
|
-
|
|
689
|
-
HWY_API float F32FromBF16Mem(const void* ptr) {
|
|
690
|
-
bfloat16_t bf;
|
|
691
|
-
CopyBytes<2>(ptr, &bf);
|
|
692
|
-
return F32FromBF16(bf);
|
|
693
|
-
}
|
|
694
|
-
|
|
695
|
-
HWY_API bfloat16_t BF16FromF32(float f) {
|
|
696
|
-
uint32_t bits;
|
|
697
|
-
CopySameSize(&f, &bits);
|
|
698
|
-
const uint16_t bits16 = static_cast<uint16_t>(bits >> 16);
|
|
699
|
-
bfloat16_t bf;
|
|
700
|
-
CopySameSize(&bits16, &bf);
|
|
701
|
-
return bf;
|
|
702
|
-
}
|
|
381
|
+
// hwy::float16_t and hwy::bfloat16_t are forward declared here to allow
|
|
382
|
+
// BitCastScalar to be implemented before the implementations of the
|
|
383
|
+
// hwy::float16_t and hwy::bfloat16_t types
|
|
384
|
+
struct float16_t;
|
|
385
|
+
struct bfloat16_t;
|
|
703
386
|
|
|
704
387
|
using float32_t = float;
|
|
705
388
|
using float64_t = double;
|
|
@@ -729,24 +412,6 @@ struct alignas(8) K32V32 {
|
|
|
729
412
|
|
|
730
413
|
#pragma pack(pop)
|
|
731
414
|
|
|
732
|
-
#ifdef HWY_EMULATE_FLOAT16
|
|
733
|
-
|
|
734
|
-
static inline HWY_MAYBE_UNUSED bool operator<(const float16_t& a,
|
|
735
|
-
const float16_t& b) {
|
|
736
|
-
return F32FromF16(a) < F32FromF16(b);
|
|
737
|
-
}
|
|
738
|
-
// Required for std::greater.
|
|
739
|
-
static inline HWY_MAYBE_UNUSED bool operator>(const float16_t& a,
|
|
740
|
-
const float16_t& b) {
|
|
741
|
-
return F32FromF16(a) > F32FromF16(b);
|
|
742
|
-
}
|
|
743
|
-
static inline HWY_MAYBE_UNUSED bool operator==(const float16_t& a,
|
|
744
|
-
const float16_t& b) {
|
|
745
|
-
return F32FromF16(a) == F32FromF16(b);
|
|
746
|
-
}
|
|
747
|
-
|
|
748
|
-
#endif // HWY_EMULATE_FLOAT16
|
|
749
|
-
|
|
750
415
|
static inline HWY_MAYBE_UNUSED bool operator<(const uint128_t& a,
|
|
751
416
|
const uint128_t& b) {
|
|
752
417
|
return (a.hi == b.hi) ? a.lo < b.lo : a.hi < b.hi;
|
|
@@ -817,106 +482,1378 @@ HWY_API constexpr bool IsSame() {
|
|
|
817
482
|
return IsSameT<T, U>::value;
|
|
818
483
|
}
|
|
819
484
|
|
|
485
|
+
// Returns whether T matches either of U1 or U2
|
|
486
|
+
template <typename T, typename U1, typename U2>
|
|
487
|
+
HWY_API constexpr bool IsSameEither() {
|
|
488
|
+
return IsSameT<T, U1>::value || IsSameT<T, U2>::value;
|
|
489
|
+
}
|
|
490
|
+
|
|
820
491
|
template <bool Condition, typename Then, typename Else>
|
|
821
492
|
struct IfT {
|
|
822
493
|
using type = Then;
|
|
823
494
|
};
|
|
824
495
|
|
|
825
|
-
template <class Then, class Else>
|
|
826
|
-
struct IfT<false, Then, Else> {
|
|
827
|
-
using type = Else;
|
|
828
|
-
};
|
|
496
|
+
template <class Then, class Else>
|
|
497
|
+
struct IfT<false, Then, Else> {
|
|
498
|
+
using type = Else;
|
|
499
|
+
};
|
|
500
|
+
|
|
501
|
+
template <bool Condition, typename Then, typename Else>
|
|
502
|
+
using If = typename IfT<Condition, Then, Else>::type;
|
|
503
|
+
|
|
504
|
+
template <typename T>
|
|
505
|
+
struct IsConstT {
|
|
506
|
+
enum { value = 0 };
|
|
507
|
+
};
|
|
508
|
+
|
|
509
|
+
template <typename T>
|
|
510
|
+
struct IsConstT<const T> {
|
|
511
|
+
enum { value = 1 };
|
|
512
|
+
};
|
|
513
|
+
|
|
514
|
+
template <typename T>
|
|
515
|
+
HWY_API constexpr bool IsConst() {
|
|
516
|
+
return IsConstT<T>::value;
|
|
517
|
+
}
|
|
518
|
+
|
|
519
|
+
template <class T>
|
|
520
|
+
struct RemoveConstT {
|
|
521
|
+
using type = T;
|
|
522
|
+
};
|
|
523
|
+
template <class T>
|
|
524
|
+
struct RemoveConstT<const T> {
|
|
525
|
+
using type = T;
|
|
526
|
+
};
|
|
527
|
+
|
|
528
|
+
template <class T>
|
|
529
|
+
using RemoveConst = typename RemoveConstT<T>::type;
|
|
530
|
+
|
|
531
|
+
template <class T>
|
|
532
|
+
struct RemoveVolatileT {
|
|
533
|
+
using type = T;
|
|
534
|
+
};
|
|
535
|
+
template <class T>
|
|
536
|
+
struct RemoveVolatileT<volatile T> {
|
|
537
|
+
using type = T;
|
|
538
|
+
};
|
|
539
|
+
|
|
540
|
+
template <class T>
|
|
541
|
+
using RemoveVolatile = typename RemoveVolatileT<T>::type;
|
|
542
|
+
|
|
543
|
+
template <class T>
|
|
544
|
+
struct RemoveRefT {
|
|
545
|
+
using type = T;
|
|
546
|
+
};
|
|
547
|
+
template <class T>
|
|
548
|
+
struct RemoveRefT<T&> {
|
|
549
|
+
using type = T;
|
|
550
|
+
};
|
|
551
|
+
template <class T>
|
|
552
|
+
struct RemoveRefT<T&&> {
|
|
553
|
+
using type = T;
|
|
554
|
+
};
|
|
555
|
+
|
|
556
|
+
template <class T>
|
|
557
|
+
using RemoveRef = typename RemoveRefT<T>::type;
|
|
558
|
+
|
|
559
|
+
template <class T>
|
|
560
|
+
using RemoveCvRef = RemoveConst<RemoveVolatile<RemoveRef<T>>>;
|
|
561
|
+
|
|
562
|
+
// Insert into template/function arguments to enable this overload only for
|
|
563
|
+
// vectors of exactly, at most (LE), or more than (GT) this many bytes.
|
|
564
|
+
//
|
|
565
|
+
// As an example, checking for a total size of 16 bytes will match both
|
|
566
|
+
// Simd<uint8_t, 16, 0> and Simd<uint8_t, 8, 1>.
|
|
567
|
+
#define HWY_IF_V_SIZE(T, kN, bytes) \
|
|
568
|
+
hwy::EnableIf<kN * sizeof(T) == bytes>* = nullptr
|
|
569
|
+
#define HWY_IF_V_SIZE_LE(T, kN, bytes) \
|
|
570
|
+
hwy::EnableIf<kN * sizeof(T) <= bytes>* = nullptr
|
|
571
|
+
#define HWY_IF_V_SIZE_GT(T, kN, bytes) \
|
|
572
|
+
hwy::EnableIf<(kN * sizeof(T) > bytes)>* = nullptr
|
|
573
|
+
|
|
574
|
+
#define HWY_IF_LANES(kN, lanes) hwy::EnableIf<(kN == lanes)>* = nullptr
|
|
575
|
+
#define HWY_IF_LANES_LE(kN, lanes) hwy::EnableIf<(kN <= lanes)>* = nullptr
|
|
576
|
+
#define HWY_IF_LANES_GT(kN, lanes) hwy::EnableIf<(kN > lanes)>* = nullptr
|
|
577
|
+
|
|
578
|
+
#define HWY_IF_UNSIGNED(T) hwy::EnableIf<!hwy::IsSigned<T>()>* = nullptr
|
|
579
|
+
#define HWY_IF_SIGNED(T) \
|
|
580
|
+
hwy::EnableIf<hwy::IsSigned<T>() && !hwy::IsFloat<T>() && \
|
|
581
|
+
!hwy::IsSpecialFloat<T>()>* = nullptr
|
|
582
|
+
#define HWY_IF_FLOAT(T) hwy::EnableIf<hwy::IsFloat<T>()>* = nullptr
|
|
583
|
+
#define HWY_IF_NOT_FLOAT(T) hwy::EnableIf<!hwy::IsFloat<T>()>* = nullptr
|
|
584
|
+
#define HWY_IF_FLOAT3264(T) hwy::EnableIf<hwy::IsFloat3264<T>()>* = nullptr
|
|
585
|
+
#define HWY_IF_NOT_FLOAT3264(T) hwy::EnableIf<!hwy::IsFloat3264<T>()>* = nullptr
|
|
586
|
+
#define HWY_IF_SPECIAL_FLOAT(T) \
|
|
587
|
+
hwy::EnableIf<hwy::IsSpecialFloat<T>()>* = nullptr
|
|
588
|
+
#define HWY_IF_NOT_SPECIAL_FLOAT(T) \
|
|
589
|
+
hwy::EnableIf<!hwy::IsSpecialFloat<T>()>* = nullptr
|
|
590
|
+
#define HWY_IF_FLOAT_OR_SPECIAL(T) \
|
|
591
|
+
hwy::EnableIf<hwy::IsFloat<T>() || hwy::IsSpecialFloat<T>()>* = nullptr
|
|
592
|
+
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL(T) \
|
|
593
|
+
hwy::EnableIf<!hwy::IsFloat<T>() && !hwy::IsSpecialFloat<T>()>* = nullptr
|
|
594
|
+
#define HWY_IF_INTEGER(T) hwy::EnableIf<hwy::IsInteger<T>()>* = nullptr
|
|
595
|
+
|
|
596
|
+
#define HWY_IF_T_SIZE(T, bytes) hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
|
|
597
|
+
#define HWY_IF_NOT_T_SIZE(T, bytes) \
|
|
598
|
+
hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
|
|
599
|
+
// bit_array = 0x102 means 1 or 8 bytes. There is no NONE_OF because it sounds
|
|
600
|
+
// too similar. If you want the opposite of this (2 or 4 bytes), ask for those
|
|
601
|
+
// bits explicitly (0x14) instead of attempting to 'negate' 0x102.
|
|
602
|
+
#define HWY_IF_T_SIZE_ONE_OF(T, bit_array) \
|
|
603
|
+
hwy::EnableIf<((size_t{1} << sizeof(T)) & (bit_array)) != 0>* = nullptr
|
|
604
|
+
#define HWY_IF_T_SIZE_LE(T, bytes) \
|
|
605
|
+
hwy::EnableIf<(sizeof(T) <= (bytes))>* = nullptr
|
|
606
|
+
#define HWY_IF_T_SIZE_GT(T, bytes) \
|
|
607
|
+
hwy::EnableIf<(sizeof(T) > (bytes))>* = nullptr
|
|
608
|
+
|
|
609
|
+
#define HWY_IF_SAME(T, expected) \
|
|
610
|
+
hwy::EnableIf<hwy::IsSame<hwy::RemoveCvRef<T>, expected>()>* = nullptr
|
|
611
|
+
#define HWY_IF_NOT_SAME(T, expected) \
|
|
612
|
+
hwy::EnableIf<!hwy::IsSame<hwy::RemoveCvRef<T>, expected>()>* = nullptr
|
|
613
|
+
|
|
614
|
+
// One of two expected types
|
|
615
|
+
#define HWY_IF_SAME2(T, expected1, expected2) \
|
|
616
|
+
hwy::EnableIf< \
|
|
617
|
+
hwy::IsSameEither<hwy::RemoveCvRef<T>, expected1, expected2>()>* = \
|
|
618
|
+
nullptr
|
|
619
|
+
|
|
620
|
+
#define HWY_IF_U8(T) HWY_IF_SAME(T, uint8_t)
|
|
621
|
+
#define HWY_IF_U16(T) HWY_IF_SAME(T, uint16_t)
|
|
622
|
+
#define HWY_IF_U32(T) HWY_IF_SAME(T, uint32_t)
|
|
623
|
+
#define HWY_IF_U64(T) HWY_IF_SAME(T, uint64_t)
|
|
624
|
+
|
|
625
|
+
#define HWY_IF_I8(T) HWY_IF_SAME(T, int8_t)
|
|
626
|
+
#define HWY_IF_I16(T) HWY_IF_SAME(T, int16_t)
|
|
627
|
+
#define HWY_IF_I32(T) HWY_IF_SAME(T, int32_t)
|
|
628
|
+
#define HWY_IF_I64(T) HWY_IF_SAME(T, int64_t)
|
|
629
|
+
|
|
630
|
+
#define HWY_IF_BF16(T) HWY_IF_SAME(T, hwy::bfloat16_t)
|
|
631
|
+
#define HWY_IF_NOT_BF16(T) HWY_IF_NOT_SAME(T, hwy::bfloat16_t)
|
|
632
|
+
|
|
633
|
+
#define HWY_IF_F16(T) HWY_IF_SAME(T, hwy::float16_t)
|
|
634
|
+
#define HWY_IF_NOT_F16(T) HWY_IF_NOT_SAME(T, hwy::float16_t)
|
|
635
|
+
|
|
636
|
+
#define HWY_IF_F32(T) HWY_IF_SAME(T, float)
|
|
637
|
+
#define HWY_IF_F64(T) HWY_IF_SAME(T, double)
|
|
638
|
+
|
|
639
|
+
// Use instead of HWY_IF_T_SIZE to avoid ambiguity with float16_t/float/double
|
|
640
|
+
// overloads.
|
|
641
|
+
#define HWY_IF_UI8(T) HWY_IF_SAME2(T, uint8_t, int8_t)
|
|
642
|
+
#define HWY_IF_UI16(T) HWY_IF_SAME2(T, uint16_t, int16_t)
|
|
643
|
+
#define HWY_IF_UI32(T) HWY_IF_SAME2(T, uint32_t, int32_t)
|
|
644
|
+
#define HWY_IF_UI64(T) HWY_IF_SAME2(T, uint64_t, int64_t)
|
|
645
|
+
|
|
646
|
+
#define HWY_IF_LANES_PER_BLOCK(T, N, LANES) \
|
|
647
|
+
hwy::EnableIf<HWY_MIN(sizeof(T) * N, 16) / sizeof(T) == (LANES)>* = nullptr
|
|
648
|
+
|
|
649
|
+
// Empty struct used as a size tag type.
|
|
650
|
+
template <size_t N>
|
|
651
|
+
struct SizeTag {};
|
|
652
|
+
|
|
653
|
+
template <class T>
|
|
654
|
+
class DeclValT {
|
|
655
|
+
private:
|
|
656
|
+
template <class U, class URef = U&&>
|
|
657
|
+
static URef TryAddRValRef(int);
|
|
658
|
+
template <class U, class Arg>
|
|
659
|
+
static U TryAddRValRef(Arg);
|
|
660
|
+
|
|
661
|
+
public:
|
|
662
|
+
using type = decltype(TryAddRValRef<T>(0));
|
|
663
|
+
enum { kDisableDeclValEvaluation = 1 };
|
|
664
|
+
};
|
|
665
|
+
|
|
666
|
+
// hwy::DeclVal<T>() can only be used in unevaluated contexts such as within an
|
|
667
|
+
// expression of a decltype specifier.
|
|
668
|
+
|
|
669
|
+
// hwy::DeclVal<T>() does not require that T have a public default constructor
|
|
670
|
+
template <class T>
|
|
671
|
+
HWY_API typename DeclValT<T>::type DeclVal() noexcept {
|
|
672
|
+
static_assert(!DeclValT<T>::kDisableDeclValEvaluation,
|
|
673
|
+
"DeclVal() cannot be used in an evaluated context");
|
|
674
|
+
}
|
|
675
|
+
|
|
676
|
+
template <class T>
|
|
677
|
+
struct IsArrayT {
|
|
678
|
+
enum { value = 0 };
|
|
679
|
+
};
|
|
680
|
+
|
|
681
|
+
template <class T>
|
|
682
|
+
struct IsArrayT<T[]> {
|
|
683
|
+
enum { value = 1 };
|
|
684
|
+
};
|
|
685
|
+
|
|
686
|
+
template <class T, size_t N>
|
|
687
|
+
struct IsArrayT<T[N]> {
|
|
688
|
+
enum { value = 1 };
|
|
689
|
+
};
|
|
690
|
+
|
|
691
|
+
template <class T>
|
|
692
|
+
static constexpr bool IsArray() {
|
|
693
|
+
return IsArrayT<T>::value;
|
|
694
|
+
}
|
|
695
|
+
|
|
696
|
+
#if HWY_COMPILER_MSVC
|
|
697
|
+
HWY_DIAGNOSTICS(push)
|
|
698
|
+
HWY_DIAGNOSTICS_OFF(disable : 4180, ignored "-Wignored-qualifiers")
|
|
699
|
+
#endif
|
|
700
|
+
|
|
701
|
+
template <class From, class To>
|
|
702
|
+
class IsConvertibleT {
|
|
703
|
+
private:
|
|
704
|
+
template <class T>
|
|
705
|
+
static hwy::SizeTag<1> TestFuncWithToArg(T);
|
|
706
|
+
|
|
707
|
+
template <class T, class U>
|
|
708
|
+
static decltype(IsConvertibleT<T, U>::template TestFuncWithToArg<U>(
|
|
709
|
+
DeclVal<T>()))
|
|
710
|
+
TryConvTest(int);
|
|
711
|
+
|
|
712
|
+
template <class T, class U, class Arg>
|
|
713
|
+
static hwy::SizeTag<0> TryConvTest(Arg);
|
|
714
|
+
|
|
715
|
+
public:
|
|
716
|
+
enum {
|
|
717
|
+
value = (IsSame<RemoveConst<RemoveVolatile<From>>, void>() &&
|
|
718
|
+
IsSame<RemoveConst<RemoveVolatile<To>>, void>()) ||
|
|
719
|
+
(!IsArray<To>() &&
|
|
720
|
+
(IsSame<To, decltype(DeclVal<To>())>() ||
|
|
721
|
+
!IsSame<const RemoveConst<To>, RemoveConst<To>>()) &&
|
|
722
|
+
IsSame<decltype(TryConvTest<From, To>(0)), hwy::SizeTag<1>>())
|
|
723
|
+
};
|
|
724
|
+
};
|
|
725
|
+
|
|
726
|
+
#if HWY_COMPILER_MSVC
|
|
727
|
+
HWY_DIAGNOSTICS(pop)
|
|
728
|
+
#endif
|
|
729
|
+
|
|
730
|
+
template <class From, class To>
|
|
731
|
+
HWY_API constexpr bool IsConvertible() {
|
|
732
|
+
return IsConvertibleT<From, To>::value;
|
|
733
|
+
}
|
|
734
|
+
|
|
735
|
+
template <class From, class To>
|
|
736
|
+
class IsStaticCastableT {
|
|
737
|
+
private:
|
|
738
|
+
template <class T, class U, class = decltype(static_cast<U>(DeclVal<T>()))>
|
|
739
|
+
static hwy::SizeTag<1> TryStaticCastTest(int);
|
|
740
|
+
|
|
741
|
+
template <class T, class U, class Arg>
|
|
742
|
+
static hwy::SizeTag<0> TryStaticCastTest(Arg);
|
|
743
|
+
|
|
744
|
+
public:
|
|
745
|
+
enum {
|
|
746
|
+
value = IsSame<decltype(TryStaticCastTest<From, To>(0)), hwy::SizeTag<1>>()
|
|
747
|
+
};
|
|
748
|
+
};
|
|
749
|
+
|
|
750
|
+
template <class From, class To>
|
|
751
|
+
static constexpr bool IsStaticCastable() {
|
|
752
|
+
return IsStaticCastableT<From, To>::value;
|
|
753
|
+
}
|
|
754
|
+
|
|
755
|
+
// For use in a template parameter list: enables the template only if `From`
// can be static_cast to `To`.
#define HWY_IF_CASTABLE(From, To) \
  hwy::EnableIf<IsStaticCastable<From, To>()>* = nullptr

// For use in a template parameter list: enables the template only if the
// result of `Native op T` can be static_cast back to `Native`.
#define HWY_IF_OP_CASTABLE(op, T, Native) \
  HWY_IF_CASTABLE(decltype(DeclVal<Native>() op DeclVal<T>()), Native)
|
|
760
|
+
|
|
761
|
+
template <class T, class From>
|
|
762
|
+
class IsAssignableT {
|
|
763
|
+
private:
|
|
764
|
+
template <class T1, class T2, class = decltype(DeclVal<T1>() = DeclVal<T2>())>
|
|
765
|
+
static hwy::SizeTag<1> TryAssignTest(int);
|
|
766
|
+
|
|
767
|
+
template <class T1, class T2, class Arg>
|
|
768
|
+
static hwy::SizeTag<0> TryAssignTest(Arg);
|
|
769
|
+
|
|
770
|
+
public:
|
|
771
|
+
enum {
|
|
772
|
+
value = IsSame<decltype(TryAssignTest<T, From>(0)), hwy::SizeTag<1>>()
|
|
773
|
+
};
|
|
774
|
+
};
|
|
775
|
+
|
|
776
|
+
template <class T, class From>
|
|
777
|
+
static constexpr bool IsAssignable() {
|
|
778
|
+
return IsAssignableT<T, From>::value;
|
|
779
|
+
}
|
|
780
|
+
|
|
781
|
+
// For use in a template parameter list: enables the template only if a value
// of type `From` can be assigned to `T` (see IsAssignable).
#define HWY_IF_ASSIGNABLE(T, From) \
  hwy::EnableIf<IsAssignable<T, From>()>* = nullptr
|
|
783
|
+
|
|
784
|
+
// ----------------------------------------------------------------------------
|
|
785
|
+
// IsSpecialFloat
|
|
786
|
+
|
|
787
|
+
// These types are often special-cased and not supported in all ops.
|
|
788
|
+
template <typename T>
|
|
789
|
+
HWY_API constexpr bool IsSpecialFloat() {
|
|
790
|
+
return IsSameEither<RemoveCvRef<T>, hwy::float16_t, hwy::bfloat16_t>();
|
|
791
|
+
}
|
|
792
|
+
|
|
793
|
+
// -----------------------------------------------------------------------------
|
|
794
|
+
// IsIntegerLaneType and IsInteger
|
|
795
|
+
|
|
796
|
+
template <class T>
|
|
797
|
+
HWY_API constexpr bool IsIntegerLaneType() {
|
|
798
|
+
return false;
|
|
799
|
+
}
|
|
800
|
+
template <>
|
|
801
|
+
HWY_INLINE constexpr bool IsIntegerLaneType<int8_t>() {
|
|
802
|
+
return true;
|
|
803
|
+
}
|
|
804
|
+
template <>
|
|
805
|
+
HWY_INLINE constexpr bool IsIntegerLaneType<uint8_t>() {
|
|
806
|
+
return true;
|
|
807
|
+
}
|
|
808
|
+
template <>
|
|
809
|
+
HWY_INLINE constexpr bool IsIntegerLaneType<int16_t>() {
|
|
810
|
+
return true;
|
|
811
|
+
}
|
|
812
|
+
template <>
|
|
813
|
+
HWY_INLINE constexpr bool IsIntegerLaneType<uint16_t>() {
|
|
814
|
+
return true;
|
|
815
|
+
}
|
|
816
|
+
template <>
|
|
817
|
+
HWY_INLINE constexpr bool IsIntegerLaneType<int32_t>() {
|
|
818
|
+
return true;
|
|
819
|
+
}
|
|
820
|
+
template <>
|
|
821
|
+
HWY_INLINE constexpr bool IsIntegerLaneType<uint32_t>() {
|
|
822
|
+
return true;
|
|
823
|
+
}
|
|
824
|
+
template <>
|
|
825
|
+
HWY_INLINE constexpr bool IsIntegerLaneType<int64_t>() {
|
|
826
|
+
return true;
|
|
827
|
+
}
|
|
828
|
+
template <>
|
|
829
|
+
HWY_INLINE constexpr bool IsIntegerLaneType<uint64_t>() {
|
|
830
|
+
return true;
|
|
831
|
+
}
|
|
832
|
+
|
|
833
|
+
template <class T>
|
|
834
|
+
HWY_API constexpr bool IsInteger() {
|
|
835
|
+
// NOTE: Do not add a IsInteger<wchar_t>() specialization below as it is
|
|
836
|
+
// possible for IsSame<wchar_t, uint16_t>() to be true when compiled with MSVC
|
|
837
|
+
// with the /Zc:wchar_t- option.
|
|
838
|
+
return IsIntegerLaneType<T>() || IsSame<RemoveCvRef<T>, wchar_t>() ||
|
|
839
|
+
IsSameEither<RemoveCvRef<T>, size_t, ptrdiff_t>() ||
|
|
840
|
+
IsSameEither<RemoveCvRef<T>, intptr_t, uintptr_t>();
|
|
841
|
+
}
|
|
842
|
+
template <>
|
|
843
|
+
HWY_INLINE constexpr bool IsInteger<bool>() {
|
|
844
|
+
return true;
|
|
845
|
+
}
|
|
846
|
+
template <>
|
|
847
|
+
HWY_INLINE constexpr bool IsInteger<char>() {
|
|
848
|
+
return true;
|
|
849
|
+
}
|
|
850
|
+
template <>
|
|
851
|
+
HWY_INLINE constexpr bool IsInteger<signed char>() {
|
|
852
|
+
return true;
|
|
853
|
+
}
|
|
854
|
+
template <>
|
|
855
|
+
HWY_INLINE constexpr bool IsInteger<unsigned char>() {
|
|
856
|
+
return true;
|
|
857
|
+
}
|
|
858
|
+
template <>
|
|
859
|
+
HWY_INLINE constexpr bool IsInteger<short>() { // NOLINT
|
|
860
|
+
return true;
|
|
861
|
+
}
|
|
862
|
+
template <>
|
|
863
|
+
HWY_INLINE constexpr bool IsInteger<unsigned short>() { // NOLINT
|
|
864
|
+
return true;
|
|
865
|
+
}
|
|
866
|
+
template <>
|
|
867
|
+
HWY_INLINE constexpr bool IsInteger<int>() {
|
|
868
|
+
return true;
|
|
869
|
+
}
|
|
870
|
+
template <>
|
|
871
|
+
HWY_INLINE constexpr bool IsInteger<unsigned>() {
|
|
872
|
+
return true;
|
|
873
|
+
}
|
|
874
|
+
template <>
|
|
875
|
+
HWY_INLINE constexpr bool IsInteger<long>() { // NOLINT
|
|
876
|
+
return true;
|
|
877
|
+
}
|
|
878
|
+
template <>
|
|
879
|
+
HWY_INLINE constexpr bool IsInteger<unsigned long>() { // NOLINT
|
|
880
|
+
return true;
|
|
881
|
+
}
|
|
882
|
+
template <>
|
|
883
|
+
HWY_INLINE constexpr bool IsInteger<long long>() { // NOLINT
|
|
884
|
+
return true;
|
|
885
|
+
}
|
|
886
|
+
template <>
|
|
887
|
+
HWY_INLINE constexpr bool IsInteger<unsigned long long>() { // NOLINT
|
|
888
|
+
return true;
|
|
889
|
+
}
|
|
890
|
+
#if defined(__cpp_char8_t) && __cpp_char8_t >= 201811L
|
|
891
|
+
template <>
|
|
892
|
+
HWY_INLINE constexpr bool IsInteger<char8_t>() {
|
|
893
|
+
return true;
|
|
894
|
+
}
|
|
895
|
+
#endif
|
|
896
|
+
template <>
|
|
897
|
+
HWY_INLINE constexpr bool IsInteger<char16_t>() {
|
|
898
|
+
return true;
|
|
899
|
+
}
|
|
900
|
+
template <>
|
|
901
|
+
HWY_INLINE constexpr bool IsInteger<char32_t>() {
|
|
902
|
+
return true;
|
|
903
|
+
}
|
|
904
|
+
|
|
905
|
+
// -----------------------------------------------------------------------------
|
|
906
|
+
// BitCastScalar
|
|
907
|
+
|
|
908
|
+
// HWY_BITCASTSCALAR_CONSTEXPR expands to `constexpr` if the compiler offers
// __builtin_bit_cast (or is MSVC >= 1926), and to nothing otherwise.
#if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
#define HWY_BITCASTSCALAR_CONSTEXPR constexpr
#else
#define HWY_BITCASTSCALAR_CONSTEXPR
#endif

// Same as HWY_BITCASTSCALAR_CONSTEXPR, but additionally requires C++14
// constexpr support (__cpp_constexpr >= 201304L); expands to nothing otherwise.
#if __cpp_constexpr >= 201304L
#define HWY_BITCASTSCALAR_CXX14_CONSTEXPR HWY_BITCASTSCALAR_CONSTEXPR
#else
#define HWY_BITCASTSCALAR_CXX14_CONSTEXPR
#endif
|
|
919
|
+
|
|
920
|
+
#if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
|
|
921
|
+
namespace detail {
|
|
922
|
+
|
|
923
|
+
template <class From>
|
|
924
|
+
struct BitCastScalarSrcCastHelper {
|
|
925
|
+
static HWY_INLINE constexpr const From& CastSrcValRef(const From& val) {
|
|
926
|
+
return val;
|
|
927
|
+
}
|
|
928
|
+
};
|
|
929
|
+
|
|
930
|
+
#if HWY_COMPILER_CLANG >= 900 && HWY_COMPILER_CLANG < 1000
// Workaround for a Clang 9 bug in constexpr __builtin_bit_cast.

// Integer -> integer of the same size: use static_cast instead of the builtin.
template <class To, class From,
          hwy::EnableIf<hwy::IsInteger<RemoveCvRef<To>>() &&
                        hwy::IsInteger<RemoveCvRef<From>>()>* = nullptr>
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR To
BuiltinBitCastScalar(const From& src) {
  static_assert(sizeof(To) == sizeof(From),
                "sizeof(To) == sizeof(From) must be true");
  return static_cast<To>(src);
}

// All other type combinations: forward to __builtin_bit_cast.
template <class To, class From,
          hwy::EnableIf<!(hwy::IsInteger<RemoveCvRef<To>>() &&
                          hwy::IsInteger<RemoveCvRef<From>>())>* = nullptr>
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR To
BuiltinBitCastScalar(const From& src) {
  return __builtin_bit_cast(To, src);
}
#endif  // HWY_COMPILER_CLANG >= 900 && HWY_COMPILER_CLANG < 1000
|
|
950
|
+
|
|
951
|
+
} // namespace detail
|
|
952
|
+
|
|
953
|
+
template <class To, class From, HWY_IF_NOT_SPECIAL_FLOAT(To)>
|
|
954
|
+
HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From& val) {
|
|
955
|
+
// If From is hwy::float16_t or hwy::bfloat16_t, first cast val to either
|
|
956
|
+
// const typename From::Native& or const uint16_t& using
|
|
957
|
+
// detail::BitCastScalarSrcCastHelper<RemoveCvRef<From>>::CastSrcValRef to
|
|
958
|
+
// allow BitCastScalar from hwy::float16_t or hwy::bfloat16_t to be constexpr
|
|
959
|
+
// if To is not a pointer type, union type, or a struct/class containing a
|
|
960
|
+
// pointer, union, or reference subobject
|
|
961
|
+
#if HWY_COMPILER_CLANG >= 900 && HWY_COMPILER_CLANG < 1000
|
|
962
|
+
return detail::BuiltinBitCastScalar<To>(
|
|
963
|
+
detail::BitCastScalarSrcCastHelper<RemoveCvRef<From>>::CastSrcValRef(
|
|
964
|
+
val));
|
|
965
|
+
#else
|
|
966
|
+
return __builtin_bit_cast(
|
|
967
|
+
To, detail::BitCastScalarSrcCastHelper<RemoveCvRef<From>>::CastSrcValRef(
|
|
968
|
+
val));
|
|
969
|
+
#endif
|
|
970
|
+
}
|
|
971
|
+
template <class To, class From, HWY_IF_SPECIAL_FLOAT(To)>
|
|
972
|
+
HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From& val) {
|
|
973
|
+
// If To is hwy::float16_t or hwy::bfloat16_t, first do a BitCastScalar of val
|
|
974
|
+
// to uint16_t, and then bit cast the uint16_t value to To using To::FromBits
|
|
975
|
+
// as hwy::float16_t::FromBits and hwy::bfloat16_t::FromBits are guaranteed to
|
|
976
|
+
// be constexpr if the __builtin_bit_cast intrinsic is available.
|
|
977
|
+
return To::FromBits(BitCastScalar<uint16_t>(val));
|
|
978
|
+
}
|
|
979
|
+
#else
|
|
980
|
+
template <class To, class From>
|
|
981
|
+
HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From& val) {
|
|
982
|
+
To result;
|
|
983
|
+
CopySameSize(&val, &result);
|
|
984
|
+
return result;
|
|
985
|
+
}
|
|
986
|
+
#endif
|
|
987
|
+
|
|
988
|
+
//------------------------------------------------------------------------------
|
|
989
|
+
// F16 lane type
|
|
990
|
+
|
|
991
|
+
#pragma pack(push, 1)
|
|
992
|
+
|
|
993
|
+
// HWY_NEON_HAVE_F16C: the compiler supports __fp16 and the load/store/
// conversion NEON intrinsics. These are part of Armv8 and VFPv4 (except with
// MSVC). On Armv7, Clang requires __ARM_FP & 2 whereas GCC requires
// -mfp16-format=ieee.
#if (HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC) ||                    \
    (HWY_COMPILER_CLANG && defined(__ARM_FP) && (__ARM_FP & 2)) || \
    (HWY_COMPILER_GCC_ACTUAL && defined(__ARM_FP16_FORMAT_IEEE))
#define HWY_NEON_HAVE_F16C 1
#else
#define HWY_NEON_HAVE_F16C 0
#endif

// HWY_RVV_HAVE_F16_VEC: RVV with the f16 extension supports _Float16 and f16
// vector ops. If set, implies HWY_HAVE_FLOAT16.
#if HWY_ARCH_RVV && defined(__riscv_zvfh) && HWY_COMPILER_CLANG >= 1600
#define HWY_RVV_HAVE_F16_VEC 1
#else
#define HWY_RVV_HAVE_F16_VEC 0
#endif

// HWY_SSE2_HAVE_F16_TYPE: the x86 compiler supports _Float16, though not
// necessarily with operators. clang-cl is excluded because it lacks
// __extendhfsf2.
#if HWY_ARCH_X86 && defined(__SSE2__) && defined(__FLT16_MAX__) && \
    ((HWY_COMPILER_CLANG >= 1500 && !HWY_COMPILER_CLANGCL) ||      \
     HWY_COMPILER_GCC_ACTUAL >= 1200)
#define HWY_SSE2_HAVE_F16_TYPE 1
#else
#define HWY_SSE2_HAVE_F16_TYPE 0
#endif
|
|
1021
|
+
|
|
1022
|
+
// HWY_HAVE_SCALAR_F16_TYPE: a native scalar half-precision type is available,
// though not necessarily with operators. A definition made before this point
// takes precedence.
#ifndef HWY_HAVE_SCALAR_F16_TYPE
#if HWY_NEON_HAVE_F16C || HWY_RVV_HAVE_F16_VEC || HWY_SSE2_HAVE_F16_TYPE
#define HWY_HAVE_SCALAR_F16_TYPE 1
#else
#define HWY_HAVE_SCALAR_F16_TYPE 0
#endif
#endif  // HWY_HAVE_SCALAR_F16_TYPE

// HWY_HAVE_SCALAR_F16_OPERATORS: the native type also has operators, which
// requires a recent enough compiler. A definition made before this point takes
// precedence.
#ifndef HWY_HAVE_SCALAR_F16_OPERATORS
#if HWY_HAVE_SCALAR_F16_TYPE &&                                       \
    (HWY_COMPILER_CLANG >= 1800 || HWY_COMPILER_GCC_ACTUAL >= 1200 || \
     (HWY_COMPILER_CLANG >= 1500 && !HWY_COMPILER_CLANGCL &&          \
      !defined(_WIN32)) ||                                            \
     (HWY_ARCH_ARM &&                                                 \
      (HWY_COMPILER_CLANG >= 900 || HWY_COMPILER_GCC_ACTUAL >= 800)))
#define HWY_HAVE_SCALAR_F16_OPERATORS 1
#else
#define HWY_HAVE_SCALAR_F16_OPERATORS 0
#endif
#endif  // HWY_HAVE_SCALAR_F16_OPERATORS
|
|
1044
|
+
|
|
1045
|
+
namespace detail {
|
|
1046
|
+
|
|
1047
|
+
template <class T, class TVal = RemoveCvRef<T>, bool = IsSpecialFloat<TVal>()>
|
|
1048
|
+
struct SpecialFloatUnwrapArithOpOperandT {};
|
|
1049
|
+
|
|
1050
|
+
template <class T, class TVal>
|
|
1051
|
+
struct SpecialFloatUnwrapArithOpOperandT<T, TVal, false> {
|
|
1052
|
+
using type = T;
|
|
1053
|
+
};
|
|
1054
|
+
|
|
1055
|
+
template <class T>
|
|
1056
|
+
using SpecialFloatUnwrapArithOpOperand =
|
|
1057
|
+
typename SpecialFloatUnwrapArithOpOperandT<T>::type;
|
|
1058
|
+
|
|
1059
|
+
template <class T, class TVal = RemoveCvRef<T>>
|
|
1060
|
+
struct NativeSpecialFloatToWrapperT {
|
|
1061
|
+
using type = T;
|
|
1062
|
+
};
|
|
1063
|
+
|
|
1064
|
+
template <class T>
|
|
1065
|
+
using NativeSpecialFloatToWrapper =
|
|
1066
|
+
typename NativeSpecialFloatToWrapperT<T>::type;
|
|
1067
|
+
|
|
1068
|
+
} // namespace detail
|
|
1069
|
+
|
|
1070
|
+
// Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
|
|
1071
|
+
// by concatenating base type and bits. We use a wrapper class instead of a
|
|
1072
|
+
// typedef to the native type to ensure that the same symbols, e.g. for VQSort,
|
|
1073
|
+
// are generated regardless of F16 support; see #1684.
|
|
1074
|
+
struct alignas(2) float16_t {
|
|
1075
|
+
#if HWY_HAVE_SCALAR_F16_TYPE
|
|
1076
|
+
#if HWY_RVV_HAVE_F16_VEC || HWY_SSE2_HAVE_F16_TYPE
|
|
1077
|
+
using Native = _Float16;
|
|
1078
|
+
#elif HWY_NEON_HAVE_F16C
|
|
1079
|
+
using Native = __fp16;
|
|
1080
|
+
#else
|
|
1081
|
+
#error "Logic error: condition should be 'all but NEON_HAVE_F16C'"
|
|
1082
|
+
#endif
|
|
1083
|
+
#endif // HWY_HAVE_SCALAR_F16_TYPE
|
|
1084
|
+
|
|
1085
|
+
union {
|
|
1086
|
+
#if HWY_HAVE_SCALAR_F16_TYPE
|
|
1087
|
+
// Accessed via NativeLaneType, and used directly if
|
|
1088
|
+
// HWY_HAVE_SCALAR_F16_OPERATORS.
|
|
1089
|
+
Native native;
|
|
1090
|
+
#endif
|
|
1091
|
+
// Only accessed via NativeLaneType or U16LaneType.
|
|
1092
|
+
uint16_t bits;
|
|
1093
|
+
};
|
|
1094
|
+
|
|
1095
|
+
// Default init and copying.
|
|
1096
|
+
float16_t() noexcept = default;
|
|
1097
|
+
constexpr float16_t(const float16_t&) noexcept = default;
|
|
1098
|
+
constexpr float16_t(float16_t&&) noexcept = default;
|
|
1099
|
+
float16_t& operator=(const float16_t&) noexcept = default;
|
|
1100
|
+
float16_t& operator=(float16_t&&) noexcept = default;
|
|
1101
|
+
|
|
1102
|
+
#if HWY_HAVE_SCALAR_F16_TYPE
|
|
1103
|
+
// NEON vget/set_lane intrinsics and SVE `svaddv` could use explicit
|
|
1104
|
+
// float16_t(intrinsic()), but user code expects implicit conversions.
|
|
1105
|
+
constexpr float16_t(Native arg) noexcept : native(arg) {}
|
|
1106
|
+
constexpr operator Native() const noexcept { return native; }
|
|
1107
|
+
#endif
|
|
1108
|
+
|
|
1109
|
+
#if HWY_HAVE_SCALAR_F16_TYPE
|
|
1110
|
+
static HWY_BITCASTSCALAR_CONSTEXPR float16_t FromBits(uint16_t bits) {
|
|
1111
|
+
return float16_t(BitCastScalar<Native>(bits));
|
|
1112
|
+
}
|
|
1113
|
+
#else
|
|
1114
|
+
|
|
1115
|
+
private:
|
|
1116
|
+
struct F16FromU16BitsTag {};
|
|
1117
|
+
constexpr float16_t(F16FromU16BitsTag /*tag*/, uint16_t u16_bits)
|
|
1118
|
+
: bits(u16_bits) {}
|
|
1119
|
+
|
|
1120
|
+
public:
|
|
1121
|
+
static constexpr float16_t FromBits(uint16_t bits) {
|
|
1122
|
+
return float16_t(F16FromU16BitsTag(), bits);
|
|
1123
|
+
}
|
|
1124
|
+
#endif
|
|
1125
|
+
|
|
1126
|
+
// When backed by a native type, ensure the wrapper behaves like the native
|
|
1127
|
+
// type by forwarding all operators. Unfortunately it seems difficult to reuse
|
|
1128
|
+
// this code in a base class, so we repeat it in float16_t.
|
|
1129
|
+
#if HWY_HAVE_SCALAR_F16_OPERATORS || HWY_IDE
|
|
1130
|
+
template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, float16_t>() &&
|
|
1131
|
+
IsConvertible<T, Native>()>* = nullptr>
|
|
1132
|
+
constexpr float16_t(T&& arg) noexcept
|
|
1133
|
+
: native(static_cast<Native>(static_cast<T&&>(arg))) {}
|
|
1134
|
+
|
|
1135
|
+
template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, float16_t>() &&
|
|
1136
|
+
!IsConvertible<T, Native>() &&
|
|
1137
|
+
IsStaticCastable<T, Native>()>* = nullptr>
|
|
1138
|
+
explicit constexpr float16_t(T&& arg) noexcept
|
|
1139
|
+
: native(static_cast<Native>(static_cast<T&&>(arg))) {}
|
|
1140
|
+
|
|
1141
|
+
// pre-decrement operator (--x)
|
|
1142
|
+
HWY_CXX14_CONSTEXPR float16_t& operator--() noexcept {
|
|
1143
|
+
native = static_cast<Native>(native - Native{1});
|
|
1144
|
+
return *this;
|
|
1145
|
+
}
|
|
1146
|
+
|
|
1147
|
+
// post-decrement operator (x--)
|
|
1148
|
+
HWY_CXX14_CONSTEXPR float16_t operator--(int) noexcept {
|
|
1149
|
+
float16_t result = *this;
|
|
1150
|
+
native = static_cast<Native>(native - Native{1});
|
|
1151
|
+
return result;
|
|
1152
|
+
}
|
|
1153
|
+
|
|
1154
|
+
// pre-increment operator (++x)
|
|
1155
|
+
HWY_CXX14_CONSTEXPR float16_t& operator++() noexcept {
|
|
1156
|
+
native = static_cast<Native>(native + Native{1});
|
|
1157
|
+
return *this;
|
|
1158
|
+
}
|
|
1159
|
+
|
|
1160
|
+
// post-increment operator (x++)
|
|
1161
|
+
HWY_CXX14_CONSTEXPR float16_t operator++(int) noexcept {
|
|
1162
|
+
float16_t result = *this;
|
|
1163
|
+
native = static_cast<Native>(native + Native{1});
|
|
1164
|
+
return result;
|
|
1165
|
+
}
|
|
1166
|
+
|
|
1167
|
+
constexpr float16_t operator-() const noexcept {
|
|
1168
|
+
return float16_t(static_cast<Native>(-native));
|
|
1169
|
+
}
|
|
1170
|
+
constexpr float16_t operator+() const noexcept { return *this; }
|
|
1171
|
+
|
|
1172
|
+
// Reduce clutter by generating `operator+` and `operator+=` etc. Note that
|
|
1173
|
+
// we cannot token-paste `operator` and `+`, so pass it in as `op_func`.
|
|
1174
|
+
#define HWY_FLOAT16_BINARY_OP(op, op_func, assign_func) \
|
|
1175
|
+
constexpr float16_t op_func(const float16_t& rhs) const noexcept { \
|
|
1176
|
+
return float16_t(static_cast<Native>(native op rhs.native)); \
|
|
1177
|
+
} \
|
|
1178
|
+
template <typename T, HWY_IF_NOT_F16(T), \
|
|
1179
|
+
typename UnwrappedT = \
|
|
1180
|
+
detail::SpecialFloatUnwrapArithOpOperand<const T&>, \
|
|
1181
|
+
typename RawResultT = \
|
|
1182
|
+
decltype(DeclVal<Native>() op DeclVal<UnwrappedT>()), \
|
|
1183
|
+
typename ResultT = \
|
|
1184
|
+
detail::NativeSpecialFloatToWrapper<RawResultT>, \
|
|
1185
|
+
HWY_IF_CASTABLE(RawResultT, ResultT)> \
|
|
1186
|
+
constexpr ResultT op_func(const T& rhs) const noexcept(noexcept( \
|
|
1187
|
+
static_cast<ResultT>(DeclVal<Native>() op DeclVal<UnwrappedT>()))) { \
|
|
1188
|
+
return static_cast<ResultT>(native op static_cast<UnwrappedT>(rhs)); \
|
|
1189
|
+
} \
|
|
1190
|
+
HWY_CXX14_CONSTEXPR hwy::float16_t& assign_func( \
|
|
1191
|
+
const hwy::float16_t& rhs) noexcept { \
|
|
1192
|
+
native = static_cast<Native>(native op rhs.native); \
|
|
1193
|
+
return *this; \
|
|
1194
|
+
} \
|
|
1195
|
+
template <typename T, HWY_IF_NOT_F16(T), \
|
|
1196
|
+
HWY_IF_OP_CASTABLE(op, const T&, Native), \
|
|
1197
|
+
HWY_IF_ASSIGNABLE( \
|
|
1198
|
+
Native, decltype(DeclVal<Native>() op DeclVal<const T&>()))> \
|
|
1199
|
+
HWY_CXX14_CONSTEXPR hwy::float16_t& assign_func(const T& rhs) noexcept( \
|
|
1200
|
+
noexcept( \
|
|
1201
|
+
static_cast<Native>(DeclVal<Native>() op DeclVal<const T&>()))) { \
|
|
1202
|
+
native = static_cast<Native>(native op rhs); \
|
|
1203
|
+
return *this; \
|
|
1204
|
+
}
|
|
1205
|
+
|
|
1206
|
+
HWY_FLOAT16_BINARY_OP(+, operator+, operator+=)
|
|
1207
|
+
HWY_FLOAT16_BINARY_OP(-, operator-, operator-=)
|
|
1208
|
+
HWY_FLOAT16_BINARY_OP(*, operator*, operator*=)
|
|
1209
|
+
HWY_FLOAT16_BINARY_OP(/, operator/, operator/=)
|
|
1210
|
+
#undef HWY_FLOAT16_BINARY_OP
|
|
1211
|
+
|
|
1212
|
+
#endif // HWY_HAVE_SCALAR_F16_OPERATORS
|
|
1213
|
+
};
|
|
1214
|
+
static_assert(sizeof(hwy::float16_t) == 2, "Wrong size of float16_t");
|
|
1215
|
+
|
|
1216
|
+
#if HWY_HAVE_SCALAR_F16_TYPE
namespace detail {

#if HWY_HAVE_SCALAR_F16_OPERATORS
// Arithmetic operands of type hwy::float16_t are unwrapped to the native type.
template <class T>
struct SpecialFloatUnwrapArithOpOperandT<T, hwy::float16_t, true> {
  using type = hwy::float16_t::Native;
};
#endif

// Results of the native half-precision type are wrapped as hwy::float16_t.
template <class T>
struct NativeSpecialFloatToWrapperT<T, hwy::float16_t::Native> {
  using type = hwy::float16_t;
};

}  // namespace detail
#endif  // HWY_HAVE_SCALAR_F16_TYPE
|
|
1233
|
+
|
|
1234
|
+
#if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
namespace detail {

// For hwy::float16_t sources, BitCastScalar operates on a reference to the
// wrapper's `native` member if a native type exists, otherwise on its `bits`
// member.
template <>
struct BitCastScalarSrcCastHelper<hwy::float16_t> {
#if HWY_HAVE_SCALAR_F16_TYPE
  static HWY_INLINE constexpr const hwy::float16_t::Native& CastSrcValRef(
      const hwy::float16_t& src) {
    return src.native;
  }
#else
  static HWY_INLINE constexpr const uint16_t& CastSrcValRef(
      const hwy::float16_t& src) {
    return src.bits;
  }
#endif
};

}  // namespace detail
#endif  // HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
|
|
1254
|
+
|
|
1255
|
+
// constexpr-ness of the float16_t conversion and comparison functions below:
// unconditional when native operators exist, otherwise the same as
// HWY_BITCASTSCALAR_CXX14_CONSTEXPR.
#if HWY_HAVE_SCALAR_F16_OPERATORS
#define HWY_F16_CONSTEXPR constexpr
#else
#define HWY_F16_CONSTEXPR HWY_BITCASTSCALAR_CXX14_CONSTEXPR
#endif  // HWY_HAVE_SCALAR_F16_OPERATORS
|
|
1260
|
+
|
|
1261
|
+
// Converts a float16_t to float. Uses the native conversion when operators
// are available, otherwise decodes the binary16 bit pattern by hand.
HWY_API HWY_F16_CONSTEXPR float F32FromF16(float16_t f16) {
#if HWY_HAVE_SCALAR_F16_OPERATORS && !HWY_IDE
  return static_cast<float>(f16);
#endif
#if !HWY_HAVE_SCALAR_F16_OPERATORS || HWY_IDE
  // Split into the 1 sign, 5 exponent and 10 mantissa bits.
  const uint16_t raw16 = BitCastScalar<uint16_t>(f16);
  const uint32_t sign_bit = static_cast<uint32_t>(raw16 >> 15);
  const uint32_t exp16 = (raw16 >> 10) & 0x1F;
  const uint32_t frac16 = raw16 & 0x3FF;

  // Exponent field of zero: the input is zero or subnormal, and its magnitude
  // is frac16 * 2^-10 * 2^-14.
  if (exp16 == 0) {
    const float magnitude =
        (1.0f / 16384) * (static_cast<float>(frac16) * (1.0f / 1024));
    return sign_bit ? -magnitude : magnitude;
  }

  // Normalized, infinity or NaN: translate the fields directly (faster than
  // ldexp/tables). An all-ones exponent stays all-ones; any other exponent is
  // re-biased from 15 to 127.
  const uint32_t exp32 = exp16 == 31 ? 0xFF : exp16 + (127 - 15);
  const uint32_t frac32 = frac16 << (23 - 10);
  const uint32_t raw32 = (sign_bit << 31) | (exp32 << 23) | frac32;

  return BitCastScalar<float>(raw32);
#endif  // !HWY_HAVE_SCALAR_F16_OPERATORS
}
|
|
1288
|
+
|
|
1289
|
+
#if HWY_IS_DEBUG_BUILD && \
|
|
1290
|
+
(HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926)
|
|
1291
|
+
#if defined(__cpp_if_consteval) && __cpp_if_consteval >= 202106L
|
|
1292
|
+
// If C++23 if !consteval support is available, only execute
|
|
1293
|
+
// HWY_DASSERT(condition) if F16FromF32 is not called from a constant-evaluated
|
|
1294
|
+
// context to avoid compilation errors.
|
|
1295
|
+
#define HWY_F16_FROM_F32_DASSERT(condition) \
|
|
1296
|
+
do { \
|
|
1297
|
+
if !consteval { \
|
|
1298
|
+
HWY_DASSERT(condition); \
|
|
1299
|
+
} \
|
|
1300
|
+
} while (0)
|
|
1301
|
+
#elif HWY_HAS_BUILTIN(__builtin_is_constant_evaluated) || \
|
|
1302
|
+
HWY_COMPILER_MSVC >= 1926
|
|
1303
|
+
// If the __builtin_is_constant_evaluated() intrinsic is available,
|
|
1304
|
+
// only do HWY_DASSERT(condition) if __builtin_is_constant_evaluated() returns
|
|
1305
|
+
// false to avoid compilation errors if F16FromF32 is called from a
|
|
1306
|
+
// constant-evaluated context.
|
|
1307
|
+
#define HWY_F16_FROM_F32_DASSERT(condition) \
|
|
1308
|
+
do { \
|
|
1309
|
+
if (!__builtin_is_constant_evaluated()) { \
|
|
1310
|
+
HWY_DASSERT(condition); \
|
|
1311
|
+
} \
|
|
1312
|
+
} while (0)
|
|
1313
|
+
#else
|
|
1314
|
+
// If C++23 if !consteval support is not available,
|
|
1315
|
+
// the __builtin_is_constant_evaluated() intrinsic is not available,
|
|
1316
|
+
// HWY_IS_DEBUG_BUILD is 1, and the __builtin_bit_cast intrinsic is available,
|
|
1317
|
+
// do not do a HWY_DASSERT to avoid compilation errors if F16FromF32 is
|
|
1318
|
+
// called from a constant-evaluated context.
|
|
1319
|
+
#define HWY_F16_FROM_F32_DASSERT(condition) \
|
|
1320
|
+
do { \
|
|
1321
|
+
} while (0)
|
|
1322
|
+
#endif // defined(__cpp_if_consteval) && __cpp_if_consteval >= 202106L
|
|
1323
|
+
#else
|
|
1324
|
+
// If HWY_IS_DEBUG_BUILD is 0 or the __builtin_bit_cast intrinsic is not
|
|
1325
|
+
// available, define HWY_F16_FROM_F32_DASSERT(condition) as
|
|
1326
|
+
// HWY_DASSERT(condition)
|
|
1327
|
+
#define HWY_F16_FROM_F32_DASSERT(condition) HWY_DASSERT(condition)
|
|
1328
|
+
#endif // HWY_IS_DEBUG_BUILD && (HWY_HAS_BUILTIN(__builtin_bit_cast) ||
|
|
1329
|
+
// HWY_COMPILER_MSVC >= 1926)
|
|
1330
|
+
|
|
1331
|
+
// Converts a float to float16_t. Uses the native conversion when operators are
// available, otherwise builds the binary16 bit pattern by hand.
HWY_API HWY_F16_CONSTEXPR float16_t F16FromF32(float f32) {
#if HWY_HAVE_SCALAR_F16_OPERATORS && !HWY_IDE
  return float16_t(static_cast<float16_t::Native>(f32));
#endif
#if !HWY_HAVE_SCALAR_F16_OPERATORS || HWY_IDE
  // Split the binary32 input into sign, exponent and mantissa fields.
  const uint32_t raw32 = BitCastScalar<uint32_t>(f32);
  const uint32_t sign_bit = raw32 >> 31;
  const uint32_t exp_field32 = (raw32 >> 23) & 0xFF;
  constexpr uint32_t kMantissaMask = 0x7FFFFF;
  const uint32_t frac32 = raw32 & kMantissaMask;

  // Round to nearest even before the low 13 mantissa bits are shifted out, to
  // reduce bias: add 0xFFF, plus one more if the lowest surviving bit (bit 13)
  // is odd. The discarded bits then carry into bit 13 when they exceed half an
  // LSB, or equal half an LSB while bit 13 is odd. A carry out of bit 22
  // increments the exponent.
  const uint32_t lsb_is_odd = (frac32 >> 13) & 1;
  const uint32_t rounded_frac = frac32 + lsb_is_odd + 0xFFF;
  const bool mantissa_carry = rounded_frac >= (1u << 23);

  const int32_t unbiased_exp =
      static_cast<int32_t>(exp_field32) - 127 + mantissa_carry;

  // Tiny or zero => zero, keeping the original sign.
  if (unbiased_exp < -24) {
    return float16_t::FromBits(static_cast<uint16_t>(sign_bit << 15));
  }

  // If the 16-bit exponent field would be >= 31, first check whether the input
  // was NaN so that the mantissa can be made nonzero.
  const bool is_nan = (exp_field32 == 255) && frac32 != 0;
  const bool overflowed = unbiased_exp >= 16;
  const uint32_t exp_field16 =
      static_cast<uint32_t>(HWY_MIN(HWY_MAX(0, unbiased_exp + 15), 31));
  // unbiased_exp in [-24, -15] => subnormal result: shift the mantissa further
  // right and insert the otherwise implicit leading bit.
  const uint32_t subnormal_shift =
      static_cast<uint32_t>(HWY_MAX(-14 - unbiased_exp, 0));
  HWY_F16_FROM_F32_DASSERT(subnormal_shift < 11);
  const uint32_t shifted_frac =
      (rounded_frac & kMantissaMask) >> (23 - 10 + subnormal_shift);
  const uint32_t leading_bit =
      subnormal_shift == 0u ? 0u : (1024u >> subnormal_shift);
  const uint32_t frac16 =
      is_nan ? 0x3FF : (overflowed ? 0u : (leading_bit + shifted_frac));

#if HWY_IS_DEBUG_BUILD
  if (unbiased_exp < -14) {
    HWY_F16_FROM_F32_DASSERT(exp_field16 == 0);
    HWY_F16_FROM_F32_DASSERT(subnormal_shift >= 1);
  } else if (unbiased_exp <= 15) {
    HWY_F16_FROM_F32_DASSERT(1 <= exp_field16 && exp_field16 < 31);
    HWY_F16_FROM_F32_DASSERT(subnormal_shift == 0);
  }
#endif

  HWY_F16_FROM_F32_DASSERT(frac16 < 1024);
  const uint32_t raw16 = (sign_bit << 15) | (exp_field16 << 10) | frac16;
  HWY_F16_FROM_F32_DASSERT(raw16 < 0x10000);
  const uint16_t narrowed = static_cast<uint16_t>(raw16);  // big-endian safe
  return float16_t::FromBits(narrowed);
#endif  // !HWY_HAVE_SCALAR_F16_OPERATORS
}
|
|
1399
|
+
|
|
1400
|
+
// Converts a double to float16_t. Uses the native conversion when operators
// are available; otherwise rounds the F64 to odd, narrows to F32 and defers to
// F16FromF32.
HWY_API HWY_F16_CONSTEXPR float16_t F16FromF64(double f64) {
#if HWY_HAVE_SCALAR_F16_OPERATORS
  return float16_t(static_cast<float16_t::Native>(f64));
#else
  // Step 1: round the F64 mantissa (round-to-odd) to the nearest F64 value
  // whose lower 29 bits are zero, so that the final result is correctly
  // rounded to F16. For a normal F64 input this leaves 24 bits of precision.
  //
  // - Denormal F64 inputs may have their magnitude rounded up here; that is
  //   fine because they are much smaller than 2^(-24), the smallest positive
  //   denormal F16 value.
  // - Bit 29 of a NaN input may change; that is fine because the lower 13 bits
  //   of a F32 NaN are usually discarded or ignored when converting to F16.
  // - A NaN input stays NaN, because the rounded value keeps at least one
  //   mantissa bit set.
  //
  // Step 2: the F64 -> F32 conversion is exact whenever the magnitude of the
  // rounded value lies between 2^(-126) (the smallest normal F32 value) and
  // HighestValue<float>() (the largest finite F32 value). For magnitudes below
  // 2^(-126) it may be inexact, which is fine because denormal F32 values are
  // much smaller than 2^(-24).
  const uint64_t raw64 = BitCastScalar<uint64_t>(f64);
  const uint64_t upper_bits = raw64 & 0xFFFFFFFFE0000000ULL;
  // Bit 29 after adding 2^29 - 1. OR-ed with the original bit 29 (kept in
  // upper_bits), the result has bit 29 set if it was already set or if any of
  // the lower 29 bits is nonzero.
  const uint64_t bit29_after_add =
      (raw64 + 0x000000001FFFFFFFULL) & 0x0000000020000000ULL;
  const double rounded_to_odd =
      BitCastScalar<double>(static_cast<uint64_t>(upper_bits | bit29_after_add));
  return F16FromF32(static_cast<float>(rounded_to_odd));
#endif
}
|
|
1440
|
+
|
|
1441
|
+
// More convenient to define outside float16_t because these may use
|
|
1442
|
+
// F32FromF16, which is defined after the struct.
|
|
1443
|
+
HWY_F16_CONSTEXPR inline bool operator==(float16_t lhs,
|
|
1444
|
+
float16_t rhs) noexcept {
|
|
1445
|
+
#if HWY_HAVE_SCALAR_F16_OPERATORS
|
|
1446
|
+
return lhs.native == rhs.native;
|
|
1447
|
+
#else
|
|
1448
|
+
return F32FromF16(lhs) == F32FromF16(rhs);
|
|
1449
|
+
#endif
|
|
1450
|
+
}
|
|
1451
|
+
HWY_F16_CONSTEXPR inline bool operator!=(float16_t lhs,
|
|
1452
|
+
float16_t rhs) noexcept {
|
|
1453
|
+
#if HWY_HAVE_SCALAR_F16_OPERATORS
|
|
1454
|
+
return lhs.native != rhs.native;
|
|
1455
|
+
#else
|
|
1456
|
+
return F32FromF16(lhs) != F32FromF16(rhs);
|
|
1457
|
+
#endif
|
|
1458
|
+
}
|
|
1459
|
+
HWY_F16_CONSTEXPR inline bool operator<(float16_t lhs, float16_t rhs) noexcept {
|
|
1460
|
+
#if HWY_HAVE_SCALAR_F16_OPERATORS
|
|
1461
|
+
return lhs.native < rhs.native;
|
|
1462
|
+
#else
|
|
1463
|
+
return F32FromF16(lhs) < F32FromF16(rhs);
|
|
1464
|
+
#endif
|
|
1465
|
+
}
|
|
1466
|
+
HWY_F16_CONSTEXPR inline bool operator<=(float16_t lhs,
|
|
1467
|
+
float16_t rhs) noexcept {
|
|
1468
|
+
#if HWY_HAVE_SCALAR_F16_OPERATORS
|
|
1469
|
+
return lhs.native <= rhs.native;
|
|
1470
|
+
#else
|
|
1471
|
+
return F32FromF16(lhs) <= F32FromF16(rhs);
|
|
1472
|
+
#endif
|
|
1473
|
+
}
|
|
1474
|
+
HWY_F16_CONSTEXPR inline bool operator>(float16_t lhs, float16_t rhs) noexcept {
|
|
1475
|
+
#if HWY_HAVE_SCALAR_F16_OPERATORS
|
|
1476
|
+
return lhs.native > rhs.native;
|
|
1477
|
+
#else
|
|
1478
|
+
return F32FromF16(lhs) > F32FromF16(rhs);
|
|
1479
|
+
#endif
|
|
1480
|
+
}
|
|
1481
|
+
HWY_F16_CONSTEXPR inline bool operator>=(float16_t lhs,
|
|
1482
|
+
float16_t rhs) noexcept {
|
|
1483
|
+
#if HWY_HAVE_SCALAR_F16_OPERATORS
|
|
1484
|
+
return lhs.native >= rhs.native;
|
|
1485
|
+
#else
|
|
1486
|
+
return F32FromF16(lhs) >= F32FromF16(rhs);
|
|
1487
|
+
#endif
|
|
1488
|
+
}
|
|
1489
|
+
#if HWY_HAVE_CXX20_THREE_WAY_COMPARE
|
|
1490
|
+
HWY_F16_CONSTEXPR inline std::partial_ordering operator<=>(
|
|
1491
|
+
float16_t lhs, float16_t rhs) noexcept {
|
|
1492
|
+
#if HWY_HAVE_SCALAR_F16_OPERATORS
|
|
1493
|
+
return lhs.native <=> rhs.native;
|
|
1494
|
+
#else
|
|
1495
|
+
return F32FromF16(lhs) <=> F32FromF16(rhs);
|
|
1496
|
+
#endif
|
|
1497
|
+
}
|
|
1498
|
+
#endif // HWY_HAVE_CXX20_THREE_WAY_COMPARE
|
|
1499
|
+
|
|
1500
|
+
//------------------------------------------------------------------------------
|
|
1501
|
+
// BF16 lane type
|
|
1502
|
+
|
|
1503
|
+
// Compiler supports ACLE __bf16, not necessarily with operators.
|
|
1504
|
+
|
|
1505
|
+
// Disable the __bf16 type on AArch64 with GCC 13 or earlier as there is a bug
|
|
1506
|
+
// in GCC 13 and earlier that sometimes causes BF16 constant values to be
|
|
1507
|
+
// incorrectly loaded on AArch64, and this GCC bug on AArch64 is
|
|
1508
|
+
// described at https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111867.
|
|
1509
|
+
|
|
1510
|
+
#if HWY_ARCH_ARM_A64 && \
|
|
1511
|
+
(HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400)
|
|
1512
|
+
#define HWY_ARM_HAVE_SCALAR_BF16_TYPE 1
|
|
1513
|
+
#else
|
|
1514
|
+
#define HWY_ARM_HAVE_SCALAR_BF16_TYPE 0
|
|
1515
|
+
#endif
|
|
1516
|
+
|
|
1517
|
+
// x86 compiler supports __bf16, not necessarily with operators.
|
|
1518
|
+
#ifndef HWY_SSE2_HAVE_SCALAR_BF16_TYPE
|
|
1519
|
+
#if HWY_ARCH_X86 && defined(__SSE2__) && \
|
|
1520
|
+
((HWY_COMPILER_CLANG >= 1700 && !HWY_COMPILER_CLANGCL) || \
|
|
1521
|
+
HWY_COMPILER_GCC_ACTUAL >= 1300)
|
|
1522
|
+
#define HWY_SSE2_HAVE_SCALAR_BF16_TYPE 1
|
|
1523
|
+
#else
|
|
1524
|
+
#define HWY_SSE2_HAVE_SCALAR_BF16_TYPE 0
|
|
1525
|
+
#endif
|
|
1526
|
+
#endif // HWY_SSE2_HAVE_SCALAR_BF16_TYPE
|
|
1527
|
+
|
|
1528
|
+
// Compiler supports __bf16, not necessarily with operators.
|
|
1529
|
+
#if HWY_ARM_HAVE_SCALAR_BF16_TYPE || HWY_SSE2_HAVE_SCALAR_BF16_TYPE
|
|
1530
|
+
#define HWY_HAVE_SCALAR_BF16_TYPE 1
|
|
1531
|
+
#else
|
|
1532
|
+
#define HWY_HAVE_SCALAR_BF16_TYPE 0
|
|
1533
|
+
#endif
|
|
1534
|
+
|
|
1535
|
+
#ifndef HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
1536
|
+
// Recent enough compiler also has operators. aarch64 clang 18 hits internal
|
|
1537
|
+
// compiler errors on bf16 ToString, hence only enable on GCC for now.
|
|
1538
|
+
#if HWY_HAVE_SCALAR_BF16_TYPE && (HWY_COMPILER_GCC_ACTUAL >= 1300)
|
|
1539
|
+
#define HWY_HAVE_SCALAR_BF16_OPERATORS 1
|
|
1540
|
+
#else
|
|
1541
|
+
#define HWY_HAVE_SCALAR_BF16_OPERATORS 0
|
|
1542
|
+
#endif
|
|
1543
|
+
#endif // HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
1544
|
+
|
|
1545
|
+
#if HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
1546
|
+
#define HWY_BF16_CONSTEXPR constexpr
|
|
1547
|
+
#else
|
|
1548
|
+
#define HWY_BF16_CONSTEXPR HWY_BITCASTSCALAR_CONSTEXPR
|
|
1549
|
+
#endif
|
|
1550
|
+
|
|
1551
|
+
struct alignas(2) bfloat16_t {
|
|
1552
|
+
#if HWY_HAVE_SCALAR_BF16_TYPE
|
|
1553
|
+
using Native = __bf16;
|
|
1554
|
+
#endif
|
|
1555
|
+
|
|
1556
|
+
union {
|
|
1557
|
+
#if HWY_HAVE_SCALAR_BF16_TYPE
|
|
1558
|
+
// Accessed via NativeLaneType, and used directly if
|
|
1559
|
+
// HWY_HAVE_SCALAR_BF16_OPERATORS.
|
|
1560
|
+
Native native;
|
|
1561
|
+
#endif
|
|
1562
|
+
// Only accessed via NativeLaneType or U16LaneType.
|
|
1563
|
+
uint16_t bits;
|
|
1564
|
+
};
|
|
1565
|
+
|
|
1566
|
+
// Default init and copying
|
|
1567
|
+
bfloat16_t() noexcept = default;
|
|
1568
|
+
constexpr bfloat16_t(bfloat16_t&&) noexcept = default;
|
|
1569
|
+
constexpr bfloat16_t(const bfloat16_t&) noexcept = default;
|
|
1570
|
+
bfloat16_t& operator=(bfloat16_t&& arg) noexcept = default;
|
|
1571
|
+
bfloat16_t& operator=(const bfloat16_t& arg) noexcept = default;
|
|
1572
|
+
|
|
1573
|
+
// Only enable implicit conversions if we have a native type.
|
|
1574
|
+
#if HWY_HAVE_SCALAR_BF16_TYPE
|
|
1575
|
+
constexpr bfloat16_t(Native arg) noexcept : native(arg) {}
|
|
1576
|
+
constexpr operator Native() const noexcept { return native; }
|
|
1577
|
+
#endif
|
|
1578
|
+
|
|
1579
|
+
#if HWY_HAVE_SCALAR_BF16_TYPE
|
|
1580
|
+
static HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t FromBits(uint16_t bits) {
|
|
1581
|
+
return bfloat16_t(BitCastScalar<Native>(bits));
|
|
1582
|
+
}
|
|
1583
|
+
#else
|
|
1584
|
+
|
|
1585
|
+
private:
|
|
1586
|
+
struct BF16FromU16BitsTag {};
|
|
1587
|
+
constexpr bfloat16_t(BF16FromU16BitsTag /*tag*/, uint16_t u16_bits)
|
|
1588
|
+
: bits(u16_bits) {}
|
|
829
1589
|
|
|
830
|
-
|
|
831
|
-
|
|
1590
|
+
public:
|
|
1591
|
+
static constexpr bfloat16_t FromBits(uint16_t bits) {
|
|
1592
|
+
return bfloat16_t(BF16FromU16BitsTag(), bits);
|
|
1593
|
+
}
|
|
1594
|
+
#endif
|
|
832
1595
|
|
|
833
|
-
//
|
|
834
|
-
//
|
|
835
|
-
//
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
|
|
1596
|
+
// When backed by a native type, ensure the wrapper behaves like the native
|
|
1597
|
+
// type by forwarding all operators. Unfortunately it seems difficult to reuse
|
|
1598
|
+
// this code in a base class, so we repeat it in float16_t.
|
|
1599
|
+
#if HWY_HAVE_SCALAR_BF16_OPERATORS || HWY_IDE
|
|
1600
|
+
template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, Native>() &&
|
|
1601
|
+
!IsSame<RemoveCvRef<T>, bfloat16_t>() &&
|
|
1602
|
+
IsConvertible<T, Native>()>* = nullptr>
|
|
1603
|
+
constexpr bfloat16_t(T&& arg) noexcept(
|
|
1604
|
+
noexcept(static_cast<Native>(DeclVal<T>())))
|
|
1605
|
+
: native(static_cast<Native>(static_cast<T&&>(arg))) {}
|
|
1606
|
+
|
|
1607
|
+
template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, Native>() &&
|
|
1608
|
+
!IsSame<RemoveCvRef<T>, bfloat16_t>() &&
|
|
1609
|
+
!IsConvertible<T, Native>() &&
|
|
1610
|
+
IsStaticCastable<T, Native>()>* = nullptr>
|
|
1611
|
+
explicit constexpr bfloat16_t(T&& arg) noexcept(
|
|
1612
|
+
noexcept(static_cast<Native>(DeclVal<T>())))
|
|
1613
|
+
: native(static_cast<Native>(static_cast<T&&>(arg))) {}
|
|
1614
|
+
|
|
1615
|
+
HWY_CXX14_CONSTEXPR bfloat16_t& operator=(Native arg) noexcept {
|
|
1616
|
+
native = arg;
|
|
1617
|
+
return *this;
|
|
1618
|
+
}
|
|
844
1619
|
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
1620
|
+
// pre-decrement operator (--x)
|
|
1621
|
+
HWY_CXX14_CONSTEXPR bfloat16_t& operator--() noexcept {
|
|
1622
|
+
native = static_cast<Native>(native - Native{1});
|
|
1623
|
+
return *this;
|
|
1624
|
+
}
|
|
848
1625
|
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
#define HWY_IF_FLOAT3264(T) hwy::EnableIf<hwy::IsFloat3264<T>()>* = nullptr
|
|
856
|
-
#define HWY_IF_NOT_FLOAT3264(T) hwy::EnableIf<!hwy::IsFloat3264<T>()>* = nullptr
|
|
857
|
-
#define HWY_IF_SPECIAL_FLOAT(T) \
|
|
858
|
-
hwy::EnableIf<hwy::IsSpecialFloat<T>()>* = nullptr
|
|
859
|
-
#define HWY_IF_NOT_SPECIAL_FLOAT(T) \
|
|
860
|
-
hwy::EnableIf<!hwy::IsSpecialFloat<T>()>* = nullptr
|
|
861
|
-
#define HWY_IF_FLOAT_OR_SPECIAL(T) \
|
|
862
|
-
hwy::EnableIf<hwy::IsFloat<T>() || hwy::IsSpecialFloat<T>()>* = nullptr
|
|
863
|
-
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL(T) \
|
|
864
|
-
hwy::EnableIf<!hwy::IsFloat<T>() && !hwy::IsSpecialFloat<T>()>* = nullptr
|
|
1626
|
+
// post-decrement operator (x--)
|
|
1627
|
+
HWY_CXX14_CONSTEXPR bfloat16_t operator--(int) noexcept {
|
|
1628
|
+
bfloat16_t result = *this;
|
|
1629
|
+
native = static_cast<Native>(native - Native{1});
|
|
1630
|
+
return result;
|
|
1631
|
+
}
|
|
865
1632
|
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
|
|
871
|
-
// bits explicitly (0x14) instead of attempting to 'negate' 0x102.
|
|
872
|
-
#define HWY_IF_T_SIZE_ONE_OF(T, bit_array) \
|
|
873
|
-
hwy::EnableIf<((size_t{1} << sizeof(T)) & (bit_array)) != 0>* = nullptr
|
|
1633
|
+
// pre-increment operator (++x)
|
|
1634
|
+
HWY_CXX14_CONSTEXPR bfloat16_t& operator++() noexcept {
|
|
1635
|
+
native = static_cast<Native>(native + Native{1});
|
|
1636
|
+
return *this;
|
|
1637
|
+
}
|
|
874
1638
|
|
|
875
|
-
//
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
|
|
881
|
-
#define HWY_IF_UI64(T) \
|
|
882
|
-
hwy::EnableIf<IsSame<T, uint64_t>() || IsSame<T, int64_t>()>* = nullptr
|
|
883
|
-
#define HWY_IF_BF16(T) hwy::EnableIf<IsSame<T, hwy::bfloat16_t>()>* = nullptr
|
|
884
|
-
#define HWY_IF_F16(T) hwy::EnableIf<IsSame<T, hwy::float16_t>()>* = nullptr
|
|
1639
|
+
// post-increment operator (x++)
|
|
1640
|
+
HWY_CXX14_CONSTEXPR bfloat16_t operator++(int) noexcept {
|
|
1641
|
+
bfloat16_t result = *this;
|
|
1642
|
+
native = static_cast<Native>(native + Native{1});
|
|
1643
|
+
return result;
|
|
1644
|
+
}
|
|
885
1645
|
|
|
886
|
-
|
|
887
|
-
|
|
1646
|
+
constexpr bfloat16_t operator-() const noexcept {
|
|
1647
|
+
return bfloat16_t(static_cast<Native>(-native));
|
|
1648
|
+
}
|
|
1649
|
+
constexpr bfloat16_t operator+() const noexcept { return *this; }
|
|
888
1650
|
|
|
889
|
-
//
|
|
890
|
-
|
|
891
|
-
|
|
1651
|
+
// Reduce clutter by generating `operator+` and `operator+=` etc. Note that
|
|
1652
|
+
// we cannot token-paste `operator` and `+`, so pass it in as `op_func`.
|
|
1653
|
+
#define HWY_BFLOAT16_BINARY_OP(op, op_func, assign_func) \
|
|
1654
|
+
constexpr bfloat16_t op_func(const bfloat16_t& rhs) const noexcept { \
|
|
1655
|
+
return bfloat16_t(static_cast<Native>(native op rhs.native)); \
|
|
1656
|
+
} \
|
|
1657
|
+
template <typename T, HWY_IF_NOT_BF16(T), \
|
|
1658
|
+
typename UnwrappedT = \
|
|
1659
|
+
detail::SpecialFloatUnwrapArithOpOperand<const T&>, \
|
|
1660
|
+
typename RawResultT = \
|
|
1661
|
+
decltype(DeclVal<Native>() op DeclVal<UnwrappedT>()), \
|
|
1662
|
+
typename ResultT = \
|
|
1663
|
+
detail::NativeSpecialFloatToWrapper<RawResultT>, \
|
|
1664
|
+
HWY_IF_CASTABLE(RawResultT, ResultT)> \
|
|
1665
|
+
constexpr ResultT op_func(const T& rhs) const noexcept(noexcept( \
|
|
1666
|
+
static_cast<ResultT>(DeclVal<Native>() op DeclVal<UnwrappedT>()))) { \
|
|
1667
|
+
return static_cast<ResultT>(native op static_cast<UnwrappedT>(rhs)); \
|
|
1668
|
+
} \
|
|
1669
|
+
HWY_CXX14_CONSTEXPR hwy::bfloat16_t& assign_func( \
|
|
1670
|
+
const hwy::bfloat16_t& rhs) noexcept { \
|
|
1671
|
+
native = static_cast<Native>(native op rhs.native); \
|
|
1672
|
+
return *this; \
|
|
1673
|
+
} \
|
|
1674
|
+
template <typename T, HWY_IF_NOT_BF16(T), \
|
|
1675
|
+
HWY_IF_OP_CASTABLE(op, const T&, Native), \
|
|
1676
|
+
HWY_IF_ASSIGNABLE( \
|
|
1677
|
+
Native, decltype(DeclVal<Native>() op DeclVal<const T&>()))> \
|
|
1678
|
+
HWY_CXX14_CONSTEXPR hwy::bfloat16_t& assign_func(const T& rhs) noexcept( \
|
|
1679
|
+
noexcept( \
|
|
1680
|
+
static_cast<Native>(DeclVal<Native>() op DeclVal<const T&>()))) { \
|
|
1681
|
+
native = static_cast<Native>(native op rhs); \
|
|
1682
|
+
return *this; \
|
|
1683
|
+
}
|
|
1684
|
+
HWY_BFLOAT16_BINARY_OP(+, operator+, operator+=)
|
|
1685
|
+
HWY_BFLOAT16_BINARY_OP(-, operator-, operator-=)
|
|
1686
|
+
HWY_BFLOAT16_BINARY_OP(*, operator*, operator*=)
|
|
1687
|
+
HWY_BFLOAT16_BINARY_OP(/, operator/, operator/=)
|
|
1688
|
+
#undef HWY_BFLOAT16_BINARY_OP
|
|
892
1689
|
|
|
893
|
-
|
|
894
|
-
struct RemoveConstT {
|
|
895
|
-
using type = T;
|
|
896
|
-
};
|
|
897
|
-
template <class T>
|
|
898
|
-
struct RemoveConstT<const T> {
|
|
899
|
-
using type = T;
|
|
1690
|
+
#endif // HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
900
1691
|
};
|
|
1692
|
+
static_assert(sizeof(hwy::bfloat16_t) == 2, "Wrong size of bfloat16_t");
|
|
901
1693
|
|
|
902
|
-
|
|
903
|
-
|
|
1694
|
+
#pragma pack(pop)
|
|
1695
|
+
|
|
1696
|
+
#if HWY_HAVE_SCALAR_BF16_TYPE
|
|
1697
|
+
namespace detail {
|
|
904
1698
|
|
|
1699
|
+
#if HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
905
1700
|
template <class T>
|
|
906
|
-
struct
|
|
907
|
-
using type =
|
|
1701
|
+
struct SpecialFloatUnwrapArithOpOperandT<T, hwy::bfloat16_t, true> {
|
|
1702
|
+
using type = hwy::bfloat16_t::Native;
|
|
908
1703
|
};
|
|
1704
|
+
#endif
|
|
1705
|
+
|
|
909
1706
|
template <class T>
|
|
910
|
-
struct
|
|
911
|
-
using type =
|
|
1707
|
+
struct NativeSpecialFloatToWrapperT<T, hwy::bfloat16_t::Native> {
|
|
1708
|
+
using type = hwy::bfloat16_t;
|
|
912
1709
|
};
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
|
|
1710
|
+
|
|
1711
|
+
} // namespace detail
|
|
1712
|
+
#endif // HWY_HAVE_SCALAR_BF16_TYPE
|
|
1713
|
+
|
|
1714
|
+
#if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
|
|
1715
|
+
namespace detail {
|
|
1716
|
+
|
|
1717
|
+
template <>
|
|
1718
|
+
struct BitCastScalarSrcCastHelper<hwy::bfloat16_t> {
|
|
1719
|
+
#if HWY_HAVE_SCALAR_BF16_TYPE
|
|
1720
|
+
static HWY_INLINE constexpr const hwy::bfloat16_t::Native& CastSrcValRef(
|
|
1721
|
+
const hwy::bfloat16_t& val) {
|
|
1722
|
+
return val.native;
|
|
1723
|
+
}
|
|
1724
|
+
#else
|
|
1725
|
+
static HWY_INLINE constexpr const uint16_t& CastSrcValRef(
|
|
1726
|
+
const hwy::bfloat16_t& val) {
|
|
1727
|
+
return val.bits;
|
|
1728
|
+
}
|
|
1729
|
+
#endif
|
|
916
1730
|
};
|
|
917
1731
|
|
|
918
|
-
|
|
919
|
-
|
|
1732
|
+
} // namespace detail
|
|
1733
|
+
#endif // HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
|
|
1734
|
+
|
|
1735
|
+
HWY_API HWY_BF16_CONSTEXPR float F32FromBF16(bfloat16_t bf) {
|
|
1736
|
+
#if HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
1737
|
+
return static_cast<float>(bf);
|
|
1738
|
+
#else
|
|
1739
|
+
return BitCastScalar<float>(static_cast<uint32_t>(
|
|
1740
|
+
static_cast<uint32_t>(BitCastScalar<uint16_t>(bf)) << 16));
|
|
1741
|
+
#endif
|
|
1742
|
+
}
|
|
1743
|
+
|
|
1744
|
+
HWY_API HWY_BF16_CONSTEXPR bfloat16_t BF16FromF32(float f) {
|
|
1745
|
+
#if HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
1746
|
+
return static_cast<bfloat16_t>(f);
|
|
1747
|
+
#else
|
|
1748
|
+
return bfloat16_t::FromBits(
|
|
1749
|
+
static_cast<uint16_t>(BitCastScalar<uint32_t>(f) >> 16));
|
|
1750
|
+
#endif
|
|
1751
|
+
}
|
|
1752
|
+
|
|
1753
|
+
HWY_API HWY_BF16_CONSTEXPR bfloat16_t BF16FromF64(double f64) {
|
|
1754
|
+
#if HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
1755
|
+
return static_cast<bfloat16_t>(f64);
|
|
1756
|
+
#else
|
|
1757
|
+
// The mantissa bits of f64 are first rounded using round-to-odd rounding
|
|
1758
|
+
// to the nearest f64 value that has the lower 38 bits zeroed out to
|
|
1759
|
+
// ensure that the result is correctly rounded to a BF16.
|
|
1760
|
+
|
|
1761
|
+
// The F64 round-to-odd operation below will round a normal F64 value
|
|
1762
|
+
// (using round-to-odd rounding) to a F64 value that has 15 bits of precision.
|
|
1763
|
+
|
|
1764
|
+
// It is okay if the magnitude of a denormal F64 value is rounded up in the
|
|
1765
|
+
// F64 round-to-odd step below as the magnitude of a denormal F64 value is
|
|
1766
|
+
// much smaller than 2^(-133) (the smallest positive denormal BF16 value).
|
|
1767
|
+
|
|
1768
|
+
// It is also okay if bit 38 of a NaN F64 value is changed by the F64
|
|
1769
|
+
// round-to-odd step below as the lower 16 bits of a F32 NaN value are usually
|
|
1770
|
+
// discarded or ignored by the conversion of a F32 NaN value to a BF16.
|
|
1771
|
+
|
|
1772
|
+
// If f64 is a NaN value, the result of the F64 round-to-odd step will be a
|
|
1773
|
+
// NaN value as the result of the F64 round-to-odd step will have at least one
|
|
1774
|
+
// mantissa bit if f64 is a NaN value.
|
|
1775
|
+
|
|
1776
|
+
// The F64 round-to-odd step below will ensure that the F64 to F32 conversion
|
|
1777
|
+
// is exact if the magnitude of the rounded F64 value (using round-to-odd
|
|
1778
|
+
// rounding) is between 2^(-135) (one-fourth of the smallest positive denormal
|
|
1779
|
+
// BF16 value) and HighestValue<float>() (the largest finite F32 value).
|
|
1780
|
+
|
|
1781
|
+
// If |f64| is less than 2^(-135), the magnitude of the result of the F64 to
|
|
1782
|
+
// F32 conversion is guaranteed to be less than or equal to 2^(-135), which
|
|
1783
|
+
// ensures that the F32 to BF16 conversion is correctly rounded, even if the
|
|
1784
|
+
// conversion of a rounded F64 value whose magnitude is less than 2^(-135)
|
|
1785
|
+
// to a F32 is inexact.
|
|
1786
|
+
|
|
1787
|
+
return BF16FromF32(
|
|
1788
|
+
static_cast<float>(BitCastScalar<double>(static_cast<uint64_t>(
|
|
1789
|
+
(BitCastScalar<uint64_t>(f64) & 0xFFFFFFC000000000ULL) |
|
|
1790
|
+
((BitCastScalar<uint64_t>(f64) + 0x0000003FFFFFFFFFULL) &
|
|
1791
|
+
0x0000004000000000ULL)))));
|
|
1792
|
+
#endif
|
|
1793
|
+
}
|
|
1794
|
+
|
|
1795
|
+
// More convenient to define outside bfloat16_t because these may use
|
|
1796
|
+
// F32FromBF16, which is defined after the struct.
|
|
1797
|
+
|
|
1798
|
+
HWY_BF16_CONSTEXPR inline bool operator==(bfloat16_t lhs,
|
|
1799
|
+
bfloat16_t rhs) noexcept {
|
|
1800
|
+
#if HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
1801
|
+
return lhs.native == rhs.native;
|
|
1802
|
+
#else
|
|
1803
|
+
return F32FromBF16(lhs) == F32FromBF16(rhs);
|
|
1804
|
+
#endif
|
|
1805
|
+
}
|
|
1806
|
+
|
|
1807
|
+
HWY_BF16_CONSTEXPR inline bool operator!=(bfloat16_t lhs,
|
|
1808
|
+
bfloat16_t rhs) noexcept {
|
|
1809
|
+
#if HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
1810
|
+
return lhs.native != rhs.native;
|
|
1811
|
+
#else
|
|
1812
|
+
return F32FromBF16(lhs) != F32FromBF16(rhs);
|
|
1813
|
+
#endif
|
|
1814
|
+
}
|
|
1815
|
+
HWY_BF16_CONSTEXPR inline bool operator<(bfloat16_t lhs,
|
|
1816
|
+
bfloat16_t rhs) noexcept {
|
|
1817
|
+
#if HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
1818
|
+
return lhs.native < rhs.native;
|
|
1819
|
+
#else
|
|
1820
|
+
return F32FromBF16(lhs) < F32FromBF16(rhs);
|
|
1821
|
+
#endif
|
|
1822
|
+
}
|
|
1823
|
+
HWY_BF16_CONSTEXPR inline bool operator<=(bfloat16_t lhs,
|
|
1824
|
+
bfloat16_t rhs) noexcept {
|
|
1825
|
+
#if HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
1826
|
+
return lhs.native <= rhs.native;
|
|
1827
|
+
#else
|
|
1828
|
+
return F32FromBF16(lhs) <= F32FromBF16(rhs);
|
|
1829
|
+
#endif
|
|
1830
|
+
}
|
|
1831
|
+
HWY_BF16_CONSTEXPR inline bool operator>(bfloat16_t lhs,
|
|
1832
|
+
bfloat16_t rhs) noexcept {
|
|
1833
|
+
#if HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
1834
|
+
return lhs.native > rhs.native;
|
|
1835
|
+
#else
|
|
1836
|
+
return F32FromBF16(lhs) > F32FromBF16(rhs);
|
|
1837
|
+
#endif
|
|
1838
|
+
}
|
|
1839
|
+
HWY_BF16_CONSTEXPR inline bool operator>=(bfloat16_t lhs,
|
|
1840
|
+
bfloat16_t rhs) noexcept {
|
|
1841
|
+
#if HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
1842
|
+
return lhs.native >= rhs.native;
|
|
1843
|
+
#else
|
|
1844
|
+
return F32FromBF16(lhs) >= F32FromBF16(rhs);
|
|
1845
|
+
#endif
|
|
1846
|
+
}
|
|
1847
|
+
#if HWY_HAVE_CXX20_THREE_WAY_COMPARE
|
|
1848
|
+
HWY_BF16_CONSTEXPR inline std::partial_ordering operator<=>(
|
|
1849
|
+
bfloat16_t lhs, bfloat16_t rhs) noexcept {
|
|
1850
|
+
#if HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
1851
|
+
return lhs.native <=> rhs.native;
|
|
1852
|
+
#else
|
|
1853
|
+
return F32FromBF16(lhs) <=> F32FromBF16(rhs);
|
|
1854
|
+
#endif
|
|
1855
|
+
}
|
|
1856
|
+
#endif // HWY_HAVE_CXX20_THREE_WAY_COMPARE
|
|
920
1857
|
|
|
921
1858
|
//------------------------------------------------------------------------------
|
|
922
1859
|
// Type relations
|
|
@@ -1110,25 +2047,19 @@ constexpr auto IsFloatTag() -> hwy::SizeTag<(R::is_float ? 0x200 : 0x400)> {
|
|
|
1110
2047
|
|
|
1111
2048
|
template <typename T>
|
|
1112
2049
|
HWY_API constexpr bool IsFloat3264() {
|
|
1113
|
-
return
|
|
2050
|
+
return IsSameEither<RemoveCvRef<T>, float, double>();
|
|
1114
2051
|
}
|
|
1115
2052
|
|
|
1116
2053
|
template <typename T>
|
|
1117
2054
|
HWY_API constexpr bool IsFloat() {
|
|
1118
2055
|
// Cannot use T(1.25) != T(1) for float16_t, which can only be converted to or
|
|
1119
2056
|
// from a float, not compared. Include float16_t in case HWY_HAVE_FLOAT16=1.
|
|
1120
|
-
return IsSame<T
|
|
1121
|
-
}
|
|
1122
|
-
|
|
1123
|
-
// These types are often special-cased and not supported in all ops.
|
|
1124
|
-
template <typename T>
|
|
1125
|
-
HWY_API constexpr bool IsSpecialFloat() {
|
|
1126
|
-
return IsSame<T, float16_t>() || IsSame<T, bfloat16_t>();
|
|
2057
|
+
return IsSame<RemoveCvRef<T>, float16_t>() || IsFloat3264<T>();
|
|
1127
2058
|
}
|
|
1128
2059
|
|
|
1129
2060
|
template <typename T>
|
|
1130
2061
|
HWY_API constexpr bool IsSigned() {
|
|
1131
|
-
return T(0) > T(-1);
|
|
2062
|
+
return static_cast<T>(0) > static_cast<T>(-1);
|
|
1132
2063
|
}
|
|
1133
2064
|
template <>
|
|
1134
2065
|
constexpr bool IsSigned<float16_t>() {
|
|
@@ -1138,104 +2069,113 @@ template <>
|
|
|
1138
2069
|
constexpr bool IsSigned<bfloat16_t>() {
|
|
1139
2070
|
return true;
|
|
1140
2071
|
}
|
|
2072
|
+
template <>
|
|
2073
|
+
constexpr bool IsSigned<hwy::uint128_t>() {
|
|
2074
|
+
return false;
|
|
2075
|
+
}
|
|
2076
|
+
template <>
|
|
2077
|
+
constexpr bool IsSigned<hwy::K64V64>() {
|
|
2078
|
+
return false;
|
|
2079
|
+
}
|
|
2080
|
+
template <>
|
|
2081
|
+
constexpr bool IsSigned<hwy::K32V32>() {
|
|
2082
|
+
return false;
|
|
2083
|
+
}
|
|
2084
|
+
|
|
2085
|
+
template <typename T, bool = IsInteger<T>() && !IsIntegerLaneType<T>()>
|
|
2086
|
+
struct MakeLaneTypeIfIntegerT {
|
|
2087
|
+
using type = T;
|
|
2088
|
+
};
|
|
2089
|
+
|
|
2090
|
+
template <typename T>
|
|
2091
|
+
struct MakeLaneTypeIfIntegerT<T, true> {
|
|
2092
|
+
using type = hwy::If<IsSigned<T>(), SignedFromSize<sizeof(T)>,
|
|
2093
|
+
UnsignedFromSize<sizeof(T)>>;
|
|
2094
|
+
};
|
|
2095
|
+
|
|
2096
|
+
template <typename T>
|
|
2097
|
+
using MakeLaneTypeIfInteger = typename MakeLaneTypeIfIntegerT<T>::type;
|
|
1141
2098
|
|
|
1142
2099
|
// Largest/smallest representable integer values.
|
|
1143
2100
|
template <typename T>
|
|
1144
2101
|
HWY_API constexpr T LimitsMax() {
|
|
1145
|
-
static_assert(
|
|
1146
|
-
using TU =
|
|
1147
|
-
return static_cast<T>(IsSigned<T>() ? (static_cast<TU>(~
|
|
1148
|
-
: static_cast<TU>(~
|
|
2102
|
+
static_assert(IsInteger<T>(), "Only for integer types");
|
|
2103
|
+
using TU = UnsignedFromSize<sizeof(T)>;
|
|
2104
|
+
return static_cast<T>(IsSigned<T>() ? (static_cast<TU>(~TU(0)) >> 1)
|
|
2105
|
+
: static_cast<TU>(~TU(0)));
|
|
1149
2106
|
}
|
|
1150
2107
|
template <typename T>
|
|
1151
2108
|
HWY_API constexpr T LimitsMin() {
|
|
1152
|
-
static_assert(
|
|
1153
|
-
return IsSigned<T>() ? T(-1) - LimitsMax<T>()
|
|
2109
|
+
static_assert(IsInteger<T>(), "Only for integer types");
|
|
2110
|
+
return IsSigned<T>() ? static_cast<T>(-1) - LimitsMax<T>()
|
|
2111
|
+
: static_cast<T>(0);
|
|
1154
2112
|
}
|
|
1155
2113
|
|
|
1156
2114
|
// Largest/smallest representable value (integer or float). This naming avoids
|
|
1157
2115
|
// confusion with numeric_limits<float>::min() (the smallest positive value).
|
|
1158
2116
|
// Cannot be constexpr because we use CopySameSize for [b]float16_t.
|
|
1159
2117
|
template <typename T>
|
|
1160
|
-
HWY_API T LowestValue() {
|
|
2118
|
+
HWY_API HWY_BITCASTSCALAR_CONSTEXPR T LowestValue() {
|
|
1161
2119
|
return LimitsMin<T>();
|
|
1162
2120
|
}
|
|
1163
2121
|
template <>
|
|
1164
|
-
HWY_INLINE bfloat16_t LowestValue<bfloat16_t>() {
|
|
1165
|
-
|
|
1166
|
-
bfloat16_t ret;
|
|
1167
|
-
CopySameSize(&kBits, &ret);
|
|
1168
|
-
return ret;
|
|
2122
|
+
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t LowestValue<bfloat16_t>() {
|
|
2123
|
+
return bfloat16_t::FromBits(uint16_t{0xFF7Fu}); // -1.1111111 x 2^127
|
|
1169
2124
|
}
|
|
1170
2125
|
template <>
|
|
1171
|
-
HWY_INLINE float16_t LowestValue<float16_t>() {
|
|
1172
|
-
|
|
1173
|
-
float16_t ret;
|
|
1174
|
-
CopySameSize(&kBits, &ret);
|
|
1175
|
-
return ret;
|
|
2126
|
+
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t LowestValue<float16_t>() {
|
|
2127
|
+
return float16_t::FromBits(uint16_t{0xFBFFu}); // -1.1111111111 x 2^15
|
|
1176
2128
|
}
|
|
1177
2129
|
template <>
|
|
1178
|
-
HWY_INLINE float LowestValue<float>() {
|
|
2130
|
+
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float LowestValue<float>() {
|
|
1179
2131
|
return -3.402823466e+38F;
|
|
1180
2132
|
}
|
|
1181
2133
|
template <>
|
|
1182
|
-
HWY_INLINE double LowestValue<double>() {
|
|
2134
|
+
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double LowestValue<double>() {
|
|
1183
2135
|
return -1.7976931348623158e+308;
|
|
1184
2136
|
}
|
|
1185
2137
|
|
|
1186
2138
|
template <typename T>
|
|
1187
|
-
HWY_API T HighestValue() {
|
|
2139
|
+
HWY_API HWY_BITCASTSCALAR_CONSTEXPR T HighestValue() {
|
|
1188
2140
|
return LimitsMax<T>();
|
|
1189
2141
|
}
|
|
1190
2142
|
template <>
|
|
1191
|
-
HWY_INLINE bfloat16_t HighestValue<bfloat16_t>() {
|
|
1192
|
-
|
|
1193
|
-
bfloat16_t ret;
|
|
1194
|
-
CopySameSize(&kBits, &ret);
|
|
1195
|
-
return ret;
|
|
2143
|
+
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t HighestValue<bfloat16_t>() {
|
|
2144
|
+
return bfloat16_t::FromBits(uint16_t{0x7F7Fu}); // 1.1111111 x 2^127
|
|
1196
2145
|
}
|
|
1197
2146
|
template <>
|
|
1198
|
-
HWY_INLINE float16_t HighestValue<float16_t>() {
|
|
1199
|
-
|
|
1200
|
-
float16_t ret;
|
|
1201
|
-
CopySameSize(&kBits, &ret);
|
|
1202
|
-
return ret;
|
|
2147
|
+
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t HighestValue<float16_t>() {
|
|
2148
|
+
return float16_t::FromBits(uint16_t{0x7BFFu}); // 1.1111111111 x 2^15
|
|
1203
2149
|
}
|
|
1204
2150
|
template <>
|
|
1205
|
-
HWY_INLINE float HighestValue<float>() {
|
|
2151
|
+
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float HighestValue<float>() {
|
|
1206
2152
|
return 3.402823466e+38F;
|
|
1207
2153
|
}
|
|
1208
2154
|
template <>
|
|
1209
|
-
HWY_INLINE double HighestValue<double>() {
|
|
2155
|
+
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double HighestValue<double>() {
|
|
1210
2156
|
return 1.7976931348623158e+308;
|
|
1211
2157
|
}
|
|
1212
2158
|
|
|
1213
2159
|
// Difference between 1.0 and the next representable value. Equal to
|
|
1214
2160
|
// 1 / (1ULL << MantissaBits<T>()), but hard-coding ensures precision.
|
|
1215
2161
|
template <typename T>
|
|
1216
|
-
HWY_API T Epsilon() {
|
|
2162
|
+
HWY_API HWY_BITCASTSCALAR_CONSTEXPR T Epsilon() {
|
|
1217
2163
|
return 1;
|
|
1218
2164
|
}
|
|
1219
2165
|
template <>
|
|
1220
|
-
HWY_INLINE bfloat16_t Epsilon<bfloat16_t>() {
|
|
1221
|
-
|
|
1222
|
-
bfloat16_t ret;
|
|
1223
|
-
CopySameSize(&kBits, &ret);
|
|
1224
|
-
return ret;
|
|
2166
|
+
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t Epsilon<bfloat16_t>() {
|
|
2167
|
+
return bfloat16_t::FromBits(uint16_t{0x3C00u}); // 0.0078125
|
|
1225
2168
|
}
|
|
1226
2169
|
template <>
|
|
1227
|
-
HWY_INLINE float16_t Epsilon<float16_t>() {
|
|
1228
|
-
|
|
1229
|
-
float16_t ret;
|
|
1230
|
-
CopySameSize(&kBits, &ret);
|
|
1231
|
-
return ret;
|
|
2170
|
+
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t Epsilon<float16_t>() {
|
|
2171
|
+
return float16_t::FromBits(uint16_t{0x1400u}); // 0.0009765625
|
|
1232
2172
|
}
|
|
1233
2173
|
template <>
|
|
1234
|
-
HWY_INLINE float Epsilon<float>() {
|
|
2174
|
+
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float Epsilon<float>() {
|
|
1235
2175
|
return 1.192092896e-7f;
|
|
1236
2176
|
}
|
|
1237
2177
|
template <>
|
|
1238
|
-
HWY_INLINE double Epsilon<double>() {
|
|
2178
|
+
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double Epsilon<double>() {
|
|
1239
2179
|
return 2.2204460492503131e-16;
|
|
1240
2180
|
}
|
|
1241
2181
|
|
|
@@ -1278,7 +2218,8 @@ constexpr MakeUnsigned<T> SignMask() {
|
|
|
1278
2218
|
// Returns bitmask of the exponent field in IEEE binary16/32/64.
|
|
1279
2219
|
template <typename T>
|
|
1280
2220
|
constexpr MakeUnsigned<T> ExponentMask() {
|
|
1281
|
-
return (~(MakeUnsigned<T>{1} << MantissaBits<T>()) + 1) &
|
|
2221
|
+
return (~(MakeUnsigned<T>{1} << MantissaBits<T>()) + 1) &
|
|
2222
|
+
static_cast<MakeUnsigned<T>>(~SignMask<T>());
|
|
1282
2223
|
}
|
|
1283
2224
|
|
|
1284
2225
|
// Returns bitmask of the mantissa field in IEEE binary16/32/64.
|
|
@@ -1290,30 +2231,24 @@ constexpr MakeUnsigned<T> MantissaMask() {
|
|
|
1290
2231
|
// Returns 1 << mantissa_bits as a floating-point number. All integers whose
|
|
1291
2232
|
// absolute value are less than this can be represented exactly.
|
|
1292
2233
|
template <typename T>
|
|
1293
|
-
HWY_INLINE T MantissaEnd() {
|
|
2234
|
+
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T MantissaEnd() {
|
|
1294
2235
|
static_assert(sizeof(T) == 0, "Only instantiate the specializations");
|
|
1295
2236
|
return 0;
|
|
1296
2237
|
}
|
|
1297
2238
|
template <>
|
|
1298
|
-
HWY_INLINE bfloat16_t MantissaEnd<bfloat16_t>() {
|
|
1299
|
-
|
|
1300
|
-
bfloat16_t ret;
|
|
1301
|
-
CopySameSize(&kBits, &ret);
|
|
1302
|
-
return ret;
|
|
2239
|
+
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t MantissaEnd<bfloat16_t>() {
|
|
2240
|
+
return bfloat16_t::FromBits(uint16_t{0x4300u}); // 1.0 x 2^7
|
|
1303
2241
|
}
|
|
1304
2242
|
template <>
|
|
1305
|
-
HWY_INLINE float16_t MantissaEnd<float16_t>() {
|
|
1306
|
-
|
|
1307
|
-
float16_t ret;
|
|
1308
|
-
CopySameSize(&kBits, &ret);
|
|
1309
|
-
return ret;
|
|
2243
|
+
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t MantissaEnd<float16_t>() {
|
|
2244
|
+
return float16_t::FromBits(uint16_t{0x6400u}); // 1.0 x 2^10
|
|
1310
2245
|
}
|
|
1311
2246
|
template <>
|
|
1312
|
-
HWY_INLINE float MantissaEnd<float>() {
|
|
2247
|
+
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float MantissaEnd<float>() {
|
|
1313
2248
|
return 8388608.0f; // 1 << 23
|
|
1314
2249
|
}
|
|
1315
2250
|
template <>
|
|
1316
|
-
HWY_INLINE double MantissaEnd<double>() {
|
|
2251
|
+
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double MantissaEnd<double>() {
|
|
1317
2252
|
// floating point literal with p52 requires C++17.
|
|
1318
2253
|
return 4503599627370496.0; // 1 << 52
|
|
1319
2254
|
}
|
|
@@ -1333,6 +2268,143 @@ constexpr MakeSigned<T> MaxExponentField() {
|
|
|
1333
2268
|
return (MakeSigned<T>{1} << ExponentBits<T>()) - 1;
|
|
1334
2269
|
}
|
|
1335
2270
|
|
|
2271
|
+
//------------------------------------------------------------------------------
|
|
2272
|
+
// Additional F16/BF16 operators
|
|
2273
|
+
|
|
2274
|
+
#if HWY_HAVE_SCALAR_F16_OPERATORS || HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
2275
|
+
|
|
2276
|
+
#define HWY_RHS_SPECIAL_FLOAT_ARITH_OP(op, op_func, T2) \
|
|
2277
|
+
template < \
|
|
2278
|
+
typename T1, \
|
|
2279
|
+
hwy::EnableIf<hwy::IsInteger<RemoveCvRef<T1>>() || \
|
|
2280
|
+
hwy::IsFloat3264<RemoveCvRef<T1>>()>* = nullptr, \
|
|
2281
|
+
typename RawResultT = decltype(DeclVal<T1>() op DeclVal<T2::Native>()), \
|
|
2282
|
+
typename ResultT = detail::NativeSpecialFloatToWrapper<RawResultT>, \
|
|
2283
|
+
HWY_IF_CASTABLE(RawResultT, ResultT)> \
|
|
2284
|
+
static HWY_INLINE constexpr ResultT op_func(T1 a, T2 b) noexcept { \
|
|
2285
|
+
return static_cast<ResultT>(a op b.native); \
|
|
2286
|
+
}
|
|
2287
|
+
|
|
2288
|
+
#define HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(op, op_func, T1) \
|
|
2289
|
+
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(op, op_func, T1) \
|
|
2290
|
+
template < \
|
|
2291
|
+
typename T2, \
|
|
2292
|
+
hwy::EnableIf<hwy::IsInteger<RemoveCvRef<T2>>() || \
|
|
2293
|
+
hwy::IsFloat3264<RemoveCvRef<T2>>()>* = nullptr, \
|
|
2294
|
+
typename RawResultT = decltype(DeclVal<T1::Native>() op DeclVal<T2>()), \
|
|
2295
|
+
typename ResultT = detail::NativeSpecialFloatToWrapper<RawResultT>, \
|
|
2296
|
+
HWY_IF_CASTABLE(RawResultT, ResultT)> \
|
|
2297
|
+
static HWY_INLINE constexpr ResultT op_func(T1 a, T2 b) noexcept { \
|
|
2298
|
+
return static_cast<ResultT>(a.native op b); \
|
|
2299
|
+
}
|
|
2300
|
+
|
|
2301
|
+
#if HWY_HAVE_SCALAR_F16_OPERATORS
|
|
2302
|
+
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(+, operator+, float16_t)
|
|
2303
|
+
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(-, operator-, float16_t)
|
|
2304
|
+
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(*, operator*, float16_t)
|
|
2305
|
+
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(/, operator/, float16_t)
|
|
2306
|
+
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(==, operator==, float16_t)
|
|
2307
|
+
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(!=, operator!=, float16_t)
|
|
2308
|
+
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<, operator<, float16_t)
|
|
2309
|
+
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=, operator<=, float16_t)
|
|
2310
|
+
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>, operator>, float16_t)
|
|
2311
|
+
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>=, operator>=, float16_t)
|
|
2312
|
+
#if HWY_HAVE_CXX20_THREE_WAY_COMPARE
|
|
2313
|
+
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=>, operator<=>, float16_t)
|
|
2314
|
+
#endif
|
|
2315
|
+
#endif // HWY_HAVE_SCALAR_F16_OPERATORS
|
|
2316
|
+
|
|
2317
|
+
#if HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
2318
|
+
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(+, operator+, bfloat16_t)
|
|
2319
|
+
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(-, operator-, bfloat16_t)
|
|
2320
|
+
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(*, operator*, bfloat16_t)
|
|
2321
|
+
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(/, operator/, bfloat16_t)
|
|
2322
|
+
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(==, operator==, bfloat16_t)
|
|
2323
|
+
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(!=, operator!=, bfloat16_t)
|
|
2324
|
+
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<, operator<, bfloat16_t)
|
|
2325
|
+
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=, operator<=, bfloat16_t)
|
|
2326
|
+
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>, operator>, bfloat16_t)
|
|
2327
|
+
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>=, operator>=, bfloat16_t)
|
|
2328
|
+
#if HWY_HAVE_CXX20_THREE_WAY_COMPARE
|
|
2329
|
+
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=>, operator<=>, bfloat16_t)
|
|
2330
|
+
#endif
|
|
2331
|
+
#endif // HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
2332
|
+
|
|
2333
|
+
#undef HWY_RHS_SPECIAL_FLOAT_ARITH_OP
|
|
2334
|
+
#undef HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP
|
|
2335
|
+
|
|
2336
|
+
#endif // HWY_HAVE_SCALAR_F16_OPERATORS || HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
2337
|
+
|
|
2338
|
+
//------------------------------------------------------------------------------
|
|
2339
|
+
// Type conversions (after IsSpecialFloat)
|
|
2340
|
+
|
|
2341
|
+
HWY_API float F32FromF16Mem(const void* ptr) {
|
|
2342
|
+
float16_t f16;
|
|
2343
|
+
CopyBytes<2>(HWY_ASSUME_ALIGNED(ptr, 2), &f16);
|
|
2344
|
+
return F32FromF16(f16);
|
|
2345
|
+
}
|
|
2346
|
+
|
|
2347
|
+
HWY_API float F32FromBF16Mem(const void* ptr) {
|
|
2348
|
+
bfloat16_t bf;
|
|
2349
|
+
CopyBytes<2>(HWY_ASSUME_ALIGNED(ptr, 2), &bf);
|
|
2350
|
+
return F32FromBF16(bf);
|
|
2351
|
+
}
|
|
2352
|
+
|
|
2353
|
+
#if HWY_HAVE_SCALAR_F16_OPERATORS
|
|
2354
|
+
#define HWY_BF16_TO_F16_CONSTEXPR HWY_BF16_CONSTEXPR
|
|
2355
|
+
#else
|
|
2356
|
+
#define HWY_BF16_TO_F16_CONSTEXPR HWY_F16_CONSTEXPR
|
|
2357
|
+
#endif
|
|
2358
|
+
|
|
2359
|
+
// For casting from TFrom to TTo
|
|
2360
|
+
template <typename TTo, typename TFrom, HWY_IF_NOT_SPECIAL_FLOAT(TTo),
|
|
2361
|
+
HWY_IF_NOT_SPECIAL_FLOAT(TFrom), HWY_IF_NOT_SAME(TTo, TFrom)>
|
|
2362
|
+
HWY_API constexpr TTo ConvertScalarTo(const TFrom in) {
|
|
2363
|
+
return static_cast<TTo>(in);
|
|
2364
|
+
}
|
|
2365
|
+
template <typename TTo, typename TFrom, HWY_IF_F16(TTo),
|
|
2366
|
+
HWY_IF_NOT_SPECIAL_FLOAT(TFrom), HWY_IF_NOT_SAME(TFrom, double)>
|
|
2367
|
+
HWY_API constexpr TTo ConvertScalarTo(const TFrom in) {
|
|
2368
|
+
return F16FromF32(static_cast<float>(in));
|
|
2369
|
+
}
|
|
2370
|
+
template <typename TTo, HWY_IF_F16(TTo)>
|
|
2371
|
+
HWY_API HWY_BF16_TO_F16_CONSTEXPR TTo
|
|
2372
|
+
ConvertScalarTo(const hwy::bfloat16_t in) {
|
|
2373
|
+
return F16FromF32(F32FromBF16(in));
|
|
2374
|
+
}
|
|
2375
|
+
template <typename TTo, HWY_IF_F16(TTo)>
|
|
2376
|
+
HWY_API HWY_F16_CONSTEXPR TTo ConvertScalarTo(const double in) {
|
|
2377
|
+
return F16FromF64(in);
|
|
2378
|
+
}
|
|
2379
|
+
template <typename TTo, typename TFrom, HWY_IF_BF16(TTo),
|
|
2380
|
+
HWY_IF_NOT_SPECIAL_FLOAT(TFrom), HWY_IF_NOT_SAME(TFrom, double)>
|
|
2381
|
+
HWY_API HWY_BF16_CONSTEXPR TTo ConvertScalarTo(const TFrom in) {
|
|
2382
|
+
return BF16FromF32(static_cast<float>(in));
|
|
2383
|
+
}
|
|
2384
|
+
template <typename TTo, HWY_IF_BF16(TTo)>
|
|
2385
|
+
HWY_API HWY_BF16_TO_F16_CONSTEXPR TTo ConvertScalarTo(const hwy::float16_t in) {
|
|
2386
|
+
return BF16FromF32(F32FromF16(in));
|
|
2387
|
+
}
|
|
2388
|
+
template <typename TTo, HWY_IF_BF16(TTo)>
|
|
2389
|
+
HWY_API HWY_BF16_CONSTEXPR TTo ConvertScalarTo(const double in) {
|
|
2390
|
+
return BF16FromF64(in);
|
|
2391
|
+
}
|
|
2392
|
+
template <typename TTo, typename TFrom, HWY_IF_F16(TFrom),
|
|
2393
|
+
HWY_IF_NOT_SPECIAL_FLOAT(TTo)>
|
|
2394
|
+
HWY_API HWY_F16_CONSTEXPR TTo ConvertScalarTo(const TFrom in) {
|
|
2395
|
+
return static_cast<TTo>(F32FromF16(in));
|
|
2396
|
+
}
|
|
2397
|
+
template <typename TTo, typename TFrom, HWY_IF_BF16(TFrom),
|
|
2398
|
+
HWY_IF_NOT_SPECIAL_FLOAT(TTo)>
|
|
2399
|
+
HWY_API HWY_BF16_CONSTEXPR TTo ConvertScalarTo(TFrom in) {
|
|
2400
|
+
return static_cast<TTo>(F32FromBF16(in));
|
|
2401
|
+
}
|
|
2402
|
+
// Same: return unchanged
|
|
2403
|
+
template <typename TTo>
|
|
2404
|
+
HWY_API constexpr TTo ConvertScalarTo(TTo in) {
|
|
2405
|
+
return in;
|
|
2406
|
+
}
|
|
2407
|
+
|
|
1336
2408
|
//------------------------------------------------------------------------------
|
|
1337
2409
|
// Helper functions
|
|
1338
2410
|
|
|
@@ -1348,6 +2420,7 @@ constexpr inline size_t RoundUpTo(size_t what, size_t align) {
|
|
|
1348
2420
|
|
|
1349
2421
|
// Undefined results for x == 0.
|
|
1350
2422
|
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
|
|
2423
|
+
HWY_DASSERT(x != 0);
|
|
1351
2424
|
#if HWY_COMPILER_MSVC
|
|
1352
2425
|
unsigned long index; // NOLINT
|
|
1353
2426
|
_BitScanForward(&index, x);
|
|
@@ -1358,6 +2431,7 @@ HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
|
|
|
1358
2431
|
}
|
|
1359
2432
|
|
|
1360
2433
|
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) {
|
|
2434
|
+
HWY_DASSERT(x != 0);
|
|
1361
2435
|
#if HWY_COMPILER_MSVC
|
|
1362
2436
|
#if HWY_ARCH_X86_64
|
|
1363
2437
|
unsigned long index; // NOLINT
|
|
@@ -1383,6 +2457,7 @@ HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) {
|
|
|
1383
2457
|
|
|
1384
2458
|
// Undefined results for x == 0.
|
|
1385
2459
|
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x) {
|
|
2460
|
+
HWY_DASSERT(x != 0);
|
|
1386
2461
|
#if HWY_COMPILER_MSVC
|
|
1387
2462
|
unsigned long index; // NOLINT
|
|
1388
2463
|
_BitScanReverse(&index, x);
|
|
@@ -1393,6 +2468,7 @@ HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x) {
|
|
|
1393
2468
|
}
|
|
1394
2469
|
|
|
1395
2470
|
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
|
|
2471
|
+
HWY_DASSERT(x != 0);
|
|
1396
2472
|
#if HWY_COMPILER_MSVC
|
|
1397
2473
|
#if HWY_ARCH_X86_64
|
|
1398
2474
|
unsigned long index; // NOLINT
|
|
@@ -1416,26 +2492,48 @@ HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
|
|
|
1416
2492
|
#endif // HWY_COMPILER_MSVC
|
|
1417
2493
|
}
|
|
1418
2494
|
|
|
1419
|
-
|
|
1420
|
-
|
|
1421
|
-
|
|
1422
|
-
|
|
1423
|
-
|
|
1424
|
-
|
|
1425
|
-
|
|
2495
|
+
template <class T, HWY_IF_INTEGER(RemoveCvRef<T>),
|
|
2496
|
+
HWY_IF_T_SIZE_ONE_OF(RemoveCvRef<T>, (1 << 1) | (1 << 2) | (1 << 4))>
|
|
2497
|
+
HWY_API size_t PopCount(T x) {
|
|
2498
|
+
uint32_t u32_x = static_cast<uint32_t>(
|
|
2499
|
+
static_cast<UnsignedFromSize<sizeof(RemoveCvRef<T>)>>(x));
|
|
2500
|
+
|
|
2501
|
+
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
|
|
2502
|
+
return static_cast<size_t>(__builtin_popcountl(u32_x));
|
|
2503
|
+
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
|
|
2504
|
+
return static_cast<size_t>(_mm_popcnt_u32(u32_x));
|
|
2505
|
+
#else
|
|
2506
|
+
u32_x -= ((u32_x >> 1) & 0x55555555u);
|
|
2507
|
+
u32_x = (((u32_x >> 2) & 0x33333333u) + (u32_x & 0x33333333u));
|
|
2508
|
+
u32_x = (((u32_x >> 4) + u32_x) & 0x0F0F0F0Fu);
|
|
2509
|
+
u32_x += (u32_x >> 8);
|
|
2510
|
+
u32_x += (u32_x >> 16);
|
|
2511
|
+
return static_cast<size_t>(u32_x & 0x3Fu);
|
|
2512
|
+
#endif
|
|
2513
|
+
}
|
|
2514
|
+
|
|
2515
|
+
template <class T, HWY_IF_INTEGER(RemoveCvRef<T>),
|
|
2516
|
+
HWY_IF_T_SIZE(RemoveCvRef<T>, 8)>
|
|
2517
|
+
HWY_API size_t PopCount(T x) {
|
|
2518
|
+
uint64_t u64_x = static_cast<uint64_t>(
|
|
2519
|
+
static_cast<UnsignedFromSize<sizeof(RemoveCvRef<T>)>>(x));
|
|
2520
|
+
|
|
2521
|
+
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
|
|
2522
|
+
return static_cast<size_t>(__builtin_popcountll(u64_x));
|
|
1426
2523
|
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 && defined(__AVX__)
|
|
1427
|
-
return _mm_popcnt_u64(
|
|
2524
|
+
return _mm_popcnt_u64(u64_x);
|
|
1428
2525
|
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
|
|
1429
|
-
return _mm_popcnt_u32(static_cast<uint32_t>(
|
|
1430
|
-
_mm_popcnt_u32(static_cast<uint32_t>(
|
|
2526
|
+
return _mm_popcnt_u32(static_cast<uint32_t>(u64_x & 0xFFFFFFFFu)) +
|
|
2527
|
+
_mm_popcnt_u32(static_cast<uint32_t>(u64_x >> 32));
|
|
1431
2528
|
#else
|
|
1432
|
-
|
|
1433
|
-
|
|
1434
|
-
|
|
1435
|
-
|
|
1436
|
-
|
|
1437
|
-
|
|
1438
|
-
|
|
2529
|
+
u64_x -= ((u64_x >> 1) & 0x5555555555555555ULL);
|
|
2530
|
+
u64_x = (((u64_x >> 2) & 0x3333333333333333ULL) +
|
|
2531
|
+
(u64_x & 0x3333333333333333ULL));
|
|
2532
|
+
u64_x = (((u64_x >> 4) + u64_x) & 0x0F0F0F0F0F0F0F0FULL);
|
|
2533
|
+
u64_x += (u64_x >> 8);
|
|
2534
|
+
u64_x += (u64_x >> 16);
|
|
2535
|
+
u64_x += (u64_x >> 32);
|
|
2536
|
+
return static_cast<size_t>(u64_x & 0x7Fu);
|
|
1439
2537
|
#endif
|
|
1440
2538
|
}
|
|
1441
2539
|
|
|
@@ -1456,18 +2554,28 @@ template <typename TI>
|
|
|
1456
2554
|
: static_cast<size_t>(FloorLog2(static_cast<TI>(x - 1)) + 1);
|
|
1457
2555
|
}
|
|
1458
2556
|
|
|
1459
|
-
template <typename T>
|
|
1460
|
-
HWY_INLINE constexpr T AddWithWraparound(
|
|
1461
|
-
return t + static_cast<T>(
|
|
2557
|
+
template <typename T, typename T2, HWY_IF_FLOAT(T), HWY_IF_NOT_SPECIAL_FLOAT(T)>
|
|
2558
|
+
HWY_INLINE constexpr T AddWithWraparound(T t, T2 increment) {
|
|
2559
|
+
return t + static_cast<T>(increment);
|
|
1462
2560
|
}
|
|
1463
2561
|
|
|
1464
|
-
template <typename T>
|
|
1465
|
-
HWY_INLINE constexpr T AddWithWraparound(
|
|
1466
|
-
|
|
2562
|
+
template <typename T, typename T2, HWY_IF_SPECIAL_FLOAT(T)>
|
|
2563
|
+
HWY_INLINE constexpr T AddWithWraparound(T t, T2 increment) {
|
|
2564
|
+
return ConvertScalarTo<T>(ConvertScalarTo<float>(t) +
|
|
2565
|
+
ConvertScalarTo<float>(increment));
|
|
2566
|
+
}
|
|
2567
|
+
|
|
2568
|
+
template <typename T, typename T2, HWY_IF_NOT_FLOAT(T)>
|
|
2569
|
+
HWY_INLINE constexpr T AddWithWraparound(T t, T2 n) {
|
|
1467
2570
|
using TU = MakeUnsigned<T>;
|
|
1468
|
-
|
|
1469
|
-
|
|
1470
|
-
|
|
2571
|
+
// Sub-int types would promote to int, not unsigned, which would trigger
|
|
2572
|
+
// warnings, so first promote to the largest unsigned type. Due to
|
|
2573
|
+
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87519, which affected GCC 8
|
|
2574
|
+
// until fixed in 9.3, we use built-in types rather than uint64_t.
|
|
2575
|
+
return static_cast<T>(static_cast<TU>(
|
|
2576
|
+
static_cast<unsigned long long>(static_cast<unsigned long long>(t) +
|
|
2577
|
+
static_cast<unsigned long long>(n)) &
|
|
2578
|
+
uint64_t{hwy::LimitsMax<TU>()}));
|
|
1471
2579
|
}
|
|
1472
2580
|
|
|
1473
2581
|
#if HWY_COMPILER_MSVC && HWY_ARCH_X86_64
|
|
@@ -1494,7 +2602,120 @@ HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) {
|
|
|
1494
2602
|
#endif
|
|
1495
2603
|
}
|
|
1496
2604
|
|
|
2605
|
+
namespace detail {
|
|
2606
|
+
|
|
2607
|
+
template <typename T>
|
|
2608
|
+
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T ScalarAbs(hwy::FloatTag /*tag*/,
|
|
2609
|
+
T val) {
|
|
2610
|
+
using TU = MakeUnsigned<T>;
|
|
2611
|
+
return BitCastScalar<T>(
|
|
2612
|
+
static_cast<TU>(BitCastScalar<TU>(val) & (~SignMask<T>())));
|
|
2613
|
+
}
|
|
2614
|
+
|
|
2615
|
+
template <typename T>
|
|
2616
|
+
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T
|
|
2617
|
+
ScalarAbs(hwy::SpecialTag /*tag*/, T val) {
|
|
2618
|
+
return ScalarAbs(hwy::FloatTag(), val);
|
|
2619
|
+
}
|
|
2620
|
+
|
|
2621
|
+
template <typename T>
|
|
2622
|
+
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T
|
|
2623
|
+
ScalarAbs(hwy::SignedTag /*tag*/, T val) {
|
|
2624
|
+
using TU = MakeUnsigned<T>;
|
|
2625
|
+
return (val < T{0}) ? static_cast<T>(TU{0} - static_cast<TU>(val)) : val;
|
|
2626
|
+
}
|
|
2627
|
+
|
|
2628
|
+
template <typename T>
|
|
2629
|
+
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T
|
|
2630
|
+
ScalarAbs(hwy::UnsignedTag /*tag*/, T val) {
|
|
2631
|
+
return val;
|
|
2632
|
+
}
|
|
2633
|
+
|
|
2634
|
+
} // namespace detail
|
|
2635
|
+
|
|
2636
|
+
template <typename T>
|
|
2637
|
+
HWY_API HWY_BITCASTSCALAR_CONSTEXPR RemoveCvRef<T> ScalarAbs(T val) {
|
|
2638
|
+
using TVal = MakeLaneTypeIfInteger<
|
|
2639
|
+
detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>>;
|
|
2640
|
+
return detail::ScalarAbs(hwy::TypeTag<TVal>(), static_cast<TVal>(val));
|
|
2641
|
+
}
|
|
2642
|
+
|
|
2643
|
+
template <typename T>
|
|
2644
|
+
HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsNaN(T val) {
|
|
2645
|
+
using TF = detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>;
|
|
2646
|
+
using TU = MakeUnsigned<TF>;
|
|
2647
|
+
return (BitCastScalar<TU>(ScalarAbs(val)) > ExponentMask<TF>());
|
|
2648
|
+
}
|
|
2649
|
+
|
|
2650
|
+
template <typename T>
|
|
2651
|
+
HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsInf(T val) {
|
|
2652
|
+
using TF = detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>;
|
|
2653
|
+
using TU = MakeUnsigned<TF>;
|
|
2654
|
+
return static_cast<TU>(BitCastScalar<TU>(static_cast<TF>(val)) << 1) ==
|
|
2655
|
+
static_cast<TU>(MaxExponentTimes2<TF>());
|
|
2656
|
+
}
|
|
2657
|
+
|
|
2658
|
+
namespace detail {
|
|
2659
|
+
|
|
2660
|
+
template <typename T>
|
|
2661
|
+
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsFinite(
|
|
2662
|
+
hwy::FloatTag /*tag*/, T val) {
|
|
2663
|
+
using TU = MakeUnsigned<T>;
|
|
2664
|
+
return (BitCastScalar<TU>(hwy::ScalarAbs(val)) < ExponentMask<T>());
|
|
2665
|
+
}
|
|
2666
|
+
|
|
2667
|
+
template <typename T>
|
|
2668
|
+
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsFinite(
|
|
2669
|
+
hwy::NonFloatTag /*tag*/, T /*val*/) {
|
|
2670
|
+
// Integer values are always finite
|
|
2671
|
+
return true;
|
|
2672
|
+
}
|
|
2673
|
+
|
|
2674
|
+
} // namespace detail
|
|
2675
|
+
|
|
2676
|
+
template <typename T>
|
|
2677
|
+
HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsFinite(T val) {
|
|
2678
|
+
using TVal = MakeLaneTypeIfInteger<
|
|
2679
|
+
detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>>;
|
|
2680
|
+
return detail::ScalarIsFinite(hwy::IsFloatTag<TVal>(),
|
|
2681
|
+
static_cast<TVal>(val));
|
|
2682
|
+
}
|
|
2683
|
+
|
|
2684
|
+
template <typename T>
|
|
2685
|
+
HWY_API HWY_BITCASTSCALAR_CONSTEXPR RemoveCvRef<T> ScalarCopySign(T magn,
|
|
2686
|
+
T sign) {
|
|
2687
|
+
using TF = RemoveCvRef<detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>>;
|
|
2688
|
+
using TU = MakeUnsigned<TF>;
|
|
2689
|
+
return BitCastScalar<TF>(static_cast<TU>(
|
|
2690
|
+
(BitCastScalar<TU>(static_cast<TF>(magn)) & (~SignMask<TF>())) |
|
|
2691
|
+
(BitCastScalar<TU>(static_cast<TF>(sign)) & SignMask<TF>())));
|
|
2692
|
+
}
|
|
2693
|
+
|
|
2694
|
+
template <typename T>
|
|
2695
|
+
HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarSignBit(T val) {
|
|
2696
|
+
using TVal = MakeLaneTypeIfInteger<
|
|
2697
|
+
detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>>;
|
|
2698
|
+
using TU = MakeUnsigned<TVal>;
|
|
2699
|
+
return ((BitCastScalar<TU>(static_cast<TVal>(val)) & SignMask<TVal>()) != 0);
|
|
2700
|
+
}
|
|
2701
|
+
|
|
1497
2702
|
// Prevents the compiler from eliding the computations that led to "output".
|
|
2703
|
+
#if HWY_ARCH_PPC && (HWY_COMPILER_GCC || HWY_COMPILER_CLANG) && \
|
|
2704
|
+
!defined(_SOFT_FLOAT)
|
|
2705
|
+
// Workaround to avoid test failures on PPC if compiled with Clang
|
|
2706
|
+
template <class T, HWY_IF_F32(T)>
|
|
2707
|
+
HWY_API void PreventElision(T&& output) {
|
|
2708
|
+
asm volatile("" : "+f"(output)::"memory");
|
|
2709
|
+
}
|
|
2710
|
+
template <class T, HWY_IF_F64(T)>
|
|
2711
|
+
HWY_API void PreventElision(T&& output) {
|
|
2712
|
+
asm volatile("" : "+d"(output)::"memory");
|
|
2713
|
+
}
|
|
2714
|
+
template <class T, HWY_IF_NOT_FLOAT3264(T)>
|
|
2715
|
+
HWY_API void PreventElision(T&& output) {
|
|
2716
|
+
asm volatile("" : "+r"(output)::"memory");
|
|
2717
|
+
}
|
|
2718
|
+
#else
|
|
1498
2719
|
template <class T>
|
|
1499
2720
|
HWY_API void PreventElision(T&& output) {
|
|
1500
2721
|
#if HWY_COMPILER_MSVC
|
|
@@ -1502,8 +2723,8 @@ HWY_API void PreventElision(T&& output) {
|
|
|
1502
2723
|
// RTL constraints). Self-assignment with #pragma optimize("off") might be
|
|
1503
2724
|
// expected to prevent elision, but it does not with MSVC 2015. Type-punning
|
|
1504
2725
|
// with volatile pointers generates inefficient code on MSVC 2017.
|
|
1505
|
-
static std::atomic<
|
|
1506
|
-
|
|
2726
|
+
static std::atomic<RemoveCvRef<T>> sink;
|
|
2727
|
+
sink.store(output, std::memory_order_relaxed);
|
|
1507
2728
|
#else
|
|
1508
2729
|
// Works by indicating to the compiler that "output" is being read and
|
|
1509
2730
|
// modified. The +r constraint avoids unnecessary writes to memory, but only
|
|
@@ -1511,6 +2732,7 @@ HWY_API void PreventElision(T&& output) {
|
|
|
1511
2732
|
asm volatile("" : "+r"(output) : : "memory");
|
|
1512
2733
|
#endif
|
|
1513
2734
|
}
|
|
2735
|
+
#endif
|
|
1514
2736
|
|
|
1515
2737
|
} // namespace hwy
|
|
1516
2738
|
|