@img/sharp-libvips-dev 1.0.1 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -2
- package/include/aom/aom_decoder.h +1 -1
- package/include/aom/aom_encoder.h +7 -1
- package/include/aom/aom_image.h +24 -12
- package/include/aom/aom_integer.h +3 -3
- package/include/aom/aomcx.h +15 -0
- package/include/aom/aomdx.h +5 -2
- package/include/archive.h +7 -5
- package/include/archive_entry.h +5 -3
- package/include/cgif.h +3 -0
- package/include/expat.h +21 -10
- package/include/expat_config.h +11 -5
- package/include/ffi.h +12 -25
- package/include/freetype2/freetype/config/ftoption.h +2 -2
- package/include/fribidi/fribidi-config.h +2 -2
- package/include/fribidi/fribidi-unicode-version.h +3 -3
- package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -2
- package/include/glib-2.0/gio/gappinfo.h +40 -25
- package/include/glib-2.0/gio/gapplication.h +6 -0
- package/include/glib-2.0/gio/gasyncresult.h +1 -1
- package/include/glib-2.0/gio/gconverter.h +5 -0
- package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
- package/include/glib-2.0/gio/gfile.h +16 -0
- package/include/glib-2.0/gio/gio-visibility.h +34 -0
- package/include/glib-2.0/gio/giotypes.h +0 -1
- package/include/glib-2.0/gio/gsettings.h +8 -0
- package/include/glib-2.0/gio/gvfs.h +2 -2
- package/include/glib-2.0/girepository/gi-visibility.h +34 -0
- package/include/glib-2.0/girepository/giarginfo.h +23 -6
- package/include/glib-2.0/girepository/gibaseinfo.h +44 -18
- package/include/glib-2.0/girepository/gicallableinfo.h +26 -16
- package/include/glib-2.0/girepository/gicallbackinfo.h +17 -2
- package/include/glib-2.0/girepository/giconstantinfo.h +19 -4
- package/include/glib-2.0/girepository/gienuminfo.h +20 -21
- package/include/glib-2.0/girepository/gifieldinfo.h +22 -7
- package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
- package/include/glib-2.0/girepository/gifunctioninfo.h +22 -7
- package/include/glib-2.0/girepository/giinterfaceinfo.h +33 -18
- package/include/glib-2.0/girepository/giobjectinfo.h +41 -26
- package/include/glib-2.0/girepository/gipropertyinfo.h +18 -3
- package/include/glib-2.0/girepository/giregisteredtypeinfo.h +22 -11
- package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
- package/include/glib-2.0/girepository/girepository.h +53 -62
- package/include/glib-2.0/girepository/girffi.h +8 -7
- package/include/glib-2.0/girepository/gisignalinfo.h +18 -3
- package/include/glib-2.0/girepository/gistructinfo.h +26 -11
- package/include/glib-2.0/girepository/gitypeinfo.h +29 -16
- package/include/glib-2.0/girepository/gitypelib.h +9 -13
- package/include/glib-2.0/girepository/gitypes.h +52 -104
- package/include/glib-2.0/girepository/giunioninfo.h +28 -12
- package/include/glib-2.0/girepository/giunresolvedinfo.h +17 -2
- package/include/glib-2.0/girepository/givalueinfo.h +65 -0
- package/include/glib-2.0/girepository/givfuncinfo.h +23 -8
- package/include/glib-2.0/glib/deprecated/gthread.h +9 -5
- package/include/glib-2.0/glib/gbitlock.h +31 -0
- package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
- package/include/glib-2.0/glib/giochannel.h +2 -2
- package/include/glib-2.0/glib/glib-visibility.h +34 -0
- package/include/glib-2.0/glib/gmacros.h +12 -5
- package/include/glib-2.0/glib/gmain.h +93 -7
- package/include/glib-2.0/glib/gmessages.h +8 -0
- package/include/glib-2.0/glib/gqsort.h +8 -1
- package/include/glib-2.0/glib/gslice.h +2 -0
- package/include/glib-2.0/glib/gstrfuncs.h +24 -30
- package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
- package/include/glib-2.0/glib/gthread.h +191 -3
- package/include/glib-2.0/glib/gunicode.h +1 -1
- package/include/glib-2.0/glib/gversionmacros.h +9 -0
- package/include/glib-2.0/glib-unix.h +7 -1
- package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
- package/include/glib-2.0/gobject/genums.h +6 -6
- package/include/glib-2.0/gobject/glib-types.h +11 -0
- package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
- package/include/glib-2.0/gobject/gsignal.h +16 -6
- package/include/glib-2.0/gobject/gtype.h +6 -6
- package/include/harfbuzz/hb-buffer.h +6 -0
- package/include/harfbuzz/hb-common.h +6 -9
- package/include/harfbuzz/hb-cplusplus.hh +8 -11
- package/include/harfbuzz/hb-subset.h +17 -4
- package/include/harfbuzz/hb-version.h +3 -3
- package/include/hwy/abort.h +28 -0
- package/include/hwy/aligned_allocator.h +218 -6
- package/include/hwy/base.h +1935 -512
- package/include/hwy/cache_control.h +24 -6
- package/include/hwy/detect_compiler_arch.h +105 -10
- package/include/hwy/detect_targets.h +146 -37
- package/include/hwy/foreach_target.h +36 -1
- package/include/hwy/highway.h +222 -50
- package/include/hwy/ops/arm_neon-inl.h +2055 -894
- package/include/hwy/ops/arm_sve-inl.h +1476 -348
- package/include/hwy/ops/emu128-inl.h +711 -623
- package/include/hwy/ops/generic_ops-inl.h +4431 -2157
- package/include/hwy/ops/inside-inl.h +691 -0
- package/include/hwy/ops/ppc_vsx-inl.h +2186 -673
- package/include/hwy/ops/rvv-inl.h +1556 -536
- package/include/hwy/ops/scalar-inl.h +353 -233
- package/include/hwy/ops/set_macros-inl.h +171 -23
- package/include/hwy/ops/shared-inl.h +198 -56
- package/include/hwy/ops/wasm_128-inl.h +283 -244
- package/include/hwy/ops/x86_128-inl.h +3673 -1357
- package/include/hwy/ops/x86_256-inl.h +1737 -663
- package/include/hwy/ops/x86_512-inl.h +1697 -500
- package/include/hwy/per_target.h +4 -0
- package/include/hwy/profiler.h +648 -0
- package/include/hwy/robust_statistics.h +2 -2
- package/include/hwy/targets.h +40 -32
- package/include/hwy/timer-inl.h +3 -3
- package/include/hwy/timer.h +16 -1
- package/include/libheif/heif.h +170 -15
- package/include/libheif/heif_items.h +237 -0
- package/include/libheif/heif_properties.h +38 -2
- package/include/libheif/heif_regions.h +1 -1
- package/include/libheif/heif_version.h +2 -2
- package/include/libpng16/png.h +32 -29
- package/include/libpng16/pngconf.h +2 -2
- package/include/libpng16/pnglibconf.h +8 -3
- package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
- package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
- package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
- package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
- package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
- package/include/libxml2/libxml/HTMLparser.h +12 -19
- package/include/libxml2/libxml/c14n.h +1 -12
- package/include/libxml2/libxml/debugXML.h +1 -1
- package/include/libxml2/libxml/encoding.h +9 -0
- package/include/libxml2/libxml/entities.h +12 -1
- package/include/libxml2/libxml/hash.h +19 -0
- package/include/libxml2/libxml/list.h +2 -2
- package/include/libxml2/libxml/nanohttp.h +17 -0
- package/include/libxml2/libxml/parser.h +73 -58
- package/include/libxml2/libxml/parserInternals.h +9 -1
- package/include/libxml2/libxml/pattern.h +6 -0
- package/include/libxml2/libxml/tree.h +32 -12
- package/include/libxml2/libxml/uri.h +11 -0
- package/include/libxml2/libxml/valid.h +29 -2
- package/include/libxml2/libxml/xinclude.h +7 -0
- package/include/libxml2/libxml/xmlIO.h +21 -5
- package/include/libxml2/libxml/xmlerror.h +14 -0
- package/include/libxml2/libxml/xmlexports.h +111 -15
- package/include/libxml2/libxml/xmlmemory.h +8 -45
- package/include/libxml2/libxml/xmlreader.h +2 -0
- package/include/libxml2/libxml/xmlsave.h +5 -0
- package/include/libxml2/libxml/xmlunicode.h +165 -1
- package/include/libxml2/libxml/xmlversion.h +15 -179
- package/include/libxml2/libxml/xmlwriter.h +1 -0
- package/include/libxml2/libxml/xpath.h +4 -0
- package/include/pango-1.0/pango/pango-features.h +2 -2
- package/include/pango-1.0/pango/pango-fontmap.h +7 -0
- package/include/pango-1.0/pango/pango-item.h +4 -2
- package/include/pango-1.0/pango/pango-version-macros.h +25 -0
- package/include/pango-1.0/pango/pangofc-font.h +2 -1
- package/include/pixman-1/pixman-version.h +2 -2
- package/include/png.h +32 -29
- package/include/pngconf.h +2 -2
- package/include/pnglibconf.h +8 -3
- package/include/vips/connection.h +9 -3
- package/include/vips/util.h +1 -11
- package/include/vips/version.h +4 -4
- package/include/webp/decode.h +58 -56
- package/include/webp/demux.h +25 -21
- package/include/webp/encode.h +44 -39
- package/include/webp/mux.h +76 -15
- package/include/webp/mux_types.h +2 -1
- package/include/webp/sharpyuv/sharpyuv.h +77 -8
- package/include/webp/types.h +29 -8
- package/include/zconf.h +1 -1
- package/include/zlib.h +12 -12
- package/package.json +1 -1
- package/versions.json +18 -19
package/include/hwy/base.h
CHANGED
|
@@ -16,22 +16,27 @@
|
|
|
16
16
|
#ifndef HIGHWAY_HWY_BASE_H_
|
|
17
17
|
#define HIGHWAY_HWY_BASE_H_
|
|
18
18
|
|
|
19
|
-
//
|
|
19
|
+
// Target-independent definitions.
|
|
20
20
|
|
|
21
21
|
// IWYU pragma: begin_exports
|
|
22
22
|
#include <stddef.h>
|
|
23
23
|
#include <stdint.h>
|
|
24
24
|
|
|
25
|
-
// Wrapping this into a HWY_HAS_INCLUDE causes clang-format to fail.
|
|
26
|
-
#if __cplusplus >= 202100L && defined(__has_include)
|
|
27
|
-
#if __has_include(<stdfloat>)
|
|
28
|
-
#include <stdfloat> // std::float16_t
|
|
29
|
-
#endif
|
|
30
|
-
#endif
|
|
31
|
-
|
|
32
25
|
#include "hwy/detect_compiler_arch.h"
|
|
33
26
|
#include "hwy/highway_export.h"
|
|
34
27
|
|
|
28
|
+
// API version (https://semver.org/); keep in sync with CMakeLists.txt.
|
|
29
|
+
#define HWY_MAJOR 1
|
|
30
|
+
#define HWY_MINOR 2
|
|
31
|
+
#define HWY_PATCH 0
|
|
32
|
+
|
|
33
|
+
// True if the Highway version >= major.minor.0. Added in 1.2.0.
|
|
34
|
+
#define HWY_VERSION_GE(major, minor) \
|
|
35
|
+
(HWY_MAJOR > (major) || (HWY_MAJOR == (major) && HWY_MINOR >= (minor)))
|
|
36
|
+
// True if the Highway version < major.minor.0. Added in 1.2.0.
|
|
37
|
+
#define HWY_VERSION_LT(major, minor) \
|
|
38
|
+
(HWY_MAJOR < (major) || (HWY_MAJOR == (major) && HWY_MINOR < (minor)))
|
|
39
|
+
|
|
35
40
|
// "IWYU pragma: keep" does not work for these includes, so hide from the IDE.
|
|
36
41
|
#if !HWY_IDE
|
|
37
42
|
|
|
@@ -48,6 +53,26 @@
|
|
|
48
53
|
|
|
49
54
|
#endif // !HWY_IDE
|
|
50
55
|
|
|
56
|
+
#ifndef HWY_HAVE_COMPARE_HEADER // allow override
|
|
57
|
+
#define HWY_HAVE_COMPARE_HEADER 0
|
|
58
|
+
#if defined(__has_include) // note: wrapper macro fails on Clang ~17
|
|
59
|
+
#if __has_include(<compare>)
|
|
60
|
+
#undef HWY_HAVE_COMPARE_HEADER
|
|
61
|
+
#define HWY_HAVE_COMPARE_HEADER 1
|
|
62
|
+
#endif // __has_include
|
|
63
|
+
#endif // defined(__has_include)
|
|
64
|
+
#endif // HWY_HAVE_COMPARE_HEADER
|
|
65
|
+
|
|
66
|
+
#ifndef HWY_HAVE_CXX20_THREE_WAY_COMPARE // allow override
|
|
67
|
+
#if !defined(HWY_NO_LIBCXX) && defined(__cpp_impl_three_way_comparison) && \
|
|
68
|
+
__cpp_impl_three_way_comparison >= 201907L && HWY_HAVE_COMPARE_HEADER
|
|
69
|
+
#include <compare>
|
|
70
|
+
#define HWY_HAVE_CXX20_THREE_WAY_COMPARE 1
|
|
71
|
+
#else
|
|
72
|
+
#define HWY_HAVE_CXX20_THREE_WAY_COMPARE 0
|
|
73
|
+
#endif
|
|
74
|
+
#endif // HWY_HAVE_CXX20_THREE_WAY_COMPARE
|
|
75
|
+
|
|
51
76
|
// IWYU pragma: end_exports
|
|
52
77
|
|
|
53
78
|
#if HWY_COMPILER_MSVC
|
|
@@ -64,6 +89,7 @@
|
|
|
64
89
|
|
|
65
90
|
#include <intrin.h>
|
|
66
91
|
|
|
92
|
+
#define HWY_FUNCTION __FUNCSIG__ // function name + template args
|
|
67
93
|
#define HWY_RESTRICT __restrict
|
|
68
94
|
#define HWY_INLINE __forceinline
|
|
69
95
|
#define HWY_NOINLINE __declspec(noinline)
|
|
@@ -84,6 +110,7 @@
|
|
|
84
110
|
|
|
85
111
|
#else
|
|
86
112
|
|
|
113
|
+
#define HWY_FUNCTION __PRETTY_FUNCTION__ // function name + template args
|
|
87
114
|
#define HWY_RESTRICT __restrict__
|
|
88
115
|
// force inlining without optimization enabled creates very inefficient code
|
|
89
116
|
// that can cause compiler timeout
|
|
@@ -131,6 +158,11 @@ namespace hwy {
|
|
|
131
158
|
#define HWY_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */
|
|
132
159
|
#endif
|
|
133
160
|
|
|
161
|
+
// Returns a pointer whose type is `type` (T*), while allowing the compiler to
|
|
162
|
+
// assume that the untyped pointer `ptr` is aligned to a multiple of sizeof(T).
|
|
163
|
+
#define HWY_RCAST_ALIGNED(type, ptr) \
|
|
164
|
+
reinterpret_cast<type>(HWY_ASSUME_ALIGNED((ptr), alignof(RemovePtr<type>)))
|
|
165
|
+
|
|
134
166
|
// Clang and GCC require attributes on each function into which SIMD intrinsics
|
|
135
167
|
// are inlined. Support both per-function annotation (HWY_ATTR) for lambdas and
|
|
136
168
|
// automatic annotation via pragmas.
|
|
@@ -228,24 +260,41 @@ HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
|
|
|
228
260
|
} \
|
|
229
261
|
} while (0)
|
|
230
262
|
|
|
231
|
-
#if HWY_HAS_FEATURE(memory_sanitizer) || defined(MEMORY_SANITIZER)
|
|
263
|
+
#if HWY_HAS_FEATURE(memory_sanitizer) || defined(MEMORY_SANITIZER) || \
|
|
264
|
+
defined(__SANITIZE_MEMORY__)
|
|
232
265
|
#define HWY_IS_MSAN 1
|
|
233
266
|
#else
|
|
234
267
|
#define HWY_IS_MSAN 0
|
|
235
268
|
#endif
|
|
236
269
|
|
|
237
|
-
#if HWY_HAS_FEATURE(address_sanitizer) || defined(ADDRESS_SANITIZER)
|
|
270
|
+
#if HWY_HAS_FEATURE(address_sanitizer) || defined(ADDRESS_SANITIZER) || \
|
|
271
|
+
defined(__SANITIZE_ADDRESS__)
|
|
238
272
|
#define HWY_IS_ASAN 1
|
|
239
273
|
#else
|
|
240
274
|
#define HWY_IS_ASAN 0
|
|
241
275
|
#endif
|
|
242
276
|
|
|
243
|
-
#if HWY_HAS_FEATURE(
|
|
277
|
+
#if HWY_HAS_FEATURE(hwaddress_sanitizer) || defined(HWADDRESS_SANITIZER) || \
|
|
278
|
+
defined(__SANITIZE_HWADDRESS__)
|
|
279
|
+
#define HWY_IS_HWASAN 1
|
|
280
|
+
#else
|
|
281
|
+
#define HWY_IS_HWASAN 0
|
|
282
|
+
#endif
|
|
283
|
+
|
|
284
|
+
#if HWY_HAS_FEATURE(thread_sanitizer) || defined(THREAD_SANITIZER) || \
|
|
285
|
+
defined(__SANITIZE_THREAD__)
|
|
244
286
|
#define HWY_IS_TSAN 1
|
|
245
287
|
#else
|
|
246
288
|
#define HWY_IS_TSAN 0
|
|
247
289
|
#endif
|
|
248
290
|
|
|
291
|
+
#if HWY_HAS_FEATURE(undefined_behavior_sanitizer) || \
|
|
292
|
+
defined(UNDEFINED_BEHAVIOR_SANITIZER)
|
|
293
|
+
#define HWY_IS_UBSAN 1
|
|
294
|
+
#else
|
|
295
|
+
#define HWY_IS_UBSAN 0
|
|
296
|
+
#endif
|
|
297
|
+
|
|
249
298
|
// MSAN may cause lengthy build times or false positives e.g. in AVX3 DemoteTo.
|
|
250
299
|
// You can disable MSAN by adding this attribute to the function that fails.
|
|
251
300
|
#if HWY_IS_MSAN
|
|
@@ -259,7 +308,8 @@ HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
|
|
|
259
308
|
// Clang does not define NDEBUG, but it and GCC define __OPTIMIZE__, and recent
|
|
260
309
|
// MSVC defines NDEBUG (if not, could instead check _DEBUG).
|
|
261
310
|
#if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) || HWY_IS_ASAN || \
|
|
262
|
-
HWY_IS_MSAN || HWY_IS_TSAN ||
|
|
311
|
+
HWY_IS_HWASAN || HWY_IS_MSAN || HWY_IS_TSAN || HWY_IS_UBSAN || \
|
|
312
|
+
defined(__clang_analyzer__)
|
|
263
313
|
#define HWY_IS_DEBUG_BUILD 1
|
|
264
314
|
#else
|
|
265
315
|
#define HWY_IS_DEBUG_BUILD 0
|
|
@@ -282,14 +332,12 @@ HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
|
|
|
282
332
|
#pragma intrinsic(memset)
|
|
283
333
|
#endif
|
|
284
334
|
|
|
285
|
-
// The source/destination must not overlap/alias.
|
|
286
335
|
template <size_t kBytes, typename From, typename To>
|
|
287
|
-
HWY_API void CopyBytes(const From* from, To* to) {
|
|
336
|
+
HWY_API void CopyBytes(const From* HWY_RESTRICT from, To* HWY_RESTRICT to) {
|
|
288
337
|
#if HWY_COMPILER_MSVC
|
|
289
338
|
memcpy(to, from, kBytes);
|
|
290
339
|
#else
|
|
291
|
-
__builtin_memcpy(
|
|
292
|
-
kBytes);
|
|
340
|
+
__builtin_memcpy(to, from, kBytes);
|
|
293
341
|
#endif
|
|
294
342
|
}
|
|
295
343
|
|
|
@@ -331,7 +379,7 @@ HWY_API void ZeroBytes(void* to, size_t num_bytes) {
|
|
|
331
379
|
|
|
332
380
|
#if HWY_ARCH_X86
|
|
333
381
|
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 64; // AVX-512
|
|
334
|
-
#elif
|
|
382
|
+
#elif HWY_ARCH_RISCV && defined(__riscv_v_intrinsic) && \
|
|
335
383
|
__riscv_v_intrinsic >= 11000
|
|
336
384
|
// Not actually an upper bound on the size.
|
|
337
385
|
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 4096;
|
|
@@ -347,7 +395,7 @@ static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
|
|
|
347
395
|
// exceed the stack size.
|
|
348
396
|
#if HWY_ARCH_X86
|
|
349
397
|
#define HWY_ALIGN_MAX alignas(64)
|
|
350
|
-
#elif
|
|
398
|
+
#elif HWY_ARCH_RISCV && defined(__riscv_v_intrinsic) && \
|
|
351
399
|
__riscv_v_intrinsic >= 11000
|
|
352
400
|
#define HWY_ALIGN_MAX alignas(8) // only elements need be aligned
|
|
353
401
|
#else
|
|
@@ -357,349 +405,11 @@ static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
|
|
|
357
405
|
//------------------------------------------------------------------------------
|
|
358
406
|
// Lane types
|
|
359
407
|
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
// float16_t
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
#if (HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC) || \
|
|
366
|
-
(HWY_COMPILER_CLANG && defined(__ARM_FP) && (__ARM_FP & 2)) || \
|
|
367
|
-
(HWY_COMPILER_GCC_ACTUAL && defined(__ARM_FP16_FORMAT_IEEE))
|
|
368
|
-
#define HWY_NEON_HAVE_FLOAT16C 1
|
|
369
|
-
#else
|
|
370
|
-
#define HWY_NEON_HAVE_FLOAT16C 0
|
|
371
|
-
#endif
|
|
372
|
-
|
|
373
|
-
// C11 extension ISO/IEC TS 18661-3:2015 but not supported on all targets.
|
|
374
|
-
// Required if HWY_HAVE_FLOAT16, i.e. RVV with zvfh or AVX3_SPR (with
|
|
375
|
-
// sufficiently new compiler supporting avx512fp16). Do not use on clang-cl,
|
|
376
|
-
// which is missing __extendhfsf2.
|
|
377
|
-
#if ((HWY_ARCH_RVV && defined(__riscv_zvfh) && HWY_COMPILER_CLANG) || \
|
|
378
|
-
(HWY_ARCH_X86 && defined(__SSE2__) && \
|
|
379
|
-
((HWY_COMPILER_CLANG >= 1600 && !HWY_COMPILER_CLANGCL) || \
|
|
380
|
-
HWY_COMPILER_GCC_ACTUAL >= 1200)))
|
|
381
|
-
#define HWY_HAVE_C11_FLOAT16 1
|
|
382
|
-
#else
|
|
383
|
-
#define HWY_HAVE_C11_FLOAT16 0
|
|
384
|
-
#endif
|
|
385
|
-
|
|
386
|
-
// If 1, both __bf16 and a limited set of *_bf16 SVE intrinsics are available:
|
|
387
|
-
// create/get/set/dup, ld/st, sel, rev, trn, uzp, zip.
|
|
388
|
-
#if HWY_ARCH_ARM_A64 && defined(__ARM_FEATURE_SVE_BF16)
|
|
389
|
-
#define HWY_SVE_HAVE_BFLOAT16 1
|
|
390
|
-
#else
|
|
391
|
-
#define HWY_SVE_HAVE_BFLOAT16 0
|
|
392
|
-
#endif
|
|
393
|
-
|
|
394
|
-
// Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
|
|
395
|
-
// by concatenating base type and bits. We use a wrapper class instead of a
|
|
396
|
-
// typedef to the native type to ensure that the same symbols, e.g. for VQSort,
|
|
397
|
-
// are generated regardless of F16 support; see #1684.
|
|
398
|
-
struct float16_t {
|
|
399
|
-
#if HWY_NEON_HAVE_FLOAT16C // ACLE's __fp16
|
|
400
|
-
using Raw = __fp16;
|
|
401
|
-
#elif HWY_HAVE_C11_FLOAT16 // C11 _Float16
|
|
402
|
-
using Raw = _Float16;
|
|
403
|
-
#elif __cplusplus > 202002L && defined(__STDCPP_FLOAT16_T__) // C++23
|
|
404
|
-
using Raw = std::float16_t;
|
|
405
|
-
#else
|
|
406
|
-
#define HWY_EMULATE_FLOAT16
|
|
407
|
-
using Raw = uint16_t;
|
|
408
|
-
Raw bits;
|
|
409
|
-
#endif // float16_t
|
|
410
|
-
|
|
411
|
-
// When backed by a native type, ensure the wrapper behaves like the native
|
|
412
|
-
// type by forwarding all operators. Unfortunately it seems difficult to reuse
|
|
413
|
-
// this code in a base class, so we repeat it in bfloat16_t.
|
|
414
|
-
#ifndef HWY_EMULATE_FLOAT16
|
|
415
|
-
Raw raw;
|
|
416
|
-
|
|
417
|
-
float16_t() noexcept = default;
|
|
418
|
-
template <typename T>
|
|
419
|
-
constexpr float16_t(T arg) noexcept : raw(static_cast<Raw>(arg)) {}
|
|
420
|
-
float16_t& operator=(Raw arg) noexcept {
|
|
421
|
-
raw = arg;
|
|
422
|
-
return *this;
|
|
423
|
-
}
|
|
424
|
-
constexpr float16_t(const float16_t&) noexcept = default;
|
|
425
|
-
float16_t& operator=(const float16_t&) noexcept = default;
|
|
426
|
-
constexpr operator Raw() const noexcept { return raw; }
|
|
427
|
-
|
|
428
|
-
template <typename T>
|
|
429
|
-
float16_t& operator+=(T rhs) noexcept {
|
|
430
|
-
raw = static_cast<Raw>(raw + rhs);
|
|
431
|
-
return *this;
|
|
432
|
-
}
|
|
433
|
-
|
|
434
|
-
template <typename T>
|
|
435
|
-
float16_t& operator-=(T rhs) noexcept {
|
|
436
|
-
raw = static_cast<Raw>(raw - rhs);
|
|
437
|
-
return *this;
|
|
438
|
-
}
|
|
439
|
-
|
|
440
|
-
template <typename T>
|
|
441
|
-
float16_t& operator*=(T rhs) noexcept {
|
|
442
|
-
raw = static_cast<Raw>(raw * rhs);
|
|
443
|
-
return *this;
|
|
444
|
-
}
|
|
445
|
-
|
|
446
|
-
template <typename T>
|
|
447
|
-
float16_t& operator/=(T rhs) noexcept {
|
|
448
|
-
raw = static_cast<Raw>(raw / rhs);
|
|
449
|
-
return *this;
|
|
450
|
-
}
|
|
451
|
-
|
|
452
|
-
float16_t operator--() noexcept {
|
|
453
|
-
raw = static_cast<Raw>(raw - Raw{1});
|
|
454
|
-
return *this;
|
|
455
|
-
}
|
|
456
|
-
|
|
457
|
-
float16_t operator--(int) noexcept {
|
|
458
|
-
raw = static_cast<Raw>(raw - Raw{1});
|
|
459
|
-
return *this;
|
|
460
|
-
}
|
|
461
|
-
|
|
462
|
-
float16_t operator++() noexcept {
|
|
463
|
-
raw = static_cast<Raw>(raw + Raw{1});
|
|
464
|
-
return *this;
|
|
465
|
-
}
|
|
466
|
-
|
|
467
|
-
float16_t operator++(int) noexcept {
|
|
468
|
-
raw = static_cast<Raw>(raw + Raw{1});
|
|
469
|
-
return *this;
|
|
470
|
-
}
|
|
471
|
-
|
|
472
|
-
constexpr float16_t operator-() const noexcept {
|
|
473
|
-
return float16_t(static_cast<Raw>(-raw));
|
|
474
|
-
}
|
|
475
|
-
constexpr float16_t operator+() const noexcept { return *this; }
|
|
476
|
-
#endif // HWY_EMULATE_FLOAT16
|
|
477
|
-
};
|
|
478
|
-
|
|
479
|
-
#ifndef HWY_EMULATE_FLOAT16
|
|
480
|
-
constexpr inline bool operator==(float16_t lhs, float16_t rhs) noexcept {
|
|
481
|
-
return lhs.raw == rhs.raw;
|
|
482
|
-
}
|
|
483
|
-
constexpr inline bool operator!=(float16_t lhs, float16_t rhs) noexcept {
|
|
484
|
-
return lhs.raw != rhs.raw;
|
|
485
|
-
}
|
|
486
|
-
constexpr inline bool operator<(float16_t lhs, float16_t rhs) noexcept {
|
|
487
|
-
return lhs.raw < rhs.raw;
|
|
488
|
-
}
|
|
489
|
-
constexpr inline bool operator<=(float16_t lhs, float16_t rhs) noexcept {
|
|
490
|
-
return lhs.raw <= rhs.raw;
|
|
491
|
-
}
|
|
492
|
-
constexpr inline bool operator>(float16_t lhs, float16_t rhs) noexcept {
|
|
493
|
-
return lhs.raw > rhs.raw;
|
|
494
|
-
}
|
|
495
|
-
constexpr inline bool operator>=(float16_t lhs, float16_t rhs) noexcept {
|
|
496
|
-
return lhs.raw >= rhs.raw;
|
|
497
|
-
}
|
|
498
|
-
#endif // HWY_EMULATE_FLOAT16
|
|
499
|
-
|
|
500
|
-
struct bfloat16_t {
|
|
501
|
-
#if HWY_SVE_HAVE_BFLOAT16
|
|
502
|
-
using Raw = __bf16;
|
|
503
|
-
#elif __cplusplus >= 202100L && defined(__STDCPP_BFLOAT16_T__) // C++23
|
|
504
|
-
using Raw = std::bfloat16_t;
|
|
505
|
-
#else
|
|
506
|
-
#define HWY_EMULATE_BFLOAT16
|
|
507
|
-
using Raw = uint16_t;
|
|
508
|
-
Raw bits;
|
|
509
|
-
#endif
|
|
510
|
-
|
|
511
|
-
#ifndef HWY_EMULATE_BFLOAT16
|
|
512
|
-
Raw raw;
|
|
513
|
-
|
|
514
|
-
bfloat16_t() noexcept = default;
|
|
515
|
-
template <typename T>
|
|
516
|
-
constexpr bfloat16_t(T arg) noexcept : raw(static_cast<Raw>(arg)) {}
|
|
517
|
-
bfloat16_t& operator=(Raw arg) noexcept {
|
|
518
|
-
raw = arg;
|
|
519
|
-
return *this;
|
|
520
|
-
}
|
|
521
|
-
constexpr bfloat16_t(const bfloat16_t&) noexcept = default;
|
|
522
|
-
bfloat16_t& operator=(const bfloat16_t&) noexcept = default;
|
|
523
|
-
constexpr operator Raw() const noexcept { return raw; }
|
|
524
|
-
|
|
525
|
-
template <typename T>
|
|
526
|
-
bfloat16_t& operator+=(T rhs) noexcept {
|
|
527
|
-
raw = static_cast<Raw>(raw + rhs);
|
|
528
|
-
return *this;
|
|
529
|
-
}
|
|
530
|
-
|
|
531
|
-
template <typename T>
|
|
532
|
-
bfloat16_t& operator-=(T rhs) noexcept {
|
|
533
|
-
raw = static_cast<Raw>(raw - rhs);
|
|
534
|
-
return *this;
|
|
535
|
-
}
|
|
536
|
-
|
|
537
|
-
template <typename T>
|
|
538
|
-
bfloat16_t& operator*=(T rhs) noexcept {
|
|
539
|
-
raw = static_cast<Raw>(raw * rhs);
|
|
540
|
-
return *this;
|
|
541
|
-
}
|
|
542
|
-
|
|
543
|
-
template <typename T>
|
|
544
|
-
bfloat16_t& operator/=(T rhs) noexcept {
|
|
545
|
-
raw = static_cast<Raw>(raw / rhs);
|
|
546
|
-
return *this;
|
|
547
|
-
}
|
|
548
|
-
|
|
549
|
-
bfloat16_t operator--() noexcept {
|
|
550
|
-
raw = static_cast<Raw>(raw - Raw{1});
|
|
551
|
-
return *this;
|
|
552
|
-
}
|
|
553
|
-
|
|
554
|
-
bfloat16_t operator--(int) noexcept {
|
|
555
|
-
raw = static_cast<Raw>(raw - Raw{1});
|
|
556
|
-
return *this;
|
|
557
|
-
}
|
|
558
|
-
|
|
559
|
-
bfloat16_t operator++() noexcept {
|
|
560
|
-
raw = static_cast<Raw>(raw + Raw{1});
|
|
561
|
-
return *this;
|
|
562
|
-
}
|
|
563
|
-
|
|
564
|
-
bfloat16_t operator++(int) noexcept {
|
|
565
|
-
raw = static_cast<Raw>(raw + Raw{1});
|
|
566
|
-
return *this;
|
|
567
|
-
}
|
|
568
|
-
|
|
569
|
-
constexpr bfloat16_t operator-() const noexcept {
|
|
570
|
-
return bfloat16_t(static_cast<Raw>(-raw));
|
|
571
|
-
}
|
|
572
|
-
constexpr bfloat16_t operator+() const noexcept { return *this; }
|
|
573
|
-
#endif // HWY_EMULATE_BFLOAT16
|
|
574
|
-
};
|
|
575
|
-
|
|
576
|
-
#ifndef HWY_EMULATE_BFLOAT16
|
|
577
|
-
constexpr inline bool operator==(bfloat16_t lhs, bfloat16_t rhs) noexcept {
|
|
578
|
-
return lhs.raw == rhs.raw;
|
|
579
|
-
}
|
|
580
|
-
constexpr inline bool operator!=(bfloat16_t lhs, bfloat16_t rhs) noexcept {
|
|
581
|
-
return lhs.raw != rhs.raw;
|
|
582
|
-
}
|
|
583
|
-
constexpr inline bool operator<(bfloat16_t lhs, bfloat16_t rhs) noexcept {
|
|
584
|
-
return lhs.raw < rhs.raw;
|
|
585
|
-
}
|
|
586
|
-
constexpr inline bool operator<=(bfloat16_t lhs, bfloat16_t rhs) noexcept {
|
|
587
|
-
return lhs.raw <= rhs.raw;
|
|
588
|
-
}
|
|
589
|
-
constexpr inline bool operator>(bfloat16_t lhs, bfloat16_t rhs) noexcept {
|
|
590
|
-
return lhs.raw > rhs.raw;
|
|
591
|
-
}
|
|
592
|
-
constexpr inline bool operator>=(bfloat16_t lhs, bfloat16_t rhs) noexcept {
|
|
593
|
-
return lhs.raw >= rhs.raw;
|
|
594
|
-
}
|
|
595
|
-
#endif // HWY_EMULATE_BFLOAT16
|
|
596
|
-
|
|
597
|
-
#pragma pack(pop)
|
|
598
|
-
|
|
599
|
-
HWY_API float F32FromF16(float16_t f16) {
|
|
600
|
-
#ifdef HWY_EMULATE_FLOAT16
|
|
601
|
-
uint16_t bits16;
|
|
602
|
-
CopySameSize(&f16, &bits16);
|
|
603
|
-
const uint32_t sign = static_cast<uint32_t>(bits16 >> 15);
|
|
604
|
-
const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
|
|
605
|
-
const uint32_t mantissa = bits16 & 0x3FF;
|
|
606
|
-
|
|
607
|
-
// Subnormal or zero
|
|
608
|
-
if (biased_exp == 0) {
|
|
609
|
-
const float subnormal =
|
|
610
|
-
(1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024));
|
|
611
|
-
return sign ? -subnormal : subnormal;
|
|
612
|
-
}
|
|
613
|
-
|
|
614
|
-
// Normalized: convert the representation directly (faster than ldexp/tables).
|
|
615
|
-
const uint32_t biased_exp32 = biased_exp + (127 - 15);
|
|
616
|
-
const uint32_t mantissa32 = mantissa << (23 - 10);
|
|
617
|
-
const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
|
|
618
|
-
|
|
619
|
-
float result;
|
|
620
|
-
CopySameSize(&bits32, &result);
|
|
621
|
-
return result;
|
|
622
|
-
#else
|
|
623
|
-
return static_cast<float>(f16);
|
|
624
|
-
#endif
|
|
625
|
-
}
|
|
626
|
-
|
|
627
|
-
HWY_API float16_t F16FromF32(float f32) {
|
|
628
|
-
#ifdef HWY_EMULATE_FLOAT16
|
|
629
|
-
uint32_t bits32;
|
|
630
|
-
CopySameSize(&f32, &bits32);
|
|
631
|
-
const uint32_t sign = bits32 >> 31;
|
|
632
|
-
const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
|
|
633
|
-
const uint32_t mantissa32 = bits32 & 0x7FFFFF;
|
|
634
|
-
|
|
635
|
-
const int32_t exp = HWY_MIN(static_cast<int32_t>(biased_exp32) - 127, 15);
|
|
636
|
-
|
|
637
|
-
// Tiny or zero => zero.
|
|
638
|
-
float16_t out;
|
|
639
|
-
if (exp < -24) {
|
|
640
|
-
// restore original sign
|
|
641
|
-
const uint16_t bits = static_cast<uint16_t>(sign << 15);
|
|
642
|
-
CopySameSize(&bits, &out);
|
|
643
|
-
return out;
|
|
644
|
-
}
|
|
645
|
-
|
|
646
|
-
uint32_t biased_exp16, mantissa16;
|
|
647
|
-
|
|
648
|
-
// exp = [-24, -15] => subnormal
|
|
649
|
-
if (exp < -14) {
|
|
650
|
-
biased_exp16 = 0;
|
|
651
|
-
const uint32_t sub_exp = static_cast<uint32_t>(-14 - exp);
|
|
652
|
-
HWY_DASSERT(1 <= sub_exp && sub_exp < 11);
|
|
653
|
-
mantissa16 = static_cast<uint32_t>((1u << (10 - sub_exp)) +
|
|
654
|
-
(mantissa32 >> (13 + sub_exp)));
|
|
655
|
-
} else {
|
|
656
|
-
// exp = [-14, 15]
|
|
657
|
-
biased_exp16 = static_cast<uint32_t>(exp + 15);
|
|
658
|
-
HWY_DASSERT(1 <= biased_exp16 && biased_exp16 < 31);
|
|
659
|
-
mantissa16 = mantissa32 >> 13;
|
|
660
|
-
}
|
|
661
|
-
|
|
662
|
-
HWY_DASSERT(mantissa16 < 1024);
|
|
663
|
-
const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
|
|
664
|
-
HWY_DASSERT(bits16 < 0x10000);
|
|
665
|
-
const uint16_t narrowed = static_cast<uint16_t>(bits16); // big-endian safe
|
|
666
|
-
CopySameSize(&narrowed, &out);
|
|
667
|
-
return out;
|
|
668
|
-
#else
|
|
669
|
-
return float16_t(static_cast<float16_t::Raw>(f32));
|
|
670
|
-
#endif
|
|
671
|
-
}
|
|
672
|
-
|
|
673
|
-
HWY_API float F32FromBF16(bfloat16_t bf) {
|
|
674
|
-
uint16_t bits16;
|
|
675
|
-
CopyBytes<2>(&bf, &bits16);
|
|
676
|
-
uint32_t bits = bits16;
|
|
677
|
-
bits <<= 16;
|
|
678
|
-
float f;
|
|
679
|
-
CopySameSize(&bits, &f);
|
|
680
|
-
return f;
|
|
681
|
-
}
|
|
682
|
-
|
|
683
|
-
HWY_API float F32FromF16Mem(const void* ptr) {
|
|
684
|
-
float16_t f16;
|
|
685
|
-
CopyBytes<2>(ptr, &f16);
|
|
686
|
-
return F32FromF16(f16);
|
|
687
|
-
}
|
|
688
|
-
|
|
689
|
-
HWY_API float F32FromBF16Mem(const void* ptr) {
|
|
690
|
-
bfloat16_t bf;
|
|
691
|
-
CopyBytes<2>(ptr, &bf);
|
|
692
|
-
return F32FromBF16(bf);
|
|
693
|
-
}
|
|
694
|
-
|
|
695
|
-
HWY_API bfloat16_t BF16FromF32(float f) {
|
|
696
|
-
uint32_t bits;
|
|
697
|
-
CopySameSize(&f, &bits);
|
|
698
|
-
const uint16_t bits16 = static_cast<uint16_t>(bits >> 16);
|
|
699
|
-
bfloat16_t bf;
|
|
700
|
-
CopySameSize(&bits16, &bf);
|
|
701
|
-
return bf;
|
|
702
|
-
}
|
|
408
|
+
// hwy::float16_t and hwy::bfloat16_t are forward declared here to allow
|
|
409
|
+
// BitCastScalar to be implemented before the implementations of the
|
|
410
|
+
// hwy::float16_t and hwy::bfloat16_t types
|
|
411
|
+
struct float16_t;
|
|
412
|
+
struct bfloat16_t;
|
|
703
413
|
|
|
704
414
|
using float32_t = float;
|
|
705
415
|
using float64_t = double;
|
|
@@ -729,24 +439,6 @@ struct alignas(8) K32V32 {
|
|
|
729
439
|
|
|
730
440
|
#pragma pack(pop)
|
|
731
441
|
|
|
732
|
-
#ifdef HWY_EMULATE_FLOAT16
|
|
733
|
-
|
|
734
|
-
static inline HWY_MAYBE_UNUSED bool operator<(const float16_t& a,
|
|
735
|
-
const float16_t& b) {
|
|
736
|
-
return F32FromF16(a) < F32FromF16(b);
|
|
737
|
-
}
|
|
738
|
-
// Required for std::greater.
|
|
739
|
-
static inline HWY_MAYBE_UNUSED bool operator>(const float16_t& a,
|
|
740
|
-
const float16_t& b) {
|
|
741
|
-
return F32FromF16(a) > F32FromF16(b);
|
|
742
|
-
}
|
|
743
|
-
static inline HWY_MAYBE_UNUSED bool operator==(const float16_t& a,
|
|
744
|
-
const float16_t& b) {
|
|
745
|
-
return F32FromF16(a) == F32FromF16(b);
|
|
746
|
-
}
|
|
747
|
-
|
|
748
|
-
#endif // HWY_EMULATE_FLOAT16
|
|
749
|
-
|
|
750
442
|
static inline HWY_MAYBE_UNUSED bool operator<(const uint128_t& a,
|
|
751
443
|
const uint128_t& b) {
|
|
752
444
|
return (a.hi == b.hi) ? a.lo < b.lo : a.hi < b.hi;
|
|
@@ -817,6 +509,12 @@ HWY_API constexpr bool IsSame() {
|
|
|
817
509
|
return IsSameT<T, U>::value;
|
|
818
510
|
}
|
|
819
511
|
|
|
512
|
+
// Returns whether T matches either of U1 or U2
|
|
513
|
+
template <typename T, typename U1, typename U2>
|
|
514
|
+
HWY_API constexpr bool IsSameEither() {
|
|
515
|
+
return IsSameT<T, U1>::value || IsSameT<T, U2>::value;
|
|
516
|
+
}
|
|
517
|
+
|
|
820
518
|
template <bool Condition, typename Then, typename Else>
|
|
821
519
|
struct IfT {
|
|
822
520
|
using type = Then;
|
|
@@ -830,6 +528,88 @@ struct IfT<false, Then, Else> {
|
|
|
830
528
|
template <bool Condition, typename Then, typename Else>
|
|
831
529
|
using If = typename IfT<Condition, Then, Else>::type;
|
|
832
530
|
|
|
531
|
+
template <typename T>
|
|
532
|
+
struct IsConstT {
|
|
533
|
+
enum { value = 0 };
|
|
534
|
+
};
|
|
535
|
+
|
|
536
|
+
template <typename T>
|
|
537
|
+
struct IsConstT<const T> {
|
|
538
|
+
enum { value = 1 };
|
|
539
|
+
};
|
|
540
|
+
|
|
541
|
+
template <typename T>
|
|
542
|
+
HWY_API constexpr bool IsConst() {
|
|
543
|
+
return IsConstT<T>::value;
|
|
544
|
+
}
|
|
545
|
+
|
|
546
|
+
template <class T>
|
|
547
|
+
struct RemoveConstT {
|
|
548
|
+
using type = T;
|
|
549
|
+
};
|
|
550
|
+
template <class T>
|
|
551
|
+
struct RemoveConstT<const T> {
|
|
552
|
+
using type = T;
|
|
553
|
+
};
|
|
554
|
+
|
|
555
|
+
template <class T>
|
|
556
|
+
using RemoveConst = typename RemoveConstT<T>::type;
|
|
557
|
+
|
|
558
|
+
template <class T>
|
|
559
|
+
struct RemoveVolatileT {
|
|
560
|
+
using type = T;
|
|
561
|
+
};
|
|
562
|
+
template <class T>
|
|
563
|
+
struct RemoveVolatileT<volatile T> {
|
|
564
|
+
using type = T;
|
|
565
|
+
};
|
|
566
|
+
|
|
567
|
+
template <class T>
|
|
568
|
+
using RemoveVolatile = typename RemoveVolatileT<T>::type;
|
|
569
|
+
|
|
570
|
+
template <class T>
|
|
571
|
+
struct RemoveRefT {
|
|
572
|
+
using type = T;
|
|
573
|
+
};
|
|
574
|
+
template <class T>
|
|
575
|
+
struct RemoveRefT<T&> {
|
|
576
|
+
using type = T;
|
|
577
|
+
};
|
|
578
|
+
template <class T>
|
|
579
|
+
struct RemoveRefT<T&&> {
|
|
580
|
+
using type = T;
|
|
581
|
+
};
|
|
582
|
+
|
|
583
|
+
template <class T>
|
|
584
|
+
using RemoveRef = typename RemoveRefT<T>::type;
|
|
585
|
+
|
|
586
|
+
template <class T>
|
|
587
|
+
using RemoveCvRef = RemoveConst<RemoveVolatile<RemoveRef<T>>>;
|
|
588
|
+
|
|
589
|
+
template <class T>
|
|
590
|
+
struct RemovePtrT {
|
|
591
|
+
using type = T;
|
|
592
|
+
};
|
|
593
|
+
template <class T>
|
|
594
|
+
struct RemovePtrT<T*> {
|
|
595
|
+
using type = T;
|
|
596
|
+
};
|
|
597
|
+
template <class T>
|
|
598
|
+
struct RemovePtrT<const T*> {
|
|
599
|
+
using type = T;
|
|
600
|
+
};
|
|
601
|
+
template <class T>
|
|
602
|
+
struct RemovePtrT<volatile T*> {
|
|
603
|
+
using type = T;
|
|
604
|
+
};
|
|
605
|
+
template <class T>
|
|
606
|
+
struct RemovePtrT<const volatile T*> {
|
|
607
|
+
using type = T;
|
|
608
|
+
};
|
|
609
|
+
|
|
610
|
+
template <class T>
|
|
611
|
+
using RemovePtr = typename RemovePtrT<T>::type;
|
|
612
|
+
|
|
833
613
|
// Insert into template/function arguments to enable this overload only for
|
|
834
614
|
// vectors of exactly, at most (LE), or more than (GT) this many bytes.
|
|
835
615
|
//
|
|
@@ -846,10 +626,11 @@ using If = typename IfT<Condition, Then, Else>::type;
|
|
|
846
626
|
#define HWY_IF_LANES_LE(kN, lanes) hwy::EnableIf<(kN <= lanes)>* = nullptr
|
|
847
627
|
#define HWY_IF_LANES_GT(kN, lanes) hwy::EnableIf<(kN > lanes)>* = nullptr
|
|
848
628
|
|
|
849
|
-
#define HWY_IF_UNSIGNED(T) hwy::EnableIf<!IsSigned<T>()>* = nullptr
|
|
850
|
-
#define
|
|
851
|
-
|
|
852
|
-
|
|
629
|
+
#define HWY_IF_UNSIGNED(T) hwy::EnableIf<!hwy::IsSigned<T>()>* = nullptr
|
|
630
|
+
#define HWY_IF_NOT_UNSIGNED(T) hwy::EnableIf<hwy::IsSigned<T>()>* = nullptr
|
|
631
|
+
#define HWY_IF_SIGNED(T) \
|
|
632
|
+
hwy::EnableIf<hwy::IsSigned<T>() && !hwy::IsFloat<T>() && \
|
|
633
|
+
!hwy::IsSpecialFloat<T>()>* = nullptr
|
|
853
634
|
#define HWY_IF_FLOAT(T) hwy::EnableIf<hwy::IsFloat<T>()>* = nullptr
|
|
854
635
|
#define HWY_IF_NOT_FLOAT(T) hwy::EnableIf<!hwy::IsFloat<T>()>* = nullptr
|
|
855
636
|
#define HWY_IF_FLOAT3264(T) hwy::EnableIf<hwy::IsFloat3264<T>()>* = nullptr
|
|
@@ -862,6 +643,7 @@ using If = typename IfT<Condition, Then, Else>::type;
|
|
|
862
643
|
hwy::EnableIf<hwy::IsFloat<T>() || hwy::IsSpecialFloat<T>()>* = nullptr
|
|
863
644
|
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL(T) \
|
|
864
645
|
hwy::EnableIf<!hwy::IsFloat<T>() && !hwy::IsSpecialFloat<T>()>* = nullptr
|
|
646
|
+
#define HWY_IF_INTEGER(T) hwy::EnableIf<hwy::IsInteger<T>()>* = nullptr
|
|
865
647
|
|
|
866
648
|
#define HWY_IF_T_SIZE(T, bytes) hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
|
|
867
649
|
#define HWY_IF_NOT_T_SIZE(T, bytes) \
|
|
@@ -871,52 +653,1303 @@ using If = typename IfT<Condition, Then, Else>::type;
|
|
|
871
653
|
// bits explicitly (0x14) instead of attempting to 'negate' 0x102.
|
|
872
654
|
#define HWY_IF_T_SIZE_ONE_OF(T, bit_array) \
|
|
873
655
|
hwy::EnableIf<((size_t{1} << sizeof(T)) & (bit_array)) != 0>* = nullptr
|
|
656
|
+
#define HWY_IF_T_SIZE_LE(T, bytes) \
|
|
657
|
+
hwy::EnableIf<(sizeof(T) <= (bytes))>* = nullptr
|
|
658
|
+
#define HWY_IF_T_SIZE_GT(T, bytes) \
|
|
659
|
+
hwy::EnableIf<(sizeof(T) > (bytes))>* = nullptr
|
|
660
|
+
|
|
661
|
+
#define HWY_IF_SAME(T, expected) \
|
|
662
|
+
hwy::EnableIf<hwy::IsSame<hwy::RemoveCvRef<T>, expected>()>* = nullptr
|
|
663
|
+
#define HWY_IF_NOT_SAME(T, expected) \
|
|
664
|
+
hwy::EnableIf<!hwy::IsSame<hwy::RemoveCvRef<T>, expected>()>* = nullptr
|
|
665
|
+
|
|
666
|
+
// One of two expected types
|
|
667
|
+
#define HWY_IF_SAME2(T, expected1, expected2) \
|
|
668
|
+
hwy::EnableIf< \
|
|
669
|
+
hwy::IsSameEither<hwy::RemoveCvRef<T>, expected1, expected2>()>* = \
|
|
670
|
+
nullptr
|
|
671
|
+
|
|
672
|
+
#define HWY_IF_U8(T) HWY_IF_SAME(T, uint8_t)
|
|
673
|
+
#define HWY_IF_U16(T) HWY_IF_SAME(T, uint16_t)
|
|
674
|
+
#define HWY_IF_U32(T) HWY_IF_SAME(T, uint32_t)
|
|
675
|
+
#define HWY_IF_U64(T) HWY_IF_SAME(T, uint64_t)
|
|
676
|
+
|
|
677
|
+
#define HWY_IF_I8(T) HWY_IF_SAME(T, int8_t)
|
|
678
|
+
#define HWY_IF_I16(T) HWY_IF_SAME(T, int16_t)
|
|
679
|
+
#define HWY_IF_I32(T) HWY_IF_SAME(T, int32_t)
|
|
680
|
+
#define HWY_IF_I64(T) HWY_IF_SAME(T, int64_t)
|
|
681
|
+
|
|
682
|
+
#define HWY_IF_BF16(T) HWY_IF_SAME(T, hwy::bfloat16_t)
|
|
683
|
+
#define HWY_IF_NOT_BF16(T) HWY_IF_NOT_SAME(T, hwy::bfloat16_t)
|
|
684
|
+
|
|
685
|
+
#define HWY_IF_F16(T) HWY_IF_SAME(T, hwy::float16_t)
|
|
686
|
+
#define HWY_IF_NOT_F16(T) HWY_IF_NOT_SAME(T, hwy::float16_t)
|
|
687
|
+
|
|
688
|
+
#define HWY_IF_F32(T) HWY_IF_SAME(T, float)
|
|
689
|
+
#define HWY_IF_F64(T) HWY_IF_SAME(T, double)
|
|
690
|
+
|
|
691
|
+
// Use instead of HWY_IF_T_SIZE to avoid ambiguity with float16_t/float/double
|
|
692
|
+
// overloads.
|
|
693
|
+
#define HWY_IF_UI8(T) HWY_IF_SAME2(T, uint8_t, int8_t)
|
|
694
|
+
#define HWY_IF_UI16(T) HWY_IF_SAME2(T, uint16_t, int16_t)
|
|
695
|
+
#define HWY_IF_UI32(T) HWY_IF_SAME2(T, uint32_t, int32_t)
|
|
696
|
+
#define HWY_IF_UI64(T) HWY_IF_SAME2(T, uint64_t, int64_t)
|
|
697
|
+
|
|
698
|
+
#define HWY_IF_LANES_PER_BLOCK(T, N, LANES) \
|
|
699
|
+
hwy::EnableIf<HWY_MIN(sizeof(T) * N, 16) / sizeof(T) == (LANES)>* = nullptr
|
|
700
|
+
|
|
701
|
+
// Empty struct used as a size tag type.
|
|
702
|
+
template <size_t N>
|
|
703
|
+
struct SizeTag {};
|
|
704
|
+
|
|
705
|
+
template <class T>
|
|
706
|
+
class DeclValT {
|
|
707
|
+
private:
|
|
708
|
+
template <class U, class URef = U&&>
|
|
709
|
+
static URef TryAddRValRef(int);
|
|
710
|
+
template <class U, class Arg>
|
|
711
|
+
static U TryAddRValRef(Arg);
|
|
712
|
+
|
|
713
|
+
public:
|
|
714
|
+
using type = decltype(TryAddRValRef<T>(0));
|
|
715
|
+
enum { kDisableDeclValEvaluation = 1 };
|
|
716
|
+
};
|
|
717
|
+
|
|
718
|
+
// hwy::DeclVal<T>() can only be used in unevaluated contexts such as within an
|
|
719
|
+
// expression of a decltype specifier.
|
|
720
|
+
|
|
721
|
+
// hwy::DeclVal<T>() does not require that T have a public default constructor
|
|
722
|
+
template <class T>
|
|
723
|
+
HWY_API typename DeclValT<T>::type DeclVal() noexcept {
|
|
724
|
+
static_assert(!DeclValT<T>::kDisableDeclValEvaluation,
|
|
725
|
+
"DeclVal() cannot be used in an evaluated context");
|
|
726
|
+
}
|
|
727
|
+
|
|
728
|
+
template <class T>
|
|
729
|
+
struct IsArrayT {
|
|
730
|
+
enum { value = 0 };
|
|
731
|
+
};
|
|
732
|
+
|
|
733
|
+
template <class T>
|
|
734
|
+
struct IsArrayT<T[]> {
|
|
735
|
+
enum { value = 1 };
|
|
736
|
+
};
|
|
737
|
+
|
|
738
|
+
template <class T, size_t N>
|
|
739
|
+
struct IsArrayT<T[N]> {
|
|
740
|
+
enum { value = 1 };
|
|
741
|
+
};
|
|
742
|
+
|
|
743
|
+
template <class T>
|
|
744
|
+
static constexpr bool IsArray() {
|
|
745
|
+
return IsArrayT<T>::value;
|
|
746
|
+
}
|
|
747
|
+
|
|
748
|
+
#if HWY_COMPILER_MSVC
|
|
749
|
+
HWY_DIAGNOSTICS(push)
|
|
750
|
+
HWY_DIAGNOSTICS_OFF(disable : 4180, ignored "-Wignored-qualifiers")
|
|
751
|
+
#endif
|
|
752
|
+
|
|
753
|
+
template <class From, class To>
|
|
754
|
+
class IsConvertibleT {
|
|
755
|
+
private:
|
|
756
|
+
template <class T>
|
|
757
|
+
static hwy::SizeTag<1> TestFuncWithToArg(T);
|
|
758
|
+
|
|
759
|
+
template <class T, class U>
|
|
760
|
+
static decltype(IsConvertibleT<T, U>::template TestFuncWithToArg<U>(
|
|
761
|
+
DeclVal<T>()))
|
|
762
|
+
TryConvTest(int);
|
|
763
|
+
|
|
764
|
+
template <class T, class U, class Arg>
|
|
765
|
+
static hwy::SizeTag<0> TryConvTest(Arg);
|
|
766
|
+
|
|
767
|
+
public:
|
|
768
|
+
enum {
|
|
769
|
+
value = (IsSame<RemoveConst<RemoveVolatile<From>>, void>() &&
|
|
770
|
+
IsSame<RemoveConst<RemoveVolatile<To>>, void>()) ||
|
|
771
|
+
(!IsArray<To>() &&
|
|
772
|
+
(IsSame<To, decltype(DeclVal<To>())>() ||
|
|
773
|
+
!IsSame<const RemoveConst<To>, RemoveConst<To>>()) &&
|
|
774
|
+
IsSame<decltype(TryConvTest<From, To>(0)), hwy::SizeTag<1>>())
|
|
775
|
+
};
|
|
776
|
+
};
|
|
777
|
+
|
|
778
|
+
#if HWY_COMPILER_MSVC
|
|
779
|
+
HWY_DIAGNOSTICS(pop)
|
|
780
|
+
#endif
|
|
781
|
+
|
|
782
|
+
template <class From, class To>
|
|
783
|
+
HWY_API constexpr bool IsConvertible() {
|
|
784
|
+
return IsConvertibleT<From, To>::value;
|
|
785
|
+
}
|
|
786
|
+
|
|
787
|
+
template <class From, class To>
|
|
788
|
+
class IsStaticCastableT {
|
|
789
|
+
private:
|
|
790
|
+
template <class T, class U, class = decltype(static_cast<U>(DeclVal<T>()))>
|
|
791
|
+
static hwy::SizeTag<1> TryStaticCastTest(int);
|
|
792
|
+
|
|
793
|
+
template <class T, class U, class Arg>
|
|
794
|
+
static hwy::SizeTag<0> TryStaticCastTest(Arg);
|
|
795
|
+
|
|
796
|
+
public:
|
|
797
|
+
enum {
|
|
798
|
+
value = IsSame<decltype(TryStaticCastTest<From, To>(0)), hwy::SizeTag<1>>()
|
|
799
|
+
};
|
|
800
|
+
};
|
|
801
|
+
|
|
802
|
+
template <class From, class To>
|
|
803
|
+
static constexpr bool IsStaticCastable() {
|
|
804
|
+
return IsStaticCastableT<From, To>::value;
|
|
805
|
+
}
|
|
806
|
+
|
|
807
|
+
#define HWY_IF_CASTABLE(From, To) \
|
|
808
|
+
hwy::EnableIf<IsStaticCastable<From, To>()>* = nullptr
|
|
809
|
+
|
|
810
|
+
#define HWY_IF_OP_CASTABLE(op, T, Native) \
|
|
811
|
+
HWY_IF_CASTABLE(decltype(DeclVal<Native>() op DeclVal<T>()), Native)
|
|
812
|
+
|
|
813
|
+
template <class T, class From>
|
|
814
|
+
class IsAssignableT {
|
|
815
|
+
private:
|
|
816
|
+
template <class T1, class T2, class = decltype(DeclVal<T1>() = DeclVal<T2>())>
|
|
817
|
+
static hwy::SizeTag<1> TryAssignTest(int);
|
|
818
|
+
|
|
819
|
+
template <class T1, class T2, class Arg>
|
|
820
|
+
static hwy::SizeTag<0> TryAssignTest(Arg);
|
|
821
|
+
|
|
822
|
+
public:
|
|
823
|
+
enum {
|
|
824
|
+
value = IsSame<decltype(TryAssignTest<T, From>(0)), hwy::SizeTag<1>>()
|
|
825
|
+
};
|
|
826
|
+
};
|
|
827
|
+
|
|
828
|
+
template <class T, class From>
|
|
829
|
+
static constexpr bool IsAssignable() {
|
|
830
|
+
return IsAssignableT<T, From>::value;
|
|
831
|
+
}
|
|
832
|
+
|
|
833
|
+
#define HWY_IF_ASSIGNABLE(T, From) \
|
|
834
|
+
hwy::EnableIf<IsAssignable<T, From>()>* = nullptr
|
|
835
|
+
|
|
836
|
+
// ----------------------------------------------------------------------------
|
|
837
|
+
// IsSpecialFloat
|
|
838
|
+
|
|
839
|
+
// These types are often special-cased and not supported in all ops.
|
|
840
|
+
template <typename T>
|
|
841
|
+
HWY_API constexpr bool IsSpecialFloat() {
|
|
842
|
+
return IsSameEither<RemoveCvRef<T>, hwy::float16_t, hwy::bfloat16_t>();
|
|
843
|
+
}
|
|
844
|
+
|
|
845
|
+
// -----------------------------------------------------------------------------
|
|
846
|
+
// IsIntegerLaneType and IsInteger
|
|
847
|
+
|
|
848
|
+
template <class T>
|
|
849
|
+
HWY_API constexpr bool IsIntegerLaneType() {
|
|
850
|
+
return false;
|
|
851
|
+
}
|
|
852
|
+
template <>
|
|
853
|
+
HWY_INLINE constexpr bool IsIntegerLaneType<int8_t>() {
|
|
854
|
+
return true;
|
|
855
|
+
}
|
|
856
|
+
template <>
|
|
857
|
+
HWY_INLINE constexpr bool IsIntegerLaneType<uint8_t>() {
|
|
858
|
+
return true;
|
|
859
|
+
}
|
|
860
|
+
template <>
|
|
861
|
+
HWY_INLINE constexpr bool IsIntegerLaneType<int16_t>() {
|
|
862
|
+
return true;
|
|
863
|
+
}
|
|
864
|
+
template <>
|
|
865
|
+
HWY_INLINE constexpr bool IsIntegerLaneType<uint16_t>() {
|
|
866
|
+
return true;
|
|
867
|
+
}
|
|
868
|
+
template <>
|
|
869
|
+
HWY_INLINE constexpr bool IsIntegerLaneType<int32_t>() {
|
|
870
|
+
return true;
|
|
871
|
+
}
|
|
872
|
+
template <>
|
|
873
|
+
HWY_INLINE constexpr bool IsIntegerLaneType<uint32_t>() {
|
|
874
|
+
return true;
|
|
875
|
+
}
|
|
876
|
+
template <>
|
|
877
|
+
HWY_INLINE constexpr bool IsIntegerLaneType<int64_t>() {
|
|
878
|
+
return true;
|
|
879
|
+
}
|
|
880
|
+
template <>
|
|
881
|
+
HWY_INLINE constexpr bool IsIntegerLaneType<uint64_t>() {
|
|
882
|
+
return true;
|
|
883
|
+
}
|
|
884
|
+
|
|
885
|
+
template <class T>
|
|
886
|
+
HWY_API constexpr bool IsInteger() {
|
|
887
|
+
// NOTE: Do not add a IsInteger<wchar_t>() specialization below as it is
|
|
888
|
+
// possible for IsSame<wchar_t, uint16_t>() to be true when compiled with MSVC
|
|
889
|
+
// with the /Zc:wchar_t- option.
|
|
890
|
+
return IsIntegerLaneType<T>() || IsSame<RemoveCvRef<T>, wchar_t>() ||
|
|
891
|
+
IsSameEither<RemoveCvRef<T>, size_t, ptrdiff_t>() ||
|
|
892
|
+
IsSameEither<RemoveCvRef<T>, intptr_t, uintptr_t>();
|
|
893
|
+
}
|
|
894
|
+
template <>
|
|
895
|
+
HWY_INLINE constexpr bool IsInteger<bool>() {
|
|
896
|
+
return true;
|
|
897
|
+
}
|
|
898
|
+
template <>
|
|
899
|
+
HWY_INLINE constexpr bool IsInteger<char>() {
|
|
900
|
+
return true;
|
|
901
|
+
}
|
|
902
|
+
template <>
|
|
903
|
+
HWY_INLINE constexpr bool IsInteger<signed char>() {
|
|
904
|
+
return true;
|
|
905
|
+
}
|
|
906
|
+
template <>
|
|
907
|
+
HWY_INLINE constexpr bool IsInteger<unsigned char>() {
|
|
908
|
+
return true;
|
|
909
|
+
}
|
|
910
|
+
template <>
|
|
911
|
+
HWY_INLINE constexpr bool IsInteger<short>() { // NOLINT
|
|
912
|
+
return true;
|
|
913
|
+
}
|
|
914
|
+
template <>
|
|
915
|
+
HWY_INLINE constexpr bool IsInteger<unsigned short>() { // NOLINT
|
|
916
|
+
return true;
|
|
917
|
+
}
|
|
918
|
+
template <>
|
|
919
|
+
HWY_INLINE constexpr bool IsInteger<int>() {
|
|
920
|
+
return true;
|
|
921
|
+
}
|
|
922
|
+
template <>
|
|
923
|
+
HWY_INLINE constexpr bool IsInteger<unsigned>() {
|
|
924
|
+
return true;
|
|
925
|
+
}
|
|
926
|
+
template <>
|
|
927
|
+
HWY_INLINE constexpr bool IsInteger<long>() { // NOLINT
|
|
928
|
+
return true;
|
|
929
|
+
}
|
|
930
|
+
template <>
|
|
931
|
+
HWY_INLINE constexpr bool IsInteger<unsigned long>() { // NOLINT
|
|
932
|
+
return true;
|
|
933
|
+
}
|
|
934
|
+
template <>
|
|
935
|
+
HWY_INLINE constexpr bool IsInteger<long long>() { // NOLINT
|
|
936
|
+
return true;
|
|
937
|
+
}
|
|
938
|
+
template <>
|
|
939
|
+
HWY_INLINE constexpr bool IsInteger<unsigned long long>() { // NOLINT
|
|
940
|
+
return true;
|
|
941
|
+
}
|
|
942
|
+
#if defined(__cpp_char8_t) && __cpp_char8_t >= 201811L
|
|
943
|
+
template <>
|
|
944
|
+
HWY_INLINE constexpr bool IsInteger<char8_t>() {
|
|
945
|
+
return true;
|
|
946
|
+
}
|
|
947
|
+
#endif
|
|
948
|
+
template <>
|
|
949
|
+
HWY_INLINE constexpr bool IsInteger<char16_t>() {
|
|
950
|
+
return true;
|
|
951
|
+
}
|
|
952
|
+
template <>
|
|
953
|
+
HWY_INLINE constexpr bool IsInteger<char32_t>() {
|
|
954
|
+
return true;
|
|
955
|
+
}
|
|
956
|
+
|
|
957
|
+
// -----------------------------------------------------------------------------
|
|
958
|
+
// BitCastScalar
|
|
959
|
+
|
|
960
|
+
#if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
|
|
961
|
+
#define HWY_BITCASTSCALAR_CONSTEXPR constexpr
|
|
962
|
+
#else
|
|
963
|
+
#define HWY_BITCASTSCALAR_CONSTEXPR
|
|
964
|
+
#endif
|
|
965
|
+
|
|
966
|
+
#if __cpp_constexpr >= 201304L
|
|
967
|
+
#define HWY_BITCASTSCALAR_CXX14_CONSTEXPR HWY_BITCASTSCALAR_CONSTEXPR
|
|
968
|
+
#else
|
|
969
|
+
#define HWY_BITCASTSCALAR_CXX14_CONSTEXPR
|
|
970
|
+
#endif
|
|
971
|
+
|
|
972
|
+
#if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
|
|
973
|
+
namespace detail {
|
|
974
|
+
|
|
975
|
+
template <class From>
|
|
976
|
+
struct BitCastScalarSrcCastHelper {
|
|
977
|
+
static HWY_INLINE constexpr const From& CastSrcValRef(const From& val) {
|
|
978
|
+
return val;
|
|
979
|
+
}
|
|
980
|
+
};
|
|
981
|
+
|
|
982
|
+
#if HWY_COMPILER_CLANG >= 900 && HWY_COMPILER_CLANG < 1000
|
|
983
|
+
// Workaround for Clang 9 constexpr __builtin_bit_cast bug
|
|
984
|
+
template <class To, class From,
|
|
985
|
+
hwy::EnableIf<hwy::IsInteger<RemoveCvRef<To>>() &&
|
|
986
|
+
hwy::IsInteger<RemoveCvRef<From>>()>* = nullptr>
|
|
987
|
+
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR To
|
|
988
|
+
BuiltinBitCastScalar(const From& val) {
|
|
989
|
+
static_assert(sizeof(To) == sizeof(From),
|
|
990
|
+
"sizeof(To) == sizeof(From) must be true");
|
|
991
|
+
return static_cast<To>(val);
|
|
992
|
+
}
|
|
993
|
+
|
|
994
|
+
template <class To, class From,
|
|
995
|
+
hwy::EnableIf<!(hwy::IsInteger<RemoveCvRef<To>>() &&
|
|
996
|
+
hwy::IsInteger<RemoveCvRef<From>>())>* = nullptr>
|
|
997
|
+
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR To
|
|
998
|
+
BuiltinBitCastScalar(const From& val) {
|
|
999
|
+
return __builtin_bit_cast(To, val);
|
|
1000
|
+
}
|
|
1001
|
+
#endif // HWY_COMPILER_CLANG >= 900 && HWY_COMPILER_CLANG < 1000
|
|
1002
|
+
|
|
1003
|
+
} // namespace detail
|
|
1004
|
+
|
|
1005
|
+
template <class To, class From, HWY_IF_NOT_SPECIAL_FLOAT(To)>
|
|
1006
|
+
HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From& val) {
|
|
1007
|
+
// If From is hwy::float16_t or hwy::bfloat16_t, first cast val to either
|
|
1008
|
+
// const typename From::Native& or const uint16_t& using
|
|
1009
|
+
// detail::BitCastScalarSrcCastHelper<RemoveCvRef<From>>::CastSrcValRef to
|
|
1010
|
+
// allow BitCastScalar from hwy::float16_t or hwy::bfloat16_t to be constexpr
|
|
1011
|
+
// if To is not a pointer type, union type, or a struct/class containing a
|
|
1012
|
+
// pointer, union, or reference subobject
|
|
1013
|
+
#if HWY_COMPILER_CLANG >= 900 && HWY_COMPILER_CLANG < 1000
|
|
1014
|
+
return detail::BuiltinBitCastScalar<To>(
|
|
1015
|
+
detail::BitCastScalarSrcCastHelper<RemoveCvRef<From>>::CastSrcValRef(
|
|
1016
|
+
val));
|
|
1017
|
+
#else
|
|
1018
|
+
return __builtin_bit_cast(
|
|
1019
|
+
To, detail::BitCastScalarSrcCastHelper<RemoveCvRef<From>>::CastSrcValRef(
|
|
1020
|
+
val));
|
|
1021
|
+
#endif
|
|
1022
|
+
}
|
|
1023
|
+
template <class To, class From, HWY_IF_SPECIAL_FLOAT(To)>
|
|
1024
|
+
HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From& val) {
|
|
1025
|
+
// If To is hwy::float16_t or hwy::bfloat16_t, first do a BitCastScalar of val
|
|
1026
|
+
// to uint16_t, and then bit cast the uint16_t value to To using To::FromBits
|
|
1027
|
+
// as hwy::float16_t::FromBits and hwy::bfloat16_t::FromBits are guaranteed to
|
|
1028
|
+
// be constexpr if the __builtin_bit_cast intrinsic is available.
|
|
1029
|
+
return To::FromBits(BitCastScalar<uint16_t>(val));
|
|
1030
|
+
}
|
|
1031
|
+
#else
|
|
1032
|
+
template <class To, class From>
|
|
1033
|
+
HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From& val) {
|
|
1034
|
+
To result;
|
|
1035
|
+
CopySameSize(&val, &result);
|
|
1036
|
+
return result;
|
|
1037
|
+
}
|
|
1038
|
+
#endif
|
|
1039
|
+
|
|
1040
|
+
//------------------------------------------------------------------------------
|
|
1041
|
+
// F16 lane type
|
|
1042
|
+
|
|
1043
|
+
#pragma pack(push, 1)
|
|
1044
|
+
|
|
1045
|
+
// Compiler supports __fp16 and load/store/conversion NEON intrinsics, which are
|
|
1046
|
+
// included in Armv8 and VFPv4 (except with MSVC). On Armv7 Clang requires
|
|
1047
|
+
// __ARM_FP & 2 whereas Armv7 GCC requires -mfp16-format=ieee.
|
|
1048
|
+
#if (HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC) || \
|
|
1049
|
+
(HWY_COMPILER_CLANG && defined(__ARM_FP) && (__ARM_FP & 2)) || \
|
|
1050
|
+
(HWY_COMPILER_GCC_ACTUAL && defined(__ARM_FP16_FORMAT_IEEE))
|
|
1051
|
+
#define HWY_NEON_HAVE_F16C 1
|
|
1052
|
+
#else
|
|
1053
|
+
#define HWY_NEON_HAVE_F16C 0
|
|
1054
|
+
#endif
|
|
1055
|
+
|
|
1056
|
+
// RVV with f16 extension supports _Float16 and f16 vector ops. If set, implies
|
|
1057
|
+
// HWY_HAVE_FLOAT16.
|
|
1058
|
+
#if HWY_ARCH_RISCV && defined(__riscv_zvfh) && HWY_COMPILER_CLANG >= 1600
|
|
1059
|
+
#define HWY_RVV_HAVE_F16_VEC 1
|
|
1060
|
+
#else
|
|
1061
|
+
#define HWY_RVV_HAVE_F16_VEC 0
|
|
1062
|
+
#endif
|
|
1063
|
+
|
|
1064
|
+
// x86 compiler supports _Float16, not necessarily with operators.
|
|
1065
|
+
// Avoid clang-cl because it lacks __extendhfsf2.
|
|
1066
|
+
#if HWY_ARCH_X86 && defined(__SSE2__) && defined(__FLT16_MAX__) && \
|
|
1067
|
+
((HWY_COMPILER_CLANG >= 1500 && !HWY_COMPILER_CLANGCL) || \
|
|
1068
|
+
HWY_COMPILER_GCC_ACTUAL >= 1200)
|
|
1069
|
+
#define HWY_SSE2_HAVE_F16_TYPE 1
|
|
1070
|
+
#else
|
|
1071
|
+
#define HWY_SSE2_HAVE_F16_TYPE 0
|
|
1072
|
+
#endif
|
|
1073
|
+
|
|
1074
|
+
#ifndef HWY_HAVE_SCALAR_F16_TYPE
|
|
1075
|
+
// Compiler supports _Float16, not necessarily with operators.
|
|
1076
|
+
#if HWY_NEON_HAVE_F16C || HWY_RVV_HAVE_F16_VEC || HWY_SSE2_HAVE_F16_TYPE
|
|
1077
|
+
#define HWY_HAVE_SCALAR_F16_TYPE 1
|
|
1078
|
+
#else
|
|
1079
|
+
#define HWY_HAVE_SCALAR_F16_TYPE 0
|
|
1080
|
+
#endif
|
|
1081
|
+
#endif // HWY_HAVE_SCALAR_F16_TYPE
|
|
1082
|
+
|
|
1083
|
+
#ifndef HWY_HAVE_SCALAR_F16_OPERATORS
|
|
1084
|
+
// Recent enough compiler also has operators.
|
|
1085
|
+
#if HWY_HAVE_SCALAR_F16_TYPE && \
|
|
1086
|
+
(HWY_COMPILER_CLANG >= 1800 || HWY_COMPILER_GCC_ACTUAL >= 1200 || \
|
|
1087
|
+
(HWY_COMPILER_CLANG >= 1500 && !HWY_COMPILER_CLANGCL && \
|
|
1088
|
+
!defined(_WIN32)) || \
|
|
1089
|
+
(HWY_ARCH_ARM && \
|
|
1090
|
+
(HWY_COMPILER_CLANG >= 900 || HWY_COMPILER_GCC_ACTUAL >= 800)))
|
|
1091
|
+
#define HWY_HAVE_SCALAR_F16_OPERATORS 1
|
|
1092
|
+
#else
|
|
1093
|
+
#define HWY_HAVE_SCALAR_F16_OPERATORS 0
|
|
1094
|
+
#endif
|
|
1095
|
+
#endif // HWY_HAVE_SCALAR_F16_OPERATORS
|
|
1096
|
+
|
|
1097
|
+
namespace detail {
|
|
1098
|
+
|
|
1099
|
+
template <class T, class TVal = RemoveCvRef<T>, bool = IsSpecialFloat<TVal>()>
|
|
1100
|
+
struct SpecialFloatUnwrapArithOpOperandT {};
|
|
1101
|
+
|
|
1102
|
+
template <class T, class TVal>
|
|
1103
|
+
struct SpecialFloatUnwrapArithOpOperandT<T, TVal, false> {
|
|
1104
|
+
using type = T;
|
|
1105
|
+
};
|
|
1106
|
+
|
|
1107
|
+
template <class T>
|
|
1108
|
+
using SpecialFloatUnwrapArithOpOperand =
|
|
1109
|
+
typename SpecialFloatUnwrapArithOpOperandT<T>::type;
|
|
1110
|
+
|
|
1111
|
+
template <class T, class TVal = RemoveCvRef<T>>
|
|
1112
|
+
struct NativeSpecialFloatToWrapperT {
|
|
1113
|
+
using type = T;
|
|
1114
|
+
};
|
|
1115
|
+
|
|
1116
|
+
template <class T>
|
|
1117
|
+
using NativeSpecialFloatToWrapper =
|
|
1118
|
+
typename NativeSpecialFloatToWrapperT<T>::type;
|
|
1119
|
+
|
|
1120
|
+
} // namespace detail
|
|
1121
|
+
|
|
1122
|
+
// Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
|
|
1123
|
+
// by concatenating base type and bits. We use a wrapper class instead of a
|
|
1124
|
+
// typedef to the native type to ensure that the same symbols, e.g. for VQSort,
|
|
1125
|
+
// are generated regardless of F16 support; see #1684.
|
|
1126
|
+
struct alignas(2) float16_t {
|
|
1127
|
+
#if HWY_HAVE_SCALAR_F16_TYPE
|
|
1128
|
+
#if HWY_RVV_HAVE_F16_VEC || HWY_SSE2_HAVE_F16_TYPE
|
|
1129
|
+
using Native = _Float16;
|
|
1130
|
+
#elif HWY_NEON_HAVE_F16C
|
|
1131
|
+
using Native = __fp16;
|
|
1132
|
+
#else
|
|
1133
|
+
#error "Logic error: condition should be 'all but NEON_HAVE_F16C'"
|
|
1134
|
+
#endif
|
|
1135
|
+
#endif // HWY_HAVE_SCALAR_F16_TYPE
|
|
1136
|
+
|
|
1137
|
+
union {
|
|
1138
|
+
#if HWY_HAVE_SCALAR_F16_TYPE
|
|
1139
|
+
// Accessed via NativeLaneType, and used directly if
|
|
1140
|
+
// HWY_HAVE_SCALAR_F16_OPERATORS.
|
|
1141
|
+
Native native;
|
|
1142
|
+
#endif
|
|
1143
|
+
// Only accessed via NativeLaneType or U16LaneType.
|
|
1144
|
+
uint16_t bits;
|
|
1145
|
+
};
|
|
1146
|
+
|
|
1147
|
+
// Default init and copying.
|
|
1148
|
+
float16_t() noexcept = default;
|
|
1149
|
+
constexpr float16_t(const float16_t&) noexcept = default;
|
|
1150
|
+
constexpr float16_t(float16_t&&) noexcept = default;
|
|
1151
|
+
float16_t& operator=(const float16_t&) noexcept = default;
|
|
1152
|
+
float16_t& operator=(float16_t&&) noexcept = default;
|
|
1153
|
+
|
|
1154
|
+
#if HWY_HAVE_SCALAR_F16_TYPE
|
|
1155
|
+
// NEON vget/set_lane intrinsics and SVE `svaddv` could use explicit
|
|
1156
|
+
// float16_t(intrinsic()), but user code expects implicit conversions.
|
|
1157
|
+
constexpr float16_t(Native arg) noexcept : native(arg) {}
|
|
1158
|
+
constexpr operator Native() const noexcept { return native; }
|
|
1159
|
+
#endif
|
|
1160
|
+
|
|
1161
|
+
#if HWY_HAVE_SCALAR_F16_TYPE
|
|
1162
|
+
static HWY_BITCASTSCALAR_CONSTEXPR float16_t FromBits(uint16_t bits) {
|
|
1163
|
+
return float16_t(BitCastScalar<Native>(bits));
|
|
1164
|
+
}
|
|
1165
|
+
#else
|
|
1166
|
+
|
|
1167
|
+
private:
|
|
1168
|
+
struct F16FromU16BitsTag {};
|
|
1169
|
+
constexpr float16_t(F16FromU16BitsTag /*tag*/, uint16_t u16_bits)
|
|
1170
|
+
: bits(u16_bits) {}
|
|
1171
|
+
|
|
1172
|
+
public:
|
|
1173
|
+
static constexpr float16_t FromBits(uint16_t bits) {
|
|
1174
|
+
return float16_t(F16FromU16BitsTag(), bits);
|
|
1175
|
+
}
|
|
1176
|
+
#endif
|
|
1177
|
+
|
|
1178
|
+
// When backed by a native type, ensure the wrapper behaves like the native
|
|
1179
|
+
// type by forwarding all operators. Unfortunately it seems difficult to reuse
|
|
1180
|
+
// this code in a base class, so we repeat it in float16_t.
|
|
1181
|
+
#if HWY_HAVE_SCALAR_F16_OPERATORS || HWY_IDE
|
|
1182
|
+
template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, float16_t>() &&
|
|
1183
|
+
IsConvertible<T, Native>()>* = nullptr>
|
|
1184
|
+
constexpr float16_t(T&& arg) noexcept
|
|
1185
|
+
: native(static_cast<Native>(static_cast<T&&>(arg))) {}
|
|
1186
|
+
|
|
1187
|
+
template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, float16_t>() &&
|
|
1188
|
+
!IsConvertible<T, Native>() &&
|
|
1189
|
+
IsStaticCastable<T, Native>()>* = nullptr>
|
|
1190
|
+
explicit constexpr float16_t(T&& arg) noexcept
|
|
1191
|
+
: native(static_cast<Native>(static_cast<T&&>(arg))) {}
|
|
1192
|
+
|
|
1193
|
+
// pre-decrement operator (--x)
|
|
1194
|
+
HWY_CXX14_CONSTEXPR float16_t& operator--() noexcept {
|
|
1195
|
+
native = static_cast<Native>(native - Native{1});
|
|
1196
|
+
return *this;
|
|
1197
|
+
}
|
|
1198
|
+
|
|
1199
|
+
// post-decrement operator (x--)
|
|
1200
|
+
HWY_CXX14_CONSTEXPR float16_t operator--(int) noexcept {
|
|
1201
|
+
float16_t result = *this;
|
|
1202
|
+
native = static_cast<Native>(native - Native{1});
|
|
1203
|
+
return result;
|
|
1204
|
+
}
|
|
1205
|
+
|
|
1206
|
+
// pre-increment operator (++x)
|
|
1207
|
+
HWY_CXX14_CONSTEXPR float16_t& operator++() noexcept {
|
|
1208
|
+
native = static_cast<Native>(native + Native{1});
|
|
1209
|
+
return *this;
|
|
1210
|
+
}
|
|
1211
|
+
|
|
1212
|
+
// post-increment operator (x++)
|
|
1213
|
+
HWY_CXX14_CONSTEXPR float16_t operator++(int) noexcept {
|
|
1214
|
+
float16_t result = *this;
|
|
1215
|
+
native = static_cast<Native>(native + Native{1});
|
|
1216
|
+
return result;
|
|
1217
|
+
}
|
|
1218
|
+
|
|
1219
|
+
constexpr float16_t operator-() const noexcept {
|
|
1220
|
+
return float16_t(static_cast<Native>(-native));
|
|
1221
|
+
}
|
|
1222
|
+
constexpr float16_t operator+() const noexcept { return *this; }
|
|
1223
|
+
|
|
1224
|
+
// Reduce clutter by generating `operator+` and `operator+=` etc. Note that
|
|
1225
|
+
// we cannot token-paste `operator` and `+`, so pass it in as `op_func`.
|
|
1226
|
+
#define HWY_FLOAT16_BINARY_OP(op, op_func, assign_func) \
|
|
1227
|
+
constexpr float16_t op_func(const float16_t& rhs) const noexcept { \
|
|
1228
|
+
return float16_t(static_cast<Native>(native op rhs.native)); \
|
|
1229
|
+
} \
|
|
1230
|
+
template <typename T, HWY_IF_NOT_F16(T), \
|
|
1231
|
+
typename UnwrappedT = \
|
|
1232
|
+
detail::SpecialFloatUnwrapArithOpOperand<const T&>, \
|
|
1233
|
+
typename RawResultT = \
|
|
1234
|
+
decltype(DeclVal<Native>() op DeclVal<UnwrappedT>()), \
|
|
1235
|
+
typename ResultT = \
|
|
1236
|
+
detail::NativeSpecialFloatToWrapper<RawResultT>, \
|
|
1237
|
+
HWY_IF_CASTABLE(RawResultT, ResultT)> \
|
|
1238
|
+
constexpr ResultT op_func(const T& rhs) const noexcept(noexcept( \
|
|
1239
|
+
static_cast<ResultT>(DeclVal<Native>() op DeclVal<UnwrappedT>()))) { \
|
|
1240
|
+
return static_cast<ResultT>(native op static_cast<UnwrappedT>(rhs)); \
|
|
1241
|
+
} \
|
|
1242
|
+
HWY_CXX14_CONSTEXPR hwy::float16_t& assign_func( \
|
|
1243
|
+
const hwy::float16_t& rhs) noexcept { \
|
|
1244
|
+
native = static_cast<Native>(native op rhs.native); \
|
|
1245
|
+
return *this; \
|
|
1246
|
+
} \
|
|
1247
|
+
template <typename T, HWY_IF_NOT_F16(T), \
|
|
1248
|
+
HWY_IF_OP_CASTABLE(op, const T&, Native), \
|
|
1249
|
+
HWY_IF_ASSIGNABLE( \
|
|
1250
|
+
Native, decltype(DeclVal<Native>() op DeclVal<const T&>()))> \
|
|
1251
|
+
HWY_CXX14_CONSTEXPR hwy::float16_t& assign_func(const T& rhs) noexcept( \
|
|
1252
|
+
noexcept( \
|
|
1253
|
+
static_cast<Native>(DeclVal<Native>() op DeclVal<const T&>()))) { \
|
|
1254
|
+
native = static_cast<Native>(native op rhs); \
|
|
1255
|
+
return *this; \
|
|
1256
|
+
}
|
|
1257
|
+
|
|
1258
|
+
HWY_FLOAT16_BINARY_OP(+, operator+, operator+=)
|
|
1259
|
+
HWY_FLOAT16_BINARY_OP(-, operator-, operator-=)
|
|
1260
|
+
HWY_FLOAT16_BINARY_OP(*, operator*, operator*=)
|
|
1261
|
+
HWY_FLOAT16_BINARY_OP(/, operator/, operator/=)
|
|
1262
|
+
#undef HWY_FLOAT16_BINARY_OP
|
|
1263
|
+
|
|
1264
|
+
#endif // HWY_HAVE_SCALAR_F16_OPERATORS
|
|
1265
|
+
};
|
|
1266
|
+
static_assert(sizeof(hwy::float16_t) == 2, "Wrong size of float16_t");
|
|
1267
|
+
|
|
1268
|
+
#if HWY_HAVE_SCALAR_F16_TYPE
|
|
1269
|
+
namespace detail {
|
|
1270
|
+
|
|
1271
|
+
#if HWY_HAVE_SCALAR_F16_OPERATORS
|
|
1272
|
+
template <class T>
|
|
1273
|
+
struct SpecialFloatUnwrapArithOpOperandT<T, hwy::float16_t, true> {
|
|
1274
|
+
using type = hwy::float16_t::Native;
|
|
1275
|
+
};
|
|
1276
|
+
#endif
|
|
1277
|
+
|
|
1278
|
+
template <class T>
|
|
1279
|
+
struct NativeSpecialFloatToWrapperT<T, hwy::float16_t::Native> {
|
|
1280
|
+
using type = hwy::float16_t;
|
|
1281
|
+
};
|
|
1282
|
+
|
|
1283
|
+
} // namespace detail
|
|
1284
|
+
#endif // HWY_HAVE_SCALAR_F16_TYPE
|
|
1285
|
+
|
|
1286
|
+
#if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
|
|
1287
|
+
namespace detail {
|
|
1288
|
+
|
|
1289
|
+
template <>
|
|
1290
|
+
struct BitCastScalarSrcCastHelper<hwy::float16_t> {
|
|
1291
|
+
#if HWY_HAVE_SCALAR_F16_TYPE
|
|
1292
|
+
static HWY_INLINE constexpr const hwy::float16_t::Native& CastSrcValRef(
|
|
1293
|
+
const hwy::float16_t& val) {
|
|
1294
|
+
return val.native;
|
|
1295
|
+
}
|
|
1296
|
+
#else
|
|
1297
|
+
static HWY_INLINE constexpr const uint16_t& CastSrcValRef(
|
|
1298
|
+
const hwy::float16_t& val) {
|
|
1299
|
+
return val.bits;
|
|
1300
|
+
}
|
|
1301
|
+
#endif
|
|
1302
|
+
};
|
|
1303
|
+
|
|
1304
|
+
} // namespace detail
|
|
1305
|
+
#endif // HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
|
|
1306
|
+
|
|
1307
|
+
#if HWY_HAVE_SCALAR_F16_OPERATORS
|
|
1308
|
+
#define HWY_F16_CONSTEXPR constexpr
|
|
1309
|
+
#else
|
|
1310
|
+
#define HWY_F16_CONSTEXPR HWY_BITCASTSCALAR_CXX14_CONSTEXPR
|
|
1311
|
+
#endif // HWY_HAVE_SCALAR_F16_OPERATORS
|
|
1312
|
+
|
|
1313
|
+
HWY_API HWY_F16_CONSTEXPR float F32FromF16(float16_t f16) {
|
|
1314
|
+
#if HWY_HAVE_SCALAR_F16_OPERATORS && !HWY_IDE
|
|
1315
|
+
return static_cast<float>(f16);
|
|
1316
|
+
#endif
|
|
1317
|
+
#if !HWY_HAVE_SCALAR_F16_OPERATORS || HWY_IDE
|
|
1318
|
+
const uint16_t bits16 = BitCastScalar<uint16_t>(f16);
|
|
1319
|
+
const uint32_t sign = static_cast<uint32_t>(bits16 >> 15);
|
|
1320
|
+
const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
|
|
1321
|
+
const uint32_t mantissa = bits16 & 0x3FF;
|
|
1322
|
+
|
|
1323
|
+
// Subnormal or zero
|
|
1324
|
+
if (biased_exp == 0) {
|
|
1325
|
+
const float subnormal =
|
|
1326
|
+
(1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024));
|
|
1327
|
+
return sign ? -subnormal : subnormal;
|
|
1328
|
+
}
|
|
1329
|
+
|
|
1330
|
+
// Normalized, infinity or NaN: convert the representation directly
|
|
1331
|
+
// (faster than ldexp/tables).
|
|
1332
|
+
const uint32_t biased_exp32 =
|
|
1333
|
+
biased_exp == 31 ? 0xFF : biased_exp + (127 - 15);
|
|
1334
|
+
const uint32_t mantissa32 = mantissa << (23 - 10);
|
|
1335
|
+
const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
|
|
1336
|
+
|
|
1337
|
+
return BitCastScalar<float>(bits32);
|
|
1338
|
+
#endif // !HWY_HAVE_SCALAR_F16_OPERATORS
|
|
1339
|
+
}
|
|
1340
|
+
|
|
1341
|
+
#if HWY_IS_DEBUG_BUILD && \
|
|
1342
|
+
(HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926)
|
|
1343
|
+
#if defined(__cpp_if_consteval) && __cpp_if_consteval >= 202106L
|
|
1344
|
+
// If C++23 if !consteval support is available, only execute
|
|
1345
|
+
// HWY_DASSERT(condition) if F16FromF32 is not called from a constant-evaluated
|
|
1346
|
+
// context to avoid compilation errors.
|
|
1347
|
+
#define HWY_F16_FROM_F32_DASSERT(condition) \
|
|
1348
|
+
do { \
|
|
1349
|
+
if !consteval { \
|
|
1350
|
+
HWY_DASSERT(condition); \
|
|
1351
|
+
} \
|
|
1352
|
+
} while (0)
|
|
1353
|
+
#elif HWY_HAS_BUILTIN(__builtin_is_constant_evaluated) || \
|
|
1354
|
+
HWY_COMPILER_MSVC >= 1926
|
|
1355
|
+
// If the __builtin_is_constant_evaluated() intrinsic is available,
|
|
1356
|
+
// only do HWY_DASSERT(condition) if __builtin_is_constant_evaluated() returns
|
|
1357
|
+
// false to avoid compilation errors if F16FromF32 is called from a
|
|
1358
|
+
// constant-evaluated context.
|
|
1359
|
+
#define HWY_F16_FROM_F32_DASSERT(condition) \
|
|
1360
|
+
do { \
|
|
1361
|
+
if (!__builtin_is_constant_evaluated()) { \
|
|
1362
|
+
HWY_DASSERT(condition); \
|
|
1363
|
+
} \
|
|
1364
|
+
} while (0)
|
|
1365
|
+
#else
|
|
1366
|
+
// If C++23 if !consteval support is not available,
|
|
1367
|
+
// the __builtin_is_constant_evaluated() intrinsic is not available,
|
|
1368
|
+
// HWY_IS_DEBUG_BUILD is 1, and the __builtin_bit_cast intrinsic is available,
|
|
1369
|
+
// do not do a HWY_DASSERT to avoid compilation errors if F16FromF32 is
|
|
1370
|
+
// called from a constant-evaluated context.
|
|
1371
|
+
#define HWY_F16_FROM_F32_DASSERT(condition) \
|
|
1372
|
+
do { \
|
|
1373
|
+
} while (0)
|
|
1374
|
+
#endif // defined(__cpp_if_consteval) && __cpp_if_consteval >= 202106L
|
|
1375
|
+
#else
|
|
1376
|
+
// If HWY_IS_DEBUG_BUILD is 0 or the __builtin_bit_cast intrinsic is not
|
|
1377
|
+
// available, define HWY_F16_FROM_F32_DASSERT(condition) as
|
|
1378
|
+
// HWY_DASSERT(condition)
|
|
1379
|
+
#define HWY_F16_FROM_F32_DASSERT(condition) HWY_DASSERT(condition)
|
|
1380
|
+
#endif // HWY_IS_DEBUG_BUILD && (HWY_HAS_BUILTIN(__builtin_bit_cast) ||
|
|
1381
|
+
// HWY_COMPILER_MSVC >= 1926)
|
|
1382
|
+
|
|
1383
|
+
HWY_API HWY_F16_CONSTEXPR float16_t F16FromF32(float f32) {
|
|
1384
|
+
#if HWY_HAVE_SCALAR_F16_OPERATORS && !HWY_IDE
|
|
1385
|
+
return float16_t(static_cast<float16_t::Native>(f32));
|
|
1386
|
+
#endif
|
|
1387
|
+
#if !HWY_HAVE_SCALAR_F16_OPERATORS || HWY_IDE
|
|
1388
|
+
const uint32_t bits32 = BitCastScalar<uint32_t>(f32);
|
|
1389
|
+
const uint32_t sign = bits32 >> 31;
|
|
1390
|
+
const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
|
|
1391
|
+
constexpr uint32_t kMantissaMask = 0x7FFFFF;
|
|
1392
|
+
const uint32_t mantissa32 = bits32 & kMantissaMask;
|
|
1393
|
+
|
|
1394
|
+
// Before shifting (truncation), round to nearest even to reduce bias. If
|
|
1395
|
+
// the lowest remaining mantissa bit is odd, increase the offset. Example
|
|
1396
|
+
// with the lowest remaining bit (left) and next lower two bits; the
|
|
1397
|
+
// latter, plus two more, will be truncated.
|
|
1398
|
+
// 0[00] + 1 = 0[01]
|
|
1399
|
+
// 0[01] + 1 = 0[10]
|
|
1400
|
+
// 0[10] + 1 = 0[11] (round down toward even)
|
|
1401
|
+
// 0[11] + 1 = 1[00] (round up)
|
|
1402
|
+
// 1[00] + 10 = 1[10]
|
|
1403
|
+
// 1[01] + 10 = 1[11]
|
|
1404
|
+
// 1[10] + 10 = C0[00] (round up toward even with C=1 carry out)
|
|
1405
|
+
// 1[11] + 10 = C0[01] (round up toward even with C=1 carry out)
|
|
1406
|
+
|
|
1407
|
+
// If |f32| >= 2^-24, f16_ulp_bit_idx is the index of the F32 mantissa bit
|
|
1408
|
+
// that will be shifted down into the ULP bit of the rounded down F16 result
|
|
1409
|
+
|
|
1410
|
+
// The biased F32 exponent of 2^-14 (the smallest positive normal F16 value)
|
|
1411
|
+
// is 113, and bit 13 of the F32 mantissa will be shifted down to into the ULP
|
|
1412
|
+
// bit of the rounded down F16 result if |f32| >= 2^14
|
|
1413
|
+
|
|
1414
|
+
// If |f32| < 2^-24, f16_ulp_bit_idx is equal to 24 as there are 24 mantissa
|
|
1415
|
+
// bits (including the implied 1 bit) in the mantissa of a normal F32 value
|
|
1416
|
+
// and as we want to round up the mantissa if |f32| > 2^-25 && |f32| < 2^-24
|
|
1417
|
+
const int32_t f16_ulp_bit_idx =
|
|
1418
|
+
HWY_MIN(HWY_MAX(126 - static_cast<int32_t>(biased_exp32), 13), 24);
|
|
1419
|
+
const uint32_t odd_bit = ((mantissa32 | 0x800000u) >> f16_ulp_bit_idx) & 1;
|
|
1420
|
+
const uint32_t rounded =
|
|
1421
|
+
mantissa32 + odd_bit + (uint32_t{1} << (f16_ulp_bit_idx - 1)) - 1u;
|
|
1422
|
+
const bool carry = rounded >= (1u << 23);
|
|
1423
|
+
|
|
1424
|
+
const int32_t exp = static_cast<int32_t>(biased_exp32) - 127 + carry;
|
|
1425
|
+
|
|
1426
|
+
// Tiny or zero => zero.
|
|
1427
|
+
if (exp < -24) {
|
|
1428
|
+
// restore original sign
|
|
1429
|
+
return float16_t::FromBits(static_cast<uint16_t>(sign << 15));
|
|
1430
|
+
}
|
|
1431
|
+
|
|
1432
|
+
// If biased_exp16 would be >= 31, first check whether the input was NaN so we
|
|
1433
|
+
// can set the mantissa to nonzero.
|
|
1434
|
+
const bool is_nan = (biased_exp32 == 255) && mantissa32 != 0;
|
|
1435
|
+
const bool overflowed = exp >= 16;
|
|
1436
|
+
const uint32_t biased_exp16 =
|
|
1437
|
+
static_cast<uint32_t>(HWY_MIN(HWY_MAX(0, exp + 15), 31));
|
|
1438
|
+
// exp = [-24, -15] => subnormal, shift the mantissa.
|
|
1439
|
+
const uint32_t sub_exp = static_cast<uint32_t>(HWY_MAX(-14 - exp, 0));
|
|
1440
|
+
HWY_F16_FROM_F32_DASSERT(sub_exp < 11);
|
|
1441
|
+
const uint32_t shifted_mantissa =
|
|
1442
|
+
(rounded & kMantissaMask) >> (23 - 10 + sub_exp);
|
|
1443
|
+
const uint32_t leading = sub_exp == 0u ? 0u : (1024u >> sub_exp);
|
|
1444
|
+
const uint32_t mantissa16 = is_nan ? 0x3FF
|
|
1445
|
+
: overflowed ? 0u
|
|
1446
|
+
: (leading + shifted_mantissa);
|
|
1447
|
+
|
|
1448
|
+
#if HWY_IS_DEBUG_BUILD
|
|
1449
|
+
if (exp < -14) {
|
|
1450
|
+
HWY_F16_FROM_F32_DASSERT(biased_exp16 == 0);
|
|
1451
|
+
HWY_F16_FROM_F32_DASSERT(sub_exp >= 1);
|
|
1452
|
+
} else if (exp <= 15) {
|
|
1453
|
+
HWY_F16_FROM_F32_DASSERT(1 <= biased_exp16 && biased_exp16 < 31);
|
|
1454
|
+
HWY_F16_FROM_F32_DASSERT(sub_exp == 0);
|
|
1455
|
+
}
|
|
1456
|
+
#endif
|
|
1457
|
+
|
|
1458
|
+
HWY_F16_FROM_F32_DASSERT(mantissa16 < 1024);
|
|
1459
|
+
const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
|
|
1460
|
+
HWY_F16_FROM_F32_DASSERT(bits16 < 0x10000);
|
|
1461
|
+
const uint16_t narrowed = static_cast<uint16_t>(bits16); // big-endian safe
|
|
1462
|
+
return float16_t::FromBits(narrowed);
|
|
1463
|
+
#endif // !HWY_HAVE_SCALAR_F16_OPERATORS
|
|
1464
|
+
}
|
|
1465
|
+
|
|
1466
|
+
HWY_API HWY_F16_CONSTEXPR float16_t F16FromF64(double f64) {
|
|
1467
|
+
#if HWY_HAVE_SCALAR_F16_OPERATORS
|
|
1468
|
+
return float16_t(static_cast<float16_t::Native>(f64));
|
|
1469
|
+
#else
|
|
1470
|
+
// The mantissa bits of f64 are first rounded using round-to-odd rounding
|
|
1471
|
+
// to the nearest f64 value that has the lower 29 bits zeroed out to
|
|
1472
|
+
// ensure that the result is correctly rounded to a F16.
|
|
1473
|
+
|
|
1474
|
+
// The F64 round-to-odd operation below will round a normal F64 value
|
|
1475
|
+
// (using round-to-odd rounding) to a F64 value that has 24 bits of precision.
|
|
1476
|
+
|
|
1477
|
+
// It is okay if the magnitude of a denormal F64 value is rounded up in the
|
|
1478
|
+
// F64 round-to-odd step below as the magnitude of a denormal F64 value is
|
|
1479
|
+
// much smaller than 2^(-24) (the smallest positive denormal F16 value).
|
|
1480
|
+
|
|
1481
|
+
// It is also okay if bit 29 of a NaN F64 value is changed by the F64
|
|
1482
|
+
// round-to-odd step below as the lower 13 bits of a F32 NaN value are usually
|
|
1483
|
+
// discarded or ignored by the conversion of a F32 NaN value to a F16.
|
|
1484
|
+
|
|
1485
|
+
// If f64 is a NaN value, the result of the F64 round-to-odd step will be a
|
|
1486
|
+
// NaN value as the result of the F64 round-to-odd step will have at least one
|
|
1487
|
+
// mantissa bit if f64 is a NaN value.
|
|
1488
|
+
|
|
1489
|
+
// The F64 round-to-odd step will ensure that the F64 to F32 conversion is
|
|
1490
|
+
// exact if the magnitude of the rounded F64 value (using round-to-odd
|
|
1491
|
+
// rounding) is between 2^(-126) (the smallest normal F32 value) and
|
|
1492
|
+
// HighestValue<float>() (the largest finite F32 value)
|
|
1493
|
+
|
|
1494
|
+
// It is okay if the F64 to F32 conversion is inexact for F64 values that have
|
|
1495
|
+
// a magnitude that is less than 2^(-126) as the magnitude of a denormal F32
|
|
1496
|
+
// value is much smaller than 2^(-24) (the smallest positive denormal F16
|
|
1497
|
+
// value).
|
|
1498
|
+
|
|
1499
|
+
return F16FromF32(
|
|
1500
|
+
static_cast<float>(BitCastScalar<double>(static_cast<uint64_t>(
|
|
1501
|
+
(BitCastScalar<uint64_t>(f64) & 0xFFFFFFFFE0000000ULL) |
|
|
1502
|
+
((BitCastScalar<uint64_t>(f64) + 0x000000001FFFFFFFULL) &
|
|
1503
|
+
0x0000000020000000ULL)))));
|
|
1504
|
+
#endif
|
|
1505
|
+
}
|
|
1506
|
+
|
|
1507
|
+
// More convenient to define outside float16_t because these may use
|
|
1508
|
+
// F32FromF16, which is defined after the struct.
|
|
1509
|
+
HWY_F16_CONSTEXPR inline bool operator==(float16_t lhs,
|
|
1510
|
+
float16_t rhs) noexcept {
|
|
1511
|
+
#if HWY_HAVE_SCALAR_F16_OPERATORS
|
|
1512
|
+
return lhs.native == rhs.native;
|
|
1513
|
+
#else
|
|
1514
|
+
return F32FromF16(lhs) == F32FromF16(rhs);
|
|
1515
|
+
#endif
|
|
1516
|
+
}
|
|
1517
|
+
HWY_F16_CONSTEXPR inline bool operator!=(float16_t lhs,
|
|
1518
|
+
float16_t rhs) noexcept {
|
|
1519
|
+
#if HWY_HAVE_SCALAR_F16_OPERATORS
|
|
1520
|
+
return lhs.native != rhs.native;
|
|
1521
|
+
#else
|
|
1522
|
+
return F32FromF16(lhs) != F32FromF16(rhs);
|
|
1523
|
+
#endif
|
|
1524
|
+
}
|
|
1525
|
+
HWY_F16_CONSTEXPR inline bool operator<(float16_t lhs, float16_t rhs) noexcept {
|
|
1526
|
+
#if HWY_HAVE_SCALAR_F16_OPERATORS
|
|
1527
|
+
return lhs.native < rhs.native;
|
|
1528
|
+
#else
|
|
1529
|
+
return F32FromF16(lhs) < F32FromF16(rhs);
|
|
1530
|
+
#endif
|
|
1531
|
+
}
|
|
1532
|
+
HWY_F16_CONSTEXPR inline bool operator<=(float16_t lhs,
|
|
1533
|
+
float16_t rhs) noexcept {
|
|
1534
|
+
#if HWY_HAVE_SCALAR_F16_OPERATORS
|
|
1535
|
+
return lhs.native <= rhs.native;
|
|
1536
|
+
#else
|
|
1537
|
+
return F32FromF16(lhs) <= F32FromF16(rhs);
|
|
1538
|
+
#endif
|
|
1539
|
+
}
|
|
1540
|
+
HWY_F16_CONSTEXPR inline bool operator>(float16_t lhs, float16_t rhs) noexcept {
|
|
1541
|
+
#if HWY_HAVE_SCALAR_F16_OPERATORS
|
|
1542
|
+
return lhs.native > rhs.native;
|
|
1543
|
+
#else
|
|
1544
|
+
return F32FromF16(lhs) > F32FromF16(rhs);
|
|
1545
|
+
#endif
|
|
1546
|
+
}
|
|
1547
|
+
HWY_F16_CONSTEXPR inline bool operator>=(float16_t lhs,
|
|
1548
|
+
float16_t rhs) noexcept {
|
|
1549
|
+
#if HWY_HAVE_SCALAR_F16_OPERATORS
|
|
1550
|
+
return lhs.native >= rhs.native;
|
|
1551
|
+
#else
|
|
1552
|
+
return F32FromF16(lhs) >= F32FromF16(rhs);
|
|
1553
|
+
#endif
|
|
1554
|
+
}
|
|
1555
|
+
#if HWY_HAVE_CXX20_THREE_WAY_COMPARE
|
|
1556
|
+
HWY_F16_CONSTEXPR inline std::partial_ordering operator<=>(
|
|
1557
|
+
float16_t lhs, float16_t rhs) noexcept {
|
|
1558
|
+
#if HWY_HAVE_SCALAR_F16_OPERATORS
|
|
1559
|
+
return lhs.native <=> rhs.native;
|
|
1560
|
+
#else
|
|
1561
|
+
return F32FromF16(lhs) <=> F32FromF16(rhs);
|
|
1562
|
+
#endif
|
|
1563
|
+
}
|
|
1564
|
+
#endif // HWY_HAVE_CXX20_THREE_WAY_COMPARE
|
|
1565
|
+
|
|
1566
|
+
//------------------------------------------------------------------------------
|
|
1567
|
+
// BF16 lane type
|
|
1568
|
+
|
|
1569
|
+
// Compiler supports ACLE __bf16, not necessarily with operators.
|
|
1570
|
+
|
|
1571
|
+
// Disable the __bf16 type on AArch64 with GCC 13 or earlier as there is a bug
|
|
1572
|
+
// in GCC 13 and earlier that sometimes causes BF16 constant values to be
|
|
1573
|
+
// incorrectly loaded on AArch64, and this GCC bug on AArch64 is
|
|
1574
|
+
// described at https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111867.
|
|
1575
|
+
|
|
1576
|
+
#if HWY_ARCH_ARM_A64 && \
|
|
1577
|
+
(HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400)
|
|
1578
|
+
#define HWY_ARM_HAVE_SCALAR_BF16_TYPE 1
|
|
1579
|
+
#else
|
|
1580
|
+
#define HWY_ARM_HAVE_SCALAR_BF16_TYPE 0
|
|
1581
|
+
#endif
|
|
1582
|
+
|
|
1583
|
+
// x86 compiler supports __bf16, not necessarily with operators.
|
|
1584
|
+
#ifndef HWY_SSE2_HAVE_SCALAR_BF16_TYPE
|
|
1585
|
+
#if HWY_ARCH_X86 && defined(__SSE2__) && \
|
|
1586
|
+
((HWY_COMPILER_CLANG >= 1700 && !HWY_COMPILER_CLANGCL) || \
|
|
1587
|
+
HWY_COMPILER_GCC_ACTUAL >= 1300)
|
|
1588
|
+
#define HWY_SSE2_HAVE_SCALAR_BF16_TYPE 1
|
|
1589
|
+
#else
|
|
1590
|
+
#define HWY_SSE2_HAVE_SCALAR_BF16_TYPE 0
|
|
1591
|
+
#endif
|
|
1592
|
+
#endif // HWY_SSE2_HAVE_SCALAR_BF16_TYPE
|
|
1593
|
+
|
|
1594
|
+
// Compiler supports __bf16, not necessarily with operators.
|
|
1595
|
+
#if HWY_ARM_HAVE_SCALAR_BF16_TYPE || HWY_SSE2_HAVE_SCALAR_BF16_TYPE
|
|
1596
|
+
#define HWY_HAVE_SCALAR_BF16_TYPE 1
|
|
1597
|
+
#else
|
|
1598
|
+
#define HWY_HAVE_SCALAR_BF16_TYPE 0
|
|
1599
|
+
#endif
|
|
1600
|
+
|
|
1601
|
+
#ifndef HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
1602
|
+
// Recent enough compiler also has operators. aarch64 clang 18 hits internal
|
|
1603
|
+
// compiler errors on bf16 ToString, hence only enable on GCC for now.
|
|
1604
|
+
#if HWY_HAVE_SCALAR_BF16_TYPE && (HWY_COMPILER_GCC_ACTUAL >= 1300)
|
|
1605
|
+
#define HWY_HAVE_SCALAR_BF16_OPERATORS 1
|
|
1606
|
+
#else
|
|
1607
|
+
#define HWY_HAVE_SCALAR_BF16_OPERATORS 0
|
|
1608
|
+
#endif
|
|
1609
|
+
#endif // HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
1610
|
+
|
|
1611
|
+
#if HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
1612
|
+
#define HWY_BF16_CONSTEXPR constexpr
|
|
1613
|
+
#else
|
|
1614
|
+
#define HWY_BF16_CONSTEXPR HWY_BITCASTSCALAR_CONSTEXPR
|
|
1615
|
+
#endif
|
|
1616
|
+
|
|
1617
|
+
struct alignas(2) bfloat16_t {
|
|
1618
|
+
#if HWY_HAVE_SCALAR_BF16_TYPE
|
|
1619
|
+
using Native = __bf16;
|
|
1620
|
+
#endif
|
|
1621
|
+
|
|
1622
|
+
union {
|
|
1623
|
+
#if HWY_HAVE_SCALAR_BF16_TYPE
|
|
1624
|
+
// Accessed via NativeLaneType, and used directly if
|
|
1625
|
+
// HWY_HAVE_SCALAR_BF16_OPERATORS.
|
|
1626
|
+
Native native;
|
|
1627
|
+
#endif
|
|
1628
|
+
// Only accessed via NativeLaneType or U16LaneType.
|
|
1629
|
+
uint16_t bits;
|
|
1630
|
+
};
|
|
1631
|
+
|
|
1632
|
+
// Default init and copying
|
|
1633
|
+
bfloat16_t() noexcept = default;
|
|
1634
|
+
constexpr bfloat16_t(bfloat16_t&&) noexcept = default;
|
|
1635
|
+
constexpr bfloat16_t(const bfloat16_t&) noexcept = default;
|
|
1636
|
+
bfloat16_t& operator=(bfloat16_t&& arg) noexcept = default;
|
|
1637
|
+
bfloat16_t& operator=(const bfloat16_t& arg) noexcept = default;
|
|
1638
|
+
|
|
1639
|
+
// Only enable implicit conversions if we have a native type.
|
|
1640
|
+
#if HWY_HAVE_SCALAR_BF16_TYPE
|
|
1641
|
+
constexpr bfloat16_t(Native arg) noexcept : native(arg) {}
|
|
1642
|
+
constexpr operator Native() const noexcept { return native; }
|
|
1643
|
+
#endif
|
|
1644
|
+
|
|
1645
|
+
#if HWY_HAVE_SCALAR_BF16_TYPE
|
|
1646
|
+
static HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t FromBits(uint16_t bits) {
|
|
1647
|
+
return bfloat16_t(BitCastScalar<Native>(bits));
|
|
1648
|
+
}
|
|
1649
|
+
#else
|
|
1650
|
+
|
|
1651
|
+
private:
|
|
1652
|
+
struct BF16FromU16BitsTag {};
|
|
1653
|
+
constexpr bfloat16_t(BF16FromU16BitsTag /*tag*/, uint16_t u16_bits)
|
|
1654
|
+
: bits(u16_bits) {}
|
|
1655
|
+
|
|
1656
|
+
public:
|
|
1657
|
+
static constexpr bfloat16_t FromBits(uint16_t bits) {
|
|
1658
|
+
return bfloat16_t(BF16FromU16BitsTag(), bits);
|
|
1659
|
+
}
|
|
1660
|
+
#endif
|
|
1661
|
+
|
|
1662
|
+
// When backed by a native type, ensure the wrapper behaves like the native
|
|
1663
|
+
// type by forwarding all operators. Unfortunately it seems difficult to reuse
|
|
1664
|
+
// this code in a base class, so we repeat it in float16_t.
|
|
1665
|
+
#if HWY_HAVE_SCALAR_BF16_OPERATORS || HWY_IDE
|
|
1666
|
+
template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, Native>() &&
|
|
1667
|
+
!IsSame<RemoveCvRef<T>, bfloat16_t>() &&
|
|
1668
|
+
IsConvertible<T, Native>()>* = nullptr>
|
|
1669
|
+
constexpr bfloat16_t(T&& arg) noexcept(
|
|
1670
|
+
noexcept(static_cast<Native>(DeclVal<T>())))
|
|
1671
|
+
: native(static_cast<Native>(static_cast<T&&>(arg))) {}
|
|
1672
|
+
|
|
1673
|
+
template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, Native>() &&
|
|
1674
|
+
!IsSame<RemoveCvRef<T>, bfloat16_t>() &&
|
|
1675
|
+
!IsConvertible<T, Native>() &&
|
|
1676
|
+
IsStaticCastable<T, Native>()>* = nullptr>
|
|
1677
|
+
explicit constexpr bfloat16_t(T&& arg) noexcept(
|
|
1678
|
+
noexcept(static_cast<Native>(DeclVal<T>())))
|
|
1679
|
+
: native(static_cast<Native>(static_cast<T&&>(arg))) {}
|
|
1680
|
+
|
|
1681
|
+
HWY_CXX14_CONSTEXPR bfloat16_t& operator=(Native arg) noexcept {
|
|
1682
|
+
native = arg;
|
|
1683
|
+
return *this;
|
|
1684
|
+
}
|
|
1685
|
+
|
|
1686
|
+
// pre-decrement operator (--x)
|
|
1687
|
+
HWY_CXX14_CONSTEXPR bfloat16_t& operator--() noexcept {
|
|
1688
|
+
native = static_cast<Native>(native - Native{1});
|
|
1689
|
+
return *this;
|
|
1690
|
+
}
|
|
874
1691
|
|
|
875
|
-
//
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
|
|
881
|
-
#define HWY_IF_UI64(T) \
|
|
882
|
-
hwy::EnableIf<IsSame<T, uint64_t>() || IsSame<T, int64_t>()>* = nullptr
|
|
883
|
-
#define HWY_IF_BF16(T) hwy::EnableIf<IsSame<T, hwy::bfloat16_t>()>* = nullptr
|
|
884
|
-
#define HWY_IF_F16(T) hwy::EnableIf<IsSame<T, hwy::float16_t>()>* = nullptr
|
|
1692
|
+
// post-decrement operator (x--)
|
|
1693
|
+
HWY_CXX14_CONSTEXPR bfloat16_t operator--(int) noexcept {
|
|
1694
|
+
bfloat16_t result = *this;
|
|
1695
|
+
native = static_cast<Native>(native - Native{1});
|
|
1696
|
+
return result;
|
|
1697
|
+
}
|
|
885
1698
|
|
|
886
|
-
|
|
887
|
-
|
|
1699
|
+
// pre-increment operator (++x)
|
|
1700
|
+
HWY_CXX14_CONSTEXPR bfloat16_t& operator++() noexcept {
|
|
1701
|
+
native = static_cast<Native>(native + Native{1});
|
|
1702
|
+
return *this;
|
|
1703
|
+
}
|
|
888
1704
|
|
|
889
|
-
//
|
|
890
|
-
|
|
891
|
-
|
|
1705
|
+
// post-increment operator (x++)
|
|
1706
|
+
HWY_CXX14_CONSTEXPR bfloat16_t operator++(int) noexcept {
|
|
1707
|
+
bfloat16_t result = *this;
|
|
1708
|
+
native = static_cast<Native>(native + Native{1});
|
|
1709
|
+
return result;
|
|
1710
|
+
}
|
|
892
1711
|
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
}
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
1712
|
+
constexpr bfloat16_t operator-() const noexcept {
|
|
1713
|
+
return bfloat16_t(static_cast<Native>(-native));
|
|
1714
|
+
}
|
|
1715
|
+
constexpr bfloat16_t operator+() const noexcept { return *this; }
|
|
1716
|
+
|
|
1717
|
+
// Reduce clutter by generating `operator+` and `operator+=` etc. Note that
|
|
1718
|
+
// we cannot token-paste `operator` and `+`, so pass it in as `op_func`.
|
|
1719
|
+
#define HWY_BFLOAT16_BINARY_OP(op, op_func, assign_func) \
|
|
1720
|
+
constexpr bfloat16_t op_func(const bfloat16_t& rhs) const noexcept { \
|
|
1721
|
+
return bfloat16_t(static_cast<Native>(native op rhs.native)); \
|
|
1722
|
+
} \
|
|
1723
|
+
template <typename T, HWY_IF_NOT_BF16(T), \
|
|
1724
|
+
typename UnwrappedT = \
|
|
1725
|
+
detail::SpecialFloatUnwrapArithOpOperand<const T&>, \
|
|
1726
|
+
typename RawResultT = \
|
|
1727
|
+
decltype(DeclVal<Native>() op DeclVal<UnwrappedT>()), \
|
|
1728
|
+
typename ResultT = \
|
|
1729
|
+
detail::NativeSpecialFloatToWrapper<RawResultT>, \
|
|
1730
|
+
HWY_IF_CASTABLE(RawResultT, ResultT)> \
|
|
1731
|
+
constexpr ResultT op_func(const T& rhs) const noexcept(noexcept( \
|
|
1732
|
+
static_cast<ResultT>(DeclVal<Native>() op DeclVal<UnwrappedT>()))) { \
|
|
1733
|
+
return static_cast<ResultT>(native op static_cast<UnwrappedT>(rhs)); \
|
|
1734
|
+
} \
|
|
1735
|
+
HWY_CXX14_CONSTEXPR hwy::bfloat16_t& assign_func( \
|
|
1736
|
+
const hwy::bfloat16_t& rhs) noexcept { \
|
|
1737
|
+
native = static_cast<Native>(native op rhs.native); \
|
|
1738
|
+
return *this; \
|
|
1739
|
+
} \
|
|
1740
|
+
template <typename T, HWY_IF_NOT_BF16(T), \
|
|
1741
|
+
HWY_IF_OP_CASTABLE(op, const T&, Native), \
|
|
1742
|
+
HWY_IF_ASSIGNABLE( \
|
|
1743
|
+
Native, decltype(DeclVal<Native>() op DeclVal<const T&>()))> \
|
|
1744
|
+
HWY_CXX14_CONSTEXPR hwy::bfloat16_t& assign_func(const T& rhs) noexcept( \
|
|
1745
|
+
noexcept( \
|
|
1746
|
+
static_cast<Native>(DeclVal<Native>() op DeclVal<const T&>()))) { \
|
|
1747
|
+
native = static_cast<Native>(native op rhs); \
|
|
1748
|
+
return *this; \
|
|
1749
|
+
}
|
|
1750
|
+
HWY_BFLOAT16_BINARY_OP(+, operator+, operator+=)
|
|
1751
|
+
HWY_BFLOAT16_BINARY_OP(-, operator-, operator-=)
|
|
1752
|
+
HWY_BFLOAT16_BINARY_OP(*, operator*, operator*=)
|
|
1753
|
+
HWY_BFLOAT16_BINARY_OP(/, operator/, operator/=)
|
|
1754
|
+
#undef HWY_BFLOAT16_BINARY_OP
|
|
1755
|
+
|
|
1756
|
+
#endif // HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
900
1757
|
};
|
|
1758
|
+
static_assert(sizeof(hwy::bfloat16_t) == 2, "Wrong size of bfloat16_t");
|
|
901
1759
|
|
|
902
|
-
|
|
903
|
-
|
|
1760
|
+
#pragma pack(pop)
|
|
1761
|
+
|
|
1762
|
+
#if HWY_HAVE_SCALAR_BF16_TYPE
|
|
1763
|
+
namespace detail {
|
|
904
1764
|
|
|
1765
|
+
#if HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
905
1766
|
template <class T>
|
|
906
|
-
struct
|
|
907
|
-
using type =
|
|
1767
|
+
struct SpecialFloatUnwrapArithOpOperandT<T, hwy::bfloat16_t, true> {
|
|
1768
|
+
using type = hwy::bfloat16_t::Native;
|
|
908
1769
|
};
|
|
1770
|
+
#endif
|
|
1771
|
+
|
|
909
1772
|
template <class T>
|
|
910
|
-
struct
|
|
911
|
-
using type =
|
|
1773
|
+
struct NativeSpecialFloatToWrapperT<T, hwy::bfloat16_t::Native> {
|
|
1774
|
+
using type = hwy::bfloat16_t;
|
|
912
1775
|
};
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
|
|
1776
|
+
|
|
1777
|
+
} // namespace detail
|
|
1778
|
+
#endif // HWY_HAVE_SCALAR_BF16_TYPE
|
|
1779
|
+
|
|
1780
|
+
#if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
|
|
1781
|
+
namespace detail {
|
|
1782
|
+
|
|
1783
|
+
template <>
|
|
1784
|
+
struct BitCastScalarSrcCastHelper<hwy::bfloat16_t> {
|
|
1785
|
+
#if HWY_HAVE_SCALAR_BF16_TYPE
|
|
1786
|
+
static HWY_INLINE constexpr const hwy::bfloat16_t::Native& CastSrcValRef(
|
|
1787
|
+
const hwy::bfloat16_t& val) {
|
|
1788
|
+
return val.native;
|
|
1789
|
+
}
|
|
1790
|
+
#else
|
|
1791
|
+
static HWY_INLINE constexpr const uint16_t& CastSrcValRef(
|
|
1792
|
+
const hwy::bfloat16_t& val) {
|
|
1793
|
+
return val.bits;
|
|
1794
|
+
}
|
|
1795
|
+
#endif
|
|
916
1796
|
};
|
|
917
1797
|
|
|
918
|
-
|
|
919
|
-
|
|
1798
|
+
} // namespace detail
|
|
1799
|
+
#endif // HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
|
|
1800
|
+
|
|
1801
|
+
HWY_API HWY_BF16_CONSTEXPR float F32FromBF16(bfloat16_t bf) {
|
|
1802
|
+
#if HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
1803
|
+
return static_cast<float>(bf);
|
|
1804
|
+
#else
|
|
1805
|
+
return BitCastScalar<float>(static_cast<uint32_t>(
|
|
1806
|
+
static_cast<uint32_t>(BitCastScalar<uint16_t>(bf)) << 16));
|
|
1807
|
+
#endif
|
|
1808
|
+
}
|
|
1809
|
+
|
|
1810
|
+
namespace detail {
|
|
1811
|
+
|
|
1812
|
+
// Returns the increment to add to the bits of a finite F32 value to round a
|
|
1813
|
+
// finite F32 to the nearest BF16 value
|
|
1814
|
+
static HWY_INLINE HWY_MAYBE_UNUSED constexpr uint32_t F32BitsToBF16RoundIncr(
|
|
1815
|
+
const uint32_t f32_bits) {
|
|
1816
|
+
return static_cast<uint32_t>(((f32_bits & 0x7FFFFFFFu) < 0x7F800000u)
|
|
1817
|
+
? (0x7FFFu + ((f32_bits >> 16) & 1u))
|
|
1818
|
+
: 0u);
|
|
1819
|
+
}
|
|
1820
|
+
|
|
1821
|
+
// Converts f32_bits (which is the bits of a F32 value) to BF16 bits,
|
|
1822
|
+
// rounded to the nearest F16 value
|
|
1823
|
+
static HWY_INLINE HWY_MAYBE_UNUSED constexpr uint16_t F32BitsToBF16Bits(
|
|
1824
|
+
const uint32_t f32_bits) {
|
|
1825
|
+
// Round f32_bits to the nearest BF16 by first adding
|
|
1826
|
+
// F32BitsToBF16RoundIncr(f32_bits) to f32_bits and then right shifting
|
|
1827
|
+
// f32_bits + F32BitsToBF16RoundIncr(f32_bits) by 16
|
|
1828
|
+
|
|
1829
|
+
// If f32_bits is the bit representation of a NaN F32 value, make sure that
|
|
1830
|
+
// bit 6 of the BF16 result is set to convert SNaN F32 values to QNaN BF16
|
|
1831
|
+
// values and to prevent NaN F32 values from being converted to an infinite
|
|
1832
|
+
// BF16 value
|
|
1833
|
+
return static_cast<uint16_t>(
|
|
1834
|
+
((f32_bits + F32BitsToBF16RoundIncr(f32_bits)) >> 16) |
|
|
1835
|
+
(static_cast<uint32_t>((f32_bits & 0x7FFFFFFFu) > 0x7F800000u) << 6));
|
|
1836
|
+
}
|
|
1837
|
+
|
|
1838
|
+
} // namespace detail
|
|
1839
|
+
|
|
1840
|
+
HWY_API HWY_BF16_CONSTEXPR bfloat16_t BF16FromF32(float f) {
|
|
1841
|
+
#if HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
1842
|
+
return static_cast<bfloat16_t>(f);
|
|
1843
|
+
#else
|
|
1844
|
+
return bfloat16_t::FromBits(
|
|
1845
|
+
detail::F32BitsToBF16Bits(BitCastScalar<uint32_t>(f)));
|
|
1846
|
+
#endif
|
|
1847
|
+
}
|
|
1848
|
+
|
|
1849
|
+
HWY_API HWY_BF16_CONSTEXPR bfloat16_t BF16FromF64(double f64) {
|
|
1850
|
+
#if HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
1851
|
+
return static_cast<bfloat16_t>(f64);
|
|
1852
|
+
#else
|
|
1853
|
+
// The mantissa bits of f64 are first rounded using round-to-odd rounding
|
|
1854
|
+
// to the nearest f64 value that has the lower 38 bits zeroed out to
|
|
1855
|
+
// ensure that the result is correctly rounded to a BF16.
|
|
1856
|
+
|
|
1857
|
+
// The F64 round-to-odd operation below will round a normal F64 value
|
|
1858
|
+
// (using round-to-odd rounding) to a F64 value that has 15 bits of precision.
|
|
1859
|
+
|
|
1860
|
+
// It is okay if the magnitude of a denormal F64 value is rounded up in the
|
|
1861
|
+
// F64 round-to-odd step below as the magnitude of a denormal F64 value is
|
|
1862
|
+
// much smaller than 2^(-133) (the smallest positive denormal BF16 value).
|
|
1863
|
+
|
|
1864
|
+
// It is also okay if bit 38 of a NaN F64 value is changed by the F64
|
|
1865
|
+
// round-to-odd step below as the lower 16 bits of a F32 NaN value are usually
|
|
1866
|
+
// discarded or ignored by the conversion of a F32 NaN value to a BF16.
|
|
1867
|
+
|
|
1868
|
+
// If f64 is a NaN value, the result of the F64 round-to-odd step will be a
|
|
1869
|
+
// NaN value as the result of the F64 round-to-odd step will have at least one
|
|
1870
|
+
// mantissa bit if f64 is a NaN value.
|
|
1871
|
+
|
|
1872
|
+
// The F64 round-to-odd step below will ensure that the F64 to F32 conversion
|
|
1873
|
+
// is exact if the magnitude of the rounded F64 value (using round-to-odd
|
|
1874
|
+
// rounding) is between 2^(-135) (one-fourth of the smallest positive denormal
|
|
1875
|
+
// BF16 value) and HighestValue<float>() (the largest finite F32 value).
|
|
1876
|
+
|
|
1877
|
+
// If |f64| is less than 2^(-135), the magnitude of the result of the F64 to
|
|
1878
|
+
// F32 conversion is guaranteed to be less than or equal to 2^(-135), which
|
|
1879
|
+
// ensures that the F32 to BF16 conversion is correctly rounded, even if the
|
|
1880
|
+
// conversion of a rounded F64 value whose magnitude is less than 2^(-135)
|
|
1881
|
+
// to a F32 is inexact.
|
|
1882
|
+
|
|
1883
|
+
return BF16FromF32(
|
|
1884
|
+
static_cast<float>(BitCastScalar<double>(static_cast<uint64_t>(
|
|
1885
|
+
(BitCastScalar<uint64_t>(f64) & 0xFFFFFFC000000000ULL) |
|
|
1886
|
+
((BitCastScalar<uint64_t>(f64) + 0x0000003FFFFFFFFFULL) &
|
|
1887
|
+
0x0000004000000000ULL)))));
|
|
1888
|
+
#endif
|
|
1889
|
+
}
|
|
1890
|
+
|
|
1891
|
+
// More convenient to define outside bfloat16_t because these may use
|
|
1892
|
+
// F32FromBF16, which is defined after the struct.
|
|
1893
|
+
|
|
1894
|
+
HWY_BF16_CONSTEXPR inline bool operator==(bfloat16_t lhs,
|
|
1895
|
+
bfloat16_t rhs) noexcept {
|
|
1896
|
+
#if HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
1897
|
+
return lhs.native == rhs.native;
|
|
1898
|
+
#else
|
|
1899
|
+
return F32FromBF16(lhs) == F32FromBF16(rhs);
|
|
1900
|
+
#endif
|
|
1901
|
+
}
|
|
1902
|
+
|
|
1903
|
+
HWY_BF16_CONSTEXPR inline bool operator!=(bfloat16_t lhs,
|
|
1904
|
+
bfloat16_t rhs) noexcept {
|
|
1905
|
+
#if HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
1906
|
+
return lhs.native != rhs.native;
|
|
1907
|
+
#else
|
|
1908
|
+
return F32FromBF16(lhs) != F32FromBF16(rhs);
|
|
1909
|
+
#endif
|
|
1910
|
+
}
|
|
1911
|
+
HWY_BF16_CONSTEXPR inline bool operator<(bfloat16_t lhs,
|
|
1912
|
+
bfloat16_t rhs) noexcept {
|
|
1913
|
+
#if HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
1914
|
+
return lhs.native < rhs.native;
|
|
1915
|
+
#else
|
|
1916
|
+
return F32FromBF16(lhs) < F32FromBF16(rhs);
|
|
1917
|
+
#endif
|
|
1918
|
+
}
|
|
1919
|
+
HWY_BF16_CONSTEXPR inline bool operator<=(bfloat16_t lhs,
|
|
1920
|
+
bfloat16_t rhs) noexcept {
|
|
1921
|
+
#if HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
1922
|
+
return lhs.native <= rhs.native;
|
|
1923
|
+
#else
|
|
1924
|
+
return F32FromBF16(lhs) <= F32FromBF16(rhs);
|
|
1925
|
+
#endif
|
|
1926
|
+
}
|
|
1927
|
+
HWY_BF16_CONSTEXPR inline bool operator>(bfloat16_t lhs,
|
|
1928
|
+
bfloat16_t rhs) noexcept {
|
|
1929
|
+
#if HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
1930
|
+
return lhs.native > rhs.native;
|
|
1931
|
+
#else
|
|
1932
|
+
return F32FromBF16(lhs) > F32FromBF16(rhs);
|
|
1933
|
+
#endif
|
|
1934
|
+
}
|
|
1935
|
+
HWY_BF16_CONSTEXPR inline bool operator>=(bfloat16_t lhs,
|
|
1936
|
+
bfloat16_t rhs) noexcept {
|
|
1937
|
+
#if HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
1938
|
+
return lhs.native >= rhs.native;
|
|
1939
|
+
#else
|
|
1940
|
+
return F32FromBF16(lhs) >= F32FromBF16(rhs);
|
|
1941
|
+
#endif
|
|
1942
|
+
}
|
|
1943
|
+
#if HWY_HAVE_CXX20_THREE_WAY_COMPARE
|
|
1944
|
+
HWY_BF16_CONSTEXPR inline std::partial_ordering operator<=>(
|
|
1945
|
+
bfloat16_t lhs, bfloat16_t rhs) noexcept {
|
|
1946
|
+
#if HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
1947
|
+
return lhs.native <=> rhs.native;
|
|
1948
|
+
#else
|
|
1949
|
+
return F32FromBF16(lhs) <=> F32FromBF16(rhs);
|
|
1950
|
+
#endif
|
|
1951
|
+
}
|
|
1952
|
+
#endif // HWY_HAVE_CXX20_THREE_WAY_COMPARE
|
|
920
1953
|
|
|
921
1954
|
//------------------------------------------------------------------------------
|
|
922
1955
|
// Type relations
|
|
@@ -1110,25 +2143,19 @@ constexpr auto IsFloatTag() -> hwy::SizeTag<(R::is_float ? 0x200 : 0x400)> {
|
|
|
1110
2143
|
|
|
1111
2144
|
template <typename T>
|
|
1112
2145
|
HWY_API constexpr bool IsFloat3264() {
|
|
1113
|
-
return
|
|
2146
|
+
return IsSameEither<RemoveCvRef<T>, float, double>();
|
|
1114
2147
|
}
|
|
1115
2148
|
|
|
1116
2149
|
template <typename T>
|
|
1117
2150
|
HWY_API constexpr bool IsFloat() {
|
|
1118
2151
|
// Cannot use T(1.25) != T(1) for float16_t, which can only be converted to or
|
|
1119
2152
|
// from a float, not compared. Include float16_t in case HWY_HAVE_FLOAT16=1.
|
|
1120
|
-
return IsSame<T
|
|
1121
|
-
}
|
|
1122
|
-
|
|
1123
|
-
// These types are often special-cased and not supported in all ops.
|
|
1124
|
-
template <typename T>
|
|
1125
|
-
HWY_API constexpr bool IsSpecialFloat() {
|
|
1126
|
-
return IsSame<T, float16_t>() || IsSame<T, bfloat16_t>();
|
|
2153
|
+
return IsSame<RemoveCvRef<T>, float16_t>() || IsFloat3264<T>();
|
|
1127
2154
|
}
|
|
1128
2155
|
|
|
1129
2156
|
template <typename T>
|
|
1130
2157
|
HWY_API constexpr bool IsSigned() {
|
|
1131
|
-
return T(0) > T(-1);
|
|
2158
|
+
return static_cast<T>(0) > static_cast<T>(-1);
|
|
1132
2159
|
}
|
|
1133
2160
|
template <>
|
|
1134
2161
|
constexpr bool IsSigned<float16_t>() {
|
|
@@ -1138,104 +2165,113 @@ template <>
|
|
|
1138
2165
|
constexpr bool IsSigned<bfloat16_t>() {
|
|
1139
2166
|
return true;
|
|
1140
2167
|
}
|
|
2168
|
+
template <>
|
|
2169
|
+
constexpr bool IsSigned<hwy::uint128_t>() {
|
|
2170
|
+
return false;
|
|
2171
|
+
}
|
|
2172
|
+
template <>
|
|
2173
|
+
constexpr bool IsSigned<hwy::K64V64>() {
|
|
2174
|
+
return false;
|
|
2175
|
+
}
|
|
2176
|
+
template <>
|
|
2177
|
+
constexpr bool IsSigned<hwy::K32V32>() {
|
|
2178
|
+
return false;
|
|
2179
|
+
}
|
|
2180
|
+
|
|
2181
|
+
template <typename T, bool = IsInteger<T>() && !IsIntegerLaneType<T>()>
|
|
2182
|
+
struct MakeLaneTypeIfIntegerT {
|
|
2183
|
+
using type = T;
|
|
2184
|
+
};
|
|
2185
|
+
|
|
2186
|
+
template <typename T>
|
|
2187
|
+
struct MakeLaneTypeIfIntegerT<T, true> {
|
|
2188
|
+
using type = hwy::If<IsSigned<T>(), SignedFromSize<sizeof(T)>,
|
|
2189
|
+
UnsignedFromSize<sizeof(T)>>;
|
|
2190
|
+
};
|
|
2191
|
+
|
|
2192
|
+
template <typename T>
|
|
2193
|
+
using MakeLaneTypeIfInteger = typename MakeLaneTypeIfIntegerT<T>::type;
|
|
1141
2194
|
|
|
1142
2195
|
// Largest/smallest representable integer values.
|
|
1143
2196
|
template <typename T>
|
|
1144
2197
|
HWY_API constexpr T LimitsMax() {
|
|
1145
|
-
static_assert(
|
|
1146
|
-
using TU =
|
|
1147
|
-
return static_cast<T>(IsSigned<T>() ? (static_cast<TU>(~
|
|
1148
|
-
: static_cast<TU>(~
|
|
2198
|
+
static_assert(IsInteger<T>(), "Only for integer types");
|
|
2199
|
+
using TU = UnsignedFromSize<sizeof(T)>;
|
|
2200
|
+
return static_cast<T>(IsSigned<T>() ? (static_cast<TU>(~TU(0)) >> 1)
|
|
2201
|
+
: static_cast<TU>(~TU(0)));
|
|
1149
2202
|
}
|
|
1150
2203
|
template <typename T>
|
|
1151
2204
|
HWY_API constexpr T LimitsMin() {
|
|
1152
|
-
static_assert(
|
|
1153
|
-
return IsSigned<T>() ? T(-1) - LimitsMax<T>()
|
|
2205
|
+
static_assert(IsInteger<T>(), "Only for integer types");
|
|
2206
|
+
return IsSigned<T>() ? static_cast<T>(-1) - LimitsMax<T>()
|
|
2207
|
+
: static_cast<T>(0);
|
|
1154
2208
|
}
|
|
1155
2209
|
|
|
1156
2210
|
// Largest/smallest representable value (integer or float). This naming avoids
|
|
1157
2211
|
// confusion with numeric_limits<float>::min() (the smallest positive value).
|
|
1158
2212
|
// Cannot be constexpr because we use CopySameSize for [b]float16_t.
|
|
1159
2213
|
template <typename T>
|
|
1160
|
-
HWY_API T LowestValue() {
|
|
2214
|
+
HWY_API HWY_BITCASTSCALAR_CONSTEXPR T LowestValue() {
|
|
1161
2215
|
return LimitsMin<T>();
|
|
1162
2216
|
}
|
|
1163
2217
|
template <>
|
|
1164
|
-
HWY_INLINE bfloat16_t LowestValue<bfloat16_t>() {
|
|
1165
|
-
|
|
1166
|
-
bfloat16_t ret;
|
|
1167
|
-
CopySameSize(&kBits, &ret);
|
|
1168
|
-
return ret;
|
|
2218
|
+
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t LowestValue<bfloat16_t>() {
|
|
2219
|
+
return bfloat16_t::FromBits(uint16_t{0xFF7Fu}); // -1.1111111 x 2^127
|
|
1169
2220
|
}
|
|
1170
2221
|
template <>
|
|
1171
|
-
HWY_INLINE float16_t LowestValue<float16_t>() {
|
|
1172
|
-
|
|
1173
|
-
float16_t ret;
|
|
1174
|
-
CopySameSize(&kBits, &ret);
|
|
1175
|
-
return ret;
|
|
2222
|
+
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t LowestValue<float16_t>() {
|
|
2223
|
+
return float16_t::FromBits(uint16_t{0xFBFFu}); // -1.1111111111 x 2^15
|
|
1176
2224
|
}
|
|
1177
2225
|
template <>
|
|
1178
|
-
HWY_INLINE float LowestValue<float>() {
|
|
2226
|
+
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float LowestValue<float>() {
|
|
1179
2227
|
return -3.402823466e+38F;
|
|
1180
2228
|
}
|
|
1181
2229
|
template <>
|
|
1182
|
-
HWY_INLINE double LowestValue<double>() {
|
|
2230
|
+
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double LowestValue<double>() {
|
|
1183
2231
|
return -1.7976931348623158e+308;
|
|
1184
2232
|
}
|
|
1185
2233
|
|
|
1186
2234
|
template <typename T>
|
|
1187
|
-
HWY_API T HighestValue() {
|
|
2235
|
+
HWY_API HWY_BITCASTSCALAR_CONSTEXPR T HighestValue() {
|
|
1188
2236
|
return LimitsMax<T>();
|
|
1189
2237
|
}
|
|
1190
2238
|
template <>
|
|
1191
|
-
HWY_INLINE bfloat16_t HighestValue<bfloat16_t>() {
|
|
1192
|
-
|
|
1193
|
-
bfloat16_t ret;
|
|
1194
|
-
CopySameSize(&kBits, &ret);
|
|
1195
|
-
return ret;
|
|
2239
|
+
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t HighestValue<bfloat16_t>() {
|
|
2240
|
+
return bfloat16_t::FromBits(uint16_t{0x7F7Fu}); // 1.1111111 x 2^127
|
|
1196
2241
|
}
|
|
1197
2242
|
template <>
|
|
1198
|
-
HWY_INLINE float16_t HighestValue<float16_t>() {
|
|
1199
|
-
|
|
1200
|
-
float16_t ret;
|
|
1201
|
-
CopySameSize(&kBits, &ret);
|
|
1202
|
-
return ret;
|
|
2243
|
+
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t HighestValue<float16_t>() {
|
|
2244
|
+
return float16_t::FromBits(uint16_t{0x7BFFu}); // 1.1111111111 x 2^15
|
|
1203
2245
|
}
|
|
1204
2246
|
template <>
|
|
1205
|
-
HWY_INLINE float HighestValue<float>() {
|
|
2247
|
+
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float HighestValue<float>() {
|
|
1206
2248
|
return 3.402823466e+38F;
|
|
1207
2249
|
}
|
|
1208
2250
|
template <>
|
|
1209
|
-
HWY_INLINE double HighestValue<double>() {
|
|
2251
|
+
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double HighestValue<double>() {
|
|
1210
2252
|
return 1.7976931348623158e+308;
|
|
1211
2253
|
}
|
|
1212
2254
|
|
|
1213
2255
|
// Difference between 1.0 and the next representable value. Equal to
|
|
1214
2256
|
// 1 / (1ULL << MantissaBits<T>()), but hard-coding ensures precision.
|
|
1215
2257
|
template <typename T>
|
|
1216
|
-
HWY_API T Epsilon() {
|
|
2258
|
+
HWY_API HWY_BITCASTSCALAR_CONSTEXPR T Epsilon() {
|
|
1217
2259
|
return 1;
|
|
1218
2260
|
}
|
|
1219
2261
|
template <>
|
|
1220
|
-
HWY_INLINE bfloat16_t Epsilon<bfloat16_t>() {
|
|
1221
|
-
|
|
1222
|
-
bfloat16_t ret;
|
|
1223
|
-
CopySameSize(&kBits, &ret);
|
|
1224
|
-
return ret;
|
|
2262
|
+
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t Epsilon<bfloat16_t>() {
|
|
2263
|
+
return bfloat16_t::FromBits(uint16_t{0x3C00u}); // 0.0078125
|
|
1225
2264
|
}
|
|
1226
2265
|
template <>
|
|
1227
|
-
HWY_INLINE float16_t Epsilon<float16_t>() {
|
|
1228
|
-
|
|
1229
|
-
float16_t ret;
|
|
1230
|
-
CopySameSize(&kBits, &ret);
|
|
1231
|
-
return ret;
|
|
2266
|
+
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t Epsilon<float16_t>() {
|
|
2267
|
+
return float16_t::FromBits(uint16_t{0x1400u}); // 0.0009765625
|
|
1232
2268
|
}
|
|
1233
2269
|
template <>
|
|
1234
|
-
HWY_INLINE float Epsilon<float>() {
|
|
2270
|
+
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float Epsilon<float>() {
|
|
1235
2271
|
return 1.192092896e-7f;
|
|
1236
2272
|
}
|
|
1237
2273
|
template <>
|
|
1238
|
-
HWY_INLINE double Epsilon<double>() {
|
|
2274
|
+
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double Epsilon<double>() {
|
|
1239
2275
|
return 2.2204460492503131e-16;
|
|
1240
2276
|
}
|
|
1241
2277
|
|
|
@@ -1278,7 +2314,8 @@ constexpr MakeUnsigned<T> SignMask() {
|
|
|
1278
2314
|
// Returns bitmask of the exponent field in IEEE binary16/32/64.
|
|
1279
2315
|
template <typename T>
|
|
1280
2316
|
constexpr MakeUnsigned<T> ExponentMask() {
|
|
1281
|
-
return (~(MakeUnsigned<T>{1} << MantissaBits<T>()) + 1) &
|
|
2317
|
+
return (~(MakeUnsigned<T>{1} << MantissaBits<T>()) + 1) &
|
|
2318
|
+
static_cast<MakeUnsigned<T>>(~SignMask<T>());
|
|
1282
2319
|
}
|
|
1283
2320
|
|
|
1284
2321
|
// Returns bitmask of the mantissa field in IEEE binary16/32/64.
|
|
@@ -1290,30 +2327,24 @@ constexpr MakeUnsigned<T> MantissaMask() {
|
|
|
1290
2327
|
// Returns 1 << mantissa_bits as a floating-point number. All integers whose
|
|
1291
2328
|
// absolute value are less than this can be represented exactly.
|
|
1292
2329
|
template <typename T>
|
|
1293
|
-
HWY_INLINE T MantissaEnd() {
|
|
2330
|
+
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T MantissaEnd() {
|
|
1294
2331
|
static_assert(sizeof(T) == 0, "Only instantiate the specializations");
|
|
1295
2332
|
return 0;
|
|
1296
2333
|
}
|
|
1297
2334
|
template <>
|
|
1298
|
-
HWY_INLINE bfloat16_t MantissaEnd<bfloat16_t>() {
|
|
1299
|
-
|
|
1300
|
-
bfloat16_t ret;
|
|
1301
|
-
CopySameSize(&kBits, &ret);
|
|
1302
|
-
return ret;
|
|
2335
|
+
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t MantissaEnd<bfloat16_t>() {
|
|
2336
|
+
return bfloat16_t::FromBits(uint16_t{0x4300u}); // 1.0 x 2^7
|
|
1303
2337
|
}
|
|
1304
2338
|
template <>
|
|
1305
|
-
HWY_INLINE float16_t MantissaEnd<float16_t>() {
|
|
1306
|
-
|
|
1307
|
-
float16_t ret;
|
|
1308
|
-
CopySameSize(&kBits, &ret);
|
|
1309
|
-
return ret;
|
|
2339
|
+
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t MantissaEnd<float16_t>() {
|
|
2340
|
+
return float16_t::FromBits(uint16_t{0x6400u}); // 1.0 x 2^10
|
|
1310
2341
|
}
|
|
1311
2342
|
template <>
|
|
1312
|
-
HWY_INLINE float MantissaEnd<float>() {
|
|
2343
|
+
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float MantissaEnd<float>() {
|
|
1313
2344
|
return 8388608.0f; // 1 << 23
|
|
1314
2345
|
}
|
|
1315
2346
|
template <>
|
|
1316
|
-
HWY_INLINE double MantissaEnd<double>() {
|
|
2347
|
+
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double MantissaEnd<double>() {
|
|
1317
2348
|
// floating point literal with p52 requires C++17.
|
|
1318
2349
|
return 4503599627370496.0; // 1 << 52
|
|
1319
2350
|
}
|
|
@@ -1333,6 +2364,143 @@ constexpr MakeSigned<T> MaxExponentField() {
|
|
|
1333
2364
|
return (MakeSigned<T>{1} << ExponentBits<T>()) - 1;
|
|
1334
2365
|
}
|
|
1335
2366
|
|
|
2367
|
+
//------------------------------------------------------------------------------
|
|
2368
|
+
// Additional F16/BF16 operators
|
|
2369
|
+
|
|
2370
|
+
#if HWY_HAVE_SCALAR_F16_OPERATORS || HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
2371
|
+
|
|
2372
|
+
#define HWY_RHS_SPECIAL_FLOAT_ARITH_OP(op, op_func, T2) \
|
|
2373
|
+
template < \
|
|
2374
|
+
typename T1, \
|
|
2375
|
+
hwy::EnableIf<hwy::IsInteger<RemoveCvRef<T1>>() || \
|
|
2376
|
+
hwy::IsFloat3264<RemoveCvRef<T1>>()>* = nullptr, \
|
|
2377
|
+
typename RawResultT = decltype(DeclVal<T1>() op DeclVal<T2::Native>()), \
|
|
2378
|
+
typename ResultT = detail::NativeSpecialFloatToWrapper<RawResultT>, \
|
|
2379
|
+
HWY_IF_CASTABLE(RawResultT, ResultT)> \
|
|
2380
|
+
static HWY_INLINE constexpr ResultT op_func(T1 a, T2 b) noexcept { \
|
|
2381
|
+
return static_cast<ResultT>(a op b.native); \
|
|
2382
|
+
}
|
|
2383
|
+
|
|
2384
|
+
#define HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(op, op_func, T1) \
|
|
2385
|
+
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(op, op_func, T1) \
|
|
2386
|
+
template < \
|
|
2387
|
+
typename T2, \
|
|
2388
|
+
hwy::EnableIf<hwy::IsInteger<RemoveCvRef<T2>>() || \
|
|
2389
|
+
hwy::IsFloat3264<RemoveCvRef<T2>>()>* = nullptr, \
|
|
2390
|
+
typename RawResultT = decltype(DeclVal<T1::Native>() op DeclVal<T2>()), \
|
|
2391
|
+
typename ResultT = detail::NativeSpecialFloatToWrapper<RawResultT>, \
|
|
2392
|
+
HWY_IF_CASTABLE(RawResultT, ResultT)> \
|
|
2393
|
+
static HWY_INLINE constexpr ResultT op_func(T1 a, T2 b) noexcept { \
|
|
2394
|
+
return static_cast<ResultT>(a.native op b); \
|
|
2395
|
+
}
|
|
2396
|
+
|
|
2397
|
+
#if HWY_HAVE_SCALAR_F16_OPERATORS
|
|
2398
|
+
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(+, operator+, float16_t)
|
|
2399
|
+
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(-, operator-, float16_t)
|
|
2400
|
+
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(*, operator*, float16_t)
|
|
2401
|
+
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(/, operator/, float16_t)
|
|
2402
|
+
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(==, operator==, float16_t)
|
|
2403
|
+
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(!=, operator!=, float16_t)
|
|
2404
|
+
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<, operator<, float16_t)
|
|
2405
|
+
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=, operator<=, float16_t)
|
|
2406
|
+
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>, operator>, float16_t)
|
|
2407
|
+
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>=, operator>=, float16_t)
|
|
2408
|
+
#if HWY_HAVE_CXX20_THREE_WAY_COMPARE
|
|
2409
|
+
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=>, operator<=>, float16_t)
|
|
2410
|
+
#endif
|
|
2411
|
+
#endif // HWY_HAVE_SCALAR_F16_OPERATORS
|
|
2412
|
+
|
|
2413
|
+
#if HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
2414
|
+
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(+, operator+, bfloat16_t)
|
|
2415
|
+
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(-, operator-, bfloat16_t)
|
|
2416
|
+
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(*, operator*, bfloat16_t)
|
|
2417
|
+
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(/, operator/, bfloat16_t)
|
|
2418
|
+
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(==, operator==, bfloat16_t)
|
|
2419
|
+
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(!=, operator!=, bfloat16_t)
|
|
2420
|
+
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<, operator<, bfloat16_t)
|
|
2421
|
+
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=, operator<=, bfloat16_t)
|
|
2422
|
+
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>, operator>, bfloat16_t)
|
|
2423
|
+
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>=, operator>=, bfloat16_t)
|
|
2424
|
+
#if HWY_HAVE_CXX20_THREE_WAY_COMPARE
|
|
2425
|
+
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=>, operator<=>, bfloat16_t)
|
|
2426
|
+
#endif
|
|
2427
|
+
#endif // HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
2428
|
+
|
|
2429
|
+
#undef HWY_RHS_SPECIAL_FLOAT_ARITH_OP
|
|
2430
|
+
#undef HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP
|
|
2431
|
+
|
|
2432
|
+
#endif // HWY_HAVE_SCALAR_F16_OPERATORS || HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
2433
|
+
|
|
2434
|
+
//------------------------------------------------------------------------------
|
|
2435
|
+
// Type conversions (after IsSpecialFloat)
|
|
2436
|
+
|
|
2437
|
+
HWY_API float F32FromF16Mem(const void* ptr) {
|
|
2438
|
+
float16_t f16;
|
|
2439
|
+
CopyBytes<2>(HWY_ASSUME_ALIGNED(ptr, 2), &f16);
|
|
2440
|
+
return F32FromF16(f16);
|
|
2441
|
+
}
|
|
2442
|
+
|
|
2443
|
+
HWY_API float F32FromBF16Mem(const void* ptr) {
|
|
2444
|
+
bfloat16_t bf;
|
|
2445
|
+
CopyBytes<2>(HWY_ASSUME_ALIGNED(ptr, 2), &bf);
|
|
2446
|
+
return F32FromBF16(bf);
|
|
2447
|
+
}
|
|
2448
|
+
|
|
2449
|
+
#if HWY_HAVE_SCALAR_F16_OPERATORS
|
|
2450
|
+
#define HWY_BF16_TO_F16_CONSTEXPR HWY_BF16_CONSTEXPR
|
|
2451
|
+
#else
|
|
2452
|
+
#define HWY_BF16_TO_F16_CONSTEXPR HWY_F16_CONSTEXPR
|
|
2453
|
+
#endif
|
|
2454
|
+
|
|
2455
|
+
// For casting from TFrom to TTo
|
|
2456
|
+
template <typename TTo, typename TFrom, HWY_IF_NOT_SPECIAL_FLOAT(TTo),
|
|
2457
|
+
HWY_IF_NOT_SPECIAL_FLOAT(TFrom), HWY_IF_NOT_SAME(TTo, TFrom)>
|
|
2458
|
+
HWY_API constexpr TTo ConvertScalarTo(const TFrom in) {
|
|
2459
|
+
return static_cast<TTo>(in);
|
|
2460
|
+
}
|
|
2461
|
+
template <typename TTo, typename TFrom, HWY_IF_F16(TTo),
|
|
2462
|
+
HWY_IF_NOT_SPECIAL_FLOAT(TFrom), HWY_IF_NOT_SAME(TFrom, double)>
|
|
2463
|
+
HWY_API constexpr TTo ConvertScalarTo(const TFrom in) {
|
|
2464
|
+
return F16FromF32(static_cast<float>(in));
|
|
2465
|
+
}
|
|
2466
|
+
template <typename TTo, HWY_IF_F16(TTo)>
|
|
2467
|
+
HWY_API HWY_BF16_TO_F16_CONSTEXPR TTo
|
|
2468
|
+
ConvertScalarTo(const hwy::bfloat16_t in) {
|
|
2469
|
+
return F16FromF32(F32FromBF16(in));
|
|
2470
|
+
}
|
|
2471
|
+
template <typename TTo, HWY_IF_F16(TTo)>
|
|
2472
|
+
HWY_API HWY_F16_CONSTEXPR TTo ConvertScalarTo(const double in) {
|
|
2473
|
+
return F16FromF64(in);
|
|
2474
|
+
}
|
|
2475
|
+
template <typename TTo, typename TFrom, HWY_IF_BF16(TTo),
|
|
2476
|
+
HWY_IF_NOT_SPECIAL_FLOAT(TFrom), HWY_IF_NOT_SAME(TFrom, double)>
|
|
2477
|
+
HWY_API HWY_BF16_CONSTEXPR TTo ConvertScalarTo(const TFrom in) {
|
|
2478
|
+
return BF16FromF32(static_cast<float>(in));
|
|
2479
|
+
}
|
|
2480
|
+
template <typename TTo, HWY_IF_BF16(TTo)>
|
|
2481
|
+
HWY_API HWY_BF16_TO_F16_CONSTEXPR TTo ConvertScalarTo(const hwy::float16_t in) {
|
|
2482
|
+
return BF16FromF32(F32FromF16(in));
|
|
2483
|
+
}
|
|
2484
|
+
template <typename TTo, HWY_IF_BF16(TTo)>
|
|
2485
|
+
HWY_API HWY_BF16_CONSTEXPR TTo ConvertScalarTo(const double in) {
|
|
2486
|
+
return BF16FromF64(in);
|
|
2487
|
+
}
|
|
2488
|
+
template <typename TTo, typename TFrom, HWY_IF_F16(TFrom),
|
|
2489
|
+
HWY_IF_NOT_SPECIAL_FLOAT(TTo)>
|
|
2490
|
+
HWY_API HWY_F16_CONSTEXPR TTo ConvertScalarTo(const TFrom in) {
|
|
2491
|
+
return static_cast<TTo>(F32FromF16(in));
|
|
2492
|
+
}
|
|
2493
|
+
template <typename TTo, typename TFrom, HWY_IF_BF16(TFrom),
|
|
2494
|
+
HWY_IF_NOT_SPECIAL_FLOAT(TTo)>
|
|
2495
|
+
HWY_API HWY_BF16_CONSTEXPR TTo ConvertScalarTo(TFrom in) {
|
|
2496
|
+
return static_cast<TTo>(F32FromBF16(in));
|
|
2497
|
+
}
|
|
2498
|
+
// Same: return unchanged
|
|
2499
|
+
template <typename TTo>
|
|
2500
|
+
HWY_API constexpr TTo ConvertScalarTo(TTo in) {
|
|
2501
|
+
return in;
|
|
2502
|
+
}
|
|
2503
|
+
|
|
1336
2504
|
//------------------------------------------------------------------------------
|
|
1337
2505
|
// Helper functions
|
|
1338
2506
|
|
|
@@ -1346,8 +2514,54 @@ constexpr inline size_t RoundUpTo(size_t what, size_t align) {
|
|
|
1346
2514
|
return DivCeil(what, align) * align;
|
|
1347
2515
|
}
|
|
1348
2516
|
|
|
2517
|
+
// Works for any `align`; if a power of two, compiler emits AND.
|
|
2518
|
+
constexpr inline size_t RoundDownTo(size_t what, size_t align) {
|
|
2519
|
+
return what - (what % align);
|
|
2520
|
+
}
|
|
2521
|
+
|
|
2522
|
+
namespace detail {
|
|
2523
|
+
|
|
2524
|
+
// T is unsigned or T is signed and (val >> shift_amt) is an arithmetic right
|
|
2525
|
+
// shift
|
|
2526
|
+
template <class T>
|
|
2527
|
+
static HWY_INLINE constexpr T ScalarShr(hwy::UnsignedTag /*type_tag*/, T val,
|
|
2528
|
+
int shift_amt) {
|
|
2529
|
+
return static_cast<T>(val >> shift_amt);
|
|
2530
|
+
}
|
|
2531
|
+
|
|
2532
|
+
// T is signed and (val >> shift_amt) is a non-arithmetic right shift
|
|
2533
|
+
template <class T>
|
|
2534
|
+
static HWY_INLINE constexpr T ScalarShr(hwy::SignedTag /*type_tag*/, T val,
|
|
2535
|
+
int shift_amt) {
|
|
2536
|
+
using TU = MakeUnsigned<MakeLaneTypeIfInteger<T>>;
|
|
2537
|
+
return static_cast<T>(
|
|
2538
|
+
(val < 0) ? static_cast<TU>(
|
|
2539
|
+
~(static_cast<TU>(~static_cast<TU>(val)) >> shift_amt))
|
|
2540
|
+
: static_cast<TU>(static_cast<TU>(val) >> shift_amt));
|
|
2541
|
+
}
|
|
2542
|
+
|
|
2543
|
+
} // namespace detail
|
|
2544
|
+
|
|
2545
|
+
// If T is an signed integer type, ScalarShr is guaranteed to perform an
|
|
2546
|
+
// arithmetic right shift
|
|
2547
|
+
|
|
2548
|
+
// Otherwise, if T is an unsigned integer type, ScalarShr is guaranteed to
|
|
2549
|
+
// perform a logical right shift
|
|
2550
|
+
template <class T, HWY_IF_INTEGER(RemoveCvRef<T>)>
|
|
2551
|
+
HWY_API constexpr RemoveCvRef<T> ScalarShr(T val, int shift_amt) {
|
|
2552
|
+
using NonCvRefT = RemoveCvRef<T>;
|
|
2553
|
+
return detail::ScalarShr(
|
|
2554
|
+
hwy::SizeTag<((IsSigned<NonCvRefT>() &&
|
|
2555
|
+
(LimitsMin<NonCvRefT>() >> (sizeof(T) * 8 - 1)) !=
|
|
2556
|
+
static_cast<NonCvRefT>(-1))
|
|
2557
|
+
? 0x100
|
|
2558
|
+
: 0)>(),
|
|
2559
|
+
static_cast<NonCvRefT>(val), shift_amt);
|
|
2560
|
+
}
|
|
2561
|
+
|
|
1349
2562
|
// Undefined results for x == 0.
|
|
1350
2563
|
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
|
|
2564
|
+
HWY_DASSERT(x != 0);
|
|
1351
2565
|
#if HWY_COMPILER_MSVC
|
|
1352
2566
|
unsigned long index; // NOLINT
|
|
1353
2567
|
_BitScanForward(&index, x);
|
|
@@ -1358,6 +2572,7 @@ HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
|
|
|
1358
2572
|
}
|
|
1359
2573
|
|
|
1360
2574
|
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) {
|
|
2575
|
+
HWY_DASSERT(x != 0);
|
|
1361
2576
|
#if HWY_COMPILER_MSVC
|
|
1362
2577
|
#if HWY_ARCH_X86_64
|
|
1363
2578
|
unsigned long index; // NOLINT
|
|
@@ -1383,6 +2598,7 @@ HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) {
|
|
|
1383
2598
|
|
|
1384
2599
|
// Undefined results for x == 0.
|
|
1385
2600
|
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x) {
|
|
2601
|
+
HWY_DASSERT(x != 0);
|
|
1386
2602
|
#if HWY_COMPILER_MSVC
|
|
1387
2603
|
unsigned long index; // NOLINT
|
|
1388
2604
|
_BitScanReverse(&index, x);
|
|
@@ -1393,6 +2609,7 @@ HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x) {
|
|
|
1393
2609
|
}
|
|
1394
2610
|
|
|
1395
2611
|
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
|
|
2612
|
+
HWY_DASSERT(x != 0);
|
|
1396
2613
|
#if HWY_COMPILER_MSVC
|
|
1397
2614
|
#if HWY_ARCH_X86_64
|
|
1398
2615
|
unsigned long index; // NOLINT
|
|
@@ -1416,26 +2633,48 @@ HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
|
|
|
1416
2633
|
#endif // HWY_COMPILER_MSVC
|
|
1417
2634
|
}
|
|
1418
2635
|
|
|
1419
|
-
|
|
1420
|
-
|
|
1421
|
-
|
|
1422
|
-
|
|
1423
|
-
|
|
1424
|
-
|
|
1425
|
-
|
|
2636
|
+
template <class T, HWY_IF_INTEGER(RemoveCvRef<T>),
|
|
2637
|
+
HWY_IF_T_SIZE_ONE_OF(RemoveCvRef<T>, (1 << 1) | (1 << 2) | (1 << 4))>
|
|
2638
|
+
HWY_API size_t PopCount(T x) {
|
|
2639
|
+
uint32_t u32_x = static_cast<uint32_t>(
|
|
2640
|
+
static_cast<UnsignedFromSize<sizeof(RemoveCvRef<T>)>>(x));
|
|
2641
|
+
|
|
2642
|
+
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
|
|
2643
|
+
return static_cast<size_t>(__builtin_popcountl(u32_x));
|
|
2644
|
+
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
|
|
2645
|
+
return static_cast<size_t>(_mm_popcnt_u32(u32_x));
|
|
2646
|
+
#else
|
|
2647
|
+
u32_x -= ((u32_x >> 1) & 0x55555555u);
|
|
2648
|
+
u32_x = (((u32_x >> 2) & 0x33333333u) + (u32_x & 0x33333333u));
|
|
2649
|
+
u32_x = (((u32_x >> 4) + u32_x) & 0x0F0F0F0Fu);
|
|
2650
|
+
u32_x += (u32_x >> 8);
|
|
2651
|
+
u32_x += (u32_x >> 16);
|
|
2652
|
+
return static_cast<size_t>(u32_x & 0x3Fu);
|
|
2653
|
+
#endif
|
|
2654
|
+
}
|
|
2655
|
+
|
|
2656
|
+
template <class T, HWY_IF_INTEGER(RemoveCvRef<T>),
|
|
2657
|
+
HWY_IF_T_SIZE(RemoveCvRef<T>, 8)>
|
|
2658
|
+
HWY_API size_t PopCount(T x) {
|
|
2659
|
+
uint64_t u64_x = static_cast<uint64_t>(
|
|
2660
|
+
static_cast<UnsignedFromSize<sizeof(RemoveCvRef<T>)>>(x));
|
|
2661
|
+
|
|
2662
|
+
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
|
|
2663
|
+
return static_cast<size_t>(__builtin_popcountll(u64_x));
|
|
1426
2664
|
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 && defined(__AVX__)
|
|
1427
|
-
return _mm_popcnt_u64(
|
|
2665
|
+
return _mm_popcnt_u64(u64_x);
|
|
1428
2666
|
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
|
|
1429
|
-
return _mm_popcnt_u32(static_cast<uint32_t>(
|
|
1430
|
-
_mm_popcnt_u32(static_cast<uint32_t>(
|
|
2667
|
+
return _mm_popcnt_u32(static_cast<uint32_t>(u64_x & 0xFFFFFFFFu)) +
|
|
2668
|
+
_mm_popcnt_u32(static_cast<uint32_t>(u64_x >> 32));
|
|
1431
2669
|
#else
|
|
1432
|
-
|
|
1433
|
-
|
|
1434
|
-
|
|
1435
|
-
|
|
1436
|
-
|
|
1437
|
-
|
|
1438
|
-
|
|
2670
|
+
u64_x -= ((u64_x >> 1) & 0x5555555555555555ULL);
|
|
2671
|
+
u64_x = (((u64_x >> 2) & 0x3333333333333333ULL) +
|
|
2672
|
+
(u64_x & 0x3333333333333333ULL));
|
|
2673
|
+
u64_x = (((u64_x >> 4) + u64_x) & 0x0F0F0F0F0F0F0F0FULL);
|
|
2674
|
+
u64_x += (u64_x >> 8);
|
|
2675
|
+
u64_x += (u64_x >> 16);
|
|
2676
|
+
u64_x += (u64_x >> 32);
|
|
2677
|
+
return static_cast<size_t>(u64_x & 0x7Fu);
|
|
1439
2678
|
#endif
|
|
1440
2679
|
}
|
|
1441
2680
|
|
|
@@ -1456,21 +2695,32 @@ template <typename TI>
|
|
|
1456
2695
|
: static_cast<size_t>(FloorLog2(static_cast<TI>(x - 1)) + 1);
|
|
1457
2696
|
}
|
|
1458
2697
|
|
|
1459
|
-
template <typename T>
|
|
1460
|
-
HWY_INLINE constexpr T AddWithWraparound(
|
|
1461
|
-
return t + static_cast<T>(
|
|
2698
|
+
template <typename T, typename T2, HWY_IF_FLOAT(T), HWY_IF_NOT_SPECIAL_FLOAT(T)>
|
|
2699
|
+
HWY_INLINE constexpr T AddWithWraparound(T t, T2 increment) {
|
|
2700
|
+
return t + static_cast<T>(increment);
|
|
1462
2701
|
}
|
|
1463
2702
|
|
|
1464
|
-
template <typename T>
|
|
1465
|
-
HWY_INLINE constexpr T AddWithWraparound(
|
|
1466
|
-
|
|
2703
|
+
template <typename T, typename T2, HWY_IF_SPECIAL_FLOAT(T)>
|
|
2704
|
+
HWY_INLINE constexpr T AddWithWraparound(T t, T2 increment) {
|
|
2705
|
+
return ConvertScalarTo<T>(ConvertScalarTo<float>(t) +
|
|
2706
|
+
ConvertScalarTo<float>(increment));
|
|
2707
|
+
}
|
|
2708
|
+
|
|
2709
|
+
template <typename T, typename T2, HWY_IF_NOT_FLOAT(T)>
|
|
2710
|
+
HWY_INLINE constexpr T AddWithWraparound(T t, T2 n) {
|
|
1467
2711
|
using TU = MakeUnsigned<T>;
|
|
1468
|
-
|
|
1469
|
-
|
|
1470
|
-
|
|
2712
|
+
// Sub-int types would promote to int, not unsigned, which would trigger
|
|
2713
|
+
// warnings, so first promote to the largest unsigned type. Due to
|
|
2714
|
+
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87519, which affected GCC 8
|
|
2715
|
+
// until fixed in 9.3, we use built-in types rather than uint64_t.
|
|
2716
|
+
return static_cast<T>(static_cast<TU>(
|
|
2717
|
+
static_cast<unsigned long long>(static_cast<unsigned long long>(t) +
|
|
2718
|
+
static_cast<unsigned long long>(n)) &
|
|
2719
|
+
uint64_t{hwy::LimitsMax<TU>()}));
|
|
1471
2720
|
}
|
|
1472
2721
|
|
|
1473
2722
|
#if HWY_COMPILER_MSVC && HWY_ARCH_X86_64
|
|
2723
|
+
#pragma intrinsic(_mul128)
|
|
1474
2724
|
#pragma intrinsic(_umul128)
|
|
1475
2725
|
#endif
|
|
1476
2726
|
|
|
@@ -1494,7 +2744,179 @@ HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) {
|
|
|
1494
2744
|
#endif
|
|
1495
2745
|
}
|
|
1496
2746
|
|
|
2747
|
+
HWY_API int64_t Mul128(int64_t a, int64_t b, int64_t* HWY_RESTRICT upper) {
|
|
2748
|
+
#if defined(__SIZEOF_INT128__)
|
|
2749
|
+
__int128_t product = (__int128_t)a * (__int128_t)b;
|
|
2750
|
+
*upper = (int64_t)(product >> 64);
|
|
2751
|
+
return (int64_t)(product & 0xFFFFFFFFFFFFFFFFULL);
|
|
2752
|
+
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
|
|
2753
|
+
return _mul128(a, b, upper);
|
|
2754
|
+
#else
|
|
2755
|
+
uint64_t unsigned_upper;
|
|
2756
|
+
const int64_t lower = static_cast<int64_t>(Mul128(
|
|
2757
|
+
static_cast<uint64_t>(a), static_cast<uint64_t>(b), &unsigned_upper));
|
|
2758
|
+
*upper = static_cast<int64_t>(
|
|
2759
|
+
unsigned_upper -
|
|
2760
|
+
(static_cast<uint64_t>(ScalarShr(a, 63)) & static_cast<uint64_t>(b)) -
|
|
2761
|
+
(static_cast<uint64_t>(ScalarShr(b, 63)) & static_cast<uint64_t>(a)));
|
|
2762
|
+
return lower;
|
|
2763
|
+
#endif
|
|
2764
|
+
}
|
|
2765
|
+
|
|
2766
|
+
// Precomputation for fast n / divisor and n % divisor, where n is a variable
|
|
2767
|
+
// and divisor is unchanging but unknown at compile-time.
|
|
2768
|
+
class Divisor {
|
|
2769
|
+
public:
|
|
2770
|
+
explicit Divisor(uint32_t divisor) : divisor_(divisor) {
|
|
2771
|
+
if (divisor <= 1) return;
|
|
2772
|
+
|
|
2773
|
+
const uint32_t len =
|
|
2774
|
+
static_cast<uint32_t>(31 - Num0BitsAboveMS1Bit_Nonzero32(divisor - 1));
|
|
2775
|
+
const uint64_t u_hi = (2ULL << len) - divisor;
|
|
2776
|
+
const uint32_t q = Truncate((u_hi << 32) / divisor);
|
|
2777
|
+
|
|
2778
|
+
mul_ = q + 1;
|
|
2779
|
+
shift1_ = 1;
|
|
2780
|
+
shift2_ = len;
|
|
2781
|
+
}
|
|
2782
|
+
|
|
2783
|
+
uint32_t GetDivisor() const { return divisor_; }
|
|
2784
|
+
|
|
2785
|
+
// Returns n / divisor_.
|
|
2786
|
+
uint32_t Divide(uint32_t n) const {
|
|
2787
|
+
const uint64_t mul = mul_;
|
|
2788
|
+
const uint32_t t = Truncate((mul * n) >> 32);
|
|
2789
|
+
return (t + ((n - t) >> shift1_)) >> shift2_;
|
|
2790
|
+
}
|
|
2791
|
+
|
|
2792
|
+
// Returns n % divisor_.
|
|
2793
|
+
uint32_t Remainder(uint32_t n) const { return n - (Divide(n) * divisor_); }
|
|
2794
|
+
|
|
2795
|
+
private:
|
|
2796
|
+
static uint32_t Truncate(uint64_t x) {
|
|
2797
|
+
return static_cast<uint32_t>(x & 0xFFFFFFFFu);
|
|
2798
|
+
}
|
|
2799
|
+
|
|
2800
|
+
uint32_t divisor_;
|
|
2801
|
+
uint32_t mul_ = 1;
|
|
2802
|
+
uint32_t shift1_ = 0;
|
|
2803
|
+
uint32_t shift2_ = 0;
|
|
2804
|
+
};
|
|
2805
|
+
|
|
2806
|
+
namespace detail {
|
|
2807
|
+
|
|
2808
|
+
template <typename T>
|
|
2809
|
+
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T ScalarAbs(hwy::FloatTag /*tag*/,
|
|
2810
|
+
T val) {
|
|
2811
|
+
using TU = MakeUnsigned<T>;
|
|
2812
|
+
return BitCastScalar<T>(
|
|
2813
|
+
static_cast<TU>(BitCastScalar<TU>(val) & (~SignMask<T>())));
|
|
2814
|
+
}
|
|
2815
|
+
|
|
2816
|
+
template <typename T>
|
|
2817
|
+
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T
|
|
2818
|
+
ScalarAbs(hwy::SpecialTag /*tag*/, T val) {
|
|
2819
|
+
return ScalarAbs(hwy::FloatTag(), val);
|
|
2820
|
+
}
|
|
2821
|
+
|
|
2822
|
+
template <typename T>
|
|
2823
|
+
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T
|
|
2824
|
+
ScalarAbs(hwy::SignedTag /*tag*/, T val) {
|
|
2825
|
+
using TU = MakeUnsigned<T>;
|
|
2826
|
+
return (val < T{0}) ? static_cast<T>(TU{0} - static_cast<TU>(val)) : val;
|
|
2827
|
+
}
|
|
2828
|
+
|
|
2829
|
+
template <typename T>
|
|
2830
|
+
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T
|
|
2831
|
+
ScalarAbs(hwy::UnsignedTag /*tag*/, T val) {
|
|
2832
|
+
return val;
|
|
2833
|
+
}
|
|
2834
|
+
|
|
2835
|
+
} // namespace detail
|
|
2836
|
+
|
|
2837
|
+
template <typename T>
|
|
2838
|
+
HWY_API HWY_BITCASTSCALAR_CONSTEXPR RemoveCvRef<T> ScalarAbs(T val) {
|
|
2839
|
+
using TVal = MakeLaneTypeIfInteger<
|
|
2840
|
+
detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>>;
|
|
2841
|
+
return detail::ScalarAbs(hwy::TypeTag<TVal>(), static_cast<TVal>(val));
|
|
2842
|
+
}
|
|
2843
|
+
|
|
2844
|
+
template <typename T>
|
|
2845
|
+
HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsNaN(T val) {
|
|
2846
|
+
using TF = detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>;
|
|
2847
|
+
using TU = MakeUnsigned<TF>;
|
|
2848
|
+
return (BitCastScalar<TU>(ScalarAbs(val)) > ExponentMask<TF>());
|
|
2849
|
+
}
|
|
2850
|
+
|
|
2851
|
+
template <typename T>
|
|
2852
|
+
HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsInf(T val) {
|
|
2853
|
+
using TF = detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>;
|
|
2854
|
+
using TU = MakeUnsigned<TF>;
|
|
2855
|
+
return static_cast<TU>(BitCastScalar<TU>(static_cast<TF>(val)) << 1) ==
|
|
2856
|
+
static_cast<TU>(MaxExponentTimes2<TF>());
|
|
2857
|
+
}
|
|
2858
|
+
|
|
2859
|
+
namespace detail {
|
|
2860
|
+
|
|
2861
|
+
template <typename T>
|
|
2862
|
+
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsFinite(
|
|
2863
|
+
hwy::FloatTag /*tag*/, T val) {
|
|
2864
|
+
using TU = MakeUnsigned<T>;
|
|
2865
|
+
return (BitCastScalar<TU>(hwy::ScalarAbs(val)) < ExponentMask<T>());
|
|
2866
|
+
}
|
|
2867
|
+
|
|
2868
|
+
template <typename T>
|
|
2869
|
+
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsFinite(
|
|
2870
|
+
hwy::NonFloatTag /*tag*/, T /*val*/) {
|
|
2871
|
+
// Integer values are always finite
|
|
2872
|
+
return true;
|
|
2873
|
+
}
|
|
2874
|
+
|
|
2875
|
+
} // namespace detail
|
|
2876
|
+
|
|
2877
|
+
template <typename T>
|
|
2878
|
+
HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsFinite(T val) {
|
|
2879
|
+
using TVal = MakeLaneTypeIfInteger<
|
|
2880
|
+
detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>>;
|
|
2881
|
+
return detail::ScalarIsFinite(hwy::IsFloatTag<TVal>(),
|
|
2882
|
+
static_cast<TVal>(val));
|
|
2883
|
+
}
|
|
2884
|
+
|
|
2885
|
+
template <typename T>
|
|
2886
|
+
HWY_API HWY_BITCASTSCALAR_CONSTEXPR RemoveCvRef<T> ScalarCopySign(T magn,
|
|
2887
|
+
T sign) {
|
|
2888
|
+
using TF = RemoveCvRef<detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>>;
|
|
2889
|
+
using TU = MakeUnsigned<TF>;
|
|
2890
|
+
return BitCastScalar<TF>(static_cast<TU>(
|
|
2891
|
+
(BitCastScalar<TU>(static_cast<TF>(magn)) & (~SignMask<TF>())) |
|
|
2892
|
+
(BitCastScalar<TU>(static_cast<TF>(sign)) & SignMask<TF>())));
|
|
2893
|
+
}
|
|
2894
|
+
|
|
2895
|
+
template <typename T>
|
|
2896
|
+
HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarSignBit(T val) {
|
|
2897
|
+
using TVal = MakeLaneTypeIfInteger<
|
|
2898
|
+
detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>>;
|
|
2899
|
+
using TU = MakeUnsigned<TVal>;
|
|
2900
|
+
return ((BitCastScalar<TU>(static_cast<TVal>(val)) & SignMask<TVal>()) != 0);
|
|
2901
|
+
}
|
|
2902
|
+
|
|
1497
2903
|
// Prevents the compiler from eliding the computations that led to "output".
|
|
2904
|
+
#if HWY_ARCH_PPC && (HWY_COMPILER_GCC || HWY_COMPILER_CLANG) && \
|
|
2905
|
+
!defined(_SOFT_FLOAT)
|
|
2906
|
+
// Workaround to avoid test failures on PPC if compiled with Clang
|
|
2907
|
+
// f32: the "+f" constraint keeps the value live in a floating-point
// register, so the computation producing it cannot be elided.
template <class T, HWY_IF_F32(T)>
HWY_API void PreventElision(T&& output) {
  asm volatile("" : "+f"(output)::"memory");
}
// f64: "+d" selects a double-precision FP register class on PPC.
template <class T, HWY_IF_F64(T)>
HWY_API void PreventElision(T&& output) {
  asm volatile("" : "+d"(output)::"memory");
}
// All other (non-f32/f64) types: "+r" keeps the value in a general-purpose
// register; the "memory" clobber prevents reordering/elision around the asm.
template <class T, HWY_IF_NOT_FLOAT3264(T)>
HWY_API void PreventElision(T&& output) {
  asm volatile("" : "+r"(output)::"memory");
}
|
|
2919
|
+
#else
|
|
1498
2920
|
template <class T>
|
|
1499
2921
|
HWY_API void PreventElision(T&& output) {
|
|
1500
2922
|
#if HWY_COMPILER_MSVC
|
|
@@ -1502,8 +2924,8 @@ HWY_API void PreventElision(T&& output) {
|
|
|
1502
2924
|
// RTL constraints). Self-assignment with #pragma optimize("off") might be
|
|
1503
2925
|
// expected to prevent elision, but it does not with MSVC 2015. Type-punning
|
|
1504
2926
|
// with volatile pointers generates inefficient code on MSVC 2017.
|
|
1505
|
-
static std::atomic<
|
|
1506
|
-
|
|
2927
|
+
static std::atomic<RemoveCvRef<T>> sink;
|
|
2928
|
+
sink.store(output, std::memory_order_relaxed);
|
|
1507
2929
|
#else
|
|
1508
2930
|
// Works by indicating to the compiler that "output" is being read and
|
|
1509
2931
|
// modified. The +r constraint avoids unnecessary writes to memory, but only
|
|
@@ -1511,6 +2933,7 @@ HWY_API void PreventElision(T&& output) {
|
|
|
1511
2933
|
asm volatile("" : "+r"(output) : : "memory");
|
|
1512
2934
|
#endif
|
|
1513
2935
|
}
|
|
2936
|
+
#endif
|
|
1514
2937
|
|
|
1515
2938
|
} // namespace hwy
|
|
1516
2939
|
|