@img/sharp-libvips-dev 1.2.0 → 1.2.2-rc.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/include/ffi.h +3 -3
- package/include/harfbuzz/hb-deprecated.h +4 -4
- package/include/harfbuzz/hb-font.h +120 -9
- package/include/harfbuzz/hb-version.h +3 -3
- package/include/hwy/abort.h +2 -19
- package/include/hwy/aligned_allocator.h +11 -7
- package/include/hwy/auto_tune.h +504 -0
- package/include/hwy/base.h +425 -104
- package/include/hwy/cache_control.h +16 -0
- package/include/hwy/detect_compiler_arch.h +32 -1
- package/include/hwy/detect_targets.h +251 -67
- package/include/hwy/foreach_target.h +35 -0
- package/include/hwy/highway.h +185 -76
- package/include/hwy/nanobenchmark.h +1 -19
- package/include/hwy/ops/arm_neon-inl.h +969 -458
- package/include/hwy/ops/arm_sve-inl.h +1137 -359
- package/include/hwy/ops/emu128-inl.h +97 -11
- package/include/hwy/ops/generic_ops-inl.h +1222 -34
- package/include/hwy/ops/loongarch_lasx-inl.h +4664 -0
- package/include/hwy/ops/loongarch_lsx-inl.h +5933 -0
- package/include/hwy/ops/ppc_vsx-inl.h +306 -126
- package/include/hwy/ops/rvv-inl.h +546 -51
- package/include/hwy/ops/scalar-inl.h +77 -22
- package/include/hwy/ops/set_macros-inl.h +138 -17
- package/include/hwy/ops/shared-inl.h +50 -10
- package/include/hwy/ops/wasm_128-inl.h +137 -92
- package/include/hwy/ops/x86_128-inl.h +773 -214
- package/include/hwy/ops/x86_256-inl.h +712 -255
- package/include/hwy/ops/x86_512-inl.h +429 -753
- package/include/hwy/ops/x86_avx3-inl.h +501 -0
- package/include/hwy/per_target.h +2 -1
- package/include/hwy/profiler.h +622 -486
- package/include/hwy/targets.h +62 -20
- package/include/hwy/timer-inl.h +8 -160
- package/include/hwy/timer.h +170 -3
- package/include/hwy/x86_cpuid.h +81 -0
- package/include/libheif/heif_cxx.h +25 -5
- package/include/libheif/heif_regions.h +5 -5
- package/include/libheif/heif_version.h +2 -2
- package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
- package/include/libxml2/libxml/xmlversion.h +4 -4
- package/include/pango-1.0/pango/pango-enum-types.h +3 -0
- package/include/pango-1.0/pango/pango-features.h +3 -3
- package/include/pango-1.0/pango/pango-font.h +30 -0
- package/include/pango-1.0/pango/pango-version-macros.h +26 -0
- package/include/pixman-1/pixman-version.h +2 -2
- package/include/webp/decode.h +11 -2
- package/include/webp/demux.h +2 -0
- package/include/webp/encode.h +2 -0
- package/include/webp/mux_types.h +1 -0
- package/include/webp/sharpyuv/sharpyuv.h +1 -1
- package/include/webp/types.h +2 -2
- package/include/zlib.h +3 -3
- package/package.json +1 -1
- package/versions.json +11 -11
package/include/hwy/base.h
CHANGED
|
@@ -21,13 +21,21 @@
|
|
|
21
21
|
// IWYU pragma: begin_exports
|
|
22
22
|
#include <stddef.h>
|
|
23
23
|
#include <stdint.h>
|
|
24
|
+
#if defined(HWY_HEADER_ONLY)
|
|
25
|
+
#include <cstdarg>
|
|
26
|
+
#include <cstdio>
|
|
27
|
+
#endif
|
|
28
|
+
|
|
29
|
+
#if !defined(HWY_NO_LIBCXX)
|
|
30
|
+
#include <ostream>
|
|
31
|
+
#endif
|
|
24
32
|
|
|
25
33
|
#include "hwy/detect_compiler_arch.h"
|
|
26
34
|
#include "hwy/highway_export.h"
|
|
27
35
|
|
|
28
36
|
// API version (https://semver.org/); keep in sync with CMakeLists.txt.
|
|
29
37
|
#define HWY_MAJOR 1
|
|
30
|
-
#define HWY_MINOR
|
|
38
|
+
#define HWY_MINOR 3
|
|
31
39
|
#define HWY_PATCH 0
|
|
32
40
|
|
|
33
41
|
// True if the Highway version >= major.minor.0. Added in 1.2.0.
|
|
@@ -47,12 +55,12 @@
|
|
|
47
55
|
#include <inttypes.h>
|
|
48
56
|
#endif
|
|
49
57
|
|
|
58
|
+
#endif // !HWY_IDE
|
|
59
|
+
|
|
50
60
|
#if (HWY_ARCH_X86 && !defined(HWY_NO_LIBCXX)) || HWY_COMPILER_MSVC
|
|
51
61
|
#include <atomic>
|
|
52
62
|
#endif
|
|
53
63
|
|
|
54
|
-
#endif // !HWY_IDE
|
|
55
|
-
|
|
56
64
|
#ifndef HWY_HAVE_COMPARE_HEADER // allow override
|
|
57
65
|
#define HWY_HAVE_COMPARE_HEADER 0
|
|
58
66
|
#if defined(__has_include) // note: wrapper macro fails on Clang ~17
|
|
@@ -97,6 +105,7 @@
|
|
|
97
105
|
#define HWY_NORETURN __declspec(noreturn)
|
|
98
106
|
#define HWY_LIKELY(expr) (expr)
|
|
99
107
|
#define HWY_UNLIKELY(expr) (expr)
|
|
108
|
+
#define HWY_UNREACHABLE __assume(false)
|
|
100
109
|
#define HWY_PRAGMA(tokens) __pragma(tokens)
|
|
101
110
|
#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(warning(tokens))
|
|
102
111
|
#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(msc)
|
|
@@ -124,6 +133,11 @@
|
|
|
124
133
|
#define HWY_NORETURN __attribute__((noreturn))
|
|
125
134
|
#define HWY_LIKELY(expr) __builtin_expect(!!(expr), 1)
|
|
126
135
|
#define HWY_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
|
|
136
|
+
#if HWY_COMPILER_GCC || HWY_HAS_BUILTIN(__builtin_unreachable)
|
|
137
|
+
#define HWY_UNREACHABLE __builtin_unreachable()
|
|
138
|
+
#else
|
|
139
|
+
#define HWY_UNREACHABLE
|
|
140
|
+
#endif
|
|
127
141
|
#define HWY_PRAGMA(tokens) _Pragma(#tokens)
|
|
128
142
|
#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(GCC diagnostic tokens)
|
|
129
143
|
#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(gcc)
|
|
@@ -161,7 +175,8 @@ namespace hwy {
|
|
|
161
175
|
// Returns a pointer whose type is `type` (T*), while allowing the compiler to
|
|
162
176
|
// assume that the untyped pointer `ptr` is aligned to a multiple of sizeof(T).
|
|
163
177
|
#define HWY_RCAST_ALIGNED(type, ptr) \
|
|
164
|
-
reinterpret_cast<type>(
|
|
178
|
+
reinterpret_cast<type>( \
|
|
179
|
+
HWY_ASSUME_ALIGNED((ptr), alignof(hwy::RemovePtr<type>)))
|
|
165
180
|
|
|
166
181
|
// Clang and GCC require attributes on each function into which SIMD intrinsics
|
|
167
182
|
// are inlined. Support both per-function annotation (HWY_ATTR) for lambdas and
|
|
@@ -217,7 +232,7 @@ namespace hwy {
|
|
|
217
232
|
// Better:
|
|
218
233
|
// HWY_ASSUME(x == 2);
|
|
219
234
|
// HWY_ASSUME(y == 3);
|
|
220
|
-
#if HWY_HAS_CPP_ATTRIBUTE(assume)
|
|
235
|
+
#if (HWY_CXX_LANG >= 202302L) && HWY_HAS_CPP_ATTRIBUTE(assume)
|
|
221
236
|
#define HWY_ASSUME(expr) [[assume(expr)]]
|
|
222
237
|
#elif HWY_COMPILER_MSVC || HWY_COMPILER_ICC
|
|
223
238
|
#define HWY_ASSUME(expr) __assume(expr)
|
|
@@ -233,32 +248,106 @@ namespace hwy {
|
|
|
233
248
|
#define HWY_ASSUME(expr) static_cast<void>(0)
|
|
234
249
|
#endif
|
|
235
250
|
|
|
236
|
-
// Compile-time fence to prevent undesirable code reordering. On Clang
|
|
237
|
-
// typical asm volatile("" : : : "memory")
|
|
238
|
-
//
|
|
239
|
-
|
|
240
|
-
#
|
|
251
|
+
// Compile-time fence to prevent undesirable code reordering. On Clang, the
|
|
252
|
+
// typical `asm volatile("" : : : "memory")` seems to be ignored. Note that
|
|
253
|
+
// `std::atomic_thread_fence` affects other threads, hence might generate a
|
|
254
|
+
// barrier instruction, but this does not.
|
|
255
|
+
#if !defined(HWY_NO_LIBCXX)
|
|
256
|
+
#define HWY_FENCE std::atomic_signal_fence(std::memory_order_seq_cst)
|
|
257
|
+
#elif HWY_COMPILER_GCC
|
|
258
|
+
#define HWY_FENCE asm volatile("" : : : "memory")
|
|
241
259
|
#else
|
|
242
|
-
// TODO(janwas): investigate alternatives. On Arm, the above generates barriers.
|
|
243
260
|
#define HWY_FENCE
|
|
244
261
|
#endif
|
|
245
262
|
|
|
246
263
|
// 4 instances of a given literal value, useful as input to LoadDup128.
|
|
247
264
|
#define HWY_REP4(literal) literal, literal, literal, literal
|
|
248
265
|
|
|
266
|
+
//------------------------------------------------------------------------------
|
|
267
|
+
// Abort / Warn
|
|
268
|
+
|
|
269
|
+
#if defined(HWY_HEADER_ONLY)
|
|
270
|
+
HWY_DLLEXPORT inline void HWY_FORMAT(3, 4)
|
|
271
|
+
Warn(const char* file, int line, const char* format, ...) {
|
|
272
|
+
char buf[800];
|
|
273
|
+
va_list args;
|
|
274
|
+
va_start(args, format);
|
|
275
|
+
vsnprintf(buf, sizeof(buf), format, args);
|
|
276
|
+
va_end(args);
|
|
277
|
+
|
|
278
|
+
fprintf(stderr, "Warn at %s:%d: %s\n", file, line, buf);
|
|
279
|
+
}
|
|
280
|
+
|
|
281
|
+
HWY_DLLEXPORT HWY_NORETURN inline void HWY_FORMAT(3, 4)
|
|
282
|
+
Abort(const char* file, int line, const char* format, ...) {
|
|
283
|
+
char buf[800];
|
|
284
|
+
va_list args;
|
|
285
|
+
va_start(args, format);
|
|
286
|
+
vsnprintf(buf, sizeof(buf), format, args);
|
|
287
|
+
va_end(args);
|
|
288
|
+
|
|
289
|
+
fprintf(stderr, "Abort at %s:%d: %s\n", file, line, buf);
|
|
290
|
+
|
|
291
|
+
fflush(stderr);
|
|
292
|
+
|
|
293
|
+
// Now terminate the program:
|
|
294
|
+
#if HWY_ARCH_RISCV
|
|
295
|
+
exit(1); // trap/abort just freeze Spike.
|
|
296
|
+
#else
|
|
297
|
+
abort(); // Compile error without this due to HWY_NORETURN.
|
|
298
|
+
#endif
|
|
299
|
+
}
|
|
300
|
+
#else // !HWY_HEADER_ONLY
|
|
301
|
+
// Interfaces for custom Warn/Abort handlers.
|
|
302
|
+
typedef void (*WarnFunc)(const char* file, int line, const char* message);
|
|
303
|
+
|
|
304
|
+
typedef void (*AbortFunc)(const char* file, int line, const char* message);
|
|
305
|
+
|
|
306
|
+
// Returns current Warn() handler, or nullptr if no handler was yet registered,
|
|
307
|
+
// indicating Highway should print to stderr.
|
|
308
|
+
// DEPRECATED because this is thread-hostile and prone to misuse (modifying the
|
|
309
|
+
// underlying pointer through the reference).
|
|
310
|
+
HWY_DLLEXPORT WarnFunc& GetWarnFunc();
|
|
311
|
+
|
|
312
|
+
// Returns current Abort() handler, or nullptr if no handler was yet registered,
|
|
313
|
+
// indicating Highway should print to stderr and abort.
|
|
314
|
+
// DEPRECATED because this is thread-hostile and prone to misuse (modifying the
|
|
315
|
+
// underlying pointer through the reference).
|
|
316
|
+
HWY_DLLEXPORT AbortFunc& GetAbortFunc();
|
|
317
|
+
|
|
318
|
+
// Sets a new Warn() handler and returns the previous handler, which is nullptr
|
|
319
|
+
// if no previous handler was registered, and should otherwise be called from
|
|
320
|
+
// the new handler. Thread-safe.
|
|
321
|
+
HWY_DLLEXPORT WarnFunc SetWarnFunc(WarnFunc func);
|
|
322
|
+
|
|
323
|
+
// Sets a new Abort() handler and returns the previous handler, which is nullptr
|
|
324
|
+
// if no previous handler was registered, and should otherwise be called from
|
|
325
|
+
// the new handler. If all handlers return, then Highway will terminate the app.
|
|
326
|
+
// Thread-safe.
|
|
327
|
+
HWY_DLLEXPORT AbortFunc SetAbortFunc(AbortFunc func);
|
|
328
|
+
|
|
329
|
+
HWY_DLLEXPORT void HWY_FORMAT(3, 4)
|
|
330
|
+
Warn(const char* file, int line, const char* format, ...);
|
|
331
|
+
|
|
249
332
|
HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
|
|
250
333
|
Abort(const char* file, int line, const char* format, ...);
|
|
251
334
|
|
|
335
|
+
#endif // HWY_HEADER_ONLY
|
|
336
|
+
|
|
337
|
+
#define HWY_WARN(format, ...) \
|
|
338
|
+
::hwy::Warn(__FILE__, __LINE__, format, ##__VA_ARGS__)
|
|
339
|
+
|
|
252
340
|
#define HWY_ABORT(format, ...) \
|
|
253
341
|
::hwy::Abort(__FILE__, __LINE__, format, ##__VA_ARGS__)
|
|
254
342
|
|
|
255
343
|
// Always enabled.
|
|
256
|
-
#define
|
|
257
|
-
do {
|
|
258
|
-
if (!(condition)) {
|
|
259
|
-
HWY_ABORT("Assert %s", #condition); \
|
|
260
|
-
}
|
|
344
|
+
#define HWY_ASSERT_M(condition, msg) \
|
|
345
|
+
do { \
|
|
346
|
+
if (!(condition)) { \
|
|
347
|
+
HWY_ABORT("Assert %s: %s", #condition, msg); \
|
|
348
|
+
} \
|
|
261
349
|
} while (0)
|
|
350
|
+
#define HWY_ASSERT(condition) HWY_ASSERT_M(condition, "")
|
|
262
351
|
|
|
263
352
|
#if HWY_HAS_FEATURE(memory_sanitizer) || defined(MEMORY_SANITIZER) || \
|
|
264
353
|
defined(__SANITIZE_MEMORY__)
|
|
@@ -303,12 +392,17 @@ HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
|
|
|
303
392
|
#define HWY_ATTR_NO_MSAN
|
|
304
393
|
#endif
|
|
305
394
|
|
|
395
|
+
#if HWY_IS_ASAN || HWY_IS_HWASAN || HWY_IS_MSAN || HWY_IS_TSAN || HWY_IS_UBSAN
|
|
396
|
+
#define HWY_IS_SANITIZER 1
|
|
397
|
+
#else
|
|
398
|
+
#define HWY_IS_SANITIZER 0
|
|
399
|
+
#endif
|
|
400
|
+
|
|
306
401
|
// For enabling HWY_DASSERT and shortening tests in slower debug builds
|
|
307
402
|
#if !defined(HWY_IS_DEBUG_BUILD)
|
|
308
403
|
// Clang does not define NDEBUG, but it and GCC define __OPTIMIZE__, and recent
|
|
309
404
|
// MSVC defines NDEBUG (if not, could instead check _DEBUG).
|
|
310
|
-
#if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) ||
|
|
311
|
-
HWY_IS_HWASAN || HWY_IS_MSAN || HWY_IS_TSAN || HWY_IS_UBSAN || \
|
|
405
|
+
#if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) || HWY_IS_SANITIZER || \
|
|
312
406
|
defined(__clang_analyzer__)
|
|
313
407
|
#define HWY_IS_DEBUG_BUILD 1
|
|
314
408
|
#else
|
|
@@ -317,8 +411,12 @@ HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
|
|
|
317
411
|
#endif // HWY_IS_DEBUG_BUILD
|
|
318
412
|
|
|
319
413
|
#if HWY_IS_DEBUG_BUILD
|
|
320
|
-
#define
|
|
414
|
+
#define HWY_DASSERT_M(condition, msg) HWY_ASSERT_M(condition, msg)
|
|
415
|
+
#define HWY_DASSERT(condition) HWY_ASSERT_M(condition, "")
|
|
321
416
|
#else
|
|
417
|
+
#define HWY_DASSERT_M(condition, msg) \
|
|
418
|
+
do { \
|
|
419
|
+
} while (0)
|
|
322
420
|
#define HWY_DASSERT(condition) \
|
|
323
421
|
do { \
|
|
324
422
|
} while (0)
|
|
@@ -453,6 +551,13 @@ static inline HWY_MAYBE_UNUSED bool operator==(const uint128_t& a,
|
|
|
453
551
|
return a.lo == b.lo && a.hi == b.hi;
|
|
454
552
|
}
|
|
455
553
|
|
|
554
|
+
#if !defined(HWY_NO_LIBCXX)
|
|
555
|
+
static inline HWY_MAYBE_UNUSED std::ostream& operator<<(std::ostream& os,
|
|
556
|
+
const uint128_t& n) {
|
|
557
|
+
return os << "[hi=" << n.hi << ",lo=" << n.lo << "]";
|
|
558
|
+
}
|
|
559
|
+
#endif
|
|
560
|
+
|
|
456
561
|
static inline HWY_MAYBE_UNUSED bool operator<(const K64V64& a,
|
|
457
562
|
const K64V64& b) {
|
|
458
563
|
return a.key < b.key;
|
|
@@ -467,6 +572,13 @@ static inline HWY_MAYBE_UNUSED bool operator==(const K64V64& a,
|
|
|
467
572
|
return a.key == b.key;
|
|
468
573
|
}
|
|
469
574
|
|
|
575
|
+
#if !defined(HWY_NO_LIBCXX)
|
|
576
|
+
static inline HWY_MAYBE_UNUSED std::ostream& operator<<(std::ostream& os,
|
|
577
|
+
const K64V64& n) {
|
|
578
|
+
return os << "[k=" << n.key << ",v=" << n.value << "]";
|
|
579
|
+
}
|
|
580
|
+
#endif
|
|
581
|
+
|
|
470
582
|
static inline HWY_MAYBE_UNUSED bool operator<(const K32V32& a,
|
|
471
583
|
const K32V32& b) {
|
|
472
584
|
return a.key < b.key;
|
|
@@ -481,6 +593,13 @@ static inline HWY_MAYBE_UNUSED bool operator==(const K32V32& a,
|
|
|
481
593
|
return a.key == b.key;
|
|
482
594
|
}
|
|
483
595
|
|
|
596
|
+
#if !defined(HWY_NO_LIBCXX)
|
|
597
|
+
static inline HWY_MAYBE_UNUSED std::ostream& operator<<(std::ostream& os,
|
|
598
|
+
const K32V32& n) {
|
|
599
|
+
return os << "[k=" << n.key << ",v=" << n.value << "]";
|
|
600
|
+
}
|
|
601
|
+
#endif
|
|
602
|
+
|
|
484
603
|
//------------------------------------------------------------------------------
|
|
485
604
|
// Controlling overload resolution (SFINAE)
|
|
486
605
|
|
|
@@ -882,78 +1001,87 @@ HWY_INLINE constexpr bool IsIntegerLaneType<uint64_t>() {
|
|
|
882
1001
|
return true;
|
|
883
1002
|
}
|
|
884
1003
|
|
|
1004
|
+
namespace detail {
|
|
1005
|
+
|
|
885
1006
|
template <class T>
|
|
886
|
-
|
|
887
|
-
// NOTE: Do not add a
|
|
1007
|
+
static HWY_INLINE constexpr bool IsNonCvInteger() {
|
|
1008
|
+
// NOTE: Do not add an IsNonCvInteger<wchar_t>() specialization below as it is
|
|
888
1009
|
// possible for IsSame<wchar_t, uint16_t>() to be true when compiled with MSVC
|
|
889
1010
|
// with the /Zc:wchar_t- option.
|
|
890
|
-
return IsIntegerLaneType<T>() || IsSame<
|
|
891
|
-
IsSameEither<
|
|
892
|
-
IsSameEither<
|
|
1011
|
+
return IsIntegerLaneType<T>() || IsSame<T, wchar_t>() ||
|
|
1012
|
+
IsSameEither<T, size_t, ptrdiff_t>() ||
|
|
1013
|
+
IsSameEither<T, intptr_t, uintptr_t>();
|
|
893
1014
|
}
|
|
894
1015
|
template <>
|
|
895
|
-
HWY_INLINE constexpr bool
|
|
1016
|
+
HWY_INLINE constexpr bool IsNonCvInteger<bool>() {
|
|
896
1017
|
return true;
|
|
897
1018
|
}
|
|
898
1019
|
template <>
|
|
899
|
-
HWY_INLINE constexpr bool
|
|
1020
|
+
HWY_INLINE constexpr bool IsNonCvInteger<char>() {
|
|
900
1021
|
return true;
|
|
901
1022
|
}
|
|
902
1023
|
template <>
|
|
903
|
-
HWY_INLINE constexpr bool
|
|
1024
|
+
HWY_INLINE constexpr bool IsNonCvInteger<signed char>() {
|
|
904
1025
|
return true;
|
|
905
1026
|
}
|
|
906
1027
|
template <>
|
|
907
|
-
HWY_INLINE constexpr bool
|
|
1028
|
+
HWY_INLINE constexpr bool IsNonCvInteger<unsigned char>() {
|
|
908
1029
|
return true;
|
|
909
1030
|
}
|
|
910
1031
|
template <>
|
|
911
|
-
HWY_INLINE constexpr bool
|
|
1032
|
+
HWY_INLINE constexpr bool IsNonCvInteger<short>() { // NOLINT
|
|
912
1033
|
return true;
|
|
913
1034
|
}
|
|
914
1035
|
template <>
|
|
915
|
-
HWY_INLINE constexpr bool
|
|
1036
|
+
HWY_INLINE constexpr bool IsNonCvInteger<unsigned short>() { // NOLINT
|
|
916
1037
|
return true;
|
|
917
1038
|
}
|
|
918
1039
|
template <>
|
|
919
|
-
HWY_INLINE constexpr bool
|
|
1040
|
+
HWY_INLINE constexpr bool IsNonCvInteger<int>() {
|
|
920
1041
|
return true;
|
|
921
1042
|
}
|
|
922
1043
|
template <>
|
|
923
|
-
HWY_INLINE constexpr bool
|
|
1044
|
+
HWY_INLINE constexpr bool IsNonCvInteger<unsigned>() {
|
|
924
1045
|
return true;
|
|
925
1046
|
}
|
|
926
1047
|
template <>
|
|
927
|
-
HWY_INLINE constexpr bool
|
|
1048
|
+
HWY_INLINE constexpr bool IsNonCvInteger<long>() { // NOLINT
|
|
928
1049
|
return true;
|
|
929
1050
|
}
|
|
930
1051
|
template <>
|
|
931
|
-
HWY_INLINE constexpr bool
|
|
1052
|
+
HWY_INLINE constexpr bool IsNonCvInteger<unsigned long>() { // NOLINT
|
|
932
1053
|
return true;
|
|
933
1054
|
}
|
|
934
1055
|
template <>
|
|
935
|
-
HWY_INLINE constexpr bool
|
|
1056
|
+
HWY_INLINE constexpr bool IsNonCvInteger<long long>() { // NOLINT
|
|
936
1057
|
return true;
|
|
937
1058
|
}
|
|
938
1059
|
template <>
|
|
939
|
-
HWY_INLINE constexpr bool
|
|
1060
|
+
HWY_INLINE constexpr bool IsNonCvInteger<unsigned long long>() { // NOLINT
|
|
940
1061
|
return true;
|
|
941
1062
|
}
|
|
942
1063
|
#if defined(__cpp_char8_t) && __cpp_char8_t >= 201811L
|
|
943
1064
|
template <>
|
|
944
|
-
HWY_INLINE constexpr bool
|
|
1065
|
+
HWY_INLINE constexpr bool IsNonCvInteger<char8_t>() {
|
|
945
1066
|
return true;
|
|
946
1067
|
}
|
|
947
1068
|
#endif
|
|
948
1069
|
template <>
|
|
949
|
-
HWY_INLINE constexpr bool
|
|
1070
|
+
HWY_INLINE constexpr bool IsNonCvInteger<char16_t>() {
|
|
950
1071
|
return true;
|
|
951
1072
|
}
|
|
952
1073
|
template <>
|
|
953
|
-
HWY_INLINE constexpr bool
|
|
1074
|
+
HWY_INLINE constexpr bool IsNonCvInteger<char32_t>() {
|
|
954
1075
|
return true;
|
|
955
1076
|
}
|
|
956
1077
|
|
|
1078
|
+
} // namespace detail
|
|
1079
|
+
|
|
1080
|
+
template <class T>
|
|
1081
|
+
HWY_API constexpr bool IsInteger() {
|
|
1082
|
+
return detail::IsNonCvInteger<RemoveCvRef<T>>();
|
|
1083
|
+
}
|
|
1084
|
+
|
|
957
1085
|
// -----------------------------------------------------------------------------
|
|
958
1086
|
// BitCastScalar
|
|
959
1087
|
|
|
@@ -1042,6 +1170,7 @@ HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From& val) {
|
|
|
1042
1170
|
|
|
1043
1171
|
#pragma pack(push, 1)
|
|
1044
1172
|
|
|
1173
|
+
#ifndef HWY_NEON_HAVE_F16C // allow override
|
|
1045
1174
|
// Compiler supports __fp16 and load/store/conversion NEON intrinsics, which are
|
|
1046
1175
|
// included in Armv8 and VFPv4 (except with MSVC). On Armv7 Clang requires
|
|
1047
1176
|
// __ARM_FP & 2 whereas Armv7 GCC requires -mfp16-format=ieee.
|
|
@@ -1052,6 +1181,7 @@ HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From& val) {
|
|
|
1052
1181
|
#else
|
|
1053
1182
|
#define HWY_NEON_HAVE_F16C 0
|
|
1054
1183
|
#endif
|
|
1184
|
+
#endif // HWY_NEON_HAVE_F16C
|
|
1055
1185
|
|
|
1056
1186
|
// RVV with f16 extension supports _Float16 and f16 vector ops. If set, implies
|
|
1057
1187
|
// HWY_HAVE_FLOAT16.
|
|
@@ -1071,9 +1201,10 @@ HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From& val) {
|
|
|
1071
1201
|
#define HWY_SSE2_HAVE_F16_TYPE 0
|
|
1072
1202
|
#endif
|
|
1073
1203
|
|
|
1074
|
-
#ifndef HWY_HAVE_SCALAR_F16_TYPE
|
|
1204
|
+
#ifndef HWY_HAVE_SCALAR_F16_TYPE // allow override
|
|
1075
1205
|
// Compiler supports _Float16, not necessarily with operators.
|
|
1076
|
-
#if HWY_NEON_HAVE_F16C || HWY_RVV_HAVE_F16_VEC || HWY_SSE2_HAVE_F16_TYPE
|
|
1206
|
+
#if HWY_NEON_HAVE_F16C || HWY_RVV_HAVE_F16_VEC || HWY_SSE2_HAVE_F16_TYPE || \
|
|
1207
|
+
__SPIRV_DEVICE__
|
|
1077
1208
|
#define HWY_HAVE_SCALAR_F16_TYPE 1
|
|
1078
1209
|
#else
|
|
1079
1210
|
#define HWY_HAVE_SCALAR_F16_TYPE 0
|
|
@@ -1125,17 +1256,19 @@ using NativeSpecialFloatToWrapper =
|
|
|
1125
1256
|
// are generated regardless of F16 support; see #1684.
|
|
1126
1257
|
struct alignas(2) float16_t {
|
|
1127
1258
|
#if HWY_HAVE_SCALAR_F16_TYPE
|
|
1128
|
-
#if HWY_RVV_HAVE_F16_VEC || HWY_SSE2_HAVE_F16_TYPE
|
|
1259
|
+
#if HWY_RVV_HAVE_F16_VEC || HWY_SSE2_HAVE_F16_TYPE || __SPIRV_DEVICE__
|
|
1129
1260
|
using Native = _Float16;
|
|
1130
1261
|
#elif HWY_NEON_HAVE_F16C
|
|
1131
1262
|
using Native = __fp16;
|
|
1132
1263
|
#else
|
|
1133
1264
|
#error "Logic error: condition should be 'all but NEON_HAVE_F16C'"
|
|
1134
1265
|
#endif
|
|
1266
|
+
#elif HWY_IDE
|
|
1267
|
+
using Native = uint16_t;
|
|
1135
1268
|
#endif // HWY_HAVE_SCALAR_F16_TYPE
|
|
1136
1269
|
|
|
1137
1270
|
union {
|
|
1138
|
-
#if HWY_HAVE_SCALAR_F16_TYPE
|
|
1271
|
+
#if HWY_HAVE_SCALAR_F16_TYPE || HWY_IDE
|
|
1139
1272
|
// Accessed via NativeLaneType, and used directly if
|
|
1140
1273
|
// HWY_HAVE_SCALAR_F16_OPERATORS.
|
|
1141
1274
|
Native native;
|
|
@@ -1581,9 +1714,13 @@ HWY_F16_CONSTEXPR inline std::partial_ordering operator<=>(
|
|
|
1581
1714
|
#endif
|
|
1582
1715
|
|
|
1583
1716
|
// x86 compiler supports __bf16, not necessarily with operators.
|
|
1717
|
+
// Disable in debug builds due to clang miscompiles as of 2025-07-22: casting
|
|
1718
|
+
// bf16 <-> f32 in convert_test results in 0x2525 for 1.0 instead of 0x3f80.
|
|
1719
|
+
// Reported at https://github.com/llvm/llvm-project/issues/151692.
|
|
1584
1720
|
#ifndef HWY_SSE2_HAVE_SCALAR_BF16_TYPE
|
|
1585
|
-
#if HWY_ARCH_X86 && defined(__SSE2__) &&
|
|
1586
|
-
((HWY_COMPILER_CLANG >= 1700 && !HWY_COMPILER_CLANGCL
|
|
1721
|
+
#if HWY_ARCH_X86 && defined(__SSE2__) && \
|
|
1722
|
+
((HWY_COMPILER_CLANG >= 1700 && !HWY_COMPILER_CLANGCL && \
|
|
1723
|
+
!HWY_IS_DEBUG_BUILD) || \
|
|
1587
1724
|
HWY_COMPILER_GCC_ACTUAL >= 1300)
|
|
1588
1725
|
#define HWY_SSE2_HAVE_SCALAR_BF16_TYPE 1
|
|
1589
1726
|
#else
|
|
@@ -1617,10 +1754,12 @@ HWY_F16_CONSTEXPR inline std::partial_ordering operator<=>(
|
|
|
1617
1754
|
struct alignas(2) bfloat16_t {
|
|
1618
1755
|
#if HWY_HAVE_SCALAR_BF16_TYPE
|
|
1619
1756
|
using Native = __bf16;
|
|
1757
|
+
#elif HWY_IDE
|
|
1758
|
+
using Native = uint16_t;
|
|
1620
1759
|
#endif
|
|
1621
1760
|
|
|
1622
1761
|
union {
|
|
1623
|
-
#if HWY_HAVE_SCALAR_BF16_TYPE
|
|
1762
|
+
#if HWY_HAVE_SCALAR_BF16_TYPE || HWY_IDE
|
|
1624
1763
|
// Accessed via NativeLaneType, and used directly if
|
|
1625
1764
|
// HWY_HAVE_SCALAR_BF16_OPERATORS.
|
|
1626
1765
|
Native native;
|
|
@@ -1637,7 +1776,7 @@ struct alignas(2) bfloat16_t {
|
|
|
1637
1776
|
bfloat16_t& operator=(const bfloat16_t& arg) noexcept = default;
|
|
1638
1777
|
|
|
1639
1778
|
// Only enable implicit conversions if we have a native type.
|
|
1640
|
-
#if HWY_HAVE_SCALAR_BF16_TYPE
|
|
1779
|
+
#if HWY_HAVE_SCALAR_BF16_TYPE || HWY_IDE
|
|
1641
1780
|
constexpr bfloat16_t(Native arg) noexcept : native(arg) {}
|
|
1642
1781
|
constexpr operator Native() const noexcept { return native; }
|
|
1643
1782
|
#endif
|
|
@@ -1818,38 +1957,33 @@ static HWY_INLINE HWY_MAYBE_UNUSED constexpr uint32_t F32BitsToBF16RoundIncr(
|
|
|
1818
1957
|
: 0u);
|
|
1819
1958
|
}
|
|
1820
1959
|
|
|
1960
|
+
// If f32_bits is the bit representation of a NaN F32 value, make sure that
|
|
1961
|
+
// bit 6 of the BF16 result is set to convert SNaN F32 values to QNaN BF16
|
|
1962
|
+
// values and to prevent NaN F32 values from being converted to an infinite
|
|
1963
|
+
// BF16 value
|
|
1964
|
+
static HWY_INLINE constexpr uint32_t BF16BitsIfSNAN(uint32_t f32_bits) {
|
|
1965
|
+
return ((f32_bits & 0x7FFFFFFFu) > 0x7F800000u) ? (uint32_t{1} << 6) : 0;
|
|
1966
|
+
}
|
|
1967
|
+
|
|
1821
1968
|
// Converts f32_bits (the bit representation of an F32 value) to BF16 bits,
|
|
1822
1969
|
// rounded to the nearest BF16 value
|
|
1823
1970
|
static HWY_INLINE HWY_MAYBE_UNUSED constexpr uint16_t F32BitsToBF16Bits(
|
|
1824
1971
|
const uint32_t f32_bits) {
|
|
1825
|
-
// Round f32_bits to the nearest BF16 by first adding
|
|
1826
|
-
// F32BitsToBF16RoundIncr(f32_bits) to f32_bits and then right shifting
|
|
1827
|
-
// f32_bits + F32BitsToBF16RoundIncr(f32_bits) by 16
|
|
1828
|
-
|
|
1829
|
-
// If f32_bits is the bit representation of a NaN F32 value, make sure that
|
|
1830
|
-
// bit 6 of the BF16 result is set to convert SNaN F32 values to QNaN BF16
|
|
1831
|
-
// values and to prevent NaN F32 values from being converted to an infinite
|
|
1832
|
-
// BF16 value
|
|
1833
1972
|
return static_cast<uint16_t>(
|
|
1834
|
-
(
|
|
1835
|
-
(
|
|
1973
|
+
BF16BitsIfSNAN(f32_bits) |
|
|
1974
|
+
((f32_bits + F32BitsToBF16RoundIncr(f32_bits)) >> 16));
|
|
1836
1975
|
}
|
|
1837
1976
|
|
|
1838
1977
|
} // namespace detail
|
|
1839
1978
|
|
|
1840
1979
|
HWY_API HWY_BF16_CONSTEXPR bfloat16_t BF16FromF32(float f) {
|
|
1841
|
-
|
|
1842
|
-
|
|
1843
|
-
#else
|
|
1980
|
+
// The rounding mode is not specified in the C++ standard, so ignore
|
|
1981
|
+
// `HWY_HAVE_SCALAR_BF16_OPERATORS` and only use our round to nearest.
|
|
1844
1982
|
return bfloat16_t::FromBits(
|
|
1845
1983
|
detail::F32BitsToBF16Bits(BitCastScalar<uint32_t>(f)));
|
|
1846
|
-
#endif
|
|
1847
1984
|
}
|
|
1848
1985
|
|
|
1849
1986
|
HWY_API HWY_BF16_CONSTEXPR bfloat16_t BF16FromF64(double f64) {
|
|
1850
|
-
#if HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
1851
|
-
return static_cast<bfloat16_t>(f64);
|
|
1852
|
-
#else
|
|
1853
1987
|
// The mantissa bits of f64 are first rounded using round-to-odd rounding
|
|
1854
1988
|
// to the nearest f64 value that has the lower 38 bits zeroed out to
|
|
1855
1989
|
// ensure that the result is correctly rounded to a BF16.
|
|
@@ -1885,7 +2019,6 @@ HWY_API HWY_BF16_CONSTEXPR bfloat16_t BF16FromF64(double f64) {
|
|
|
1885
2019
|
(BitCastScalar<uint64_t>(f64) & 0xFFFFFFC000000000ULL) |
|
|
1886
2020
|
((BitCastScalar<uint64_t>(f64) + 0x0000003FFFFFFFFFULL) &
|
|
1887
2021
|
0x0000004000000000ULL)))));
|
|
1888
|
-
#endif
|
|
1889
2022
|
}
|
|
1890
2023
|
|
|
1891
2024
|
// More convenient to define outside bfloat16_t because these may use
|
|
@@ -2178,6 +2311,11 @@ constexpr bool IsSigned<hwy::K32V32>() {
|
|
|
2178
2311
|
return false;
|
|
2179
2312
|
}
|
|
2180
2313
|
|
|
2314
|
+
template <typename T>
|
|
2315
|
+
HWY_API constexpr bool IsUnsigned() {
|
|
2316
|
+
return IsInteger<T>() && !IsSigned<T>();
|
|
2317
|
+
}
|
|
2318
|
+
|
|
2181
2319
|
template <typename T, bool = IsInteger<T>() && !IsIntegerLaneType<T>()>
|
|
2182
2320
|
struct MakeLaneTypeIfIntegerT {
|
|
2183
2321
|
using type = T;
|
|
@@ -2364,6 +2502,45 @@ constexpr MakeSigned<T> MaxExponentField() {
|
|
|
2364
2502
|
return (MakeSigned<T>{1} << ExponentBits<T>()) - 1;
|
|
2365
2503
|
}
|
|
2366
2504
|
|
|
2505
|
+
namespace detail {
|
|
2506
|
+
|
|
2507
|
+
template <typename T>
|
|
2508
|
+
static HWY_INLINE HWY_MAYBE_UNUSED HWY_BITCASTSCALAR_CONSTEXPR T
|
|
2509
|
+
NegativeInfOrLowestValue(hwy::FloatTag /* tag */) {
|
|
2510
|
+
return BitCastScalar<T>(
|
|
2511
|
+
static_cast<MakeUnsigned<T>>(SignMask<T>() | ExponentMask<T>()));
|
|
2512
|
+
}
|
|
2513
|
+
|
|
2514
|
+
template <typename T>
|
|
2515
|
+
static HWY_INLINE HWY_MAYBE_UNUSED HWY_BITCASTSCALAR_CONSTEXPR T
|
|
2516
|
+
NegativeInfOrLowestValue(hwy::NonFloatTag /* tag */) {
|
|
2517
|
+
return LowestValue<T>();
|
|
2518
|
+
}
|
|
2519
|
+
|
|
2520
|
+
template <typename T>
|
|
2521
|
+
static HWY_INLINE HWY_MAYBE_UNUSED HWY_BITCASTSCALAR_CONSTEXPR T
|
|
2522
|
+
PositiveInfOrHighestValue(hwy::FloatTag /* tag */) {
|
|
2523
|
+
return BitCastScalar<T>(ExponentMask<T>());
|
|
2524
|
+
}
|
|
2525
|
+
|
|
2526
|
+
template <typename T>
|
|
2527
|
+
static HWY_INLINE HWY_MAYBE_UNUSED HWY_BITCASTSCALAR_CONSTEXPR T
|
|
2528
|
+
PositiveInfOrHighestValue(hwy::NonFloatTag /* tag */) {
|
|
2529
|
+
return HighestValue<T>();
|
|
2530
|
+
}
|
|
2531
|
+
|
|
2532
|
+
} // namespace detail
|
|
2533
|
+
|
|
2534
|
+
template <typename T>
|
|
2535
|
+
HWY_API HWY_BITCASTSCALAR_CONSTEXPR T NegativeInfOrLowestValue() {
|
|
2536
|
+
return detail::NegativeInfOrLowestValue<T>(IsFloatTag<T>());
|
|
2537
|
+
}
|
|
2538
|
+
|
|
2539
|
+
template <typename T>
|
|
2540
|
+
HWY_API HWY_BITCASTSCALAR_CONSTEXPR T PositiveInfOrHighestValue() {
|
|
2541
|
+
return detail::PositiveInfOrHighestValue<T>(IsFloatTag<T>());
|
|
2542
|
+
}
|
|
2543
|
+
|
|
2367
2544
|
//------------------------------------------------------------------------------
|
|
2368
2545
|
// Additional F16/BF16 operators
|
|
2369
2546
|
|
|
@@ -2381,6 +2558,17 @@ constexpr MakeSigned<T> MaxExponentField() {
|
|
|
2381
2558
|
return static_cast<ResultT>(a op b.native); \
|
|
2382
2559
|
}
|
|
2383
2560
|
|
|
2561
|
+
#define HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(op, assign_op, T2) \
|
|
2562
|
+
template <typename T1, \
|
|
2563
|
+
hwy::EnableIf<hwy::IsInteger<RemoveCvRef<T1>>() || \
|
|
2564
|
+
hwy::IsFloat3264<RemoveCvRef<T1>>()>* = nullptr, \
|
|
2565
|
+
typename ResultT = \
|
|
2566
|
+
decltype(DeclVal<T1&>() assign_op DeclVal<T2::Native>())> \
|
|
2567
|
+
static HWY_INLINE constexpr ResultT operator assign_op(T1& a, \
|
|
2568
|
+
T2 b) noexcept { \
|
|
2569
|
+
return (a assign_op b.native); \
|
|
2570
|
+
}
|
|
2571
|
+
|
|
2384
2572
|
#define HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(op, op_func, T1) \
|
|
2385
2573
|
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(op, op_func, T1) \
|
|
2386
2574
|
template < \
|
|
@@ -2399,6 +2587,10 @@ HWY_RHS_SPECIAL_FLOAT_ARITH_OP(+, operator+, float16_t)
|
|
|
2399
2587
|
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(-, operator-, float16_t)
|
|
2400
2588
|
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(*, operator*, float16_t)
|
|
2401
2589
|
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(/, operator/, float16_t)
|
|
2590
|
+
HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(+, +=, float16_t)
|
|
2591
|
+
HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(-, -=, float16_t)
|
|
2592
|
+
HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(*, *=, float16_t)
|
|
2593
|
+
HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(/, /=, float16_t)
|
|
2402
2594
|
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(==, operator==, float16_t)
|
|
2403
2595
|
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(!=, operator!=, float16_t)
|
|
2404
2596
|
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<, operator<, float16_t)
|
|
@@ -2415,6 +2607,10 @@ HWY_RHS_SPECIAL_FLOAT_ARITH_OP(+, operator+, bfloat16_t)
|
|
|
2415
2607
|
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(-, operator-, bfloat16_t)
|
|
2416
2608
|
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(*, operator*, bfloat16_t)
|
|
2417
2609
|
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(/, operator/, bfloat16_t)
|
|
2610
|
+
HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(+, +=, bfloat16_t)
|
|
2611
|
+
HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(-, -=, bfloat16_t)
|
|
2612
|
+
HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(*, *=, bfloat16_t)
|
|
2613
|
+
HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(/, /=, bfloat16_t)
|
|
2418
2614
|
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(==, operator==, bfloat16_t)
|
|
2419
2615
|
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(!=, operator!=, bfloat16_t)
|
|
2420
2616
|
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<, operator<, bfloat16_t)
|
|
@@ -2427,6 +2623,7 @@ HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=>, operator<=>, bfloat16_t)
|
|
|
2427
2623
|
#endif // HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
2428
2624
|
|
|
2429
2625
|
#undef HWY_RHS_SPECIAL_FLOAT_ARITH_OP
|
|
2626
|
+
#undef HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP
|
|
2430
2627
|
#undef HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP
|
|
2431
2628
|
|
|
2432
2629
|
#endif // HWY_HAVE_SCALAR_F16_OPERATORS || HWY_HAVE_SCALAR_BF16_OPERATORS
|
|
@@ -2452,53 +2649,83 @@ HWY_API float F32FromBF16Mem(const void* ptr) {
|
|
|
2452
2649
|
#define HWY_BF16_TO_F16_CONSTEXPR HWY_F16_CONSTEXPR
|
|
2453
2650
|
#endif
|
|
2454
2651
|
|
|
2455
|
-
|
|
2456
|
-
|
|
2457
|
-
|
|
2458
|
-
|
|
2459
|
-
|
|
2460
|
-
|
|
2461
|
-
template <typename TTo, typename TFrom, HWY_IF_F16(TTo),
|
|
2462
|
-
HWY_IF_NOT_SPECIAL_FLOAT(TFrom), HWY_IF_NOT_SAME(TFrom, double)>
|
|
2463
|
-
HWY_API constexpr TTo ConvertScalarTo(const TFrom in) {
|
|
2464
|
-
return F16FromF32(static_cast<float>(in));
|
|
2652
|
+
namespace detail {
|
|
2653
|
+
|
|
2654
|
+
template <class TTo, class TFrom>
|
|
2655
|
+
static HWY_INLINE HWY_MAYBE_UNUSED constexpr TTo ConvertScalarToResult(
|
|
2656
|
+
hwy::SizeTag<0> /*conv_to_tag*/, TFrom in) {
|
|
2657
|
+
return static_cast<TTo>(static_cast<TFrom>(in));
|
|
2465
2658
|
}
|
|
2466
|
-
|
|
2467
|
-
|
|
2468
|
-
|
|
2469
|
-
|
|
2659
|
+
|
|
2660
|
+
template <class TTo>
|
|
2661
|
+
static HWY_INLINE HWY_MAYBE_UNUSED HWY_F16_CONSTEXPR TTo
|
|
2662
|
+
ConvertScalarToResult(hwy::FloatTag /*conv_to_tag*/, float in) {
|
|
2663
|
+
return F16FromF32(in);
|
|
2470
2664
|
}
|
|
2471
|
-
|
|
2472
|
-
|
|
2665
|
+
|
|
2666
|
+
template <class TTo>
|
|
2667
|
+
static HWY_INLINE HWY_MAYBE_UNUSED HWY_F16_CONSTEXPR TTo
|
|
2668
|
+
ConvertScalarToResult(hwy::FloatTag /*conv_to_tag*/, double in) {
|
|
2473
2669
|
return F16FromF64(in);
|
|
2474
2670
|
}
|
|
2475
|
-
|
|
2476
|
-
|
|
2477
|
-
|
|
2478
|
-
|
|
2479
|
-
|
|
2480
|
-
template <typename TTo, HWY_IF_BF16(TTo)>
|
|
2481
|
-
HWY_API HWY_BF16_TO_F16_CONSTEXPR TTo ConvertScalarTo(const hwy::float16_t in) {
|
|
2482
|
-
return BF16FromF32(F32FromF16(in));
|
|
2671
|
+
|
|
2672
|
+
template <class TTo>
|
|
2673
|
+
static HWY_INLINE HWY_MAYBE_UNUSED HWY_BF16_CONSTEXPR TTo
|
|
2674
|
+
ConvertScalarToResult(hwy::SpecialTag /*conv_to_tag*/, float in) {
|
|
2675
|
+
return BF16FromF32(in);
|
|
2483
2676
|
}
|
|
2484
|
-
|
|
2485
|
-
|
|
2677
|
+
|
|
2678
|
+
template <class TTo>
|
|
2679
|
+
static HWY_INLINE HWY_MAYBE_UNUSED HWY_BF16_CONSTEXPR TTo
|
|
2680
|
+
ConvertScalarToResult(hwy::SpecialTag /*conv_to_tag*/, double in) {
|
|
2486
2681
|
return BF16FromF64(in);
|
|
2487
2682
|
}
|
|
2488
|
-
|
|
2489
|
-
|
|
2490
|
-
|
|
2491
|
-
|
|
2683
|
+
|
|
2684
|
+
template <class TFrom, HWY_IF_BF16(TFrom)>
|
|
2685
|
+
static HWY_INLINE HWY_MAYBE_UNUSED HWY_BF16_CONSTEXPR float
|
|
2686
|
+
ConvertScalarSpecialFloatToF32(hwy::SpecialTag /*conv_from_tag*/, TFrom in) {
|
|
2687
|
+
return F32FromBF16(in);
|
|
2492
2688
|
}
|
|
2493
|
-
|
|
2494
|
-
|
|
2495
|
-
|
|
2496
|
-
|
|
2689
|
+
|
|
2690
|
+
template <class TFrom, HWY_IF_F16(TFrom)>
|
|
2691
|
+
static HWY_INLINE HWY_MAYBE_UNUSED HWY_F16_CONSTEXPR float
|
|
2692
|
+
ConvertScalarSpecialFloatToF32(hwy::SpecialTag /*conv_from_tag*/, TFrom in) {
|
|
2693
|
+
return F32FromF16(in);
|
|
2497
2694
|
}
|
|
2498
|
-
|
|
2499
|
-
template <
|
|
2500
|
-
|
|
2501
|
-
|
|
2695
|
+
|
|
2696
|
+
template <class TFrom>
|
|
2697
|
+
static HWY_INLINE HWY_MAYBE_UNUSED constexpr auto
|
|
2698
|
+
ConvertScalarSpecialFloatToF32(hwy::FloatTag /*conv_from_tag*/, TFrom in)
|
|
2699
|
+
-> hwy::If<hwy::IsSame<hwy::RemoveCvRef<TFrom>, double>(), double, float> {
|
|
2700
|
+
return static_cast<
|
|
2701
|
+
hwy::If<hwy::IsSame<hwy::RemoveCvRef<TFrom>, double>(), double, float>>(
|
|
2702
|
+
in);
|
|
2703
|
+
}
|
|
2704
|
+
|
|
2705
|
+
template <class TFrom>
|
|
2706
|
+
static HWY_INLINE HWY_MAYBE_UNUSED constexpr TFrom
|
|
2707
|
+
ConvertScalarSpecialFloatToF32(hwy::SizeTag<0> /*conv_from_tag*/, TFrom in) {
|
|
2708
|
+
return static_cast<TFrom>(in);
|
|
2709
|
+
}
|
|
2710
|
+
|
|
2711
|
+
} // namespace detail
|
|
2712
|
+
|
|
2713
|
+
template <typename TTo, typename TFrom>
|
|
2714
|
+
HWY_API constexpr TTo ConvertScalarTo(TFrom in) {
|
|
2715
|
+
return detail::ConvertScalarToResult<TTo>(
|
|
2716
|
+
hwy::SizeTag<
|
|
2717
|
+
(!hwy::IsSame<hwy::RemoveCvRef<TFrom>, hwy::RemoveCvRef<TTo>>() &&
|
|
2718
|
+
hwy::IsSpecialFloat<TTo>())
|
|
2719
|
+
? (hwy::IsSame<RemoveCvRef<TTo>, hwy::bfloat16_t>() ? 0x300
|
|
2720
|
+
: 0x200)
|
|
2721
|
+
: 0>(),
|
|
2722
|
+
detail::ConvertScalarSpecialFloatToF32(
|
|
2723
|
+
hwy::SizeTag<
|
|
2724
|
+
(!hwy::IsSame<hwy::RemoveCvRef<TFrom>, hwy::RemoveCvRef<TTo>>() &&
|
|
2725
|
+
(hwy::IsSpecialFloat<TFrom>() || hwy::IsSpecialFloat<TTo>()))
|
|
2726
|
+
? (hwy::IsSpecialFloat<TFrom>() ? 0x300 : 0x200)
|
|
2727
|
+
: 0>(),
|
|
2728
|
+
static_cast<TFrom&&>(in)));
|
|
2502
2729
|
}
|
|
2503
2730
|
|
|
2504
2731
|
//------------------------------------------------------------------------------
|
|
@@ -2506,10 +2733,13 @@ HWY_API constexpr TTo ConvertScalarTo(TTo in) {
|
|
|
2506
2733
|
|
|
2507
2734
|
// Returns `a / b` rounded up, computed as (a + b - 1) / b and returned as T1.
// This equals the ceiling for non-negative `a` and positive `b`, provided
// `a + b - 1` does not overflow. `b` must be non-zero; this is checked by
// HWY_DASSERT only when compiling as C++17 or later.
template <typename T1, typename T2>
constexpr inline T1 DivCeil(T1 a, T2 b) {
#if HWY_CXX_LANG >= 201703L
  HWY_DASSERT(b != 0);
#endif
  return (a + b - 1) / b;
}
|
|
2511
2741
|
|
|
2512
|
-
// Works for any `align`; if a power of two, compiler emits ADD+AND.
|
|
2742
|
+
// Works for any non-zero `align`; if a power of two, compiler emits ADD+AND.
|
|
2513
2743
|
constexpr inline size_t RoundUpTo(size_t what, size_t align) {
|
|
2514
2744
|
return DivCeil(what, align) * align;
|
|
2515
2745
|
}
|
|
@@ -2803,6 +3033,97 @@ class Divisor {
|
|
|
2803
3033
|
uint32_t shift2_ = 0;
|
|
2804
3034
|
};
|
|
2805
3035
|
|
|
3036
|
+
#ifndef HWY_HAVE_DIV128 // allow override
|
|
3037
|
+
// Exclude clang-cl because it calls __divti3 from clang_rt.builtins-x86_64,
|
|
3038
|
+
// which is not linked in.
|
|
3039
|
+
#if (HWY_COMPILER_MSVC >= 1920 && HWY_ARCH_X86_64) || \
|
|
3040
|
+
(defined(__SIZEOF_INT128__) && !HWY_COMPILER_CLANGCL)
|
|
3041
|
+
#define HWY_HAVE_DIV128 1
|
|
3042
|
+
#else
|
|
3043
|
+
#define HWY_HAVE_DIV128 0
|
|
3044
|
+
#endif
|
|
3045
|
+
#endif // HWY_HAVE_DIV128
|
|
3046
|
+
|
|
3047
|
+
// Divisor64 can precompute the multiplicative inverse.
|
|
3048
|
+
#if HWY_HAVE_DIV128
|
|
3049
|
+
|
|
3050
|
+
#if HWY_COMPILER_MSVC >= 1920 && HWY_ARCH_X86_64
|
|
3051
|
+
#pragma intrinsic(_udiv128)
|
|
3052
|
+
#pragma intrinsic(__umulh)
|
|
3053
|
+
#endif
|
|
3054
|
+
|
|
3055
|
+
// As above, but for 64-bit divisors: more expensive to compute and initialize.
|
|
3056
|
+
class Divisor64 {
|
|
3057
|
+
public:
|
|
3058
|
+
explicit Divisor64(uint64_t divisor) : divisor_(divisor) {
|
|
3059
|
+
if (divisor <= 1) return;
|
|
3060
|
+
|
|
3061
|
+
const uint64_t len =
|
|
3062
|
+
static_cast<uint64_t>(63 - Num0BitsAboveMS1Bit_Nonzero64(divisor - 1));
|
|
3063
|
+
const uint64_t u_hi = (2ULL << len) - divisor;
|
|
3064
|
+
const uint64_t q = Div128(u_hi, divisor);
|
|
3065
|
+
|
|
3066
|
+
mul_ = q + 1;
|
|
3067
|
+
shift1_ = 1;
|
|
3068
|
+
shift2_ = len;
|
|
3069
|
+
}
|
|
3070
|
+
|
|
3071
|
+
uint64_t GetDivisor() const { return divisor_; }
|
|
3072
|
+
|
|
3073
|
+
// Returns n / divisor_.
|
|
3074
|
+
uint64_t Divide(uint64_t n) const {
|
|
3075
|
+
const uint64_t t = MulHigh(mul_, n);
|
|
3076
|
+
return (t + ((n - t) >> shift1_)) >> shift2_;
|
|
3077
|
+
}
|
|
3078
|
+
|
|
3079
|
+
// Returns n % divisor_.
|
|
3080
|
+
uint64_t Remainder(uint64_t n) const { return n - (Divide(n) * divisor_); }
|
|
3081
|
+
|
|
3082
|
+
private:
|
|
3083
|
+
uint64_t divisor_;
|
|
3084
|
+
|
|
3085
|
+
static uint64_t Div128(uint64_t hi, uint64_t div) {
|
|
3086
|
+
#if HWY_COMPILER_MSVC >= 1920 && HWY_ARCH_X86_64
|
|
3087
|
+
unsigned __int64 remainder; // unused
|
|
3088
|
+
return _udiv128(hi, uint64_t{0}, div, &remainder);
|
|
3089
|
+
#else
|
|
3090
|
+
using u128 = unsigned __int128;
|
|
3091
|
+
const u128 hi128 = static_cast<u128>(hi) << 64;
|
|
3092
|
+
return static_cast<uint64_t>(hi128 / static_cast<u128>(div));
|
|
3093
|
+
#endif
|
|
3094
|
+
}
|
|
3095
|
+
|
|
3096
|
+
static uint64_t MulHigh(uint64_t a, uint64_t b) {
|
|
3097
|
+
#if HWY_COMPILER_MSVC >= 1920 && HWY_ARCH_X86_64
|
|
3098
|
+
return __umulh(a, b);
|
|
3099
|
+
#else
|
|
3100
|
+
using u128 = unsigned __int128;
|
|
3101
|
+
const u128 a128 = static_cast<u128>(a);
|
|
3102
|
+
const u128 b128 = static_cast<u128>(b);
|
|
3103
|
+
return static_cast<uint64_t>((a128 * b128) >> 64);
|
|
3104
|
+
#endif
|
|
3105
|
+
}
|
|
3106
|
+
|
|
3107
|
+
uint64_t mul_ = 1;
|
|
3108
|
+
uint64_t shift1_ = 0;
|
|
3109
|
+
uint64_t shift2_ = 0;
|
|
3110
|
+
};
|
|
3111
|
+
#else
|
|
3112
|
+
// No Div128 available, use built-in 64-bit division on each call.
//
// Same interface as the HWY_HAVE_DIV128 implementation. A zero divisor is
// handled explicitly: the precomputed implementation returns `n` from both
// Divide() and Remainder() in that case, whereas a plain `n / 0` or `n % 0`
// would be undefined behavior. Returning `n` here keeps both builds in
// agreement.
class Divisor64 {
 public:
  explicit Divisor64(uint64_t divisor) : divisor_(divisor) {}

  uint64_t GetDivisor() const { return divisor_; }

  // Returns n / divisor_, or n if divisor_ is zero.
  uint64_t Divide(uint64_t n) const {
    return (divisor_ == 0) ? n : (n / divisor_);
  }

  // Returns n % divisor_, or n if divisor_ is zero.
  uint64_t Remainder(uint64_t n) const {
    return (divisor_ == 0) ? n : (n % divisor_);
  }

 private:
  uint64_t divisor_;
};
|
|
3125
|
+
#endif // HWY_HAVE_DIV128
|
|
3126
|
+
|
|
2806
3127
|
namespace detail {
|
|
2807
3128
|
|
|
2808
3129
|
template <typename T>
|