@img/sharp-libvips-dev 0.0.1 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109)
  1. package/README.md +3 -3
  2. package/cplusplus/VConnection.cpp +54 -54
  3. package/cplusplus/VError.cpp +20 -18
  4. package/cplusplus/VImage.cpp +636 -589
  5. package/cplusplus/VInterpolate.cpp +22 -22
  6. package/cplusplus/VRegion.cpp +4 -4
  7. package/cplusplus/vips-operators.cpp +2326 -2301
  8. package/include/aom/aom_codec.h +10 -6
  9. package/include/aom/aom_decoder.h +1 -1
  10. package/include/aom/aom_encoder.h +9 -2
  11. package/include/aom/aomcx.h +72 -3
  12. package/include/cairo/cairo-ft.h +1 -1
  13. package/include/cairo/cairo-gobject.h +8 -0
  14. package/include/cairo/cairo-svg.h +3 -3
  15. package/include/cairo/cairo-version.h +2 -2
  16. package/include/cairo/cairo.h +91 -24
  17. package/include/glib-2.0/glib/gmacros.h +1 -1
  18. package/include/glib-2.0/glib/gtestutils.h +1 -1
  19. package/include/glib-2.0/gobject/gtype.h +7 -7
  20. package/include/harfbuzz/hb-version.h +2 -2
  21. package/include/hwy/aligned_allocator.h +211 -0
  22. package/include/hwy/base.h +1517 -0
  23. package/include/hwy/cache_control.h +108 -0
  24. package/include/hwy/detect_compiler_arch.h +281 -0
  25. package/include/hwy/detect_targets.h +644 -0
  26. package/include/hwy/foreach_target.h +340 -0
  27. package/include/hwy/highway.h +435 -0
  28. package/include/hwy/highway_export.h +74 -0
  29. package/include/hwy/nanobenchmark.h +171 -0
  30. package/include/hwy/ops/arm_neon-inl.h +8913 -0
  31. package/include/hwy/ops/arm_sve-inl.h +5105 -0
  32. package/include/hwy/ops/emu128-inl.h +2811 -0
  33. package/include/hwy/ops/generic_ops-inl.h +4745 -0
  34. package/include/hwy/ops/ppc_vsx-inl.h +5716 -0
  35. package/include/hwy/ops/rvv-inl.h +5070 -0
  36. package/include/hwy/ops/scalar-inl.h +1995 -0
  37. package/include/hwy/ops/set_macros-inl.h +578 -0
  38. package/include/hwy/ops/shared-inl.h +539 -0
  39. package/include/hwy/ops/tuple-inl.h +125 -0
  40. package/include/hwy/ops/wasm_128-inl.h +5917 -0
  41. package/include/hwy/ops/x86_128-inl.h +11173 -0
  42. package/include/hwy/ops/x86_256-inl.h +7529 -0
  43. package/include/hwy/ops/x86_512-inl.h +6849 -0
  44. package/include/hwy/per_target.h +44 -0
  45. package/include/hwy/print-inl.h +62 -0
  46. package/include/hwy/print.h +75 -0
  47. package/include/hwy/robust_statistics.h +148 -0
  48. package/include/hwy/targets.h +338 -0
  49. package/include/hwy/timer-inl.h +200 -0
  50. package/include/hwy/timer.h +55 -0
  51. package/include/jconfig.h +2 -2
  52. package/include/jpeglib.h +3 -2
  53. package/include/libheif/heif.h +461 -384
  54. package/include/libheif/heif_cxx.h +4 -1
  55. package/include/libheif/heif_plugin.h +1 -1
  56. package/include/libheif/heif_properties.h +138 -0
  57. package/include/libheif/heif_regions.h +866 -0
  58. package/include/libheif/heif_version.h +3 -3
  59. package/include/libpng16/pnglibconf.h +1 -1
  60. package/include/pnglibconf.h +1 -1
  61. package/include/vips/VConnection8.h +43 -49
  62. package/include/vips/VError8.h +27 -24
  63. package/include/vips/VImage8.h +4861 -4597
  64. package/include/vips/VInterpolate8.h +24 -27
  65. package/include/vips/VRegion8.h +32 -33
  66. package/include/vips/arithmetic.h +169 -169
  67. package/include/vips/basic.h +33 -33
  68. package/include/vips/buf.h +56 -54
  69. package/include/vips/colour.h +95 -95
  70. package/include/vips/connection.h +190 -193
  71. package/include/vips/conversion.h +91 -91
  72. package/include/vips/convolution.h +36 -30
  73. package/include/vips/create.h +63 -63
  74. package/include/vips/dbuf.h +35 -37
  75. package/include/vips/debug.h +65 -33
  76. package/include/vips/draw.h +41 -41
  77. package/include/vips/enumtypes.h +54 -51
  78. package/include/vips/error.h +63 -63
  79. package/include/vips/foreign.h +263 -223
  80. package/include/vips/format.h +48 -48
  81. package/include/vips/freqfilt.h +22 -22
  82. package/include/vips/gate.h +55 -47
  83. package/include/vips/generate.h +34 -34
  84. package/include/vips/header.h +111 -101
  85. package/include/vips/histogram.h +28 -28
  86. package/include/vips/image.h +213 -213
  87. package/include/vips/interpolate.h +40 -41
  88. package/include/vips/memory.h +61 -52
  89. package/include/vips/morphology.h +24 -24
  90. package/include/vips/mosaicing.h +32 -33
  91. package/include/vips/object.h +371 -357
  92. package/include/vips/operation.h +68 -67
  93. package/include/vips/private.h +76 -76
  94. package/include/vips/rect.h +26 -26
  95. package/include/vips/region.h +92 -92
  96. package/include/vips/resample.h +38 -38
  97. package/include/vips/sbuf.h +53 -54
  98. package/include/vips/semaphore.h +24 -24
  99. package/include/vips/thread.h +30 -27
  100. package/include/vips/threadpool.h +48 -49
  101. package/include/vips/transform.h +39 -39
  102. package/include/vips/type.h +90 -85
  103. package/include/vips/util.h +274 -229
  104. package/include/vips/vector.h +24 -144
  105. package/include/vips/version.h +9 -9
  106. package/include/vips/vips.h +41 -40
  107. package/include/zlib.h +23 -19
  108. package/package.json +1 -1
  109. package/versions.json +9 -9
package/include/hwy/base.h (new file)
@@ -0,0 +1,1517 @@
1
+ // Copyright 2020 Google LLC
2
+ // SPDX-License-Identifier: Apache-2.0
3
+ //
4
+ // Licensed under the Apache License, Version 2.0 (the "License");
5
+ // you may not use this file except in compliance with the License.
6
+ // You may obtain a copy of the License at
7
+ //
8
+ // http://www.apache.org/licenses/LICENSE-2.0
9
+ //
10
+ // Unless required by applicable law or agreed to in writing, software
11
+ // distributed under the License is distributed on an "AS IS" BASIS,
12
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ // See the License for the specific language governing permissions and
14
+ // limitations under the License.
15
+
16
+ #ifndef HIGHWAY_HWY_BASE_H_
17
+ #define HIGHWAY_HWY_BASE_H_
18
+
19
+ // For SIMD module implementations and their callers, target-independent.
20
+
21
+ // IWYU pragma: begin_exports
22
+ #include <stddef.h>
23
+ #include <stdint.h>
24
+
25
+ // Wrapping this into a HWY_HAS_INCLUDE causes clang-format to fail.
26
+ #if __cplusplus >= 202100L && defined(__has_include)
27
+ #if __has_include(<stdfloat>)
28
+ #include <stdfloat> // std::float16_t
29
+ #endif
30
+ #endif
31
+
32
+ #include "hwy/detect_compiler_arch.h"
33
+ #include "hwy/highway_export.h"
34
+
35
+ // "IWYU pragma: keep" does not work for these includes, so hide from the IDE.
36
+ #if !HWY_IDE
37
+
38
+ #if !defined(HWY_NO_LIBCXX)
39
+ #ifndef __STDC_FORMAT_MACROS
40
+ #define __STDC_FORMAT_MACROS // before inttypes.h
41
+ #endif
42
+ #include <inttypes.h>
43
+ #endif
44
+
45
+ #if (HWY_ARCH_X86 && !defined(HWY_NO_LIBCXX)) || HWY_COMPILER_MSVC
46
+ #include <atomic>
47
+ #endif
48
+
49
+ #endif // !HWY_IDE
50
+
51
+ // IWYU pragma: end_exports
52
+
53
+ #if HWY_COMPILER_MSVC
54
+ #include <string.h> // memcpy
55
+ #endif
56
+
57
+ //------------------------------------------------------------------------------
58
+ // Compiler-specific definitions
59
+
60
+ #define HWY_STR_IMPL(macro) #macro
61
+ #define HWY_STR(macro) HWY_STR_IMPL(macro)
62
+
63
+ #if HWY_COMPILER_MSVC
64
+
65
+ #include <intrin.h>
66
+
67
+ #define HWY_RESTRICT __restrict
68
+ #define HWY_INLINE __forceinline
69
+ #define HWY_NOINLINE __declspec(noinline)
70
+ #define HWY_FLATTEN
71
+ #define HWY_NORETURN __declspec(noreturn)
72
+ #define HWY_LIKELY(expr) (expr)
73
+ #define HWY_UNLIKELY(expr) (expr)
74
+ #define HWY_PRAGMA(tokens) __pragma(tokens)
75
+ #define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(warning(tokens))
76
+ #define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(msc)
77
+ #define HWY_MAYBE_UNUSED
78
+ #define HWY_HAS_ASSUME_ALIGNED 0
79
+ #if (_MSC_VER >= 1700)
80
+ #define HWY_MUST_USE_RESULT _Check_return_
81
+ #else
82
+ #define HWY_MUST_USE_RESULT
83
+ #endif
84
+
85
+ #else
86
+
87
+ #define HWY_RESTRICT __restrict__
88
+ // force inlining without optimization enabled creates very inefficient code
89
+ // that can cause compiler timeout
90
+ #ifdef __OPTIMIZE__
91
+ #define HWY_INLINE inline __attribute__((always_inline))
92
+ #else
93
+ #define HWY_INLINE inline
94
+ #endif
95
+ #define HWY_NOINLINE __attribute__((noinline))
96
+ #define HWY_FLATTEN __attribute__((flatten))
97
+ #define HWY_NORETURN __attribute__((noreturn))
98
+ #define HWY_LIKELY(expr) __builtin_expect(!!(expr), 1)
99
+ #define HWY_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
100
+ #define HWY_PRAGMA(tokens) _Pragma(#tokens)
101
+ #define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(GCC diagnostic tokens)
102
+ #define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(gcc)
103
+ // Encountered "attribute list cannot appear here" when using the C++17
104
+ // [[maybe_unused]], so only use the old style attribute for now.
105
+ #define HWY_MAYBE_UNUSED __attribute__((unused))
106
+ #define HWY_MUST_USE_RESULT __attribute__((warn_unused_result))
107
+
108
+ #endif // !HWY_COMPILER_MSVC
109
+
110
+ //------------------------------------------------------------------------------
111
+ // Builtin/attributes (no more #include after this point due to namespace!)
112
+
113
+ namespace hwy {
114
+
115
+ // Enables error-checking of format strings.
116
+ #if HWY_HAS_ATTRIBUTE(__format__)
117
+ #define HWY_FORMAT(idx_fmt, idx_arg) \
118
+ __attribute__((__format__(__printf__, idx_fmt, idx_arg)))
119
+ #else
120
+ #define HWY_FORMAT(idx_fmt, idx_arg)
121
+ #endif
122
+
123
+ // Returns a void* pointer which the compiler then assumes is N-byte aligned.
124
+ // Example: float* HWY_RESTRICT aligned = (float*)HWY_ASSUME_ALIGNED(in, 32);
125
+ //
126
+ // The assignment semantics are required by GCC/Clang. ICC provides an in-place
127
+ // __assume_aligned, whereas MSVC's __assume appears unsuitable.
128
+ #if HWY_HAS_BUILTIN(__builtin_assume_aligned)
129
+ #define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align))
130
+ #else
131
+ #define HWY_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */
132
+ #endif
133
+
134
+ // Clang and GCC require attributes on each function into which SIMD intrinsics
135
+ // are inlined. Support both per-function annotation (HWY_ATTR) for lambdas and
136
+ // automatic annotation via pragmas.
137
+ #if HWY_COMPILER_ICC
138
+ // As of ICC 2021.{1-9} the pragma is neither implemented nor required.
139
+ #define HWY_PUSH_ATTRIBUTES(targets_str)
140
+ #define HWY_POP_ATTRIBUTES
141
+ #elif HWY_COMPILER_CLANG
142
+ #define HWY_PUSH_ATTRIBUTES(targets_str) \
143
+ HWY_PRAGMA(clang attribute push(__attribute__((target(targets_str))), \
144
+ apply_to = function))
145
+ #define HWY_POP_ATTRIBUTES HWY_PRAGMA(clang attribute pop)
146
+ #elif HWY_COMPILER_GCC_ACTUAL
147
+ #define HWY_PUSH_ATTRIBUTES(targets_str) \
148
+ HWY_PRAGMA(GCC push_options) HWY_PRAGMA(GCC target targets_str)
149
+ #define HWY_POP_ATTRIBUTES HWY_PRAGMA(GCC pop_options)
150
+ #else
151
+ #define HWY_PUSH_ATTRIBUTES(targets_str)
152
+ #define HWY_POP_ATTRIBUTES
153
+ #endif
154
+
155
+ //------------------------------------------------------------------------------
156
+ // Macros
157
+
158
+ #define HWY_API static HWY_INLINE HWY_FLATTEN HWY_MAYBE_UNUSED
159
+
160
+ #define HWY_CONCAT_IMPL(a, b) a##b
161
+ #define HWY_CONCAT(a, b) HWY_CONCAT_IMPL(a, b)
162
+
163
+ #define HWY_MIN(a, b) ((a) < (b) ? (a) : (b))
164
+ #define HWY_MAX(a, b) ((a) > (b) ? (a) : (b))
165
+
166
+ #if HWY_COMPILER_GCC_ACTUAL
167
+ // nielskm: GCC does not support '#pragma GCC unroll' without the factor.
168
+ #define HWY_UNROLL(factor) HWY_PRAGMA(GCC unroll factor)
169
+ #define HWY_DEFAULT_UNROLL HWY_UNROLL(4)
170
+ #elif HWY_COMPILER_CLANG || HWY_COMPILER_ICC || HWY_COMPILER_ICX
171
+ #define HWY_UNROLL(factor) HWY_PRAGMA(unroll factor)
172
+ #define HWY_DEFAULT_UNROLL HWY_UNROLL()
173
+ #else
174
+ #define HWY_UNROLL(factor)
175
+ #define HWY_DEFAULT_UNROLL
176
+ #endif
177
+
178
+ // Tell a compiler that the expression always evaluates to true.
179
+ // The expression should be free from any side effects.
180
+ // Some older compilers may have trouble with complex expressions, therefore
181
+ // it is advisable to split multiple conditions into separate assume statements,
182
+ // and manually check the generated code.
183
+ // OK but could fail:
184
+ // HWY_ASSUME(x == 2 && y == 3);
185
+ // Better:
186
+ // HWY_ASSUME(x == 2);
187
+ // HWY_ASSUME(y == 3);
188
+ #if HWY_HAS_CPP_ATTRIBUTE(assume)
189
+ #define HWY_ASSUME(expr) [[assume(expr)]]
190
+ #elif HWY_COMPILER_MSVC || HWY_COMPILER_ICC
191
+ #define HWY_ASSUME(expr) __assume(expr)
192
+ // __builtin_assume() was added in clang 3.6.
193
+ #elif HWY_COMPILER_CLANG && HWY_HAS_BUILTIN(__builtin_assume)
194
+ #define HWY_ASSUME(expr) __builtin_assume(expr)
195
+ // __builtin_unreachable() was added in GCC 4.5, but __has_builtin() was added
196
+ // later, so check for the compiler version directly.
197
+ #elif HWY_COMPILER_GCC_ACTUAL >= 405
198
+ #define HWY_ASSUME(expr) \
199
+ ((expr) ? static_cast<void>(0) : __builtin_unreachable())
200
+ #else
201
+ #define HWY_ASSUME(expr) static_cast<void>(0)
202
+ #endif
203
+
204
+ // Compile-time fence to prevent undesirable code reordering. On Clang x86, the
205
+ // typical asm volatile("" : : : "memory") has no effect, whereas atomic fence
206
+ // does, without generating code.
207
+ #if HWY_ARCH_X86 && !defined(HWY_NO_LIBCXX)
208
+ #define HWY_FENCE std::atomic_thread_fence(std::memory_order_acq_rel)
209
+ #else
210
+ // TODO(janwas): investigate alternatives. On Arm, the above generates barriers.
211
+ #define HWY_FENCE
212
+ #endif
213
+
214
+ // 4 instances of a given literal value, useful as input to LoadDup128.
215
+ #define HWY_REP4(literal) literal, literal, literal, literal
216
+
217
+ HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
218
+ Abort(const char* file, int line, const char* format, ...);
219
+
220
+ #define HWY_ABORT(format, ...) \
221
+ ::hwy::Abort(__FILE__, __LINE__, format, ##__VA_ARGS__)
222
+
223
+ // Always enabled.
224
+ #define HWY_ASSERT(condition) \
225
+ do { \
226
+ if (!(condition)) { \
227
+ HWY_ABORT("Assert %s", #condition); \
228
+ } \
229
+ } while (0)
230
+
231
+ #if HWY_HAS_FEATURE(memory_sanitizer) || defined(MEMORY_SANITIZER)
232
+ #define HWY_IS_MSAN 1
233
+ #else
234
+ #define HWY_IS_MSAN 0
235
+ #endif
236
+
237
+ #if HWY_HAS_FEATURE(address_sanitizer) || defined(ADDRESS_SANITIZER)
238
+ #define HWY_IS_ASAN 1
239
+ #else
240
+ #define HWY_IS_ASAN 0
241
+ #endif
242
+
243
+ #if HWY_HAS_FEATURE(thread_sanitizer) || defined(THREAD_SANITIZER)
244
+ #define HWY_IS_TSAN 1
245
+ #else
246
+ #define HWY_IS_TSAN 0
247
+ #endif
248
+
249
+ // MSAN may cause lengthy build times or false positives e.g. in AVX3 DemoteTo.
250
+ // You can disable MSAN by adding this attribute to the function that fails.
251
+ #if HWY_IS_MSAN
252
+ #define HWY_ATTR_NO_MSAN __attribute__((no_sanitize_memory))
253
+ #else
254
+ #define HWY_ATTR_NO_MSAN
255
+ #endif
256
+
257
+ // For enabling HWY_DASSERT and shortening tests in slower debug builds
258
+ #if !defined(HWY_IS_DEBUG_BUILD)
259
+ // Clang does not define NDEBUG, but it and GCC define __OPTIMIZE__, and recent
260
+ // MSVC defines NDEBUG (if not, could instead check _DEBUG).
261
+ #if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) || HWY_IS_ASAN || \
262
+ HWY_IS_MSAN || HWY_IS_TSAN || defined(__clang_analyzer__)
263
+ #define HWY_IS_DEBUG_BUILD 1
264
+ #else
265
+ #define HWY_IS_DEBUG_BUILD 0
266
+ #endif
267
+ #endif // HWY_IS_DEBUG_BUILD
268
+
269
+ #if HWY_IS_DEBUG_BUILD
270
+ #define HWY_DASSERT(condition) HWY_ASSERT(condition)
271
+ #else
272
+ #define HWY_DASSERT(condition) \
273
+ do { \
274
+ } while (0)
275
+ #endif
276
+
277
+ //------------------------------------------------------------------------------
278
+ // CopyBytes / ZeroBytes
279
+
280
+ #if HWY_COMPILER_MSVC
281
+ #pragma intrinsic(memcpy)
282
+ #pragma intrinsic(memset)
283
+ #endif
284
+
285
+ // The source/destination must not overlap/alias.
286
+ template <size_t kBytes, typename From, typename To>
287
+ HWY_API void CopyBytes(const From* from, To* to) {
288
+ #if HWY_COMPILER_MSVC
289
+ memcpy(to, from, kBytes);
290
+ #else
291
+ __builtin_memcpy(static_cast<void*>(to), static_cast<const void*>(from),
292
+ kBytes);
293
+ #endif
294
+ }
295
+
296
+ HWY_API void CopyBytes(const void* HWY_RESTRICT from, void* HWY_RESTRICT to,
297
+ size_t num_of_bytes_to_copy) {
298
+ #if HWY_COMPILER_MSVC
299
+ memcpy(to, from, num_of_bytes_to_copy);
300
+ #else
301
+ __builtin_memcpy(to, from, num_of_bytes_to_copy);
302
+ #endif
303
+ }
304
+
305
+ // Same as CopyBytes, but for same-sized objects; avoids a size argument.
306
+ template <typename From, typename To>
307
+ HWY_API void CopySameSize(const From* HWY_RESTRICT from, To* HWY_RESTRICT to) {
308
+ static_assert(sizeof(From) == sizeof(To), "");
309
+ CopyBytes<sizeof(From)>(from, to);
310
+ }
311
+
312
+ template <size_t kBytes, typename To>
313
+ HWY_API void ZeroBytes(To* to) {
314
+ #if HWY_COMPILER_MSVC
315
+ memset(to, 0, kBytes);
316
+ #else
317
+ __builtin_memset(to, 0, kBytes);
318
+ #endif
319
+ }
320
+
321
+ HWY_API void ZeroBytes(void* to, size_t num_bytes) {
322
+ #if HWY_COMPILER_MSVC
323
+ memset(to, 0, num_bytes);
324
+ #else
325
+ __builtin_memset(to, 0, num_bytes);
326
+ #endif
327
+ }
328
+
329
+ //------------------------------------------------------------------------------
330
+ // kMaxVectorSize (undocumented, pending removal)
331
+
332
+ #if HWY_ARCH_X86
333
+ static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 64; // AVX-512
334
+ #elif HWY_ARCH_RVV && defined(__riscv_v_intrinsic) && \
335
+ __riscv_v_intrinsic >= 11000
336
+ // Not actually an upper bound on the size.
337
+ static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 4096;
338
+ #else
339
+ static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
340
+ #endif
341
+
342
+ //------------------------------------------------------------------------------
343
+ // Alignment
344
+
345
+ // Potentially useful for LoadDup128 and capped vectors. In other cases, arrays
346
+ // should be allocated dynamically via aligned_allocator.h because Lanes() may
347
+ // exceed the stack size.
348
+ #if HWY_ARCH_X86
349
+ #define HWY_ALIGN_MAX alignas(64)
350
+ #elif HWY_ARCH_RVV && defined(__riscv_v_intrinsic) && \
351
+ __riscv_v_intrinsic >= 11000
352
+ #define HWY_ALIGN_MAX alignas(8) // only elements need be aligned
353
+ #else
354
+ #define HWY_ALIGN_MAX alignas(16)
355
+ #endif
356
+
357
+ //------------------------------------------------------------------------------
358
+ // Lane types
359
+
360
+ #pragma pack(push, 1)
361
+
362
+ // float16_t load/store/conversion intrinsics are always supported on Armv8 and
363
+ // VFPv4 (except with MSVC). On Armv7 Clang requires __ARM_FP & 2; GCC requires
364
+ // -mfp16-format=ieee.
365
+ #if (HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC) || \
366
+ (HWY_COMPILER_CLANG && defined(__ARM_FP) && (__ARM_FP & 2)) || \
367
+ (HWY_COMPILER_GCC_ACTUAL && defined(__ARM_FP16_FORMAT_IEEE))
368
+ #define HWY_NEON_HAVE_FLOAT16C 1
369
+ #else
370
+ #define HWY_NEON_HAVE_FLOAT16C 0
371
+ #endif
372
+
373
+ // C11 extension ISO/IEC TS 18661-3:2015 but not supported on all targets.
374
+ // Required if HWY_HAVE_FLOAT16, i.e. RVV with zvfh or AVX3_SPR (with
375
+ // sufficiently new compiler supporting avx512fp16). Do not use on clang-cl,
376
+ // which is missing __extendhfsf2.
377
+ #if ((HWY_ARCH_RVV && defined(__riscv_zvfh) && HWY_COMPILER_CLANG) || \
378
+ (HWY_ARCH_X86 && defined(__SSE2__) && \
379
+ ((HWY_COMPILER_CLANG >= 1600 && !HWY_COMPILER_CLANGCL) || \
380
+ HWY_COMPILER_GCC_ACTUAL >= 1200)))
381
+ #define HWY_HAVE_C11_FLOAT16 1
382
+ #else
383
+ #define HWY_HAVE_C11_FLOAT16 0
384
+ #endif
385
+
386
+ // If 1, both __bf16 and a limited set of *_bf16 SVE intrinsics are available:
387
+ // create/get/set/dup, ld/st, sel, rev, trn, uzp, zip.
388
+ #if HWY_ARCH_ARM_A64 && defined(__ARM_FEATURE_SVE_BF16)
389
+ #define HWY_SVE_HAVE_BFLOAT16 1
390
+ #else
391
+ #define HWY_SVE_HAVE_BFLOAT16 0
392
+ #endif
393
+
394
+ // Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
395
+ // by concatenating base type and bits. We use a wrapper class instead of a
396
+ // typedef to the native type to ensure that the same symbols, e.g. for VQSort,
397
+ // are generated regardless of F16 support; see #1684.
398
+ struct float16_t {
399
+ #if HWY_NEON_HAVE_FLOAT16C // ACLE's __fp16
400
+ using Raw = __fp16;
401
+ #elif HWY_HAVE_C11_FLOAT16 // C11 _Float16
402
+ using Raw = _Float16;
403
+ #elif __cplusplus > 202002L && defined(__STDCPP_FLOAT16_T__) // C++23
404
+ using Raw = std::float16_t;
405
+ #else
406
+ #define HWY_EMULATE_FLOAT16
407
+ using Raw = uint16_t;
408
+ Raw bits;
409
+ #endif // float16_t
410
+
411
+ // When backed by a native type, ensure the wrapper behaves like the native
412
+ // type by forwarding all operators. Unfortunately it seems difficult to reuse
413
+ // this code in a base class, so we repeat it in bfloat16_t.
414
+ #ifndef HWY_EMULATE_FLOAT16
415
+ Raw raw;
416
+
417
+ float16_t() noexcept = default;
418
+ template <typename T>
419
+ constexpr float16_t(T arg) noexcept : raw(static_cast<Raw>(arg)) {}
420
+ float16_t& operator=(Raw arg) noexcept {
421
+ raw = arg;
422
+ return *this;
423
+ }
424
+ constexpr float16_t(const float16_t&) noexcept = default;
425
+ float16_t& operator=(const float16_t&) noexcept = default;
426
+ constexpr operator Raw() const noexcept { return raw; }
427
+
428
+ template <typename T>
429
+ float16_t& operator+=(T rhs) noexcept {
430
+ raw = static_cast<Raw>(raw + rhs);
431
+ return *this;
432
+ }
433
+
434
+ template <typename T>
435
+ float16_t& operator-=(T rhs) noexcept {
436
+ raw = static_cast<Raw>(raw - rhs);
437
+ return *this;
438
+ }
439
+
440
+ template <typename T>
441
+ float16_t& operator*=(T rhs) noexcept {
442
+ raw = static_cast<Raw>(raw * rhs);
443
+ return *this;
444
+ }
445
+
446
+ template <typename T>
447
+ float16_t& operator/=(T rhs) noexcept {
448
+ raw = static_cast<Raw>(raw / rhs);
449
+ return *this;
450
+ }
451
+
452
+ float16_t operator--() noexcept {
453
+ raw = static_cast<Raw>(raw - Raw{1});
454
+ return *this;
455
+ }
456
+
457
+ float16_t operator--(int) noexcept {
458
+ raw = static_cast<Raw>(raw - Raw{1});
459
+ return *this;
460
+ }
461
+
462
+ float16_t operator++() noexcept {
463
+ raw = static_cast<Raw>(raw + Raw{1});
464
+ return *this;
465
+ }
466
+
467
+ float16_t operator++(int) noexcept {
468
+ raw = static_cast<Raw>(raw + Raw{1});
469
+ return *this;
470
+ }
471
+
472
+ constexpr float16_t operator-() const noexcept {
473
+ return float16_t(static_cast<Raw>(-raw));
474
+ }
475
+ constexpr float16_t operator+() const noexcept { return *this; }
476
+ #endif // HWY_EMULATE_FLOAT16
477
+ };
478
+
479
+ #ifndef HWY_EMULATE_FLOAT16
480
+ constexpr inline bool operator==(float16_t lhs, float16_t rhs) noexcept {
481
+ return lhs.raw == rhs.raw;
482
+ }
483
+ constexpr inline bool operator!=(float16_t lhs, float16_t rhs) noexcept {
484
+ return lhs.raw != rhs.raw;
485
+ }
486
+ constexpr inline bool operator<(float16_t lhs, float16_t rhs) noexcept {
487
+ return lhs.raw < rhs.raw;
488
+ }
489
+ constexpr inline bool operator<=(float16_t lhs, float16_t rhs) noexcept {
490
+ return lhs.raw <= rhs.raw;
491
+ }
492
+ constexpr inline bool operator>(float16_t lhs, float16_t rhs) noexcept {
493
+ return lhs.raw > rhs.raw;
494
+ }
495
+ constexpr inline bool operator>=(float16_t lhs, float16_t rhs) noexcept {
496
+ return lhs.raw >= rhs.raw;
497
+ }
498
+ #endif // HWY_EMULATE_FLOAT16
499
+
500
+ struct bfloat16_t {
501
+ #if HWY_SVE_HAVE_BFLOAT16
502
+ using Raw = __bf16;
503
+ #elif __cplusplus >= 202100L && defined(__STDCPP_BFLOAT16_T__) // C++23
504
+ using Raw = std::bfloat16_t;
505
+ #else
506
+ #define HWY_EMULATE_BFLOAT16
507
+ using Raw = uint16_t;
508
+ Raw bits;
509
+ #endif
510
+
511
+ #ifndef HWY_EMULATE_BFLOAT16
512
+ Raw raw;
513
+
514
+ bfloat16_t() noexcept = default;
515
+ template <typename T>
516
+ constexpr bfloat16_t(T arg) noexcept : raw(static_cast<Raw>(arg)) {}
517
+ bfloat16_t& operator=(Raw arg) noexcept {
518
+ raw = arg;
519
+ return *this;
520
+ }
521
+ constexpr bfloat16_t(const bfloat16_t&) noexcept = default;
522
+ bfloat16_t& operator=(const bfloat16_t&) noexcept = default;
523
+ constexpr operator Raw() const noexcept { return raw; }
524
+
525
+ template <typename T>
526
+ bfloat16_t& operator+=(T rhs) noexcept {
527
+ raw = static_cast<Raw>(raw + rhs);
528
+ return *this;
529
+ }
530
+
531
+ template <typename T>
532
+ bfloat16_t& operator-=(T rhs) noexcept {
533
+ raw = static_cast<Raw>(raw - rhs);
534
+ return *this;
535
+ }
536
+
537
+ template <typename T>
538
+ bfloat16_t& operator*=(T rhs) noexcept {
539
+ raw = static_cast<Raw>(raw * rhs);
540
+ return *this;
541
+ }
542
+
543
+ template <typename T>
544
+ bfloat16_t& operator/=(T rhs) noexcept {
545
+ raw = static_cast<Raw>(raw / rhs);
546
+ return *this;
547
+ }
548
+
549
+ bfloat16_t operator--() noexcept {
550
+ raw = static_cast<Raw>(raw - Raw{1});
551
+ return *this;
552
+ }
553
+
554
+ bfloat16_t operator--(int) noexcept {
555
+ raw = static_cast<Raw>(raw - Raw{1});
556
+ return *this;
557
+ }
558
+
559
+ bfloat16_t operator++() noexcept {
560
+ raw = static_cast<Raw>(raw + Raw{1});
561
+ return *this;
562
+ }
563
+
564
+ bfloat16_t operator++(int) noexcept {
565
+ raw = static_cast<Raw>(raw + Raw{1});
566
+ return *this;
567
+ }
568
+
569
+ constexpr bfloat16_t operator-() const noexcept {
570
+ return bfloat16_t(static_cast<Raw>(-raw));
571
+ }
572
+ constexpr bfloat16_t operator+() const noexcept { return *this; }
573
+ #endif // HWY_EMULATE_BFLOAT16
574
+ };
575
+
576
+ #ifndef HWY_EMULATE_BFLOAT16
577
+ constexpr inline bool operator==(bfloat16_t lhs, bfloat16_t rhs) noexcept {
578
+ return lhs.raw == rhs.raw;
579
+ }
580
+ constexpr inline bool operator!=(bfloat16_t lhs, bfloat16_t rhs) noexcept {
581
+ return lhs.raw != rhs.raw;
582
+ }
583
+ constexpr inline bool operator<(bfloat16_t lhs, bfloat16_t rhs) noexcept {
584
+ return lhs.raw < rhs.raw;
585
+ }
586
+ constexpr inline bool operator<=(bfloat16_t lhs, bfloat16_t rhs) noexcept {
587
+ return lhs.raw <= rhs.raw;
588
+ }
589
+ constexpr inline bool operator>(bfloat16_t lhs, bfloat16_t rhs) noexcept {
590
+ return lhs.raw > rhs.raw;
591
+ }
592
+ constexpr inline bool operator>=(bfloat16_t lhs, bfloat16_t rhs) noexcept {
593
+ return lhs.raw >= rhs.raw;
594
+ }
595
+ #endif // HWY_EMULATE_BFLOAT16
596
+
597
+ #pragma pack(pop)
598
+
599
+ HWY_API float F32FromF16(float16_t f16) {
600
+ #ifdef HWY_EMULATE_FLOAT16
601
+ uint16_t bits16;
602
+ CopySameSize(&f16, &bits16);
603
+ const uint32_t sign = static_cast<uint32_t>(bits16 >> 15);
604
+ const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
605
+ const uint32_t mantissa = bits16 & 0x3FF;
606
+
607
+ // Subnormal or zero
608
+ if (biased_exp == 0) {
609
+ const float subnormal =
610
+ (1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024));
611
+ return sign ? -subnormal : subnormal;
612
+ }
613
+
614
+ // Normalized: convert the representation directly (faster than ldexp/tables).
615
+ const uint32_t biased_exp32 = biased_exp + (127 - 15);
616
+ const uint32_t mantissa32 = mantissa << (23 - 10);
617
+ const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
618
+
619
+ float result;
620
+ CopySameSize(&bits32, &result);
621
+ return result;
622
+ #else
623
+ return static_cast<float>(f16);
624
+ #endif
625
+ }
626
+
627
+ HWY_API float16_t F16FromF32(float f32) {
628
+ #ifdef HWY_EMULATE_FLOAT16
629
+ uint32_t bits32;
630
+ CopySameSize(&f32, &bits32);
631
+ const uint32_t sign = bits32 >> 31;
632
+ const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
633
+ const uint32_t mantissa32 = bits32 & 0x7FFFFF;
634
+
635
+ const int32_t exp = HWY_MIN(static_cast<int32_t>(biased_exp32) - 127, 15);
636
+
637
+ // Tiny or zero => zero.
638
+ float16_t out;
639
+ if (exp < -24) {
640
+ // restore original sign
641
+ const uint16_t bits = static_cast<uint16_t>(sign << 15);
642
+ CopySameSize(&bits, &out);
643
+ return out;
644
+ }
645
+
646
+ uint32_t biased_exp16, mantissa16;
647
+
648
+ // exp = [-24, -15] => subnormal
649
+ if (exp < -14) {
650
+ biased_exp16 = 0;
651
+ const uint32_t sub_exp = static_cast<uint32_t>(-14 - exp);
652
+ HWY_DASSERT(1 <= sub_exp && sub_exp < 11);
653
+ mantissa16 = static_cast<uint32_t>((1u << (10 - sub_exp)) +
654
+ (mantissa32 >> (13 + sub_exp)));
655
+ } else {
656
+ // exp = [-14, 15]
657
+ biased_exp16 = static_cast<uint32_t>(exp + 15);
658
+ HWY_DASSERT(1 <= biased_exp16 && biased_exp16 < 31);
659
+ mantissa16 = mantissa32 >> 13;
660
+ }
661
+
662
+ HWY_DASSERT(mantissa16 < 1024);
663
+ const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
664
+ HWY_DASSERT(bits16 < 0x10000);
665
+ const uint16_t narrowed = static_cast<uint16_t>(bits16); // big-endian safe
666
+ CopySameSize(&narrowed, &out);
667
+ return out;
668
+ #else
669
+ return float16_t(static_cast<float16_t::Raw>(f32));
670
+ #endif
671
+ }
672
+
673
+ HWY_API float F32FromBF16(bfloat16_t bf) {
674
+ uint16_t bits16;
675
+ CopyBytes<2>(&bf, &bits16);
676
+ uint32_t bits = bits16;
677
+ bits <<= 16;
678
+ float f;
679
+ CopySameSize(&bits, &f);
680
+ return f;
681
+ }
682
+
683
+ HWY_API float F32FromF16Mem(const void* ptr) {
684
+ float16_t f16;
685
+ CopyBytes<2>(ptr, &f16);
686
+ return F32FromF16(f16);
687
+ }
688
+
689
+ HWY_API float F32FromBF16Mem(const void* ptr) {
690
+ bfloat16_t bf;
691
+ CopyBytes<2>(ptr, &bf);
692
+ return F32FromBF16(bf);
693
+ }
694
+
695
+ HWY_API bfloat16_t BF16FromF32(float f) {
696
+ uint32_t bits;
697
+ CopySameSize(&f, &bits);
698
+ const uint16_t bits16 = static_cast<uint16_t>(bits >> 16);
699
+ bfloat16_t bf;
700
+ CopySameSize(&bits16, &bf);
701
+ return bf;
702
+ }
703
+
704
+ using float32_t = float;
705
+ using float64_t = double;
706
+
707
+ #pragma pack(push, 1)
708
+
709
+ // Aligned 128-bit type. Cannot use __int128 because clang doesn't yet align it:
710
+ // https://reviews.llvm.org/D86310
711
+ struct alignas(16) uint128_t {
712
+ uint64_t lo; // little-endian layout
713
+ uint64_t hi;
714
+ };
715
+
716
+ // 64 bit key plus 64 bit value. Faster than using uint128_t when only the key
717
+ // field is to be compared (Lt128Upper instead of Lt128).
718
+ struct alignas(16) K64V64 {
719
+ uint64_t value; // little-endian layout
720
+ uint64_t key;
721
+ };
722
+
723
+ // 32 bit key plus 32 bit value. Allows vqsort recursions to terminate earlier
724
+ // than when considering both to be a 64-bit key.
725
+ struct alignas(8) K32V32 {
726
+ uint32_t value; // little-endian layout
727
+ uint32_t key;
728
+ };
729
+
730
+ #pragma pack(pop)
731
+
732
+ #ifdef HWY_EMULATE_FLOAT16
733
+
734
+ static inline HWY_MAYBE_UNUSED bool operator<(const float16_t& a,
735
+ const float16_t& b) {
736
+ return F32FromF16(a) < F32FromF16(b);
737
+ }
738
+ // Required for std::greater.
739
+ static inline HWY_MAYBE_UNUSED bool operator>(const float16_t& a,
740
+ const float16_t& b) {
741
+ return F32FromF16(a) > F32FromF16(b);
742
+ }
743
+ static inline HWY_MAYBE_UNUSED bool operator==(const float16_t& a,
744
+ const float16_t& b) {
745
+ return F32FromF16(a) == F32FromF16(b);
746
+ }
747
+
748
+ #endif // HWY_EMULATE_FLOAT16
749
+
750
+ static inline HWY_MAYBE_UNUSED bool operator<(const uint128_t& a,
751
+ const uint128_t& b) {
752
+ return (a.hi == b.hi) ? a.lo < b.lo : a.hi < b.hi;
753
+ }
754
+ // Required for std::greater.
755
+ static inline HWY_MAYBE_UNUSED bool operator>(const uint128_t& a,
756
+ const uint128_t& b) {
757
+ return b < a;
758
+ }
759
+ static inline HWY_MAYBE_UNUSED bool operator==(const uint128_t& a,
760
+ const uint128_t& b) {
761
+ return a.lo == b.lo && a.hi == b.hi;
762
+ }
763
+
764
// K64V64 comparisons consider only the key field; the value is ignored.
static inline HWY_MAYBE_UNUSED bool operator<(const K64V64& a,
                                              const K64V64& b) {
  return a.key < b.key;
}
// Required for std::greater.
static inline HWY_MAYBE_UNUSED bool operator>(const K64V64& a,
                                              const K64V64& b) {
  return b < a;
}
static inline HWY_MAYBE_UNUSED bool operator==(const K64V64& a,
                                               const K64V64& b) {
  return a.key == b.key;
}
777
+
778
// K32V32 comparisons consider only the key field; the value is ignored.
static inline HWY_MAYBE_UNUSED bool operator<(const K32V32& a,
                                              const K32V32& b) {
  return a.key < b.key;
}
// Required for std::greater.
static inline HWY_MAYBE_UNUSED bool operator>(const K32V32& a,
                                              const K32V32& b) {
  return b < a;
}
static inline HWY_MAYBE_UNUSED bool operator==(const K32V32& a,
                                               const K32V32& b) {
  return a.key == b.key;
}
791
+
792
//------------------------------------------------------------------------------
// Controlling overload resolution (SFINAE)

// Minimal equivalent of std::enable_if, avoiding a <type_traits> dependency.
template <bool Condition>
struct EnableIfT {};
template <>
struct EnableIfT<true> {
  using type = void;
};

template <bool Condition>
using EnableIf = typename EnableIfT<Condition>::type;

// Equivalent of std::is_same: value is 1 iff T and U are the same type.
template <typename T, typename U>
struct IsSameT {
  enum { value = 0 };
};

template <typename T>
struct IsSameT<T, T> {
  enum { value = 1 };
};

template <typename T, typename U>
HWY_API constexpr bool IsSame() {
  return IsSameT<T, U>::value;
}

// Equivalent of std::conditional: selects Then if Condition, else Else.
template <bool Condition, typename Then, typename Else>
struct IfT {
  using type = Then;
};

template <class Then, class Else>
struct IfT<false, Then, Else> {
  using type = Else;
};

template <bool Condition, typename Then, typename Else>
using If = typename IfT<Condition, Then, Else>::type;
832
+
833
// Insert into template/function arguments to enable this overload only for
// vectors of exactly, at most (LE), or more than (GT) this many bytes.
//
// As an example, checking for a total size of 16 bytes will match both
// Simd<uint8_t, 16, 0> and Simd<uint8_t, 8, 1>.
#define HWY_IF_V_SIZE(T, kN, bytes) \
  hwy::EnableIf<kN * sizeof(T) == bytes>* = nullptr
#define HWY_IF_V_SIZE_LE(T, kN, bytes) \
  hwy::EnableIf<kN * sizeof(T) <= bytes>* = nullptr
#define HWY_IF_V_SIZE_GT(T, kN, bytes) \
  hwy::EnableIf<(kN * sizeof(T) > bytes)>* = nullptr

// Enable only for the given number of lanes (exact, at most, more than).
#define HWY_IF_LANES(kN, lanes) hwy::EnableIf<(kN == lanes)>* = nullptr
#define HWY_IF_LANES_LE(kN, lanes) hwy::EnableIf<(kN <= lanes)>* = nullptr
#define HWY_IF_LANES_GT(kN, lanes) hwy::EnableIf<(kN > lanes)>* = nullptr

// Enable based on the lane type's category. Note that HWY_IF_SIGNED matches
// only signed *integers* (floats and special floats are excluded).
#define HWY_IF_UNSIGNED(T) hwy::EnableIf<!IsSigned<T>()>* = nullptr
#define HWY_IF_SIGNED(T)                                                   \
  hwy::EnableIf<IsSigned<T>() && !IsFloat<T>() && !IsSpecialFloat<T>()>* = \
      nullptr
#define HWY_IF_FLOAT(T) hwy::EnableIf<hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_NOT_FLOAT(T) hwy::EnableIf<!hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_FLOAT3264(T) hwy::EnableIf<hwy::IsFloat3264<T>()>* = nullptr
#define HWY_IF_NOT_FLOAT3264(T) hwy::EnableIf<!hwy::IsFloat3264<T>()>* = nullptr
#define HWY_IF_SPECIAL_FLOAT(T) \
  hwy::EnableIf<hwy::IsSpecialFloat<T>()>* = nullptr
#define HWY_IF_NOT_SPECIAL_FLOAT(T) \
  hwy::EnableIf<!hwy::IsSpecialFloat<T>()>* = nullptr
#define HWY_IF_FLOAT_OR_SPECIAL(T) \
  hwy::EnableIf<hwy::IsFloat<T>() || hwy::IsSpecialFloat<T>()>* = nullptr
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL(T) \
  hwy::EnableIf<!hwy::IsFloat<T>() && !hwy::IsSpecialFloat<T>()>* = nullptr

// Enable based on the lane type's size in bytes.
#define HWY_IF_T_SIZE(T, bytes) hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
#define HWY_IF_NOT_T_SIZE(T, bytes) \
  hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
// bit_array = 0x102 means 1 or 8 bytes. There is no NONE_OF because it sounds
// too similar. If you want the opposite of this (2 or 4 bytes), ask for those
// bits explicitly (0x14) instead of attempting to 'negate' 0x102.
#define HWY_IF_T_SIZE_ONE_OF(T, bit_array) \
  hwy::EnableIf<((size_t{1} << sizeof(T)) & (bit_array)) != 0>* = nullptr

// Use instead of HWY_IF_T_SIZE to avoid ambiguity with float16_t/float/double
// overloads.
#define HWY_IF_UI16(T) \
  hwy::EnableIf<IsSame<T, uint16_t>() || IsSame<T, int16_t>()>* = nullptr
#define HWY_IF_UI32(T) \
  hwy::EnableIf<IsSame<T, uint32_t>() || IsSame<T, int32_t>()>* = nullptr
#define HWY_IF_UI64(T) \
  hwy::EnableIf<IsSame<T, uint64_t>() || IsSame<T, int64_t>()>* = nullptr
#define HWY_IF_BF16(T) hwy::EnableIf<IsSame<T, hwy::bfloat16_t>()>* = nullptr
#define HWY_IF_F16(T) hwy::EnableIf<IsSame<T, hwy::float16_t>()>* = nullptr

// Enable only if one (<=16-byte) block of N lanes of T holds exactly LANES.
#define HWY_IF_LANES_PER_BLOCK(T, N, LANES) \
  hwy::EnableIf<HWY_MIN(sizeof(T) * N, 16) / sizeof(T) == (LANES)>* = nullptr
888
+
889
// Empty struct used as a size tag type.
template <size_t N>
struct SizeTag {};

// Minimal equivalents of std::remove_const / std::remove_reference, avoiding a
// <type_traits> dependency.
template <class T>
struct RemoveConstT {
  using type = T;
};
template <class T>
struct RemoveConstT<const T> {
  using type = T;
};

template <class T>
using RemoveConst = typename RemoveConstT<T>::type;

template <class T>
struct RemoveRefT {
  using type = T;
};
template <class T>
struct RemoveRefT<T&> {
  using type = T;
};
template <class T>
struct RemoveRefT<T&&> {
  using type = T;
};

template <class T>
using RemoveRef = typename RemoveRefT<T>::type;
920
+
921
//------------------------------------------------------------------------------
// Type relations

namespace detail {

// Per-lane-type metadata: same-size types of the other categories (Unsigned/
// Signed/Float, where such a type exists), the same-category type with double/
// half the bits (Wide/Narrow, where representable), plus category flags used
// by TypeTag. Only explicit specializations are provided.
template <typename T>
struct Relations;
template <>
struct Relations<uint8_t> {
  using Unsigned = uint8_t;
  using Signed = int8_t;
  using Wide = uint16_t;
  enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<int8_t> {
  using Unsigned = uint8_t;
  using Signed = int8_t;
  using Wide = int16_t;
  enum { is_signed = 1, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<uint16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Float = float16_t;
  using Wide = uint32_t;
  using Narrow = uint8_t;
  enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<int16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Float = float16_t;
  using Wide = int32_t;
  using Narrow = int8_t;
  enum { is_signed = 1, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<uint32_t> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
  using Wide = uint64_t;
  using Narrow = uint16_t;
  enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<int32_t> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
  using Wide = int64_t;
  using Narrow = int16_t;
  enum { is_signed = 1, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<uint64_t> {
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
  using Wide = uint128_t;
  using Narrow = uint32_t;
  enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<int64_t> {
  // No Wide: no signed 128-bit type is defined here.
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
  using Narrow = int32_t;
  enum { is_signed = 1, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<uint128_t> {
  using Unsigned = uint128_t;
  using Narrow = uint64_t;
  enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<float16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Float = float16_t;
  using Wide = float;
  enum { is_signed = 1, is_float = 1, is_bf16 = 0 };
};
template <>
struct Relations<bfloat16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Wide = float;
  enum { is_signed = 1, is_float = 1, is_bf16 = 1 };
};
template <>
struct Relations<float> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
  using Wide = double;
  using Narrow = float16_t;
  enum { is_signed = 1, is_float = 1, is_bf16 = 0 };
};
template <>
struct Relations<double> {
  // No Wide: no 128-bit floating-point type is defined here.
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
  using Narrow = float;
  enum { is_signed = 1, is_float = 1, is_bf16 = 0 };
};

// Inverse mapping: the unsigned/signed/float types of a given size in bytes
// (members are only defined where such a type exists).
template <size_t N>
struct TypeFromSize;
template <>
struct TypeFromSize<1> {
  using Unsigned = uint8_t;
  using Signed = int8_t;
};
template <>
struct TypeFromSize<2> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Float = float16_t;
};
template <>
struct TypeFromSize<4> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
};
template <>
struct TypeFromSize<8> {
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
};
template <>
struct TypeFromSize<16> {
  using Unsigned = uint128_t;
};

}  // namespace detail
1065
+
1066
// Aliases for types of a different category, but the same size.
template <typename T>
using MakeUnsigned = typename detail::Relations<T>::Unsigned;
template <typename T>
using MakeSigned = typename detail::Relations<T>::Signed;
template <typename T>
using MakeFloat = typename detail::Relations<T>::Float;

// Aliases for types of the same category, but different size.
template <typename T>
using MakeWide = typename detail::Relations<T>::Wide;
template <typename T>
using MakeNarrow = typename detail::Relations<T>::Narrow;

// Obtain type from its size [bytes].
template <size_t N>
using UnsignedFromSize = typename detail::TypeFromSize<N>::Unsigned;
template <size_t N>
using SignedFromSize = typename detail::TypeFromSize<N>::Signed;
template <size_t N>
using FloatFromSize = typename detail::TypeFromSize<N>::Float;

// Category tags for tag dispatch; values are in the upper byte.
// Avoid confusion with SizeTag where the parameter is a lane size.
using UnsignedTag = SizeTag<0>;
using SignedTag = SizeTag<0x100>;  // integer
using FloatTag = SizeTag<0x200>;
using SpecialTag = SizeTag<0x300>;
1093
+
1094
// Returns the category tag of T (Unsigned/Signed/Float/Special): the flags in
// detail::Relations sum to 0..3, shifted into the tags' 0x000..0x300 range.
template <typename T, class R = detail::Relations<T>>
constexpr auto TypeTag()
    -> hwy::SizeTag<((R::is_signed + R::is_float + R::is_bf16) << 8)> {
  return hwy::SizeTag<((R::is_signed + R::is_float + R::is_bf16) << 8)>();
}

// For when we only want to distinguish FloatTag from everything else.
using NonFloatTag = SizeTag<0x400>;

// Returns FloatTag (0x200) for float types, else NonFloatTag (0x400).
template <typename T, class R = detail::Relations<T>>
constexpr auto IsFloatTag() -> hwy::SizeTag<(R::is_float ? 0x200 : 0x400)> {
  return hwy::SizeTag<(R::is_float ? 0x200 : 0x400)>();
}
1107
+
1108
//------------------------------------------------------------------------------
// Type traits

// True only for the native binary32/binary64 types.
template <typename T>
HWY_API constexpr bool IsFloat3264() {
  return IsSame<T, float>() || IsSame<T, double>();
}

template <typename T>
HWY_API constexpr bool IsFloat() {
  // Cannot use T(1.25) != T(1) for float16_t, which can only be converted to or
  // from a float, not compared. Include float16_t in case HWY_HAVE_FLOAT16=1.
  return IsSame<T, float16_t>() || IsFloat3264<T>();
}

// These types are often special-cased and not supported in all ops.
template <typename T>
HWY_API constexpr bool IsSpecialFloat() {
  return IsSame<T, float16_t>() || IsSame<T, bfloat16_t>();
}

template <typename T>
HWY_API constexpr bool IsSigned() {
  return T(0) > T(-1);
}
// The generic comparison above is not usable for the 16-bit float wrappers
// (see the comment in IsFloat), so specialize: both formats are signed.
template <>
constexpr bool IsSigned<float16_t>() {
  return true;
}
template <>
constexpr bool IsSigned<bfloat16_t>() {
  return true;
}
1141
+
1142
// Largest/smallest representable integer values.
template <typename T>
HWY_API constexpr T LimitsMax() {
  static_assert(!IsFloat<T>(), "Only for integer types");
  using TU = MakeUnsigned<T>;
  // All bits set for unsigned; all but the sign bit for signed.
  return static_cast<T>(IsSigned<T>() ? (static_cast<TU>(~0ull) >> 1)
                                      : static_cast<TU>(~0ull));
}
template <typename T>
HWY_API constexpr T LimitsMin() {
  static_assert(!IsFloat<T>(), "Only for integer types");
  // For signed T this is -(max + 1), computed without overflowing.
  return IsSigned<T>() ? T(-1) - LimitsMax<T>() : T(0);
}
1155
+
1156
// Largest/smallest representable value (integer or float). This naming avoids
// confusion with numeric_limits<float>::min() (the smallest positive value).
// Cannot be constexpr because we use CopySameSize for [b]float16_t.
template <typename T>
HWY_API T LowestValue() {
  return LimitsMin<T>();
}
template <>
HWY_INLINE bfloat16_t LowestValue<bfloat16_t>() {
  const uint16_t kBits = 0xFF7F;  // -1.1111111 x 2^127
  bfloat16_t ret;
  CopySameSize(&kBits, &ret);
  return ret;
}
template <>
HWY_INLINE float16_t LowestValue<float16_t>() {
  const uint16_t kBits = 0xFBFF;  // -1.1111111111 x 2^15
  float16_t ret;
  CopySameSize(&kBits, &ret);
  return ret;
}
template <>
HWY_INLINE float LowestValue<float>() {
  return -3.402823466e+38F;
}
template <>
HWY_INLINE double LowestValue<double>() {
  return -1.7976931348623158e+308;
}

template <typename T>
HWY_API T HighestValue() {
  return LimitsMax<T>();
}
template <>
HWY_INLINE bfloat16_t HighestValue<bfloat16_t>() {
  const uint16_t kBits = 0x7F7F;  // 1.1111111 x 2^127
  bfloat16_t ret;
  CopySameSize(&kBits, &ret);
  return ret;
}
template <>
HWY_INLINE float16_t HighestValue<float16_t>() {
  const uint16_t kBits = 0x7BFF;  // 1.1111111111 x 2^15
  float16_t ret;
  CopySameSize(&kBits, &ret);
  return ret;
}
template <>
HWY_INLINE float HighestValue<float>() {
  return 3.402823466e+38F;
}
template <>
HWY_INLINE double HighestValue<double>() {
  return 1.7976931348623158e+308;
}
1212
+
1213
// Difference between 1.0 and the next representable value. Equal to
// 1 / (1ULL << MantissaBits<T>()), but hard-coding ensures precision.
// Integer fallback returns 1 (the smallest positive difference).
template <typename T>
HWY_API T Epsilon() {
  return 1;
}
template <>
HWY_INLINE bfloat16_t Epsilon<bfloat16_t>() {
  const uint16_t kBits = 0x3C00;  // 0.0078125
  bfloat16_t ret;
  CopySameSize(&kBits, &ret);
  return ret;
}
template <>
HWY_INLINE float16_t Epsilon<float16_t>() {
  const uint16_t kBits = 0x1400;  // 0.0009765625
  float16_t ret;
  CopySameSize(&kBits, &ret);
  return ret;
}
template <>
HWY_INLINE float Epsilon<float>() {
  return 1.192092896e-7f;
}
template <>
HWY_INLINE double Epsilon<double>() {
  return 2.2204460492503131e-16;
}
1241
+
1242
// Returns width in bits of the mantissa field in IEEE binary16/32/64.
// The unspecialized template is deliberately not instantiable.
template <typename T>
constexpr int MantissaBits() {
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
  return 0;
}
template <>
constexpr int MantissaBits<bfloat16_t>() {
  return 7;
}
template <>
constexpr int MantissaBits<float16_t>() {
  return 10;
}
template <>
constexpr int MantissaBits<float>() {
  return 23;
}
template <>
constexpr int MantissaBits<double>() {
  return 52;
}
1264
+
1265
// Returns the (left-shifted by one bit) IEEE binary16/32/64 representation with
// the largest possible (biased) exponent field. Used by IsInf.
template <typename T>
constexpr MakeSigned<T> MaxExponentTimes2() {
  return -(MakeSigned<T>{1} << (MantissaBits<T>() + 1));
}

// Returns bitmask of the sign bit in IEEE binary16/32/64.
template <typename T>
constexpr MakeUnsigned<T> SignMask() {
  return MakeUnsigned<T>{1} << (sizeof(T) * 8 - 1);
}

// Returns bitmask of the exponent field in IEEE binary16/32/64.
// ~(1 << M) + 1 negates (1 << M), setting all bits at and above the mantissa;
// clearing the sign bit then leaves exactly the exponent bits.
template <typename T>
constexpr MakeUnsigned<T> ExponentMask() {
  return (~(MakeUnsigned<T>{1} << MantissaBits<T>()) + 1) & ~SignMask<T>();
}

// Returns bitmask of the mantissa field in IEEE binary16/32/64.
template <typename T>
constexpr MakeUnsigned<T> MantissaMask() {
  return (MakeUnsigned<T>{1} << MantissaBits<T>()) - 1;
}
1289
+
1290
// Returns 1 << mantissa_bits as a floating-point number. All integers whose
// absolute value are less than this can be represented exactly.
// The unspecialized template is deliberately not instantiable.
template <typename T>
HWY_INLINE T MantissaEnd() {
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
  return 0;
}
template <>
HWY_INLINE bfloat16_t MantissaEnd<bfloat16_t>() {
  const uint16_t kBits = 0x4300;  // 1.0 x 2^7
  bfloat16_t ret;
  CopySameSize(&kBits, &ret);
  return ret;
}
template <>
HWY_INLINE float16_t MantissaEnd<float16_t>() {
  const uint16_t kBits = 0x6400;  // 1.0 x 2^10
  float16_t ret;
  CopySameSize(&kBits, &ret);
  return ret;
}
template <>
HWY_INLINE float MantissaEnd<float>() {
  return 8388608.0f;  // 1 << 23
}
template <>
HWY_INLINE double MantissaEnd<double>() {
  // floating point literal with p52 requires C++17.
  return 4503599627370496.0;  // 1 << 52
}
1320
+
1321
// Returns width in bits of the exponent field in IEEE binary16/32/64.
template <typename T>
constexpr int ExponentBits() {
  // Exponent := remaining bits after deducting sign and mantissa.
  return 8 * sizeof(T) - 1 - MantissaBits<T>();
}

// Returns largest value of the biased exponent field in IEEE binary16/32/64
// (all exponent bits set),
// right-shifted so that the LSB is bit zero. Example: 0xFF for float.
// This is expressed as a signed integer for more efficient comparison.
template <typename T>
constexpr MakeSigned<T> MaxExponentField() {
  return (MakeSigned<T>{1} << ExponentBits<T>()) - 1;
}
1335
+
1336
//------------------------------------------------------------------------------
// Helper functions

// Integer division, rounding the quotient up toward +infinity.
template <typename T1, typename T2>
constexpr inline T1 DivCeil(T1 dividend, T2 divisor) {
  return (dividend + divisor - 1) / divisor;
}

// Works for any `align`; if a power of two, compiler emits ADD+AND.
constexpr inline size_t RoundUpTo(size_t amount, size_t alignment) {
  return DivCeil(amount, alignment) * alignment;
}
1348
+
1349
// Returns the number of trailing zero bits, i.e. the index of the lowest set
// bit. Undefined results for x == 0.
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
#if HWY_COMPILER_MSVC
  unsigned long index;  // NOLINT
  _BitScanForward(&index, x);
  return index;
#else   // HWY_COMPILER_MSVC
  return static_cast<size_t>(__builtin_ctz(x));
#endif  // HWY_COMPILER_MSVC
}

HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) {
#if HWY_COMPILER_MSVC
#if HWY_ARCH_X86_64
  unsigned long index;  // NOLINT
  _BitScanForward64(&index, x);
  return index;
#else   // HWY_ARCH_X86_64
  // _BitScanForward64 not available: scan the two 32-bit halves, lower first.
  uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
  unsigned long index;  // NOLINT
  if (lsb == 0) {
    uint32_t msb = static_cast<uint32_t>(x >> 32u);
    _BitScanForward(&index, msb);
    return 32 + index;
  } else {
    _BitScanForward(&index, lsb);
    return index;
  }
#endif  // HWY_ARCH_X86_64
#else   // HWY_COMPILER_MSVC
  return static_cast<size_t>(__builtin_ctzll(x));
#endif  // HWY_COMPILER_MSVC
}
1383
+
1384
// Returns the number of leading zero bits above the highest set bit.
// Undefined results for x == 0.
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x) {
#if HWY_COMPILER_MSVC
  unsigned long index;  // NOLINT
  // _BitScanReverse returns the bit index; convert to a leading-zero count.
  _BitScanReverse(&index, x);
  return 31 - index;
#else   // HWY_COMPILER_MSVC
  return static_cast<size_t>(__builtin_clz(x));
#endif  // HWY_COMPILER_MSVC
}

HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
#if HWY_COMPILER_MSVC
#if HWY_ARCH_X86_64
  unsigned long index;  // NOLINT
  _BitScanReverse64(&index, x);
  return 63 - index;
#else   // HWY_ARCH_X86_64
  // _BitScanReverse64 not available: scan the two 32-bit halves, upper first.
  const uint32_t msb = static_cast<uint32_t>(x >> 32u);
  unsigned long index;  // NOLINT
  if (msb == 0) {
    const uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
    _BitScanReverse(&index, lsb);
    return 63 - index;
  } else {
    _BitScanReverse(&index, msb);
    return 31 - index;
  }
#endif  // HWY_ARCH_X86_64
#else   // HWY_COMPILER_MSVC
  return static_cast<size_t>(__builtin_clzll(x));
#endif  // HWY_COMPILER_MSVC
}
1418
+
1419
// Returns the number of set bits in x.
HWY_API size_t PopCount(uint64_t x) {
#if HWY_COMPILER_GCC  // includes clang
  return static_cast<size_t>(__builtin_popcountll(x));
  // This instruction has a separate feature flag, but is often called from
  // non-SIMD code, so we don't want to require dynamic dispatch. It was first
  // supported by Intel in Nehalem (SSE4.2), but MSVC only predefines a macro
  // for AVX, so check for that.
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 && defined(__AVX__)
  return _mm_popcnt_u64(x);
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
  return _mm_popcnt_u32(static_cast<uint32_t>(x & 0xFFFFFFFFu)) +
         _mm_popcnt_u32(static_cast<uint32_t>(x >> 32));
#else
  // Portable bit-parallel fallback: sum bits in progressively wider fields.
  x -= ((x >> 1) & 0x5555555555555555ULL);
  x = (((x >> 2) & 0x3333333333333333ULL) + (x & 0x3333333333333333ULL));
  x = (((x >> 4) + x) & 0x0F0F0F0F0F0F0F0FULL);
  x += (x >> 8);
  x += (x >> 16);
  x += (x >> 32);
  return static_cast<size_t>(x & 0x7Fu);
#endif
}
1441
+
1442
// Skip HWY_API due to GCC "function not considered for inlining". Previously
// such errors were caused by underlying type mismatches, but it's not clear
// what is still mismatched despite all the casts.
// Returns floor(log2(x)). The recursion only terminates for x >= 1.
template <typename TI>
/*HWY_API*/ constexpr size_t FloorLog2(TI x) {
  return x == TI{1}
             ? 0
             : static_cast<size_t>(FloorLog2(static_cast<TI>(x >> 1)) + 1);
}

// Returns ceil(log2(x)), via FloorLog2(x - 1) + 1 for x > 1.
template <typename TI>
/*HWY_API*/ constexpr size_t CeilLog2(TI x) {
  return x == TI{1}
             ? 0
             : static_cast<size_t>(FloorLog2(static_cast<TI>(x - 1)) + 1);
}
1458
+
1459
// Adds n to t; for floating-point T, a plain addition suffices.
template <typename T>
HWY_INLINE constexpr T AddWithWraparound(hwy::FloatTag /*tag*/, T t, size_t n) {
  return t + static_cast<T>(n);
}

// Adds n to t with modular wraparound: computing in the unsigned type avoids
// signed-overflow undefined behavior.
template <typename T>
HWY_INLINE constexpr T AddWithWraparound(hwy::NonFloatTag /*tag*/, T t,
                                         size_t n) {
  using TU = MakeUnsigned<T>;
  return static_cast<T>(
      static_cast<TU>(static_cast<TU>(t) + static_cast<TU>(n)) &
      hwy::LimitsMax<TU>());
}
1472
+
1473
#if HWY_COMPILER_MSVC && HWY_ARCH_X86_64
#pragma intrinsic(_umul128)
#endif

// 64 x 64 = 128 bit multiplication.
// Returns the lower 64 bits of a * b and stores the upper 64 bits in *upper.
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) {
#if defined(__SIZEOF_INT128__)
  __uint128_t product = (__uint128_t)a * (__uint128_t)b;
  *upper = (uint64_t)(product >> 64);
  return (uint64_t)(product & 0xFFFFFFFFFFFFFFFFULL);
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
  return _umul128(a, b, upper);
#else
  // Portable fallback: four 32x32->64 partial products, summed with carries.
  constexpr uint64_t kLo32 = 0xFFFFFFFFU;
  const uint64_t lo_lo = (a & kLo32) * (b & kLo32);
  const uint64_t hi_lo = (a >> 32) * (b & kLo32);
  const uint64_t lo_hi = (a & kLo32) * (b >> 32);
  const uint64_t hi_hi = (a >> 32) * (b >> 32);
  const uint64_t t = (lo_lo >> 32) + (hi_lo & kLo32) + lo_hi;
  *upper = (hi_lo >> 32) + (t >> 32) + hi_hi;
  return (t << 32) | (lo_lo & kLo32);
#endif
}
1496
+
1497
// Prevents the compiler from eliding the computations that led to "output".
template <class T>
HWY_API void PreventElision(T&& output) {
#if HWY_COMPILER_MSVC
  // MSVC does not support inline assembly anymore (and never supported GCC's
  // RTL constraints). Self-assignment with #pragma optimize("off") might be
  // expected to prevent elision, but it does not with MSVC 2015. Type-punning
  // with volatile pointers generates inefficient code on MSVC 2017.
  // Storing into a static atomic is an observable side effect the optimizer
  // must preserve.
  static std::atomic<RemoveRef<T>> dummy;
  dummy.store(output, std::memory_order_relaxed);
#else
  // Works by indicating to the compiler that "output" is being read and
  // modified. The +r constraint avoids unnecessary writes to memory, but only
  // works for built-in types (typically FuncOutput).
  asm volatile("" : "+r"(output) : : "memory");
#endif
}
1514
+
1515
+ } // namespace hwy
1516
+
1517
+ #endif // HIGHWAY_HWY_BASE_H_