@img/sharp-libvips-dev 1.0.2 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118) hide show
  1. package/README.md +1 -2
  2. package/include/aom/aom_decoder.h +1 -1
  3. package/include/aom/aom_encoder.h +7 -1
  4. package/include/aom/aom_image.h +24 -12
  5. package/include/aom/aom_integer.h +3 -3
  6. package/include/aom/aomcx.h +15 -0
  7. package/include/aom/aomdx.h +5 -2
  8. package/include/archive.h +7 -5
  9. package/include/archive_entry.h +5 -3
  10. package/include/cgif.h +3 -0
  11. package/include/freetype2/freetype/config/ftoption.h +1 -1
  12. package/include/fribidi/fribidi-config.h +2 -2
  13. package/include/fribidi/fribidi-unicode-version.h +3 -3
  14. package/include/glib-2.0/gio/gappinfo.h +40 -25
  15. package/include/glib-2.0/gio/gasyncresult.h +1 -1
  16. package/include/glib-2.0/gio/gconverter.h +5 -0
  17. package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
  18. package/include/glib-2.0/gio/gfile.h +16 -0
  19. package/include/glib-2.0/gio/gio-visibility.h +34 -0
  20. package/include/glib-2.0/gio/gsettings.h +8 -0
  21. package/include/glib-2.0/gio/gvfs.h +2 -2
  22. package/include/glib-2.0/girepository/gi-visibility.h +34 -0
  23. package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
  24. package/include/glib-2.0/glib/giochannel.h +2 -2
  25. package/include/glib-2.0/glib/glib-visibility.h +34 -0
  26. package/include/glib-2.0/glib/gmacros.h +12 -5
  27. package/include/glib-2.0/glib/gmain.h +93 -7
  28. package/include/glib-2.0/glib/gqsort.h +8 -1
  29. package/include/glib-2.0/glib/gstrfuncs.h +0 -12
  30. package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
  31. package/include/glib-2.0/glib/gunicode.h +1 -1
  32. package/include/glib-2.0/glib/gversionmacros.h +9 -0
  33. package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
  34. package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
  35. package/include/glib-2.0/gobject/gtype.h +6 -6
  36. package/include/harfbuzz/hb-buffer.h +6 -0
  37. package/include/harfbuzz/hb-common.h +6 -9
  38. package/include/harfbuzz/hb-cplusplus.hh +8 -11
  39. package/include/harfbuzz/hb-subset.h +17 -4
  40. package/include/harfbuzz/hb-version.h +3 -3
  41. package/include/hwy/abort.h +28 -0
  42. package/include/hwy/aligned_allocator.h +48 -1
  43. package/include/hwy/base.h +235 -34
  44. package/include/hwy/detect_compiler_arch.h +84 -10
  45. package/include/hwy/detect_targets.h +95 -29
  46. package/include/hwy/foreach_target.h +12 -1
  47. package/include/hwy/highway.h +205 -50
  48. package/include/hwy/ops/arm_neon-inl.h +841 -99
  49. package/include/hwy/ops/arm_sve-inl.h +413 -141
  50. package/include/hwy/ops/emu128-inl.h +373 -360
  51. package/include/hwy/ops/generic_ops-inl.h +804 -401
  52. package/include/hwy/ops/inside-inl.h +691 -0
  53. package/include/hwy/ops/ppc_vsx-inl.h +456 -166
  54. package/include/hwy/ops/rvv-inl.h +537 -249
  55. package/include/hwy/ops/scalar-inl.h +169 -79
  56. package/include/hwy/ops/set_macros-inl.h +106 -18
  57. package/include/hwy/ops/shared-inl.h +23 -0
  58. package/include/hwy/ops/wasm_128-inl.h +130 -108
  59. package/include/hwy/ops/x86_128-inl.h +1892 -577
  60. package/include/hwy/ops/x86_256-inl.h +625 -184
  61. package/include/hwy/ops/x86_512-inl.h +733 -131
  62. package/include/hwy/targets.h +22 -21
  63. package/include/hwy/timer-inl.h +3 -3
  64. package/include/hwy/timer.h +5 -1
  65. package/include/libheif/heif.h +170 -15
  66. package/include/libheif/heif_items.h +237 -0
  67. package/include/libheif/heif_properties.h +38 -2
  68. package/include/libheif/heif_regions.h +1 -1
  69. package/include/libheif/heif_version.h +2 -2
  70. package/include/libpng16/pnglibconf.h +1 -1
  71. package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
  72. package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
  73. package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
  74. package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
  75. package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
  76. package/include/libxml2/libxml/HTMLparser.h +12 -19
  77. package/include/libxml2/libxml/c14n.h +1 -12
  78. package/include/libxml2/libxml/debugXML.h +1 -1
  79. package/include/libxml2/libxml/encoding.h +9 -0
  80. package/include/libxml2/libxml/entities.h +12 -1
  81. package/include/libxml2/libxml/hash.h +19 -0
  82. package/include/libxml2/libxml/list.h +2 -2
  83. package/include/libxml2/libxml/nanohttp.h +17 -0
  84. package/include/libxml2/libxml/parser.h +61 -55
  85. package/include/libxml2/libxml/parserInternals.h +9 -1
  86. package/include/libxml2/libxml/pattern.h +6 -0
  87. package/include/libxml2/libxml/tree.h +32 -12
  88. package/include/libxml2/libxml/uri.h +11 -0
  89. package/include/libxml2/libxml/valid.h +29 -2
  90. package/include/libxml2/libxml/xinclude.h +7 -0
  91. package/include/libxml2/libxml/xmlIO.h +21 -4
  92. package/include/libxml2/libxml/xmlerror.h +14 -0
  93. package/include/libxml2/libxml/xmlexports.h +111 -15
  94. package/include/libxml2/libxml/xmlmemory.h +8 -45
  95. package/include/libxml2/libxml/xmlreader.h +2 -0
  96. package/include/libxml2/libxml/xmlsave.h +5 -0
  97. package/include/libxml2/libxml/xmlunicode.h +165 -1
  98. package/include/libxml2/libxml/xmlversion.h +15 -179
  99. package/include/libxml2/libxml/xmlwriter.h +1 -0
  100. package/include/libxml2/libxml/xpath.h +4 -0
  101. package/include/pango-1.0/pango/pango-features.h +3 -3
  102. package/include/pango-1.0/pango/pango-item.h +4 -2
  103. package/include/pango-1.0/pango/pango-version-macros.h +25 -0
  104. package/include/pango-1.0/pango/pangofc-font.h +2 -1
  105. package/include/pnglibconf.h +1 -1
  106. package/include/vips/util.h +1 -2
  107. package/include/vips/version.h +4 -4
  108. package/include/webp/decode.h +58 -56
  109. package/include/webp/demux.h +25 -21
  110. package/include/webp/encode.h +44 -39
  111. package/include/webp/mux.h +76 -15
  112. package/include/webp/mux_types.h +2 -1
  113. package/include/webp/sharpyuv/sharpyuv.h +77 -8
  114. package/include/webp/types.h +29 -8
  115. package/include/zconf.h +1 -1
  116. package/include/zlib.h +12 -12
  117. package/package.json +1 -1
  118. package/versions.json +14 -15
@@ -21,11 +21,13 @@
21
21
  #include <algorithm>
22
22
  #include <array>
23
23
  #include <cassert>
24
+ #include <cstdint>
24
25
  #include <cstring>
25
26
  #include <initializer_list>
26
27
  #include <memory>
27
28
  #include <type_traits>
28
29
  #include <utility>
30
+ #include <vector>
29
31
 
30
32
  #include "hwy/base.h"
31
33
  #include "hwy/per_target.h"
@@ -35,9 +37,14 @@ namespace hwy {
35
37
  // Minimum alignment of allocated memory for use in HWY_ASSUME_ALIGNED, which
36
38
  // requires a literal. To prevent false sharing, this should be at least the
37
39
  // L1 cache line size, usually 64 bytes. However, Intel's L2 prefetchers may
38
- // access pairs of lines, and POWER8 also has 128.
40
+ // access pairs of lines, and M1 L2 and POWER8 lines are also 128 bytes.
39
41
  #define HWY_ALIGNMENT 128
40
42
 
43
+ template <typename T>
44
+ HWY_API constexpr bool IsAligned(T* ptr, size_t align = HWY_ALIGNMENT) {
45
+ return reinterpret_cast<uintptr_t>(ptr) % align == 0;
46
+ }
47
+
41
48
  // Pointers to functions equivalent to malloc/free with an opaque void* passed
42
49
  // to them.
43
50
  using AllocPtr = void* (*)(void* opaque, size_t bytes);
@@ -124,6 +131,46 @@ AlignedUniquePtr<T> MakeUniqueAligned(Args&&... args) {
124
131
  AlignedDeleter());
125
132
  }
126
133
 
134
+ template <class T>
135
+ struct AlignedAllocator {
136
+ using value_type = T;
137
+
138
+ AlignedAllocator() = default;
139
+
140
+ template <class V>
141
+ explicit AlignedAllocator(const AlignedAllocator<V>&) noexcept {}
142
+
143
+ template <class V>
144
+ value_type* allocate(V n) {
145
+ static_assert(std::is_integral<V>::value,
146
+ "AlignedAllocator only supports integer types");
147
+ static_assert(sizeof(V) <= sizeof(std::size_t),
148
+ "V n must be smaller or equal size_t to avoid overflow");
149
+ return static_cast<value_type*>(
150
+ AllocateAlignedBytes(static_cast<std::size_t>(n) * sizeof(value_type)));
151
+ }
152
+
153
+ template <class V>
154
+ void deallocate(value_type* p, HWY_MAYBE_UNUSED V n) {
155
+ return FreeAlignedBytes(p, nullptr, nullptr);
156
+ }
157
+ };
158
+
159
+ template <class T, class V>
160
+ constexpr bool operator==(const AlignedAllocator<T>&,
161
+ const AlignedAllocator<V>&) noexcept {
162
+ return true;
163
+ }
164
+
165
+ template <class T, class V>
166
+ constexpr bool operator!=(const AlignedAllocator<T>&,
167
+ const AlignedAllocator<V>&) noexcept {
168
+ return false;
169
+ }
170
+
171
+ template <class T>
172
+ using AlignedVector = std::vector<T, AlignedAllocator<T>>;
173
+
127
174
  // Helpers for array allocators (avoids overflow)
128
175
  namespace detail {
129
176
 
@@ -16,7 +16,7 @@
16
16
  #ifndef HIGHWAY_HWY_BASE_H_
17
17
  #define HIGHWAY_HWY_BASE_H_
18
18
 
19
- // For SIMD module implementations and their callers, target-independent.
19
+ // Target-independent definitions.
20
20
 
21
21
  // IWYU pragma: begin_exports
22
22
  #include <stddef.h>
@@ -25,11 +25,17 @@
25
25
  #include "hwy/detect_compiler_arch.h"
26
26
  #include "hwy/highway_export.h"
27
27
 
28
- #if HWY_COMPILER_MSVC && defined(_MSVC_LANG) && _MSVC_LANG > __cplusplus
29
- #define HWY_CXX_LANG _MSVC_LANG
30
- #else
31
- #define HWY_CXX_LANG __cplusplus
32
- #endif
28
+ // API version (https://semver.org/); keep in sync with CMakeLists.txt.
29
+ #define HWY_MAJOR 1
30
+ #define HWY_MINOR 2
31
+ #define HWY_PATCH 0
32
+
33
+ // True if the Highway version >= major.minor.0. Added in 1.2.0.
34
+ #define HWY_VERSION_GE(major, minor) \
35
+ (HWY_MAJOR > (major) || (HWY_MAJOR == (major) && HWY_MINOR >= (minor)))
36
+ // True if the Highway version < major.minor.0. Added in 1.2.0.
37
+ #define HWY_VERSION_LT(major, minor) \
38
+ (HWY_MAJOR < (major) || (HWY_MAJOR == (major) && HWY_MINOR < (minor)))
33
39
 
34
40
  // "IWYU pragma: keep" does not work for these includes, so hide from the IDE.
35
41
  #if !HWY_IDE
@@ -47,14 +53,25 @@
47
53
 
48
54
  #endif // !HWY_IDE
49
55
 
50
- #if !defined(HWY_NO_LIBCXX) && HWY_CXX_LANG > 201703L && \
51
- __cpp_impl_three_way_comparison >= 201907L && defined(__has_include) && \
52
- !defined(HWY_DISABLE_CXX20_THREE_WAY_COMPARE)
56
+ #ifndef HWY_HAVE_COMPARE_HEADER // allow override
57
+ #define HWY_HAVE_COMPARE_HEADER 0
58
+ #if defined(__has_include) // note: wrapper macro fails on Clang ~17
53
59
  #if __has_include(<compare>)
60
+ #undef HWY_HAVE_COMPARE_HEADER
61
+ #define HWY_HAVE_COMPARE_HEADER 1
62
+ #endif // __has_include
63
+ #endif // defined(__has_include)
64
+ #endif // HWY_HAVE_COMPARE_HEADER
65
+
66
+ #ifndef HWY_HAVE_CXX20_THREE_WAY_COMPARE // allow override
67
+ #if !defined(HWY_NO_LIBCXX) && defined(__cpp_impl_three_way_comparison) && \
68
+ __cpp_impl_three_way_comparison >= 201907L && HWY_HAVE_COMPARE_HEADER
54
69
  #include <compare>
55
70
  #define HWY_HAVE_CXX20_THREE_WAY_COMPARE 1
71
+ #else
72
+ #define HWY_HAVE_CXX20_THREE_WAY_COMPARE 0
56
73
  #endif
57
- #endif
74
+ #endif // HWY_HAVE_CXX20_THREE_WAY_COMPARE
58
75
 
59
76
  // IWYU pragma: end_exports
60
77
 
@@ -72,6 +89,7 @@
72
89
 
73
90
  #include <intrin.h>
74
91
 
92
+ #define HWY_FUNCTION __FUNCSIG__ // function name + template args
75
93
  #define HWY_RESTRICT __restrict
76
94
  #define HWY_INLINE __forceinline
77
95
  #define HWY_NOINLINE __declspec(noinline)
@@ -92,6 +110,7 @@
92
110
 
93
111
  #else
94
112
 
113
+ #define HWY_FUNCTION __PRETTY_FUNCTION__ // function name + template args
95
114
  #define HWY_RESTRICT __restrict__
96
115
  // force inlining without optimization enabled creates very inefficient code
97
116
  // that can cause compiler timeout
@@ -139,9 +158,10 @@ namespace hwy {
139
158
  #define HWY_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */
140
159
  #endif
141
160
 
142
- // Special case to increases required alignment
161
+ // Returns a pointer whose type is `type` (T*), while allowing the compiler to
162
+ // assume that the untyped pointer `ptr` is aligned to a multiple of alignof(T).
143
163
  #define HWY_RCAST_ALIGNED(type, ptr) \
144
- reinterpret_cast<type>(HWY_ASSUME_ALIGNED((ptr), alignof(type)))
164
+ reinterpret_cast<type>(HWY_ASSUME_ALIGNED((ptr), alignof(RemovePtr<type>)))
145
165
 
146
166
  // Clang and GCC require attributes on each function into which SIMD intrinsics
147
167
  // are inlined. Support both per-function annotation (HWY_ATTR) for lambdas and
@@ -240,24 +260,41 @@ HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
240
260
  } \
241
261
  } while (0)
242
262
 
243
- #if HWY_HAS_FEATURE(memory_sanitizer) || defined(MEMORY_SANITIZER)
263
+ #if HWY_HAS_FEATURE(memory_sanitizer) || defined(MEMORY_SANITIZER) || \
264
+ defined(__SANITIZE_MEMORY__)
244
265
  #define HWY_IS_MSAN 1
245
266
  #else
246
267
  #define HWY_IS_MSAN 0
247
268
  #endif
248
269
 
249
- #if HWY_HAS_FEATURE(address_sanitizer) || defined(ADDRESS_SANITIZER)
270
+ #if HWY_HAS_FEATURE(address_sanitizer) || defined(ADDRESS_SANITIZER) || \
271
+ defined(__SANITIZE_ADDRESS__)
250
272
  #define HWY_IS_ASAN 1
251
273
  #else
252
274
  #define HWY_IS_ASAN 0
253
275
  #endif
254
276
 
255
- #if HWY_HAS_FEATURE(thread_sanitizer) || defined(THREAD_SANITIZER)
277
+ #if HWY_HAS_FEATURE(hwaddress_sanitizer) || defined(HWADDRESS_SANITIZER) || \
278
+ defined(__SANITIZE_HWADDRESS__)
279
+ #define HWY_IS_HWASAN 1
280
+ #else
281
+ #define HWY_IS_HWASAN 0
282
+ #endif
283
+
284
+ #if HWY_HAS_FEATURE(thread_sanitizer) || defined(THREAD_SANITIZER) || \
285
+ defined(__SANITIZE_THREAD__)
256
286
  #define HWY_IS_TSAN 1
257
287
  #else
258
288
  #define HWY_IS_TSAN 0
259
289
  #endif
260
290
 
291
+ #if HWY_HAS_FEATURE(undefined_behavior_sanitizer) || \
292
+ defined(UNDEFINED_BEHAVIOR_SANITIZER)
293
+ #define HWY_IS_UBSAN 1
294
+ #else
295
+ #define HWY_IS_UBSAN 0
296
+ #endif
297
+
261
298
  // MSAN may cause lengthy build times or false positives e.g. in AVX3 DemoteTo.
262
299
  // You can disable MSAN by adding this attribute to the function that fails.
263
300
  #if HWY_IS_MSAN
@@ -271,7 +308,8 @@ HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
271
308
  // Clang does not define NDEBUG, but it and GCC define __OPTIMIZE__, and recent
272
309
  // MSVC defines NDEBUG (if not, could instead check _DEBUG).
273
310
  #if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) || HWY_IS_ASAN || \
274
- HWY_IS_MSAN || HWY_IS_TSAN || defined(__clang_analyzer__)
311
+ HWY_IS_HWASAN || HWY_IS_MSAN || HWY_IS_TSAN || HWY_IS_UBSAN || \
312
+ defined(__clang_analyzer__)
275
313
  #define HWY_IS_DEBUG_BUILD 1
276
314
  #else
277
315
  #define HWY_IS_DEBUG_BUILD 0
@@ -286,16 +324,6 @@ HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
286
324
  } while (0)
287
325
  #endif
288
326
 
289
- #if __cpp_constexpr >= 201304L
290
- #define HWY_CXX14_CONSTEXPR constexpr
291
- #else
292
- #define HWY_CXX14_CONSTEXPR
293
- #endif
294
-
295
- #ifndef HWY_HAVE_CXX20_THREE_WAY_COMPARE
296
- #define HWY_HAVE_CXX20_THREE_WAY_COMPARE 0
297
- #endif
298
-
299
327
  //------------------------------------------------------------------------------
300
328
  // CopyBytes / ZeroBytes
301
329
 
@@ -304,9 +332,8 @@ HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
304
332
  #pragma intrinsic(memset)
305
333
  #endif
306
334
 
307
- // The source/destination must not overlap/alias.
308
335
  template <size_t kBytes, typename From, typename To>
309
- HWY_API void CopyBytes(const From* from, To* to) {
336
+ HWY_API void CopyBytes(const From* HWY_RESTRICT from, To* HWY_RESTRICT to) {
310
337
  #if HWY_COMPILER_MSVC
311
338
  memcpy(to, from, kBytes);
312
339
  #else
@@ -352,7 +379,7 @@ HWY_API void ZeroBytes(void* to, size_t num_bytes) {
352
379
 
353
380
  #if HWY_ARCH_X86
354
381
  static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 64; // AVX-512
355
- #elif HWY_ARCH_RVV && defined(__riscv_v_intrinsic) && \
382
+ #elif HWY_ARCH_RISCV && defined(__riscv_v_intrinsic) && \
356
383
  __riscv_v_intrinsic >= 11000
357
384
  // Not actually an upper bound on the size.
358
385
  static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 4096;
@@ -368,7 +395,7 @@ static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
368
395
  // exceed the stack size.
369
396
  #if HWY_ARCH_X86
370
397
  #define HWY_ALIGN_MAX alignas(64)
371
- #elif HWY_ARCH_RVV && defined(__riscv_v_intrinsic) && \
398
+ #elif HWY_ARCH_RISCV && defined(__riscv_v_intrinsic) && \
372
399
  __riscv_v_intrinsic >= 11000
373
400
  #define HWY_ALIGN_MAX alignas(8) // only elements need be aligned
374
401
  #else
@@ -559,6 +586,30 @@ using RemoveRef = typename RemoveRefT<T>::type;
559
586
  template <class T>
560
587
  using RemoveCvRef = RemoveConst<RemoveVolatile<RemoveRef<T>>>;
561
588
 
589
+ template <class T>
590
+ struct RemovePtrT {
591
+ using type = T;
592
+ };
593
+ template <class T>
594
+ struct RemovePtrT<T*> {
595
+ using type = T;
596
+ };
597
+ template <class T>
598
+ struct RemovePtrT<const T*> {
599
+ using type = T;
600
+ };
601
+ template <class T>
602
+ struct RemovePtrT<volatile T*> {
603
+ using type = T;
604
+ };
605
+ template <class T>
606
+ struct RemovePtrT<const volatile T*> {
607
+ using type = T;
608
+ };
609
+
610
+ template <class T>
611
+ using RemovePtr = typename RemovePtrT<T>::type;
612
+
562
613
  // Insert into template/function arguments to enable this overload only for
563
614
  // vectors of exactly, at most (LE), or more than (GT) this many bytes.
564
615
  //
@@ -576,6 +627,7 @@ using RemoveCvRef = RemoveConst<RemoveVolatile<RemoveRef<T>>>;
576
627
  #define HWY_IF_LANES_GT(kN, lanes) hwy::EnableIf<(kN > lanes)>* = nullptr
577
628
 
578
629
  #define HWY_IF_UNSIGNED(T) hwy::EnableIf<!hwy::IsSigned<T>()>* = nullptr
630
+ #define HWY_IF_NOT_UNSIGNED(T) hwy::EnableIf<hwy::IsSigned<T>()>* = nullptr
579
631
  #define HWY_IF_SIGNED(T) \
580
632
  hwy::EnableIf<hwy::IsSigned<T>() && !hwy::IsFloat<T>() && \
581
633
  !hwy::IsSpecialFloat<T>()>* = nullptr
@@ -1003,7 +1055,7 @@ HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From& val) {
1003
1055
 
1004
1056
  // RVV with f16 extension supports _Float16 and f16 vector ops. If set, implies
1005
1057
  // HWY_HAVE_FLOAT16.
1006
- #if HWY_ARCH_RVV && defined(__riscv_zvfh) && HWY_COMPILER_CLANG >= 1600
1058
+ #if HWY_ARCH_RISCV && defined(__riscv_zvfh) && HWY_COMPILER_CLANG >= 1600
1007
1059
  #define HWY_RVV_HAVE_F16_VEC 1
1008
1060
  #else
1009
1061
  #define HWY_RVV_HAVE_F16_VEC 0
@@ -1351,8 +1403,22 @@ HWY_API HWY_F16_CONSTEXPR float16_t F16FromF32(float f32) {
1351
1403
  // 1[01] + 10 = 1[11]
1352
1404
  // 1[10] + 10 = C0[00] (round up toward even with C=1 carry out)
1353
1405
  // 1[11] + 10 = C0[01] (round up toward even with C=1 carry out)
1354
- const uint32_t odd_bit = (mantissa32 >> 13) & 1;
1355
- const uint32_t rounded = mantissa32 + odd_bit + 0xFFF;
1406
+
1407
+ // If |f32| >= 2^-24, f16_ulp_bit_idx is the index of the F32 mantissa bit
1408
+ // that will be shifted down into the ULP bit of the rounded down F16 result
1409
+
1410
+ // The biased F32 exponent of 2^-14 (the smallest positive normal F16 value)
1411
+ // is 113, and bit 13 of the F32 mantissa will be shifted down into the ULP
1412
+ // bit of the rounded down F16 result if |f32| >= 2^-14
1413
+
1414
+ // If |f32| < 2^-24, f16_ulp_bit_idx is equal to 24 as there are 24 mantissa
1415
+ // bits (including the implied 1 bit) in the mantissa of a normal F32 value
1416
+ // and as we want to round up the mantissa if |f32| > 2^-25 && |f32| < 2^-24
1417
+ const int32_t f16_ulp_bit_idx =
1418
+ HWY_MIN(HWY_MAX(126 - static_cast<int32_t>(biased_exp32), 13), 24);
1419
+ const uint32_t odd_bit = ((mantissa32 | 0x800000u) >> f16_ulp_bit_idx) & 1;
1420
+ const uint32_t rounded =
1421
+ mantissa32 + odd_bit + (uint32_t{1} << (f16_ulp_bit_idx - 1)) - 1u;
1356
1422
  const bool carry = rounded >= (1u << 23);
1357
1423
 
1358
1424
  const int32_t exp = static_cast<int32_t>(biased_exp32) - 127 + carry;
@@ -1741,12 +1807,42 @@ HWY_API HWY_BF16_CONSTEXPR float F32FromBF16(bfloat16_t bf) {
1741
1807
  #endif
1742
1808
  }
1743
1809
 
1810
+ namespace detail {
1811
+
1812
+ // Returns the increment to add to the bits of a finite F32 value to round a
1813
+ // finite F32 to the nearest BF16 value
1814
+ static HWY_INLINE HWY_MAYBE_UNUSED constexpr uint32_t F32BitsToBF16RoundIncr(
1815
+ const uint32_t f32_bits) {
1816
+ return static_cast<uint32_t>(((f32_bits & 0x7FFFFFFFu) < 0x7F800000u)
1817
+ ? (0x7FFFu + ((f32_bits >> 16) & 1u))
1818
+ : 0u);
1819
+ }
1820
+
1821
+ // Converts f32_bits (which is the bits of a F32 value) to BF16 bits,
1822
+ // rounded to the nearest BF16 value
1823
+ static HWY_INLINE HWY_MAYBE_UNUSED constexpr uint16_t F32BitsToBF16Bits(
1824
+ const uint32_t f32_bits) {
1825
+ // Round f32_bits to the nearest BF16 by first adding
1826
+ // F32BitsToBF16RoundIncr(f32_bits) to f32_bits and then right shifting
1827
+ // f32_bits + F32BitsToBF16RoundIncr(f32_bits) by 16
1828
+
1829
+ // If f32_bits is the bit representation of a NaN F32 value, make sure that
1830
+ // bit 6 of the BF16 result is set to convert SNaN F32 values to QNaN BF16
1831
+ // values and to prevent NaN F32 values from being converted to an infinite
1832
+ // BF16 value
1833
+ return static_cast<uint16_t>(
1834
+ ((f32_bits + F32BitsToBF16RoundIncr(f32_bits)) >> 16) |
1835
+ (static_cast<uint32_t>((f32_bits & 0x7FFFFFFFu) > 0x7F800000u) << 6));
1836
+ }
1837
+
1838
+ } // namespace detail
1839
+
1744
1840
  HWY_API HWY_BF16_CONSTEXPR bfloat16_t BF16FromF32(float f) {
1745
1841
  #if HWY_HAVE_SCALAR_BF16_OPERATORS
1746
1842
  return static_cast<bfloat16_t>(f);
1747
1843
  #else
1748
1844
  return bfloat16_t::FromBits(
1749
- static_cast<uint16_t>(BitCastScalar<uint32_t>(f) >> 16));
1845
+ detail::F32BitsToBF16Bits(BitCastScalar<uint32_t>(f)));
1750
1846
  #endif
1751
1847
  }
1752
1848
 
@@ -2418,6 +2514,51 @@ constexpr inline size_t RoundUpTo(size_t what, size_t align) {
2418
2514
  return DivCeil(what, align) * align;
2419
2515
  }
2420
2516
 
2517
+ // Works for any nonzero `align`; if a power of two, compiler emits AND.
2518
+ constexpr inline size_t RoundDownTo(size_t what, size_t align) {
2519
+ return what - (what % align);
2520
+ }
2521
+
2522
+ namespace detail {
2523
+
2524
+ // T is unsigned or T is signed and (val >> shift_amt) is an arithmetic right
2525
+ // shift
2526
+ template <class T>
2527
+ static HWY_INLINE constexpr T ScalarShr(hwy::UnsignedTag /*type_tag*/, T val,
2528
+ int shift_amt) {
2529
+ return static_cast<T>(val >> shift_amt);
2530
+ }
2531
+
2532
+ // T is signed and (val >> shift_amt) is a non-arithmetic right shift
2533
+ template <class T>
2534
+ static HWY_INLINE constexpr T ScalarShr(hwy::SignedTag /*type_tag*/, T val,
2535
+ int shift_amt) {
2536
+ using TU = MakeUnsigned<MakeLaneTypeIfInteger<T>>;
2537
+ return static_cast<T>(
2538
+ (val < 0) ? static_cast<TU>(
2539
+ ~(static_cast<TU>(~static_cast<TU>(val)) >> shift_amt))
2540
+ : static_cast<TU>(static_cast<TU>(val) >> shift_amt));
2541
+ }
2542
+
2543
+ } // namespace detail
2544
+
2545
+ // If T is a signed integer type, ScalarShr is guaranteed to perform an
2546
+ // arithmetic right shift
2547
+
2548
+ // Otherwise, if T is an unsigned integer type, ScalarShr is guaranteed to
2549
+ // perform a logical right shift
2550
+ template <class T, HWY_IF_INTEGER(RemoveCvRef<T>)>
2551
+ HWY_API constexpr RemoveCvRef<T> ScalarShr(T val, int shift_amt) {
2552
+ using NonCvRefT = RemoveCvRef<T>;
2553
+ return detail::ScalarShr(
2554
+ hwy::SizeTag<((IsSigned<NonCvRefT>() &&
2555
+ (LimitsMin<NonCvRefT>() >> (sizeof(T) * 8 - 1)) !=
2556
+ static_cast<NonCvRefT>(-1))
2557
+ ? 0x100
2558
+ : 0)>(),
2559
+ static_cast<NonCvRefT>(val), shift_amt);
2560
+ }
2561
+
2421
2562
  // Undefined results for x == 0.
2422
2563
  HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
2423
2564
  HWY_DASSERT(x != 0);
@@ -2579,6 +2720,7 @@ HWY_INLINE constexpr T AddWithWraparound(T t, T2 n) {
2579
2720
  }
2580
2721
 
2581
2722
  #if HWY_COMPILER_MSVC && HWY_ARCH_X86_64
2723
+ #pragma intrinsic(_mul128)
2582
2724
  #pragma intrinsic(_umul128)
2583
2725
  #endif
2584
2726
 
@@ -2602,6 +2744,65 @@ HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) {
2602
2744
  #endif
2603
2745
  }
2604
2746
 
2747
+ HWY_API int64_t Mul128(int64_t a, int64_t b, int64_t* HWY_RESTRICT upper) {
2748
+ #if defined(__SIZEOF_INT128__)
2749
+ __int128_t product = (__int128_t)a * (__int128_t)b;
2750
+ *upper = (int64_t)(product >> 64);
2751
+ return (int64_t)(product & 0xFFFFFFFFFFFFFFFFULL);
2752
+ #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
2753
+ return _mul128(a, b, upper);
2754
+ #else
2755
+ uint64_t unsigned_upper;
2756
+ const int64_t lower = static_cast<int64_t>(Mul128(
2757
+ static_cast<uint64_t>(a), static_cast<uint64_t>(b), &unsigned_upper));
2758
+ *upper = static_cast<int64_t>(
2759
+ unsigned_upper -
2760
+ (static_cast<uint64_t>(ScalarShr(a, 63)) & static_cast<uint64_t>(b)) -
2761
+ (static_cast<uint64_t>(ScalarShr(b, 63)) & static_cast<uint64_t>(a)));
2762
+ return lower;
2763
+ #endif
2764
+ }
2765
+
2766
+ // Precomputation for fast n / divisor and n % divisor, where n is a variable
2767
+ // and divisor is unchanging but unknown at compile-time.
2768
+ class Divisor {
2769
+ public:
2770
+ explicit Divisor(uint32_t divisor) : divisor_(divisor) {
2771
+ if (divisor <= 1) return;
2772
+
2773
+ const uint32_t len =
2774
+ static_cast<uint32_t>(31 - Num0BitsAboveMS1Bit_Nonzero32(divisor - 1));
2775
+ const uint64_t u_hi = (2ULL << len) - divisor;
2776
+ const uint32_t q = Truncate((u_hi << 32) / divisor);
2777
+
2778
+ mul_ = q + 1;
2779
+ shift1_ = 1;
2780
+ shift2_ = len;
2781
+ }
2782
+
2783
+ uint32_t GetDivisor() const { return divisor_; }
2784
+
2785
+ // Returns n / divisor_.
2786
+ uint32_t Divide(uint32_t n) const {
2787
+ const uint64_t mul = mul_;
2788
+ const uint32_t t = Truncate((mul * n) >> 32);
2789
+ return (t + ((n - t) >> shift1_)) >> shift2_;
2790
+ }
2791
+
2792
+ // Returns n % divisor_.
2793
+ uint32_t Remainder(uint32_t n) const { return n - (Divide(n) * divisor_); }
2794
+
2795
+ private:
2796
+ static uint32_t Truncate(uint64_t x) {
2797
+ return static_cast<uint32_t>(x & 0xFFFFFFFFu);
2798
+ }
2799
+
2800
+ uint32_t divisor_;
2801
+ uint32_t mul_ = 1;
2802
+ uint32_t shift1_ = 0;
2803
+ uint32_t shift2_ = 0;
2804
+ };
2805
+
2605
2806
  namespace detail {
2606
2807
 
2607
2808
  template <typename T>
@@ -73,7 +73,11 @@
73
73
  // https://github.com/simd-everywhere/simde/blob/47d6e603de9d04ee05cdfbc57cf282a02be1bf2a/simde/simde-detect-clang.h#L59.
74
74
  // Please send updates below to them as well, thanks!
75
75
  #if defined(__apple_build_version__) || __clang_major__ >= 999
76
- #if __has_attribute(unsafe_buffer_usage) // no new warnings in 17.0
76
+ #if __has_warning("-Woverriding-option")
77
+ #define HWY_COMPILER_CLANG 1801
78
+ // No new warnings in 17.0, and Apple LLVM 15.3, which should be 1600, already
79
+ // has the unsafe_buffer_usage attribute, so we instead check for new builtins.
80
+ #elif __has_builtin(__builtin_nondeterministic_value)
77
81
  #define HWY_COMPILER_CLANG 1700
78
82
  #elif __has_attribute(nouwtable) // no new warnings in 16.0
79
83
  #define HWY_COMPILER_CLANG 1600
@@ -115,7 +119,8 @@
115
119
  #define HWY_COMPILER3_CLANG 0
116
120
  #endif
117
121
 
118
- #if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG && !HWY_COMPILER_ICC
122
+ #if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG && !HWY_COMPILER_ICC && \
123
+ !HWY_COMPILER_ICX
119
124
  #define HWY_COMPILER_GCC_ACTUAL HWY_COMPILER_GCC
120
125
  #else
121
126
  #define HWY_COMPILER_GCC_ACTUAL 0
@@ -123,17 +128,20 @@
123
128
 
124
129
  // More than one may be nonzero, but we want at least one.
125
130
  #if 0 == (HWY_COMPILER_MSVC + HWY_COMPILER_CLANGCL + HWY_COMPILER_ICC + \
126
- HWY_COMPILER_GCC + HWY_COMPILER_CLANG)
131
+ HWY_COMPILER_ICX + HWY_COMPILER_GCC + HWY_COMPILER_CLANG)
127
132
  #error "Unsupported compiler"
128
133
  #endif
129
134
 
130
- // We should only detect one of these (only clang/clangcl overlap)
131
- #if 1 < \
132
- (!!HWY_COMPILER_MSVC + !!HWY_COMPILER_ICC + !!HWY_COMPILER_GCC_ACTUAL + \
133
- !!(HWY_COMPILER_CLANGCL | HWY_COMPILER_CLANG))
135
+ // We should only detect one of these (only clang/clangcl/icx overlap)
136
+ #if 1 < (!!HWY_COMPILER_MSVC + (!!HWY_COMPILER_ICC & !HWY_COMPILER_ICX) + \
137
+ !!HWY_COMPILER_GCC_ACTUAL + \
138
+ !!(HWY_COMPILER_ICX | HWY_COMPILER_CLANGCL | HWY_COMPILER_CLANG))
134
139
  #error "Detected multiple compilers"
135
140
  #endif
136
141
 
142
+ //------------------------------------------------------------------------------
143
+ // Compiler features and C++ version
144
+
137
145
  #ifdef __has_builtin
138
146
  #define HWY_HAS_BUILTIN(name) __has_builtin(name)
139
147
  #else
@@ -158,6 +166,32 @@
158
166
  #define HWY_HAS_FEATURE(name) 0
159
167
  #endif
160
168
 
169
+ // NOTE: clang ~17 does not correctly handle wrapping __has_include in a macro.
170
+
171
+ #if HWY_COMPILER_MSVC && defined(_MSVC_LANG) && _MSVC_LANG > __cplusplus
172
+ #define HWY_CXX_LANG _MSVC_LANG
173
+ #else
174
+ #define HWY_CXX_LANG __cplusplus
175
+ #endif
176
+
177
+ #if defined(__cpp_constexpr) && __cpp_constexpr >= 201603L
178
+ #define HWY_CXX17_CONSTEXPR constexpr
179
+ #else
180
+ #define HWY_CXX17_CONSTEXPR
181
+ #endif
182
+
183
+ #if defined(__cpp_constexpr) && __cpp_constexpr >= 201304L
184
+ #define HWY_CXX14_CONSTEXPR constexpr
185
+ #else
186
+ #define HWY_CXX14_CONSTEXPR
187
+ #endif
188
+
189
+ #if HWY_CXX_LANG >= 201703L
190
+ #define HWY_IF_CONSTEXPR if constexpr
191
+ #else
192
+ #define HWY_IF_CONSTEXPR if
193
+ #endif
194
+
161
195
  //------------------------------------------------------------------------------
162
196
  // Architecture
163
197
 
@@ -233,9 +267,34 @@
233
267
  #endif
234
268
 
235
269
  #ifdef __riscv
236
- #define HWY_ARCH_RVV 1
270
+ #define HWY_ARCH_RISCV 1
237
271
  #else
238
- #define HWY_ARCH_RVV 0
272
+ #define HWY_ARCH_RISCV 0
273
+ #endif
274
+ // DEPRECATED names; please use HWY_ARCH_RISCV instead.
275
+ #define HWY_ARCH_RVV HWY_ARCH_RISCV
276
+
277
+ #if HWY_ARCH_RISCV && defined(__riscv_xlen)
278
+
279
+ #if __riscv_xlen == 32
280
+ #define HWY_ARCH_RISCV_32 1
281
+ #else
282
+ #define HWY_ARCH_RISCV_32 0
283
+ #endif
284
+
285
+ #if __riscv_xlen == 64
286
+ #define HWY_ARCH_RISCV_64 1
287
+ #else
288
+ #define HWY_ARCH_RISCV_64 0
289
+ #endif
290
+
291
+ #else // !HWY_ARCH_RISCV || !defined(__riscv_xlen)
292
+ #define HWY_ARCH_RISCV_32 0
293
+ #define HWY_ARCH_RISCV_64 0
294
+ #endif // HWY_ARCH_RISCV && defined(__riscv_xlen)
295
+
296
+ #if HWY_ARCH_RISCV_32 && HWY_ARCH_RISCV_64
297
+ #error "Cannot have both RISCV_32 and RISCV_64"
239
298
  #endif
240
299
 
241
300
  #if defined(__s390x__)
@@ -247,10 +306,13 @@
247
306
  // It is an error to detect multiple architectures at the same time, but OK to
248
307
  // detect none of the above.
249
308
  #if (HWY_ARCH_X86 + HWY_ARCH_PPC + HWY_ARCH_ARM + HWY_ARCH_ARM_OLD + \
250
- HWY_ARCH_WASM + HWY_ARCH_RVV + HWY_ARCH_S390X) > 1
309
+ HWY_ARCH_WASM + HWY_ARCH_RISCV + HWY_ARCH_S390X) > 1
251
310
  #error "Must not detect more than one architecture"
252
311
  #endif
253
312
 
313
+ //------------------------------------------------------------------------------
314
+ // Operating system
315
+
254
316
  #if defined(_WIN32) || defined(_WIN64)
255
317
  #define HWY_OS_WIN 1
256
318
  #else
@@ -270,6 +332,18 @@
270
332
  #define HWY_OS_APPLE 0
271
333
  #endif
272
334
 
335
+ #if defined(__FreeBSD__)
336
+ #define HWY_OS_FREEBSD 1
337
+ #else
338
+ #define HWY_OS_FREEBSD 0
339
+ #endif
340
+
341
+ // It is an error to detect multiple OSes at the same time, but OK to
342
+ // detect none of the above.
343
+ #if (HWY_OS_WIN + HWY_OS_LINUX + HWY_OS_APPLE + HWY_OS_FREEBSD) > 1
344
+ #error "Must not detect more than one OS"
345
+ #endif
346
+
273
347
  //------------------------------------------------------------------------------
274
348
  // Endianness
275
349