@img/sharp-libvips-dev 1.2.1 → 1.2.2-rc.2

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in their respective public registries.
Files changed (115)
  1. package/include/aom/aom_decoder.h +1 -1
  2. package/include/aom/aom_encoder.h +2 -0
  3. package/include/aom/aomcx.h +106 -25
  4. package/include/ffi.h +3 -3
  5. package/include/freetype2/freetype/config/ftconfig.h +1 -1
  6. package/include/freetype2/freetype/config/ftheader.h +1 -1
  7. package/include/freetype2/freetype/config/ftoption.h +37 -12
  8. package/include/freetype2/freetype/config/ftstdlib.h +1 -1
  9. package/include/freetype2/freetype/config/integer-types.h +29 -2
  10. package/include/freetype2/freetype/config/mac-support.h +1 -1
  11. package/include/freetype2/freetype/config/public-macros.h +3 -3
  12. package/include/freetype2/freetype/freetype.h +51 -47
  13. package/include/freetype2/freetype/ftadvanc.h +1 -1
  14. package/include/freetype2/freetype/ftbbox.h +1 -1
  15. package/include/freetype2/freetype/ftbdf.h +1 -1
  16. package/include/freetype2/freetype/ftbitmap.h +1 -1
  17. package/include/freetype2/freetype/ftbzip2.h +1 -1
  18. package/include/freetype2/freetype/ftcache.h +1 -1
  19. package/include/freetype2/freetype/ftcid.h +1 -1
  20. package/include/freetype2/freetype/ftcolor.h +13 -4
  21. package/include/freetype2/freetype/ftdriver.h +3 -3
  22. package/include/freetype2/freetype/fterrdef.h +1 -1
  23. package/include/freetype2/freetype/fterrors.h +1 -1
  24. package/include/freetype2/freetype/ftfntfmt.h +1 -1
  25. package/include/freetype2/freetype/ftgasp.h +1 -1
  26. package/include/freetype2/freetype/ftglyph.h +1 -1
  27. package/include/freetype2/freetype/ftgxval.h +1 -1
  28. package/include/freetype2/freetype/ftgzip.h +1 -1
  29. package/include/freetype2/freetype/ftimage.h +6 -2
  30. package/include/freetype2/freetype/ftincrem.h +1 -1
  31. package/include/freetype2/freetype/ftlcdfil.h +1 -1
  32. package/include/freetype2/freetype/ftlist.h +1 -1
  33. package/include/freetype2/freetype/ftlogging.h +184 -0
  34. package/include/freetype2/freetype/ftlzw.h +1 -1
  35. package/include/freetype2/freetype/ftmac.h +1 -1
  36. package/include/freetype2/freetype/ftmm.h +159 -103
  37. package/include/freetype2/freetype/ftmodapi.h +1 -1
  38. package/include/freetype2/freetype/ftmoderr.h +1 -1
  39. package/include/freetype2/freetype/ftotval.h +1 -1
  40. package/include/freetype2/freetype/ftoutln.h +1 -1
  41. package/include/freetype2/freetype/ftparams.h +1 -1
  42. package/include/freetype2/freetype/ftpfr.h +1 -1
  43. package/include/freetype2/freetype/ftrender.h +1 -1
  44. package/include/freetype2/freetype/ftsizes.h +1 -1
  45. package/include/freetype2/freetype/ftsnames.h +1 -1
  46. package/include/freetype2/freetype/ftstroke.h +1 -1
  47. package/include/freetype2/freetype/ftsynth.h +1 -1
  48. package/include/freetype2/freetype/ftsystem.h +1 -1
  49. package/include/freetype2/freetype/fttrigon.h +1 -1
  50. package/include/freetype2/freetype/fttypes.h +1 -1
  51. package/include/freetype2/freetype/ftwinfnt.h +2 -3
  52. package/include/freetype2/freetype/otsvg.h +1 -1
  53. package/include/freetype2/freetype/t1tables.h +1 -1
  54. package/include/freetype2/freetype/ttnameid.h +129 -129
  55. package/include/freetype2/freetype/tttables.h +8 -5
  56. package/include/freetype2/freetype/tttags.h +1 -1
  57. package/include/freetype2/ft2build.h +1 -1
  58. package/include/glib-2.0/gio/gdbuserror.h +9 -8
  59. package/include/glib-2.0/gio/ginetaddress.h +12 -0
  60. package/include/glib-2.0/gio/gioenums.h +9 -2
  61. package/include/glib-2.0/glib/gstring.h +2 -2
  62. package/include/glib-2.0/glib/gunicode.h +1 -1
  63. package/include/glib-2.0/gobject/glib-types.h +1 -1
  64. package/include/glib-2.0/gobject/gparam.h +1 -1
  65. package/include/glib-2.0/gobject/gvalue.h +78 -35
  66. package/include/harfbuzz/hb-script-list.h +12 -0
  67. package/include/harfbuzz/hb-version.h +3 -3
  68. package/include/hwy/abort.h +2 -19
  69. package/include/hwy/aligned_allocator.h +11 -7
  70. package/include/hwy/auto_tune.h +504 -0
  71. package/include/hwy/base.h +425 -104
  72. package/include/hwy/cache_control.h +16 -0
  73. package/include/hwy/detect_compiler_arch.h +32 -1
  74. package/include/hwy/detect_targets.h +251 -67
  75. package/include/hwy/foreach_target.h +35 -0
  76. package/include/hwy/highway.h +185 -76
  77. package/include/hwy/nanobenchmark.h +1 -19
  78. package/include/hwy/ops/arm_neon-inl.h +969 -458
  79. package/include/hwy/ops/arm_sve-inl.h +1137 -359
  80. package/include/hwy/ops/emu128-inl.h +97 -11
  81. package/include/hwy/ops/generic_ops-inl.h +1222 -34
  82. package/include/hwy/ops/loongarch_lasx-inl.h +4664 -0
  83. package/include/hwy/ops/loongarch_lsx-inl.h +5933 -0
  84. package/include/hwy/ops/ppc_vsx-inl.h +306 -126
  85. package/include/hwy/ops/rvv-inl.h +546 -51
  86. package/include/hwy/ops/scalar-inl.h +77 -22
  87. package/include/hwy/ops/set_macros-inl.h +138 -17
  88. package/include/hwy/ops/shared-inl.h +50 -10
  89. package/include/hwy/ops/wasm_128-inl.h +137 -92
  90. package/include/hwy/ops/x86_128-inl.h +773 -214
  91. package/include/hwy/ops/x86_256-inl.h +712 -255
  92. package/include/hwy/ops/x86_512-inl.h +429 -753
  93. package/include/hwy/ops/x86_avx3-inl.h +501 -0
  94. package/include/hwy/per_target.h +2 -1
  95. package/include/hwy/profiler.h +622 -486
  96. package/include/hwy/targets.h +62 -20
  97. package/include/hwy/timer-inl.h +8 -160
  98. package/include/hwy/timer.h +170 -3
  99. package/include/hwy/x86_cpuid.h +81 -0
  100. package/include/libheif/heif_cxx.h +25 -5
  101. package/include/libheif/heif_regions.h +5 -5
  102. package/include/libheif/heif_version.h +2 -2
  103. package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
  104. package/include/libxml2/libxml/valid.h +0 -3
  105. package/include/libxml2/libxml/xmlerror.h +1 -1
  106. package/include/libxml2/libxml/xmlversion.h +4 -4
  107. package/include/pango-1.0/pango/pango-enum-types.h +3 -0
  108. package/include/pango-1.0/pango/pango-features.h +3 -3
  109. package/include/pango-1.0/pango/pango-font.h +30 -0
  110. package/include/pango-1.0/pango/pango-version-macros.h +26 -0
  111. package/include/vips/connection.h +4 -4
  112. package/include/vips/version.h +4 -4
  113. package/include/zlib.h +3 -3
  114. package/package.json +1 -1
  115. package/versions.json +13 -13
@@ -70,6 +70,14 @@ namespace detail {
70
70
  #define HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT "x"
71
71
  #endif
72
72
 
73
+ #undef HWY_X86_HAVE_AVX10_2_OPS
74
+ #if HWY_TARGET_IS_AVX10_2 && \
75
+ (HWY_COMPILER_GCC_ACTUAL >= 1501 || HWY_COMPILER3_CLANG >= 200103)
76
+ #define HWY_X86_HAVE_AVX10_2_OPS 1
77
+ #else
78
+ #define HWY_X86_HAVE_AVX10_2_OPS 0
79
+ #endif
80
+
73
81
  template <typename T>
74
82
  struct Raw128 {
75
83
  using type = __m128i;
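
Note on the HWY_X86_HAVE_AVX10_2_OPS gate added in the hunk above: assuming Highway's usual compiler-version encodings (HWY_COMPILER_GCC_ACTUAL as major*100 + minor, and the three-component HWY_COMPILER3_CLANG as major*10000 + minor*100 + patch; both are assumptions, not confirmed by this diff), the thresholds 1501 and 200103 would correspond to GCC 15.1 and Clang 20.1.3. A minimal standalone sketch of the same comparison, with illustrative helper names:

    #include <cstdio>

    // Hypothetical encoders mirroring the assumed version schemes.
    constexpr int EncodeGcc(int major, int minor) { return major * 100 + minor; }
    constexpr int EncodeClang3(int major, int minor, int patch) {
      return major * 10000 + minor * 100 + patch;
    }

    int main() {
      // Under these assumptions the gate reads: GCC >= 15.1 or Clang >= 20.1.3.
      constexpr bool gcc_ok = EncodeGcc(15, 1) >= 1501;            // true
      constexpr bool clang_ok = EncodeClang3(20, 1, 3) >= 200103;  // true
      std::printf("gcc_ok=%d clang_ok=%d\n", gcc_ok, clang_ok);
      return 0;
    }
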
@@ -138,78 +146,66 @@ using Vec32 = Vec128<T, 4 / sizeof(T)>;
138
146
  template <typename T>
139
147
  using Vec16 = Vec128<T, 2 / sizeof(T)>;
140
148
 
141
- #if HWY_TARGET <= HWY_AVX3
142
-
143
149
  namespace detail {
144
150
 
151
+ #if HWY_TARGET <= HWY_AVX3
152
+
145
153
  // Template arg: sizeof(lane type)
146
154
  template <size_t size>
147
- struct RawMask128 {};
155
+ struct RawMask128T {};
148
156
  template <>
149
- struct RawMask128<1> {
157
+ struct RawMask128T<1> {
150
158
  using type = __mmask16;
151
159
  };
152
160
  template <>
153
- struct RawMask128<2> {
161
+ struct RawMask128T<2> {
154
162
  using type = __mmask8;
155
163
  };
156
164
  template <>
157
- struct RawMask128<4> {
165
+ struct RawMask128T<4> {
158
166
  using type = __mmask8;
159
167
  };
160
168
  template <>
161
- struct RawMask128<8> {
169
+ struct RawMask128T<8> {
162
170
  using type = __mmask8;
163
171
  };
164
172
 
165
- } // namespace detail
173
+ template <typename T>
174
+ using RawMask128 = typename RawMask128T<sizeof(T)>::type;
166
175
 
167
- template <typename T, size_t N = 16 / sizeof(T)>
168
- struct Mask128 {
169
- using Raw = typename detail::RawMask128<sizeof(T)>::type;
176
+ #else // AVX2 or earlier
170
177
 
171
- static Mask128<T, N> FromBits(uint64_t mask_bits) {
172
- return Mask128<T, N>{static_cast<Raw>(mask_bits)};
173
- }
178
+ template <typename T>
179
+ using RawMask128 = typename Raw128<T>::type;
174
180
 
175
- Raw raw;
176
- };
181
+ #endif // HWY_TARGET <= HWY_AVX3
177
182
 
178
- #else // AVX2 or below
183
+ } // namespace detail
179
184
 
180
- // FF..FF or 0.
181
185
  template <typename T, size_t N = 16 / sizeof(T)>
182
186
  struct Mask128 {
183
- typename detail::Raw128<T>::type raw;
184
- };
185
-
186
- #endif // AVX2 or below
187
+ using Raw = typename detail::RawMask128<T>;
187
188
 
188
- namespace detail {
189
-
190
- // Returns the lowest N of the _mm_movemask* bits.
191
- template <typename T, size_t N>
192
- constexpr uint64_t OnlyActive(uint64_t mask_bits) {
193
- return ((N * sizeof(T)) == 16) ? mask_bits : mask_bits & ((1ull << N) - 1);
194
- }
195
-
196
- } // namespace detail
189
+ using PrivateT = T; // only for DFromM
190
+ static constexpr size_t kPrivateN = N; // only for DFromM
197
191
 
198
192
  #if HWY_TARGET <= HWY_AVX3
199
- namespace detail {
200
-
201
- // Used by Expand() emulation, which is required for both AVX3 and AVX2.
202
- template <typename T, size_t N>
203
- HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) {
204
- return OnlyActive<T, N>(mask.raw);
205
- }
193
+ static Mask128<T, N> FromBits(uint64_t mask_bits) {
194
+ return Mask128<T, N>{static_cast<Raw>(mask_bits)};
195
+ }
196
+ #else
197
+ // Lanes are either FF..FF or 0.
198
+ #endif
206
199
 
207
- } // namespace detail
208
- #endif // HWY_TARGET <= HWY_AVX3
200
+ Raw raw;
201
+ };
209
202
 
210
203
  template <class V>
211
204
  using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;
212
205
 
206
+ template <class M>
207
+ using DFromM = Simd<typename M::PrivateT, M::kPrivateN, 0>;
208
+
213
209
  template <class V>
214
210
  using TFromV = typename V::PrivateT;
215
211
 
@@ -1065,6 +1061,16 @@ HWY_API MFromD<DTo> DemoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/,
1065
1061
  #define HWY_NATIVE_COMBINE_MASKS
1066
1062
  #endif
1067
1063
 
1064
+ // For Clang and GCC, mask intrinsics (KORTEST) weren't added until recently.
1065
+ #if !defined(HWY_COMPILER_HAS_MASK_INTRINSICS)
1066
+ #if HWY_COMPILER_MSVC != 0 || HWY_COMPILER_GCC_ACTUAL >= 700 || \
1067
+ HWY_COMPILER_CLANG >= 800
1068
+ #define HWY_COMPILER_HAS_MASK_INTRINSICS 1
1069
+ #else
1070
+ #define HWY_COMPILER_HAS_MASK_INTRINSICS 0
1071
+ #endif
1072
+ #endif // HWY_COMPILER_HAS_MASK_INTRINSICS
1073
+
1068
1074
  template <class D, HWY_IF_LANES_D(D, 2)>
1069
1075
  HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi,
1070
1076
  MFromD<Half<D>> lo) {
@@ -1539,16 +1545,6 @@ HWY_API V IfThenZeroElse(MFromD<D> mask, V no) {
1539
1545
 
1540
1546
  // ------------------------------ Mask logical
1541
1547
 
1542
- // For Clang and GCC, mask intrinsics (KORTEST) weren't added until recently.
1543
- #if !defined(HWY_COMPILER_HAS_MASK_INTRINSICS)
1544
- #if HWY_COMPILER_MSVC != 0 || HWY_COMPILER_GCC_ACTUAL >= 700 || \
1545
- HWY_COMPILER_CLANG >= 800
1546
- #define HWY_COMPILER_HAS_MASK_INTRINSICS 1
1547
- #else
1548
- #define HWY_COMPILER_HAS_MASK_INTRINSICS 0
1549
- #endif
1550
- #endif // HWY_COMPILER_HAS_MASK_INTRINSICS
1551
-
1552
1548
  namespace detail {
1553
1549
 
1554
1550
  template <typename T, size_t N>
@@ -2049,13 +2045,13 @@ HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) {
2049
2045
 
2050
2046
  // Clang static analysis claims the memory immediately after a partial vector
2051
2047
  // store is uninitialized, and also flags the input to partial loads (at least
2052
- // for loadl_pd) as "garbage". This is a false alarm because msan does not
2053
- // raise errors. We work around this by using CopyBytes instead of intrinsics,
2054
- // but only for the analyzer to avoid potentially bad code generation.
2048
+ // for loadl_pd) as "garbage". Since 2025-07, MSAN began raising errors. We
2049
+ // work around this by using CopyBytes instead of intrinsics, but only for MSAN
2050
+ // and static analyzer builds to avoid potentially bad code generation.
2055
2051
  // Unfortunately __clang_analyzer__ was not defined for clang-tidy prior to v7.
2056
2052
  #ifndef HWY_SAFE_PARTIAL_LOAD_STORE
2057
- #if defined(__clang_analyzer__) || \
2058
- (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
2053
+ #if HWY_IS_MSAN || (defined(__clang_analyzer__) || \
2054
+ (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700))
2059
2055
  #define HWY_SAFE_PARTIAL_LOAD_STORE 1
2060
2056
  #else
2061
2057
  #define HWY_SAFE_PARTIAL_LOAD_STORE 0
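
The updated HWY_SAFE_PARTIAL_LOAD_STORE comment above says the byte-copy fallback now also covers MSAN builds, not just the static analyzer. A rough standalone illustration of the underlying idea (not Highway's actual CopyBytes helper): copy only the valid bytes into a zero-initialized full-width buffer so no uninitialized or out-of-bounds bytes are ever read.

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Load `num_bytes` (< 16) valid bytes into a full 16-byte buffer; the
    // remaining bytes are zeroed, so sanitizers never see uninitialized data.
    void SafePartialLoad(const void* from, std::size_t num_bytes, uint8_t out[16]) {
      std::memset(out, 0, 16);
      std::memcpy(out, from, num_bytes);
    }

    int main() {
      const uint8_t src[4] = {1, 2, 3, 4};
      uint8_t buf[16];
      SafePartialLoad(src, sizeof(src), buf);
      return buf[0] == 1 ? 0 : 1;
    }
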
@@ -3921,6 +3917,64 @@ HWY_API Vec128<double> AddSub(Vec128<double> a, Vec128<double> b) {
3921
3917
  }
3922
3918
  #endif // HWY_TARGET <= HWY_SSSE3
3923
3919
 
3920
+ // ------------------------------ PairwiseAdd128/PairwiseSub128
3921
+
3922
+ // Need to use the default implementation of PairwiseAdd128/PairwiseSub128 in
3923
+ // generic_ops-inl.h for U8/I8/F16/I64/U64 vectors and 64-byte vectors
3924
+
3925
+ #if HWY_TARGET <= HWY_SSSE3
3926
+
3927
+ #undef HWY_IF_PAIRWISE_ADD_128_D
3928
+ #undef HWY_IF_PAIRWISE_SUB_128_D
3929
+ #define HWY_IF_PAIRWISE_ADD_128_D(D) \
3930
+ hwy::EnableIf<( \
3931
+ HWY_MAX_LANES_D(D) > (32 / sizeof(hwy::HWY_NAMESPACE::TFromD<D>)) || \
3932
+ (HWY_MAX_LANES_D(D) > (8 / sizeof(hwy::HWY_NAMESPACE::TFromD<D>)) && \
3933
+ !(hwy::IsSameEither<hwy::HWY_NAMESPACE::TFromD<D>, int16_t, \
3934
+ uint16_t>() || \
3935
+ sizeof(hwy::HWY_NAMESPACE::TFromD<D>) == 4 || \
3936
+ hwy::IsSame<hwy::HWY_NAMESPACE::TFromD<D>, double>())))>* = nullptr
3937
+ #define HWY_IF_PAIRWISE_SUB_128_D(D) HWY_IF_PAIRWISE_ADD_128_D(D)
3938
+
3939
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI16_D(D)>
3940
+ HWY_API VFromD<D> PairwiseAdd128(D /*d*/, VFromD<D> a, VFromD<D> b) {
3941
+ return VFromD<D>{_mm_hadd_epi16(a.raw, b.raw)};
3942
+ }
3943
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI16_D(D)>
3944
+ HWY_API VFromD<D> PairwiseSub128(D /*d*/, VFromD<D> a, VFromD<D> b) {
3945
+ const DFromV<decltype(a)> d;
3946
+ const RebindToSigned<decltype(d)> di;
3947
+ return BitCast(d, Neg(BitCast(di, VFromD<D>{_mm_hsub_epi16(a.raw, b.raw)})));
3948
+ }
3949
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)>
3950
+ HWY_API VFromD<D> PairwiseAdd128(D /*d*/, VFromD<D> a, VFromD<D> b) {
3951
+ return VFromD<D>{_mm_hadd_epi32(a.raw, b.raw)};
3952
+ }
3953
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)>
3954
+ HWY_API VFromD<D> PairwiseSub128(D /*d*/, VFromD<D> a, VFromD<D> b) {
3955
+ const DFromV<decltype(a)> d;
3956
+ const RebindToSigned<decltype(d)> di;
3957
+ return BitCast(d, Neg(BitCast(di, VFromD<D>{_mm_hsub_epi32(a.raw, b.raw)})));
3958
+ }
3959
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
3960
+ HWY_API VFromD<D> PairwiseAdd128(D /*d*/, VFromD<D> a, VFromD<D> b) {
3961
+ return VFromD<D>{_mm_hadd_ps(a.raw, b.raw)};
3962
+ }
3963
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
3964
+ HWY_API VFromD<D> PairwiseSub128(D /*d*/, VFromD<D> a, VFromD<D> b) {
3965
+ return Neg(VFromD<D>{_mm_hsub_ps(a.raw, b.raw)});
3966
+ }
3967
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
3968
+ HWY_API VFromD<D> PairwiseAdd128(D /*d*/, VFromD<D> a, VFromD<D> b) {
3969
+ return VFromD<D>{_mm_hadd_pd(a.raw, b.raw)};
3970
+ }
3971
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
3972
+ HWY_API VFromD<D> PairwiseSub128(D /*d*/, VFromD<D> a, VFromD<D> b) {
3973
+ return Neg(VFromD<D>{_mm_hsub_pd(a.raw, b.raw)});
3974
+ }
3975
+
3976
+ #endif // HWY_TARGET <= HWY_SSSE3
3977
+
3924
3978
  // ------------------------------ SumsOf8
3925
3979
  template <size_t N>
3926
3980
  HWY_API Vec128<uint64_t, N / 8> SumsOf8(const Vec128<uint8_t, N> v) {
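
For readers less familiar with the horizontal-add intrinsics used in the PairwiseAdd128/PairwiseSub128 hunk above: _mm_hadd_* sums adjacent lane pairs of the first operand and then of the second, and the Neg() wrapped around _mm_hsub_* suggests the intended PairwiseSub128 result is odd lane minus even lane. A scalar sketch of the 4 x float case under that reading (an illustration, not the library's definition):

    #include <array>
    #include <cstdio>

    // [a0+a1, a2+a3, b0+b1, b2+b3], matching _mm_hadd_ps.
    std::array<float, 4> PairwiseAdd4(std::array<float, 4> a, std::array<float, 4> b) {
      return {a[0] + a[1], a[2] + a[3], b[0] + b[1], b[2] + b[3]};
    }
    // Odd minus even, i.e. the negation of _mm_hsub_ps ([a0-a1, ...]).
    std::array<float, 4> PairwiseSub4(std::array<float, 4> a, std::array<float, 4> b) {
      return {a[1] - a[0], a[3] - a[2], b[1] - b[0], b[3] - b[2]};
    }

    int main() {
      const auto s = PairwiseAdd4({1, 2, 3, 4}, {5, 6, 7, 8});
      std::printf("%g %g %g %g\n", s[0], s[1], s[2], s[3]);  // 3 7 11 15
      return 0;
    }
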
@@ -4226,6 +4280,18 @@ HWY_API Vec128<uint16_t, N> AverageRound(const Vec128<uint16_t, N> a,
4226
4280
  return Vec128<uint16_t, N>{_mm_avg_epu16(a.raw, b.raw)};
4227
4281
  }
4228
4282
 
4283
+ // I8/I16 AverageRound is generic for all vector lengths
4284
+ template <class V, HWY_IF_SIGNED_V(V),
4285
+ HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
4286
+ HWY_API V AverageRound(V a, V b) {
4287
+ const DFromV<decltype(a)> d;
4288
+ const RebindToUnsigned<decltype(d)> du;
4289
+ const V sign_bit = SignBit(d);
4290
+ return Xor(BitCast(d, AverageRound(BitCast(du, Xor(a, sign_bit)),
4291
+ BitCast(du, Xor(b, sign_bit)))),
4292
+ sign_bit);
4293
+ }
4294
+
4229
4295
  // ------------------------------ Integer multiplication
4230
4296
 
4231
4297
  template <size_t N>
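
The generic signed AverageRound added above reuses the unsigned rounded average (_mm_avg_epu8/_mm_avg_epu16) by XOR-ing both inputs with the sign bit, which maps the signed range onto the unsigned range while preserving order, then removes the bias from the result. A scalar sketch of the int8 case:

    #include <cstdint>
    #include <cstdio>

    // Rounded unsigned average, as _mm_avg_epu8 computes per lane.
    uint8_t AvgRoundU8(uint8_t a, uint8_t b) {
      return static_cast<uint8_t>((a + b + 1) >> 1);
    }

    // Signed version via the sign-bit bias: XOR with 0x80 maps [-128, 127]
    // onto [0, 255] monotonically, so the unsigned average can be reused.
    int8_t AvgRoundI8(int8_t a, int8_t b) {
      const uint8_t ua = static_cast<uint8_t>(static_cast<uint8_t>(a) ^ 0x80);
      const uint8_t ub = static_cast<uint8_t>(static_cast<uint8_t>(b) ^ 0x80);
      return static_cast<int8_t>(AvgRoundU8(ua, ub) ^ 0x80);
    }

    int main() {
      std::printf("%d\n", AvgRoundI8(-3, 4));  // (-3 + 4 + 1) >> 1 = 1
      return 0;
    }
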
@@ -4396,6 +4462,26 @@ HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a,
4396
4462
  return BitCast(d, BitCast(du, a) * BitCast(du, b));
4397
4463
  }
4398
4464
 
4465
+ #if HWY_TARGET <= HWY_AVX3
4466
+ // Per-target flag to prevent generic_ops-inl.h from defining 64-bit operator*.
4467
+ #ifdef HWY_NATIVE_MUL_64
4468
+ #undef HWY_NATIVE_MUL_64
4469
+ #else
4470
+ #define HWY_NATIVE_MUL_64
4471
+ #endif
4472
+
4473
+ template <size_t N>
4474
+ HWY_API Vec128<uint64_t, N> operator*(Vec128<uint64_t, N> a,
4475
+ Vec128<uint64_t, N> b) {
4476
+ return Vec128<uint64_t, N>{_mm_mullo_epi64(a.raw, b.raw)};
4477
+ }
4478
+ template <size_t N>
4479
+ HWY_API Vec128<int64_t, N> operator*(Vec128<int64_t, N> a,
4480
+ Vec128<int64_t, N> b) {
4481
+ return Vec128<int64_t, N>{_mm_mullo_epi64(a.raw, b.raw)};
4482
+ }
4483
+ #endif
4484
+
4399
4485
  // ------------------------------ RotateRight (ShiftRight, Or)
4400
4486
 
4401
4487
  // U8 RotateRight implementation on AVX3_DL is now in x86_512-inl.h as U8
@@ -5051,6 +5137,43 @@ HWY_API Vec64<double> operator*(const Vec64<double> a, const Vec64<double> b) {
5051
5137
  return Vec64<double>{_mm_mul_sd(a.raw, b.raw)};
5052
5138
  }
5053
5139
 
5140
+ #if HWY_TARGET <= HWY_AVX3
5141
+
5142
+ #ifdef HWY_NATIVE_MUL_BY_POW2
5143
+ #undef HWY_NATIVE_MUL_BY_POW2
5144
+ #else
5145
+ #define HWY_NATIVE_MUL_BY_POW2
5146
+ #endif
5147
+
5148
+ #if HWY_HAVE_FLOAT16
5149
+ template <size_t N>
5150
+ HWY_API Vec128<float16_t, N> MulByFloorPow2(Vec128<float16_t, N> a,
5151
+ Vec128<float16_t, N> b) {
5152
+ return Vec128<float16_t, N>{_mm_scalef_ph(a.raw, b.raw)};
5153
+ }
5154
+ #endif
5155
+
5156
+ template <size_t N>
5157
+ HWY_API Vec128<float, N> MulByFloorPow2(Vec128<float, N> a,
5158
+ Vec128<float, N> b) {
5159
+ return Vec128<float, N>{_mm_scalef_ps(a.raw, b.raw)};
5160
+ }
5161
+
5162
+ template <size_t N>
5163
+ HWY_API Vec128<double, N> MulByFloorPow2(Vec128<double, N> a,
5164
+ Vec128<double, N> b) {
5165
+ return Vec128<double, N>{_mm_scalef_pd(a.raw, b.raw)};
5166
+ }
5167
+
5168
+ // MulByPow2 is generic for all vector lengths on AVX3
5169
+ template <class V, HWY_IF_FLOAT_V(V)>
5170
+ HWY_API V MulByPow2(V v, VFromD<RebindToSigned<DFromV<V>>> exp) {
5171
+ const DFromV<decltype(v)> d;
5172
+ return MulByFloorPow2(v, ConvertTo(d, exp));
5173
+ }
5174
+
5175
+ #endif // HWY_TARGET <= HWY_AVX3
5176
+
5054
5177
  #if HWY_HAVE_FLOAT16
5055
5178
  template <size_t N>
5056
5179
  HWY_API Vec128<float16_t, N> operator/(const Vec128<float16_t, N> a,
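
The MulByFloorPow2 overloads added above map to the AVX-512 scalef instructions, which compute a * 2^floor(b) per lane; MulByPow2 then converts an integer exponent vector to floating point and forwards to it. A scalar sketch of the same arithmetic for finite, non-NaN inputs:

    #include <cmath>
    #include <cstdio>

    // a * 2^floor(b), the per-lane behavior of vscalef for finite inputs.
    double MulByFloorPow2Scalar(double a, double b) {
      return std::ldexp(a, static_cast<int>(std::floor(b)));
    }

    int main() {
      std::printf("%g\n", MulByFloorPow2Scalar(3.0, 2.7));  // 3 * 2^2 = 12
      return 0;
    }
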
@@ -5113,6 +5236,33 @@ HWY_API V AbsDiff(V a, V b) {
5113
5236
  return Abs(a - b);
5114
5237
  }
5115
5238
 
5239
+ // ------------------------------ GetExponent
5240
+
5241
+ #if HWY_TARGET <= HWY_AVX3
5242
+
5243
+ #ifdef HWY_NATIVE_GET_EXPONENT
5244
+ #undef HWY_NATIVE_GET_EXPONENT
5245
+ #else
5246
+ #define HWY_NATIVE_GET_EXPONENT
5247
+ #endif
5248
+
5249
+ #if HWY_HAVE_FLOAT16
5250
+ template <class V, HWY_IF_F16(TFromV<V>), HWY_IF_V_SIZE_LE_V(V, 16)>
5251
+ HWY_API V GetExponent(V v) {
5252
+ return V{_mm_getexp_ph(v.raw)};
5253
+ }
5254
+ #endif
5255
+ template <class V, HWY_IF_F32(TFromV<V>), HWY_IF_V_SIZE_LE_V(V, 16)>
5256
+ HWY_API V GetExponent(V v) {
5257
+ return V{_mm_getexp_ps(v.raw)};
5258
+ }
5259
+ template <class V, HWY_IF_F64(TFromV<V>), HWY_IF_V_SIZE_LE_V(V, 16)>
5260
+ HWY_API V GetExponent(V v) {
5261
+ return V{_mm_getexp_pd(v.raw)};
5262
+ }
5263
+
5264
+ #endif
5265
+
5116
5266
  // ------------------------------ MaskedMinOr
5117
5267
 
5118
5268
  #if HWY_TARGET <= HWY_AVX3
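
GetExponent above wraps the AVX-512 getexp instructions, which return each lane's unbiased exponent as a floating-point value, i.e. floor(log2(|x|)) for normal finite inputs (zero, infinity and NaN have special results not covered here). A scalar sketch:

    #include <cmath>
    #include <cstdio>

    // floor(log2(|x|)) for normal finite x; std::logb computes exactly this.
    double GetExponentScalar(double x) { return std::logb(x); }

    int main() {
      std::printf("%g %g\n", GetExponentScalar(8.0), GetExponentScalar(0.3));  // 3 -2
      return 0;
    }
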
@@ -5704,7 +5854,8 @@ HWY_API Vec128<uint8_t, N> Min(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
5704
5854
  template <size_t N>
5705
5855
  HWY_API Vec128<uint16_t, N> Min(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) {
5706
5856
  #if HWY_TARGET >= HWY_SSSE3
5707
- return detail::MinU(a, b);
5857
+ return Vec128<uint16_t, N>{
5858
+ _mm_sub_epi16(a.raw, _mm_subs_epu16(a.raw, b.raw))};
5708
5859
  #else
5709
5860
  return Vec128<uint16_t, N>{_mm_min_epu16(a.raw, b.raw)};
5710
5861
  #endif
@@ -5797,7 +5948,8 @@ HWY_API Vec128<uint8_t, N> Max(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
5797
5948
  template <size_t N>
5798
5949
  HWY_API Vec128<uint16_t, N> Max(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) {
5799
5950
  #if HWY_TARGET >= HWY_SSSE3
5800
- return detail::MaxU(a, b);
5951
+ return Vec128<uint16_t, N>{
5952
+ _mm_add_epi16(a.raw, _mm_subs_epu16(b.raw, a.raw))};
5801
5953
  #else
5802
5954
  return Vec128<uint16_t, N>{_mm_max_epu16(a.raw, b.raw)};
5803
5955
  #endif
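
The two hunks above replace detail::MinU/detail::MaxU on SSE2/SSE3 with a saturating-subtraction identity: min(a, b) = a - sat_sub(a, b) and max(a, b) = a + sat_sub(b, a), because sat_sub(a, b) is a - b when a > b and 0 otherwise. Scalar check of both identities:

    #include <cstdint>
    #include <cstdio>

    // Unsigned saturating subtraction, like _mm_subs_epu16 per lane.
    uint16_t SatSubU16(uint16_t a, uint16_t b) {
      return static_cast<uint16_t>(a > b ? a - b : 0);
    }
    uint16_t MinU16(uint16_t a, uint16_t b) {
      return static_cast<uint16_t>(a - SatSubU16(a, b));
    }
    uint16_t MaxU16(uint16_t a, uint16_t b) {
      return static_cast<uint16_t>(a + SatSubU16(b, a));
    }

    int main() {
      std::printf("%d %d\n", MinU16(40000, 123), MaxU16(40000, 123));  // 123 40000
      return 0;
    }
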
@@ -5866,6 +6018,110 @@ HWY_API Vec128<double, N> Max(Vec128<double, N> a, Vec128<double, N> b) {
5866
6018
  return Vec128<double, N>{_mm_max_pd(a.raw, b.raw)};
5867
6019
  }
5868
6020
 
6021
+ // ------------------------------ MinNumber and MaxNumber
6022
+
6023
+ #ifdef HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
6024
+ #undef HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
6025
+ #else
6026
+ #define HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
6027
+ #endif
6028
+
6029
+ #if HWY_X86_HAVE_AVX10_2_OPS
6030
+
6031
+ #if HWY_HAVE_FLOAT16
6032
+ template <size_t N>
6033
+ HWY_API Vec128<float16_t, N> MinNumber(Vec128<float16_t, N> a,
6034
+ Vec128<float16_t, N> b) {
6035
+ return Vec128<float16_t, N>{_mm_minmax_ph(a.raw, b.raw, 0x14)};
6036
+ }
6037
+ #endif
6038
+ template <size_t N>
6039
+ HWY_API Vec128<float, N> MinNumber(Vec128<float, N> a, Vec128<float, N> b) {
6040
+ return Vec128<float, N>{_mm_minmax_ps(a.raw, b.raw, 0x14)};
6041
+ }
6042
+ template <size_t N>
6043
+ HWY_API Vec128<double, N> MinNumber(Vec128<double, N> a, Vec128<double, N> b) {
6044
+ return Vec128<double, N>{_mm_minmax_pd(a.raw, b.raw, 0x14)};
6045
+ }
6046
+
6047
+ #if HWY_HAVE_FLOAT16
6048
+ template <size_t N>
6049
+ HWY_API Vec128<float16_t, N> MaxNumber(Vec128<float16_t, N> a,
6050
+ Vec128<float16_t, N> b) {
6051
+ return Vec128<float16_t, N>{_mm_minmax_ph(a.raw, b.raw, 0x15)};
6052
+ }
6053
+ #endif
6054
+ template <size_t N>
6055
+ HWY_API Vec128<float, N> MaxNumber(Vec128<float, N> a, Vec128<float, N> b) {
6056
+ return Vec128<float, N>{_mm_minmax_ps(a.raw, b.raw, 0x15)};
6057
+ }
6058
+ template <size_t N>
6059
+ HWY_API Vec128<double, N> MaxNumber(Vec128<double, N> a, Vec128<double, N> b) {
6060
+ return Vec128<double, N>{_mm_minmax_pd(a.raw, b.raw, 0x15)};
6061
+ }
6062
+
6063
+ #else
6064
+
6065
+ // MinNumber/MaxNumber are generic for all vector lengths on targets other
6066
+ // than AVX10.2
6067
+ template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
6068
+ HWY_API V MinNumber(V a, V b) {
6069
+ return Min(a, IfThenElse(IsNaN(b), a, b));
6070
+ }
6071
+
6072
+ template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
6073
+ HWY_API V MaxNumber(V a, V b) {
6074
+ return Max(a, IfThenElse(IsNaN(b), a, b));
6075
+ }
6076
+
6077
+ #endif
6078
+
6079
+ // ------------------------------ MinMagnitude and MaxMagnitude
6080
+
6081
+ #if HWY_X86_HAVE_AVX10_2_OPS
6082
+
6083
+ #ifdef HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE
6084
+ #undef HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE
6085
+ #else
6086
+ #define HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE
6087
+ #endif
6088
+
6089
+ #if HWY_HAVE_FLOAT16
6090
+ template <size_t N>
6091
+ HWY_API Vec128<float16_t, N> MinMagnitude(Vec128<float16_t, N> a,
6092
+ Vec128<float16_t, N> b) {
6093
+ return Vec128<float16_t, N>{_mm_minmax_ph(a.raw, b.raw, 0x16)};
6094
+ }
6095
+ #endif
6096
+ template <size_t N>
6097
+ HWY_API Vec128<float, N> MinMagnitude(Vec128<float, N> a, Vec128<float, N> b) {
6098
+ return Vec128<float, N>{_mm_minmax_ps(a.raw, b.raw, 0x16)};
6099
+ }
6100
+ template <size_t N>
6101
+ HWY_API Vec128<double, N> MinMagnitude(Vec128<double, N> a,
6102
+ Vec128<double, N> b) {
6103
+ return Vec128<double, N>{_mm_minmax_pd(a.raw, b.raw, 0x16)};
6104
+ }
6105
+
6106
+ #if HWY_HAVE_FLOAT16
6107
+ template <size_t N>
6108
+ HWY_API Vec128<float16_t, N> MaxMagnitude(Vec128<float16_t, N> a,
6109
+ Vec128<float16_t, N> b) {
6110
+ return Vec128<float16_t, N>{_mm_minmax_ph(a.raw, b.raw, 0x17)};
6111
+ }
6112
+ #endif
6113
+ template <size_t N>
6114
+ HWY_API Vec128<float, N> MaxMagnitude(Vec128<float, N> a, Vec128<float, N> b) {
6115
+ return Vec128<float, N>{_mm_minmax_ps(a.raw, b.raw, 0x17)};
6116
+ }
6117
+ template <size_t N>
6118
+ HWY_API Vec128<double, N> MaxMagnitude(Vec128<double, N> a,
6119
+ Vec128<double, N> b) {
6120
+ return Vec128<double, N>{_mm_minmax_pd(a.raw, b.raw, 0x17)};
6121
+ }
6122
+
6123
+ #endif
6124
+
5869
6125
  // ================================================== MEMORY (3)
5870
6126
 
5871
6127
  // ------------------------------ Non-temporal stores
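
The MinNumber/MaxNumber/MinMagnitude/MaxMagnitude block above uses the new _mm_minmax_* instructions on AVX10.2 (the 0x14..0x17 immediates select the variant) and otherwise falls back to Min/Max with explicit NaN handling. The "number" variants return the numeric operand when exactly one input is NaN; the "magnitude" variants compare absolute values. A scalar model of the intended behavior (an illustration, not the exact IEEE tie-breaking rules):

    #include <cmath>
    #include <cstdio>

    // NaN loses to a number; otherwise an ordinary minimum.
    float MinNumberScalar(float a, float b) {
      if (std::isnan(b)) return a;
      if (std::isnan(a)) return b;
      return a < b ? a : b;
    }
    // Compare by absolute value (tie handling left unspecified here).
    float MinMagnitudeScalar(float a, float b) {
      return std::fabs(a) < std::fabs(b) ? a : b;
    }

    int main() {
      std::printf("%g %g\n", MinNumberScalar(NAN, 2.0f),
                  MinMagnitudeScalar(-1.0f, 3.0f));  // 2 -1
      return 0;
    }
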
@@ -6883,52 +7139,48 @@ HWY_API Vec128<float16_t, N> TableLookupLanes(Vec128<float16_t, N> v,
6883
7139
 
6884
7140
  template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
6885
7141
  HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
6886
- #if HWY_TARGET <= HWY_AVX2
6887
7142
  const DFromV<decltype(v)> d;
6888
- const RebindToFloat<decltype(d)> df;
6889
- const Vec128<float, N> perm{_mm_permutevar_ps(BitCast(df, v).raw, idx.raw)};
6890
- return BitCast(d, perm);
7143
+ const Full128<T> d_full;
7144
+ const Vec128<T> v_full = ZeroExtendResizeBitCast(d_full, d, v);
7145
+
7146
+ const RebindToSigned<decltype(d)> di;
7147
+ const Full128<MakeSigned<T>> di_full;
7148
+ const VFromD<decltype(di_full)> vidx =
7149
+ ZeroExtendResizeBitCast(di_full, di, VFromD<decltype(di)>{idx.raw});
7150
+
7151
+ #if HWY_TARGET <= HWY_AVX2
7152
+ // There is no permutevar for non-float; _mm256_permutevar8x32_epi32 is for
7153
+ // 256-bit vectors, hence cast to float.
7154
+ const Full128<float> df_full;
7155
+ // Workaround for MSAN false positive.
7156
+ HWY_IF_CONSTEXPR(HWY_IS_MSAN) PreventElision(GetLane(vidx));
7157
+ const Vec128<float> perm{
7158
+ _mm_permutevar_ps(BitCast(df_full, v_full).raw, vidx.raw)};
7159
+ return ResizeBitCast(d, perm);
6891
7160
  #elif HWY_TARGET == HWY_SSE2
6892
7161
  #if HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle)
6893
7162
  typedef uint32_t GccU32RawVectType __attribute__((__vector_size__(16)));
6894
7163
  return Vec128<T, N>{reinterpret_cast<typename detail::Raw128<T>::type>(
6895
- __builtin_shuffle(reinterpret_cast<GccU32RawVectType>(v.raw),
6896
- reinterpret_cast<GccU32RawVectType>(idx.raw)))};
7164
+ __builtin_shuffle(reinterpret_cast<GccU32RawVectType>(v_full.raw),
7165
+ reinterpret_cast<GccU32RawVectType>(vidx.raw)))};
6897
7166
  #else
6898
- const Full128<T> d_full;
6899
7167
  alignas(16) T src_lanes[4];
6900
- alignas(16) uint32_t indices[4];
7168
+ alignas(16) int32_t indices[4];
6901
7169
  alignas(16) T result_lanes[4];
6902
7170
 
6903
- Store(Vec128<T>{v.raw}, d_full, src_lanes);
6904
- _mm_store_si128(reinterpret_cast<__m128i*>(indices), idx.raw);
7171
+ Store(v_full, d_full, src_lanes);
7172
+ Store(vidx, di_full, indices);
6905
7173
 
6906
- for (int i = 0; i < 4; i++) {
6907
- result_lanes[i] = src_lanes[indices[i] & 3u];
7174
+ for (size_t i = 0; i < N; i++) {
7175
+ result_lanes[i] = src_lanes[static_cast<size_t>(indices[i] & 3)];
6908
7176
  }
6909
-
6910
- return Vec128<T, N>{Load(d_full, result_lanes).raw};
7177
+ return Load(d, result_lanes);
6911
7178
  #endif // HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle)
6912
7179
  #else // SSSE3 or SSE4
6913
- return TableLookupBytes(v, Vec128<T, N>{idx.raw});
7180
+ return ResizeBitCast(d, TableLookupBytes(BitCast(di_full, v_full), vidx));
6914
7181
  #endif
6915
7182
  }
6916
7183
 
6917
- #if HWY_TARGET <= HWY_SSSE3
6918
- template <size_t N, HWY_IF_V_SIZE_GT(float, N, 4)>
6919
- HWY_API Vec128<float, N> TableLookupLanes(Vec128<float, N> v,
6920
- Indices128<float, N> idx) {
6921
- #if HWY_TARGET <= HWY_AVX2
6922
- return Vec128<float, N>{_mm_permutevar_ps(v.raw, idx.raw)};
6923
- #else // SSSE3 or SSE4
6924
- const DFromV<decltype(v)> df;
6925
- const RebindToSigned<decltype(df)> di;
6926
- return BitCast(df,
6927
- TableLookupBytes(BitCast(di, v), Vec128<int32_t, N>{idx.raw}));
6928
- #endif // HWY_TARGET <= HWY_AVX2
6929
- }
6930
- #endif // HWY_TARGET <= HWY_SSSE3
6931
-
6932
7184
  // Single lane: no change
6933
7185
  template <typename T>
6934
7186
  HWY_API Vec128<T, 1> TableLookupLanes(Vec128<T, 1> v,
@@ -6936,11 +7188,15 @@ HWY_API Vec128<T, 1> TableLookupLanes(Vec128<T, 1> v,
6936
7188
  return v;
6937
7189
  }
6938
7190
 
6939
- template <typename T, HWY_IF_UI64(T)>
7191
+ template <typename T, HWY_IF_T_SIZE(T, 8)>
6940
7192
  HWY_API Vec128<T> TableLookupLanes(Vec128<T> v, Indices128<T> idx) {
6941
7193
  const DFromV<decltype(v)> d;
7194
+ // No need for ZeroExtendResizeBitCast, we have full vectors.
6942
7195
  Vec128<int64_t> vidx{idx.raw};
6943
- #if HWY_TARGET <= HWY_AVX2
7196
+
7197
+ // Disable in MSAN builds due to false positive. Note that this affects
7198
+ // CompressNot, which assumes upper index bits will be ignored.
7199
+ #if HWY_TARGET <= HWY_AVX2 && !HWY_IS_MSAN
6944
7200
  // There is no _mm_permute[x]var_epi64.
6945
7201
  vidx += vidx; // bit1 is the decider (unusual)
6946
7202
  const RebindToFloat<decltype(d)> df;
@@ -6952,26 +7208,8 @@ HWY_API Vec128<T> TableLookupLanes(Vec128<T> v, Indices128<T> idx) {
6952
7208
  // to obtain an all-zero or all-one mask.
6953
7209
  const RebindToSigned<decltype(d)> di;
6954
7210
  const Vec128<int64_t> same = (vidx ^ Iota(di, 0)) - Set(di, 1);
6955
- const Mask128<T> mask_same = RebindMask(d, MaskFromVec(same));
6956
- return IfThenElse(mask_same, v, Shuffle01(v));
6957
- #endif
6958
- }
6959
-
6960
- HWY_API Vec128<double> TableLookupLanes(Vec128<double> v,
6961
- Indices128<double> idx) {
6962
- Vec128<int64_t> vidx{idx.raw};
6963
- #if HWY_TARGET <= HWY_AVX2
6964
- vidx += vidx; // bit1 is the decider (unusual)
6965
- return Vec128<double>{_mm_permutevar_pd(v.raw, vidx.raw)};
6966
- #else
6967
- // Only 2 lanes: can swap+blend. Choose v if vidx == iota. To avoid a 64-bit
6968
- // comparison (expensive on SSSE3), just invert the upper lane and subtract 1
6969
- // to obtain an all-zero or all-one mask.
6970
- const DFromV<decltype(v)> d;
6971
- const RebindToSigned<decltype(d)> di;
6972
- const Vec128<int64_t> same = (vidx ^ Iota(di, 0)) - Set(di, 1);
6973
- const Mask128<double> mask_same = RebindMask(d, MaskFromVec(same));
6974
- return IfThenElse(mask_same, v, Shuffle01(v));
7211
+ return BitCast(
7212
+ d, IfVecThenElse(same, BitCast(di, v), Shuffle01(BitCast(di, v))));
6975
7213
  #endif
6976
7214
  }
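
The SSSE3/SSE4 branch of the 64-bit TableLookupLanes above relies on the trick described in its comment: with only two lanes, (vidx ^ iota) - 1 is all-one bits exactly where vidx[i] equals i (0 - 1 = -1) and all-zero bits where it differs by one (1 - 1 = 0), so the result can directly drive a bitwise select between v and Shuffle01(v) without a 64-bit compare. Scalar check of both cases:

    #include <cstdint>
    #include <cstdio>

    // Per-lane selector: all ones keeps the original lane, zero takes the swap.
    int64_t KeepLaneMask(int64_t vidx_lane, int64_t iota_lane) {
      return (vidx_lane ^ iota_lane) - 1;
    }

    int main() {
      std::printf("%lld %lld\n",
                  static_cast<long long>(KeepLaneMask(0, 0)),   // -1: index matches
                  static_cast<long long>(KeepLaneMask(0, 1)));  //  0: index differs
      return 0;
    }
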
6977
7215
 
@@ -8861,12 +9099,22 @@ HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
8861
9099
  }
8862
9100
 
8863
9101
  // ------------------------------ SwapAdjacentBlocks
8864
-
8865
9102
  template <typename T, size_t N>
8866
9103
  HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
8867
9104
  return v;
8868
9105
  }
8869
9106
 
9107
+ // ------------------------------ InterleaveEvenBlocks
9108
+ template <class D, class V = VFromD<D>, HWY_IF_V_SIZE_LE_D(D, 16)>
9109
+ HWY_API V InterleaveEvenBlocks(D, V a, V /*b*/) {
9110
+ return a;
9111
+ }
9112
+ // ------------------------------ InterleaveOddBlocks
9113
+ template <class D, class V = VFromD<D>, HWY_IF_V_SIZE_LE_D(D, 16)>
9114
+ HWY_API V InterleaveOddBlocks(D, V a, V /*b*/) {
9115
+ return a;
9116
+ }
9117
+
8870
9118
  // ------------------------------ Shl (ZipLower, Mul)
8871
9119
 
8872
9120
  // Use AVX2/3 variable shifts where available, otherwise multiply by powers of
@@ -9588,15 +9836,28 @@ HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
9588
9836
 
9589
9837
  // ------------------------------ WidenMulPairwiseAdd (PromoteEvenTo)
9590
9838
 
9839
+ #if HWY_NATIVE_DOT_BF16
9840
+
9841
+ template <class DF, HWY_IF_F32_D(DF), HWY_IF_V_SIZE_LE_D(DF, 16),
9842
+ class VBF = VFromD<Repartition<bfloat16_t, DF>>>
9843
+ HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
9844
+ return VFromD<DF>{_mm_dpbf16_ps(Zero(df).raw,
9845
+ reinterpret_cast<__m128bh>(a.raw),
9846
+ reinterpret_cast<__m128bh>(b.raw))};
9847
+ }
9848
+
9849
+ #else
9850
+
9591
9851
  // Generic for all vector lengths.
9592
9852
  template <class DF, HWY_IF_F32_D(DF),
9593
9853
  class VBF = VFromD<Repartition<bfloat16_t, DF>>>
9594
9854
  HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
9595
- // TODO(janwas): _mm_dpbf16_ps when available
9596
9855
  return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b),
9597
9856
  Mul(PromoteOddTo(df, a), PromoteOddTo(df, b)));
9598
9857
  }
9599
9858
 
9859
+ #endif // HWY_NATIVE_DOT_BF16
9860
+
9600
9861
  // Even if N=1, the input is always at least 2 lanes, hence madd_epi16 is safe.
9601
9862
  template <class D32, HWY_IF_I32_D(D32), HWY_IF_V_SIZE_LE_D(D32, 16),
9602
9863
  class V16 = VFromD<RepartitionToNarrow<D32>>>
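
In the WidenMulPairwiseAdd hunk above, the new HWY_NATIVE_DOT_BF16 path feeds a zero accumulator to _mm_dpbf16_ps, while the existing fallback widens even and odd bfloat16 lanes to float and uses MulAdd; either way each f32 output lane is a[2i]*b[2i] + a[2i+1]*b[2i+1]. A scalar sketch, treating a bfloat16 value as the upper 16 bits of an IEEE binary32:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // bf16 -> f32: the stored 16 bits are the high half of a binary32.
    float BF16ToF32(uint16_t bits) {
      const uint32_t u = static_cast<uint32_t>(bits) << 16;
      float f;
      std::memcpy(&f, &u, sizeof(f));
      return f;
    }

    // One output lane: dot product of a pair of bf16 lanes from a and b.
    float WidenMulPairwiseAddLane(uint16_t a0, uint16_t a1, uint16_t b0, uint16_t b1) {
      return BF16ToF32(a0) * BF16ToF32(b0) + BF16ToF32(a1) * BF16ToF32(b1);
    }

    int main() {
      // 0x3F80 is 1.0 and 0x4000 is 2.0 in bfloat16.
      std::printf("%g\n", WidenMulPairwiseAddLane(0x3F80, 0x4000, 0x4000, 0x4000));  // 6
      return 0;
    }
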
@@ -10276,6 +10537,7 @@ X86ConvertScalarFromFloat(TF from_val) {
10276
10537
  return X86ConvertScalarFromFloat<TTo>(hwy::TypeTag<RemoveCvRef<TTo>>(),
10277
10538
  from_val);
10278
10539
  }
10540
+
10279
10541
  #endif // HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
10280
10542
 
10281
10543
  } // namespace detail
@@ -10288,7 +10550,9 @@ X86ConvertScalarFromFloat(TF from_val) {
10288
10550
 
10289
10551
  template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
10290
10552
  HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, VFromD<Rebind<double, D>> v) {
10291
- #if HWY_COMPILER_GCC_ACTUAL
10553
+ #if HWY_X86_HAVE_AVX10_2_OPS
10554
+ return VFromD<D>{_mm_cvtts_pd_epi32(v.raw)};
10555
+ #elif HWY_COMPILER_GCC_ACTUAL
10292
10556
  // Workaround for undefined behavior in _mm_cvttpd_epi32 with GCC if any
10293
10557
  // values of v[i] are not within the range of an int32_t
10294
10558
 
@@ -10325,7 +10589,9 @@ HWY_API VFromD<D> DemoteTo(D di32, VFromD<Rebind<double, D>> v) {
10325
10589
  #if HWY_TARGET <= HWY_AVX3
10326
10590
  template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
10327
10591
  HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, VFromD<Rebind<double, D>> v) {
10328
- #if HWY_COMPILER_GCC_ACTUAL
10592
+ #if HWY_X86_HAVE_AVX10_2_OPS
10593
+ return VFromD<D>{_mm_cvtts_pd_epu32(v.raw)};
10594
+ #elif HWY_COMPILER_GCC_ACTUAL
10329
10595
  // Workaround for undefined behavior in _mm_cvttpd_epu32 with GCC if any
10330
10596
  // values of v[i] are not within the range of an uint32_t
10331
10597
 
@@ -10353,8 +10619,12 @@ HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, VFromD<Rebind<double, D>> v) {
10353
10619
 
10354
10620
  // F64->U32 DemoteTo is generic for all vector lengths
10355
10621
  template <class D, HWY_IF_U32_D(D)>
10356
- HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<double, D>> v) {
10357
- return DemoteInRangeTo(D(), ZeroIfNegative(v));
10622
+ HWY_API VFromD<D> DemoteTo(D du32, VFromD<Rebind<double, D>> v) {
10623
+ #if HWY_X86_HAVE_AVX10_2_OPS
10624
+ return DemoteInRangeTo(du32, v);
10625
+ #else
10626
+ return DemoteInRangeTo(du32, ZeroIfNegative(v));
10627
+ #endif
10358
10628
  }
10359
10629
  #else // HWY_TARGET > HWY_AVX3
10360
10630
 
@@ -10482,7 +10752,9 @@ HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
10482
10752
  #if HWY_TARGET <= HWY_AVX3
10483
10753
  template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
10484
10754
  HWY_API VFromD<D> PromoteInRangeTo(D /*di64*/, VFromD<Rebind<float, D>> v) {
10485
- #if HWY_COMPILER_GCC_ACTUAL
10755
+ #if HWY_X86_HAVE_AVX10_2_OPS
10756
+ return VFromD<D>{_mm_cvtts_ps_epi64(v.raw)};
10757
+ #elif HWY_COMPILER_GCC_ACTUAL
10486
10758
  // Workaround for undefined behavior with GCC if any values of v[i] are not
10487
10759
  // within the range of an int64_t
10488
10760
 
@@ -10510,6 +10782,9 @@ HWY_API VFromD<D> PromoteInRangeTo(D /*di64*/, VFromD<Rebind<float, D>> v) {
10510
10782
  // Generic for all vector lengths.
10511
10783
  template <class D, HWY_IF_I64_D(D)>
10512
10784
  HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
10785
+ #if HWY_X86_HAVE_AVX10_2_OPS
10786
+ return PromoteInRangeTo(di64, v);
10787
+ #else
10513
10788
  const Rebind<float, decltype(di64)> df32;
10514
10789
  const RebindToFloat<decltype(di64)> df64;
10515
10790
  // We now avoid GCC UB in PromoteInRangeTo via assembly, see #2189 and
@@ -10522,14 +10797,21 @@ HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
10522
10797
  di64, PromoteMaskTo(df64, df32, Ge(v, Set(df32, 9.223372e18f))));
10523
10798
  return IfThenElse(overflow, Set(di64, LimitsMax<int64_t>()),
10524
10799
  PromoteInRangeTo(di64, v));
10800
+ #endif
10525
10801
  }
10526
10802
  template <class D, HWY_IF_U64_D(D)>
10527
- HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
10528
- return PromoteInRangeTo(D(), ZeroIfNegative(v));
10803
+ HWY_API VFromD<D> PromoteTo(D du64, VFromD<Rebind<float, D>> v) {
10804
+ #if HWY_X86_HAVE_AVX10_2_OPS
10805
+ return PromoteInRangeTo(du64, v);
10806
+ #else
10807
+ return PromoteInRangeTo(du64, ZeroIfNegative(v));
10808
+ #endif
10529
10809
  }
10530
10810
  template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
10531
10811
  HWY_API VFromD<D> PromoteInRangeTo(D /* tag */, VFromD<Rebind<float, D>> v) {
10532
- #if HWY_COMPILER_GCC_ACTUAL
10812
+ #if HWY_X86_HAVE_AVX10_2_OPS
10813
+ return VFromD<D>{_mm_cvtts_ps_epu64(v.raw)};
10814
+ #elif HWY_COMPILER_GCC_ACTUAL
10533
10815
  // Workaround for undefined behavior with GCC if any values of v[i] are not
10534
10816
  // within the range of an uint64_t
10535
10817
 
@@ -11208,7 +11490,9 @@ HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
11208
11490
 
11209
11491
  template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
11210
11492
  HWY_API VFromD<D> ConvertInRangeTo(D /*di*/, VFromD<RebindToFloat<D>> v) {
11211
- #if HWY_COMPILER_GCC_ACTUAL
11493
+ #if HWY_X86_HAVE_AVX10_2_OPS
11494
+ return VFromD<D>{_mm_cvtts_ps_epi32(v.raw)};
11495
+ #elif HWY_COMPILER_GCC_ACTUAL
11212
11496
  // Workaround for undefined behavior in _mm_cvttps_epi32 with GCC if any
11213
11497
  // values of v[i] are not within the range of an int32_t
11214
11498
 
@@ -11238,17 +11522,23 @@ HWY_API VFromD<D> ConvertInRangeTo(D /*di*/, VFromD<RebindToFloat<D>> v) {
11238
11522
  // F32 to I32 ConvertTo is generic for all vector lengths
11239
11523
  template <class D, HWY_IF_I32_D(D)>
11240
11524
  HWY_API VFromD<D> ConvertTo(D di, VFromD<RebindToFloat<D>> v) {
11525
+ #if HWY_X86_HAVE_AVX10_2_OPS
11526
+ return ConvertInRangeTo(di, v);
11527
+ #else
11241
11528
  const RebindToFloat<decltype(di)> df;
11242
11529
  // See comment at the first occurrence of "IfThenElse(overflow,".
11243
11530
  const MFromD<D> overflow = RebindMask(di, Ge(v, Set(df, 2147483648.0f)));
11244
11531
  return IfThenElse(overflow, Set(di, LimitsMax<int32_t>()),
11245
11532
  ConvertInRangeTo(di, v));
11533
+ #endif
11246
11534
  }
11247
11535
 
11248
11536
  #if HWY_TARGET <= HWY_AVX3
11249
11537
  template <class DI, HWY_IF_V_SIZE_LE_D(DI, 16), HWY_IF_I64_D(DI)>
11250
11538
  HWY_API VFromD<DI> ConvertInRangeTo(DI /*di*/, VFromD<RebindToFloat<DI>> v) {
11251
- #if HWY_COMPILER_GCC_ACTUAL
11539
+ #if HWY_X86_HAVE_AVX10_2_OPS
11540
+ return VFromD<DI>{_mm_cvtts_pd_epi64(v.raw)};
11541
+ #elif HWY_COMPILER_GCC_ACTUAL
11252
11542
  // Workaround for undefined behavior in _mm_cvttpd_epi64 with GCC if any
11253
11543
  // values of v[i] are not within the range of an int64_t
11254
11544
 
@@ -11276,17 +11566,23 @@ HWY_API VFromD<DI> ConvertInRangeTo(DI /*di*/, VFromD<RebindToFloat<DI>> v) {
11276
11566
  // F64 to I64 ConvertTo is generic for all vector lengths on AVX3
11277
11567
  template <class DI, HWY_IF_I64_D(DI)>
11278
11568
  HWY_API VFromD<DI> ConvertTo(DI di, VFromD<RebindToFloat<DI>> v) {
11569
+ #if HWY_X86_HAVE_AVX10_2_OPS
11570
+ return ConvertInRangeTo(di, v);
11571
+ #else
11279
11572
  const RebindToFloat<decltype(di)> df;
11280
11573
  // See comment at the first occurrence of "IfThenElse(overflow,".
11281
11574
  const MFromD<DI> overflow =
11282
11575
  RebindMask(di, Ge(v, Set(df, 9.223372036854776e18)));
11283
11576
  return IfThenElse(overflow, Set(di, LimitsMax<int64_t>()),
11284
11577
  ConvertInRangeTo(di, v));
11578
+ #endif
11285
11579
  }
11286
11580
 
11287
11581
  template <class DU, HWY_IF_V_SIZE_LE_D(DU, 16), HWY_IF_U32_D(DU)>
11288
11582
  HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
11289
- #if HWY_COMPILER_GCC_ACTUAL
11583
+ #if HWY_X86_HAVE_AVX10_2_OPS
11584
+ return VFromD<DU>{_mm_cvtts_ps_epu32(v.raw)};
11585
+ #elif HWY_COMPILER_GCC_ACTUAL
11290
11586
  // Workaround for undefined behavior in _mm_cvttps_epu32 with GCC if any
11291
11587
  // values of v[i] are not within the range of an uint32_t
11292
11588
 
@@ -11315,13 +11611,19 @@ HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
11315
11611
 
11316
11612
  // F32->U32 ConvertTo is generic for all vector lengths
11317
11613
  template <class DU, HWY_IF_U32_D(DU)>
11318
- HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
11319
- return ConvertInRangeTo(DU(), ZeroIfNegative(v));
11614
+ HWY_API VFromD<DU> ConvertTo(DU du32, VFromD<RebindToFloat<DU>> v) {
11615
+ #if HWY_X86_HAVE_AVX10_2_OPS
11616
+ return ConvertInRangeTo(du32, v);
11617
+ #else
11618
+ return ConvertInRangeTo(du32, ZeroIfNegative(v));
11619
+ #endif
11320
11620
  }
11321
11621
 
11322
11622
  template <class DU, HWY_IF_V_SIZE_LE_D(DU, 16), HWY_IF_U64_D(DU)>
11323
11623
  HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
11324
- #if HWY_COMPILER_GCC_ACTUAL
11624
+ #if HWY_X86_HAVE_AVX10_2_OPS
11625
+ return VFromD<DU>{_mm_cvtts_pd_epu64(v.raw)};
11626
+ #elif HWY_COMPILER_GCC_ACTUAL
11325
11627
  // Workaround for undefined behavior in _mm_cvttpd_epu64 with GCC if any
11326
11628
  // values of v[i] are not within the range of an uint64_t
11327
11629
 
@@ -11348,8 +11650,12 @@ HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
11348
11650
 
11349
11651
  // F64->U64 ConvertTo is generic for all vector lengths
11350
11652
  template <class DU, HWY_IF_U64_D(DU)>
11351
- HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
11352
- return ConvertInRangeTo(DU(), ZeroIfNegative(v));
11653
+ HWY_API VFromD<DU> ConvertTo(DU du64, VFromD<RebindToFloat<DU>> v) {
11654
+ #if HWY_X86_HAVE_AVX10_2_OPS
11655
+ return ConvertInRangeTo(du64, v);
11656
+ #else
11657
+ return ConvertInRangeTo(du64, ZeroIfNegative(v));
11658
+ #endif
11353
11659
  }
11354
11660
 
11355
11661
  #else // AVX2 or below
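
The conversion hunks above all follow one pattern: on AVX10.2 the new saturating _mm_cvtts_* intrinsics already clamp out-of-range inputs, so ConvertTo/DemoteTo/PromoteTo can simply forward to the InRange variants; on older targets the wrappers keep the manual fixups, mapping large positive values to LimitsMax and, for unsigned destinations, zeroing negative inputs first. A scalar sketch of the saturating float-to-int32 behavior for non-NaN inputs (NaN handling is target-specific and left aside):

    #include <cstdint>
    #include <cstdio>
    #include <limits>

    int32_t SaturatingConvertToI32(float v) {
      if (v >= 2147483648.0f) return std::numeric_limits<int32_t>::max();
      if (v <= -2147483648.0f) return std::numeric_limits<int32_t>::min();
      return static_cast<int32_t>(v);  // in range: truncate toward zero
    }

    int main() {
      std::printf("%d %d\n", SaturatingConvertToI32(3e9f), SaturatingConvertToI32(-1.9f));
      // 2147483647 -1
      return 0;
    }
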
@@ -11620,7 +11926,8 @@ X86ScalarNearestInt(TF flt_val) {
11620
11926
 
11621
11927
  // If these are in namespace detail, the x86_256/512 templates are not found.
11622
11928
  template <class DI, HWY_IF_V_SIZE_LE_D(DI, 16), HWY_IF_I32_D(DI)>
11623
- HWY_INLINE VFromD<DI> NearestIntInRange(DI, VFromD<RebindToFloat<DI>> v) {
11929
+ static HWY_INLINE VFromD<DI> NearestIntInRange(DI,
11930
+ VFromD<RebindToFloat<DI>> v) {
11624
11931
  #if HWY_COMPILER_GCC_ACTUAL
11625
11932
  // Workaround for undefined behavior in _mm_cvtps_epi32 with GCC if any values
11626
11933
  // of v[i] are not within the range of an int32_t
@@ -11648,17 +11955,229 @@ HWY_INLINE VFromD<DI> NearestIntInRange(DI, VFromD<RebindToFloat<DI>> v) {
11648
11955
  #endif
11649
11956
  }
11650
11957
 
11651
- // Generic for all vector lengths.
11958
+ #if HWY_HAVE_FLOAT16
11959
+ template <class DI, HWY_IF_V_SIZE_LE_D(DI, 16), HWY_IF_I16_D(DI)>
11960
+ static HWY_INLINE VFromD<DI> NearestIntInRange(DI /*di*/,
11961
+ VFromD<RebindToFloat<DI>> v) {
11962
+ #if HWY_COMPILER_GCC_ACTUAL
11963
+ // Workaround for undefined behavior in _mm_cvtph_epi16 if any values of v[i]
11964
+ // are not within the range of an int16_t
11965
+
11966
+ #if HWY_COMPILER_GCC_ACTUAL >= 1200 && !HWY_IS_DEBUG_BUILD && \
11967
+ HWY_HAVE_SCALAR_F16_TYPE
11968
+ if (detail::IsConstantX86VecForF2IConv<int16_t>(v)) {
11969
+ typedef hwy::float16_t::Native GccF16RawVectType
11970
+ __attribute__((__vector_size__(16)));
11971
+ const auto raw_v = reinterpret_cast<GccF16RawVectType>(v.raw);
11972
+ return Dup128VecFromValues(DI(),
11973
+ detail::X86ScalarNearestInt<int16_t>(raw_v[0]),
11974
+ detail::X86ScalarNearestInt<int16_t>(raw_v[1]),
11975
+ detail::X86ScalarNearestInt<int16_t>(raw_v[2]),
11976
+ detail::X86ScalarNearestInt<int16_t>(raw_v[3]),
11977
+ detail::X86ScalarNearestInt<int16_t>(raw_v[4]),
11978
+ detail::X86ScalarNearestInt<int16_t>(raw_v[5]),
11979
+ detail::X86ScalarNearestInt<int16_t>(raw_v[6]),
11980
+ detail::X86ScalarNearestInt<int16_t>(raw_v[7]));
11981
+ }
11982
+ #endif
11983
+
11984
+ __m128i raw_result;
11985
+ __asm__("vcvtph2w {%1, %0|%0, %1}"
11986
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
11987
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
11988
+ :);
11989
+ return VFromD<DI>{raw_result};
11990
+ #else // !HWY_COMPILER_GCC_ACTUAL
11991
+ return VFromD<DI>{_mm_cvtph_epi16(v.raw)};
11992
+ #endif
11993
+ }
11994
+ #endif // HWY_HAVE_FLOAT16
11995
+
11996
+ #if HWY_TARGET <= HWY_AVX3
11997
+
11998
+ template <class DI, HWY_IF_V_SIZE_LE_D(DI, 16), HWY_IF_I64_D(DI)>
11999
+ static HWY_INLINE VFromD<DI> NearestIntInRange(DI /*di*/,
12000
+ VFromD<RebindToFloat<DI>> v) {
12001
+ #if HWY_COMPILER_GCC_ACTUAL
12002
+ // Workaround for undefined behavior in _mm_cvtpd_epi64 with GCC if any
12003
+ // values of v[i] are not within the range of an int64_t
12004
+
12005
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
12006
+ if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) {
12007
+ typedef double GccF64RawVectType __attribute__((__vector_size__(16)));
12008
+ const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
12009
+ return Dup128VecFromValues(DI(),
12010
+ detail::X86ScalarNearestInt<int64_t>(raw_v[0]),
12011
+ detail::X86ScalarNearestInt<int64_t>(raw_v[1]));
12012
+ }
12013
+ #endif
12014
+
12015
+ __m128i raw_result;
12016
+ __asm__("vcvtpd2qq {%1, %0|%0, %1}"
12017
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
12018
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
12019
+ :);
12020
+ return VFromD<DI>{raw_result};
12021
+ #else // !HWY_COMPILER_GCC_ACTUAL
12022
+ return VFromD<DI>{_mm_cvtpd_epi64(v.raw)};
12023
+ #endif
12024
+ }
12025
+
12026
+ #else // HWY_TARGET > HWY_AVX3
12027
+
12028
+ namespace detail {
12029
+
12030
+ #if HWY_ARCH_X86_64
12031
+ template <size_t N>
12032
+ static HWY_INLINE int64_t
12033
+ SSE2ConvFirstF64LaneToNearestI64(Vec128<double, N> v) {
12034
+ #if HWY_COMPILER_GCC_ACTUAL
12035
+ // Workaround for undefined behavior in _mm_cvtsd_si64 with GCC if v[0] is
12036
+ // not within the range of an int64_t
12037
+
12038
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
12039
+ if (IsConstantX86Vec(hwy::SizeTag<1>(), v)) {
12040
+ typedef double GccF64RawVectType __attribute__((__vector_size__(16)));
12041
+ const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
12042
+ return X86ScalarNearestInt<int64_t>(raw_v[0]);
12043
+ }
12044
+ #endif
12045
+
12046
+ int64_t result;
12047
+ __asm__("%vcvtsd2si {%1, %0|%0, %1}"
12048
+ : "=r"(result)
12049
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
12050
+ :);
12051
+ return result;
12052
+ #else
12053
+ return _mm_cvtsd_si64(v.raw);
12054
+ #endif
12055
+ }
12056
+ #endif // HWY_ARCH_X86_64
12057
+
12058
+ #if !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2
12059
+ template <class DI64, HWY_IF_I64_D(DI64)>
12060
+ static HWY_INLINE VFromD<DI64> SSE2NearestI64InRange(
12061
+ DI64 di64, VFromD<RebindToFloat<DI64>> v) {
12062
+ const RebindToFloat<DI64> df64;
12063
+ const RebindToUnsigned<DI64> du64;
12064
+ using VI64 = VFromD<decltype(di64)>;
12065
+
12066
+ const auto mant_end = Set(df64, MantissaEnd<double>());
12067
+ const auto is_small = Lt(Abs(v), mant_end);
12068
+
12069
+ const auto adj_v = Max(v, Set(df64, -9223372036854775808.0)) +
12070
+ IfThenElseZero(is_small, CopySignToAbs(mant_end, v));
12071
+ const auto adj_v_biased_exp =
12072
+ And(BitCast(di64, ShiftRight<52>(BitCast(du64, adj_v))),
12073
+ Set(di64, int64_t{0x7FF}));
12074
+
12075
+ // We can simply subtract 1075 from adj_v_biased_exp[i] to get shift_int since
12076
+ // adj_v_biased_exp[i] is at least 1075
12077
+ const VI64 shift_int = adj_v_biased_exp + Set(di64, int64_t{-1075});
12078
+
12079
+ const VI64 mantissa = BitCast(di64, adj_v) & Set(di64, (1LL << 52) - 1);
12080
+ // Include implicit 1-bit if is_small[i] is 0. NOTE: the shift count may
12081
+ // exceed 63; we rely on x86 returning zero in that case.
12082
+ const VI64 int53 = mantissa | IfThenZeroElse(RebindMask(di64, is_small),
12083
+ Set(di64, 1LL << 52));
12084
+
12085
+ const VI64 sign_mask = BroadcastSignBit(BitCast(di64, v));
12086
+ // If the input was negative, negate the integer (two's complement).
12087
+ return ((int53 << shift_int) ^ sign_mask) - sign_mask;
12088
+ }
12089
+ #endif // !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2
12090
+
12091
+ } // namespace detail
12092
+
12093
+ #if HWY_ARCH_X86_64
12094
+ template <class DI, HWY_IF_V_SIZE_D(DI, 8), HWY_IF_I64_D(DI)>
12095
+ static HWY_INLINE VFromD<DI> NearestIntInRange(DI /*di*/, Vec64<double> v) {
12096
+ return VFromD<DI>{
12097
+ _mm_cvtsi64_si128(detail::SSE2ConvFirstF64LaneToNearestI64(v))};
12098
+ }
12099
+ template <class DI, HWY_IF_V_SIZE_D(DI, 16), HWY_IF_I64_D(DI)>
12100
+ static HWY_INLINE VFromD<DI> NearestIntInRange(DI /*di*/, Vec128<double> v) {
12101
+ const __m128i i0 =
12102
+ _mm_cvtsi64_si128(detail::SSE2ConvFirstF64LaneToNearestI64(v));
12103
+ const Full64<double> dd2;
12104
+ const __m128i i1 = _mm_cvtsi64_si128(
12105
+ detail::SSE2ConvFirstF64LaneToNearestI64(UpperHalf(dd2, v)));
12106
+ return VFromD<DI>{_mm_unpacklo_epi64(i0, i1)};
12107
+ }
12108
+ #endif // HWY_ARCH_X86_64
12109
+
12110
+ #if !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2
12111
+ template <class DI, HWY_IF_V_SIZE_GT_D(DI, (HWY_ARCH_X86_64 ? 16 : 0)),
12112
+ HWY_IF_I64_D(DI)>
12113
+ static HWY_INLINE VFromD<DI> NearestIntInRange(DI di,
12114
+ VFromD<RebindToFloat<DI>> v) {
12115
+ return detail::SSE2NearestI64InRange(di, v);
12116
+ }
12117
+ #endif // !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2
12118
+
12119
+ #endif // HWY_TARGET <= HWY_AVX3
12120
+
12121
+ template <class DI, HWY_IF_V_SIZE_LE_D(DI, 8), HWY_IF_I32_D(DI)>
12122
+ static HWY_INLINE VFromD<DI> DemoteToNearestIntInRange(
12123
+ DI, VFromD<Rebind<double, DI>> v) {
12124
+ #if HWY_COMPILER_GCC_ACTUAL
12125
+ // Workaround for undefined behavior in _mm_cvtpd_epi32 with GCC if any values
12126
+ // of v[i] are not within the range of an int32_t
12127
+
12128
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
12129
+ if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
12130
+ typedef double GccF32RawVectType __attribute__((__vector_size__(16)));
12131
+ const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
12132
+ return Dup128VecFromValues(
12133
+ DI(), detail::X86ScalarNearestInt<int32_t>(raw_v[0]),
12134
+ detail::X86ScalarNearestInt<int32_t>(raw_v[1]), int32_t{0}, int32_t{0});
12135
+ }
12136
+ #endif
12137
+
12138
+ __m128i raw_result;
12139
+ __asm__("%vcvtpd2dq {%1, %0|%0, %1}"
12140
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
12141
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
12142
+ :);
12143
+ return VFromD<DI>{raw_result};
12144
+ #else // !HWY_COMPILER_GCC_ACTUAL
12145
+ return VFromD<DI>{_mm_cvtpd_epi32(v.raw)};
12146
+ #endif
12147
+ }
12148
+
12149
+ // F16/F32/F64 NearestInt is generic for all vector lengths
11652
12150
  template <class VF, class DF = DFromV<VF>, class DI = RebindToSigned<DF>,
11653
- HWY_IF_F32_D(DF)>
12151
+ HWY_IF_FLOAT_D(DF),
12152
+ HWY_IF_T_SIZE_ONE_OF_D(DF, (1 << 4) | (1 << 8) |
12153
+ (HWY_HAVE_FLOAT16 ? (1 << 2) : 0))>
11654
12154
  HWY_API VFromD<DI> NearestInt(const VF v) {
11655
12155
  const DI di;
12156
+ using TI = TFromD<DI>;
12157
+ using TF = TFromD<DF>;
12158
+ using TFArith = If<sizeof(TF) <= sizeof(float), float, RemoveCvRef<TF>>;
12159
+
12160
+ constexpr TFArith kMinOutOfRangePosVal =
12161
+ static_cast<TFArith>(-static_cast<TFArith>(LimitsMin<TI>()));
12162
+ static_assert(kMinOutOfRangePosVal > static_cast<TFArith>(0.0),
12163
+ "kMinOutOfRangePosVal > 0.0 must be true");
12164
+
11656
12165
  // See comment at the first occurrence of "IfThenElse(overflow,".
11657
12166
  // Here we are rounding, whereas previous occurrences truncate, but there is
11658
12167
  // no difference because the previous float value is well below the max i32.
11659
- const auto overflow = RebindMask(di, Ge(v, Set(DF(), 2147483648.0f)));
11660
- return IfThenElse(overflow, Set(di, LimitsMax<int32_t>()),
11661
- NearestIntInRange(di, v));
12168
+ const auto overflow = RebindMask(
12169
+ di, Ge(v, Set(DF(), ConvertScalarTo<TF>(kMinOutOfRangePosVal))));
12170
+ auto result =
12171
+ IfThenElse(overflow, Set(di, LimitsMax<TI>()), NearestIntInRange(di, v));
12172
+
12173
+ return result;
12174
+ }
12175
+
12176
+ template <class DI, HWY_IF_I32_D(DI)>
12177
+ HWY_API VFromD<DI> DemoteToNearestInt(DI, VFromD<Rebind<double, DI>> v) {
12178
+ const DI di;
12179
+ const Rebind<double, DI> df64;
12180
+ return DemoteToNearestIntInRange(di, Min(v, Set(df64, 2147483647.0)));
11662
12181
  }
11663
12182
 
11664
12183
  // ------------------------------ Floating-point rounding (ConvertTo)
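
The large NearestInt hunk above generalizes the wrapper from f32-to-i32 to the other supported float/int lane sizes: NearestIntInRange does the raw conversion (round to nearest, ties to even, matching the default x86 rounding mode), and the generic NearestInt clamps the positive overflow case to LimitsMax of the target integer type. A scalar sketch of that wrapper for the f32-to-i32 case:

    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <limits>

    // Round to nearest (ties to even under the default rounding mode), with the
    // positive overflow case saturated as the generic wrapper does.
    int32_t NearestIntI32(float v) {
      if (v >= 2147483648.0f) return std::numeric_limits<int32_t>::max();
      return static_cast<int32_t>(std::nearbyintf(v));
    }

    int main() {
      std::printf("%d %d\n", NearestIntI32(2.5f), NearestIntI32(3.5f));  // 2 4
      return 0;
    }
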
@@ -11724,6 +12243,25 @@ HWY_API Vec128<T, N> Ceil(const Vec128<T, N> v) {
11724
12243
  return IfThenElse(detail::UseInt(v), int_f - neg1, v);
11725
12244
  }
11726
12245
 
12246
+ #ifdef HWY_NATIVE_CEIL_FLOOR_INT
12247
+ #undef HWY_NATIVE_CEIL_FLOOR_INT
12248
+ #else
12249
+ #define HWY_NATIVE_CEIL_FLOOR_INT
12250
+ #endif
12251
+
12252
+ template <class V, HWY_IF_FLOAT_V(V)>
12253
+ HWY_API VFromD<RebindToSigned<DFromV<V>>> CeilInt(V v) {
12254
+ const DFromV<decltype(v)> df;
12255
+ const RebindToSigned<decltype(df)> di;
12256
+
12257
+ const auto integer = ConvertTo(di, v); // round toward 0
12258
+ const auto int_f = ConvertTo(df, integer);
12259
+
12260
+ // Truncating a positive non-integer ends up smaller; if so, add 1.
12261
+ return integer -
12262
+ VecFromMask(di, RebindMask(di, And(detail::UseInt(v), int_f < v)));
12263
+ }
12264
+
11727
12265
  // Toward -infinity, aka floor
11728
12266
  template <typename T, size_t N>
11729
12267
  HWY_API Vec128<T, N> Floor(const Vec128<T, N> v) {
@@ -11740,6 +12278,19 @@ HWY_API Vec128<T, N> Floor(const Vec128<T, N> v) {
11740
12278
  return IfThenElse(detail::UseInt(v), int_f + neg1, v);
11741
12279
  }
11742
12280
 
12281
+ template <class V, HWY_IF_FLOAT_V(V)>
12282
+ HWY_API VFromD<RebindToSigned<DFromV<V>>> FloorInt(V v) {
12283
+ const DFromV<decltype(v)> df;
12284
+ const RebindToSigned<decltype(df)> di;
12285
+
12286
+ const auto integer = ConvertTo(di, v); // round toward 0
12287
+ const auto int_f = ConvertTo(df, integer);
12288
+
12289
+ // Truncating a negative non-integer ends up larger; if so, subtract 1.
12290
+ return integer +
12291
+ VecFromMask(di, RebindMask(di, And(detail::UseInt(v), int_f > v)));
12292
+ }
12293
+
11743
12294
  #else
11744
12295
 
11745
12296
  // Toward nearest integer, ties to even
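
The CeilInt/FloorInt additions above (SSE2 path, announced via HWY_NATIVE_CEIL_FLOOR_INT) start from the truncating ConvertTo and then correct by one lane-wise wherever truncation moved the value in the wrong direction; the detail::UseInt guard restricts the correction to inputs small enough to be exactly representable. A scalar sketch of the correction itself:

    #include <cstdint>
    #include <cstdio>

    // Ceil/floor to integer via truncation plus a one-step correction,
    // for inputs within the target integer range.
    int64_t CeilIntScalar(double v) {
      const int64_t t = static_cast<int64_t>(v);  // truncates toward zero
      return t + ((static_cast<double>(t) < v) ? 1 : 0);
    }
    int64_t FloorIntScalar(double v) {
      const int64_t t = static_cast<int64_t>(v);
      return t - ((static_cast<double>(t) > v) ? 1 : 0);
    }

    int main() {
      std::printf("%lld %lld\n",
                  static_cast<long long>(CeilIntScalar(2.1)),
                  static_cast<long long>(FloorIntScalar(-2.1)));  // 3 -3
      return 0;
    }
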
@@ -12117,8 +12668,27 @@ struct CompressIsPartition {
12117
12668
  #endif
12118
12669
  };
12119
12670
 
12671
+ namespace detail {
12672
+
12673
+ // Returns `mask_bits` (from movemask) with the upper bits cleared, if there
12674
+ // are 8 or fewer valid bits.
12675
+ template <class D>
12676
+ constexpr uint64_t OnlyActive(D d, uint64_t mask_bits) {
12677
+ return (d.MaxBytes() >= 16) ? mask_bits
12678
+ : mask_bits & ((1ull << d.MaxLanes()) - 1);
12679
+ }
12680
+
12681
+ } // namespace detail
12682
+
12120
12683
  #if HWY_TARGET <= HWY_AVX3
12121
12684
 
12685
+ // ------------------------------ BitsFromMask (MFromD, OnlyActive)
12686
+ // Generic for all vector lengths.
12687
+ template <class D>
12688
+ HWY_INLINE uint64_t BitsFromMask(D d, MFromD<D> mask) {
12689
+ return detail::OnlyActive(d, mask.raw);
12690
+ }
12691
+
12122
12692
  // ------------------------------ StoreMaskBits
12123
12693
 
12124
12694
  // `p` points to at least 8 writable bytes.
@@ -12238,14 +12808,16 @@ HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
12238
12808
  alignas(16) static constexpr uint64_t packed_array[16] = {
12239
12809
  0x00000010, 0x00000001, 0x00000010, 0x00000010};
12240
12810
 
12241
- // For lane i, shift the i-th 4-bit index down to bits [0, 2) -
12242
- // _mm_permutexvar_epi64 will ignore the upper bits.
12811
+ // For lane i, shift the i-th 4-bit index down to bits [0, 2).
12243
12812
  const DFromV<decltype(v)> d;
12244
12813
  const RebindToUnsigned<decltype(d)> du64;
12245
12814
  const auto packed = Set(du64, packed_array[mask.raw]);
12246
- alignas(16) static constexpr uint64_t shifts[2] = {0, 4};
12247
- const auto indices = Indices128<T>{(packed >> Load(du64, shifts)).raw};
12248
- return TableLookupLanes(v, indices);
12815
+ alignas(16) static constexpr uint64_t kShifts[2] = {0, 4};
12816
+ Vec128<uint64_t> indices = packed >> Load(du64, kShifts);
12817
+ // _mm_permutevar_pd will ignore the upper bits, but TableLookupLanes uses
12818
+ // a fallback in MSAN builds, so mask there.
12819
+ HWY_IF_CONSTEXPR(HWY_IS_MSAN) indices &= Set(du64, 1);
12820
+ return TableLookupLanes(v, Indices128<T>{indices.raw});
12249
12821
  }
12250
12822
 
12251
12823
  // ------------------------------ CompressBlocksNot
@@ -12256,42 +12828,13 @@ HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
12256
12828
 
12257
12829
  // ------------------------------ CompressStore (defined in x86_512)
12258
12830
 
12259
- // ------------------------------ CompressBlendedStore (CompressStore)
12260
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
12261
- HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
12262
- TFromD<D>* HWY_RESTRICT unaligned) {
12263
- // AVX-512 already does the blending at no extra cost (latency 11,
12264
- // rthroughput 2 - same as compress plus store).
12265
- if (HWY_TARGET == HWY_AVX3_DL ||
12266
- (HWY_TARGET != HWY_AVX3_ZEN4 && sizeof(TFromD<D>) > 2)) {
12267
- // We're relying on the mask to blend. Clear the undefined upper bits.
12268
- constexpr size_t kN = MaxLanes(d);
12269
- if (kN != 16 / sizeof(TFromD<D>)) {
12270
- m = And(m, FirstN(d, kN));
12271
- }
12272
- return CompressStore(v, m, d, unaligned);
12273
- } else {
12274
- const size_t count = CountTrue(d, m);
12275
- const VFromD<D> compressed = Compress(v, m);
12276
- #if HWY_MEM_OPS_MIGHT_FAULT
12277
- // BlendedStore tests mask for each lane, but we know that the mask is
12278
- // FirstN, so we can just copy.
12279
- alignas(16) TFromD<D> buf[MaxLanes(d)];
12280
- Store(compressed, d, buf);
12281
- CopyBytes(buf, unaligned, count * sizeof(TFromD<D>));
12282
- #else
12283
- BlendedStore(compressed, FirstN(d, count), d, unaligned);
12284
- #endif
12285
- detail::MaybeUnpoison(unaligned, count);
12286
- return count;
12287
- }
12288
- }
12831
+ // ------------------------------ CompressBlendedStore (defined in x86_avx3)
12289
12832
 
12290
12833
  // ------------------------------ CompressBitsStore (defined in x86_512)
12291
12834
 
12292
12835
  #else // AVX2 or below
12293
12836
 
12294
- // ------------------------------ StoreMaskBits
12837
+ // ------------------------------ BitsFromMask
12295
12838
 
12296
12839
  namespace detail {
12297
12840
 
@@ -12299,50 +12842,45 @@ constexpr HWY_INLINE uint64_t U64FromInt(int mask_bits) {
12299
12842
  return static_cast<uint64_t>(static_cast<unsigned>(mask_bits));
12300
12843
  }
12301
12844
 
12302
- template <typename T, size_t N>
12303
- HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
12304
- const Mask128<T, N> mask) {
12305
- const Simd<T, N, 0> d;
12845
+ } // namespace detail
12846
+
12847
+ template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_LE_D(D, 16)>
12848
+ HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
12306
12849
  const auto sign_bits = BitCast(d, VecFromMask(d, mask)).raw;
12307
- return U64FromInt(_mm_movemask_epi8(sign_bits));
12850
+ return detail::OnlyActive(d,
12851
+ detail::U64FromInt(_mm_movemask_epi8(sign_bits)));
12308
12852
  }
12309
12853
 
12310
- template <typename T, size_t N>
12311
- HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/,
12312
- const Mask128<T, N> mask) {
12854
+ template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_LE_D(D, 16)>
12855
+ HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
12313
12856
  // Remove useless lower half of each u16 while preserving the sign bit.
12314
12857
  const auto sign_bits = _mm_packs_epi16(mask.raw, _mm_setzero_si128());
12315
- return U64FromInt(_mm_movemask_epi8(sign_bits));
12858
+ return detail::OnlyActive(d,
12859
+ detail::U64FromInt(_mm_movemask_epi8(sign_bits)));
12316
12860
  }
12317
12861
 
12318
- template <typename T, size_t N>
12319
- HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128<T, N> mask) {
12320
- const Simd<T, N, 0> d;
12321
- const Simd<float, N, 0> df;
12862
+ template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_LE_D(D, 16)>
12863
+ HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
12864
+ const RebindToFloat<decltype(d)> df;
12322
12865
  const auto sign_bits = BitCast(df, VecFromMask(d, mask));
12323
- return U64FromInt(_mm_movemask_ps(sign_bits.raw));
12866
+ return detail::OnlyActive(d,
12867
+ detail::U64FromInt(_mm_movemask_ps(sign_bits.raw)));
12324
12868
  }
12325
12869
 
12326
- template <typename T, size_t N>
12327
- HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128<T, N> mask) {
12328
- const Simd<T, N, 0> d;
12329
- const Simd<double, N, 0> df;
12870
+ template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_LE_D(D, 16)>
12871
+ HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
12872
+ const RebindToFloat<D> df;
12330
12873
  const auto sign_bits = BitCast(df, VecFromMask(d, mask));
12331
- return U64FromInt(_mm_movemask_pd(sign_bits.raw));
12332
- }
12333
-
12334
- template <typename T, size_t N>
12335
- HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) {
12336
- return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
12874
+ return detail::OnlyActive(d,
12875
+ detail::U64FromInt(_mm_movemask_pd(sign_bits.raw)));
12337
12876
  }
12338
12877
 
12339
- } // namespace detail
12340
-
12878
+ // ------------------------------ StoreMaskBits
12341
12879
  // `p` points to at least 8 writable bytes.
12342
12880
  template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
12343
12881
  HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
12344
12882
  constexpr size_t kNumBytes = (MaxLanes(d) + 7) / 8;
12345
- const uint64_t mask_bits = detail::BitsFromMask(mask);
12883
+ const uint64_t mask_bits = BitsFromMask(d, mask);
12346
12884
  CopyBytes<kNumBytes>(&mask_bits, bits);
12347
12885
  return kNumBytes;
12348
12886
  }
@@ -12350,43 +12888,43 @@ HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
12350
12888
  // ------------------------------ Mask testing
12351
12889
 
12352
12890
  template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
12353
- HWY_API bool AllFalse(D /* tag */, MFromD<D> mask) {
12891
+ HWY_API bool AllFalse(D d, MFromD<D> mask) {
12354
12892
  // Cheaper than PTEST, which is 2 uop / 3L.
12355
- return detail::BitsFromMask(mask) == 0;
12893
+ return BitsFromMask(d, mask) == 0;
12356
12894
  }
12357
12895
 
12358
12896
  template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
12359
12897
  HWY_API bool AllTrue(D d, MFromD<D> mask) {
12360
12898
  constexpr uint64_t kAllBits = (1ull << MaxLanes(d)) - 1;
12361
- return detail::BitsFromMask(mask) == kAllBits;
12899
+ return BitsFromMask(d, mask) == kAllBits;
12362
12900
  }
12363
12901
 
12364
12902
  template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
12365
- HWY_API size_t CountTrue(D /* tag */, MFromD<D> mask) {
12366
- return PopCount(detail::BitsFromMask(mask));
12903
+ HWY_API size_t CountTrue(D d, MFromD<D> mask) {
12904
+ return PopCount(BitsFromMask(d, mask));
12367
12905
  }
12368
12906
 
12369
12907
  template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
12370
- HWY_API size_t FindKnownFirstTrue(D /* tag */, MFromD<D> mask) {
12908
+ HWY_API size_t FindKnownFirstTrue(D d, MFromD<D> mask) {
12371
12909
  return Num0BitsBelowLS1Bit_Nonzero32(
12372
- static_cast<uint32_t>(detail::BitsFromMask(mask)));
12910
+ static_cast<uint32_t>(BitsFromMask(d, mask)));
12373
12911
  }
12374
12912
 
12375
12913
  template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
12376
- HWY_API intptr_t FindFirstTrue(D /* tag */, MFromD<D> mask) {
12377
- const uint32_t mask_bits = static_cast<uint32_t>(detail::BitsFromMask(mask));
12914
+ HWY_API intptr_t FindFirstTrue(D d, MFromD<D> mask) {
12915
+ const uint32_t mask_bits = static_cast<uint32_t>(BitsFromMask(d, mask));
12378
12916
  return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1;
12379
12917
  }
12380
12918
 
12381
12919
  template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
12382
- HWY_API size_t FindKnownLastTrue(D /* tag */, MFromD<D> mask) {
12920
+ HWY_API size_t FindKnownLastTrue(D d, MFromD<D> mask) {
12383
12921
  return 31 - Num0BitsAboveMS1Bit_Nonzero32(
12384
- static_cast<uint32_t>(detail::BitsFromMask(mask)));
12922
+ static_cast<uint32_t>(BitsFromMask(d, mask)));
12385
12923
  }
12386
12924
 
12387
12925
  template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
12388
- HWY_API intptr_t FindLastTrue(D /* tag */, MFromD<D> mask) {
12389
- const uint32_t mask_bits = static_cast<uint32_t>(detail::BitsFromMask(mask));
12926
+ HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) {
12927
+ const uint32_t mask_bits = static_cast<uint32_t>(BitsFromMask(d, mask));
12390
12928
  return mask_bits ? intptr_t(31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits))
12391
12929
  : -1;
12392
12930
  }
@@ -12828,7 +13366,8 @@ HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) {
12828
13366
  // General case, 2 or 4 bytes
12829
13367
  template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
12830
13368
  HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
12831
- return detail::CompressBits(v, detail::BitsFromMask(mask));
13369
+ const DFromV<decltype(v)> d;
13370
+ return detail::CompressBits(v, BitsFromMask(d, mask));
12832
13371
  }
12833
13372
 
12834
13373
  // ------------------------------ CompressNot
@@ -12853,12 +13392,13 @@ HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
12853
13392
 
12854
13393
  template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
12855
13394
  HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
13395
+ const DFromV<decltype(v)> d;
12856
13396
  // For partial vectors, we cannot pull the Not() into the table because
12857
13397
  // BitsFromMask clears the upper bits.
12858
13398
  if (N < 16 / sizeof(T)) {
12859
- return detail::CompressBits(v, detail::BitsFromMask(Not(mask)));
13399
+ return detail::CompressBits(v, BitsFromMask(d, Not(mask)));
12860
13400
  }
12861
- return detail::CompressNotBits(v, detail::BitsFromMask(mask));
13401
+ return detail::CompressNotBits(v, BitsFromMask(d, mask));
12862
13402
  }
12863
13403
 
12864
13404
  // ------------------------------ CompressBlocksNot
@@ -12887,7 +13427,7 @@ HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> m, D d,
12887
13427
  TFromD<D>* HWY_RESTRICT unaligned) {
12888
13428
  const RebindToUnsigned<decltype(d)> du;
12889
13429
 
12890
- const uint64_t mask_bits = detail::BitsFromMask(m);
13430
+ const uint64_t mask_bits = BitsFromMask(d, m);
12891
13431
  HWY_DASSERT(mask_bits < (1ull << MaxLanes(d)));
12892
13432
  const size_t count = PopCount(mask_bits);
12893
13433
 
@@ -12904,7 +13444,7 @@ HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
12904
13444
  TFromD<D>* HWY_RESTRICT unaligned) {
12905
13445
  const RebindToUnsigned<decltype(d)> du;
12906
13446
 
12907
- const uint64_t mask_bits = detail::BitsFromMask(m);
13447
+ const uint64_t mask_bits = BitsFromMask(d, m);
12908
13448
  HWY_DASSERT(mask_bits < (1ull << MaxLanes(d)));
12909
13449
  const size_t count = PopCount(mask_bits);
12910
13450
 
@@ -13331,6 +13871,25 @@ HWY_API V BitShuffle(V v, VI idx) {
13331
13871
  }
13332
13872
  #endif // HWY_TARGET <= HWY_AVX3_DL
13333
13873
 
13874
+ // ------------------------------ MultiRotateRight
13875
+
13876
+ #if HWY_TARGET <= HWY_AVX3_DL
13877
+
13878
+ #ifdef HWY_NATIVE_MULTIROTATERIGHT
13879
+ #undef HWY_NATIVE_MULTIROTATERIGHT
13880
+ #else
13881
+ #define HWY_NATIVE_MULTIROTATERIGHT
13882
+ #endif
13883
+
13884
+ template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>),
13885
+ HWY_IF_V_SIZE_LE_V(V, 16),
13886
+ HWY_IF_V_SIZE_V(VI, HWY_MAX_LANES_V(V) * 8)>
13887
+ HWY_API V MultiRotateRight(V v, VI idx) {
13888
+ return V{_mm_multishift_epi64_epi8(idx.raw, v.raw)};
13889
+ }
13890
+
13891
+ #endif
13892
+
13334
13893
  // ------------------------------ Lt128
13335
13894
 
13336
13895
  namespace detail {