@img/sharp-libvips-dev 1.0.2 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118) hide show
  1. package/README.md +1 -2
  2. package/include/aom/aom_decoder.h +1 -1
  3. package/include/aom/aom_encoder.h +7 -1
  4. package/include/aom/aom_image.h +24 -12
  5. package/include/aom/aom_integer.h +3 -3
  6. package/include/aom/aomcx.h +15 -0
  7. package/include/aom/aomdx.h +5 -2
  8. package/include/archive.h +7 -5
  9. package/include/archive_entry.h +5 -3
  10. package/include/cgif.h +3 -0
  11. package/include/freetype2/freetype/config/ftoption.h +1 -1
  12. package/include/fribidi/fribidi-config.h +2 -2
  13. package/include/fribidi/fribidi-unicode-version.h +3 -3
  14. package/include/glib-2.0/gio/gappinfo.h +40 -25
  15. package/include/glib-2.0/gio/gasyncresult.h +1 -1
  16. package/include/glib-2.0/gio/gconverter.h +5 -0
  17. package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
  18. package/include/glib-2.0/gio/gfile.h +16 -0
  19. package/include/glib-2.0/gio/gio-visibility.h +34 -0
  20. package/include/glib-2.0/gio/gsettings.h +8 -0
  21. package/include/glib-2.0/gio/gvfs.h +2 -2
  22. package/include/glib-2.0/girepository/gi-visibility.h +34 -0
  23. package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
  24. package/include/glib-2.0/glib/giochannel.h +2 -2
  25. package/include/glib-2.0/glib/glib-visibility.h +34 -0
  26. package/include/glib-2.0/glib/gmacros.h +12 -5
  27. package/include/glib-2.0/glib/gmain.h +93 -7
  28. package/include/glib-2.0/glib/gqsort.h +8 -1
  29. package/include/glib-2.0/glib/gstrfuncs.h +0 -12
  30. package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
  31. package/include/glib-2.0/glib/gunicode.h +1 -1
  32. package/include/glib-2.0/glib/gversionmacros.h +9 -0
  33. package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
  34. package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
  35. package/include/glib-2.0/gobject/gtype.h +6 -6
  36. package/include/harfbuzz/hb-buffer.h +6 -0
  37. package/include/harfbuzz/hb-common.h +6 -9
  38. package/include/harfbuzz/hb-cplusplus.hh +8 -11
  39. package/include/harfbuzz/hb-subset.h +17 -4
  40. package/include/harfbuzz/hb-version.h +3 -3
  41. package/include/hwy/abort.h +28 -0
  42. package/include/hwy/aligned_allocator.h +48 -1
  43. package/include/hwy/base.h +235 -34
  44. package/include/hwy/detect_compiler_arch.h +84 -10
  45. package/include/hwy/detect_targets.h +95 -29
  46. package/include/hwy/foreach_target.h +12 -1
  47. package/include/hwy/highway.h +205 -50
  48. package/include/hwy/ops/arm_neon-inl.h +841 -99
  49. package/include/hwy/ops/arm_sve-inl.h +413 -141
  50. package/include/hwy/ops/emu128-inl.h +373 -360
  51. package/include/hwy/ops/generic_ops-inl.h +804 -401
  52. package/include/hwy/ops/inside-inl.h +691 -0
  53. package/include/hwy/ops/ppc_vsx-inl.h +456 -166
  54. package/include/hwy/ops/rvv-inl.h +537 -249
  55. package/include/hwy/ops/scalar-inl.h +169 -79
  56. package/include/hwy/ops/set_macros-inl.h +106 -18
  57. package/include/hwy/ops/shared-inl.h +23 -0
  58. package/include/hwy/ops/wasm_128-inl.h +130 -108
  59. package/include/hwy/ops/x86_128-inl.h +1892 -577
  60. package/include/hwy/ops/x86_256-inl.h +625 -184
  61. package/include/hwy/ops/x86_512-inl.h +733 -131
  62. package/include/hwy/targets.h +22 -21
  63. package/include/hwy/timer-inl.h +3 -3
  64. package/include/hwy/timer.h +5 -1
  65. package/include/libheif/heif.h +170 -15
  66. package/include/libheif/heif_items.h +237 -0
  67. package/include/libheif/heif_properties.h +38 -2
  68. package/include/libheif/heif_regions.h +1 -1
  69. package/include/libheif/heif_version.h +2 -2
  70. package/include/libpng16/pnglibconf.h +1 -1
  71. package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
  72. package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
  73. package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
  74. package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
  75. package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
  76. package/include/libxml2/libxml/HTMLparser.h +12 -19
  77. package/include/libxml2/libxml/c14n.h +1 -12
  78. package/include/libxml2/libxml/debugXML.h +1 -1
  79. package/include/libxml2/libxml/encoding.h +9 -0
  80. package/include/libxml2/libxml/entities.h +12 -1
  81. package/include/libxml2/libxml/hash.h +19 -0
  82. package/include/libxml2/libxml/list.h +2 -2
  83. package/include/libxml2/libxml/nanohttp.h +17 -0
  84. package/include/libxml2/libxml/parser.h +61 -55
  85. package/include/libxml2/libxml/parserInternals.h +9 -1
  86. package/include/libxml2/libxml/pattern.h +6 -0
  87. package/include/libxml2/libxml/tree.h +32 -12
  88. package/include/libxml2/libxml/uri.h +11 -0
  89. package/include/libxml2/libxml/valid.h +29 -2
  90. package/include/libxml2/libxml/xinclude.h +7 -0
  91. package/include/libxml2/libxml/xmlIO.h +21 -4
  92. package/include/libxml2/libxml/xmlerror.h +14 -0
  93. package/include/libxml2/libxml/xmlexports.h +111 -15
  94. package/include/libxml2/libxml/xmlmemory.h +8 -45
  95. package/include/libxml2/libxml/xmlreader.h +2 -0
  96. package/include/libxml2/libxml/xmlsave.h +5 -0
  97. package/include/libxml2/libxml/xmlunicode.h +165 -1
  98. package/include/libxml2/libxml/xmlversion.h +15 -179
  99. package/include/libxml2/libxml/xmlwriter.h +1 -0
  100. package/include/libxml2/libxml/xpath.h +4 -0
  101. package/include/pango-1.0/pango/pango-features.h +3 -3
  102. package/include/pango-1.0/pango/pango-item.h +4 -2
  103. package/include/pango-1.0/pango/pango-version-macros.h +25 -0
  104. package/include/pango-1.0/pango/pangofc-font.h +2 -1
  105. package/include/pnglibconf.h +1 -1
  106. package/include/vips/util.h +1 -2
  107. package/include/vips/version.h +4 -4
  108. package/include/webp/decode.h +58 -56
  109. package/include/webp/demux.h +25 -21
  110. package/include/webp/encode.h +44 -39
  111. package/include/webp/mux.h +76 -15
  112. package/include/webp/mux_types.h +2 -1
  113. package/include/webp/sharpyuv/sharpyuv.h +77 -8
  114. package/include/webp/types.h +29 -8
  115. package/include/zconf.h +1 -1
  116. package/include/zlib.h +12 -12
  117. package/package.json +1 -1
  118. package/versions.json +14 -15
@@ -194,6 +194,25 @@ HWY_INLINE __m256i BitCastToInteger(__m256d v) {
194
194
  return _mm256_castpd_si256(v);
195
195
  }
196
196
 
197
+ #if HWY_AVX3_HAVE_F32_TO_BF16C
198
+ HWY_INLINE __m256i BitCastToInteger(__m256bh v) {
199
+ // Need to use reinterpret_cast on GCC/Clang or BitCastScalar on MSVC to
200
+ // bit cast a __m256bh to a __m256i as there is currently no intrinsic
201
+ // available (as of GCC 13 and Clang 17) that can bit cast a __m256bh vector
202
+ // to a __m256i vector
203
+
204
+ #if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
205
+ // On GCC or Clang, use reinterpret_cast to bit cast a __m256bh to a __m256i
206
+ return reinterpret_cast<__m256i>(v);
207
+ #else
208
+ // On MSVC, use BitCastScalar to bit cast a __m256bh to a __m256i as MSVC does
209
+ // not allow reinterpret_cast, static_cast, or a C-style cast to be used to
210
+ // bit cast from one AVX vector type to a different AVX vector type
211
+ return BitCastScalar<__m256i>(v);
212
+ #endif // HWY_COMPILER_GCC || HWY_COMPILER_CLANG
213
+ }
214
+ #endif // HWY_AVX3_HAVE_F32_TO_BF16C
215
+
197
216
  template <typename T>
198
217
  HWY_INLINE Vec256<uint8_t> BitCastToByte(Vec256<T> v) {
199
218
  return Vec256<uint8_t>{BitCastToInteger(v.raw)};
@@ -516,7 +535,7 @@ template <typename T>
516
535
  HWY_API Vec256<T> Not(const Vec256<T> v) {
517
536
  const DFromV<decltype(v)> d;
518
537
  using TU = MakeUnsigned<T>;
519
- #if HWY_TARGET <= HWY_AVX3
538
+ #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
520
539
  const __m256i vu = BitCast(RebindToUnsigned<decltype(d)>(), v).raw;
521
540
  return BitCast(d, Vec256<TU>{_mm256_ternarylogic_epi32(vu, vu, vu, 0x55)});
522
541
  #else
@@ -527,7 +546,7 @@ HWY_API Vec256<T> Not(const Vec256<T> v) {
527
546
  // ------------------------------ Xor3
528
547
  template <typename T>
529
548
  HWY_API Vec256<T> Xor3(Vec256<T> x1, Vec256<T> x2, Vec256<T> x3) {
530
- #if HWY_TARGET <= HWY_AVX3
549
+ #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
531
550
  const DFromV<decltype(x1)> d;
532
551
  const RebindToUnsigned<decltype(d)> du;
533
552
  using VU = VFromD<decltype(du)>;
@@ -542,7 +561,7 @@ HWY_API Vec256<T> Xor3(Vec256<T> x1, Vec256<T> x2, Vec256<T> x3) {
542
561
  // ------------------------------ Or3
543
562
  template <typename T>
544
563
  HWY_API Vec256<T> Or3(Vec256<T> o1, Vec256<T> o2, Vec256<T> o3) {
545
- #if HWY_TARGET <= HWY_AVX3
564
+ #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
546
565
  const DFromV<decltype(o1)> d;
547
566
  const RebindToUnsigned<decltype(d)> du;
548
567
  using VU = VFromD<decltype(du)>;
@@ -557,7 +576,7 @@ HWY_API Vec256<T> Or3(Vec256<T> o1, Vec256<T> o2, Vec256<T> o3) {
557
576
  // ------------------------------ OrAnd
558
577
  template <typename T>
559
578
  HWY_API Vec256<T> OrAnd(Vec256<T> o, Vec256<T> a1, Vec256<T> a2) {
560
- #if HWY_TARGET <= HWY_AVX3
579
+ #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
561
580
  const DFromV<decltype(o)> d;
562
581
  const RebindToUnsigned<decltype(d)> du;
563
582
  using VU = VFromD<decltype(du)>;
@@ -572,7 +591,7 @@ HWY_API Vec256<T> OrAnd(Vec256<T> o, Vec256<T> a1, Vec256<T> a2) {
572
591
  // ------------------------------ IfVecThenElse
573
592
  template <typename T>
574
593
  HWY_API Vec256<T> IfVecThenElse(Vec256<T> mask, Vec256<T> yes, Vec256<T> no) {
575
- #if HWY_TARGET <= HWY_AVX3
594
+ #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
576
595
  const DFromV<decltype(yes)> d;
577
596
  const RebindToUnsigned<decltype(d)> du;
578
597
  using VU = VFromD<decltype(du)>;
@@ -768,13 +787,6 @@ HWY_API Vec256<double> IfThenZeroElse(Mask256<double> mask, Vec256<double> no) {
768
787
  return Vec256<double>{_mm256_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)};
769
788
  }
770
789
 
771
- template <typename T>
772
- HWY_API Vec256<T> ZeroIfNegative(const Vec256<T> v) {
773
- static_assert(IsSigned<T>(), "Only for float");
774
- // AVX3 MaskFromVec only looks at the MSB
775
- return IfThenZeroElse(MaskFromVec(v), v);
776
- }
777
-
778
790
  // ------------------------------ Mask logical
779
791
 
780
792
  namespace detail {
@@ -1074,6 +1086,28 @@ HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) {
1074
1086
  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
1075
1087
  }
1076
1088
 
1089
+ template <class D, HWY_IF_LANES_D(D, 32)>
1090
+ HWY_API MFromD<D> SlideMask1Up(D /*d*/, MFromD<D> m) {
1091
+ using RawM = decltype(MFromD<D>().raw);
1092
+ #if HWY_COMPILER_HAS_MASK_INTRINSICS
1093
+ return MFromD<D>{
1094
+ static_cast<RawM>(_kshiftli_mask32(static_cast<__mmask32>(m.raw), 1))};
1095
+ #else
1096
+ return MFromD<D>{static_cast<RawM>(static_cast<uint32_t>(m.raw) << 1)};
1097
+ #endif
1098
+ }
1099
+
1100
+ template <class D, HWY_IF_LANES_D(D, 32)>
1101
+ HWY_API MFromD<D> SlideMask1Down(D /*d*/, MFromD<D> m) {
1102
+ using RawM = decltype(MFromD<D>().raw);
1103
+ #if HWY_COMPILER_HAS_MASK_INTRINSICS
1104
+ return MFromD<D>{
1105
+ static_cast<RawM>(_kshiftri_mask32(static_cast<__mmask32>(m.raw), 1))};
1106
+ #else
1107
+ return MFromD<D>{static_cast<RawM>(static_cast<uint32_t>(m.raw) >> 1)};
1108
+ #endif
1109
+ }
1110
+
1077
1111
  #else // AVX2
1078
1112
 
1079
1113
  // ------------------------------ Mask
@@ -1997,7 +2031,7 @@ HWY_API Vec256<int16_t> SaturatedAdd(Vec256<int16_t> a, Vec256<int16_t> b) {
1997
2031
  return Vec256<int16_t>{_mm256_adds_epi16(a.raw, b.raw)};
1998
2032
  }
1999
2033
 
2000
- #if HWY_TARGET <= HWY_AVX3
2034
+ #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
2001
2035
  HWY_API Vec256<int32_t> SaturatedAdd(Vec256<int32_t> a, Vec256<int32_t> b) {
2002
2036
  const DFromV<decltype(a)> d;
2003
2037
  const auto sum = a + b;
@@ -2019,7 +2053,7 @@ HWY_API Vec256<int64_t> SaturatedAdd(Vec256<int64_t> a, Vec256<int64_t> b) {
2019
2053
  i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)};
2020
2054
  return IfThenElse(overflow_mask, overflow_result, sum);
2021
2055
  }
2022
- #endif // HWY_TARGET <= HWY_AVX3
2056
+ #endif // HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
2023
2057
 
2024
2058
  // ------------------------------ SaturatedSub
2025
2059
 
@@ -2041,7 +2075,7 @@ HWY_API Vec256<int16_t> SaturatedSub(Vec256<int16_t> a, Vec256<int16_t> b) {
2041
2075
  return Vec256<int16_t>{_mm256_subs_epi16(a.raw, b.raw)};
2042
2076
  }
2043
2077
 
2044
- #if HWY_TARGET <= HWY_AVX3
2078
+ #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
2045
2079
  HWY_API Vec256<int32_t> SaturatedSub(Vec256<int32_t> a, Vec256<int32_t> b) {
2046
2080
  const DFromV<decltype(a)> d;
2047
2081
  const auto diff = a - b;
@@ -2063,7 +2097,7 @@ HWY_API Vec256<int64_t> SaturatedSub(Vec256<int64_t> a, Vec256<int64_t> b) {
2063
2097
  i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)};
2064
2098
  return IfThenElse(overflow_mask, overflow_result, diff);
2065
2099
  }
2066
- #endif // HWY_TARGET <= HWY_AVX3
2100
+ #endif // HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
2067
2101
 
2068
2102
  // ------------------------------ Average
2069
2103
 
@@ -2249,14 +2283,29 @@ HWY_API Vec256<int8_t> ShiftRight(Vec256<int8_t> v) {
2249
2283
 
2250
2284
  // ------------------------------ RotateRight
2251
2285
 
2252
- template <int kBits, typename T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
2253
- HWY_API Vec256<T> RotateRight(const Vec256<T> v) {
2254
- constexpr size_t kSizeInBits = sizeof(T) * 8;
2255
- static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
2286
+ // U8 RotateRight implementation on AVX3_DL is now in x86_512-inl.h as U8
2287
+ // RotateRight uses detail::GaloisAffine on AVX3_DL
2288
+
2289
+ #if HWY_TARGET > HWY_AVX3_DL
2290
+ template <int kBits>
2291
+ HWY_API Vec256<uint8_t> RotateRight(const Vec256<uint8_t> v) {
2292
+ static_assert(0 <= kBits && kBits < 8, "Invalid shift count");
2293
+ if (kBits == 0) return v;
2294
+ // AVX3 does not support 8-bit.
2295
+ return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(7, 8 - kBits)>(v));
2296
+ }
2297
+ #endif
2298
+
2299
+ template <int kBits>
2300
+ HWY_API Vec256<uint16_t> RotateRight(const Vec256<uint16_t> v) {
2301
+ static_assert(0 <= kBits && kBits < 16, "Invalid shift count");
2256
2302
  if (kBits == 0) return v;
2257
- // AVX3 does not support 8/16-bit.
2258
- return Or(ShiftRight<kBits>(v),
2259
- ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
2303
+ #if HWY_TARGET <= HWY_AVX3_DL
2304
+ return Vec256<uint16_t>{_mm256_shrdi_epi16(v.raw, v.raw, kBits)};
2305
+ #else
2306
+ // AVX3 does not support 16-bit.
2307
+ return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(15, 16 - kBits)>(v));
2308
+ #endif
2260
2309
  }
2261
2310
 
2262
2311
  template <int kBits>
@@ -2281,6 +2330,38 @@ HWY_API Vec256<uint64_t> RotateRight(const Vec256<uint64_t> v) {
2281
2330
  #endif
2282
2331
  }
2283
2332
 
2333
+ // ------------------------------ Rol/Ror
2334
+ #if HWY_TARGET <= HWY_AVX3_DL
2335
+ template <class T, HWY_IF_UI16(T)>
2336
+ HWY_API Vec256<T> Ror(Vec256<T> a, Vec256<T> b) {
2337
+ return Vec256<T>{_mm256_shrdv_epi16(a.raw, a.raw, b.raw)};
2338
+ }
2339
+ #endif // HWY_TARGET <= HWY_AVX3_DL
2340
+
2341
+ #if HWY_TARGET <= HWY_AVX3
2342
+
2343
+ template <class T, HWY_IF_UI32(T)>
2344
+ HWY_API Vec256<T> Rol(Vec256<T> a, Vec256<T> b) {
2345
+ return Vec256<T>{_mm256_rolv_epi32(a.raw, b.raw)};
2346
+ }
2347
+
2348
+ template <class T, HWY_IF_UI32(T)>
2349
+ HWY_API Vec256<T> Ror(Vec256<T> a, Vec256<T> b) {
2350
+ return Vec256<T>{_mm256_rorv_epi32(a.raw, b.raw)};
2351
+ }
2352
+
2353
+ template <class T, HWY_IF_UI64(T)>
2354
+ HWY_API Vec256<T> Rol(Vec256<T> a, Vec256<T> b) {
2355
+ return Vec256<T>{_mm256_rolv_epi64(a.raw, b.raw)};
2356
+ }
2357
+
2358
+ template <class T, HWY_IF_UI64(T)>
2359
+ HWY_API Vec256<T> Ror(Vec256<T> a, Vec256<T> b) {
2360
+ return Vec256<T>{_mm256_rorv_epi64(a.raw, b.raw)};
2361
+ }
2362
+
2363
+ #endif
2364
+
2284
2365
  // ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
2285
2366
 
2286
2367
  HWY_API Vec256<int8_t> BroadcastSignBit(const Vec256<int8_t> v) {
@@ -3150,6 +3231,15 @@ HWY_API Mask256<float16_t> IsNaN(Vec256<float16_t> v) {
3150
3231
  v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)};
3151
3232
  }
3152
3233
 
3234
+ HWY_API Mask256<float16_t> IsEitherNaN(Vec256<float16_t> a,
3235
+ Vec256<float16_t> b) {
3236
+ // Work around warnings in the intrinsic definitions (passing -1 as a mask).
3237
+ HWY_DIAGNOSTICS(push)
3238
+ HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
3239
+ return Mask256<float16_t>{_mm256_cmp_ph_mask(a.raw, b.raw, _CMP_UNORD_Q)};
3240
+ HWY_DIAGNOSTICS(pop)
3241
+ }
3242
+
3153
3243
  HWY_API Mask256<float16_t> IsInf(Vec256<float16_t> v) {
3154
3244
  return Mask256<float16_t>{_mm256_fpclass_ph_mask(
3155
3245
  v.raw, HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)};
@@ -3182,6 +3272,22 @@ HWY_API Mask256<double> IsNaN(Vec256<double> v) {
3182
3272
  #endif
3183
3273
  }
3184
3274
 
3275
+ HWY_API Mask256<float> IsEitherNaN(Vec256<float> a, Vec256<float> b) {
3276
+ #if HWY_TARGET <= HWY_AVX3
3277
+ return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_UNORD_Q)};
3278
+ #else
3279
+ return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_UNORD_Q)};
3280
+ #endif
3281
+ }
3282
+
3283
+ HWY_API Mask256<double> IsEitherNaN(Vec256<double> a, Vec256<double> b) {
3284
+ #if HWY_TARGET <= HWY_AVX3
3285
+ return Mask256<double>{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_UNORD_Q)};
3286
+ #else
3287
+ return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_UNORD_Q)};
3288
+ #endif
3289
+ }
3290
+
3185
3291
  #if HWY_TARGET <= HWY_AVX3
3186
3292
 
3187
3293
  HWY_API Mask256<float> IsInf(Vec256<float> v) {
@@ -3716,20 +3822,14 @@ HWY_API Vec256<double> NativeGather256(const double* HWY_RESTRICT base,
3716
3822
  } // namespace detail
3717
3823
 
3718
3824
  template <class D, HWY_IF_V_SIZE_D(D, 32)>
3719
- HWY_API VFromD<D> GatherOffset(D d, const TFromD<D>* HWY_RESTRICT base,
3825
+ HWY_API VFromD<D> GatherOffset(D /*d*/, const TFromD<D>* HWY_RESTRICT base,
3720
3826
  VFromD<RebindToSigned<D>> offsets) {
3721
- const RebindToSigned<decltype(d)> di;
3722
- (void)di; // for HWY_DASSERT
3723
- HWY_DASSERT(AllFalse(di, Lt(offsets, Zero(di))));
3724
3827
  return detail::NativeGather256<1>(base, offsets);
3725
3828
  }
3726
3829
 
3727
3830
  template <class D, HWY_IF_V_SIZE_D(D, 32)>
3728
- HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT base,
3831
+ HWY_API VFromD<D> GatherIndex(D /*d*/, const TFromD<D>* HWY_RESTRICT base,
3729
3832
  VFromD<RebindToSigned<D>> indices) {
3730
- const RebindToSigned<decltype(d)> di;
3731
- (void)di; // for HWY_DASSERT
3732
- HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
3733
3833
  return detail::NativeGather256<sizeof(TFromD<D>)>(base, indices);
3734
3834
  }
3735
3835
 
@@ -3802,12 +3902,9 @@ HWY_API Vec256<double> NativeMaskedGatherOr256(Vec256<double> no,
3802
3902
  } // namespace detail
3803
3903
 
3804
3904
  template <class D, HWY_IF_V_SIZE_D(D, 32)>
3805
- HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D d,
3905
+ HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D /*d*/,
3806
3906
  const TFromD<D>* HWY_RESTRICT base,
3807
3907
  VFromD<RebindToSigned<D>> indices) {
3808
- const RebindToSigned<decltype(d)> di;
3809
- (void)di; // for HWY_DASSERT
3810
- HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
3811
3908
  return detail::NativeMaskedGatherOr256<sizeof(TFromD<D>)>(no, m, base,
3812
3909
  indices);
3813
3910
  }
@@ -5218,6 +5315,72 @@ HWY_API Vec256<double> OddEven(Vec256<double> a, Vec256<double> b) {
5218
5315
  return Vec256<double>{_mm256_blend_pd(a.raw, b.raw, 5)};
5219
5316
  }
5220
5317
 
5318
+ // -------------------------- InterleaveEven
5319
+
5320
+ #if HWY_TARGET <= HWY_AVX3
5321
+ template <class D, HWY_IF_LANES_D(D, 8), HWY_IF_UI32_D(D)>
5322
+ HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
5323
+ return VFromD<D>{_mm256_mask_shuffle_epi32(
5324
+ a.raw, static_cast<__mmask8>(0xAA), b.raw,
5325
+ static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(2, 2, 0, 0)))};
5326
+ }
5327
+ template <class D, HWY_IF_LANES_D(D, 8), HWY_IF_F32_D(D)>
5328
+ HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
5329
+ return VFromD<D>{_mm256_mask_shuffle_ps(a.raw, static_cast<__mmask8>(0xAA),
5330
+ b.raw, b.raw,
5331
+ _MM_SHUFFLE(2, 2, 0, 0))};
5332
+ }
5333
+ #else
5334
+ template <class D, HWY_IF_LANES_D(D, 8), HWY_IF_T_SIZE_D(D, 4)>
5335
+ HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
5336
+ const RebindToFloat<decltype(d)> df;
5337
+ const VFromD<decltype(df)> b2_b0_a2_a0{_mm256_shuffle_ps(
5338
+ BitCast(df, a).raw, BitCast(df, b).raw, _MM_SHUFFLE(2, 0, 2, 0))};
5339
+ return BitCast(
5340
+ d, VFromD<decltype(df)>{_mm256_shuffle_ps(
5341
+ b2_b0_a2_a0.raw, b2_b0_a2_a0.raw, _MM_SHUFFLE(3, 1, 2, 0))});
5342
+ }
5343
+ #endif
5344
+
5345
+ // I64/U64/F64 InterleaveEven is generic for vector lengths >= 32 bytes
5346
+ template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 8)>
5347
+ HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
5348
+ return InterleaveLower(a, b);
5349
+ }
5350
+
5351
+ // -------------------------- InterleaveOdd
5352
+
5353
+ #if HWY_TARGET <= HWY_AVX3
5354
+ template <class D, HWY_IF_LANES_D(D, 8), HWY_IF_UI32_D(D)>
5355
+ HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
5356
+ return VFromD<D>{_mm256_mask_shuffle_epi32(
5357
+ b.raw, static_cast<__mmask8>(0x55), a.raw,
5358
+ static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(3, 3, 1, 1)))};
5359
+ }
5360
+ template <class D, HWY_IF_LANES_D(D, 8), HWY_IF_F32_D(D)>
5361
+ HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
5362
+ return VFromD<D>{_mm256_mask_shuffle_ps(b.raw, static_cast<__mmask8>(0x55),
5363
+ a.raw, a.raw,
5364
+ _MM_SHUFFLE(3, 3, 1, 1))};
5365
+ }
5366
+ #else
5367
+ template <class D, HWY_IF_LANES_D(D, 8), HWY_IF_T_SIZE_D(D, 4)>
5368
+ HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
5369
+ const RebindToFloat<decltype(d)> df;
5370
+ const VFromD<decltype(df)> b3_b1_a3_a3{_mm256_shuffle_ps(
5371
+ BitCast(df, a).raw, BitCast(df, b).raw, _MM_SHUFFLE(3, 1, 3, 1))};
5372
+ return BitCast(
5373
+ d, VFromD<decltype(df)>{_mm256_shuffle_ps(
5374
+ b3_b1_a3_a3.raw, b3_b1_a3_a3.raw, _MM_SHUFFLE(3, 1, 2, 0))});
5375
+ }
5376
+ #endif
5377
+
5378
+ // I64/U64/F64 InterleaveOdd is generic for vector lengths >= 32 bytes
5379
+ template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 8)>
5380
+ HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
5381
+ return InterleaveUpper(d, a, b);
5382
+ }
5383
+
5221
5384
  // ------------------------------ OddEvenBlocks
5222
5385
 
5223
5386
  template <typename T, HWY_IF_NOT_FLOAT3264(T)>
@@ -5969,62 +6132,6 @@ HWY_API Vec256<int64_t> operator>>(Vec256<int64_t> v, Vec256<int64_t> bits) {
5969
6132
  #endif
5970
6133
  }
5971
6134
 
5972
- HWY_INLINE Vec256<uint64_t> MulEven(const Vec256<uint64_t> a,
5973
- const Vec256<uint64_t> b) {
5974
- const Full256<uint64_t> du64;
5975
- const RepartitionToNarrow<decltype(du64)> du32;
5976
- const auto maskL = Set(du64, 0xFFFFFFFFULL);
5977
- const auto a32 = BitCast(du32, a);
5978
- const auto b32 = BitCast(du32, b);
5979
- // Inputs for MulEven: we only need the lower 32 bits
5980
- const auto aH = Shuffle2301(a32);
5981
- const auto bH = Shuffle2301(b32);
5982
-
5983
- // Knuth double-word multiplication. We use 32x32 = 64 MulEven and only need
5984
- // the even (lower 64 bits of every 128-bit block) results. See
5985
- // https://github.com/hcs0/Hackers-Delight/blob/master/muldwu.c.txt
5986
- const auto aLbL = MulEven(a32, b32);
5987
- const auto w3 = aLbL & maskL;
5988
-
5989
- const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
5990
- const auto w2 = t2 & maskL;
5991
- const auto w1 = ShiftRight<32>(t2);
5992
-
5993
- const auto t = MulEven(a32, bH) + w2;
5994
- const auto k = ShiftRight<32>(t);
5995
-
5996
- const auto mulH = MulEven(aH, bH) + w1 + k;
5997
- const auto mulL = ShiftLeft<32>(t) + w3;
5998
- return InterleaveLower(mulL, mulH);
5999
- }
6000
-
6001
- HWY_INLINE Vec256<uint64_t> MulOdd(const Vec256<uint64_t> a,
6002
- const Vec256<uint64_t> b) {
6003
- const Full256<uint64_t> du64;
6004
- const RepartitionToNarrow<decltype(du64)> du32;
6005
- const auto maskL = Set(du64, 0xFFFFFFFFULL);
6006
- const auto a32 = BitCast(du32, a);
6007
- const auto b32 = BitCast(du32, b);
6008
- // Inputs for MulEven: we only need bits [95:64] (= upper half of input)
6009
- const auto aH = Shuffle2301(a32);
6010
- const auto bH = Shuffle2301(b32);
6011
-
6012
- // Same as above, but we're using the odd results (upper 64 bits per block).
6013
- const auto aLbL = MulEven(a32, b32);
6014
- const auto w3 = aLbL & maskL;
6015
-
6016
- const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
6017
- const auto w2 = t2 & maskL;
6018
- const auto w1 = ShiftRight<32>(t2);
6019
-
6020
- const auto t = MulEven(a32, bH) + w2;
6021
- const auto k = ShiftRight<32>(t);
6022
-
6023
- const auto mulH = MulEven(aH, bH) + w1 + k;
6024
- const auto mulL = ShiftLeft<32>(t) + w3;
6025
- return InterleaveUpper(du64, mulL, mulH);
6026
- }
6027
-
6028
6135
  // ------------------------------ WidenMulPairwiseAdd
6029
6136
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
6030
6137
  HWY_API VFromD<D> WidenMulPairwiseAdd(D /*d32*/, Vec256<int16_t> a,
@@ -6041,7 +6148,31 @@ HWY_API VFromD<DI16> SatWidenMulPairwiseAdd(
6041
6148
  return VFromD<DI16>{_mm256_maddubs_epi16(a.raw, b.raw)};
6042
6149
  }
6043
6150
 
6151
+ // ------------------------------ SatWidenMulPairwiseAccumulate
6152
+
6153
+ #if HWY_TARGET <= HWY_AVX3_DL
6154
+ template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_D(DI32, 32)>
6155
+ HWY_API VFromD<DI32> SatWidenMulPairwiseAccumulate(
6156
+ DI32 /* tag */, VFromD<Repartition<int16_t, DI32>> a,
6157
+ VFromD<Repartition<int16_t, DI32>> b, VFromD<DI32> sum) {
6158
+ return VFromD<DI32>{_mm256_dpwssds_epi32(sum.raw, a.raw, b.raw)};
6159
+ }
6160
+ #endif // HWY_TARGET <= HWY_AVX3_DL
6161
+
6044
6162
  // ------------------------------ ReorderWidenMulAccumulate
6163
+
6164
+ #if HWY_NATIVE_DOT_BF16
6165
+ template <class DF, HWY_IF_F32_D(DF), HWY_IF_V_SIZE_D(DF, 32),
6166
+ class VBF = VFromD<Repartition<bfloat16_t, DF>>>
6167
+ HWY_API VFromD<DF> ReorderWidenMulAccumulate(DF /*df*/, VBF a, VBF b,
6168
+ const VFromD<DF> sum0,
6169
+ VFromD<DF>& /*sum1*/) {
6170
+ return VFromD<DF>{_mm256_dpbf16_ps(sum0.raw,
6171
+ reinterpret_cast<__m256bh>(a.raw),
6172
+ reinterpret_cast<__m256bh>(b.raw))};
6173
+ }
6174
+ #endif // HWY_NATIVE_DOT_BF16
6175
+
6045
6176
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
6046
6177
  HWY_API VFromD<D> ReorderWidenMulAccumulate(D d, Vec256<int16_t> a,
6047
6178
  Vec256<int16_t> b,
@@ -6159,19 +6290,63 @@ HWY_API VFromD<D> PromoteTo(D /* tag */, Vec32<int8_t> v) {
6159
6290
 
6160
6291
  #if HWY_TARGET <= HWY_AVX3
6161
6292
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I64_D(D)>
6162
- HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
6163
- const Rebind<float, decltype(di64)> df32;
6164
- const RebindToFloat<decltype(di64)> df64;
6165
- const RebindToSigned<decltype(df32)> di32;
6293
+ HWY_API VFromD<D> PromoteInRangeTo(D /*di64*/, VFromD<Rebind<float, D>> v) {
6294
+ #if HWY_COMPILER_GCC_ACTUAL
6295
+ // Workaround for undefined behavior with GCC if any values of v[i] are not
6296
+ // within the range of an int64_t
6297
+
6298
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
6299
+ if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) {
6300
+ typedef float GccF32RawVectType __attribute__((__vector_size__(16)));
6301
+ const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
6302
+ return VFromD<D>{_mm256_setr_epi64x(
6303
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[0]),
6304
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[1]),
6305
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[2]),
6306
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[3]))};
6307
+ }
6308
+ #endif
6166
6309
 
6167
- return detail::FixConversionOverflow(
6168
- di64, BitCast(df64, PromoteTo(di64, BitCast(di32, v))),
6169
- VFromD<D>{_mm256_cvttps_epi64(v.raw)});
6310
+ __m256i raw_result;
6311
+ __asm__("vcvttps2qq {%1, %0|%0, %1}"
6312
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
6313
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
6314
+ :);
6315
+ return VFromD<D>{raw_result};
6316
+ #else // !HWY_COMPILER_GCC_ACTUAL
6317
+ return VFromD<D>{_mm256_cvttps_epi64(v.raw)};
6318
+ #endif // HWY_COMPILER_GCC_ACTUAL
6170
6319
  }
6171
6320
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U64_D(D)>
6172
- HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
6173
- return VFromD<D>{_mm256_maskz_cvttps_epu64(
6174
- detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
6321
+ HWY_API VFromD<D> PromoteInRangeTo(D /* tag */, VFromD<Rebind<float, D>> v) {
6322
+ #if HWY_COMPILER_GCC_ACTUAL
6323
+ // Workaround for undefined behavior with GCC if any values of v[i] are not
6324
+ // within the range of an uint64_t
6325
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
6326
+ if (detail::IsConstantX86VecForF2IConv<uint64_t>(v)) {
6327
+ typedef float GccF32RawVectType __attribute__((__vector_size__(16)));
6328
+ const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
6329
+ return VFromD<D>{_mm256_setr_epi64x(
6330
+ static_cast<int64_t>(
6331
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[0])),
6332
+ static_cast<int64_t>(
6333
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[1])),
6334
+ static_cast<int64_t>(
6335
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[2])),
6336
+ static_cast<int64_t>(
6337
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[3])))};
6338
+ }
6339
+ #endif
6340
+
6341
+ __m256i raw_result;
6342
+ __asm__("vcvttps2uqq {%1, %0|%0, %1}"
6343
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
6344
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
6345
+ :);
6346
+ return VFromD<D>{raw_result};
6347
+ #else // !HWY_COMPILER_GCC_ACTUAL
6348
+ return VFromD<D>{_mm256_cvttps_epu64(v.raw)};
6349
+ #endif // HWY_COMPILER_GCC_ACTUAL
6175
6350
  }
6176
6351
  #endif // HWY_TARGET <= HWY_AVX3
6177
6352
 
@@ -6341,24 +6516,38 @@ HWY_API VFromD<D> DemoteTo(D /*df16*/, Vec256<double> v) {
6341
6516
  }
6342
6517
  #endif // HWY_HAVE_FLOAT16
6343
6518
 
6519
+ #if HWY_AVX3_HAVE_F32_TO_BF16C
6344
6520
  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_BF16_D(D)>
6345
- HWY_API VFromD<D> DemoteTo(D dbf16, Vec256<float> v) {
6346
- // TODO(janwas): _mm256_cvtneps_pbh once we have avx512bf16.
6347
- const Rebind<int32_t, decltype(dbf16)> di32;
6348
- const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
6349
- const Rebind<uint16_t, decltype(dbf16)> du16;
6350
- const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
6351
- return BitCast(dbf16, DemoteTo(du16, bits_in_32));
6521
+ HWY_API VFromD<D> DemoteTo(D /*dbf16*/, Vec256<float> v) {
6522
+ #if HWY_COMPILER_CLANG >= 1600 && HWY_COMPILER_CLANG < 2000
6523
+ // Inline assembly workaround for LLVM codegen bug
6524
+ __m128i raw_result;
6525
+ __asm__("vcvtneps2bf16 %1, %0" : "=v"(raw_result) : "v"(v.raw));
6526
+ return VFromD<D>{raw_result};
6527
+ #else
6528
+ // The _mm256_cvtneps_pbh intrinsic returns a __m128bh vector that needs to be
6529
+ // bit casted to a __m128i vector
6530
+ return VFromD<D>{detail::BitCastToInteger(_mm256_cvtneps_pbh(v.raw))};
6531
+ #endif
6352
6532
  }
6353
6533
 
6354
6534
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_BF16_D(D)>
6355
- HWY_API VFromD<D> ReorderDemote2To(D dbf16, Vec256<float> a, Vec256<float> b) {
6356
- // TODO(janwas): _mm256_cvtne2ps_pbh once we have avx512bf16.
6357
- const RebindToUnsigned<decltype(dbf16)> du16;
6358
- const Repartition<uint32_t, decltype(dbf16)> du32;
6359
- const Vec256<uint32_t> b_in_even = ShiftRight<16>(BitCast(du32, b));
6360
- return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
6535
+ HWY_API VFromD<D> ReorderDemote2To(D /*dbf16*/, Vec256<float> a,
6536
+ Vec256<float> b) {
6537
+ #if HWY_COMPILER_CLANG >= 1600 && HWY_COMPILER_CLANG < 2000
6538
+ // Inline assembly workaround for LLVM codegen bug
6539
+ __m256i raw_result;
6540
+ __asm__("vcvtne2ps2bf16 %2, %1, %0"
6541
+ : "=v"(raw_result)
6542
+ : "v"(b.raw), "v"(a.raw));
6543
+ return VFromD<D>{raw_result};
6544
+ #else
6545
+ // The _mm256_cvtne2ps_pbh intrinsic returns a __m256bh vector that needs to
6546
+ // be bit casted to a __m256i vector
6547
+ return VFromD<D>{detail::BitCastToInteger(_mm256_cvtne2ps_pbh(b.raw, a.raw))};
6548
+ #endif
6361
6549
  }
6550
+ #endif // HWY_AVX3_HAVE_F32_TO_BF16C
6362
6551
 
6363
6552
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I16_D(D)>
6364
6553
  HWY_API VFromD<D> ReorderDemote2To(D /*d16*/, Vec256<int32_t> a,
@@ -6449,9 +6638,9 @@ HWY_API Vec256<uint32_t> ReorderDemote2To(D dn, Vec256<int64_t> a,
6449
6638
  _MM_SHUFFLE(2, 0, 2, 0))});
6450
6639
  }
6451
6640
 
6452
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
6453
- HWY_API Vec256<uint32_t> ReorderDemote2To(D dn, Vec256<uint64_t> a,
6454
- Vec256<uint64_t> b) {
6641
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
6642
+ HWY_API VFromD<D> ReorderDemote2To(D dn, Vec256<uint64_t> a,
6643
+ Vec256<uint64_t> b) {
6455
6644
  const Half<decltype(dn)> dnh;
6456
6645
  const Repartition<float, decltype(dn)> dn_f;
6457
6646
 
@@ -6483,37 +6672,64 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<double> v) {
6483
6672
  }
6484
6673
 
6485
6674
  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
6486
- HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<double> v) {
6487
- const Full256<double> d64;
6488
- const auto clamped = detail::ClampF64ToI32Max(d64, v);
6489
- return VFromD<D>{_mm256_cvttpd_epi32(clamped.raw)};
6675
+ HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, Vec256<double> v) {
6676
+ #if HWY_COMPILER_GCC_ACTUAL
6677
+ // Workaround for undefined behavior in _mm256_cvttpd_epi32 with GCC if any
6678
+ // values of v[i] are not within the range of an int32_t
6679
+
6680
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
6681
+ if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
6682
+ typedef double GccF64RawVectType __attribute__((__vector_size__(32)));
6683
+ const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
6684
+ return Dup128VecFromValues(
6685
+ D(), detail::X86ConvertScalarFromFloat<int32_t>(raw_v[0]),
6686
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[1]),
6687
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[2]),
6688
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[3]));
6689
+ }
6690
+ #endif
6691
+
6692
+ __m128i raw_result;
6693
+ __asm__("vcvttpd2dq {%1, %0|%0, %1}"
6694
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
6695
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
6696
+ :);
6697
+ return VFromD<D>{raw_result};
6698
+ #else
6699
+ return VFromD<D>{_mm256_cvttpd_epi32(v.raw)};
6700
+ #endif
6490
6701
  }
6491
6702
 
6492
- template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
6493
- HWY_API VFromD<D> DemoteTo(D du32, Vec256<double> v) {
6494
6703
  #if HWY_TARGET <= HWY_AVX3
6495
- (void)du32;
6496
- return VFromD<D>{_mm256_maskz_cvttpd_epu32(
6497
- detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
6498
- #else // AVX2
6499
- const Rebind<double, decltype(du32)> df64;
6500
- const RebindToUnsigned<decltype(df64)> du64;
6501
-
6502
- // Clamp v[i] to a value between 0 and 4294967295
6503
- const auto clamped = Min(ZeroIfNegative(v), Set(df64, 4294967295.0));
6704
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
6705
+ HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, Vec256<double> v) {
6706
+ #if HWY_COMPILER_GCC_ACTUAL
6707
+ // Workaround for undefined behavior in _mm256_cvttpd_epu32 with GCC if any
6708
+ // values of v[i] are not within the range of an uint32_t
6709
+
6710
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
6711
+ if (detail::IsConstantX86VecForF2IConv<uint32_t>(v)) {
6712
+ typedef double GccF64RawVectType __attribute__((__vector_size__(32)));
6713
+ const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
6714
+ return Dup128VecFromValues(
6715
+ D(), detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[0]),
6716
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[1]),
6717
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[2]),
6718
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[3]));
6719
+ }
6720
+ #endif
6504
6721
 
6505
- const auto k2_31 = Set(df64, 2147483648.0);
6506
- const auto clamped_is_ge_k2_31 = (clamped >= k2_31);
6507
- const auto clamped_lo31_f64 =
6508
- clamped - IfThenElseZero(clamped_is_ge_k2_31, k2_31);
6509
- const VFromD<D> clamped_lo31_u32{_mm256_cvttpd_epi32(clamped_lo31_f64.raw)};
6510
- const auto clamped_u32_msb = ShiftLeft<31>(
6511
- TruncateTo(du32, BitCast(du64, VecFromMask(df64, clamped_is_ge_k2_31))));
6512
- return Or(clamped_lo31_u32, clamped_u32_msb);
6722
+ __m128i raw_result;
6723
+ __asm__("vcvttpd2udq {%1, %0|%0, %1}"
6724
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
6725
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
6726
+ :);
6727
+ return VFromD<D>{raw_result};
6728
+ #else
6729
+ return VFromD<D>{_mm256_cvttpd_epu32(v.raw)};
6513
6730
  #endif
6514
6731
  }
6515
6732
 
6516
- #if HWY_TARGET <= HWY_AVX3
6517
6733
  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
6518
6734
  HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
6519
6735
  return VFromD<D>{_mm256_cvtepi64_ps(v.raw)};
@@ -6679,66 +6895,274 @@ HWY_API VFromD<D> ConvertTo(D /*dd*/, Vec256<uint64_t> v) {
6679
6895
 
6680
6896
  #if HWY_HAVE_FLOAT16
6681
6897
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I16_D(D)>
6682
- HWY_API VFromD<D> ConvertTo(D d, Vec256<float16_t> v) {
6683
- return detail::FixConversionOverflow(d, v,
6684
- VFromD<D>{_mm256_cvttph_epi16(v.raw)});
6898
+ HWY_API VFromD<D> ConvertInRangeTo(D /*d*/, Vec256<float16_t> v) {
6899
+ #if HWY_COMPILER_GCC_ACTUAL
6900
+ // Workaround for undefined behavior in _mm256_cvttph_epi16 with GCC if any
6901
+ // values of v[i] are not within the range of an int16_t
6902
+
6903
+ #if HWY_COMPILER_GCC_ACTUAL >= 1200 && !HWY_IS_DEBUG_BUILD && \
6904
+ HWY_HAVE_SCALAR_F16_TYPE
6905
+ if (detail::IsConstantX86VecForF2IConv<int16_t>(v)) {
6906
+ typedef hwy::float16_t::Native GccF16RawVectType
6907
+ __attribute__((__vector_size__(32)));
6908
+ const auto raw_v = reinterpret_cast<GccF16RawVectType>(v.raw);
6909
+ return VFromD<D>{_mm256_setr_epi16(
6910
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[0]),
6911
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[1]),
6912
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[2]),
6913
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[3]),
6914
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[4]),
6915
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[5]),
6916
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[6]),
6917
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[7]),
6918
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[8]),
6919
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[9]),
6920
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[10]),
6921
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[11]),
6922
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[12]),
6923
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[13]),
6924
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[14]),
6925
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[15]))};
6926
+ }
6927
+ #endif
6928
+
6929
+ __m256i raw_result;
6930
+ __asm__("vcvttph2w {%1, %0|%0, %1}"
6931
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
6932
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
6933
+ :);
6934
+ return VFromD<D>{raw_result};
6935
+ #else // HWY_COMPILER_GCC_ACTUAL < 1200
6936
+ return VFromD<D>{_mm256_cvttph_epi16(v.raw)};
6937
+ #endif
6685
6938
  }
6686
6939
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U16_D(D)>
6687
- HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
6688
- return VFromD<D>{_mm256_maskz_cvttph_epu16(
6689
- detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
6940
+ HWY_API VFromD<D> ConvertInRangeTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
6941
+ #if HWY_COMPILER_GCC_ACTUAL
6942
+ // Workaround for undefined behavior in _mm256_cvttph_epu16 with GCC if any
6943
+ // values of v[i] are not within the range of an uint16_t
6944
+
6945
+ #if HWY_COMPILER_GCC_ACTUAL >= 1200 && !HWY_IS_DEBUG_BUILD && \
6946
+ HWY_HAVE_SCALAR_F16_TYPE
6947
+ if (detail::IsConstantX86VecForF2IConv<uint16_t>(v)) {
6948
+ typedef hwy::float16_t::Native GccF16RawVectType
6949
+ __attribute__((__vector_size__(32)));
6950
+ const auto raw_v = reinterpret_cast<GccF16RawVectType>(v.raw);
6951
+ return VFromD<D>{_mm256_setr_epi16(
6952
+ static_cast<int16_t>(
6953
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[0])),
6954
+ static_cast<int16_t>(
6955
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[1])),
6956
+ static_cast<int16_t>(
6957
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[2])),
6958
+ static_cast<int16_t>(
6959
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[3])),
6960
+ static_cast<int16_t>(
6961
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[4])),
6962
+ static_cast<int16_t>(
6963
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[5])),
6964
+ static_cast<int16_t>(
6965
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[6])),
6966
+ static_cast<int16_t>(
6967
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[7])),
6968
+ static_cast<int16_t>(
6969
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[8])),
6970
+ static_cast<int16_t>(
6971
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[9])),
6972
+ static_cast<int16_t>(
6973
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[10])),
6974
+ static_cast<int16_t>(
6975
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[11])),
6976
+ static_cast<int16_t>(
6977
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[12])),
6978
+ static_cast<int16_t>(
6979
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[13])),
6980
+ static_cast<int16_t>(
6981
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[14])),
6982
+ static_cast<int16_t>(
6983
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[15])))};
6984
+ }
6985
+ #endif
6986
+
6987
+ __m256i raw_result;
6988
+ __asm__("vcvttph2uw {%1, %0|%0, %1}"
6989
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
6990
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
6991
+ :);
6992
+ return VFromD<D>{raw_result};
6993
+ #else // HWY_COMPILER_GCC_ACTUAL < 1200
6994
+ return VFromD<D>{_mm256_cvttph_epu16(v.raw)};
6995
+ #endif
6690
6996
  }
6691
6997
  #endif // HWY_HAVE_FLOAT16
6692
6998
 
6693
6999
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
6694
- HWY_API VFromD<D> ConvertTo(D d, Vec256<float> v) {
6695
- return detail::FixConversionOverflow(d, v,
6696
- VFromD<D>{_mm256_cvttps_epi32(v.raw)});
7000
+ HWY_API VFromD<D> ConvertInRangeTo(D /*d*/, Vec256<float> v) {
7001
+ #if HWY_COMPILER_GCC_ACTUAL
7002
+ // Workaround for undefined behavior in _mm256_cvttps_epi32 with GCC if any
7003
+ // values of v[i] are not within the range of an int32_t
7004
+
7005
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
7006
+ if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
7007
+ typedef float GccF32RawVectType __attribute__((__vector_size__(32)));
7008
+ const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
7009
+ return VFromD<D>{_mm256_setr_epi32(
7010
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[0]),
7011
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[1]),
7012
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[2]),
7013
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[3]),
7014
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[4]),
7015
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[5]),
7016
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[6]),
7017
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[7]))};
7018
+ }
7019
+ #endif
7020
+
7021
+ __m256i raw_result;
7022
+ __asm__("vcvttps2dq {%1, %0|%0, %1}"
7023
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
7024
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
7025
+ :);
7026
+ return VFromD<D>{raw_result};
7027
+ #else
7028
+ return VFromD<D>{_mm256_cvttps_epi32(v.raw)};
7029
+ #endif
6697
7030
  }
6698
7031
 
6699
7032
  #if HWY_TARGET <= HWY_AVX3
6700
7033
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I64_D(D)>
6701
- HWY_API VFromD<D> ConvertTo(D di, Vec256<double> v) {
6702
- return detail::FixConversionOverflow(di, v,
6703
- VFromD<D>{_mm256_cvttpd_epi64(v.raw)});
7034
+ HWY_API VFromD<D> ConvertInRangeTo(D /*di*/, Vec256<double> v) {
7035
+ #if HWY_COMPILER_GCC_ACTUAL
7036
+ // Workaround for undefined behavior in _mm256_cvttpd_epi64 with GCC if any
7037
+ // values of v[i] are not within the range of an int64_t
7038
+
7039
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
7040
+ if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) {
7041
+ typedef double GccF64RawVectType __attribute__((__vector_size__(32)));
7042
+ const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
7043
+ return VFromD<D>{_mm256_setr_epi64x(
7044
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[0]),
7045
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[1]),
7046
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[2]),
7047
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[3]))};
7048
+ }
7049
+ #endif
7050
+
7051
+ __m256i raw_result;
7052
+ __asm__("vcvttpd2qq {%1, %0|%0, %1}"
7053
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
7054
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
7055
+ :);
7056
+ return VFromD<D>{raw_result};
7057
+ #else // !HWY_COMPILER_GCC_ACTUAL
7058
+ return VFromD<D>{_mm256_cvttpd_epi64(v.raw)};
7059
+ #endif // HWY_COMPILER_GCC_ACTUAL
6704
7060
  }
6705
7061
  template <class DU, HWY_IF_V_SIZE_D(DU, 32), HWY_IF_U32_D(DU)>
6706
- HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
6707
- return VFromD<DU>{_mm256_maskz_cvttps_epu32(
6708
- detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
7062
+ HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
7063
+ #if HWY_COMPILER_GCC_ACTUAL
7064
+ // Workaround for undefined behavior in _mm256_cvttps_epu32 with GCC if any
7065
+ // values of v[i] are not within the range of an uint32_t
7066
+
7067
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
7068
+ if (detail::IsConstantX86VecForF2IConv<uint32_t>(v)) {
7069
+ typedef float GccF32RawVectType __attribute__((__vector_size__(32)));
7070
+ const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
7071
+ return VFromD<DU>{_mm256_setr_epi32(
7072
+ static_cast<int32_t>(
7073
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[0])),
7074
+ static_cast<int32_t>(
7075
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[1])),
7076
+ static_cast<int32_t>(
7077
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[2])),
7078
+ static_cast<int32_t>(
7079
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[3])),
7080
+ static_cast<int32_t>(
7081
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[4])),
7082
+ static_cast<int32_t>(
7083
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[5])),
7084
+ static_cast<int32_t>(
7085
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[6])),
7086
+ static_cast<int32_t>(
7087
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[7])))};
7088
+ }
7089
+ #endif
7090
+
7091
+ __m256i raw_result;
7092
+ __asm__("vcvttps2udq {%1, %0|%0, %1}"
7093
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
7094
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
7095
+ :);
7096
+ return VFromD<DU>{raw_result};
7097
+ #else // !HWY_COMPILER_GCC_ACTUAL
7098
+ return VFromD<DU>{_mm256_cvttps_epu32(v.raw)};
7099
+ #endif // HWY_COMPILER_GCC_ACTUAL
6709
7100
  }
6710
7101
  template <class DU, HWY_IF_V_SIZE_D(DU, 32), HWY_IF_U64_D(DU)>
6711
- HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
6712
- return VFromD<DU>{_mm256_maskz_cvttpd_epu64(
6713
- detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
6714
- }
6715
- #else // AVX2
6716
- template <class DU32, HWY_IF_V_SIZE_D(DU32, 32), HWY_IF_U32_D(DU32)>
6717
- HWY_API VFromD<DU32> ConvertTo(DU32 du32, VFromD<RebindToFloat<DU32>> v) {
6718
- const RebindToSigned<decltype(du32)> di32;
6719
- const RebindToFloat<decltype(du32)> df32;
6720
-
6721
- const auto non_neg_v = ZeroIfNegative(v);
6722
- const auto exp_diff = Set(di32, int32_t{158}) -
6723
- BitCast(di32, ShiftRight<23>(BitCast(du32, non_neg_v)));
6724
- const auto scale_down_f32_val_mask =
6725
- BitCast(du32, VecFromMask(di32, Eq(exp_diff, Zero(di32))));
6726
-
6727
- const auto v_scaled = BitCast(
6728
- df32, BitCast(du32, non_neg_v) + ShiftLeft<23>(scale_down_f32_val_mask));
6729
- const VFromD<decltype(du32)> f32_to_u32_result{
6730
- _mm256_cvttps_epi32(v_scaled.raw)};
6731
-
6732
- return Or(
6733
- BitCast(du32, BroadcastSignBit(exp_diff)),
6734
- f32_to_u32_result + And(f32_to_u32_result, scale_down_f32_val_mask));
7102
+ HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
7103
+ #if HWY_COMPILER_GCC_ACTUAL
7104
+ // Workaround for undefined behavior in _mm256_cvttpd_epu64 with GCC if any
7105
+ // values of v[i] are not within the range of an uint64_t
7106
+
7107
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
7108
+ if (detail::IsConstantX86VecForF2IConv<uint64_t>(v)) {
7109
+ typedef double GccF64RawVectType __attribute__((__vector_size__(32)));
7110
+ const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
7111
+ return VFromD<DU>{_mm256_setr_epi64x(
7112
+ static_cast<int64_t>(
7113
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[0])),
7114
+ static_cast<int64_t>(
7115
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[1])),
7116
+ static_cast<int64_t>(
7117
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[2])),
7118
+ static_cast<int64_t>(
7119
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[3])))};
7120
+ }
7121
+ #endif
7122
+
7123
+ __m256i raw_result;
7124
+ __asm__("vcvttpd2uqq {%1, %0|%0, %1}"
7125
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
7126
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
7127
+ :);
7128
+ return VFromD<DU>{raw_result};
7129
+ #else // !HWY_COMPILER_GCC_ACTUAL
7130
+ return VFromD<DU>{_mm256_cvttpd_epu64(v.raw)};
7131
+ #endif // HWY_COMPILER_GCC_ACTUAL
6735
7132
  }
6736
7133
  #endif // HWY_TARGET <= HWY_AVX3
6737
7134
 
6738
- HWY_API Vec256<int32_t> NearestInt(const Vec256<float> v) {
6739
- const Full256<int32_t> di;
6740
- return detail::FixConversionOverflow(
6741
- di, v, Vec256<int32_t>{_mm256_cvtps_epi32(v.raw)});
7135
+ template <class DI, HWY_IF_V_SIZE_D(DI, 32), HWY_IF_I32_D(DI)>
7136
+ HWY_INLINE VFromD<DI> NearestIntInRange(DI, VFromD<RebindToFloat<DI>> v) {
7137
+ #if HWY_COMPILER_GCC_ACTUAL
7138
+ // Workaround for undefined behavior in _mm256_cvtps_epi32 if any values of
7139
+ // v[i] are not within the range of an int32_t
7140
+
7141
+ #if HWY_COMPILER_GCC >= 700 && !HWY_IS_DEBUG_BUILD
7142
+ if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
7143
+ typedef float GccF32RawVectType __attribute__((__vector_size__(32)));
7144
+ const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
7145
+ return VFromD<DI>{
7146
+ _mm256_setr_epi32(detail::X86ScalarNearestInt<int32_t>(raw_v[0]),
7147
+ detail::X86ScalarNearestInt<int32_t>(raw_v[1]),
7148
+ detail::X86ScalarNearestInt<int32_t>(raw_v[2]),
7149
+ detail::X86ScalarNearestInt<int32_t>(raw_v[3]),
7150
+ detail::X86ScalarNearestInt<int32_t>(raw_v[4]),
7151
+ detail::X86ScalarNearestInt<int32_t>(raw_v[5]),
7152
+ detail::X86ScalarNearestInt<int32_t>(raw_v[6]),
7153
+ detail::X86ScalarNearestInt<int32_t>(raw_v[7]))};
7154
+ }
7155
+ #endif
7156
+
7157
+ __m256i raw_result;
7158
+ __asm__("vcvtps2dq {%1, %0|%0, %1}"
7159
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
7160
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
7161
+ :);
7162
+ return VFromD<DI>{raw_result};
7163
+ #else // !HWY_COMPILER_GCC_ACTUAL
7164
+ return VFromD<DI>{_mm256_cvtps_epi32(v.raw)};
7165
+ #endif // HWY_COMPILER_GCC_ACTUAL
6742
7166
  }
6743
7167
 
6744
7168
  #ifndef HWY_DISABLE_F16C
@@ -8138,6 +8562,23 @@ HWY_API Mask256<T> SetAtOrBeforeFirst(Mask256<T> mask) {
8138
8562
 
8139
8563
  // ------------------------------ Reductions in generic_ops
8140
8564
 
8565
+ // ------------------------------ BitShuffle
8566
+ #if HWY_TARGET <= HWY_AVX3_DL
8567
+ template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>),
8568
+ HWY_IF_V_SIZE_V(V, 32), HWY_IF_V_SIZE_V(VI, 32)>
8569
+ HWY_API V BitShuffle(V v, VI idx) {
8570
+ const DFromV<decltype(v)> d64;
8571
+ const RebindToUnsigned<decltype(d64)> du64;
8572
+ const Rebind<uint8_t, decltype(d64)> du8;
8573
+
8574
+ int32_t i32_bit_shuf_result =
8575
+ static_cast<int32_t>(_mm256_bitshuffle_epi64_mask(v.raw, idx.raw));
8576
+
8577
+ return BitCast(d64, PromoteTo(du64, VFromD<decltype(du8)>{_mm_cvtsi32_si128(
8578
+ i32_bit_shuf_result)}));
8579
+ }
8580
+ #endif // HWY_TARGET <= HWY_AVX3_DL
8581
+
8141
8582
  // ------------------------------ LeadingZeroCount
8142
8583
 
8143
8584
  #if HWY_TARGET <= HWY_AVX3