@img/sharp-libvips-dev 1.0.2 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. package/README.md +1 -2
  2. package/include/aom/aom_decoder.h +1 -1
  3. package/include/aom/aom_encoder.h +7 -1
  4. package/include/aom/aom_image.h +24 -12
  5. package/include/aom/aom_integer.h +3 -3
  6. package/include/aom/aomcx.h +15 -0
  7. package/include/aom/aomdx.h +5 -2
  8. package/include/archive.h +7 -5
  9. package/include/archive_entry.h +5 -3
  10. package/include/cgif.h +3 -0
  11. package/include/freetype2/freetype/config/ftoption.h +1 -1
  12. package/include/fribidi/fribidi-config.h +2 -2
  13. package/include/fribidi/fribidi-unicode-version.h +3 -3
  14. package/include/glib-2.0/gio/gappinfo.h +40 -25
  15. package/include/glib-2.0/gio/gasyncresult.h +1 -1
  16. package/include/glib-2.0/gio/gconverter.h +5 -0
  17. package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
  18. package/include/glib-2.0/gio/gfile.h +16 -0
  19. package/include/glib-2.0/gio/gio-visibility.h +34 -0
  20. package/include/glib-2.0/gio/gsettings.h +8 -0
  21. package/include/glib-2.0/gio/gvfs.h +2 -2
  22. package/include/glib-2.0/girepository/gi-visibility.h +34 -0
  23. package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
  24. package/include/glib-2.0/glib/giochannel.h +2 -2
  25. package/include/glib-2.0/glib/glib-visibility.h +34 -0
  26. package/include/glib-2.0/glib/gmacros.h +12 -5
  27. package/include/glib-2.0/glib/gmain.h +93 -7
  28. package/include/glib-2.0/glib/gqsort.h +8 -1
  29. package/include/glib-2.0/glib/gstrfuncs.h +0 -12
  30. package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
  31. package/include/glib-2.0/glib/gunicode.h +1 -1
  32. package/include/glib-2.0/glib/gversionmacros.h +9 -0
  33. package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
  34. package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
  35. package/include/glib-2.0/gobject/gtype.h +6 -6
  36. package/include/harfbuzz/hb-buffer.h +6 -0
  37. package/include/harfbuzz/hb-common.h +6 -9
  38. package/include/harfbuzz/hb-cplusplus.hh +8 -11
  39. package/include/harfbuzz/hb-subset.h +17 -4
  40. package/include/harfbuzz/hb-version.h +3 -3
  41. package/include/hwy/abort.h +28 -0
  42. package/include/hwy/aligned_allocator.h +48 -1
  43. package/include/hwy/base.h +235 -34
  44. package/include/hwy/detect_compiler_arch.h +84 -10
  45. package/include/hwy/detect_targets.h +95 -29
  46. package/include/hwy/foreach_target.h +12 -1
  47. package/include/hwy/highway.h +205 -50
  48. package/include/hwy/ops/arm_neon-inl.h +841 -99
  49. package/include/hwy/ops/arm_sve-inl.h +413 -141
  50. package/include/hwy/ops/emu128-inl.h +373 -360
  51. package/include/hwy/ops/generic_ops-inl.h +804 -401
  52. package/include/hwy/ops/inside-inl.h +691 -0
  53. package/include/hwy/ops/ppc_vsx-inl.h +456 -166
  54. package/include/hwy/ops/rvv-inl.h +537 -249
  55. package/include/hwy/ops/scalar-inl.h +169 -79
  56. package/include/hwy/ops/set_macros-inl.h +106 -18
  57. package/include/hwy/ops/shared-inl.h +23 -0
  58. package/include/hwy/ops/wasm_128-inl.h +130 -108
  59. package/include/hwy/ops/x86_128-inl.h +1892 -577
  60. package/include/hwy/ops/x86_256-inl.h +625 -184
  61. package/include/hwy/ops/x86_512-inl.h +733 -131
  62. package/include/hwy/targets.h +22 -21
  63. package/include/hwy/timer-inl.h +3 -3
  64. package/include/hwy/timer.h +5 -1
  65. package/include/libheif/heif.h +170 -15
  66. package/include/libheif/heif_items.h +237 -0
  67. package/include/libheif/heif_properties.h +38 -2
  68. package/include/libheif/heif_regions.h +1 -1
  69. package/include/libheif/heif_version.h +2 -2
  70. package/include/libpng16/pnglibconf.h +1 -1
  71. package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
  72. package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
  73. package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
  74. package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
  75. package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
  76. package/include/libxml2/libxml/HTMLparser.h +12 -19
  77. package/include/libxml2/libxml/c14n.h +1 -12
  78. package/include/libxml2/libxml/debugXML.h +1 -1
  79. package/include/libxml2/libxml/encoding.h +9 -0
  80. package/include/libxml2/libxml/entities.h +12 -1
  81. package/include/libxml2/libxml/hash.h +19 -0
  82. package/include/libxml2/libxml/list.h +2 -2
  83. package/include/libxml2/libxml/nanohttp.h +17 -0
  84. package/include/libxml2/libxml/parser.h +61 -55
  85. package/include/libxml2/libxml/parserInternals.h +9 -1
  86. package/include/libxml2/libxml/pattern.h +6 -0
  87. package/include/libxml2/libxml/tree.h +32 -12
  88. package/include/libxml2/libxml/uri.h +11 -0
  89. package/include/libxml2/libxml/valid.h +29 -2
  90. package/include/libxml2/libxml/xinclude.h +7 -0
  91. package/include/libxml2/libxml/xmlIO.h +21 -4
  92. package/include/libxml2/libxml/xmlerror.h +14 -0
  93. package/include/libxml2/libxml/xmlexports.h +111 -15
  94. package/include/libxml2/libxml/xmlmemory.h +8 -45
  95. package/include/libxml2/libxml/xmlreader.h +2 -0
  96. package/include/libxml2/libxml/xmlsave.h +5 -0
  97. package/include/libxml2/libxml/xmlunicode.h +165 -1
  98. package/include/libxml2/libxml/xmlversion.h +15 -179
  99. package/include/libxml2/libxml/xmlwriter.h +1 -0
  100. package/include/libxml2/libxml/xpath.h +4 -0
  101. package/include/pango-1.0/pango/pango-features.h +3 -3
  102. package/include/pango-1.0/pango/pango-item.h +4 -2
  103. package/include/pango-1.0/pango/pango-version-macros.h +25 -0
  104. package/include/pango-1.0/pango/pangofc-font.h +2 -1
  105. package/include/pnglibconf.h +1 -1
  106. package/include/vips/util.h +1 -2
  107. package/include/vips/version.h +4 -4
  108. package/include/webp/decode.h +58 -56
  109. package/include/webp/demux.h +25 -21
  110. package/include/webp/encode.h +44 -39
  111. package/include/webp/mux.h +76 -15
  112. package/include/webp/mux_types.h +2 -1
  113. package/include/webp/sharpyuv/sharpyuv.h +77 -8
  114. package/include/webp/types.h +29 -8
  115. package/include/zconf.h +1 -1
  116. package/include/zlib.h +12 -12
  117. package/package.json +1 -1
  118. package/versions.json +14 -15
@@ -193,6 +193,25 @@ HWY_INLINE __m512i BitCastToInteger(__m512d v) {
193
193
  return _mm512_castpd_si512(v);
194
194
  }
195
195
 
196
+ #if HWY_AVX3_HAVE_F32_TO_BF16C
197
+ HWY_INLINE __m512i BitCastToInteger(__m512bh v) {
198
+ // Need to use reinterpret_cast on GCC/Clang or BitCastScalar on MSVC to
199
+ // bit cast a __m512bh to a __m512i as there is currently no intrinsic
200
+ // available (as of GCC 13 and Clang 17) that can bit cast a __m512bh vector
201
+ // to a __m512i vector
202
+
203
+ #if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
204
+ // On GCC or Clang, use reinterpret_cast to bit cast a __m512bh to a __m512i
205
+ return reinterpret_cast<__m512i>(v);
206
+ #else
207
+ // On MSVC, use BitCastScalar to bit cast a __m512bh to a __m512i as MSVC does
208
+ // not allow reinterpret_cast, static_cast, or a C-style cast to be used to
209
+ // bit cast from one AVX vector type to a different AVX vector type
210
+ return BitCastScalar<__m512i>(v);
211
+ #endif // HWY_COMPILER_GCC || HWY_COMPILER_CLANG
212
+ }
213
+ #endif // HWY_AVX3_HAVE_F32_TO_BF16C
214
+
196
215
  template <typename T>
197
216
  HWY_INLINE Vec512<uint8_t> BitCastToByte(Vec512<T> v) {
198
217
  return Vec512<uint8_t>{BitCastToInteger(v.raw)};
@@ -698,45 +717,61 @@ HWY_API Vec512<double> Xor(const Vec512<double> a, const Vec512<double> b) {
698
717
  // ------------------------------ Xor3
699
718
  template <typename T>
700
719
  HWY_API Vec512<T> Xor3(Vec512<T> x1, Vec512<T> x2, Vec512<T> x3) {
720
+ #if !HWY_IS_MSAN
701
721
  const DFromV<decltype(x1)> d;
702
722
  const RebindToUnsigned<decltype(d)> du;
703
723
  using VU = VFromD<decltype(du)>;
704
724
  const __m512i ret = _mm512_ternarylogic_epi64(
705
725
  BitCast(du, x1).raw, BitCast(du, x2).raw, BitCast(du, x3).raw, 0x96);
706
726
  return BitCast(d, VU{ret});
727
+ #else
728
+ return Xor(x1, Xor(x2, x3));
729
+ #endif
707
730
  }
708
731
 
709
732
  // ------------------------------ Or3
710
733
  template <typename T>
711
734
  HWY_API Vec512<T> Or3(Vec512<T> o1, Vec512<T> o2, Vec512<T> o3) {
735
+ #if !HWY_IS_MSAN
712
736
  const DFromV<decltype(o1)> d;
713
737
  const RebindToUnsigned<decltype(d)> du;
714
738
  using VU = VFromD<decltype(du)>;
715
739
  const __m512i ret = _mm512_ternarylogic_epi64(
716
740
  BitCast(du, o1).raw, BitCast(du, o2).raw, BitCast(du, o3).raw, 0xFE);
717
741
  return BitCast(d, VU{ret});
742
+ #else
743
+ return Or(o1, Or(o2, o3));
744
+ #endif
718
745
  }
719
746
 
720
747
  // ------------------------------ OrAnd
721
748
  template <typename T>
722
749
  HWY_API Vec512<T> OrAnd(Vec512<T> o, Vec512<T> a1, Vec512<T> a2) {
750
+ #if !HWY_IS_MSAN
723
751
  const DFromV<decltype(o)> d;
724
752
  const RebindToUnsigned<decltype(d)> du;
725
753
  using VU = VFromD<decltype(du)>;
726
754
  const __m512i ret = _mm512_ternarylogic_epi64(
727
755
  BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8);
728
756
  return BitCast(d, VU{ret});
757
+ #else
758
+ return Or(o, And(a1, a2));
759
+ #endif
729
760
  }
730
761
 
731
762
  // ------------------------------ IfVecThenElse
732
763
  template <typename T>
733
764
  HWY_API Vec512<T> IfVecThenElse(Vec512<T> mask, Vec512<T> yes, Vec512<T> no) {
765
+ #if !HWY_IS_MSAN
734
766
  const DFromV<decltype(yes)> d;
735
767
  const RebindToUnsigned<decltype(d)> du;
736
768
  using VU = VFromD<decltype(du)>;
737
769
  return BitCast(d, VU{_mm512_ternarylogic_epi64(BitCast(du, mask).raw,
738
770
  BitCast(du, yes).raw,
739
771
  BitCast(du, no).raw, 0xCA)});
772
+ #else
773
+ return IfThenElse(MaskFromVec(mask), yes, no);
774
+ #endif
740
775
  }
741
776
 
742
777
  // ------------------------------ Operator overloads (internal-only if float)
@@ -1036,12 +1071,6 @@ HWY_API Vec512<T> IfNegativeThenNegOrUndefIfZero(Vec512<T> mask, Vec512<T> v) {
1036
1071
  return MaskedSubOr(v, MaskFromVec(mask), Zero(d), v);
1037
1072
  }
1038
1073
 
1039
- template <typename T, HWY_IF_FLOAT(T)>
1040
- HWY_API Vec512<T> ZeroIfNegative(const Vec512<T> v) {
1041
- // AVX3 MaskFromVec only looks at the MSB
1042
- return IfThenZeroElse(MaskFromVec(v), v);
1043
- }
1044
-
1045
1074
  // ================================================== ARITHMETIC
1046
1075
 
1047
1076
  // ------------------------------ Addition
@@ -1417,14 +1446,45 @@ HWY_API Vec512<int8_t> ShiftRight(const Vec512<int8_t> v) {
1417
1446
 
1418
1447
  // ------------------------------ RotateRight
1419
1448
 
1420
- template <int kBits, typename T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
1421
- HWY_API Vec512<T> RotateRight(const Vec512<T> v) {
1422
- constexpr size_t kSizeInBits = sizeof(T) * 8;
1423
- static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
1449
+ #if HWY_TARGET <= HWY_AVX3_DL
1450
+ // U8 RotateRight is generic for all vector lengths on AVX3_DL
1451
+ template <int kBits, class V, HWY_IF_U8(TFromV<V>)>
1452
+ HWY_API V RotateRight(V v) {
1453
+ static_assert(0 <= kBits && kBits < 8, "Invalid shift count");
1454
+
1455
+ const Repartition<uint64_t, DFromV<V>> du64;
1424
1456
  if (kBits == 0) return v;
1425
- // AVX3 does not support 8/16-bit.
1426
- return Or(ShiftRight<kBits>(v),
1427
- ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
1457
+
1458
+ constexpr uint64_t kShrMatrix =
1459
+ (0x0102040810204080ULL << kBits) &
1460
+ (0x0101010101010101ULL * ((0xFF << kBits) & 0xFF));
1461
+ constexpr int kShlBits = (-kBits) & 7;
1462
+ constexpr uint64_t kShlMatrix = (0x0102040810204080ULL >> kShlBits) &
1463
+ (0x0101010101010101ULL * (0xFF >> kShlBits));
1464
+ constexpr uint64_t kMatrix = kShrMatrix | kShlMatrix;
1465
+
1466
+ return detail::GaloisAffine(v, Set(du64, kMatrix));
1467
+ }
1468
+ #else // HWY_TARGET > HWY_AVX3_DL
1469
+ template <int kBits>
1470
+ HWY_API Vec512<uint8_t> RotateRight(const Vec512<uint8_t> v) {
1471
+ static_assert(0 <= kBits && kBits < 8, "Invalid shift count");
1472
+ if (kBits == 0) return v;
1473
+ // AVX3 does not support 8-bit.
1474
+ return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(7, 8 - kBits)>(v));
1475
+ }
1476
+ #endif // HWY_TARGET <= HWY_AVX3_DL
1477
+
1478
+ template <int kBits>
1479
+ HWY_API Vec512<uint16_t> RotateRight(const Vec512<uint16_t> v) {
1480
+ static_assert(0 <= kBits && kBits < 16, "Invalid shift count");
1481
+ if (kBits == 0) return v;
1482
+ #if HWY_TARGET <= HWY_AVX3_DL
1483
+ return Vec512<uint16_t>{_mm512_shrdi_epi16(v.raw, v.raw, kBits)};
1484
+ #else
1485
+ // AVX3 does not support 16-bit.
1486
+ return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(15, 16 - kBits)>(v));
1487
+ #endif
1428
1488
  }
1429
1489
 
1430
1490
  template <int kBits>
@@ -1441,6 +1501,34 @@ HWY_API Vec512<uint64_t> RotateRight(const Vec512<uint64_t> v) {
1441
1501
  return Vec512<uint64_t>{_mm512_ror_epi64(v.raw, kBits)};
1442
1502
  }
1443
1503
 
1504
+ // ------------------------------ Rol/Ror
1505
+ #if HWY_TARGET <= HWY_AVX3_DL
1506
+ template <class T, HWY_IF_UI16(T)>
1507
+ HWY_API Vec512<T> Ror(Vec512<T> a, Vec512<T> b) {
1508
+ return Vec512<T>{_mm512_shrdv_epi16(a.raw, a.raw, b.raw)};
1509
+ }
1510
+ #endif // HWY_TARGET <= HWY_AVX3_DL
1511
+
1512
+ template <class T, HWY_IF_UI32(T)>
1513
+ HWY_API Vec512<T> Rol(Vec512<T> a, Vec512<T> b) {
1514
+ return Vec512<T>{_mm512_rolv_epi32(a.raw, b.raw)};
1515
+ }
1516
+
1517
+ template <class T, HWY_IF_UI32(T)>
1518
+ HWY_API Vec512<T> Ror(Vec512<T> a, Vec512<T> b) {
1519
+ return Vec512<T>{_mm512_rorv_epi32(a.raw, b.raw)};
1520
+ }
1521
+
1522
+ template <class T, HWY_IF_UI64(T)>
1523
+ HWY_API Vec512<T> Rol(Vec512<T> a, Vec512<T> b) {
1524
+ return Vec512<T>{_mm512_rolv_epi64(a.raw, b.raw)};
1525
+ }
1526
+
1527
+ template <class T, HWY_IF_UI64(T)>
1528
+ HWY_API Vec512<T> Ror(Vec512<T> a, Vec512<T> b) {
1529
+ return Vec512<T>{_mm512_rorv_epi64(a.raw, b.raw)};
1530
+ }
1531
+
1444
1532
  // ------------------------------ ShiftLeftSame
1445
1533
 
1446
1534
  // GCC <14 and Clang <11 do not follow the Intel documentation for AVX-512
@@ -2874,6 +2962,28 @@ HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) {
2874
2962
  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
2875
2963
  }
2876
2964
 
2965
+ template <class D, HWY_IF_LANES_D(D, 64)>
2966
+ HWY_API MFromD<D> SlideMask1Up(D /*d*/, MFromD<D> m) {
2967
+ using RawM = decltype(MFromD<D>().raw);
2968
+ #if HWY_COMPILER_HAS_MASK_INTRINSICS
2969
+ return MFromD<D>{
2970
+ static_cast<RawM>(_kshiftli_mask64(static_cast<__mmask64>(m.raw), 1))};
2971
+ #else
2972
+ return MFromD<D>{static_cast<RawM>(static_cast<uint64_t>(m.raw) << 1)};
2973
+ #endif
2974
+ }
2975
+
2976
+ template <class D, HWY_IF_LANES_D(D, 64)>
2977
+ HWY_API MFromD<D> SlideMask1Down(D /*d*/, MFromD<D> m) {
2978
+ using RawM = decltype(MFromD<D>().raw);
2979
+ #if HWY_COMPILER_HAS_MASK_INTRINSICS
2980
+ return MFromD<D>{
2981
+ static_cast<RawM>(_kshiftri_mask64(static_cast<__mmask64>(m.raw), 1))};
2982
+ #else
2983
+ return MFromD<D>{static_cast<RawM>(static_cast<uint64_t>(m.raw) >> 1)};
2984
+ #endif
2985
+ }
2986
+
2877
2987
  // ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
2878
2988
 
2879
2989
  HWY_API Vec512<int8_t> BroadcastSignBit(Vec512<int8_t> v) {
@@ -2907,6 +3017,15 @@ HWY_API Mask512<float16_t> IsNaN(Vec512<float16_t> v) {
2907
3017
  v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)};
2908
3018
  }
2909
3019
 
3020
+ HWY_API Mask512<float16_t> IsEitherNaN(Vec512<float16_t> a,
3021
+ Vec512<float16_t> b) {
3022
+ // Work around warnings in the intrinsic definitions (passing -1 as a mask).
3023
+ HWY_DIAGNOSTICS(push)
3024
+ HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
3025
+ return Mask512<float16_t>{_mm512_cmp_ph_mask(a.raw, b.raw, _CMP_UNORD_Q)};
3026
+ HWY_DIAGNOSTICS(pop)
3027
+ }
3028
+
2910
3029
  HWY_API Mask512<float16_t> IsInf(Vec512<float16_t> v) {
2911
3030
  return Mask512<float16_t>{_mm512_fpclass_ph_mask(v.raw, 0x18)};
2912
3031
  }
@@ -2930,6 +3049,14 @@ HWY_API Mask512<double> IsNaN(Vec512<double> v) {
2930
3049
  v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)};
2931
3050
  }
2932
3051
 
3052
+ HWY_API Mask512<float> IsEitherNaN(Vec512<float> a, Vec512<float> b) {
3053
+ return Mask512<float>{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_UNORD_Q)};
3054
+ }
3055
+
3056
+ HWY_API Mask512<double> IsEitherNaN(Vec512<double> a, Vec512<double> b) {
3057
+ return Mask512<double>{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_UNORD_Q)};
3058
+ }
3059
+
2933
3060
  HWY_API Mask512<float> IsInf(Vec512<float> v) {
2934
3061
  return Mask512<float>{_mm512_fpclass_ps_mask(
2935
3062
  v.raw, HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)};
@@ -3357,30 +3484,21 @@ HWY_INLINE Vec512<double> NativeMaskedGatherOr512(
3357
3484
  } // namespace detail
3358
3485
 
3359
3486
  template <class D, HWY_IF_V_SIZE_D(D, 64)>
3360
- HWY_API VFromD<D> GatherOffset(D d, const TFromD<D>* HWY_RESTRICT base,
3487
+ HWY_API VFromD<D> GatherOffset(D /*d*/, const TFromD<D>* HWY_RESTRICT base,
3361
3488
  VFromD<RebindToSigned<D>> offsets) {
3362
- const RebindToSigned<decltype(d)> di;
3363
- (void)di; // for HWY_DASSERT
3364
- HWY_DASSERT(AllFalse(di, Lt(offsets, Zero(di))));
3365
3489
  return detail::NativeGather512<1>(base, offsets);
3366
3490
  }
3367
3491
 
3368
3492
  template <class D, HWY_IF_V_SIZE_D(D, 64)>
3369
- HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT base,
3493
+ HWY_API VFromD<D> GatherIndex(D /*d*/, const TFromD<D>* HWY_RESTRICT base,
3370
3494
  VFromD<RebindToSigned<D>> indices) {
3371
- const RebindToSigned<decltype(d)> di;
3372
- (void)di; // for HWY_DASSERT
3373
- HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
3374
3495
  return detail::NativeGather512<sizeof(TFromD<D>)>(base, indices);
3375
3496
  }
3376
3497
 
3377
3498
  template <class D, HWY_IF_V_SIZE_D(D, 64)>
3378
- HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D d,
3499
+ HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D /*d*/,
3379
3500
  const TFromD<D>* HWY_RESTRICT base,
3380
3501
  VFromD<RebindToSigned<D>> indices) {
3381
- const RebindToSigned<decltype(d)> di;
3382
- (void)di; // for HWY_DASSERT
3383
- HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
3384
3502
  return detail::NativeMaskedGatherOr512<sizeof(TFromD<D>)>(no, m, base,
3385
3503
  indices);
3386
3504
  }
@@ -4625,6 +4743,35 @@ HWY_API Vec512<T> OddEven(const Vec512<T> a, const Vec512<T> b) {
4625
4743
  return IfThenElse(Mask512<T>{0x5555555555555555ull >> shift}, b, a);
4626
4744
  }
4627
4745
 
4746
+ // -------------------------- InterleaveEven
4747
+
4748
+ template <class D, HWY_IF_LANES_D(D, 16), HWY_IF_UI32_D(D)>
4749
+ HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
4750
+ return VFromD<D>{_mm512_mask_shuffle_epi32(
4751
+ a.raw, static_cast<__mmask16>(0xAAAA), b.raw,
4752
+ static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(2, 2, 0, 0)))};
4753
+ }
4754
+ template <class D, HWY_IF_LANES_D(D, 16), HWY_IF_F32_D(D)>
4755
+ HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
4756
+ return VFromD<D>{_mm512_mask_shuffle_ps(a.raw, static_cast<__mmask16>(0xAAAA),
4757
+ b.raw, b.raw,
4758
+ _MM_SHUFFLE(2, 2, 0, 0))};
4759
+ }
4760
+ // -------------------------- InterleaveOdd
4761
+
4762
+ template <class D, HWY_IF_LANES_D(D, 16), HWY_IF_UI32_D(D)>
4763
+ HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
4764
+ return VFromD<D>{_mm512_mask_shuffle_epi32(
4765
+ b.raw, static_cast<__mmask16>(0x5555), a.raw,
4766
+ static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(3, 3, 1, 1)))};
4767
+ }
4768
+ template <class D, HWY_IF_LANES_D(D, 16), HWY_IF_F32_D(D)>
4769
+ HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
4770
+ return VFromD<D>{_mm512_mask_shuffle_ps(b.raw, static_cast<__mmask16>(0x5555),
4771
+ a.raw, a.raw,
4772
+ _MM_SHUFFLE(3, 3, 1, 1))};
4773
+ }
4774
+
4628
4775
  // ------------------------------ OddEvenBlocks
4629
4776
 
4630
4777
  template <typename T>
@@ -5381,18 +5528,76 @@ HWY_API VFromD<D> PromoteTo(D /* tag */, Vec256<uint32_t> v) {
5381
5528
  }
5382
5529
 
5383
5530
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I64_D(D)>
5384
- HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
5385
- const Rebind<float, decltype(di64)> df32;
5386
- const RebindToFloat<decltype(di64)> df64;
5387
- const RebindToSigned<decltype(df32)> di32;
5531
+ HWY_API VFromD<D> PromoteInRangeTo(D /*di64*/, VFromD<Rebind<float, D>> v) {
5532
+ #if HWY_COMPILER_GCC_ACTUAL
5533
+ // Workaround for undefined behavior with GCC if any values of v[i] are not
5534
+ // within the range of an int64_t
5535
+
5536
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
5537
+ if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) {
5538
+ typedef float GccF32RawVectType __attribute__((__vector_size__(32)));
5539
+ const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
5540
+ return VFromD<D>{_mm512_setr_epi64(
5541
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[0]),
5542
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[1]),
5543
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[2]),
5544
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[3]),
5545
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[4]),
5546
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[5]),
5547
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[6]),
5548
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[7]))};
5549
+ }
5550
+ #endif
5388
5551
 
5389
- return detail::FixConversionOverflow(
5390
- di64, BitCast(df64, PromoteTo(di64, BitCast(di32, v))),
5391
- VFromD<D>{_mm512_cvttps_epi64(v.raw)});
5552
+ __m512i raw_result;
5553
+ __asm__("vcvttps2qq {%1, %0|%0, %1}"
5554
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
5555
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
5556
+ :);
5557
+ return VFromD<D>{raw_result};
5558
+ #else
5559
+ return VFromD<D>{_mm512_cvttps_epi64(v.raw)};
5560
+ #endif
5392
5561
  }
5393
5562
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U64_D(D)>
5394
- HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
5395
- return VFromD<D>{_mm512_maskz_cvttps_epu64(Not(MaskFromVec(v)).raw, v.raw)};
5563
+ HWY_API VFromD<D> PromoteInRangeTo(D /* tag */, VFromD<Rebind<float, D>> v) {
5564
+ #if HWY_COMPILER_GCC_ACTUAL
5565
+ // Workaround for undefined behavior with GCC if any values of v[i] are not
5566
+ // within the range of an uint64_t
5567
+
5568
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
5569
+ if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) {
5570
+ typedef float GccF32RawVectType __attribute__((__vector_size__(32)));
5571
+ const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
5572
+ return VFromD<D>{_mm512_setr_epi64(
5573
+ static_cast<int64_t>(
5574
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[0])),
5575
+ static_cast<int64_t>(
5576
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[1])),
5577
+ static_cast<int64_t>(
5578
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[2])),
5579
+ static_cast<int64_t>(
5580
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[3])),
5581
+ static_cast<int64_t>(
5582
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[4])),
5583
+ static_cast<int64_t>(
5584
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[5])),
5585
+ static_cast<int64_t>(
5586
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[6])),
5587
+ static_cast<int64_t>(
5588
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[7])))};
5589
+ }
5590
+ #endif
5591
+
5592
+ __m512i raw_result;
5593
+ __asm__("vcvttps2uqq {%1, %0|%0, %1}"
5594
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
5595
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
5596
+ :);
5597
+ return VFromD<D>{raw_result};
5598
+ #else
5599
+ return VFromD<D>{_mm512_cvttps_epu64(v.raw)};
5600
+ #endif
5396
5601
  }
5397
5602
 
5398
5603
  // ------------------------------ Demotions (full -> part w/ narrow lanes)
@@ -5546,24 +5751,38 @@ HWY_API VFromD<D> DemoteTo(D /*df16*/, Vec512<double> v) {
5546
5751
  }
5547
5752
  #endif // HWY_HAVE_FLOAT16
5548
5753
 
5754
+ #if HWY_AVX3_HAVE_F32_TO_BF16C
5549
5755
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_BF16_D(D)>
5550
- HWY_API VFromD<D> DemoteTo(D dbf16, Vec512<float> v) {
5551
- // TODO(janwas): _mm512_cvtneps_pbh once we have avx512bf16.
5552
- const Rebind<int32_t, decltype(dbf16)> di32;
5553
- const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
5554
- const Rebind<uint16_t, decltype(dbf16)> du16;
5555
- const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
5556
- return BitCast(dbf16, DemoteTo(du16, bits_in_32));
5756
+ HWY_API VFromD<D> DemoteTo(D /*dbf16*/, Vec512<float> v) {
5757
+ #if HWY_COMPILER_CLANG >= 1600 && HWY_COMPILER_CLANG < 2000
5758
+ // Inline assembly workaround for LLVM codegen bug
5759
+ __m256i raw_result;
5760
+ __asm__("vcvtneps2bf16 %1, %0" : "=v"(raw_result) : "v"(v.raw));
5761
+ return VFromD<D>{raw_result};
5762
+ #else
5763
+ // The _mm512_cvtneps_pbh intrinsic returns a __m256bh vector that needs to be
5764
+ // bit casted to a __m256i vector
5765
+ return VFromD<D>{detail::BitCastToInteger(_mm512_cvtneps_pbh(v.raw))};
5766
+ #endif
5557
5767
  }
5558
5768
 
5559
5769
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_BF16_D(D)>
5560
- HWY_API VFromD<D> ReorderDemote2To(D dbf16, Vec512<float> a, Vec512<float> b) {
5561
- // TODO(janwas): _mm512_cvtne2ps_pbh once we have avx512bf16.
5562
- const RebindToUnsigned<decltype(dbf16)> du16;
5563
- const Repartition<uint32_t, decltype(dbf16)> du32;
5564
- const Vec512<uint32_t> b_in_even = ShiftRight<16>(BitCast(du32, b));
5565
- return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
5770
+ HWY_API VFromD<D> ReorderDemote2To(D /*dbf16*/, Vec512<float> a,
5771
+ Vec512<float> b) {
5772
+ #if HWY_COMPILER_CLANG >= 1600 && HWY_COMPILER_CLANG < 2000
5773
+ // Inline assembly workaround for LLVM codegen bug
5774
+ __m512i raw_result;
5775
+ __asm__("vcvtne2ps2bf16 %2, %1, %0"
5776
+ : "=v"(raw_result)
5777
+ : "v"(b.raw), "v"(a.raw));
5778
+ return VFromD<D>{raw_result};
5779
+ #else
5780
+ // The _mm512_cvtne2ps_pbh intrinsic returns a __m512bh vector that needs to
5781
+ // be bit casted to a __m512i vector
5782
+ return VFromD<D>{detail::BitCastToInteger(_mm512_cvtne2ps_pbh(b.raw, a.raw))};
5783
+ #endif
5566
5784
  }
5785
+ #endif // HWY_AVX3_HAVE_F32_TO_BF16C
5567
5786
 
5568
5787
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I16_D(D)>
5569
5788
  HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec512<int32_t> a,
@@ -5651,15 +5870,77 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<double> v) {
5651
5870
  }
5652
5871
 
5653
5872
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
5654
- HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<double> v) {
5655
- const Full512<double> d64;
5656
- const auto clamped = detail::ClampF64ToI32Max(d64, v);
5657
- return VFromD<D>{_mm512_cvttpd_epi32(clamped.raw)};
5873
+ HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, Vec512<double> v) {
5874
+ #if HWY_COMPILER_GCC_ACTUAL
5875
+ // Workaround for undefined behavior in _mm512_cvttpd_epi32 with GCC if any
5876
+ // values of v[i] are not within the range of an int32_t
5877
+
5878
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
5879
+ if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
5880
+ typedef double GccF64RawVectType __attribute__((__vector_size__(64)));
5881
+ const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
5882
+ return VFromD<D>{_mm256_setr_epi32(
5883
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[0]),
5884
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[1]),
5885
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[2]),
5886
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[3]),
5887
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[4]),
5888
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[5]),
5889
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[6]),
5890
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[7]))};
5891
+ }
5892
+ #endif
5893
+
5894
+ __m256i raw_result;
5895
+ __asm__("vcvttpd2dq {%1, %0|%0, %1}"
5896
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
5897
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
5898
+ :);
5899
+ return VFromD<D>{raw_result};
5900
+ #else
5901
+ return VFromD<D>{_mm512_cvttpd_epi32(v.raw)};
5902
+ #endif
5658
5903
  }
5659
5904
 
5660
5905
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
5661
- HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<double> v) {
5662
- return VFromD<D>{_mm512_maskz_cvttpd_epu32(Not(MaskFromVec(v)).raw, v.raw)};
5906
+ HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, Vec512<double> v) {
5907
+ #if HWY_COMPILER_GCC_ACTUAL
5908
+ // Workaround for undefined behavior in _mm512_cvttpd_epu32 with GCC if any
5909
+ // values of v[i] are not within the range of an uint32_t
5910
+
5911
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
5912
+ if (detail::IsConstantX86VecForF2IConv<uint32_t>(v)) {
5913
+ typedef double GccF64RawVectType __attribute__((__vector_size__(64)));
5914
+ const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
5915
+ return VFromD<D>{_mm256_setr_epi32(
5916
+ static_cast<int32_t>(
5917
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[0])),
5918
+ static_cast<int32_t>(
5919
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[1])),
5920
+ static_cast<int32_t>(
5921
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[2])),
5922
+ static_cast<int32_t>(
5923
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[3])),
5924
+ static_cast<int32_t>(
5925
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[4])),
5926
+ static_cast<int32_t>(
5927
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[5])),
5928
+ static_cast<int32_t>(
5929
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[6])),
5930
+ static_cast<int32_t>(
5931
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[7])))};
5932
+ }
5933
+ #endif
5934
+
5935
+ __m256i raw_result;
5936
+ __asm__("vcvttpd2udq {%1, %0|%0, %1}"
5937
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
5938
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
5939
+ :);
5940
+ return VFromD<D>{raw_result};
5941
+ #else
5942
+ return VFromD<D>{_mm512_cvttpd_epu32(v.raw)};
5943
+ #endif
5663
5944
  }
5664
5945
 
5665
5946
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
@@ -5819,38 +6100,362 @@ HWY_API VFromD<D> ConvertTo(D /* tag*/, Vec512<uint64_t> v) {
5819
6100
  // Truncates (rounds toward zero).
5820
6101
  #if HWY_HAVE_FLOAT16
// Diff hunk: f16 -> i16 truncating conversion for 64-byte vectors.
// Removed: ConvertTo, which passed the result of _mm512_cvttph_epi16 through
// detail::FixConversionOverflow.
// Added: ConvertInRangeTo, which performs no such post-processing.
//  - Non-GCC compilers call _mm512_cvttph_epi16 directly.
//  - GCC avoids the intrinsic (see the workaround comment in the body).
//    The constant-input path requires HWY_COMPILER_GCC_ACTUAL >= 1200, a
//    non-debug build and HWY_HAVE_SCALAR_F16_TYPE; it converts all 32 lanes
//    with detail::X86ConvertScalarFromFloat<int16_t>. The arguments of
//    _mm512_set_epi16 are listed from lane 31 down to lane 0.
//    Otherwise vcvttph2w is emitted through inline asm, using GCC's
//    two-dialect "{a|b}" operand form.
5821
6102
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I16_D(D)>
5822
- HWY_API VFromD<D> ConvertTo(D d, Vec512<float16_t> v) {
5823
- return detail::FixConversionOverflow(d, v,
5824
- VFromD<D>{_mm512_cvttph_epi16(v.raw)});
6103
+ HWY_API VFromD<D> ConvertInRangeTo(D /*d*/, Vec512<float16_t> v) {
6104
+ #if HWY_COMPILER_GCC_ACTUAL
6105
+ // Workaround for undefined behavior in _mm512_cvttph_epi16 with GCC if any
6106
+ // values of v[i] are not within the range of an int16_t
6107
+
6108
+ #if HWY_COMPILER_GCC_ACTUAL >= 1200 && !HWY_IS_DEBUG_BUILD && \
6109
+ HWY_HAVE_SCALAR_F16_TYPE
6110
+ if (detail::IsConstantX86VecForF2IConv<int16_t>(v)) {
6111
+ typedef hwy::float16_t::Native GccF16RawVectType
6112
+ __attribute__((__vector_size__(64)));
6113
+ const auto raw_v = reinterpret_cast<GccF16RawVectType>(v.raw);
6114
+ return VFromD<D>{
6115
+ _mm512_set_epi16(detail::X86ConvertScalarFromFloat<int16_t>(raw_v[31]),
6116
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[30]),
6117
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[29]),
6118
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[28]),
6119
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[27]),
6120
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[26]),
6121
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[25]),
6122
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[24]),
6123
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[23]),
6124
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[22]),
6125
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[21]),
6126
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[20]),
6127
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[19]),
6128
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[18]),
6129
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[17]),
6130
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[16]),
6131
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[15]),
6132
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[14]),
6133
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[13]),
6134
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[12]),
6135
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[11]),
6136
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[10]),
6137
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[9]),
6138
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[8]),
6139
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[7]),
6140
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[6]),
6141
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[5]),
6142
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[4]),
6143
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[3]),
6144
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[2]),
6145
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[1]),
6146
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[0]))};
6147
+ }
6148
+ #endif
6149
+
6150
+ __m512i raw_result;
6151
+ __asm__("vcvttph2w {%1, %0|%0, %1}"
6152
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
6153
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
6154
+ :);
6155
+ return VFromD<D>{raw_result};
6156
+ #else
6157
+ return VFromD<D>{_mm512_cvttph_epi16(v.raw)};
6158
+ #endif
5825
6159
  }
// Diff hunk: f16 -> u16 truncating conversion for 64-byte vectors.
// Removed: ConvertTo, which called the zero-masking intrinsic
// _mm512_maskz_cvttph_epu16 with the mask Not(MaskFromVec(v)).
// Added: ConvertInRangeTo, which applies no mask.
//  - Non-GCC compilers call _mm512_cvttph_epu16 directly.
//  - GCC avoids the intrinsic (see the workaround comment in the body).
//    The constant-input path requires HWY_COMPILER_GCC_ACTUAL >= 1200, a
//    non-debug build and HWY_HAVE_SCALAR_F16_TYPE. Each lane is converted
//    with detail::X86ConvertScalarFromFloat<uint16_t> and then cast to
//    int16_t before being handed to _mm512_set_epi16, whose arguments run
//    from lane 31 down to lane 0.
//    Otherwise vcvttph2uw is emitted through inline asm, using GCC's
//    two-dialect "{a|b}" operand form.
5826
6160
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)>
5827
- HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
5828
- return VFromD<D>{_mm512_maskz_cvttph_epu16(Not(MaskFromVec(v)).raw, v.raw)};
6161
+ HWY_API VFromD<D> ConvertInRangeTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
6162
+ #if HWY_COMPILER_GCC_ACTUAL
6163
+ // Workaround for undefined behavior in _mm512_cvttph_epu16 with GCC if any
6164
+ // values of v[i] are not within the range of an uint16_t
6165
+
6166
+ #if HWY_COMPILER_GCC_ACTUAL >= 1200 && !HWY_IS_DEBUG_BUILD && \
6167
+ HWY_HAVE_SCALAR_F16_TYPE
6168
+ if (detail::IsConstantX86VecForF2IConv<uint16_t>(v)) {
6169
+ typedef hwy::float16_t::Native GccF16RawVectType
6170
+ __attribute__((__vector_size__(64)));
6171
+ const auto raw_v = reinterpret_cast<GccF16RawVectType>(v.raw);
6172
+ return VFromD<D>{_mm512_set_epi16(
6173
+ static_cast<int16_t>(
6174
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[31])),
6175
+ static_cast<int16_t>(
6176
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[30])),
6177
+ static_cast<int16_t>(
6178
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[29])),
6179
+ static_cast<int16_t>(
6180
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[28])),
6181
+ static_cast<int16_t>(
6182
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[27])),
6183
+ static_cast<int16_t>(
6184
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[26])),
6185
+ static_cast<int16_t>(
6186
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[25])),
6187
+ static_cast<int16_t>(
6188
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[24])),
6189
+ static_cast<int16_t>(
6190
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[23])),
6191
+ static_cast<int16_t>(
6192
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[22])),
6193
+ static_cast<int16_t>(
6194
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[21])),
6195
+ static_cast<int16_t>(
6196
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[20])),
6197
+ static_cast<int16_t>(
6198
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[19])),
6199
+ static_cast<int16_t>(
6200
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[18])),
6201
+ static_cast<int16_t>(
6202
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[17])),
6203
+ static_cast<int16_t>(
6204
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[16])),
6205
+ static_cast<int16_t>(
6206
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[15])),
6207
+ static_cast<int16_t>(
6208
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[14])),
6209
+ static_cast<int16_t>(
6210
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[13])),
6211
+ static_cast<int16_t>(
6212
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[12])),
6213
+ static_cast<int16_t>(
6214
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[11])),
6215
+ static_cast<int16_t>(
6216
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[10])),
6217
+ static_cast<int16_t>(
6218
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[9])),
6219
+ static_cast<int16_t>(
6220
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[8])),
6221
+ static_cast<int16_t>(
6222
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[7])),
6223
+ static_cast<int16_t>(
6224
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[6])),
6225
+ static_cast<int16_t>(
6226
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[5])),
6227
+ static_cast<int16_t>(
6228
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[4])),
6229
+ static_cast<int16_t>(
6230
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[3])),
6231
+ static_cast<int16_t>(
6232
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[2])),
6233
+ static_cast<int16_t>(
6234
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[1])),
6235
+ static_cast<int16_t>(
6236
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[0])))};
6237
+ }
6238
+ #endif
6239
+
6240
+ __m512i raw_result;
6241
+ __asm__("vcvttph2uw {%1, %0|%0, %1}"
6242
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
6243
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
6244
+ :);
6245
+ return VFromD<D>{raw_result};
6246
+ #else
6247
+ return VFromD<D>{_mm512_cvttph_epu16(v.raw)};
6248
+ #endif
5829
6249
  }
5830
6250
  #endif // HWY_HAVE_FLOAT16
// Diff hunk: f32 -> i32 truncating conversion for 64-byte vectors.
// Removed: ConvertTo, which passed the result of _mm512_cvttps_epi32 through
// detail::FixConversionOverflow.
// Added: ConvertInRangeTo, which performs no such post-processing.
//  - Non-GCC compilers call _mm512_cvttps_epi32 directly.
//  - GCC avoids the intrinsic (see the workaround comment in the body).
//    If HWY_COMPILER_GCC_ACTUAL >= 700, the build is not a debug build and
//    detail::IsConstantX86VecForF2IConv<int32_t>(v) is true, all 16 lanes
//    are converted with detail::X86ConvertScalarFromFloat<int32_t> and
//    packed with _mm512_setr_epi32 (lane 0 first).
//    Otherwise vcvttps2dq is emitted through inline asm, using GCC's
//    two-dialect "{a|b}" operand form.
5831
6251
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I32_D(D)>
5832
- HWY_API VFromD<D> ConvertTo(D d, Vec512<float> v) {
5833
- return detail::FixConversionOverflow(d, v,
5834
- VFromD<D>{_mm512_cvttps_epi32(v.raw)});
6252
+ HWY_API VFromD<D> ConvertInRangeTo(D /*d*/, Vec512<float> v) {
6253
+ #if HWY_COMPILER_GCC_ACTUAL
6254
+ // Workaround for undefined behavior in _mm512_cvttps_epi32 with GCC if any
6255
+ // values of v[i] are not within the range of an int32_t
6256
+
6257
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
6258
+ if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
6259
+ typedef float GccF32RawVectType __attribute__((__vector_size__(64)));
6260
+ const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
6261
+ return VFromD<D>{_mm512_setr_epi32(
6262
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[0]),
6263
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[1]),
6264
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[2]),
6265
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[3]),
6266
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[4]),
6267
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[5]),
6268
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[6]),
6269
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[7]),
6270
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[8]),
6271
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[9]),
6272
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[10]),
6273
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[11]),
6274
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[12]),
6275
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[13]),
6276
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[14]),
6277
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[15]))};
6278
+ }
6279
+ #endif
6280
+
6281
+ __m512i raw_result;
6282
+ __asm__("vcvttps2dq {%1, %0|%0, %1}"
6283
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
6284
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
6285
+ :);
6286
+ return VFromD<D>{raw_result};
6287
+ #else
6288
+ return VFromD<D>{_mm512_cvttps_epi32(v.raw)};
6289
+ #endif
5835
6290
  }
// Diff hunk: f64 -> i64 truncating conversion for 64-byte vectors.
// Removed: ConvertTo, which passed the result of _mm512_cvttpd_epi64 through
// detail::FixConversionOverflow.
// Added: ConvertInRangeTo, which performs no such post-processing.
//  - Non-GCC compilers call _mm512_cvttpd_epi64 directly.
//  - GCC avoids the intrinsic (see the workaround comment in the body).
//    If HWY_COMPILER_GCC_ACTUAL >= 700, the build is not a debug build and
//    detail::IsConstantX86VecForF2IConv<int64_t>(v) is true, all 8 lanes
//    are converted with detail::X86ConvertScalarFromFloat<int64_t> and
//    packed with _mm512_setr_epi64 (lane 0 first).
//    Otherwise vcvttpd2qq is emitted through inline asm, using GCC's
//    two-dialect "{a|b}" operand form.
5836
6291
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I64_D(D)>
5837
- HWY_API VFromD<D> ConvertTo(D di, Vec512<double> v) {
5838
- return detail::FixConversionOverflow(di, v,
5839
- VFromD<D>{_mm512_cvttpd_epi64(v.raw)});
6292
+ HWY_API VFromD<D> ConvertInRangeTo(D /*di*/, Vec512<double> v) {
6293
+ #if HWY_COMPILER_GCC_ACTUAL
6294
+ // Workaround for undefined behavior in _mm512_cvttpd_epi64 with GCC if any
6295
+ // values of v[i] are not within the range of an int64_t
6296
+
6297
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
6298
+ if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) {
6299
+ typedef double GccF64RawVectType __attribute__((__vector_size__(64)));
6300
+ const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
6301
+ return VFromD<D>{_mm512_setr_epi64(
6302
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[0]),
6303
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[1]),
6304
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[2]),
6305
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[3]),
6306
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[4]),
6307
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[5]),
6308
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[6]),
6309
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[7]))};
6310
+ }
6311
+ #endif
6312
+
6313
+ __m512i raw_result;
6314
+ __asm__("vcvttpd2qq {%1, %0|%0, %1}"
6315
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
6316
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
6317
+ :);
6318
+ return VFromD<D>{raw_result};
6319
+ #else
6320
+ return VFromD<D>{_mm512_cvttpd_epi64(v.raw)};
6321
+ #endif
5840
6322
  }
// Diff hunk: f32 -> u32 truncating conversion for 64-byte vectors.
// Removed: ConvertTo, which called the zero-masking intrinsic
// _mm512_maskz_cvttps_epu32 with the mask Not(MaskFromVec(v)).
// Added: ConvertInRangeTo, which applies no mask.
//  - Non-GCC compilers call _mm512_cvttps_epu32 directly.
//  - GCC avoids the intrinsic (see the workaround comment in the body).
//    If HWY_COMPILER_GCC_ACTUAL >= 700, the build is not a debug build and
//    detail::IsConstantX86VecForF2IConv<uint32_t>(v) is true, all 16 lanes
//    are converted with detail::X86ConvertScalarFromFloat<uint32_t>, cast
//    to int32_t and packed with _mm512_setr_epi32 (lane 0 first).
//    Otherwise vcvttps2udq is emitted through inline asm, using GCC's
//    two-dialect "{a|b}" operand form.
5841
6323
  template <class DU, HWY_IF_V_SIZE_D(DU, 64), HWY_IF_U32_D(DU)>
5842
- HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
5843
- return VFromD<DU>{_mm512_maskz_cvttps_epu32(Not(MaskFromVec(v)).raw, v.raw)};
6324
+ HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
6325
+ #if HWY_COMPILER_GCC_ACTUAL
6326
+ // Workaround for undefined behavior in _mm512_cvttps_epu32 with GCC if any
6327
+ // values of v[i] are not within the range of an uint32_t
6328
+
6329
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
6330
+ if (detail::IsConstantX86VecForF2IConv<uint32_t>(v)) {
6331
+ typedef float GccF32RawVectType __attribute__((__vector_size__(64)));
6332
+ const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
6333
+ return VFromD<DU>{_mm512_setr_epi32(
6334
+ static_cast<int32_t>(
6335
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[0])),
6336
+ static_cast<int32_t>(
6337
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[1])),
6338
+ static_cast<int32_t>(
6339
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[2])),
6340
+ static_cast<int32_t>(
6341
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[3])),
6342
+ static_cast<int32_t>(
6343
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[4])),
6344
+ static_cast<int32_t>(
6345
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[5])),
6346
+ static_cast<int32_t>(
6347
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[6])),
6348
+ static_cast<int32_t>(
6349
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[7])),
6350
+ static_cast<int32_t>(
6351
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[8])),
6352
+ static_cast<int32_t>(
6353
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[9])),
6354
+ static_cast<int32_t>(
6355
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[10])),
6356
+ static_cast<int32_t>(
6357
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[11])),
6358
+ static_cast<int32_t>(
6359
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[12])),
6360
+ static_cast<int32_t>(
6361
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[13])),
6362
+ static_cast<int32_t>(
6363
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[14])),
6364
+ static_cast<int32_t>(
6365
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[15])))};
6366
+ }
6367
+ #endif
6368
+
6369
+ __m512i raw_result;
6370
+ __asm__("vcvttps2udq {%1, %0|%0, %1}"
6371
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
6372
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
6373
+ :);
6374
+ return VFromD<DU>{raw_result};
6375
+ #else
6376
+ return VFromD<DU>{_mm512_cvttps_epu32(v.raw)};
6377
+ #endif
5844
6378
  }
// Diff hunk: f64 -> u64 truncating conversion for 64-byte vectors.
// Removed: ConvertTo, which called the zero-masking intrinsic
// _mm512_maskz_cvttpd_epu64 with the mask Not(MaskFromVec(v)).
// Added: ConvertInRangeTo, which applies no mask.
//  - Non-GCC compilers call _mm512_cvttpd_epu64 directly.
//  - GCC avoids the intrinsic (see the workaround comment in the body).
//    If HWY_COMPILER_GCC_ACTUAL >= 700, the build is not a debug build and
//    detail::IsConstantX86VecForF2IConv<uint64_t>(v) is true, all 8 lanes
//    are converted with detail::X86ConvertScalarFromFloat<uint64_t>, cast
//    to int64_t and packed with _mm512_setr_epi64 (lane 0 first).
//    Otherwise vcvttpd2uqq is emitted through inline asm, using GCC's
//    two-dialect "{a|b}" operand form.
//  - The constant-input guard is instantiated with the destination lane
//    type (uint64_t), matching the other unsigned overloads.
5845
6379
  template <class DU, HWY_IF_V_SIZE_D(DU, 64), HWY_IF_U64_D(DU)>
5846
- HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
5847
- return VFromD<DU>{_mm512_maskz_cvttpd_epu64(Not(MaskFromVec(v)).raw, v.raw)};
6380
+ HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
6381
+ #if HWY_COMPILER_GCC_ACTUAL
6382
+ // Workaround for undefined behavior in _mm512_cvttpd_epu64 with GCC if any
6383
+ // values of v[i] are not within the range of an uint64_t
6384
+
6385
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
6386
+ if (detail::IsConstantX86VecForF2IConv<uint64_t>(v)) {
6387
+ typedef double GccF64RawVectType __attribute__((__vector_size__(64)));
6388
+ const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
6389
+ return VFromD<DU>{_mm512_setr_epi64(
6390
+ static_cast<int64_t>(
6391
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[0])),
6392
+ static_cast<int64_t>(
6393
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[1])),
6394
+ static_cast<int64_t>(
6395
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[2])),
6396
+ static_cast<int64_t>(
6397
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[3])),
6398
+ static_cast<int64_t>(
6399
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[4])),
6400
+ static_cast<int64_t>(
6401
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[5])),
6402
+ static_cast<int64_t>(
6403
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[6])),
6404
+ static_cast<int64_t>(
6405
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[7])))};
6406
+ }
6407
+ #endif
6408
+
6409
+ __m512i raw_result;
6410
+ __asm__("vcvttpd2uqq {%1, %0|%0, %1}"
6411
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
6412
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
6413
+ :);
6414
+ return VFromD<DU>{raw_result};
6415
+ #else
6416
+ return VFromD<DU>{_mm512_cvttpd_epu64(v.raw)};
6417
+ #endif
5848
6418
  }
5849
6419
 
// Diff hunk: f32 -> i32 conversion using the non-truncating vcvtps2dq form.
// Removed: NearestInt(Vec512<float>), which passed the result of
// _mm512_cvtps_epi32 through detail::FixConversionOverflow.
// Added: NearestIntInRange(DI, v), a tag-dispatched template declared
// HWY_INLINE, which performs no such post-processing.
//  - Non-GCC compilers call _mm512_cvtps_epi32 directly.
//  - GCC avoids the intrinsic (see the workaround comment in the body).
//    If HWY_COMPILER_GCC_ACTUAL >= 700, the build is not a debug build and
//    detail::IsConstantX86VecForF2IConv<int32_t>(v) is true, all 16 lanes
//    go through detail::X86ScalarNearestInt<int32_t> and are packed with
//    _mm512_setr_epi32 (lane 0 first).
//    Otherwise vcvtps2dq is emitted through inline asm, using GCC's
//    two-dialect "{a|b}" operand form.
5850
- HWY_API Vec512<int32_t> NearestInt(const Vec512<float> v) {
5851
- const Full512<int32_t> di;
5852
- return detail::FixConversionOverflow(
5853
- di, v, Vec512<int32_t>{_mm512_cvtps_epi32(v.raw)});
6420
+ template <class DI, HWY_IF_V_SIZE_D(DI, 64), HWY_IF_I32_D(DI)>
6421
+ HWY_INLINE VFromD<DI> NearestIntInRange(DI, VFromD<RebindToFloat<DI>> v) {
6422
+ #if HWY_COMPILER_GCC_ACTUAL
6423
+ // Workaround for undefined behavior in _mm512_cvtps_epi32 with GCC if any
6424
+ // values of v[i] are not within the range of an int32_t
6425
+
6426
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
6427
+ if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
6428
+ typedef float GccF32RawVectType __attribute__((__vector_size__(64)));
6429
+ const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
6430
+ return VFromD<DI>{
6431
+ _mm512_setr_epi32(detail::X86ScalarNearestInt<int32_t>(raw_v[0]),
6432
+ detail::X86ScalarNearestInt<int32_t>(raw_v[1]),
6433
+ detail::X86ScalarNearestInt<int32_t>(raw_v[2]),
6434
+ detail::X86ScalarNearestInt<int32_t>(raw_v[3]),
6435
+ detail::X86ScalarNearestInt<int32_t>(raw_v[4]),
6436
+ detail::X86ScalarNearestInt<int32_t>(raw_v[5]),
6437
+ detail::X86ScalarNearestInt<int32_t>(raw_v[6]),
6438
+ detail::X86ScalarNearestInt<int32_t>(raw_v[7]),
6439
+ detail::X86ScalarNearestInt<int32_t>(raw_v[8]),
6440
+ detail::X86ScalarNearestInt<int32_t>(raw_v[9]),
6441
+ detail::X86ScalarNearestInt<int32_t>(raw_v[10]),
6442
+ detail::X86ScalarNearestInt<int32_t>(raw_v[11]),
6443
+ detail::X86ScalarNearestInt<int32_t>(raw_v[12]),
6444
+ detail::X86ScalarNearestInt<int32_t>(raw_v[13]),
6445
+ detail::X86ScalarNearestInt<int32_t>(raw_v[14]),
6446
+ detail::X86ScalarNearestInt<int32_t>(raw_v[15]))};
6447
+ }
6448
+ #endif
6449
+
6450
+ __m512i raw_result;
6451
+ __asm__("vcvtps2dq {%1, %0|%0, %1}"
6452
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
6453
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
6454
+ :);
6455
+ return VFromD<DI>{raw_result};
6456
+ #else
6457
+ return VFromD<DI>{_mm512_cvtps_epi32(v.raw)};
6458
+ #endif
5854
6459
  }
5855
6460
 
5856
6461
  // ================================================== CRYPTO
@@ -5988,6 +6593,7 @@ static Vec512<uint16_t> SumsOfAdjQuadAbsDiff(Vec512<uint8_t> a,
5988
6593
  a, BitCast(d, Broadcast<kBOffset>(BitCast(du32, b))));
5989
6594
  }
5990
6595
 
6596
+ #if !HWY_IS_MSAN
5991
6597
  // ------------------------------ I32/I64 SaturatedAdd (MaskFromVec)
5992
6598
 
5993
6599
  HWY_API Vec512<int32_t> SaturatedAdd(Vec512<int32_t> a, Vec512<int32_t> b) {
@@ -6035,6 +6641,7 @@ HWY_API Vec512<int64_t> SaturatedSub(Vec512<int64_t> a, Vec512<int64_t> b) {
6035
6641
  i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)};
6036
6642
  return IfThenElse(overflow_mask, overflow_result, diff);
6037
6643
  }
6644
+ #endif // !HWY_IS_MSAN
6038
6645
 
6039
6646
  // ------------------------------ Mask testing
6040
6647
 
@@ -7197,64 +7804,6 @@ HWY_API Vec512<int64_t> operator>>(const Vec512<int64_t> v,
7197
7804
  return Vec512<int64_t>{_mm512_srav_epi64(v.raw, bits.raw)};
7198
7805
  }
7199
7806
 
7200
- // ------------------------------ MulEven/Odd (Shuffle2301, InterleaveLower)
7201
-
// Diff hunk: every line of this function is removed in the new version.
// The removed MulEven(Vec512<uint64_t>, Vec512<uint64_t>) assembled a wide
// product from four 32-bit MulEven partial products:
//  - a32/b32 are the inputs viewed as 32-bit lanes; aH/bH are the same
//    lanes after Shuffle2301.
//  - w3, w2, w1 and k are the masked or shifted pieces of the partial sums;
//    mulL and mulH are the two 64-bit result words built from them.
//  - The result is InterleaveLower(mulL, mulH).
7202
- HWY_INLINE Vec512<uint64_t> MulEven(const Vec512<uint64_t> a,
7203
- const Vec512<uint64_t> b) {
7204
- const DFromV<decltype(a)> du64;
7205
- const RepartitionToNarrow<decltype(du64)> du32;
7206
- const auto maskL = Set(du64, 0xFFFFFFFFULL);
7207
- const auto a32 = BitCast(du32, a);
7208
- const auto b32 = BitCast(du32, b);
7209
- // Inputs for MulEven: we only need the lower 32 bits
7210
- const auto aH = Shuffle2301(a32);
7211
- const auto bH = Shuffle2301(b32);
7212
-
7213
- // Knuth double-word multiplication. We use 32x32 = 64 MulEven and only need
7214
- // the even (lower 64 bits of every 128-bit block) results. See
7215
- // https://github.com/hcs0/Hackers-Delight/blob/master/muldwu.c.tat
7216
- const auto aLbL = MulEven(a32, b32);
7217
- const auto w3 = aLbL & maskL;
7218
-
7219
- const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
7220
- const auto w2 = t2 & maskL;
7221
- const auto w1 = ShiftRight<32>(t2);
7222
-
7223
- const auto t = MulEven(a32, bH) + w2;
7224
- const auto k = ShiftRight<32>(t);
7225
-
7226
- const auto mulH = MulEven(aH, bH) + w1 + k;
7227
- const auto mulL = ShiftLeft<32>(t) + w3;
7228
- return InterleaveLower(mulL, mulH);
7229
- }
7230
-
// Diff hunk: every line of this function is removed in the new version.
// The removed MulOdd(Vec512<uint64_t>, Vec512<uint64_t>) runs the same
// sequence of four 32-bit MulEven partial products as the removed MulEven,
// but returns InterleaveUpper(du64, mulL, mulH) so that the odd results
// (upper 64 bits per block, per the in-code comment) are selected.
7231
- HWY_INLINE Vec512<uint64_t> MulOdd(const Vec512<uint64_t> a,
7232
- const Vec512<uint64_t> b) {
7233
- const DFromV<decltype(a)> du64;
7234
- const RepartitionToNarrow<decltype(du64)> du32;
7235
- const auto maskL = Set(du64, 0xFFFFFFFFULL);
7236
- const auto a32 = BitCast(du32, a);
7237
- const auto b32 = BitCast(du32, b);
7238
- // Inputs for MulEven: we only need bits [95:64] (= upper half of input)
7239
- const auto aH = Shuffle2301(a32);
7240
- const auto bH = Shuffle2301(b32);
7241
-
7242
- // Same as above, but we're using the odd results (upper 64 bits per block).
7243
- const auto aLbL = MulEven(a32, b32);
7244
- const auto w3 = aLbL & maskL;
7245
-
7246
- const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
7247
- const auto w2 = t2 & maskL;
7248
- const auto w1 = ShiftRight<32>(t2);
7249
-
7250
- const auto t = MulEven(a32, bH) + w2;
7251
- const auto k = ShiftRight<32>(t);
7252
-
7253
- const auto mulH = MulEven(aH, bH) + w1 + k;
7254
- const auto mulL = ShiftLeft<32>(t) + w3;
7255
- return InterleaveUpper(du64, mulL, mulH);
7256
- }
7257
-
7258
7807
  // ------------------------------ WidenMulPairwiseAdd
7259
7808
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I32_D(D)>
7260
7809
  HWY_API VFromD<D> WidenMulPairwiseAdd(D /*d32*/, Vec512<int16_t> a,
@@ -7263,7 +7812,6 @@ HWY_API VFromD<D> WidenMulPairwiseAdd(D /*d32*/, Vec512<int16_t> a,
7263
7812
  }
7264
7813
 
7265
7814
  // ------------------------------ SatWidenMulPairwiseAdd
7266
-
7267
7815
  template <class DI16, HWY_IF_V_SIZE_D(DI16, 64), HWY_IF_I16_D(DI16)>
7268
7816
  HWY_API VFromD<DI16> SatWidenMulPairwiseAdd(
7269
7817
  DI16 /* tag */, VFromD<Repartition<uint8_t, DI16>> a,
@@ -7271,7 +7819,30 @@ HWY_API VFromD<DI16> SatWidenMulPairwiseAdd(
7271
7819
  return VFromD<DI16>{_mm512_maddubs_epi16(a.raw, b.raw)};
7272
7820
  }
7273
7821
 
7822
+ // ------------------------------ SatWidenMulPairwiseAccumulate
7823
+ #if HWY_TARGET <= HWY_AVX3_DL
// Diff hunk: function added in the new version, inside the enclosing
// "#if HWY_TARGET <= HWY_AVX3_DL" guard.
// Overload for 64-byte int32_t vectors; a and b are the int16_t
// repartitions of DI32. The body is a single call:
// _mm512_dpwssds_epi32(sum.raw, a.raw, b.raw). The tag argument is unused.
7824
+ template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_D(DI32, 64)>
7825
+ HWY_API VFromD<DI32> SatWidenMulPairwiseAccumulate(
7826
+ DI32 /* tag */, VFromD<Repartition<int16_t, DI32>> a,
7827
+ VFromD<Repartition<int16_t, DI32>> b, VFromD<DI32> sum) {
7828
+ return VFromD<DI32>{_mm512_dpwssds_epi32(sum.raw, a.raw, b.raw)};
7829
+ }
7830
+ #endif // HWY_TARGET <= HWY_AVX3_DL
7831
+
7274
7832
  // ------------------------------ ReorderWidenMulAccumulate
7833
+
7834
+ #if HWY_NATIVE_DOT_BF16
// Diff hunk: overload added in the new version, inside the enclosing
// "#if HWY_NATIVE_DOT_BF16" guard.
// For 64-byte float vectors with bfloat16_t inputs (VBF defaults to the
// bfloat16_t repartition of DF). The body is a single call to
// _mm512_dpbf16_ps with sum0.raw as the accumulator; a.raw and b.raw are
// reinterpret_cast to __m512bh first. The df tag and the sum1 reference
// are unnamed, so sum1 is never read or written here.
7835
+ template <class DF, HWY_IF_F32_D(DF), HWY_IF_V_SIZE_D(DF, 64),
7836
+ class VBF = VFromD<Repartition<bfloat16_t, DF>>>
7837
+ HWY_API VFromD<DF> ReorderWidenMulAccumulate(DF /*df*/, VBF a, VBF b,
7838
+ const VFromD<DF> sum0,
7839
+ VFromD<DF>& /*sum1*/) {
7840
+ return VFromD<DF>{_mm512_dpbf16_ps(sum0.raw,
7841
+ reinterpret_cast<__m512bh>(a.raw),
7842
+ reinterpret_cast<__m512bh>(b.raw))};
7843
+ }
7844
+ #endif // HWY_NATIVE_DOT_BF16
7845
+
7275
7846
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I32_D(D)>
7276
7847
  HWY_API VFromD<D> ReorderWidenMulAccumulate(D d, Vec512<int16_t> a,
7277
7848
  Vec512<int16_t> b,
@@ -7321,6 +7892,37 @@ HWY_INLINE VFromD<D> ReduceAcrossBlocks(D d, Func f, VFromD<D> v) {
7321
7892
 
7322
7893
  } // namespace detail
7323
7894
 
7895
+ // ------------------------------ BitShuffle
7896
+ #if HWY_TARGET <= HWY_AVX3_DL
// Diff hunk: function added in the new version, inside the enclosing
// "#if HWY_TARGET <= HWY_AVX3_DL" guard.
// Accepts a 64-byte vector v with 64-bit lanes and a 64-byte index vector
// idx with 8-bit lanes. Steps:
//  1. _mm512_bitshuffle_epi64_mask(v.raw, idx.raw) produces a __mmask64.
//  2. The mask bits are moved into a vector of uint8_t lanes (du8).
//     On x86-64 this is one _mm_cvtsi64_si128. Otherwise the mask is split
//     into low and high 32-bit halves (the high half via _kshiftri_mask64
//     by 32), each half is loaded with _mm_cvtsi32_si128, and the two are
//     combined with InterleaveLower followed by ResizeBitCast to du8.
//  3. PromoteTo(du64, ...) converts the uint8_t lanes to the unsigned
//     64-bit descriptor, and BitCast(d64, ...) returns the result as V.
7897
+ template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>),
7898
+ HWY_IF_V_SIZE_V(V, 64), HWY_IF_V_SIZE_V(VI, 64)>
7899
+ HWY_API V BitShuffle(V v, VI idx) {
7900
+ const DFromV<decltype(v)> d64;
7901
+ const RebindToUnsigned<decltype(d64)> du64;
7902
+ const Rebind<uint8_t, decltype(d64)> du8;
7903
+
7904
+ const __mmask64 mmask64_bit_shuf_result =
7905
+ _mm512_bitshuffle_epi64_mask(v.raw, idx.raw);
7906
+
7907
+ #if HWY_ARCH_X86_64
7908
+ const VFromD<decltype(du8)> vu8_bit_shuf_result{
7909
+ _mm_cvtsi64_si128(static_cast<int64_t>(mmask64_bit_shuf_result))};
7910
+ #else
7911
+ const int32_t i32_lo_bit_shuf_result =
7912
+ static_cast<int32_t>(mmask64_bit_shuf_result);
7913
+ const int32_t i32_hi_bit_shuf_result =
7914
+ static_cast<int32_t>(_kshiftri_mask64(mmask64_bit_shuf_result, 32));
7915
+
7916
+ const VFromD<decltype(du8)> vu8_bit_shuf_result = ResizeBitCast(
7917
+ du8, InterleaveLower(
7918
+ Vec128<uint32_t>{_mm_cvtsi32_si128(i32_lo_bit_shuf_result)},
7919
+ Vec128<uint32_t>{_mm_cvtsi32_si128(i32_hi_bit_shuf_result)}));
7920
+ #endif
7921
+
7922
+ return BitCast(d64, PromoteTo(du64, vu8_bit_shuf_result));
7923
+ }
7924
+ #endif // HWY_TARGET <= HWY_AVX3_DL
7925
+
7324
7926
  // -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex
7325
7927
 
7326
7928
  template <class V, HWY_IF_UI32(TFromV<V>), HWY_IF_V_SIZE_V(V, 64)>