@img/sharp-libvips-dev 1.0.2 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. package/README.md +1 -2
  2. package/include/aom/aom_decoder.h +1 -1
  3. package/include/aom/aom_encoder.h +7 -1
  4. package/include/aom/aom_image.h +24 -12
  5. package/include/aom/aom_integer.h +3 -3
  6. package/include/aom/aomcx.h +15 -0
  7. package/include/aom/aomdx.h +5 -2
  8. package/include/archive.h +7 -5
  9. package/include/archive_entry.h +5 -3
  10. package/include/cgif.h +3 -0
  11. package/include/freetype2/freetype/config/ftoption.h +1 -1
  12. package/include/fribidi/fribidi-config.h +2 -2
  13. package/include/fribidi/fribidi-unicode-version.h +3 -3
  14. package/include/glib-2.0/gio/gappinfo.h +40 -25
  15. package/include/glib-2.0/gio/gasyncresult.h +1 -1
  16. package/include/glib-2.0/gio/gconverter.h +5 -0
  17. package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
  18. package/include/glib-2.0/gio/gfile.h +16 -0
  19. package/include/glib-2.0/gio/gio-visibility.h +34 -0
  20. package/include/glib-2.0/gio/gsettings.h +8 -0
  21. package/include/glib-2.0/gio/gvfs.h +2 -2
  22. package/include/glib-2.0/girepository/gi-visibility.h +34 -0
  23. package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
  24. package/include/glib-2.0/glib/giochannel.h +2 -2
  25. package/include/glib-2.0/glib/glib-visibility.h +34 -0
  26. package/include/glib-2.0/glib/gmacros.h +12 -5
  27. package/include/glib-2.0/glib/gmain.h +93 -7
  28. package/include/glib-2.0/glib/gqsort.h +8 -1
  29. package/include/glib-2.0/glib/gstrfuncs.h +0 -12
  30. package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
  31. package/include/glib-2.0/glib/gunicode.h +1 -1
  32. package/include/glib-2.0/glib/gversionmacros.h +9 -0
  33. package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
  34. package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
  35. package/include/glib-2.0/gobject/gtype.h +6 -6
  36. package/include/harfbuzz/hb-buffer.h +6 -0
  37. package/include/harfbuzz/hb-common.h +6 -9
  38. package/include/harfbuzz/hb-cplusplus.hh +8 -11
  39. package/include/harfbuzz/hb-subset.h +17 -4
  40. package/include/harfbuzz/hb-version.h +3 -3
  41. package/include/hwy/abort.h +28 -0
  42. package/include/hwy/aligned_allocator.h +48 -1
  43. package/include/hwy/base.h +235 -34
  44. package/include/hwy/detect_compiler_arch.h +84 -10
  45. package/include/hwy/detect_targets.h +95 -29
  46. package/include/hwy/foreach_target.h +12 -1
  47. package/include/hwy/highway.h +205 -50
  48. package/include/hwy/ops/arm_neon-inl.h +841 -99
  49. package/include/hwy/ops/arm_sve-inl.h +413 -141
  50. package/include/hwy/ops/emu128-inl.h +373 -360
  51. package/include/hwy/ops/generic_ops-inl.h +804 -401
  52. package/include/hwy/ops/inside-inl.h +691 -0
  53. package/include/hwy/ops/ppc_vsx-inl.h +456 -166
  54. package/include/hwy/ops/rvv-inl.h +537 -249
  55. package/include/hwy/ops/scalar-inl.h +169 -79
  56. package/include/hwy/ops/set_macros-inl.h +106 -18
  57. package/include/hwy/ops/shared-inl.h +23 -0
  58. package/include/hwy/ops/wasm_128-inl.h +130 -108
  59. package/include/hwy/ops/x86_128-inl.h +1892 -577
  60. package/include/hwy/ops/x86_256-inl.h +625 -184
  61. package/include/hwy/ops/x86_512-inl.h +733 -131
  62. package/include/hwy/targets.h +22 -21
  63. package/include/hwy/timer-inl.h +3 -3
  64. package/include/hwy/timer.h +5 -1
  65. package/include/libheif/heif.h +170 -15
  66. package/include/libheif/heif_items.h +237 -0
  67. package/include/libheif/heif_properties.h +38 -2
  68. package/include/libheif/heif_regions.h +1 -1
  69. package/include/libheif/heif_version.h +2 -2
  70. package/include/libpng16/pnglibconf.h +1 -1
  71. package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
  72. package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
  73. package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
  74. package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
  75. package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
  76. package/include/libxml2/libxml/HTMLparser.h +12 -19
  77. package/include/libxml2/libxml/c14n.h +1 -12
  78. package/include/libxml2/libxml/debugXML.h +1 -1
  79. package/include/libxml2/libxml/encoding.h +9 -0
  80. package/include/libxml2/libxml/entities.h +12 -1
  81. package/include/libxml2/libxml/hash.h +19 -0
  82. package/include/libxml2/libxml/list.h +2 -2
  83. package/include/libxml2/libxml/nanohttp.h +17 -0
  84. package/include/libxml2/libxml/parser.h +61 -55
  85. package/include/libxml2/libxml/parserInternals.h +9 -1
  86. package/include/libxml2/libxml/pattern.h +6 -0
  87. package/include/libxml2/libxml/tree.h +32 -12
  88. package/include/libxml2/libxml/uri.h +11 -0
  89. package/include/libxml2/libxml/valid.h +29 -2
  90. package/include/libxml2/libxml/xinclude.h +7 -0
  91. package/include/libxml2/libxml/xmlIO.h +21 -4
  92. package/include/libxml2/libxml/xmlerror.h +14 -0
  93. package/include/libxml2/libxml/xmlexports.h +111 -15
  94. package/include/libxml2/libxml/xmlmemory.h +8 -45
  95. package/include/libxml2/libxml/xmlreader.h +2 -0
  96. package/include/libxml2/libxml/xmlsave.h +5 -0
  97. package/include/libxml2/libxml/xmlunicode.h +165 -1
  98. package/include/libxml2/libxml/xmlversion.h +15 -179
  99. package/include/libxml2/libxml/xmlwriter.h +1 -0
  100. package/include/libxml2/libxml/xpath.h +4 -0
  101. package/include/pango-1.0/pango/pango-features.h +3 -3
  102. package/include/pango-1.0/pango/pango-item.h +4 -2
  103. package/include/pango-1.0/pango/pango-version-macros.h +25 -0
  104. package/include/pango-1.0/pango/pangofc-font.h +2 -1
  105. package/include/pnglibconf.h +1 -1
  106. package/include/vips/util.h +1 -2
  107. package/include/vips/version.h +4 -4
  108. package/include/webp/decode.h +58 -56
  109. package/include/webp/demux.h +25 -21
  110. package/include/webp/encode.h +44 -39
  111. package/include/webp/mux.h +76 -15
  112. package/include/webp/mux_types.h +2 -1
  113. package/include/webp/sharpyuv/sharpyuv.h +77 -8
  114. package/include/webp/types.h +29 -8
  115. package/include/zconf.h +1 -1
  116. package/include/zlib.h +12 -12
  117. package/package.json +1 -1
  118. package/versions.json +14 -15
@@ -199,9 +199,6 @@ HWY_API Vec128<T, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
199
199
  template <class D>
200
200
  using VFromD = decltype(Zero(D()));
201
201
 
202
- // ------------------------------ Tuple (VFromD)
203
- #include "hwy/ops/tuple-inl.h"
204
-
205
202
  // ------------------------------ BitCast
206
203
 
207
204
  template <class D, typename FromT>
@@ -247,6 +244,8 @@ HWY_API VFromD<D> Undefined(D d) {
247
244
  // Suppressing maybe-uninitialized both here and at the caller does not work,
248
245
  // so initialize.
249
246
  return Zero(d);
247
+ #elif HWY_HAS_BUILTIN(__builtin_nondeterministic_value)
248
+ return VFromD<D>{__builtin_nondeterministic_value(Zero(d).raw)};
250
249
  #else
251
250
  HWY_DIAGNOSTICS(push)
252
251
  HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
@@ -671,10 +670,21 @@ HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) {
671
670
 
672
671
  template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
673
672
  HWY_API Vec128<T> Load(D /* tag */, const T* HWY_RESTRICT aligned) {
673
+ // Suppress the ignoring attributes warning that is generated by
674
+ // HWY_RCAST_ALIGNED(const LoadRaw*, aligned) with GCC
675
+ #if HWY_COMPILER_GCC
676
+ HWY_DIAGNOSTICS(push)
677
+ HWY_DIAGNOSTICS_OFF(disable : 4649, ignored "-Wignored-attributes")
678
+ #endif
679
+
674
680
  using LoadRaw = typename detail::Raw128<T>::AlignedRawVec;
675
681
  const LoadRaw* HWY_RESTRICT p = HWY_RCAST_ALIGNED(const LoadRaw*, aligned);
676
682
  using ResultRaw = typename detail::Raw128<T>::type;
677
683
  return Vec128<T>{reinterpret_cast<ResultRaw>(*p)};
684
+
685
+ #if HWY_COMPILER_GCC
686
+ HWY_DIAGNOSTICS(pop)
687
+ #endif
678
688
  }
679
689
 
680
690
  // Any <= 64 bit
@@ -1275,8 +1285,19 @@ HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
1275
1285
 
1276
1286
  template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
1277
1287
  HWY_API void Store(Vec128<T> v, D /* tag */, T* HWY_RESTRICT aligned) {
1288
+ // Suppress the ignoring attributes warning that is generated by
1289
+ // HWY_RCAST_ALIGNED(StoreRaw*, aligned) with GCC
1290
+ #if HWY_COMPILER_GCC
1291
+ HWY_DIAGNOSTICS(push)
1292
+ HWY_DIAGNOSTICS_OFF(disable : 4649, ignored "-Wignored-attributes")
1293
+ #endif
1294
+
1278
1295
  using StoreRaw = typename detail::Raw128<T>::AlignedRawVec;
1279
1296
  *HWY_RCAST_ALIGNED(StoreRaw*, aligned) = reinterpret_cast<StoreRaw>(v.raw);
1297
+
1298
+ #if HWY_COMPILER_GCC
1299
+ HWY_DIAGNOSTICS(pop)
1300
+ #endif
1280
1301
  }
1281
1302
 
1282
1303
  template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
@@ -1343,17 +1364,8 @@ HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
1343
1364
  template <class D>
1344
1365
  HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
1345
1366
  TFromD<D>* HWY_RESTRICT p) {
1346
- const RebindToSigned<decltype(d)> di; // for testing mask if T=bfloat16_t.
1347
- using TI = TFromD<decltype(di)>;
1348
- alignas(16) TI buf[MaxLanes(d)];
1349
- alignas(16) TI mask[MaxLanes(d)];
1350
- Store(BitCast(di, v), di, buf);
1351
- Store(BitCast(di, VecFromMask(d, m)), di, mask);
1352
- for (size_t i = 0; i < MaxLanes(d); ++i) {
1353
- if (mask[i]) {
1354
- CopySameSize(buf + i, p + i);
1355
- }
1356
- }
1367
+ const VFromD<D> old = LoadU(d, p);
1368
+ StoreU(IfThenElse(RebindMask(d, m), v, old), d, p);
1357
1369
  }
1358
1370
 
1359
1371
  // ================================================== ARITHMETIC
@@ -1577,27 +1589,84 @@ HWY_API Vec128<T, N> operator*(Vec128<T, N> a, Vec128<T, N> b) {
1577
1589
  #endif
1578
1590
  }
1579
1591
 
1580
- // Returns the upper 16 bits of a * b in each lane.
1581
- template <typename T, size_t N, HWY_IF_T_SIZE(T, 2), HWY_IF_NOT_FLOAT(T)>
1582
- HWY_API Vec128<T, N> MulHigh(Vec128<T, N> a, Vec128<T, N> b) {
1592
+ // Returns the upper sizeof(T)*8 bits of a * b in each lane.
1593
+
1583
1594
  #if HWY_S390X_HAVE_Z14
1595
+ #define HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T) \
1596
+ HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))
1597
+ #define HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T) \
1598
+ hwy::EnableIf<!hwy::IsSame<T, T>()>* = nullptr
1599
+ #elif HWY_PPC_HAVE_10
1600
+ #define HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T) \
1601
+ HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))
1602
+ #define HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T) \
1603
+ HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))
1604
+ #else
1605
+ #define HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T) \
1606
+ hwy::EnableIf<!hwy::IsSame<T, T>()>* = nullptr
1607
+ #define HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T) \
1608
+ HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))
1609
+ #endif
1610
+
1611
+ #if HWY_S390X_HAVE_Z14 || HWY_PPC_HAVE_10
1612
+ template <typename T, size_t N, HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T),
1613
+ HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
1614
+ HWY_API Vec128<T, N> MulHigh(Vec128<T, N> a, Vec128<T, N> b) {
1584
1615
  return Vec128<T, N>{vec_mulh(a.raw, b.raw)};
1616
+ }
1617
+ #endif
1618
+
1619
+ template <typename T, HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T),
1620
+ HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
1621
+ HWY_API Vec128<T, 1> MulHigh(Vec128<T, 1> a, Vec128<T, 1> b) {
1622
+ const auto p_even = MulEven(a, b);
1623
+
1624
+ #if HWY_IS_LITTLE_ENDIAN
1625
+ const auto p_even_full = ResizeBitCast(Full128<T>(), p_even);
1626
+ return Vec128<T, 1>{
1627
+ vec_sld(p_even_full.raw, p_even_full.raw, 16 - sizeof(T))};
1585
1628
  #else
1586
1629
  const DFromV<decltype(a)> d;
1587
- const RepartitionToWide<decltype(d)> dw;
1588
- const VFromD<decltype(dw)> p1{vec_mule(a.raw, b.raw)};
1589
- const VFromD<decltype(dw)> p2{vec_mulo(a.raw, b.raw)};
1630
+ return ResizeBitCast(d, p_even);
1631
+ #endif
1632
+ }
1633
+
1634
+ template <typename T, size_t N,
1635
+ HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T),
1636
+ HWY_IF_NOT_FLOAT_NOR_SPECIAL(T), HWY_IF_LANES_GT(N, 1)>
1637
+ HWY_API Vec128<T, N> MulHigh(Vec128<T, N> a, Vec128<T, N> b) {
1638
+ const DFromV<decltype(a)> d;
1639
+
1640
+ const auto p_even = BitCast(d, MulEven(a, b));
1641
+ const auto p_odd = BitCast(d, MulOdd(a, b));
1642
+
1590
1643
  #if HWY_IS_LITTLE_ENDIAN
1591
- const __vector unsigned char kShuffle = {2, 3, 18, 19, 6, 7, 22, 23,
1592
- 10, 11, 26, 27, 14, 15, 30, 31};
1644
+ return InterleaveOdd(d, p_even, p_odd);
1593
1645
  #else
1594
- const __vector unsigned char kShuffle = {0, 1, 16, 17, 4, 5, 20, 21,
1595
- 8, 9, 24, 25, 12, 13, 28, 29};
1596
- #endif
1597
- return BitCast(d, VFromD<decltype(dw)>{vec_perm(p1.raw, p2.raw, kShuffle)});
1646
+ return InterleaveEven(d, p_even, p_odd);
1598
1647
  #endif
1599
1648
  }
1600
1649
 
1650
+ #if !HWY_PPC_HAVE_10
1651
+ template <class T, HWY_IF_UI64(T)>
1652
+ HWY_API Vec64<T> MulHigh(Vec64<T> a, Vec64<T> b) {
1653
+ T p_hi;
1654
+ Mul128(GetLane(a), GetLane(b), &p_hi);
1655
+ return Set(Full64<T>(), p_hi);
1656
+ }
1657
+
1658
+ template <class T, HWY_IF_UI64(T)>
1659
+ HWY_API Vec128<T> MulHigh(Vec128<T> a, Vec128<T> b) {
1660
+ const DFromV<decltype(a)> d;
1661
+ const Half<decltype(d)> dh;
1662
+ return Combine(d, MulHigh(UpperHalf(dh, a), UpperHalf(dh, b)),
1663
+ MulHigh(LowerHalf(dh, a), LowerHalf(dh, b)));
1664
+ }
1665
+ #endif // !HWY_PPC_HAVE_10
1666
+
1667
+ #undef HWY_PPC_IF_MULHIGH_USING_VEC_MULH
1668
+ #undef HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH
1669
+
1601
1670
  // Multiplies even lanes (0, 2, ..) and places the double-wide result into
1602
1671
  // even and the upper half into its odd neighbor lane.
1603
1672
  template <typename T, size_t N,
@@ -1618,29 +1687,83 @@ HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulOdd(Vec128<T, N> a,
1618
1687
  return Vec128<MakeWide<T>, (N + 1) / 2>{vec_mulo(a.raw, b.raw)};
1619
1688
  }
1620
1689
 
1690
+ // ------------------------------ Rol/Ror
1691
+
1692
+ #ifdef HWY_NATIVE_ROL_ROR_8
1693
+ #undef HWY_NATIVE_ROL_ROR_8
1694
+ #else
1695
+ #define HWY_NATIVE_ROL_ROR_8
1696
+ #endif
1697
+
1698
+ #ifdef HWY_NATIVE_ROL_ROR_16
1699
+ #undef HWY_NATIVE_ROL_ROR_16
1700
+ #else
1701
+ #define HWY_NATIVE_ROL_ROR_16
1702
+ #endif
1703
+
1704
+ #ifdef HWY_NATIVE_ROL_ROR_32_64
1705
+ #undef HWY_NATIVE_ROL_ROR_32_64
1706
+ #else
1707
+ #define HWY_NATIVE_ROL_ROR_32_64
1708
+ #endif
1709
+
1710
+ template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
1711
+ HWY_API Vec128<T, N> Rol(Vec128<T, N> a, Vec128<T, N> b) {
1712
+ const DFromV<decltype(a)> d;
1713
+ const RebindToUnsigned<decltype(d)> du;
1714
+ return BitCast(
1715
+ d, VFromD<decltype(du)>{vec_rl(BitCast(du, a).raw, BitCast(du, b).raw)});
1716
+ }
1717
+
1718
+ template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
1719
+ HWY_API Vec128<T, N> Ror(Vec128<T, N> a, Vec128<T, N> b) {
1720
+ const DFromV<decltype(a)> d;
1721
+ const RebindToSigned<decltype(d)> di;
1722
+ return Rol(a, BitCast(d, Neg(BitCast(di, b))));
1723
+ }
1724
+
1621
1725
  // ------------------------------ RotateRight
1622
- template <int kBits, typename T, size_t N>
1726
+ template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
1623
1727
  HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
1624
1728
  const DFromV<decltype(v)> d;
1625
- const RebindToUnsigned<decltype(d)> du;
1626
1729
  constexpr size_t kSizeInBits = sizeof(T) * 8;
1627
1730
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
1628
1731
 
1629
- if (kBits == 0) return v;
1732
+ return (kBits == 0)
1733
+ ? v
1734
+ : Rol(v, Set(d, static_cast<T>(static_cast<int>(kSizeInBits) -
1735
+ kBits)));
1736
+ }
1737
+
1738
+ // ------------------------------ RotateLeftSame/RotateRightSame
1739
+ #ifdef HWY_NATIVE_ROL_ROR_SAME_8
1740
+ #undef HWY_NATIVE_ROL_ROR_SAME_8
1741
+ #else
1742
+ #define HWY_NATIVE_ROL_ROR_SAME_8
1743
+ #endif
1744
+
1745
+ #ifdef HWY_NATIVE_ROL_ROR_SAME_16
1746
+ #undef HWY_NATIVE_ROL_ROR_SAME_16
1747
+ #else
1748
+ #define HWY_NATIVE_ROL_ROR_SAME_16
1749
+ #endif
1750
+
1751
+ #ifdef HWY_NATIVE_ROL_ROR_SAME_32_64
1752
+ #undef HWY_NATIVE_ROL_ROR_SAME_32_64
1753
+ #else
1754
+ #define HWY_NATIVE_ROL_ROR_SAME_32_64
1755
+ #endif
1630
1756
 
1631
- // Do an unsigned vec_rl operation to avoid undefined behavior
1632
- return BitCast(d, VFromD<decltype(du)>{vec_rl(
1633
- BitCast(du, v).raw, Set(du, kSizeInBits - kBits).raw)});
1757
+ template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
1758
+ HWY_API Vec128<T, N> RotateLeftSame(Vec128<T, N> v, int bits) {
1759
+ const DFromV<decltype(v)> d;
1760
+ return Rol(v, Set(d, static_cast<T>(static_cast<unsigned>(bits))));
1634
1761
  }
1635
1762
 
1636
- // ------------------------------ ZeroIfNegative (BroadcastSignBit)
1637
- template <typename T, size_t N>
1638
- HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
1639
- static_assert(IsFloat<T>(), "Only works for float");
1763
+ template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
1764
+ HWY_API Vec128<T, N> RotateRightSame(Vec128<T, N> v, int bits) {
1640
1765
  const DFromV<decltype(v)> d;
1641
- const RebindToSigned<decltype(d)> di;
1642
- const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
1643
- return IfThenElse(mask, Zero(d), v);
1766
+ return Rol(v, Set(d, static_cast<T>(0u - static_cast<unsigned>(bits))));
1644
1767
  }
1645
1768
 
1646
1769
  // ------------------------------ IfNegativeThenElse
@@ -1662,6 +1785,32 @@ HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
1662
1785
  #endif
1663
1786
  }
1664
1787
 
1788
+ #if HWY_PPC_HAVE_10
1789
+ #ifdef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
1790
+ #undef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
1791
+ #else
1792
+ #define HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
1793
+ #endif
1794
+
1795
+ #ifdef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
1796
+ #undef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
1797
+ #else
1798
+ #define HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
1799
+ #endif
1800
+
1801
+ template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
1802
+ HWY_API V IfNegativeThenElseZero(V v, V yes) {
1803
+ const DFromV<decltype(v)> d;
1804
+ return IfNegativeThenElse(v, yes, Zero(d));
1805
+ }
1806
+
1807
+ template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
1808
+ HWY_API V IfNegativeThenZeroElse(V v, V no) {
1809
+ const DFromV<decltype(v)> d;
1810
+ return IfNegativeThenElse(v, Zero(d), no);
1811
+ }
1812
+ #endif
1813
+
1665
1814
  // generic_ops takes care of integer T.
1666
1815
  template <typename T, size_t N, HWY_IF_FLOAT(T)>
1667
1816
  HWY_API Vec128<T, N> AbsDiff(Vec128<T, N> a, Vec128<T, N> b) {
@@ -3033,6 +3182,96 @@ HWY_INLINE Vec128<T, N> OddEven(Vec128<T, N> a, Vec128<T, N> b) {
3033
3182
  return IfVecThenElse(BitCast(d, Vec128<uint8_t, N * 8>{mask}), b, a);
3034
3183
  }
3035
3184
 
3185
+ // ------------------------------ InterleaveEven
3186
+
3187
+ template <class D, HWY_IF_T_SIZE_D(D, 1)>
3188
+ HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
3189
+ const Full128<TFromD<D>> d_full;
3190
+ const Indices128<TFromD<D>> idx{
3191
+ Dup128VecFromValues(Full128<uint8_t>(), 0, 16, 2, 18, 4, 20, 6, 22, 8, 24,
3192
+ 10, 26, 12, 28, 14, 30)
3193
+ .raw};
3194
+ return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
3195
+ ResizeBitCast(d_full, b), idx));
3196
+ }
3197
+
3198
+ template <class D, HWY_IF_T_SIZE_D(D, 2)>
3199
+ HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
3200
+ const Full128<TFromD<D>> d_full;
3201
+ const Indices128<TFromD<D>> idx{Dup128VecFromValues(Full128<uint8_t>(), 0, 1,
3202
+ 16, 17, 4, 5, 20, 21, 8,
3203
+ 9, 24, 25, 12, 13, 28, 29)
3204
+ .raw};
3205
+ return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
3206
+ ResizeBitCast(d_full, b), idx));
3207
+ }
3208
+
3209
+ template <class D, HWY_IF_T_SIZE_D(D, 4)>
3210
+ HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
3211
+ #if HWY_S390X_HAVE_Z14
3212
+ const Full128<TFromD<D>> d_full;
3213
+ const Indices128<TFromD<D>> idx{Dup128VecFromValues(Full128<uint8_t>(), 0, 1,
3214
+ 2, 3, 16, 17, 18, 19, 8,
3215
+ 9, 10, 11, 24, 25, 26, 27)
3216
+ .raw};
3217
+ return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
3218
+ ResizeBitCast(d_full, b), idx));
3219
+ #else
3220
+ (void)d;
3221
+ return VFromD<D>{vec_mergee(a.raw, b.raw)};
3222
+ #endif
3223
+ }
3224
+
3225
+ template <class D, HWY_IF_T_SIZE_D(D, 8)>
3226
+ HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
3227
+ return InterleaveLower(a, b);
3228
+ }
3229
+
3230
+ // ------------------------------ InterleaveOdd
3231
+
3232
+ template <class D, HWY_IF_T_SIZE_D(D, 1)>
3233
+ HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
3234
+ const Full128<TFromD<D>> d_full;
3235
+ const Indices128<TFromD<D>> idx{
3236
+ Dup128VecFromValues(Full128<uint8_t>(), 1, 17, 3, 19, 5, 21, 7, 23, 9, 25,
3237
+ 11, 27, 13, 29, 15, 31)
3238
+ .raw};
3239
+ return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
3240
+ ResizeBitCast(d_full, b), idx));
3241
+ }
3242
+
3243
+ template <class D, HWY_IF_T_SIZE_D(D, 2)>
3244
+ HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
3245
+ const Full128<TFromD<D>> d_full;
3246
+ const Indices128<TFromD<D>> idx{
3247
+ Dup128VecFromValues(Full128<uint8_t>(), 2, 3, 18, 19, 6, 7, 22, 23, 10,
3248
+ 11, 26, 27, 14, 15, 30, 31)
3249
+ .raw};
3250
+ return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
3251
+ ResizeBitCast(d_full, b), idx));
3252
+ }
3253
+
3254
+ template <class D, HWY_IF_T_SIZE_D(D, 4)>
3255
+ HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
3256
+ #if HWY_S390X_HAVE_Z14
3257
+ const Full128<TFromD<D>> d_full;
3258
+ const Indices128<TFromD<D>> idx{
3259
+ Dup128VecFromValues(Full128<uint8_t>(), 4, 5, 6, 7, 20, 21, 22, 23, 12,
3260
+ 13, 14, 15, 28, 29, 30, 31)
3261
+ .raw};
3262
+ return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
3263
+ ResizeBitCast(d_full, b), idx));
3264
+ #else
3265
+ (void)d;
3266
+ return VFromD<D>{vec_mergeo(a.raw, b.raw)};
3267
+ #endif
3268
+ }
3269
+
3270
+ template <class D, HWY_IF_T_SIZE_D(D, 8)>
3271
+ HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
3272
+ return InterleaveUpper(d, a, b);
3273
+ }
3274
+
3036
3275
  // ------------------------------ OddEvenBlocks
3037
3276
  template <typename T, size_t N>
3038
3277
  HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
@@ -3144,63 +3383,58 @@ HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, Vec128<T, N> bits) {
3144
3383
 
3145
3384
  // ------------------------------ MulEven/Odd 64x64 (UpperHalf)
3146
3385
 
3147
- HWY_INLINE Vec128<uint64_t> MulEven(Vec128<uint64_t> a, Vec128<uint64_t> b) {
3386
+ template <class T, HWY_IF_UI64(T)>
3387
+ HWY_INLINE Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) {
3148
3388
  #if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
3149
- using VU64 = __vector unsigned long long;
3150
- const VU64 mul128_result = reinterpret_cast<VU64>(vec_mule(a.raw, b.raw));
3389
+ using V64 = typename detail::Raw128<T>::type;
3390
+ const V64 mul128_result = reinterpret_cast<V64>(vec_mule(a.raw, b.raw));
3151
3391
  #if HWY_IS_LITTLE_ENDIAN
3152
- return Vec128<uint64_t>{mul128_result};
3392
+ return Vec128<T>{mul128_result};
3153
3393
  #else
3154
3394
  // Need to swap the two halves of mul128_result on big-endian targets as
3155
3395
  // the upper 64 bits of the product are in lane 0 of mul128_result and
3156
3396
  // the lower 64 bits of the product are in lane 1 of mul128_result
3157
- return Vec128<uint64_t>{vec_sld(mul128_result, mul128_result, 8)};
3397
+ return Vec128<T>{vec_sld(mul128_result, mul128_result, 8)};
3158
3398
  #endif
3159
3399
  #else
3160
- alignas(16) uint64_t mul[2];
3400
+ alignas(16) T mul[2];
3161
3401
  mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]);
3162
- return Load(Full128<uint64_t>(), mul);
3402
+ return Load(Full128<T>(), mul);
3163
3403
  #endif
3164
3404
  }
3165
3405
 
3166
- HWY_INLINE Vec128<uint64_t> MulOdd(Vec128<uint64_t> a, Vec128<uint64_t> b) {
3406
+ template <class T, HWY_IF_UI64(T)>
3407
+ HWY_INLINE Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) {
3167
3408
  #if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
3168
- using VU64 = __vector unsigned long long;
3169
- const VU64 mul128_result = reinterpret_cast<VU64>(vec_mulo(a.raw, b.raw));
3409
+ using V64 = typename detail::Raw128<T>::type;
3410
+ const V64 mul128_result = reinterpret_cast<V64>(vec_mulo(a.raw, b.raw));
3170
3411
  #if HWY_IS_LITTLE_ENDIAN
3171
- return Vec128<uint64_t>{mul128_result};
3412
+ return Vec128<T>{mul128_result};
3172
3413
  #else
3173
3414
  // Need to swap the two halves of mul128_result on big-endian targets as
3174
3415
  // the upper 64 bits of the product are in lane 0 of mul128_result and
3175
3416
  // the lower 64 bits of the product are in lane 1 of mul128_result
3176
- return Vec128<uint64_t>{vec_sld(mul128_result, mul128_result, 8)};
3417
+ return Vec128<T>{vec_sld(mul128_result, mul128_result, 8)};
3177
3418
  #endif
3178
3419
  #else
3179
- alignas(16) uint64_t mul[2];
3180
- const Full64<uint64_t> d2;
3420
+ alignas(16) T mul[2];
3421
+ const Full64<T> d2;
3181
3422
  mul[0] =
3182
3423
  Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]);
3183
- return Load(Full128<uint64_t>(), mul);
3424
+ return Load(Full128<T>(), mul);
3184
3425
  #endif
3185
3426
  }
3186
3427
 
3428
+ // ------------------------------ PromoteEvenTo/PromoteOddTo
3429
+ #include "hwy/ops/inside-inl.h"
3430
+
3187
3431
  // ------------------------------ WidenMulPairwiseAdd
3188
3432
 
3189
- template <class D32, HWY_IF_F32_D(D32),
3190
- class V16 = VFromD<Repartition<bfloat16_t, D32>>>
3191
- HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 df32, V16 a, V16 b) {
3192
- const RebindToUnsigned<decltype(df32)> du32;
3193
- // Lane order within sum0/1 is undefined, hence we can avoid the
3194
- // longer-latency lane-crossing PromoteTo. Using shift/and instead of Zip
3195
- // leads to the odd/even order that RearrangeToOddPlusEven prefers.
3196
- using VU32 = VFromD<decltype(du32)>;
3197
- const VU32 odd = Set(du32, 0xFFFF0000u);
3198
- const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
3199
- const VU32 ao = And(BitCast(du32, a), odd);
3200
- const VU32 be = ShiftLeft<16>(BitCast(du32, b));
3201
- const VU32 bo = And(BitCast(du32, b), odd);
3202
- return MulAdd(BitCast(df32, ae), BitCast(df32, be),
3203
- Mul(BitCast(df32, ao), BitCast(df32, bo)));
3433
+ template <class DF, HWY_IF_F32_D(DF),
3434
+ class VBF = VFromD<Repartition<bfloat16_t, DF>>>
3435
+ HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
3436
+ return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b),
3437
+ Mul(PromoteOddTo(df, a), PromoteOddTo(df, b)));
3204
3438
  }
3205
3439
 
3206
3440
  // Even if N=1, the input is always at least 2 lanes, hence vec_msum is safe.
@@ -3217,25 +3451,6 @@ HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 d32, V16 a, V16 b) {
3217
3451
 
3218
3452
  // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
3219
3453
 
3220
- template <class D32, HWY_IF_F32_D(D32),
3221
- class V16 = VFromD<Repartition<bfloat16_t, D32>>>
3222
- HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b,
3223
- VFromD<D32> sum0,
3224
- VFromD<D32>& sum1) {
3225
- const RebindToUnsigned<decltype(df32)> du32;
3226
- // Lane order within sum0/1 is undefined, hence we can avoid the
3227
- // longer-latency lane-crossing PromoteTo. Using shift/and instead of Zip
3228
- // leads to the odd/even order that RearrangeToOddPlusEven prefers.
3229
- using VU32 = VFromD<decltype(du32)>;
3230
- const VU32 odd = Set(du32, 0xFFFF0000u);
3231
- const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
3232
- const VU32 ao = And(BitCast(du32, a), odd);
3233
- const VU32 be = ShiftLeft<16>(BitCast(du32, b));
3234
- const VU32 bo = And(BitCast(du32, b), odd);
3235
- sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
3236
- return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
3237
- }
3238
-
3239
3454
  // Even if N=1, the input is always at least 2 lanes, hence vec_msum is safe.
3240
3455
  template <class D32, HWY_IF_UI32_D(D32),
3241
3456
  class V16 = VFromD<RepartitionToNarrow<D32>>>
@@ -3267,6 +3482,24 @@ HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) {
3267
3482
  return Add(sum0, sum1);
3268
3483
  }
3269
3484
 
3485
+ // ------------------------------ SatWidenMulPairwiseAccumulate
3486
+ #if !HWY_S390X_HAVE_Z14
3487
+
3488
+ #ifdef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
3489
+ #undef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
3490
+ #else
3491
+ #define HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
3492
+ #endif
3493
+
3494
+ template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 16)>
3495
+ HWY_API VFromD<DI32> SatWidenMulPairwiseAccumulate(
3496
+ DI32 /* tag */, VFromD<Repartition<int16_t, DI32>> a,
3497
+ VFromD<Repartition<int16_t, DI32>> b, VFromD<DI32> sum) {
3498
+ return VFromD<DI32>{vec_msums(a.raw, b.raw, sum.raw)};
3499
+ }
3500
+
3501
+ #endif // !HWY_S390X_HAVE_Z14
3502
+
3270
3503
  // ------------------------------ SumOfMulQuadAccumulate
3271
3504
  #if !HWY_S390X_HAVE_Z14
3272
3505
 
@@ -3997,29 +4230,43 @@ HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
3997
4230
 
3998
4231
  #endif // HWY_PPC_HAVE_9
3999
4232
 
4000
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_BF16_D(D)>
4001
- HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) {
4002
- const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
4003
- const Rebind<uint16_t, decltype(dbf16)> du16;
4004
- const auto bits_in_32 = ShiftRight<16>(BitCast(du32, v));
4005
- return BitCast(dbf16, TruncateTo(du16, bits_in_32));
4006
- }
4233
+ #if HWY_PPC_HAVE_10 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvspbf16)
4007
4234
 
4008
- template <class D, HWY_IF_BF16_D(D), class V32 = VFromD<Repartition<float, D>>>
4009
- HWY_API VFromD<D> ReorderDemote2To(D dbf16, V32 a, V32 b) {
4010
- const RebindToUnsigned<decltype(dbf16)> du16;
4011
- const Repartition<uint32_t, decltype(dbf16)> du32;
4012
- #if HWY_IS_LITTLE_ENDIAN
4013
- const auto a_in_odd = a;
4014
- const auto b_in_even = ShiftRight<16>(BitCast(du32, b));
4235
+ #ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
4236
+ #undef HWY_NATIVE_DEMOTE_F32_TO_BF16
4015
4237
  #else
4016
- const auto a_in_odd = ShiftRight<16>(BitCast(du32, a));
4017
- const auto b_in_even = b;
4238
+ #define HWY_NATIVE_DEMOTE_F32_TO_BF16
4018
4239
  #endif
4019
- return BitCast(dbf16,
4020
- OddEven(BitCast(du16, a_in_odd), BitCast(du16, b_in_even)));
4240
+
4241
+ namespace detail {
4242
+
4243
+ // VsxXvcvspbf16 converts a F32 vector to a BF16 vector, bitcasted to an U32
4244
+ // vector with the resulting BF16 bits in the lower 16 bits of each U32 lane
4245
+ template <class D, HWY_IF_BF16_D(D)>
4246
+ static HWY_INLINE VFromD<Rebind<uint32_t, D>> VsxXvcvspbf16(
4247
+ D dbf16, VFromD<Rebind<float, D>> v) {
4248
+ const Rebind<uint32_t, decltype(dbf16)> du32;
4249
+ const Repartition<uint8_t, decltype(du32)> du32_as_du8;
4250
+
4251
+ using VU32 = __vector unsigned int;
4252
+
4253
+ // Even though the __builtin_vsx_xvcvspbf16 builtin performs a F32 to BF16
4254
+ // conversion, the __builtin_vsx_xvcvspbf16 intrinsic expects a
4255
+ // __vector unsigned char argument (at least as of GCC 13 and Clang 17)
4256
+ return VFromD<Rebind<uint32_t, D>>{reinterpret_cast<VU32>(
4257
+ __builtin_vsx_xvcvspbf16(BitCast(du32_as_du8, v).raw))};
4258
+ }
4259
+
4260
+ } // namespace detail
4261
+
4262
+ template <class D, HWY_IF_BF16_D(D)>
4263
+ HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) {
4264
+ const RebindToUnsigned<decltype(dbf16)> du16;
4265
+ return BitCast(dbf16, TruncateTo(du16, detail::VsxXvcvspbf16(dbf16, v)));
4021
4266
  }
4022
4267
 
4268
+ #endif // HWY_PPC_HAVE_10 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvspbf16)
4269
+
4023
4270
  // Specializations for partial vectors because vec_packs sets lanes above 2*N.
4024
4271
  template <class DN, typename V, HWY_IF_V_SIZE_LE_D(DN, 4), HWY_IF_SIGNED_D(DN),
4025
4272
  HWY_IF_SIGNED_V(V),
@@ -4111,6 +4358,18 @@ HWY_API VFromD<DN> ReorderDemote2To(DN /*dn*/, V a, V b) {
4111
4358
  return VFromD<DN>{vec_packs(a.raw, b.raw)};
4112
4359
  }
4113
4360
 
4361
+ #if HWY_PPC_HAVE_10 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvspbf16)
4362
+ template <class D, class V, HWY_IF_BF16_D(D), HWY_IF_F32(TFromV<V>),
4363
+ HWY_IF_LANES_D(D, HWY_MAX_LANES_V(V) * 2)>
4364
+ HWY_API VFromD<D> ReorderDemote2To(D dbf16, V a, V b) {
4365
+ const RebindToUnsigned<decltype(dbf16)> du16;
4366
+ const Half<decltype(dbf16)> dh_bf16;
4367
+ return BitCast(dbf16,
4368
+ OrderedTruncate2To(du16, detail::VsxXvcvspbf16(dh_bf16, a),
4369
+ detail::VsxXvcvspbf16(dh_bf16, b)));
4370
+ }
4371
+ #endif
4372
+
4114
4373
  template <class D, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>), class V,
4115
4374
  HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
4116
4375
  HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
@@ -4119,15 +4378,13 @@ HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
4119
4378
  return ReorderDemote2To(d, a, b);
4120
4379
  }
4121
4380
 
4122
- template <class D, HWY_IF_BF16_D(D), class V32 = VFromD<Repartition<float, D>>>
4123
- HWY_API VFromD<D> OrderedDemote2To(D dbf16, V32 a, V32 b) {
4124
- const RebindToUnsigned<decltype(dbf16)> du16;
4125
- #if HWY_IS_LITTLE_ENDIAN
4126
- return BitCast(dbf16, ConcatOdd(du16, BitCast(du16, b), BitCast(du16, a)));
4127
- #else
4128
- return BitCast(dbf16, ConcatEven(du16, BitCast(du16, b), BitCast(du16, a)));
4129
- #endif
4381
+ #if HWY_PPC_HAVE_10 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvspbf16)
4382
+ template <class D, HWY_IF_BF16_D(D), class V, HWY_IF_F32(TFromV<V>),
4383
+ HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)>
4384
+ HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
4385
+ return ReorderDemote2To(d, a, b);
4130
4386
  }
4387
+ #endif
4131
4388
 
4132
4389
  template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
4133
4390
  HWY_API Vec32<float> DemoteTo(D /* tag */, Vec64<double> v) {
@@ -4938,7 +5195,7 @@ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
4938
5195
 
4939
5196
  namespace detail {
4940
5197
 
4941
- #if !HWY_S390X_HAVE_Z14 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
5198
+ #if !HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN
4942
5199
  // fallback for missing vec_extractm
4943
5200
  template <size_t N>
4944
5201
  HWY_INLINE uint64_t ExtractSignBits(Vec128<uint8_t, N> sign_bits,
@@ -4946,42 +5203,18 @@ HWY_INLINE uint64_t ExtractSignBits(Vec128<uint8_t, N> sign_bits,
4946
5203
  // clang POWER8 and 9 targets appear to differ in their return type of
4947
5204
  // vec_vbpermq: unsigned or signed, so cast to avoid a warning.
4948
5205
  using VU64 = detail::Raw128<uint64_t>::type;
5206
+ #if HWY_S390X_HAVE_Z14
5207
+ const Vec128<uint64_t> extracted{
5208
+ reinterpret_cast<VU64>(vec_bperm_u128(sign_bits.raw, bit_shuffle))};
5209
+ #else
4949
5210
  const Vec128<uint64_t> extracted{
4950
5211
  reinterpret_cast<VU64>(vec_vbpermq(sign_bits.raw, bit_shuffle))};
5212
+ #endif
4951
5213
  return extracted.raw[HWY_IS_LITTLE_ENDIAN];
4952
5214
  }
4953
5215
 
4954
- #endif // !HWY_S390X_HAVE_Z14 && !HWY_PPC_HAVE_10
4955
-
4956
- #if HWY_S390X_HAVE_Z14
4957
- template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
4958
- HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128<T, N> mask) {
4959
- const DFromM<decltype(mask)> d;
4960
- const Repartition<uint8_t, decltype(d)> du8;
4961
- const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
4962
-
4963
- return ReduceSum(
4964
- du8, And(sign_bits, Dup128VecFromValues(du8, 1, 2, 4, 8, 16, 32, 64, 128,
4965
- 1, 2, 4, 8, 16, 32, 64, 128)));
4966
- }
5216
+ #endif // !HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN
4967
5217
 
4968
- template <typename T>
4969
- HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128<T> mask) {
4970
- const DFromM<decltype(mask)> d;
4971
- const Repartition<uint8_t, decltype(d)> du8;
4972
- const Repartition<uint64_t, decltype(d)> du64;
4973
- const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
4974
-
4975
- const auto mask_bytes = SumsOf8(
4976
- And(sign_bits, Dup128VecFromValues(du8, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2,
4977
- 4, 8, 16, 32, 64, 128)));
4978
-
4979
- const Rebind<uint8_t, decltype(du64)> du8_2;
4980
- const Repartition<uint16_t, decltype(du8_2)> du16_1;
4981
- return GetLane(
4982
- BitCast(du16_1, TruncateTo(du8_2, Reverse2(du64, mask_bytes))));
4983
- }
4984
- #else
4985
5218
  template <typename T, size_t N>
4986
5219
  HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128<T, N> mask) {
4987
5220
  const DFromM<decltype(mask)> d;
@@ -4990,30 +5223,24 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128<T, N> mask) {
4990
5223
 
4991
5224
  #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
4992
5225
  return static_cast<uint64_t>(vec_extractm(sign_bits.raw));
4993
- #else // PPC8, PPC9, or big-endian PPC10
5226
+ #else // Z14, Z15, PPC8, PPC9, or big-endian PPC10
4994
5227
  const __vector unsigned char kBitShuffle = {120, 112, 104, 96, 88, 80, 72, 64,
4995
5228
  56, 48, 40, 32, 24, 16, 8, 0};
4996
5229
  return ExtractSignBits(sign_bits, kBitShuffle);
4997
5230
  #endif // HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
4998
5231
  }
4999
- #endif // HWY_S390X_HAVE_Z14
5000
5232
 
5001
5233
  template <typename T, size_t N>
5002
5234
  HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128<T, N> mask) {
5003
5235
  const DFromM<decltype(mask)> d;
5004
5236
  const RebindToUnsigned<decltype(d)> du;
5005
5237
 
5006
- #if HWY_S390X_HAVE_Z14
5007
- const VFromD<decltype(du)> sign_bits = BitCast(du, VecFromMask(d, mask));
5008
- return ReduceSum(
5009
- du, And(sign_bits, Dup128VecFromValues(du, 1, 2, 4, 8, 16, 32, 64, 128)));
5010
- #else // VSX
5011
5238
  const Repartition<uint8_t, decltype(d)> du8;
5012
5239
  const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
5013
5240
 
5014
5241
  #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
5015
5242
  return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw));
5016
- #else // PPC8, PPC9, or big-endian PPC10
5243
+ #else // Z14, Z15, PPC8, PPC9, or big-endian PPC10
5017
5244
  (void)du;
5018
5245
  #if HWY_IS_LITTLE_ENDIAN
5019
5246
  const __vector unsigned char kBitShuffle = {
@@ -5024,7 +5251,6 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128<T, N> mask) {
5024
5251
  #endif
5025
5252
  return ExtractSignBits(sign_bits, kBitShuffle);
5026
5253
  #endif // HWY_PPC_HAVE_10
5027
- #endif // HWY_S390X_HAVE_Z14
5028
5254
  }
5029
5255
 
5030
5256
  template <typename T, size_t N>
@@ -5032,16 +5258,12 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128<T, N> mask) {
5032
5258
  const DFromM<decltype(mask)> d;
5033
5259
  const RebindToUnsigned<decltype(d)> du;
5034
5260
 
5035
- #if HWY_S390X_HAVE_Z14
5036
- const VFromD<decltype(du)> sign_bits = BitCast(du, VecFromMask(d, mask));
5037
- return ReduceSum(du, And(sign_bits, Dup128VecFromValues(du, 1, 2, 4, 8)));
5038
- #else // VSX
5039
5261
  const Repartition<uint8_t, decltype(d)> du8;
5040
5262
  const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
5041
5263
 
5042
5264
  #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
5043
5265
  return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw));
5044
- #else // PPC8, PPC9, or big-endian PPC10
5266
+ #else // Z14, Z15, PPC8, PPC9, or big-endian PPC10
5045
5267
  (void)du;
5046
5268
  #if HWY_IS_LITTLE_ENDIAN
5047
5269
  const __vector unsigned char kBitShuffle = {96, 64, 32, 0, 128, 128,
@@ -5054,7 +5276,6 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128<T, N> mask) {
5054
5276
  #endif
5055
5277
  return ExtractSignBits(sign_bits, kBitShuffle);
5056
5278
  #endif // HWY_PPC_HAVE_10
5057
- #endif // HWY_S390X_HAVE_Z14
5058
5279
  }
5059
5280
 
5060
5281
  template <typename T, size_t N>
@@ -5062,16 +5283,12 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128<T, N> mask) {
5062
5283
  const DFromM<decltype(mask)> d;
5063
5284
  const RebindToUnsigned<decltype(d)> du;
5064
5285
 
5065
- #if HWY_S390X_HAVE_Z14
5066
- const VFromD<decltype(du)> sign_bits = BitCast(du, VecFromMask(d, mask));
5067
- return ReduceSum(du, And(sign_bits, Dup128VecFromValues(du, 1, 2)));
5068
- #else // VSX
5069
5286
  const Repartition<uint8_t, decltype(d)> du8;
5070
5287
  const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
5071
5288
 
5072
5289
  #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
5073
5290
  return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw));
5074
- #else
5291
+ #else // Z14, Z15, PPC8, PPC9, or big-endian PPC10
5075
5292
  (void)du;
5076
5293
  #if HWY_IS_LITTLE_ENDIAN
5077
5294
  const __vector unsigned char kBitShuffle = {64, 0, 128, 128, 128, 128,
@@ -5084,7 +5301,6 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128<T, N> mask) {
5084
5301
  #endif
5085
5302
  return ExtractSignBits(sign_bits, kBitShuffle);
5086
5303
  #endif // HWY_PPC_HAVE_10
5087
- #endif // HWY_S390X_HAVE_Z14
5088
5304
  }
5089
5305
 
5090
5306
  // Returns the lowest N of the mask bits.
@@ -6704,6 +6920,80 @@ HWY_API TFromD<D> ReduceSum(D /*d*/, VFromD<D> v) {
6704
6920
  return static_cast<TFromD<D>>(GetLane(SumsOf4(v)));
6705
6921
  }
6706
6922
 
6923
+ // ------------------------------ BitShuffle
6924
+
6925
+ #ifdef HWY_NATIVE_BITSHUFFLE
6926
+ #undef HWY_NATIVE_BITSHUFFLE
6927
+ #else
6928
+ #define HWY_NATIVE_BITSHUFFLE
6929
+ #endif
6930
+
6931
+ template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>),
6932
+ HWY_IF_V_SIZE_V(VI, HWY_MAX_LANES_V(V) * 8)>
6933
+ HWY_API V BitShuffle(V v, VI idx) {
6934
+ const DFromV<decltype(v)> d64;
6935
+ const RebindToUnsigned<decltype(d64)> du64;
6936
+ const Repartition<uint8_t, decltype(d64)> du8;
6937
+
6938
+ const Full128<TFromD<decltype(du64)>> d_full_u64;
6939
+ const Full128<TFromD<decltype(du8)>> d_full_u8;
6940
+
6941
+ using RawVU64 = __vector unsigned long long;
6942
+
6943
+ #if HWY_PPC_HAVE_9
6944
+
6945
+ #if HWY_IS_LITTLE_ENDIAN
6946
+ (void)d_full_u64;
6947
+ auto bit_idx = ResizeBitCast(d_full_u8, idx);
6948
+ #else
6949
+ auto bit_idx =
6950
+ BitCast(d_full_u8, ReverseLaneBytes(ResizeBitCast(d_full_u64, idx)));
6951
+ #endif
6952
+
6953
+ bit_idx = Xor(bit_idx, Set(d_full_u8, uint8_t{0x3F}));
6954
+
6955
+ return BitCast(d64, VFromD<decltype(du64)>{reinterpret_cast<RawVU64>(
6956
+ vec_bperm(BitCast(du64, v).raw, bit_idx.raw))});
6957
+ #else // !HWY_PPC_HAVE_9
6958
+
6959
+ #if HWY_IS_LITTLE_ENDIAN
6960
+ const auto bit_idx_xor_mask = BitCast(
6961
+ d_full_u8, Dup128VecFromValues(d_full_u64, uint64_t{0x7F7F7F7F7F7F7F7Fu},
6962
+ uint64_t{0x3F3F3F3F3F3F3F3Fu}));
6963
+ const auto bit_idx = Xor(ResizeBitCast(d_full_u8, idx), bit_idx_xor_mask);
6964
+ constexpr int kBitShufResultByteShrAmt = 8;
6965
+ #else
6966
+ const auto bit_idx_xor_mask = BitCast(
6967
+ d_full_u8, Dup128VecFromValues(d_full_u64, uint64_t{0x3F3F3F3F3F3F3F3Fu},
6968
+ uint64_t{0x7F7F7F7F7F7F7F7Fu}));
6969
+ const auto bit_idx =
6970
+ Xor(BitCast(d_full_u8, ReverseLaneBytes(ResizeBitCast(d_full_u64, idx))),
6971
+ bit_idx_xor_mask);
6972
+ constexpr int kBitShufResultByteShrAmt = 6;
6973
+ #endif
6974
+
6975
+ #if HWY_S390X_HAVE_Z14
6976
+ const VFromD<decltype(d_full_u64)> bit_shuf_result{reinterpret_cast<RawVU64>(
6977
+ vec_bperm_u128(BitCast(du8, v).raw, bit_idx.raw))};
6978
+ #elif defined(__SIZEOF_INT128__)
6979
+ using RawVU128 = __vector unsigned __int128;
6980
+ const VFromD<decltype(d_full_u64)> bit_shuf_result{reinterpret_cast<RawVU64>(
6981
+ vec_vbpermq(reinterpret_cast<RawVU128>(v.raw), bit_idx.raw))};
6982
+ #else
6983
+ using RawVU128 = __vector unsigned char;
6984
+ const VFromD<decltype(d_full_u64)> bit_shuf_result{reinterpret_cast<RawVU64>(
6985
+ vec_vbpermq(reinterpret_cast<RawVU128>(v.raw), bit_idx.raw))};
6986
+ #endif
6987
+
6988
+ return ResizeBitCast(
6989
+ d64, PromoteTo(d_full_u64,
6990
+ ResizeBitCast(
6991
+ Rebind<uint8_t, decltype(d_full_u64)>(),
6992
+ CombineShiftRightBytes<kBitShufResultByteShrAmt>(
6993
+ d_full_u64, bit_shuf_result, bit_shuf_result))));
6994
+ #endif // HWY_PPC_HAVE_9
6995
+ }
6996
+
6707
6997
  // ------------------------------ Lt128
6708
6998
 
6709
6999
  namespace detail {