@img/sharp-libvips-dev 1.0.2 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -2
- package/include/aom/aom_decoder.h +1 -1
- package/include/aom/aom_encoder.h +7 -1
- package/include/aom/aom_image.h +24 -12
- package/include/aom/aom_integer.h +3 -3
- package/include/aom/aomcx.h +15 -0
- package/include/aom/aomdx.h +5 -2
- package/include/archive.h +7 -5
- package/include/archive_entry.h +5 -3
- package/include/cgif.h +3 -0
- package/include/freetype2/freetype/config/ftoption.h +1 -1
- package/include/fribidi/fribidi-config.h +2 -2
- package/include/fribidi/fribidi-unicode-version.h +3 -3
- package/include/glib-2.0/gio/gappinfo.h +40 -25
- package/include/glib-2.0/gio/gasyncresult.h +1 -1
- package/include/glib-2.0/gio/gconverter.h +5 -0
- package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
- package/include/glib-2.0/gio/gfile.h +16 -0
- package/include/glib-2.0/gio/gio-visibility.h +34 -0
- package/include/glib-2.0/gio/gsettings.h +8 -0
- package/include/glib-2.0/gio/gvfs.h +2 -2
- package/include/glib-2.0/girepository/gi-visibility.h +34 -0
- package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
- package/include/glib-2.0/glib/giochannel.h +2 -2
- package/include/glib-2.0/glib/glib-visibility.h +34 -0
- package/include/glib-2.0/glib/gmacros.h +12 -5
- package/include/glib-2.0/glib/gmain.h +93 -7
- package/include/glib-2.0/glib/gqsort.h +8 -1
- package/include/glib-2.0/glib/gstrfuncs.h +0 -12
- package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
- package/include/glib-2.0/glib/gunicode.h +1 -1
- package/include/glib-2.0/glib/gversionmacros.h +9 -0
- package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
- package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
- package/include/glib-2.0/gobject/gtype.h +6 -6
- package/include/harfbuzz/hb-buffer.h +6 -0
- package/include/harfbuzz/hb-common.h +6 -9
- package/include/harfbuzz/hb-cplusplus.hh +8 -11
- package/include/harfbuzz/hb-subset.h +17 -4
- package/include/harfbuzz/hb-version.h +3 -3
- package/include/hwy/abort.h +28 -0
- package/include/hwy/aligned_allocator.h +48 -1
- package/include/hwy/base.h +235 -34
- package/include/hwy/detect_compiler_arch.h +84 -10
- package/include/hwy/detect_targets.h +95 -29
- package/include/hwy/foreach_target.h +12 -1
- package/include/hwy/highway.h +205 -50
- package/include/hwy/ops/arm_neon-inl.h +841 -99
- package/include/hwy/ops/arm_sve-inl.h +413 -141
- package/include/hwy/ops/emu128-inl.h +373 -360
- package/include/hwy/ops/generic_ops-inl.h +804 -401
- package/include/hwy/ops/inside-inl.h +691 -0
- package/include/hwy/ops/ppc_vsx-inl.h +456 -166
- package/include/hwy/ops/rvv-inl.h +537 -249
- package/include/hwy/ops/scalar-inl.h +169 -79
- package/include/hwy/ops/set_macros-inl.h +106 -18
- package/include/hwy/ops/shared-inl.h +23 -0
- package/include/hwy/ops/wasm_128-inl.h +130 -108
- package/include/hwy/ops/x86_128-inl.h +1892 -577
- package/include/hwy/ops/x86_256-inl.h +625 -184
- package/include/hwy/ops/x86_512-inl.h +733 -131
- package/include/hwy/targets.h +22 -21
- package/include/hwy/timer-inl.h +3 -3
- package/include/hwy/timer.h +5 -1
- package/include/libheif/heif.h +170 -15
- package/include/libheif/heif_items.h +237 -0
- package/include/libheif/heif_properties.h +38 -2
- package/include/libheif/heif_regions.h +1 -1
- package/include/libheif/heif_version.h +2 -2
- package/include/libpng16/pnglibconf.h +1 -1
- package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
- package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
- package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
- package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
- package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
- package/include/libxml2/libxml/HTMLparser.h +12 -19
- package/include/libxml2/libxml/c14n.h +1 -12
- package/include/libxml2/libxml/debugXML.h +1 -1
- package/include/libxml2/libxml/encoding.h +9 -0
- package/include/libxml2/libxml/entities.h +12 -1
- package/include/libxml2/libxml/hash.h +19 -0
- package/include/libxml2/libxml/list.h +2 -2
- package/include/libxml2/libxml/nanohttp.h +17 -0
- package/include/libxml2/libxml/parser.h +61 -55
- package/include/libxml2/libxml/parserInternals.h +9 -1
- package/include/libxml2/libxml/pattern.h +6 -0
- package/include/libxml2/libxml/tree.h +32 -12
- package/include/libxml2/libxml/uri.h +11 -0
- package/include/libxml2/libxml/valid.h +29 -2
- package/include/libxml2/libxml/xinclude.h +7 -0
- package/include/libxml2/libxml/xmlIO.h +21 -4
- package/include/libxml2/libxml/xmlerror.h +14 -0
- package/include/libxml2/libxml/xmlexports.h +111 -15
- package/include/libxml2/libxml/xmlmemory.h +8 -45
- package/include/libxml2/libxml/xmlreader.h +2 -0
- package/include/libxml2/libxml/xmlsave.h +5 -0
- package/include/libxml2/libxml/xmlunicode.h +165 -1
- package/include/libxml2/libxml/xmlversion.h +15 -179
- package/include/libxml2/libxml/xmlwriter.h +1 -0
- package/include/libxml2/libxml/xpath.h +4 -0
- package/include/pango-1.0/pango/pango-features.h +3 -3
- package/include/pango-1.0/pango/pango-item.h +4 -2
- package/include/pango-1.0/pango/pango-version-macros.h +25 -0
- package/include/pango-1.0/pango/pangofc-font.h +2 -1
- package/include/pnglibconf.h +1 -1
- package/include/vips/util.h +1 -2
- package/include/vips/version.h +4 -4
- package/include/webp/decode.h +58 -56
- package/include/webp/demux.h +25 -21
- package/include/webp/encode.h +44 -39
- package/include/webp/mux.h +76 -15
- package/include/webp/mux_types.h +2 -1
- package/include/webp/sharpyuv/sharpyuv.h +77 -8
- package/include/webp/types.h +29 -8
- package/include/zconf.h +1 -1
- package/include/zlib.h +12 -12
- package/package.json +1 -1
- package/versions.json +14 -15
|
@@ -199,9 +199,6 @@ HWY_API Vec128<T, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
|
|
|
199
199
|
template <class D>
|
|
200
200
|
using VFromD = decltype(Zero(D()));
|
|
201
201
|
|
|
202
|
-
// ------------------------------ Tuple (VFromD)
|
|
203
|
-
#include "hwy/ops/tuple-inl.h"
|
|
204
|
-
|
|
205
202
|
// ------------------------------ BitCast
|
|
206
203
|
|
|
207
204
|
template <class D, typename FromT>
|
|
@@ -247,6 +244,8 @@ HWY_API VFromD<D> Undefined(D d) {
|
|
|
247
244
|
// Suppressing maybe-uninitialized both here and at the caller does not work,
|
|
248
245
|
// so initialize.
|
|
249
246
|
return Zero(d);
|
|
247
|
+
#elif HWY_HAS_BUILTIN(__builtin_nondeterministic_value)
|
|
248
|
+
return VFromD<D>{__builtin_nondeterministic_value(Zero(d).raw)};
|
|
250
249
|
#else
|
|
251
250
|
HWY_DIAGNOSTICS(push)
|
|
252
251
|
HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
|
|
@@ -671,10 +670,21 @@ HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) {
|
|
|
671
670
|
|
|
672
671
|
template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
|
|
673
672
|
HWY_API Vec128<T> Load(D /* tag */, const T* HWY_RESTRICT aligned) {
|
|
673
|
+
// Suppress the -Wignored-attributes warning that is generated by
|
|
674
|
+
// HWY_RCAST_ALIGNED(const LoadRaw*, aligned) with GCC
|
|
675
|
+
#if HWY_COMPILER_GCC
|
|
676
|
+
HWY_DIAGNOSTICS(push)
|
|
677
|
+
HWY_DIAGNOSTICS_OFF(disable : 4649, ignored "-Wignored-attributes")
|
|
678
|
+
#endif
|
|
679
|
+
|
|
674
680
|
using LoadRaw = typename detail::Raw128<T>::AlignedRawVec;
|
|
675
681
|
const LoadRaw* HWY_RESTRICT p = HWY_RCAST_ALIGNED(const LoadRaw*, aligned);
|
|
676
682
|
using ResultRaw = typename detail::Raw128<T>::type;
|
|
677
683
|
return Vec128<T>{reinterpret_cast<ResultRaw>(*p)};
|
|
684
|
+
|
|
685
|
+
#if HWY_COMPILER_GCC
|
|
686
|
+
HWY_DIAGNOSTICS(pop)
|
|
687
|
+
#endif
|
|
678
688
|
}
|
|
679
689
|
|
|
680
690
|
// Any <= 64 bit
|
|
@@ -1275,8 +1285,19 @@ HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
|
|
|
1275
1285
|
|
|
1276
1286
|
template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
|
|
1277
1287
|
HWY_API void Store(Vec128<T> v, D /* tag */, T* HWY_RESTRICT aligned) {
|
|
1288
|
+
// Suppress the -Wignored-attributes warning that is generated by
|
|
1289
|
+
// HWY_RCAST_ALIGNED(StoreRaw*, aligned) with GCC
|
|
1290
|
+
#if HWY_COMPILER_GCC
|
|
1291
|
+
HWY_DIAGNOSTICS(push)
|
|
1292
|
+
HWY_DIAGNOSTICS_OFF(disable : 4649, ignored "-Wignored-attributes")
|
|
1293
|
+
#endif
|
|
1294
|
+
|
|
1278
1295
|
using StoreRaw = typename detail::Raw128<T>::AlignedRawVec;
|
|
1279
1296
|
*HWY_RCAST_ALIGNED(StoreRaw*, aligned) = reinterpret_cast<StoreRaw>(v.raw);
|
|
1297
|
+
|
|
1298
|
+
#if HWY_COMPILER_GCC
|
|
1299
|
+
HWY_DIAGNOSTICS(pop)
|
|
1300
|
+
#endif
|
|
1280
1301
|
}
|
|
1281
1302
|
|
|
1282
1303
|
template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
|
|
@@ -1343,17 +1364,8 @@ HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
|
|
|
1343
1364
|
template <class D>
|
|
1344
1365
|
HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
|
|
1345
1366
|
TFromD<D>* HWY_RESTRICT p) {
|
|
1346
|
-
const
|
|
1347
|
-
|
|
1348
|
-
alignas(16) TI buf[MaxLanes(d)];
|
|
1349
|
-
alignas(16) TI mask[MaxLanes(d)];
|
|
1350
|
-
Store(BitCast(di, v), di, buf);
|
|
1351
|
-
Store(BitCast(di, VecFromMask(d, m)), di, mask);
|
|
1352
|
-
for (size_t i = 0; i < MaxLanes(d); ++i) {
|
|
1353
|
-
if (mask[i]) {
|
|
1354
|
-
CopySameSize(buf + i, p + i);
|
|
1355
|
-
}
|
|
1356
|
-
}
|
|
1367
|
+
const VFromD<D> old = LoadU(d, p);
|
|
1368
|
+
StoreU(IfThenElse(RebindMask(d, m), v, old), d, p);
|
|
1357
1369
|
}
|
|
1358
1370
|
|
|
1359
1371
|
// ================================================== ARITHMETIC
|
|
@@ -1577,27 +1589,84 @@ HWY_API Vec128<T, N> operator*(Vec128<T, N> a, Vec128<T, N> b) {
|
|
|
1577
1589
|
#endif
|
|
1578
1590
|
}
|
|
1579
1591
|
|
|
1580
|
-
// Returns the upper
|
|
1581
|
-
|
|
1582
|
-
HWY_API Vec128<T, N> MulHigh(Vec128<T, N> a, Vec128<T, N> b) {
|
|
1592
|
+
// Returns the upper sizeof(T)*8 bits of a * b in each lane.
|
|
1593
|
+
|
|
1583
1594
|
#if HWY_S390X_HAVE_Z14
|
|
1595
|
+
#define HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T) \
|
|
1596
|
+
HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))
|
|
1597
|
+
#define HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T) \
|
|
1598
|
+
hwy::EnableIf<!hwy::IsSame<T, T>()>* = nullptr
|
|
1599
|
+
#elif HWY_PPC_HAVE_10
|
|
1600
|
+
#define HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T) \
|
|
1601
|
+
HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))
|
|
1602
|
+
#define HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T) \
|
|
1603
|
+
HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))
|
|
1604
|
+
#else
|
|
1605
|
+
#define HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T) \
|
|
1606
|
+
hwy::EnableIf<!hwy::IsSame<T, T>()>* = nullptr
|
|
1607
|
+
#define HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T) \
|
|
1608
|
+
HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))
|
|
1609
|
+
#endif
|
|
1610
|
+
|
|
1611
|
+
#if HWY_S390X_HAVE_Z14 || HWY_PPC_HAVE_10
|
|
1612
|
+
template <typename T, size_t N, HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T),
|
|
1613
|
+
HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
|
|
1614
|
+
HWY_API Vec128<T, N> MulHigh(Vec128<T, N> a, Vec128<T, N> b) {
|
|
1584
1615
|
return Vec128<T, N>{vec_mulh(a.raw, b.raw)};
|
|
1616
|
+
}
|
|
1617
|
+
#endif
|
|
1618
|
+
|
|
1619
|
+
template <typename T, HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T),
|
|
1620
|
+
HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
|
|
1621
|
+
HWY_API Vec128<T, 1> MulHigh(Vec128<T, 1> a, Vec128<T, 1> b) {
|
|
1622
|
+
const auto p_even = MulEven(a, b);
|
|
1623
|
+
|
|
1624
|
+
#if HWY_IS_LITTLE_ENDIAN
|
|
1625
|
+
const auto p_even_full = ResizeBitCast(Full128<T>(), p_even);
|
|
1626
|
+
return Vec128<T, 1>{
|
|
1627
|
+
vec_sld(p_even_full.raw, p_even_full.raw, 16 - sizeof(T))};
|
|
1585
1628
|
#else
|
|
1586
1629
|
const DFromV<decltype(a)> d;
|
|
1587
|
-
|
|
1588
|
-
|
|
1589
|
-
|
|
1630
|
+
return ResizeBitCast(d, p_even);
|
|
1631
|
+
#endif
|
|
1632
|
+
}
|
|
1633
|
+
|
|
1634
|
+
template <typename T, size_t N,
|
|
1635
|
+
HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T),
|
|
1636
|
+
HWY_IF_NOT_FLOAT_NOR_SPECIAL(T), HWY_IF_LANES_GT(N, 1)>
|
|
1637
|
+
HWY_API Vec128<T, N> MulHigh(Vec128<T, N> a, Vec128<T, N> b) {
|
|
1638
|
+
const DFromV<decltype(a)> d;
|
|
1639
|
+
|
|
1640
|
+
const auto p_even = BitCast(d, MulEven(a, b));
|
|
1641
|
+
const auto p_odd = BitCast(d, MulOdd(a, b));
|
|
1642
|
+
|
|
1590
1643
|
#if HWY_IS_LITTLE_ENDIAN
|
|
1591
|
-
|
|
1592
|
-
10, 11, 26, 27, 14, 15, 30, 31};
|
|
1644
|
+
return InterleaveOdd(d, p_even, p_odd);
|
|
1593
1645
|
#else
|
|
1594
|
-
|
|
1595
|
-
8, 9, 24, 25, 12, 13, 28, 29};
|
|
1596
|
-
#endif
|
|
1597
|
-
return BitCast(d, VFromD<decltype(dw)>{vec_perm(p1.raw, p2.raw, kShuffle)});
|
|
1646
|
+
return InterleaveEven(d, p_even, p_odd);
|
|
1598
1647
|
#endif
|
|
1599
1648
|
}
|
|
1600
1649
|
|
|
1650
|
+
#if !HWY_PPC_HAVE_10
|
|
1651
|
+
template <class T, HWY_IF_UI64(T)>
|
|
1652
|
+
HWY_API Vec64<T> MulHigh(Vec64<T> a, Vec64<T> b) {
|
|
1653
|
+
T p_hi;
|
|
1654
|
+
Mul128(GetLane(a), GetLane(b), &p_hi);
|
|
1655
|
+
return Set(Full64<T>(), p_hi);
|
|
1656
|
+
}
|
|
1657
|
+
|
|
1658
|
+
template <class T, HWY_IF_UI64(T)>
|
|
1659
|
+
HWY_API Vec128<T> MulHigh(Vec128<T> a, Vec128<T> b) {
|
|
1660
|
+
const DFromV<decltype(a)> d;
|
|
1661
|
+
const Half<decltype(d)> dh;
|
|
1662
|
+
return Combine(d, MulHigh(UpperHalf(dh, a), UpperHalf(dh, b)),
|
|
1663
|
+
MulHigh(LowerHalf(dh, a), LowerHalf(dh, b)));
|
|
1664
|
+
}
|
|
1665
|
+
#endif // !HWY_PPC_HAVE_10
|
|
1666
|
+
|
|
1667
|
+
#undef HWY_PPC_IF_MULHIGH_USING_VEC_MULH
|
|
1668
|
+
#undef HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH
|
|
1669
|
+
|
|
1601
1670
|
// Multiplies even lanes (0, 2, ..) and places the double-wide result into
|
|
1602
1671
|
// even and the upper half into its odd neighbor lane.
|
|
1603
1672
|
template <typename T, size_t N,
|
|
@@ -1618,29 +1687,83 @@ HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulOdd(Vec128<T, N> a,
|
|
|
1618
1687
|
return Vec128<MakeWide<T>, (N + 1) / 2>{vec_mulo(a.raw, b.raw)};
|
|
1619
1688
|
}
|
|
1620
1689
|
|
|
1690
|
+
// ------------------------------ Rol/Ror
|
|
1691
|
+
|
|
1692
|
+
#ifdef HWY_NATIVE_ROL_ROR_8
|
|
1693
|
+
#undef HWY_NATIVE_ROL_ROR_8
|
|
1694
|
+
#else
|
|
1695
|
+
#define HWY_NATIVE_ROL_ROR_8
|
|
1696
|
+
#endif
|
|
1697
|
+
|
|
1698
|
+
#ifdef HWY_NATIVE_ROL_ROR_16
|
|
1699
|
+
#undef HWY_NATIVE_ROL_ROR_16
|
|
1700
|
+
#else
|
|
1701
|
+
#define HWY_NATIVE_ROL_ROR_16
|
|
1702
|
+
#endif
|
|
1703
|
+
|
|
1704
|
+
#ifdef HWY_NATIVE_ROL_ROR_32_64
|
|
1705
|
+
#undef HWY_NATIVE_ROL_ROR_32_64
|
|
1706
|
+
#else
|
|
1707
|
+
#define HWY_NATIVE_ROL_ROR_32_64
|
|
1708
|
+
#endif
|
|
1709
|
+
|
|
1710
|
+
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
|
|
1711
|
+
HWY_API Vec128<T, N> Rol(Vec128<T, N> a, Vec128<T, N> b) {
|
|
1712
|
+
const DFromV<decltype(a)> d;
|
|
1713
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
1714
|
+
return BitCast(
|
|
1715
|
+
d, VFromD<decltype(du)>{vec_rl(BitCast(du, a).raw, BitCast(du, b).raw)});
|
|
1716
|
+
}
|
|
1717
|
+
|
|
1718
|
+
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
|
|
1719
|
+
HWY_API Vec128<T, N> Ror(Vec128<T, N> a, Vec128<T, N> b) {
|
|
1720
|
+
const DFromV<decltype(a)> d;
|
|
1721
|
+
const RebindToSigned<decltype(d)> di;
|
|
1722
|
+
return Rol(a, BitCast(d, Neg(BitCast(di, b))));
|
|
1723
|
+
}
|
|
1724
|
+
|
|
1621
1725
|
// ------------------------------ RotateRight
|
|
1622
|
-
template <int kBits, typename T, size_t N>
|
|
1726
|
+
template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
|
|
1623
1727
|
HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
|
|
1624
1728
|
const DFromV<decltype(v)> d;
|
|
1625
|
-
const RebindToUnsigned<decltype(d)> du;
|
|
1626
1729
|
constexpr size_t kSizeInBits = sizeof(T) * 8;
|
|
1627
1730
|
static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
|
|
1628
1731
|
|
|
1629
|
-
|
|
1732
|
+
return (kBits == 0)
|
|
1733
|
+
? v
|
|
1734
|
+
: Rol(v, Set(d, static_cast<T>(static_cast<int>(kSizeInBits) -
|
|
1735
|
+
kBits)));
|
|
1736
|
+
}
|
|
1737
|
+
|
|
1738
|
+
// ------------------------------ RotateLeftSame/RotateRightSame
|
|
1739
|
+
#ifdef HWY_NATIVE_ROL_ROR_SAME_8
|
|
1740
|
+
#undef HWY_NATIVE_ROL_ROR_SAME_8
|
|
1741
|
+
#else
|
|
1742
|
+
#define HWY_NATIVE_ROL_ROR_SAME_8
|
|
1743
|
+
#endif
|
|
1744
|
+
|
|
1745
|
+
#ifdef HWY_NATIVE_ROL_ROR_SAME_16
|
|
1746
|
+
#undef HWY_NATIVE_ROL_ROR_SAME_16
|
|
1747
|
+
#else
|
|
1748
|
+
#define HWY_NATIVE_ROL_ROR_SAME_16
|
|
1749
|
+
#endif
|
|
1750
|
+
|
|
1751
|
+
#ifdef HWY_NATIVE_ROL_ROR_SAME_32_64
|
|
1752
|
+
#undef HWY_NATIVE_ROL_ROR_SAME_32_64
|
|
1753
|
+
#else
|
|
1754
|
+
#define HWY_NATIVE_ROL_ROR_SAME_32_64
|
|
1755
|
+
#endif
|
|
1630
1756
|
|
|
1631
|
-
|
|
1632
|
-
|
|
1633
|
-
|
|
1757
|
+
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
|
|
1758
|
+
HWY_API Vec128<T, N> RotateLeftSame(Vec128<T, N> v, int bits) {
|
|
1759
|
+
const DFromV<decltype(v)> d;
|
|
1760
|
+
return Rol(v, Set(d, static_cast<T>(static_cast<unsigned>(bits))));
|
|
1634
1761
|
}
|
|
1635
1762
|
|
|
1636
|
-
|
|
1637
|
-
|
|
1638
|
-
HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
|
|
1639
|
-
static_assert(IsFloat<T>(), "Only works for float");
|
|
1763
|
+
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
|
|
1764
|
+
HWY_API Vec128<T, N> RotateRightSame(Vec128<T, N> v, int bits) {
|
|
1640
1765
|
const DFromV<decltype(v)> d;
|
|
1641
|
-
|
|
1642
|
-
const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
|
|
1643
|
-
return IfThenElse(mask, Zero(d), v);
|
|
1766
|
+
return Rol(v, Set(d, static_cast<T>(0u - static_cast<unsigned>(bits))));
|
|
1644
1767
|
}
|
|
1645
1768
|
|
|
1646
1769
|
// ------------------------------ IfNegativeThenElse
|
|
@@ -1662,6 +1785,32 @@ HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
|
|
|
1662
1785
|
#endif
|
|
1663
1786
|
}
|
|
1664
1787
|
|
|
1788
|
+
#if HWY_PPC_HAVE_10
|
|
1789
|
+
#ifdef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
|
|
1790
|
+
#undef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
|
|
1791
|
+
#else
|
|
1792
|
+
#define HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
|
|
1793
|
+
#endif
|
|
1794
|
+
|
|
1795
|
+
#ifdef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
|
|
1796
|
+
#undef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
|
|
1797
|
+
#else
|
|
1798
|
+
#define HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
|
|
1799
|
+
#endif
|
|
1800
|
+
|
|
1801
|
+
template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
|
|
1802
|
+
HWY_API V IfNegativeThenElseZero(V v, V yes) {
|
|
1803
|
+
const DFromV<decltype(v)> d;
|
|
1804
|
+
return IfNegativeThenElse(v, yes, Zero(d));
|
|
1805
|
+
}
|
|
1806
|
+
|
|
1807
|
+
template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
|
|
1808
|
+
HWY_API V IfNegativeThenZeroElse(V v, V no) {
|
|
1809
|
+
const DFromV<decltype(v)> d;
|
|
1810
|
+
return IfNegativeThenElse(v, Zero(d), no);
|
|
1811
|
+
}
|
|
1812
|
+
#endif
|
|
1813
|
+
|
|
1665
1814
|
// generic_ops takes care of integer T.
|
|
1666
1815
|
template <typename T, size_t N, HWY_IF_FLOAT(T)>
|
|
1667
1816
|
HWY_API Vec128<T, N> AbsDiff(Vec128<T, N> a, Vec128<T, N> b) {
|
|
@@ -3033,6 +3182,96 @@ HWY_INLINE Vec128<T, N> OddEven(Vec128<T, N> a, Vec128<T, N> b) {
|
|
|
3033
3182
|
return IfVecThenElse(BitCast(d, Vec128<uint8_t, N * 8>{mask}), b, a);
|
|
3034
3183
|
}
|
|
3035
3184
|
|
|
3185
|
+
// ------------------------------ InterleaveEven
|
|
3186
|
+
|
|
3187
|
+
template <class D, HWY_IF_T_SIZE_D(D, 1)>
|
|
3188
|
+
HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
|
|
3189
|
+
const Full128<TFromD<D>> d_full;
|
|
3190
|
+
const Indices128<TFromD<D>> idx{
|
|
3191
|
+
Dup128VecFromValues(Full128<uint8_t>(), 0, 16, 2, 18, 4, 20, 6, 22, 8, 24,
|
|
3192
|
+
10, 26, 12, 28, 14, 30)
|
|
3193
|
+
.raw};
|
|
3194
|
+
return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
|
|
3195
|
+
ResizeBitCast(d_full, b), idx));
|
|
3196
|
+
}
|
|
3197
|
+
|
|
3198
|
+
template <class D, HWY_IF_T_SIZE_D(D, 2)>
|
|
3199
|
+
HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
|
|
3200
|
+
const Full128<TFromD<D>> d_full;
|
|
3201
|
+
const Indices128<TFromD<D>> idx{Dup128VecFromValues(Full128<uint8_t>(), 0, 1,
|
|
3202
|
+
16, 17, 4, 5, 20, 21, 8,
|
|
3203
|
+
9, 24, 25, 12, 13, 28, 29)
|
|
3204
|
+
.raw};
|
|
3205
|
+
return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
|
|
3206
|
+
ResizeBitCast(d_full, b), idx));
|
|
3207
|
+
}
|
|
3208
|
+
|
|
3209
|
+
template <class D, HWY_IF_T_SIZE_D(D, 4)>
|
|
3210
|
+
HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
|
|
3211
|
+
#if HWY_S390X_HAVE_Z14
|
|
3212
|
+
const Full128<TFromD<D>> d_full;
|
|
3213
|
+
const Indices128<TFromD<D>> idx{Dup128VecFromValues(Full128<uint8_t>(), 0, 1,
|
|
3214
|
+
2, 3, 16, 17, 18, 19, 8,
|
|
3215
|
+
9, 10, 11, 24, 25, 26, 27)
|
|
3216
|
+
.raw};
|
|
3217
|
+
return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
|
|
3218
|
+
ResizeBitCast(d_full, b), idx));
|
|
3219
|
+
#else
|
|
3220
|
+
(void)d;
|
|
3221
|
+
return VFromD<D>{vec_mergee(a.raw, b.raw)};
|
|
3222
|
+
#endif
|
|
3223
|
+
}
|
|
3224
|
+
|
|
3225
|
+
template <class D, HWY_IF_T_SIZE_D(D, 8)>
|
|
3226
|
+
HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
|
|
3227
|
+
return InterleaveLower(a, b);
|
|
3228
|
+
}
|
|
3229
|
+
|
|
3230
|
+
// ------------------------------ InterleaveOdd
|
|
3231
|
+
|
|
3232
|
+
template <class D, HWY_IF_T_SIZE_D(D, 1)>
|
|
3233
|
+
HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
|
|
3234
|
+
const Full128<TFromD<D>> d_full;
|
|
3235
|
+
const Indices128<TFromD<D>> idx{
|
|
3236
|
+
Dup128VecFromValues(Full128<uint8_t>(), 1, 17, 3, 19, 5, 21, 7, 23, 9, 25,
|
|
3237
|
+
11, 27, 13, 29, 15, 31)
|
|
3238
|
+
.raw};
|
|
3239
|
+
return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
|
|
3240
|
+
ResizeBitCast(d_full, b), idx));
|
|
3241
|
+
}
|
|
3242
|
+
|
|
3243
|
+
template <class D, HWY_IF_T_SIZE_D(D, 2)>
|
|
3244
|
+
HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
|
|
3245
|
+
const Full128<TFromD<D>> d_full;
|
|
3246
|
+
const Indices128<TFromD<D>> idx{
|
|
3247
|
+
Dup128VecFromValues(Full128<uint8_t>(), 2, 3, 18, 19, 6, 7, 22, 23, 10,
|
|
3248
|
+
11, 26, 27, 14, 15, 30, 31)
|
|
3249
|
+
.raw};
|
|
3250
|
+
return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
|
|
3251
|
+
ResizeBitCast(d_full, b), idx));
|
|
3252
|
+
}
|
|
3253
|
+
|
|
3254
|
+
template <class D, HWY_IF_T_SIZE_D(D, 4)>
|
|
3255
|
+
HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
|
|
3256
|
+
#if HWY_S390X_HAVE_Z14
|
|
3257
|
+
const Full128<TFromD<D>> d_full;
|
|
3258
|
+
const Indices128<TFromD<D>> idx{
|
|
3259
|
+
Dup128VecFromValues(Full128<uint8_t>(), 4, 5, 6, 7, 20, 21, 22, 23, 12,
|
|
3260
|
+
13, 14, 15, 28, 29, 30, 31)
|
|
3261
|
+
.raw};
|
|
3262
|
+
return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
|
|
3263
|
+
ResizeBitCast(d_full, b), idx));
|
|
3264
|
+
#else
|
|
3265
|
+
(void)d;
|
|
3266
|
+
return VFromD<D>{vec_mergeo(a.raw, b.raw)};
|
|
3267
|
+
#endif
|
|
3268
|
+
}
|
|
3269
|
+
|
|
3270
|
+
template <class D, HWY_IF_T_SIZE_D(D, 8)>
|
|
3271
|
+
HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
|
|
3272
|
+
return InterleaveUpper(d, a, b);
|
|
3273
|
+
}
|
|
3274
|
+
|
|
3036
3275
|
// ------------------------------ OddEvenBlocks
|
|
3037
3276
|
template <typename T, size_t N>
|
|
3038
3277
|
HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
|
|
@@ -3144,63 +3383,58 @@ HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, Vec128<T, N> bits) {
|
|
|
3144
3383
|
|
|
3145
3384
|
// ------------------------------ MulEven/Odd 64x64 (UpperHalf)
|
|
3146
3385
|
|
|
3147
|
-
|
|
3386
|
+
template <class T, HWY_IF_UI64(T)>
|
|
3387
|
+
HWY_INLINE Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) {
|
|
3148
3388
|
#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
|
|
3149
|
-
using
|
|
3150
|
-
const
|
|
3389
|
+
using V64 = typename detail::Raw128<T>::type;
|
|
3390
|
+
const V64 mul128_result = reinterpret_cast<V64>(vec_mule(a.raw, b.raw));
|
|
3151
3391
|
#if HWY_IS_LITTLE_ENDIAN
|
|
3152
|
-
return Vec128<
|
|
3392
|
+
return Vec128<T>{mul128_result};
|
|
3153
3393
|
#else
|
|
3154
3394
|
// Need to swap the two halves of mul128_result on big-endian targets as
|
|
3155
3395
|
// the upper 64 bits of the product are in lane 0 of mul128_result and
|
|
3156
3396
|
// the lower 64 bits of the product are in lane 1 of mul128_result
|
|
3157
|
-
return Vec128<
|
|
3397
|
+
return Vec128<T>{vec_sld(mul128_result, mul128_result, 8)};
|
|
3158
3398
|
#endif
|
|
3159
3399
|
#else
|
|
3160
|
-
alignas(16)
|
|
3400
|
+
alignas(16) T mul[2];
|
|
3161
3401
|
mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]);
|
|
3162
|
-
return Load(Full128<
|
|
3402
|
+
return Load(Full128<T>(), mul);
|
|
3163
3403
|
#endif
|
|
3164
3404
|
}
|
|
3165
3405
|
|
|
3166
|
-
|
|
3406
|
+
template <class T, HWY_IF_UI64(T)>
|
|
3407
|
+
HWY_INLINE Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) {
|
|
3167
3408
|
#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
|
|
3168
|
-
using
|
|
3169
|
-
const
|
|
3409
|
+
using V64 = typename detail::Raw128<T>::type;
|
|
3410
|
+
const V64 mul128_result = reinterpret_cast<V64>(vec_mulo(a.raw, b.raw));
|
|
3170
3411
|
#if HWY_IS_LITTLE_ENDIAN
|
|
3171
|
-
return Vec128<
|
|
3412
|
+
return Vec128<T>{mul128_result};
|
|
3172
3413
|
#else
|
|
3173
3414
|
// Need to swap the two halves of mul128_result on big-endian targets as
|
|
3174
3415
|
// the upper 64 bits of the product are in lane 0 of mul128_result and
|
|
3175
3416
|
// the lower 64 bits of the product are in lane 1 of mul128_result
|
|
3176
|
-
return Vec128<
|
|
3417
|
+
return Vec128<T>{vec_sld(mul128_result, mul128_result, 8)};
|
|
3177
3418
|
#endif
|
|
3178
3419
|
#else
|
|
3179
|
-
alignas(16)
|
|
3180
|
-
const Full64<
|
|
3420
|
+
alignas(16) T mul[2];
|
|
3421
|
+
const Full64<T> d2;
|
|
3181
3422
|
mul[0] =
|
|
3182
3423
|
Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]);
|
|
3183
|
-
return Load(Full128<
|
|
3424
|
+
return Load(Full128<T>(), mul);
|
|
3184
3425
|
#endif
|
|
3185
3426
|
}
|
|
3186
3427
|
|
|
3428
|
+
// ------------------------------ PromoteEvenTo/PromoteOddTo
|
|
3429
|
+
#include "hwy/ops/inside-inl.h"
|
|
3430
|
+
|
|
3187
3431
|
// ------------------------------ WidenMulPairwiseAdd
|
|
3188
3432
|
|
|
3189
|
-
template <class
|
|
3190
|
-
class
|
|
3191
|
-
HWY_API VFromD<
|
|
3192
|
-
|
|
3193
|
-
|
|
3194
|
-
// longer-latency lane-crossing PromoteTo. Using shift/and instead of Zip
|
|
3195
|
-
// leads to the odd/even order that RearrangeToOddPlusEven prefers.
|
|
3196
|
-
using VU32 = VFromD<decltype(du32)>;
|
|
3197
|
-
const VU32 odd = Set(du32, 0xFFFF0000u);
|
|
3198
|
-
const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
|
|
3199
|
-
const VU32 ao = And(BitCast(du32, a), odd);
|
|
3200
|
-
const VU32 be = ShiftLeft<16>(BitCast(du32, b));
|
|
3201
|
-
const VU32 bo = And(BitCast(du32, b), odd);
|
|
3202
|
-
return MulAdd(BitCast(df32, ae), BitCast(df32, be),
|
|
3203
|
-
Mul(BitCast(df32, ao), BitCast(df32, bo)));
|
|
3433
|
+
template <class DF, HWY_IF_F32_D(DF),
|
|
3434
|
+
class VBF = VFromD<Repartition<bfloat16_t, DF>>>
|
|
3435
|
+
HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
|
|
3436
|
+
return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b),
|
|
3437
|
+
Mul(PromoteOddTo(df, a), PromoteOddTo(df, b)));
|
|
3204
3438
|
}
|
|
3205
3439
|
|
|
3206
3440
|
// Even if N=1, the input is always at least 2 lanes, hence vec_msum is safe.
|
|
@@ -3217,25 +3451,6 @@ HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 d32, V16 a, V16 b) {
|
|
|
3217
3451
|
|
|
3218
3452
|
// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
|
|
3219
3453
|
|
|
3220
|
-
template <class D32, HWY_IF_F32_D(D32),
|
|
3221
|
-
class V16 = VFromD<Repartition<bfloat16_t, D32>>>
|
|
3222
|
-
HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b,
|
|
3223
|
-
VFromD<D32> sum0,
|
|
3224
|
-
VFromD<D32>& sum1) {
|
|
3225
|
-
const RebindToUnsigned<decltype(df32)> du32;
|
|
3226
|
-
// Lane order within sum0/1 is undefined, hence we can avoid the
|
|
3227
|
-
// longer-latency lane-crossing PromoteTo. Using shift/and instead of Zip
|
|
3228
|
-
// leads to the odd/even order that RearrangeToOddPlusEven prefers.
|
|
3229
|
-
using VU32 = VFromD<decltype(du32)>;
|
|
3230
|
-
const VU32 odd = Set(du32, 0xFFFF0000u);
|
|
3231
|
-
const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
|
|
3232
|
-
const VU32 ao = And(BitCast(du32, a), odd);
|
|
3233
|
-
const VU32 be = ShiftLeft<16>(BitCast(du32, b));
|
|
3234
|
-
const VU32 bo = And(BitCast(du32, b), odd);
|
|
3235
|
-
sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
|
|
3236
|
-
return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
|
|
3237
|
-
}
|
|
3238
|
-
|
|
3239
3454
|
// Even if N=1, the input is always at least 2 lanes, hence vec_msum is safe.
|
|
3240
3455
|
template <class D32, HWY_IF_UI32_D(D32),
|
|
3241
3456
|
class V16 = VFromD<RepartitionToNarrow<D32>>>
|
|
@@ -3267,6 +3482,24 @@ HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) {
|
|
|
3267
3482
|
return Add(sum0, sum1);
|
|
3268
3483
|
}
|
|
3269
3484
|
|
|
3485
|
+
// ------------------------------ SatWidenMulPairwiseAccumulate
|
|
3486
|
+
#if !HWY_S390X_HAVE_Z14
|
|
3487
|
+
|
|
3488
|
+
#ifdef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
|
|
3489
|
+
#undef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
|
|
3490
|
+
#else
|
|
3491
|
+
#define HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
|
|
3492
|
+
#endif
|
|
3493
|
+
|
|
3494
|
+
template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 16)>
|
|
3495
|
+
HWY_API VFromD<DI32> SatWidenMulPairwiseAccumulate(
|
|
3496
|
+
DI32 /* tag */, VFromD<Repartition<int16_t, DI32>> a,
|
|
3497
|
+
VFromD<Repartition<int16_t, DI32>> b, VFromD<DI32> sum) {
|
|
3498
|
+
return VFromD<DI32>{vec_msums(a.raw, b.raw, sum.raw)};
|
|
3499
|
+
}
|
|
3500
|
+
|
|
3501
|
+
#endif // !HWY_S390X_HAVE_Z14
|
|
3502
|
+
|
|
3270
3503
|
// ------------------------------ SumOfMulQuadAccumulate
|
|
3271
3504
|
#if !HWY_S390X_HAVE_Z14
|
|
3272
3505
|
|
|
@@ -3997,29 +4230,43 @@ HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
|
|
|
3997
4230
|
|
|
3998
4231
|
#endif // HWY_PPC_HAVE_9
|
|
3999
4232
|
|
|
4000
|
-
|
|
4001
|
-
HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) {
|
|
4002
|
-
const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
|
|
4003
|
-
const Rebind<uint16_t, decltype(dbf16)> du16;
|
|
4004
|
-
const auto bits_in_32 = ShiftRight<16>(BitCast(du32, v));
|
|
4005
|
-
return BitCast(dbf16, TruncateTo(du16, bits_in_32));
|
|
4006
|
-
}
|
|
4233
|
+
#if HWY_PPC_HAVE_10 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvspbf16)
|
|
4007
4234
|
|
|
4008
|
-
|
|
4009
|
-
|
|
4010
|
-
const RebindToUnsigned<decltype(dbf16)> du16;
|
|
4011
|
-
const Repartition<uint32_t, decltype(dbf16)> du32;
|
|
4012
|
-
#if HWY_IS_LITTLE_ENDIAN
|
|
4013
|
-
const auto a_in_odd = a;
|
|
4014
|
-
const auto b_in_even = ShiftRight<16>(BitCast(du32, b));
|
|
4235
|
+
#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
|
|
4236
|
+
#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
|
|
4015
4237
|
#else
|
|
4016
|
-
|
|
4017
|
-
const auto b_in_even = b;
|
|
4238
|
+
#define HWY_NATIVE_DEMOTE_F32_TO_BF16
|
|
4018
4239
|
#endif
|
|
4019
|
-
|
|
4020
|
-
|
|
4240
|
+
|
|
4241
|
+
namespace detail {
|
|
4242
|
+
|
|
4243
|
+
// VsxXvcvspbf16 converts an F32 vector to a BF16 vector, bitcast to a U32
|
|
4244
|
+
// vector with the resulting BF16 bits in the lower 16 bits of each U32 lane
|
|
4245
|
+
template <class D, HWY_IF_BF16_D(D)>
|
|
4246
|
+
static HWY_INLINE VFromD<Rebind<uint32_t, D>> VsxXvcvspbf16(
|
|
4247
|
+
D dbf16, VFromD<Rebind<float, D>> v) {
|
|
4248
|
+
const Rebind<uint32_t, decltype(dbf16)> du32;
|
|
4249
|
+
const Repartition<uint8_t, decltype(du32)> du32_as_du8;
|
|
4250
|
+
|
|
4251
|
+
using VU32 = __vector unsigned int;
|
|
4252
|
+
|
|
4253
|
+
// Even though the __builtin_vsx_xvcvspbf16 builtin performs a F32 to BF16
|
|
4254
|
+
// conversion, the __builtin_vsx_xvcvspbf16 intrinsic expects a
|
|
4255
|
+
// __vector unsigned char argument (at least as of GCC 13 and Clang 17)
|
|
4256
|
+
return VFromD<Rebind<uint32_t, D>>{reinterpret_cast<VU32>(
|
|
4257
|
+
__builtin_vsx_xvcvspbf16(BitCast(du32_as_du8, v).raw))};
|
|
4258
|
+
}
|
|
4259
|
+
|
|
4260
|
+
} // namespace detail
|
|
4261
|
+
|
|
4262
|
+
template <class D, HWY_IF_BF16_D(D)>
|
|
4263
|
+
HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) {
|
|
4264
|
+
const RebindToUnsigned<decltype(dbf16)> du16;
|
|
4265
|
+
return BitCast(dbf16, TruncateTo(du16, detail::VsxXvcvspbf16(dbf16, v)));
|
|
4021
4266
|
}
|
|
4022
4267
|
|
|
4268
|
+
#endif // HWY_PPC_HAVE_10 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvspbf16)
|
|
4269
|
+
|
|
4023
4270
|
// Specializations for partial vectors because vec_packs sets lanes above 2*N.
|
|
4024
4271
|
template <class DN, typename V, HWY_IF_V_SIZE_LE_D(DN, 4), HWY_IF_SIGNED_D(DN),
|
|
4025
4272
|
HWY_IF_SIGNED_V(V),
|
|
@@ -4111,6 +4358,18 @@ HWY_API VFromD<DN> ReorderDemote2To(DN /*dn*/, V a, V b) {
|
|
|
4111
4358
|
return VFromD<DN>{vec_packs(a.raw, b.raw)};
|
|
4112
4359
|
}
|
|
4113
4360
|
|
|
4361
|
+
#if HWY_PPC_HAVE_10 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvspbf16)
|
|
4362
|
+
template <class D, class V, HWY_IF_BF16_D(D), HWY_IF_F32(TFromV<V>),
|
|
4363
|
+
HWY_IF_LANES_D(D, HWY_MAX_LANES_V(V) * 2)>
|
|
4364
|
+
HWY_API VFromD<D> ReorderDemote2To(D dbf16, V a, V b) {
|
|
4365
|
+
const RebindToUnsigned<decltype(dbf16)> du16;
|
|
4366
|
+
const Half<decltype(dbf16)> dh_bf16;
|
|
4367
|
+
return BitCast(dbf16,
|
|
4368
|
+
OrderedTruncate2To(du16, detail::VsxXvcvspbf16(dh_bf16, a),
|
|
4369
|
+
detail::VsxXvcvspbf16(dh_bf16, b)));
|
|
4370
|
+
}
|
|
4371
|
+
#endif
|
|
4372
|
+
|
|
4114
4373
|
template <class D, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>), class V,
|
|
4115
4374
|
HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
|
|
4116
4375
|
HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
|
|
@@ -4119,15 +4378,13 @@ HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
|
|
|
4119
4378
|
return ReorderDemote2To(d, a, b);
|
|
4120
4379
|
}
|
|
4121
4380
|
|
|
4122
|
-
|
|
4123
|
-
|
|
4124
|
-
|
|
4125
|
-
|
|
4126
|
-
return
|
|
4127
|
-
#else
|
|
4128
|
-
return BitCast(dbf16, ConcatEven(du16, BitCast(du16, b), BitCast(du16, a)));
|
|
4129
|
-
#endif
|
|
4381
|
+
#if HWY_PPC_HAVE_10 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvspbf16)
|
|
4382
|
+
template <class D, HWY_IF_BF16_D(D), class V, HWY_IF_F32(TFromV<V>),
|
|
4383
|
+
HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)>
|
|
4384
|
+
HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
|
|
4385
|
+
return ReorderDemote2To(d, a, b);
|
|
4130
4386
|
}
|
|
4387
|
+
#endif
|
|
4131
4388
|
|
|
4132
4389
|
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
|
|
4133
4390
|
HWY_API Vec32<float> DemoteTo(D /* tag */, Vec64<double> v) {
|
|
@@ -4938,7 +5195,7 @@ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
|
|
|
4938
5195
|
|
|
4939
5196
|
namespace detail {
|
|
4940
5197
|
|
|
4941
|
-
#if !
|
|
5198
|
+
#if !HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN
|
|
4942
5199
|
// fallback for missing vec_extractm
|
|
4943
5200
|
template <size_t N>
|
|
4944
5201
|
HWY_INLINE uint64_t ExtractSignBits(Vec128<uint8_t, N> sign_bits,
|
|
@@ -4946,42 +5203,18 @@ HWY_INLINE uint64_t ExtractSignBits(Vec128<uint8_t, N> sign_bits,
|
|
|
4946
5203
|
// clang POWER8 and 9 targets appear to differ in their return type of
|
|
4947
5204
|
// vec_vbpermq: unsigned or signed, so cast to avoid a warning.
|
|
4948
5205
|
using VU64 = detail::Raw128<uint64_t>::type;
|
|
5206
|
+
#if HWY_S390X_HAVE_Z14
|
|
5207
|
+
const Vec128<uint64_t> extracted{
|
|
5208
|
+
reinterpret_cast<VU64>(vec_bperm_u128(sign_bits.raw, bit_shuffle))};
|
|
5209
|
+
#else
|
|
4949
5210
|
const Vec128<uint64_t> extracted{
|
|
4950
5211
|
reinterpret_cast<VU64>(vec_vbpermq(sign_bits.raw, bit_shuffle))};
|
|
5212
|
+
#endif
|
|
4951
5213
|
return extracted.raw[HWY_IS_LITTLE_ENDIAN];
|
|
4952
5214
|
}
|
|
4953
5215
|
|
|
4954
|
-
#endif // !
|
|
4955
|
-
|
|
4956
|
-
#if HWY_S390X_HAVE_Z14
|
|
4957
|
-
template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
|
|
4958
|
-
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128<T, N> mask) {
|
|
4959
|
-
const DFromM<decltype(mask)> d;
|
|
4960
|
-
const Repartition<uint8_t, decltype(d)> du8;
|
|
4961
|
-
const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
|
|
4962
|
-
|
|
4963
|
-
return ReduceSum(
|
|
4964
|
-
du8, And(sign_bits, Dup128VecFromValues(du8, 1, 2, 4, 8, 16, 32, 64, 128,
|
|
4965
|
-
1, 2, 4, 8, 16, 32, 64, 128)));
|
|
4966
|
-
}
|
|
5216
|
+
#endif // !HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN
|
|
4967
5217
|
|
|
4968
|
-
template <typename T>
|
|
4969
|
-
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128<T> mask) {
|
|
4970
|
-
const DFromM<decltype(mask)> d;
|
|
4971
|
-
const Repartition<uint8_t, decltype(d)> du8;
|
|
4972
|
-
const Repartition<uint64_t, decltype(d)> du64;
|
|
4973
|
-
const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
|
|
4974
|
-
|
|
4975
|
-
const auto mask_bytes = SumsOf8(
|
|
4976
|
-
And(sign_bits, Dup128VecFromValues(du8, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2,
|
|
4977
|
-
4, 8, 16, 32, 64, 128)));
|
|
4978
|
-
|
|
4979
|
-
const Rebind<uint8_t, decltype(du64)> du8_2;
|
|
4980
|
-
const Repartition<uint16_t, decltype(du8_2)> du16_1;
|
|
4981
|
-
return GetLane(
|
|
4982
|
-
BitCast(du16_1, TruncateTo(du8_2, Reverse2(du64, mask_bytes))));
|
|
4983
|
-
}
|
|
4984
|
-
#else
|
|
4985
5218
|
template <typename T, size_t N>
|
|
4986
5219
|
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128<T, N> mask) {
|
|
4987
5220
|
const DFromM<decltype(mask)> d;
|
|
@@ -4990,30 +5223,24 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128<T, N> mask) {
|
|
|
4990
5223
|
|
|
4991
5224
|
#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
|
|
4992
5225
|
return static_cast<uint64_t>(vec_extractm(sign_bits.raw));
|
|
4993
|
-
#else // PPC8, PPC9, or big-endian PPC10
|
|
5226
|
+
#else // Z14, Z15, PPC8, PPC9, or big-endian PPC10
|
|
4994
5227
|
const __vector unsigned char kBitShuffle = {120, 112, 104, 96, 88, 80, 72, 64,
|
|
4995
5228
|
56, 48, 40, 32, 24, 16, 8, 0};
|
|
4996
5229
|
return ExtractSignBits(sign_bits, kBitShuffle);
|
|
4997
5230
|
#endif // HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
|
|
4998
5231
|
}
|
|
4999
|
-
#endif // HWY_S390X_HAVE_Z14
|
|
5000
5232
|
|
|
5001
5233
|
template <typename T, size_t N>
|
|
5002
5234
|
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128<T, N> mask) {
|
|
5003
5235
|
const DFromM<decltype(mask)> d;
|
|
5004
5236
|
const RebindToUnsigned<decltype(d)> du;
|
|
5005
5237
|
|
|
5006
|
-
#if HWY_S390X_HAVE_Z14
|
|
5007
|
-
const VFromD<decltype(du)> sign_bits = BitCast(du, VecFromMask(d, mask));
|
|
5008
|
-
return ReduceSum(
|
|
5009
|
-
du, And(sign_bits, Dup128VecFromValues(du, 1, 2, 4, 8, 16, 32, 64, 128)));
|
|
5010
|
-
#else // VSX
|
|
5011
5238
|
const Repartition<uint8_t, decltype(d)> du8;
|
|
5012
5239
|
const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
|
|
5013
5240
|
|
|
5014
5241
|
#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
|
|
5015
5242
|
return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw));
|
|
5016
|
-
#else // PPC8, PPC9, or big-endian PPC10
|
|
5243
|
+
#else // Z14, Z15, PPC8, PPC9, or big-endian PPC10
|
|
5017
5244
|
(void)du;
|
|
5018
5245
|
#if HWY_IS_LITTLE_ENDIAN
|
|
5019
5246
|
const __vector unsigned char kBitShuffle = {
|
|
@@ -5024,7 +5251,6 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128<T, N> mask) {
|
|
|
5024
5251
|
#endif
|
|
5025
5252
|
return ExtractSignBits(sign_bits, kBitShuffle);
|
|
5026
5253
|
#endif // HWY_PPC_HAVE_10
|
|
5027
|
-
#endif // HWY_S390X_HAVE_Z14
|
|
5028
5254
|
}
|
|
5029
5255
|
|
|
5030
5256
|
template <typename T, size_t N>
|
|
@@ -5032,16 +5258,12 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128<T, N> mask) {
|
|
|
5032
5258
|
const DFromM<decltype(mask)> d;
|
|
5033
5259
|
const RebindToUnsigned<decltype(d)> du;
|
|
5034
5260
|
|
|
5035
|
-
#if HWY_S390X_HAVE_Z14
|
|
5036
|
-
const VFromD<decltype(du)> sign_bits = BitCast(du, VecFromMask(d, mask));
|
|
5037
|
-
return ReduceSum(du, And(sign_bits, Dup128VecFromValues(du, 1, 2, 4, 8)));
|
|
5038
|
-
#else // VSX
|
|
5039
5261
|
const Repartition<uint8_t, decltype(d)> du8;
|
|
5040
5262
|
const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
|
|
5041
5263
|
|
|
5042
5264
|
#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
|
|
5043
5265
|
return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw));
|
|
5044
|
-
#else // PPC8, PPC9, or big-endian PPC10
|
|
5266
|
+
#else // Z14, Z15, PPC8, PPC9, or big-endian PPC10
|
|
5045
5267
|
(void)du;
|
|
5046
5268
|
#if HWY_IS_LITTLE_ENDIAN
|
|
5047
5269
|
const __vector unsigned char kBitShuffle = {96, 64, 32, 0, 128, 128,
|
|
@@ -5054,7 +5276,6 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128<T, N> mask) {
|
|
|
5054
5276
|
#endif
|
|
5055
5277
|
return ExtractSignBits(sign_bits, kBitShuffle);
|
|
5056
5278
|
#endif // HWY_PPC_HAVE_10
|
|
5057
|
-
#endif // HWY_S390X_HAVE_Z14
|
|
5058
5279
|
}
|
|
5059
5280
|
|
|
5060
5281
|
template <typename T, size_t N>
|
|
@@ -5062,16 +5283,12 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128<T, N> mask) {
|
|
|
5062
5283
|
const DFromM<decltype(mask)> d;
|
|
5063
5284
|
const RebindToUnsigned<decltype(d)> du;
|
|
5064
5285
|
|
|
5065
|
-
#if HWY_S390X_HAVE_Z14
|
|
5066
|
-
const VFromD<decltype(du)> sign_bits = BitCast(du, VecFromMask(d, mask));
|
|
5067
|
-
return ReduceSum(du, And(sign_bits, Dup128VecFromValues(du, 1, 2)));
|
|
5068
|
-
#else // VSX
|
|
5069
5286
|
const Repartition<uint8_t, decltype(d)> du8;
|
|
5070
5287
|
const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
|
|
5071
5288
|
|
|
5072
5289
|
#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
|
|
5073
5290
|
return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw));
|
|
5074
|
-
#else
|
|
5291
|
+
#else // Z14, Z15, PPC8, PPC9, or big-endian PPC10
|
|
5075
5292
|
(void)du;
|
|
5076
5293
|
#if HWY_IS_LITTLE_ENDIAN
|
|
5077
5294
|
const __vector unsigned char kBitShuffle = {64, 0, 128, 128, 128, 128,
|
|
@@ -5084,7 +5301,6 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128<T, N> mask) {
|
|
|
5084
5301
|
#endif
|
|
5085
5302
|
return ExtractSignBits(sign_bits, kBitShuffle);
|
|
5086
5303
|
#endif // HWY_PPC_HAVE_10
|
|
5087
|
-
#endif // HWY_S390X_HAVE_Z14
|
|
5088
5304
|
}
|
|
5089
5305
|
|
|
5090
5306
|
// Returns the lowest N of the mask bits.
|
|
@@ -6704,6 +6920,80 @@ HWY_API TFromD<D> ReduceSum(D /*d*/, VFromD<D> v) {
|
|
|
6704
6920
|
return static_cast<TFromD<D>>(GetLane(SumsOf4(v)));
|
|
6705
6921
|
}
|
|
6706
6922
|
|
|
6923
|
+
// ------------------------------ BitShuffle
|
|
6924
|
+
|
|
6925
|
+
#ifdef HWY_NATIVE_BITSHUFFLE
|
|
6926
|
+
#undef HWY_NATIVE_BITSHUFFLE
|
|
6927
|
+
#else
|
|
6928
|
+
#define HWY_NATIVE_BITSHUFFLE
|
|
6929
|
+
#endif
|
|
6930
|
+
|
|
6931
|
+
template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>),
|
|
6932
|
+
HWY_IF_V_SIZE_V(VI, HWY_MAX_LANES_V(V) * 8)>
|
|
6933
|
+
HWY_API V BitShuffle(V v, VI idx) {
|
|
6934
|
+
const DFromV<decltype(v)> d64;
|
|
6935
|
+
const RebindToUnsigned<decltype(d64)> du64;
|
|
6936
|
+
const Repartition<uint8_t, decltype(d64)> du8;
|
|
6937
|
+
|
|
6938
|
+
const Full128<TFromD<decltype(du64)>> d_full_u64;
|
|
6939
|
+
const Full128<TFromD<decltype(du8)>> d_full_u8;
|
|
6940
|
+
|
|
6941
|
+
using RawVU64 = __vector unsigned long long;
|
|
6942
|
+
|
|
6943
|
+
#if HWY_PPC_HAVE_9
|
|
6944
|
+
|
|
6945
|
+
#if HWY_IS_LITTLE_ENDIAN
|
|
6946
|
+
(void)d_full_u64;
|
|
6947
|
+
auto bit_idx = ResizeBitCast(d_full_u8, idx);
|
|
6948
|
+
#else
|
|
6949
|
+
auto bit_idx =
|
|
6950
|
+
BitCast(d_full_u8, ReverseLaneBytes(ResizeBitCast(d_full_u64, idx)));
|
|
6951
|
+
#endif
|
|
6952
|
+
|
|
6953
|
+
bit_idx = Xor(bit_idx, Set(d_full_u8, uint8_t{0x3F}));
|
|
6954
|
+
|
|
6955
|
+
return BitCast(d64, VFromD<decltype(du64)>{reinterpret_cast<RawVU64>(
|
|
6956
|
+
vec_bperm(BitCast(du64, v).raw, bit_idx.raw))});
|
|
6957
|
+
#else // !HWY_PPC_HAVE_9
|
|
6958
|
+
|
|
6959
|
+
#if HWY_IS_LITTLE_ENDIAN
|
|
6960
|
+
const auto bit_idx_xor_mask = BitCast(
|
|
6961
|
+
d_full_u8, Dup128VecFromValues(d_full_u64, uint64_t{0x7F7F7F7F7F7F7F7Fu},
|
|
6962
|
+
uint64_t{0x3F3F3F3F3F3F3F3Fu}));
|
|
6963
|
+
const auto bit_idx = Xor(ResizeBitCast(d_full_u8, idx), bit_idx_xor_mask);
|
|
6964
|
+
constexpr int kBitShufResultByteShrAmt = 8;
|
|
6965
|
+
#else
|
|
6966
|
+
const auto bit_idx_xor_mask = BitCast(
|
|
6967
|
+
d_full_u8, Dup128VecFromValues(d_full_u64, uint64_t{0x3F3F3F3F3F3F3F3Fu},
|
|
6968
|
+
uint64_t{0x7F7F7F7F7F7F7F7Fu}));
|
|
6969
|
+
const auto bit_idx =
|
|
6970
|
+
Xor(BitCast(d_full_u8, ReverseLaneBytes(ResizeBitCast(d_full_u64, idx))),
|
|
6971
|
+
bit_idx_xor_mask);
|
|
6972
|
+
constexpr int kBitShufResultByteShrAmt = 6;
|
|
6973
|
+
#endif
|
|
6974
|
+
|
|
6975
|
+
#if HWY_S390X_HAVE_Z14
|
|
6976
|
+
const VFromD<decltype(d_full_u64)> bit_shuf_result{reinterpret_cast<RawVU64>(
|
|
6977
|
+
vec_bperm_u128(BitCast(du8, v).raw, bit_idx.raw))};
|
|
6978
|
+
#elif defined(__SIZEOF_INT128__)
|
|
6979
|
+
using RawVU128 = __vector unsigned __int128;
|
|
6980
|
+
const VFromD<decltype(d_full_u64)> bit_shuf_result{reinterpret_cast<RawVU64>(
|
|
6981
|
+
vec_vbpermq(reinterpret_cast<RawVU128>(v.raw), bit_idx.raw))};
|
|
6982
|
+
#else
|
|
6983
|
+
using RawVU128 = __vector unsigned char;
|
|
6984
|
+
const VFromD<decltype(d_full_u64)> bit_shuf_result{reinterpret_cast<RawVU64>(
|
|
6985
|
+
vec_vbpermq(reinterpret_cast<RawVU128>(v.raw), bit_idx.raw))};
|
|
6986
|
+
#endif
|
|
6987
|
+
|
|
6988
|
+
return ResizeBitCast(
|
|
6989
|
+
d64, PromoteTo(d_full_u64,
|
|
6990
|
+
ResizeBitCast(
|
|
6991
|
+
Rebind<uint8_t, decltype(d_full_u64)>(),
|
|
6992
|
+
CombineShiftRightBytes<kBitShufResultByteShrAmt>(
|
|
6993
|
+
d_full_u64, bit_shuf_result, bit_shuf_result))));
|
|
6994
|
+
#endif // HWY_PPC_HAVE_9
|
|
6995
|
+
}
|
|
6996
|
+
|
|
6707
6997
|
// ------------------------------ Lt128
|
|
6708
6998
|
|
|
6709
6999
|
namespace detail {
|