@img/sharp-libvips-dev 1.2.0 → 1.2.2-rc.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/include/ffi.h +3 -3
- package/include/harfbuzz/hb-deprecated.h +4 -4
- package/include/harfbuzz/hb-font.h +120 -9
- package/include/harfbuzz/hb-version.h +3 -3
- package/include/hwy/abort.h +2 -19
- package/include/hwy/aligned_allocator.h +11 -7
- package/include/hwy/auto_tune.h +504 -0
- package/include/hwy/base.h +425 -104
- package/include/hwy/cache_control.h +16 -0
- package/include/hwy/detect_compiler_arch.h +32 -1
- package/include/hwy/detect_targets.h +251 -67
- package/include/hwy/foreach_target.h +35 -0
- package/include/hwy/highway.h +185 -76
- package/include/hwy/nanobenchmark.h +1 -19
- package/include/hwy/ops/arm_neon-inl.h +969 -458
- package/include/hwy/ops/arm_sve-inl.h +1137 -359
- package/include/hwy/ops/emu128-inl.h +97 -11
- package/include/hwy/ops/generic_ops-inl.h +1222 -34
- package/include/hwy/ops/loongarch_lasx-inl.h +4664 -0
- package/include/hwy/ops/loongarch_lsx-inl.h +5933 -0
- package/include/hwy/ops/ppc_vsx-inl.h +306 -126
- package/include/hwy/ops/rvv-inl.h +546 -51
- package/include/hwy/ops/scalar-inl.h +77 -22
- package/include/hwy/ops/set_macros-inl.h +138 -17
- package/include/hwy/ops/shared-inl.h +50 -10
- package/include/hwy/ops/wasm_128-inl.h +137 -92
- package/include/hwy/ops/x86_128-inl.h +773 -214
- package/include/hwy/ops/x86_256-inl.h +712 -255
- package/include/hwy/ops/x86_512-inl.h +429 -753
- package/include/hwy/ops/x86_avx3-inl.h +501 -0
- package/include/hwy/per_target.h +2 -1
- package/include/hwy/profiler.h +622 -486
- package/include/hwy/targets.h +62 -20
- package/include/hwy/timer-inl.h +8 -160
- package/include/hwy/timer.h +170 -3
- package/include/hwy/x86_cpuid.h +81 -0
- package/include/libheif/heif_cxx.h +25 -5
- package/include/libheif/heif_regions.h +5 -5
- package/include/libheif/heif_version.h +2 -2
- package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
- package/include/libxml2/libxml/xmlversion.h +4 -4
- package/include/pango-1.0/pango/pango-enum-types.h +3 -0
- package/include/pango-1.0/pango/pango-features.h +3 -3
- package/include/pango-1.0/pango/pango-font.h +30 -0
- package/include/pango-1.0/pango/pango-version-macros.h +26 -0
- package/include/pixman-1/pixman-version.h +2 -2
- package/include/webp/decode.h +11 -2
- package/include/webp/demux.h +2 -0
- package/include/webp/encode.h +2 -0
- package/include/webp/mux_types.h +1 -0
- package/include/webp/sharpyuv/sharpyuv.h +1 -1
- package/include/webp/types.h +2 -2
- package/include/zlib.h +3 -3
- package/package.json +1 -1
- package/versions.json +11 -11
package/include/hwy/ops/x86_512-inl.h

@@ -172,6 +172,10 @@ class Vec512 {
 template <typename T>
 struct Mask512 {
   using Raw = typename detail::RawMask512<sizeof(T)>::type;
+
+  using PrivateT = T;  // only for DFromM
+  static constexpr size_t kPrivateN = 64 / sizeof(T);  // only for DFromM
+
   Raw raw;
 };
 
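The two new Mask512 members exist so that a DFromM-style alias can recover a descriptor from a mask type; kPrivateN is simply the 512-bit lane count for T. A minimal standalone check of that arithmetic (illustrative sketch, not part of the package):

// Assumption for illustration: 64 bytes per AVX-512 vector divided by the
// element size gives the lane count stored in kPrivateN.
static_assert(64 / sizeof(float) == 16, "Mask512<float> covers 16 lanes");
static_assert(64 / sizeof(double) == 8, "Mask512<double> covers 8 lanes");
static_assert(64 / sizeof(unsigned char) == 64, "Mask512<uint8_t> covers 64 lanes");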
@@ -1338,20 +1342,7 @@ HWY_API Vec512<int64_t> ShiftLeft(const Vec512<int64_t> v) {
   return Vec512<int64_t>{_mm512_slli_epi64(v.raw, kBits)};
 }
 
-#if HWY_TARGET <= HWY_AVX3_DL
-
-// Generic for all vector lengths. Must be defined after all GaloisAffine.
-template <int kBits, class V, HWY_IF_T_SIZE_V(V, 1)>
-HWY_API V ShiftLeft(const V v) {
-  const Repartition<uint64_t, DFromV<V>> du64;
-  if (kBits == 0) return v;
-  if (kBits == 1) return v + v;
-  constexpr uint64_t kMatrix = (0x0102040810204080ULL >> kBits) &
-                               (0x0101010101010101ULL * (0xFF >> kBits));
-  return detail::GaloisAffine(v, Set(du64, kMatrix));
-}
-
-#else  // HWY_TARGET > HWY_AVX3_DL
+#if HWY_TARGET > HWY_AVX3_DL
 
 template <int kBits, typename T, HWY_IF_T_SIZE(T, 1)>
 HWY_API Vec512<T> ShiftLeft(const Vec512<T> v) {
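The removed block implemented 8-bit ShiftLeft through a GF(2) affine transform (GFNI); judging by the new x86_avx3-inl.h header in the file list, those variants presumably moved there, though this diff alone does not show it. Either way, the per-lane result must match this scalar reference (illustrative sketch, not Highway code):

#include <cstdint>

template <int kBits>
constexpr uint8_t ShiftLeftByteRef(uint8_t x) {
  static_assert(0 <= kBits && kBits < 8, "shift count must be 0..7");
  // Bits shifted past bit 7 are discarded; the lane stays 8 bits wide.
  return static_cast<uint8_t>((x << kBits) & 0xFF);
}

static_assert(ShiftLeftByteRef<1>(0x81) == 0x02, "bit 7 is discarded");
static_assert(ShiftLeftByteRef<0>(0x5A) == 0x5A, "shift by 0 is the identity");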
@@ -1397,33 +1388,7 @@ HWY_API Vec512<int64_t> ShiftRight(const Vec512<int64_t> v) {
   return Vec512<int64_t>{_mm512_srai_epi64(v.raw, kBits)};
 }
 
-#if HWY_TARGET <= HWY_AVX3_DL
-
-// Generic for all vector lengths. Must be defined after all GaloisAffine.
-template <int kBits, class V, HWY_IF_U8_D(DFromV<V>)>
-HWY_API V ShiftRight(const V v) {
-  const Repartition<uint64_t, DFromV<V>> du64;
-  if (kBits == 0) return v;
-  constexpr uint64_t kMatrix =
-      (0x0102040810204080ULL << kBits) &
-      (0x0101010101010101ULL * ((0xFF << kBits) & 0xFF));
-  return detail::GaloisAffine(v, Set(du64, kMatrix));
-}
-
-// Generic for all vector lengths. Must be defined after all GaloisAffine.
-template <int kBits, class V, HWY_IF_I8_D(DFromV<V>)>
-HWY_API V ShiftRight(const V v) {
-  const Repartition<uint64_t, DFromV<V>> du64;
-  if (kBits == 0) return v;
-  constexpr uint64_t kShift =
-      (0x0102040810204080ULL << kBits) &
-      (0x0101010101010101ULL * ((0xFF << kBits) & 0xFF));
-  constexpr uint64_t kSign =
-      kBits == 0 ? 0 : (0x8080808080808080ULL >> (64 - (8 * kBits)));
-  return detail::GaloisAffine(v, Set(du64, kShift | kSign));
-}
-
-#else  // HWY_TARGET > HWY_AVX3_DL
+#if HWY_TARGET > HWY_AVX3_DL
 
 template <int kBits>
 HWY_API Vec512<uint8_t> ShiftRight(const Vec512<uint8_t> v) {
@@ -1446,26 +1411,7 @@ HWY_API Vec512<int8_t> ShiftRight(const Vec512<int8_t> v) {
 
 // ------------------------------ RotateRight
 
-#if HWY_TARGET <= HWY_AVX3_DL
-// U8 RotateRight is generic for all vector lengths on AVX3_DL
-template <int kBits, class V, HWY_IF_U8(TFromV<V>)>
-HWY_API V RotateRight(V v) {
-  static_assert(0 <= kBits && kBits < 8, "Invalid shift count");
-
-  const Repartition<uint64_t, DFromV<V>> du64;
-  if (kBits == 0) return v;
-
-  constexpr uint64_t kShrMatrix =
-      (0x0102040810204080ULL << kBits) &
-      (0x0101010101010101ULL * ((0xFF << kBits) & 0xFF));
-  constexpr int kShlBits = (-kBits) & 7;
-  constexpr uint64_t kShlMatrix = (0x0102040810204080ULL >> kShlBits) &
-                                  (0x0101010101010101ULL * (0xFF >> kShlBits));
-  constexpr uint64_t kMatrix = kShrMatrix | kShlMatrix;
-
-  return detail::GaloisAffine(v, Set(du64, kMatrix));
-}
-#else  // HWY_TARGET > HWY_AVX3_DL
+#if HWY_TARGET > HWY_AVX3_DL
 template <int kBits>
 HWY_API Vec512<uint8_t> RotateRight(const Vec512<uint8_t> v) {
   static_assert(0 <= kBits && kBits < 8, "Invalid shift count");
@@ -1473,7 +1419,7 @@ HWY_API Vec512<uint8_t> RotateRight(const Vec512<uint8_t> v) {
   // AVX3 does not support 8-bit.
   return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(7, 8 - kBits)>(v));
 }
-#endif  // HWY_TARGET <= HWY_AVX3_DL
+#endif  // HWY_TARGET > HWY_AVX3_DL
 
 template <int kBits>
 HWY_API Vec512<uint16_t> RotateRight(const Vec512<uint16_t> v) {
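Without a native 8-bit rotate, the retained fallback composes two shifts. A scalar sketch of the intended per-lane result (illustrative, not Highway code):

#include <cstdint>

template <int kBits>
constexpr uint8_t RotateRightByteRef(uint8_t x) {
  static_assert(0 <= kBits && kBits < 8, "rotate count must be 0..7");
  // (8 - kBits) & 7 keeps the left-shift count in 0..7 when kBits == 0.
  return static_cast<uint8_t>(((x >> kBits) | (x << ((8 - kBits) & 7))) & 0xFF);
}

static_assert(RotateRightByteRef<1>(0x01) == 0x80, "bit 0 wraps to bit 7");
static_assert(RotateRightByteRef<0>(0xAB) == 0xAB, "rotate by 0 is the identity");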
@@ -1532,7 +1478,11 @@ HWY_API Vec512<T> Ror(Vec512<T> a, Vec512<T> b) {
 // ------------------------------ ShiftLeftSame
 
 // GCC <14 and Clang <11 do not follow the Intel documentation for AVX-512
-// shift-with-immediate: the counts should all be unsigned int.
+// shift-with-immediate: the counts should all be unsigned int. Despite casting,
+// we still see warnings in GCC debug builds, hence disable.
+HWY_DIAGNOSTICS(push)
+HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
+
 #if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100
 using Shift16Count = int;
 using Shift3264Count = int;
@@ -1696,6 +1646,8 @@ HWY_API Vec512<int8_t> ShiftRightSame(Vec512<int8_t> v, const int bits) {
   return (shifted ^ shifted_sign) - shifted_sign;
 }
 
+HWY_DIAGNOSTICS(pop)
+
 // ------------------------------ Minimum
 
 // Unsigned
@@ -1782,15 +1734,70 @@ HWY_API Vec512<double> Max(Vec512<double> a, Vec512<double> b) {
   return Vec512<double>{_mm512_max_pd(a.raw, b.raw)};
 }
 
-// ------------------------------ Integer multiplication
+// ------------------------------ MinNumber and MaxNumber
+
+#if HWY_X86_HAVE_AVX10_2_OPS
+
+#if HWY_HAVE_FLOAT16
+HWY_API Vec512<float16_t> MinNumber(Vec512<float16_t> a, Vec512<float16_t> b) {
+  return Vec512<float16_t>{_mm512_minmax_ph(a.raw, b.raw, 0x14)};
+}
+#endif
+HWY_API Vec512<float> MinNumber(Vec512<float> a, Vec512<float> b) {
+  return Vec512<float>{_mm512_minmax_ps(a.raw, b.raw, 0x14)};
+}
+HWY_API Vec512<double> MinNumber(Vec512<double> a, Vec512<double> b) {
+  return Vec512<double>{_mm512_minmax_pd(a.raw, b.raw, 0x14)};
+}
+
+#if HWY_HAVE_FLOAT16
+HWY_API Vec512<float16_t> MaxNumber(Vec512<float16_t> a, Vec512<float16_t> b) {
+  return Vec512<float16_t>{_mm512_minmax_ph(a.raw, b.raw, 0x15)};
+}
+#endif
+HWY_API Vec512<float> MaxNumber(Vec512<float> a, Vec512<float> b) {
+  return Vec512<float>{_mm512_minmax_ps(a.raw, b.raw, 0x15)};
+}
+HWY_API Vec512<double> MaxNumber(Vec512<double> a, Vec512<double> b) {
+  return Vec512<double>{_mm512_minmax_pd(a.raw, b.raw, 0x15)};
+}
+
+#endif
+
+// ------------------------------ MinMagnitude and MaxMagnitude
+
+#if HWY_X86_HAVE_AVX10_2_OPS
+
+#if HWY_HAVE_FLOAT16
+HWY_API Vec512<float16_t> MinMagnitude(Vec512<float16_t> a,
+                                       Vec512<float16_t> b) {
+  return Vec512<float16_t>{_mm512_minmax_ph(a.raw, b.raw, 0x16)};
+}
+#endif
+HWY_API Vec512<float> MinMagnitude(Vec512<float> a, Vec512<float> b) {
+  return Vec512<float>{_mm512_minmax_ps(a.raw, b.raw, 0x16)};
+}
+HWY_API Vec512<double> MinMagnitude(Vec512<double> a, Vec512<double> b) {
+  return Vec512<double>{_mm512_minmax_pd(a.raw, b.raw, 0x16)};
+}
+
+#if HWY_HAVE_FLOAT16
+HWY_API Vec512<float16_t> MaxMagnitude(Vec512<float16_t> a,
+                                       Vec512<float16_t> b) {
+  return Vec512<float16_t>{_mm512_minmax_ph(a.raw, b.raw, 0x17)};
+}
+#endif
+HWY_API Vec512<float> MaxMagnitude(Vec512<float> a, Vec512<float> b) {
+  return Vec512<float>{_mm512_minmax_ps(a.raw, b.raw, 0x17)};
+}
+HWY_API Vec512<double> MaxMagnitude(Vec512<double> a, Vec512<double> b) {
+  return Vec512<double>{_mm512_minmax_pd(a.raw, b.raw, 0x17)};
+}
 
-// Per-target flag to prevent generic_ops-inl.h from defining 64-bit operator*.
-#ifdef HWY_NATIVE_MUL_64
-#undef HWY_NATIVE_MUL_64
-#else
-#define HWY_NATIVE_MUL_64
 #endif
 
+// ------------------------------ Integer multiplication
+
 // Unsigned
 HWY_API Vec512<uint16_t> operator*(Vec512<uint16_t> a, Vec512<uint16_t> b) {
   return Vec512<uint16_t>{_mm512_mullo_epi16(a.raw, b.raw)};
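The 0x14/0x15/0x16/0x17 immediates passed to _mm512_minmax_* select the four IEEE-754-2019 reductions. A scalar sketch of the intended semantics for two of them, assuming the usual minimumNumber/maximumMagnitude definitions (signaling-NaN and signed-zero details omitted); this is an illustration, not the library's definition:

#include <cmath>

static float MinNumberRef(float a, float b) {
  if (std::isnan(a)) return b;  // a NaN operand loses to a number
  if (std::isnan(b)) return a;
  return a < b ? a : b;
}

static float MaxMagnitudeRef(float a, float b) {
  if (std::isnan(a)) return b;
  if (std::isnan(b)) return a;
  const float aa = std::fabs(a), ab = std::fabs(b);
  if (aa != ab) return aa > ab ? a : b;
  return a > b ? a : b;  // equal magnitudes: fall back to the larger value
}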
@@ -1801,14 +1808,6 @@ HWY_API Vec512<uint32_t> operator*(Vec512<uint32_t> a, Vec512<uint32_t> b) {
 HWY_API Vec512<uint64_t> operator*(Vec512<uint64_t> a, Vec512<uint64_t> b) {
   return Vec512<uint64_t>{_mm512_mullo_epi64(a.raw, b.raw)};
 }
-HWY_API Vec256<uint64_t> operator*(Vec256<uint64_t> a, Vec256<uint64_t> b) {
-  return Vec256<uint64_t>{_mm256_mullo_epi64(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<uint64_t, N> operator*(Vec128<uint64_t, N> a,
-                                      Vec128<uint64_t, N> b) {
-  return Vec128<uint64_t, N>{_mm_mullo_epi64(a.raw, b.raw)};
-}
 
 // Signed
 HWY_API Vec512<int16_t> operator*(Vec512<int16_t> a, Vec512<int16_t> b) {
@@ -1820,14 +1819,7 @@ HWY_API Vec512<int32_t> operator*(Vec512<int32_t> a, Vec512<int32_t> b) {
 HWY_API Vec512<int64_t> operator*(Vec512<int64_t> a, Vec512<int64_t> b) {
   return Vec512<int64_t>{_mm512_mullo_epi64(a.raw, b.raw)};
 }
-HWY_API Vec256<int64_t> operator*(Vec256<int64_t> a, Vec256<int64_t> b) {
-  return Vec256<int64_t>{_mm256_mullo_epi64(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<int64_t, N> operator*(Vec128<int64_t, N> a,
-                                     Vec128<int64_t, N> b) {
-  return Vec128<int64_t, N>{_mm_mullo_epi64(a.raw, b.raw)};
-}
+
 // Returns the upper 16 bits of a * b in each lane.
 HWY_API Vec512<uint16_t> MulHigh(Vec512<uint16_t> a, Vec512<uint16_t> b) {
   return Vec512<uint16_t>{_mm512_mulhi_epu16(a.raw, b.raw)};
@@ -1877,6 +1869,21 @@ HWY_API Vec512<double> operator*(Vec512<double> a, Vec512<double> b) {
   return Vec512<double>{_mm512_mul_pd(a.raw, b.raw)};
 }
 
+#if HWY_HAVE_FLOAT16
+HWY_API Vec512<float16_t> MulByFloorPow2(Vec512<float16_t> a,
+                                         Vec512<float16_t> b) {
+  return Vec512<float16_t>{_mm512_scalef_ph(a.raw, b.raw)};
+}
+#endif
+
+HWY_API Vec512<float> MulByFloorPow2(Vec512<float> a, Vec512<float> b) {
+  return Vec512<float>{_mm512_scalef_ps(a.raw, b.raw)};
+}
+
+HWY_API Vec512<double> MulByFloorPow2(Vec512<double> a, Vec512<double> b) {
+  return Vec512<double>{_mm512_scalef_pd(a.raw, b.raw)};
+}
+
 #if HWY_HAVE_FLOAT16
 HWY_API Vec512<float16_t> operator/(Vec512<float16_t> a, Vec512<float16_t> b) {
   return Vec512<float16_t>{_mm512_div_ph(a.raw, b.raw)};
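MulByFloorPow2 maps to VSCALEF, which computes a * 2^floor(b). A scalar sketch for finite inputs (illustrative only; the intrinsic also defines NaN/infinity cases that this ignores):

#include <cmath>

static double MulByFloorPow2Ref(double a, double b) {
  // ldexp(a, n) scales by 2^n without rounding error in the scale factor.
  return std::ldexp(a, static_cast<int>(std::floor(b)));
}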
@@ -1903,6 +1910,31 @@ HWY_API Vec512<double> ApproximateReciprocal(Vec512<double> v) {
   return Vec512<double>{_mm512_rcp14_pd(v.raw)};
 }
 
+// ------------------------------ GetExponent
+
+#if HWY_HAVE_FLOAT16
+template <class V, HWY_IF_F16(TFromV<V>), HWY_IF_V_SIZE_V(V, 64)>
+HWY_API V GetExponent(V v) {
+  return V{_mm512_getexp_ph(v.raw)};
+}
+#endif
+template <class V, HWY_IF_F32(TFromV<V>), HWY_IF_V_SIZE_V(V, 64)>
+HWY_API V GetExponent(V v) {
+  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
+  return V{_mm512_getexp_ps(v.raw)};
+  HWY_DIAGNOSTICS(pop)
+}
+template <class V, HWY_IF_F64(TFromV<V>), HWY_IF_V_SIZE_V(V, 64)>
+HWY_API V GetExponent(V v) {
+  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
+  return V{_mm512_getexp_pd(v.raw)};
+  HWY_DIAGNOSTICS(pop)
+}
+
 // ------------------------------ MaskedMinOr
 
 template <typename T, HWY_IF_U8(T)>
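GetExponent maps to VGETEXP, which returns floor(log2(|v|)) as a floating-point value. For normal finite inputs std::logb computes the same quantity, so a scalar sketch (illustrative only, edge cases omitted) is:

#include <cmath>

static float GetExponentRef(float v) {
  return static_cast<float>(std::logb(v));  // e.g. 8.0f -> 3.0f, 0.5f -> -1.0f
}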
@@ -2625,74 +2657,54 @@ HWY_API Mask512<T> operator<=(Vec512<T> a, Vec512<T> b) {
 
 // ------------------------------ Mask
 
-namespace detail {
-
-template <typename T>
-HWY_INLINE Mask512<T> MaskFromVec(hwy::SizeTag<1> /*tag*/, Vec512<T> v) {
+template <typename T, HWY_IF_UI8(T)>
+HWY_API Mask512<T> MaskFromVec(Vec512<T> v) {
   return Mask512<T>{_mm512_movepi8_mask(v.raw)};
 }
-template <typename T>
-HWY_INLINE Mask512<T> MaskFromVec(hwy::SizeTag<2> /*tag*/, Vec512<T> v) {
+template <typename T, HWY_IF_UI16(T)>
+HWY_API Mask512<T> MaskFromVec(Vec512<T> v) {
   return Mask512<T>{_mm512_movepi16_mask(v.raw)};
 }
-template <typename T>
-HWY_INLINE Mask512<T> MaskFromVec(hwy::SizeTag<4> /*tag*/, Vec512<T> v) {
+template <typename T, HWY_IF_UI32(T)>
+HWY_API Mask512<T> MaskFromVec(Vec512<T> v) {
   return Mask512<T>{_mm512_movepi32_mask(v.raw)};
 }
-template <typename T>
-HWY_INLINE Mask512<T> MaskFromVec(hwy::SizeTag<8> /*tag*/, Vec512<T> v) {
-  return Mask512<T>{_mm512_movepi64_mask(v.raw)};
-}
-
-}  // namespace detail
-
-template <typename T, HWY_IF_NOT_FLOAT(T)>
+template <typename T, HWY_IF_UI64(T)>
 HWY_API Mask512<T> MaskFromVec(Vec512<T> v) {
-  return detail::MaskFromVec(hwy::SizeTag<sizeof(T)>(), v);
+  return Mask512<T>{_mm512_movepi64_mask(v.raw)};
 }
-template <typename T,
+template <typename T, HWY_IF_FLOAT_OR_SPECIAL(T)>
 HWY_API Mask512<T> MaskFromVec(Vec512<T> v) {
   const RebindToSigned<DFromV<decltype(v)>> di;
   return Mask512<T>{MaskFromVec(BitCast(di, v)).raw};
 }
 
-HWY_API Vec512<uint8_t> VecFromMask(Mask512<uint8_t> v) {
-  return Vec512<uint8_t>{_mm512_movm_epi8(v.raw)};
-}
-HWY_API Vec512<int8_t> VecFromMask(Mask512<int8_t> v) {
-  return Vec512<int8_t>{_mm512_movm_epi8(v.raw)};
-}
-
-HWY_API Vec512<uint16_t> VecFromMask(Mask512<uint16_t> v) {
-  return Vec512<uint16_t>{_mm512_movm_epi16(v.raw)};
+template <typename T, HWY_IF_UI8(T)>
+HWY_API Vec512<T> VecFromMask(Mask512<T> m) {
+  return Vec512<T>{_mm512_movm_epi8(m.raw)};
 }
-HWY_API Vec512<int16_t> VecFromMask(Mask512<int16_t> v) {
-  return Vec512<int16_t>{_mm512_movm_epi16(v.raw)};
+template <typename T, HWY_IF_UI16(T)>
+HWY_API Vec512<T> VecFromMask(Mask512<T> m) {
+  return Vec512<T>{_mm512_movm_epi16(m.raw)};
 }
 #if HWY_HAVE_FLOAT16
-HWY_API Vec512<float16_t> VecFromMask(Mask512<float16_t> v) {
-  return Vec512<float16_t>{_mm512_castsi512_ph(_mm512_movm_epi16(v.raw))};
+HWY_API Vec512<float16_t> VecFromMask(Mask512<float16_t> m) {
+  return Vec512<float16_t>{_mm512_castsi512_ph(_mm512_movm_epi16(m.raw))};
 }
 #endif  // HWY_HAVE_FLOAT16
-
-HWY_API Vec512<uint32_t> VecFromMask(Mask512<uint32_t> v) {
-  return Vec512<uint32_t>{_mm512_movm_epi32(v.raw)};
-}
-HWY_API Vec512<int32_t> VecFromMask(Mask512<int32_t> v) {
-  return Vec512<int32_t>{_mm512_movm_epi32(v.raw)};
-}
-HWY_API Vec512<float> VecFromMask(Mask512<float> v) {
-  return Vec512<float>{_mm512_castsi512_ps(_mm512_movm_epi32(v.raw))};
-}
-
-HWY_API Vec512<uint64_t> VecFromMask(Mask512<uint64_t> v) {
-  return Vec512<uint64_t>{_mm512_movm_epi64(v.raw)};
+template <typename T, HWY_IF_UI32(T)>
+HWY_API Vec512<T> VecFromMask(Mask512<T> m) {
+  return Vec512<T>{_mm512_movm_epi32(m.raw)};
 }
-HWY_API Vec512<int64_t> VecFromMask(Mask512<int64_t> v) {
-  return Vec512<int64_t>{_mm512_movm_epi64(v.raw)};
+template <typename T, HWY_IF_UI64(T)>
+HWY_API Vec512<T> VecFromMask(Mask512<T> m) {
+  return Vec512<T>{_mm512_movm_epi64(m.raw)};
 }
-HWY_API Vec512<double> VecFromMask(Mask512<double> v) {
-  return Vec512<double>{_mm512_castsi512_pd(_mm512_movm_epi64(v.raw))};
+template <typename T, HWY_IF_FLOAT_OR_SPECIAL(T)>
+HWY_API Vec512<T> VecFromMask(Mask512<T> m) {
+  const Full512<T> d;
+  const Full512<MakeSigned<T>> di;
+  return BitCast(d, VecFromMask(RebindMask(di, m)));
 }
 
 // ------------------------------ Mask logical
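The rewritten overloads replace the SizeTag dispatch with per-width constraints, but the per-lane contract is unchanged: a lane is "true" iff its most-significant bit is set (VPMOV*2M), and VecFromMask materializes true lanes as all-one bits (VPMOVM2*). A scalar sketch of that round trip (illustrative, not Highway code):

#include <cstdint>

static bool LaneToMaskBit(int32_t lane) { return lane < 0; }  // MSB set => true

static int32_t MaskBitToLane(bool m) {
  return m ? int32_t{-1} : int32_t{0};  // all ones / all zeros
}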
@@ -3012,9 +3024,26 @@ HWY_API Vec512<int64_t> BroadcastSignBit(Vec512<int64_t> v) {
 
 #if HWY_HAVE_FLOAT16 || HWY_IDE
 
+namespace detail {
+
+template <int kCategories>
+__mmask32 Fix_mm512_fpclass_ph_mask(__m512h v) {
+#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1500
+  // GCC's _mm512_cmp_ph_mask uses `__mmask8` instead of `__mmask32`, hence only
+  // the first 8 lanes are set.
+  return static_cast<__mmask32>(__builtin_ia32_fpclassph512_mask(
+      static_cast<__v32hf>(v), kCategories, static_cast<__mmask32>(-1)));
+#else
+  return _mm512_fpclass_ph_mask(v, kCategories);
+#endif
+}
+
+}  // namespace detail
+
 HWY_API Mask512<float16_t> IsNaN(Vec512<float16_t> v) {
-
-
+  constexpr int kCategories = HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN;
+  return Mask512<float16_t>{
+      detail::Fix_mm512_fpclass_ph_mask<kCategories>(v.raw)};
 }
 
 HWY_API Mask512<float16_t> IsEitherNaN(Vec512<float16_t> a,
@@ -3027,15 +3056,18 @@ HWY_API Mask512<float16_t> IsEitherNaN(Vec512<float16_t> a,
 }
 
 HWY_API Mask512<float16_t> IsInf(Vec512<float16_t> v) {
-
+  constexpr int kCategories = HWY_X86_FPCLASS_POS_INF | HWY_X86_FPCLASS_NEG_INF;
+  return Mask512<float16_t>{
+      detail::Fix_mm512_fpclass_ph_mask<kCategories>(v.raw)};
 }
 
 // Returns whether normal/subnormal/zero. fpclass doesn't have a flag for
 // positive, so we have to check for inf/NaN and negate.
 HWY_API Mask512<float16_t> IsFinite(Vec512<float16_t> v) {
-
-
-
+  constexpr int kCategories = HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN |
+                              HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF;
+  return Not(Mask512<float16_t>{
+      detail::Fix_mm512_fpclass_ph_mask<kCategories>(v.raw)});
 }
 
 #endif  // HWY_HAVE_FLOAT16
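The float16 classification now ORs VFPCLASS category bits and, for IsFinite, negates the result. The scalar meaning is simply the following (illustrative sketch, not Highway code):

#include <cmath>

static bool IsFiniteRef(float v) {
  // Finite == neither NaN nor +/-infinity, i.e. Not(fpclass(NaN | Inf categories)).
  return !std::isnan(v) && !std::isinf(v);
}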
@@ -3571,7 +3603,7 @@ HWY_API T ExtractLane(const Vec512<T> v, size_t i) {
 }
 #endif
 
-  alignas(64) T lanes[
+  alignas(64) T lanes[MaxLanes(d)];
   Store(v, d, lanes);
   return lanes[i];
 }
@@ -4812,6 +4844,18 @@ HWY_API Vec512<double> SwapAdjacentBlocks(Vec512<double> v) {
   return Vec512<double>{_mm512_shuffle_f64x2(v.raw, v.raw, _MM_PERM_CDAB)};
 }
 
+// ------------------------------ InterleaveEvenBlocks
+template <typename T>
+HWY_API Vec512<T> InterleaveEvenBlocks(Full512<T> d, Vec512<T> a, Vec512<T> b) {
+  return OddEvenBlocks(SlideUpBlocks<1>(d, b), a);
+}
+
+// ------------------------------ InterleaveOddBlocks (ConcatUpperUpper)
+template <typename T>
+HWY_API Vec512<T> InterleaveOddBlocks(Full512<T> d, Vec512<T> a, Vec512<T> b) {
+  return OddEvenBlocks(b, SlideDownBlocks<1>(d, a));
+}
+
 // ------------------------------ ReverseBlocks
 
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
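Viewing a 512-bit vector as four 128-bit blocks, the two new ops pick alternating blocks from both inputs. A sketch of the resulting block order, derived from the OddEvenBlocks/Slide*Blocks composition above (illustrative, not Highway code):

#include <array>

using Blocks = std::array<int, 4>;  // one label per 128-bit block

// {a0,a1,a2,a3}, {b0,b1,b2,b3} -> {a0,b0,a2,b2}
static Blocks InterleaveEvenBlocksRef(const Blocks& a, const Blocks& b) {
  return {a[0], b[0], a[2], b[2]};
}

// {a0,a1,a2,a3}, {b0,b1,b2,b3} -> {a1,b1,a3,b3}
static Blocks InterleaveOddBlocksRef(const Blocks& a, const Blocks& b) {
  return {a[1], b[1], a[3], b[3]};
}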
@@ -5529,7 +5573,9 @@ HWY_API VFromD<D> PromoteTo(D /* tag */, Vec256<uint32_t> v) {
 
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I64_D(D)>
 HWY_API VFromD<D> PromoteInRangeTo(D /*di64*/, VFromD<Rebind<float, D>> v) {
-#if HWY_COMPILER_GCC_ACTUAL
+#if HWY_X86_HAVE_AVX10_2_OPS
+  return VFromD<D>{_mm512_cvtts_ps_epi64(v.raw)};
+#elif HWY_COMPILER_GCC_ACTUAL
   // Workaround for undefined behavior with GCC if any values of v[i] are not
   // within the range of an int64_t
 
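This and the following conversion hunks all follow the same pattern: when AVX10.2 is available, use the new saturating-truncate intrinsics directly; otherwise keep the existing GCC undefined-behavior workaround. For inputs the caller guarantees to be representable, the per-lane result is plain truncation toward zero (scalar sketch, illustrative only):

#include <cstdint>

static int64_t PromoteInRangeToRef(float v) {
  // Caller guarantees v is within int64_t range, so the cast is well defined.
  return static_cast<int64_t>(v);
}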
@@ -5561,7 +5607,9 @@ HWY_API VFromD<D> PromoteInRangeTo(D /*di64*/, VFromD<Rebind<float, D>> v) {
 }
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U64_D(D)>
 HWY_API VFromD<D> PromoteInRangeTo(D /* tag */, VFromD<Rebind<float, D>> v) {
-#if HWY_COMPILER_GCC_ACTUAL
+#if HWY_X86_HAVE_AVX10_2_OPS
+  return VFromD<D>{_mm512_cvtts_ps_epu64(v.raw)};
+#elif HWY_COMPILER_GCC_ACTUAL
   // Workaround for undefined behavior with GCC if any values of v[i] are not
   // within the range of an uint64_t
 
@@ -5830,19 +5878,6 @@ HWY_API VFromD<D> ReorderDemote2To(D dn, Vec512<uint16_t> a,
                                      BitCast(di16, Min(b, max_i16)));
 }
 
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
-HWY_API VFromD<D> ReorderDemote2To(D dn, Vec512<int64_t> a, Vec512<int64_t> b) {
-  const Half<decltype(dn)> dnh;
-  return Combine(dn, DemoteTo(dnh, b), DemoteTo(dnh, a));
-}
-
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U32_D(D)>
-HWY_API VFromD<D> ReorderDemote2To(D dn, Vec512<uint64_t> a,
-                                   Vec512<uint64_t> b) {
-  const Half<decltype(dn)> dnh;
-  return Combine(dn, DemoteTo(dnh, b), DemoteTo(dnh, a));
-}
-
 template <class D, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>),
           HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
           HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
@@ -5855,15 +5890,6 @@ HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
                                              SetTableIndices(du64, kIdx)));
 }
 
-template <class D, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>),
-          HWY_IF_V_SIZE_GT_D(D, 16), class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
-          HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
-          HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2),
-          HWY_IF_T_SIZE_V(V, 8)>
-HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
-  return ReorderDemote2To(d, a, b);
-}
-
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
 HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<double> v) {
   return VFromD<D>{_mm512_cvtpd_ps(v.raw)};
@@ -5871,7 +5897,9 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<double> v) {
 
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
 HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, Vec512<double> v) {
-#if HWY_COMPILER_GCC_ACTUAL
+#if HWY_X86_HAVE_AVX10_2_OPS
+  return VFromD<D>{_mm512_cvtts_pd_epi32(v.raw)};
+#elif HWY_COMPILER_GCC_ACTUAL
   // Workaround for undefined behavior in _mm512_cvttpd_epi32 with GCC if any
   // values of v[i] are not within the range of an int32_t
 
@@ -5879,7 +5907,8 @@ HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, Vec512<double> v) {
   if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
     typedef double GccF64RawVectType __attribute__((__vector_size__(64)));
     const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
-    return VFromD<D>{_mm256_setr_epi32(
+    return VFromD<D>{
+        _mm256_setr_epi32(
         detail::X86ConvertScalarFromFloat<int32_t>(raw_v[0]),
         detail::X86ConvertScalarFromFloat<int32_t>(raw_v[1]),
         detail::X86ConvertScalarFromFloat<int32_t>(raw_v[2]),
@@ -5887,7 +5916,8 @@ HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, Vec512<double> v) {
         detail::X86ConvertScalarFromFloat<int32_t>(raw_v[4]),
         detail::X86ConvertScalarFromFloat<int32_t>(raw_v[5]),
         detail::X86ConvertScalarFromFloat<int32_t>(raw_v[6]),
-        detail::X86ConvertScalarFromFloat<int32_t>(raw_v[7]))};
+        detail::X86ConvertScalarFromFloat<int32_t>(raw_v[7]))
+    };
   }
 #endif
 
@@ -5904,7 +5934,9 @@ HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, Vec512<double> v) {
 
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
 HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, Vec512<double> v) {
-#if HWY_COMPILER_GCC_ACTUAL
+#if HWY_X86_HAVE_AVX10_2_OPS
+  return VFromD<D>{_mm512_cvtts_pd_epu32(v.raw)};
+#elif HWY_COMPILER_GCC_ACTUAL
   // Workaround for undefined behavior in _mm512_cvttpd_epu32 with GCC if any
   // values of v[i] are not within the range of an uint32_t
 
@@ -6250,7 +6282,9 @@ HWY_API VFromD<D> ConvertInRangeTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
 #endif  // HWY_HAVE_FLOAT16
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I32_D(D)>
 HWY_API VFromD<D> ConvertInRangeTo(D /*d*/, Vec512<float> v) {
-#if HWY_COMPILER_GCC_ACTUAL
+#if HWY_X86_HAVE_AVX10_2_OPS
+  return VFromD<D>{_mm512_cvtts_ps_epi32(v.raw)};
+#elif HWY_COMPILER_GCC_ACTUAL
   // Workaround for undefined behavior in _mm512_cvttps_epi32 with GCC if any
   // values of v[i] are not within the range of an int32_t
 
@@ -6290,7 +6324,9 @@ HWY_API VFromD<D> ConvertInRangeTo(D /*d*/, Vec512<float> v) {
 }
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I64_D(D)>
 HWY_API VFromD<D> ConvertInRangeTo(D /*di*/, Vec512<double> v) {
-#if HWY_COMPILER_GCC_ACTUAL
+#if HWY_X86_HAVE_AVX10_2_OPS
+  return VFromD<D>{_mm512_cvtts_pd_epi64(v.raw)};
+#elif HWY_COMPILER_GCC_ACTUAL
   // Workaround for undefined behavior in _mm512_cvttpd_epi64 with GCC if any
   // values of v[i] are not within the range of an int64_t
 
@@ -6322,7 +6358,9 @@ HWY_API VFromD<D> ConvertInRangeTo(D /*di*/, Vec512<double> v) {
 }
 template <class DU, HWY_IF_V_SIZE_D(DU, 64), HWY_IF_U32_D(DU)>
 HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
-#if HWY_COMPILER_GCC_ACTUAL
+#if HWY_X86_HAVE_AVX10_2_OPS
+  return VFromD<DU>{_mm512_cvtts_ps_epu32(v.raw)};
+#elif HWY_COMPILER_GCC_ACTUAL
   // Workaround for undefined behavior in _mm512_cvttps_epu32 with GCC if any
   // values of v[i] are not within the range of an uint32_t
 
@@ -6378,7 +6416,9 @@ HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
 }
 template <class DU, HWY_IF_V_SIZE_D(DU, 64), HWY_IF_U64_D(DU)>
 HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
-#if HWY_COMPILER_GCC_ACTUAL
+#if HWY_X86_HAVE_AVX10_2_OPS
+  return VFromD<DU>{_mm512_cvtts_pd_epu64(v.raw)};
+#elif HWY_COMPILER_GCC_ACTUAL
   // Workaround for undefined behavior in _mm512_cvttpd_epu64 with GCC if any
   // values of v[i] are not within the range of an uint64_t
 
@@ -6418,7 +6458,8 @@ HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
 }
 
 template <class DI, HWY_IF_V_SIZE_D(DI, 64), HWY_IF_I32_D(DI)>
-HWY_INLINE VFromD<DI> NearestIntInRange(DI, VFromD<RebindToFloat<DI>> v) {
+static HWY_INLINE VFromD<DI> NearestIntInRange(DI,
+                                               VFromD<RebindToFloat<DI>> v) {
 #if HWY_COMPILER_GCC_ACTUAL
   // Workaround for undefined behavior in _mm512_cvtps_epi32 with GCC if any
   // values of v[i] are not within the range of an int32_t
@@ -6458,6 +6499,134 @@ HWY_INLINE VFromD<DI> NearestIntInRange(DI, VFromD<RebindToFloat<DI>> v) {
 #endif
 }
 
+#if HWY_HAVE_FLOAT16
+template <class DI, HWY_IF_V_SIZE_D(DI, 64), HWY_IF_I16_D(DI)>
+static HWY_INLINE VFromD<DI> NearestIntInRange(DI /*d*/, Vec512<float16_t> v) {
+#if HWY_COMPILER_GCC_ACTUAL
+  // Workaround for undefined behavior in _mm512_cvtph_epi16 with GCC if any
+  // values of v[i] are not within the range of an int16_t
+
+#if HWY_COMPILER_GCC_ACTUAL >= 1200 && !HWY_IS_DEBUG_BUILD && \
+    HWY_HAVE_SCALAR_F16_TYPE
+  if (detail::IsConstantX86VecForF2IConv<int16_t>(v)) {
+    typedef hwy::float16_t::Native GccF16RawVectType
+        __attribute__((__vector_size__(64)));
+    const auto raw_v = reinterpret_cast<GccF16RawVectType>(v.raw);
+    return VFromD<DI>{
+        _mm512_set_epi16(detail::X86ScalarNearestInt<int16_t>(raw_v[31]),
+                         detail::X86ScalarNearestInt<int16_t>(raw_v[30]),
+                         detail::X86ScalarNearestInt<int16_t>(raw_v[29]),
+                         detail::X86ScalarNearestInt<int16_t>(raw_v[28]),
+                         detail::X86ScalarNearestInt<int16_t>(raw_v[27]),
+                         detail::X86ScalarNearestInt<int16_t>(raw_v[26]),
+                         detail::X86ScalarNearestInt<int16_t>(raw_v[25]),
+                         detail::X86ScalarNearestInt<int16_t>(raw_v[24]),
+                         detail::X86ScalarNearestInt<int16_t>(raw_v[23]),
+                         detail::X86ScalarNearestInt<int16_t>(raw_v[22]),
+                         detail::X86ScalarNearestInt<int16_t>(raw_v[21]),
+                         detail::X86ScalarNearestInt<int16_t>(raw_v[20]),
+                         detail::X86ScalarNearestInt<int16_t>(raw_v[19]),
+                         detail::X86ScalarNearestInt<int16_t>(raw_v[18]),
+                         detail::X86ScalarNearestInt<int16_t>(raw_v[17]),
+                         detail::X86ScalarNearestInt<int16_t>(raw_v[16]),
+                         detail::X86ScalarNearestInt<int16_t>(raw_v[15]),
+                         detail::X86ScalarNearestInt<int16_t>(raw_v[14]),
+                         detail::X86ScalarNearestInt<int16_t>(raw_v[13]),
+                         detail::X86ScalarNearestInt<int16_t>(raw_v[12]),
+                         detail::X86ScalarNearestInt<int16_t>(raw_v[11]),
+                         detail::X86ScalarNearestInt<int16_t>(raw_v[10]),
+                         detail::X86ScalarNearestInt<int16_t>(raw_v[9]),
+                         detail::X86ScalarNearestInt<int16_t>(raw_v[8]),
+                         detail::X86ScalarNearestInt<int16_t>(raw_v[7]),
+                         detail::X86ScalarNearestInt<int16_t>(raw_v[6]),
+                         detail::X86ScalarNearestInt<int16_t>(raw_v[5]),
+                         detail::X86ScalarNearestInt<int16_t>(raw_v[4]),
+                         detail::X86ScalarNearestInt<int16_t>(raw_v[3]),
+                         detail::X86ScalarNearestInt<int16_t>(raw_v[2]),
+                         detail::X86ScalarNearestInt<int16_t>(raw_v[1]),
+                         detail::X86ScalarNearestInt<int16_t>(raw_v[0]))};
+  }
+#endif
+
+  __m512i raw_result;
+  __asm__("vcvtph2w {%1, %0|%0, %1}"
+          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+          :);
+  return VFromD<DI>{raw_result};
+#else
+  return VFromD<DI>{_mm512_cvtph_epi16(v.raw)};
+#endif
+}
+#endif  // HWY_HAVE_FLOAT16
+
+template <class DI, HWY_IF_V_SIZE_D(DI, 64), HWY_IF_I64_D(DI)>
+static HWY_INLINE VFromD<DI> NearestIntInRange(DI /*di*/, Vec512<double> v) {
+#if HWY_COMPILER_GCC_ACTUAL
+  // Workaround for undefined behavior in _mm512_cvtpd_epi64 with GCC if any
+  // values of v[i] are not within the range of an int64_t
+
+#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+  if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) {
+    typedef double GccF64RawVectType __attribute__((__vector_size__(64)));
+    const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
+    return VFromD<DI>{
+        _mm512_setr_epi64(detail::X86ScalarNearestInt<int64_t>(raw_v[0]),
+                          detail::X86ScalarNearestInt<int64_t>(raw_v[1]),
+                          detail::X86ScalarNearestInt<int64_t>(raw_v[2]),
+                          detail::X86ScalarNearestInt<int64_t>(raw_v[3]),
+                          detail::X86ScalarNearestInt<int64_t>(raw_v[4]),
+                          detail::X86ScalarNearestInt<int64_t>(raw_v[5]),
+                          detail::X86ScalarNearestInt<int64_t>(raw_v[6]),
+                          detail::X86ScalarNearestInt<int64_t>(raw_v[7]))};
+  }
+#endif
+
+  __m512i raw_result;
+  __asm__("vcvtpd2qq {%1, %0|%0, %1}"
+          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+          :);
+  return VFromD<DI>{raw_result};
+#else
+  return VFromD<DI>{_mm512_cvtpd_epi64(v.raw)};
+#endif
+}
+
+template <class DI, HWY_IF_V_SIZE_D(DI, 32), HWY_IF_I32_D(DI)>
+static HWY_INLINE VFromD<DI> DemoteToNearestIntInRange(DI /* tag */,
+                                                       Vec512<double> v) {
+#if HWY_COMPILER_GCC_ACTUAL
+  // Workaround for undefined behavior in _mm512_cvtpd_epi32 with GCC if any
+  // values of v[i] are not within the range of an int32_t
+
+#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+  if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
+    typedef double GccF64RawVectType __attribute__((__vector_size__(64)));
+    const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
+    return VFromD<DI>{
+        _mm256_setr_epi32(detail::X86ScalarNearestInt<int32_t>(raw_v[0]),
+                          detail::X86ScalarNearestInt<int32_t>(raw_v[1]),
+                          detail::X86ScalarNearestInt<int32_t>(raw_v[2]),
+                          detail::X86ScalarNearestInt<int32_t>(raw_v[3]),
+                          detail::X86ScalarNearestInt<int32_t>(raw_v[4]),
+                          detail::X86ScalarNearestInt<int32_t>(raw_v[5]),
+                          detail::X86ScalarNearestInt<int32_t>(raw_v[6]),
+                          detail::X86ScalarNearestInt<int32_t>(raw_v[7]))};
+  }
+#endif
+
+  __m256i raw_result;
+  __asm__("vcvtpd2dq {%1, %0|%0, %1}"
+          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+          :);
+  return VFromD<DI>{raw_result};
+#else
+  return VFromD<DI>{_mm512_cvtpd_epi32(v.raw)};
+#endif
+}
+
 // ================================================== CRYPTO
 
 #if !defined(HWY_DISABLE_PCLMUL_AES)
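The new NearestIntInRange/DemoteToNearestIntInRange overloads mirror the existing i32 one: constant inputs are folded via the scalar helpers, otherwise inline asm prevents GCC from constant-folding the intrinsic into undefined behavior. For in-range inputs the per-lane operation is round-to-nearest in the current (default: ties-to-even) rounding mode; a scalar sketch (illustrative only):

#include <cmath>
#include <cstdint>

static int64_t NearestInt64Ref(double v) {
  // std::nearbyint rounds in the current FP rounding mode, like VCVTPD2QQ.
  return static_cast<int64_t>(std::nearbyint(v));  // caller guarantees v is in range
}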
@@ -6788,376 +6957,6 @@ HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) {
|
|
|
6788
6957
|
|
|
6789
6958
|
// ------------------------------ Compress
|
|
6790
6959
|
|
|
6791
|
-
// Always implement 8-bit here even if we lack VBMI2 because we can do better
|
|
6792
|
-
// than generic_ops (8 at a time) via the native 32-bit compress (16 at a time).
|
|
6793
|
-
#ifdef HWY_NATIVE_COMPRESS8
|
|
6794
|
-
#undef HWY_NATIVE_COMPRESS8
|
|
6795
|
-
#else
|
|
6796
|
-
#define HWY_NATIVE_COMPRESS8
|
|
6797
|
-
#endif
|
|
6798
|
-
|
|
6799
|
-
namespace detail {
|
|
6800
|
-
|
|
6801
|
-
#if HWY_TARGET <= HWY_AVX3_DL // VBMI2
|
|
6802
|
-
template <size_t N>
|
|
6803
|
-
HWY_INLINE Vec128<uint8_t, N> NativeCompress(const Vec128<uint8_t, N> v,
|
|
6804
|
-
const Mask128<uint8_t, N> mask) {
|
|
6805
|
-
return Vec128<uint8_t, N>{_mm_maskz_compress_epi8(mask.raw, v.raw)};
|
|
6806
|
-
}
|
|
6807
|
-
HWY_INLINE Vec256<uint8_t> NativeCompress(const Vec256<uint8_t> v,
|
|
6808
|
-
const Mask256<uint8_t> mask) {
|
|
6809
|
-
return Vec256<uint8_t>{_mm256_maskz_compress_epi8(mask.raw, v.raw)};
|
|
6810
|
-
}
|
|
6811
|
-
HWY_INLINE Vec512<uint8_t> NativeCompress(const Vec512<uint8_t> v,
|
|
6812
|
-
const Mask512<uint8_t> mask) {
|
|
6813
|
-
return Vec512<uint8_t>{_mm512_maskz_compress_epi8(mask.raw, v.raw)};
|
|
6814
|
-
}
|
|
6815
|
-
|
|
6816
|
-
template <size_t N>
|
|
6817
|
-
HWY_INLINE Vec128<uint16_t, N> NativeCompress(const Vec128<uint16_t, N> v,
|
|
6818
|
-
const Mask128<uint16_t, N> mask) {
|
|
6819
|
-
return Vec128<uint16_t, N>{_mm_maskz_compress_epi16(mask.raw, v.raw)};
|
|
6820
|
-
}
|
|
6821
|
-
HWY_INLINE Vec256<uint16_t> NativeCompress(const Vec256<uint16_t> v,
|
|
6822
|
-
const Mask256<uint16_t> mask) {
|
|
6823
|
-
return Vec256<uint16_t>{_mm256_maskz_compress_epi16(mask.raw, v.raw)};
|
|
6824
|
-
}
|
|
6825
|
-
HWY_INLINE Vec512<uint16_t> NativeCompress(const Vec512<uint16_t> v,
|
|
6826
|
-
const Mask512<uint16_t> mask) {
|
|
6827
|
-
return Vec512<uint16_t>{_mm512_maskz_compress_epi16(mask.raw, v.raw)};
|
|
6828
|
-
}
|
|
6829
|
-
|
|
6830
|
-
// Slow on Zen4, do not even define these to prevent accidental usage.
|
|
6831
|
-
#if HWY_TARGET != HWY_AVX3_ZEN4
|
|
6832
|
-
|
|
6833
|
-
template <size_t N>
|
|
6834
|
-
HWY_INLINE void NativeCompressStore(Vec128<uint8_t, N> v,
|
|
6835
|
-
Mask128<uint8_t, N> mask,
|
|
6836
|
-
uint8_t* HWY_RESTRICT unaligned) {
|
|
6837
|
-
_mm_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw);
|
|
6838
|
-
}
|
|
6839
|
-
HWY_INLINE void NativeCompressStore(Vec256<uint8_t> v, Mask256<uint8_t> mask,
|
|
6840
|
-
uint8_t* HWY_RESTRICT unaligned) {
|
|
6841
|
-
_mm256_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw);
|
|
6842
|
-
}
|
|
6843
|
-
HWY_INLINE void NativeCompressStore(Vec512<uint8_t> v, Mask512<uint8_t> mask,
|
|
6844
|
-
uint8_t* HWY_RESTRICT unaligned) {
|
|
6845
|
-
_mm512_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw);
|
|
6846
|
-
}
|
|
6847
|
-
|
|
6848
|
-
template <size_t N>
|
|
6849
|
-
HWY_INLINE void NativeCompressStore(Vec128<uint16_t, N> v,
|
|
6850
|
-
Mask128<uint16_t, N> mask,
|
|
6851
|
-
uint16_t* HWY_RESTRICT unaligned) {
|
|
6852
|
-
_mm_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw);
|
|
6853
|
-
}
|
|
6854
|
-
HWY_INLINE void NativeCompressStore(Vec256<uint16_t> v, Mask256<uint16_t> mask,
|
|
6855
|
-
uint16_t* HWY_RESTRICT unaligned) {
|
|
6856
|
-
_mm256_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw);
|
|
6857
|
-
}
|
|
6858
|
-
HWY_INLINE void NativeCompressStore(Vec512<uint16_t> v, Mask512<uint16_t> mask,
|
|
6859
|
-
uint16_t* HWY_RESTRICT unaligned) {
|
|
6860
|
-
_mm512_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw);
|
|
6861
|
-
}
|
|
6862
|
-
|
|
6863
|
-
#endif // HWY_TARGET != HWY_AVX3_ZEN4
|
|
6864
|
-
|
|
6865
|
-
HWY_INLINE Vec512<uint8_t> NativeExpand(Vec512<uint8_t> v,
|
|
6866
|
-
Mask512<uint8_t> mask) {
|
|
6867
|
-
return Vec512<uint8_t>{_mm512_maskz_expand_epi8(mask.raw, v.raw)};
|
|
6868
|
-
}
|
|
6869
|
-
|
|
6870
|
-
HWY_INLINE Vec512<uint16_t> NativeExpand(Vec512<uint16_t> v,
|
|
6871
|
-
Mask512<uint16_t> mask) {
|
|
6872
|
-
return Vec512<uint16_t>{_mm512_maskz_expand_epi16(mask.raw, v.raw)};
|
|
6873
|
-
}
|
|
6874
|
-
|
|
6875
|
-
template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U8_D(D)>
|
|
6876
|
-
HWY_INLINE VFromD<D> NativeLoadExpand(Mask512<uint8_t> mask, D /* d */,
|
|
6877
|
-
const uint8_t* HWY_RESTRICT unaligned) {
|
|
6878
|
-
return VFromD<D>{_mm512_maskz_expandloadu_epi8(mask.raw, unaligned)};
|
|
6879
|
-
}
|
|
6880
|
-
|
|
6881
|
-
template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)>
|
|
6882
|
-
HWY_INLINE VFromD<D> NativeLoadExpand(Mask512<uint16_t> mask, D /* d */,
|
|
6883
|
-
const uint16_t* HWY_RESTRICT unaligned) {
|
|
6884
|
-
return VFromD<D>{_mm512_maskz_expandloadu_epi16(mask.raw, unaligned)};
|
|
6885
|
-
}
|
|
6886
|
-
|
|
6887
|
-
#endif // HWY_TARGET <= HWY_AVX3_DL
|
|
6888
|
-
|
|
6889
|
-
template <size_t N>
|
|
6890
|
-
HWY_INLINE Vec128<uint32_t, N> NativeCompress(Vec128<uint32_t, N> v,
|
|
6891
|
-
Mask128<uint32_t, N> mask) {
|
|
6892
|
-
return Vec128<uint32_t, N>{_mm_maskz_compress_epi32(mask.raw, v.raw)};
|
|
6893
|
-
}
|
|
6894
|
-
HWY_INLINE Vec256<uint32_t> NativeCompress(Vec256<uint32_t> v,
|
|
6895
|
-
Mask256<uint32_t> mask) {
|
|
6896
|
-
return Vec256<uint32_t>{_mm256_maskz_compress_epi32(mask.raw, v.raw)};
|
|
6897
|
-
}
|
|
6898
|
-
HWY_INLINE Vec512<uint32_t> NativeCompress(Vec512<uint32_t> v,
|
|
6899
|
-
Mask512<uint32_t> mask) {
|
|
6900
|
-
return Vec512<uint32_t>{_mm512_maskz_compress_epi32(mask.raw, v.raw)};
|
|
6901
|
-
}
|
|
6902
|
-
// We use table-based compress for 64-bit lanes, see CompressIsPartition.
|
|
6903
|
-
|
|
6904
|
-
// Slow on Zen4, do not even define these to prevent accidental usage.
|
|
6905
|
-
#if HWY_TARGET != HWY_AVX3_ZEN4
|
|
6906
|
-
|
|
6907
|
-
template <size_t N>
|
|
6908
|
-
HWY_INLINE void NativeCompressStore(Vec128<uint32_t, N> v,
|
|
6909
|
-
Mask128<uint32_t, N> mask,
|
|
6910
|
-
uint32_t* HWY_RESTRICT unaligned) {
|
|
6911
|
-
_mm_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
|
|
6912
|
-
}
|
|
6913
|
-
HWY_INLINE void NativeCompressStore(Vec256<uint32_t> v, Mask256<uint32_t> mask,
|
|
6914
|
-
uint32_t* HWY_RESTRICT unaligned) {
|
|
6915
|
-
_mm256_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
|
|
6916
|
-
}
|
|
6917
|
-
HWY_INLINE void NativeCompressStore(Vec512<uint32_t> v, Mask512<uint32_t> mask,
|
|
6918
|
-
uint32_t* HWY_RESTRICT unaligned) {
|
|
6919
|
-
_mm512_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
|
|
6920
|
-
}
|
|
6921
|
-
|
|
6922
|
-
template <size_t N>
|
|
6923
|
-
HWY_INLINE void NativeCompressStore(Vec128<uint64_t, N> v,
|
|
6924
|
-
Mask128<uint64_t, N> mask,
|
|
6925
|
-
uint64_t* HWY_RESTRICT unaligned) {
|
|
6926
|
-
_mm_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
|
|
6927
|
-
}
|
|
6928
|
-
HWY_INLINE void NativeCompressStore(Vec256<uint64_t> v, Mask256<uint64_t> mask,
|
|
6929
|
-
uint64_t* HWY_RESTRICT unaligned) {
|
|
6930
|
-
_mm256_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
|
|
6931
|
-
}
|
|
6932
|
-
HWY_INLINE void NativeCompressStore(Vec512<uint64_t> v, Mask512<uint64_t> mask,
|
|
6933
|
-
uint64_t* HWY_RESTRICT unaligned) {
|
|
6934
|
-
_mm512_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
|
|
6935
|
-
}
|
|
6936
|
-
|
|
6937
|
-
template <size_t N>
|
|
6938
|
-
HWY_INLINE void NativeCompressStore(Vec128<float, N> v, Mask128<float, N> mask,
|
|
6939
|
-
float* HWY_RESTRICT unaligned) {
|
|
6940
|
-
_mm_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
|
|
6941
|
-
}
|
|
6942
|
-
HWY_INLINE void NativeCompressStore(Vec256<float> v, Mask256<float> mask,
|
|
6943
|
-
float* HWY_RESTRICT unaligned) {
|
|
6944
|
-
_mm256_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
|
|
6945
|
-
}
|
|
6946
|
-
HWY_INLINE void NativeCompressStore(Vec512<float> v, Mask512<float> mask,
|
|
6947
|
-
float* HWY_RESTRICT unaligned) {
|
|
6948
|
-
_mm512_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
|
|
6949
|
-
}
|
|
6950
|
-
|
|
6951
|
-
template <size_t N>
|
|
6952
|
-
HWY_INLINE void NativeCompressStore(Vec128<double, N> v,
|
|
6953
|
-
Mask128<double, N> mask,
|
|
6954
|
-
double* HWY_RESTRICT unaligned) {
|
|
6955
|
-
_mm_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
|
|
6956
|
-
}
|
|
6957
|
-
HWY_INLINE void NativeCompressStore(Vec256<double> v, Mask256<double> mask,
|
|
6958
|
-
double* HWY_RESTRICT unaligned) {
|
|
6959
|
-
_mm256_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
|
|
6960
|
-
}
|
|
6961
|
-
HWY_INLINE void NativeCompressStore(Vec512<double> v, Mask512<double> mask,
|
|
6962
|
-
double* HWY_RESTRICT unaligned) {
|
|
6963
|
-
_mm512_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
|
|
6964
|
-
}
|
|
6965
|
-
|
|
6966
|
-
#endif // HWY_TARGET != HWY_AVX3_ZEN4
|
|
6967
|
-
|
|
6968
|
-
HWY_INLINE Vec512<uint32_t> NativeExpand(Vec512<uint32_t> v,
|
|
6969
|
-
Mask512<uint32_t> mask) {
|
|
6970
|
-
return Vec512<uint32_t>{_mm512_maskz_expand_epi32(mask.raw, v.raw)};
|
|
6971
|
-
}
|
|
6972
|
-
|
|
6973
|
-
HWY_INLINE Vec512<uint64_t> NativeExpand(Vec512<uint64_t> v,
|
|
6974
|
-
Mask512<uint64_t> mask) {
|
|
6975
|
-
return Vec512<uint64_t>{_mm512_maskz_expand_epi64(mask.raw, v.raw)};
|
|
6976
|
-
}
|
|
6977
|
-
|
|
6978
|
-
template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U32_D(D)>
|
|
6979
|
-
HWY_INLINE VFromD<D> NativeLoadExpand(Mask512<uint32_t> mask, D /* d */,
|
|
6980
|
-
const uint32_t* HWY_RESTRICT unaligned) {
|
|
6981
|
-
return VFromD<D>{_mm512_maskz_expandloadu_epi32(mask.raw, unaligned)};
|
|
6982
|
-
}
|
|
6983
|
-
|
|
6984
|
-
template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U64_D(D)>
|
|
6985
|
-
HWY_INLINE VFromD<D> NativeLoadExpand(Mask512<uint64_t> mask, D /* d */,
|
|
6986
|
-
const uint64_t* HWY_RESTRICT unaligned) {
|
|
6987
|
-
return VFromD<D>{_mm512_maskz_expandloadu_epi64(mask.raw, unaligned)};
|
|
6988
|
-
}
|
|
6989
|
-
|
|
6990
|
-
// For u8x16 and <= u16x16 we can avoid store+load for Compress because there is
|
|
6991
|
-
// only a single compressed vector (u32x16). Other EmuCompress are implemented
|
|
6992
|
-
// after the EmuCompressStore they build upon.
|
|
6993
|
-
template <size_t N>
|
|
6994
|
-
HWY_INLINE Vec128<uint8_t, N> EmuCompress(Vec128<uint8_t, N> v,
|
|
6995
|
-
Mask128<uint8_t, N> mask) {
|
|
6996
|
-
const DFromV<decltype(v)> d;
|
|
6997
|
-
const Rebind<uint32_t, decltype(d)> d32;
|
|
6998
|
-
const VFromD<decltype(d32)> v0 = PromoteTo(d32, v);
|
|
6999
|
-
|
|
7000
|
-
const uint64_t mask_bits{mask.raw};
|
|
7001
|
-
// Mask type is __mmask16 if v is full 128, else __mmask8.
|
|
7002
|
-
using M32 = MFromD<decltype(d32)>;
|
|
7003
|
-
const M32 m0{static_cast<typename M32::Raw>(mask_bits)};
|
|
7004
|
-
return TruncateTo(d, Compress(v0, m0));
|
|
7005
|
-
}
|
|
7006
|
-
|
|
7007
|
-
template <size_t N>
|
|
7008
|
-
HWY_INLINE Vec128<uint16_t, N> EmuCompress(Vec128<uint16_t, N> v,
|
|
7009
|
-
Mask128<uint16_t, N> mask) {
|
|
7010
|
-
const DFromV<decltype(v)> d;
|
|
7011
|
-
const Rebind<int32_t, decltype(d)> di32;
|
|
7012
|
-
const RebindToUnsigned<decltype(di32)> du32;
|
|
7013
|
-
const MFromD<decltype(du32)> mask32{static_cast<__mmask8>(mask.raw)};
|
|
7014
|
-
// DemoteTo is 2 ops, but likely lower latency than TruncateTo on SKX.
|
|
7015
|
-
// Only i32 -> u16 is supported, whereas NativeCompress expects u32.
|
|
7016
|
-
const VFromD<decltype(du32)> v32 = BitCast(du32, PromoteTo(di32, v));
|
|
7017
|
-
return DemoteTo(d, BitCast(di32, NativeCompress(v32, mask32)));
|
|
7018
|
-
}
|
|
7019
|
-
|
|
7020
|
-
HWY_INLINE Vec256<uint16_t> EmuCompress(Vec256<uint16_t> v,
|
|
7021
|
-
Mask256<uint16_t> mask) {
|
|
7022
|
-
const DFromV<decltype(v)> d;
|
|
7023
|
-
const Rebind<int32_t, decltype(d)> di32;
|
|
7024
|
-
const RebindToUnsigned<decltype(di32)> du32;
|
|
7025
|
-
const Mask512<uint32_t> mask32{static_cast<__mmask16>(mask.raw)};
|
|
7026
|
-
const Vec512<uint32_t> v32 = BitCast(du32, PromoteTo(di32, v));
|
|
7027
|
-
return DemoteTo(d, BitCast(di32, NativeCompress(v32, mask32)));
|
|
7028
|
-
}
|
|
7029
|
-
|
|
7030
|
-
// See above - small-vector EmuCompressStore are implemented via EmuCompress.
|
|
7031
|
-
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
7032
|
-
HWY_INLINE void EmuCompressStore(VFromD<D> v, MFromD<D> mask, D d,
|
|
7033
|
-
TFromD<D>* HWY_RESTRICT unaligned) {
|
|
7034
|
-
StoreU(EmuCompress(v, mask), d, unaligned);
|
|
7035
|
-
}
|
|
7036
|
-
|
|
7037
|
-
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U16_D(D)>
|
|
7038
|
-
HWY_INLINE void EmuCompressStore(VFromD<D> v, MFromD<D> mask, D d,
|
|
7039
|
-
TFromD<D>* HWY_RESTRICT unaligned) {
|
|
7040
|
-
StoreU(EmuCompress(v, mask), d, unaligned);
|
|
7041
|
-
}
|
|
7042
|
-
|
|
7043
|
-
// Main emulation logic for wider vector, starting with EmuCompressStore because
|
|
7044
|
-
// it is most convenient to merge pieces using memory (concatenating vectors at
|
|
7045
|
-
// byte offsets is difficult).
|
|
7046
|
-
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U8_D(D)>
|
|
7047
|
-
HWY_INLINE void EmuCompressStore(VFromD<D> v, MFromD<D> mask, D d,
|
|
7048
|
-
TFromD<D>* HWY_RESTRICT unaligned) {
|
|
-  const uint64_t mask_bits{mask.raw};
-  const Half<decltype(d)> dh;
-  const Rebind<uint32_t, decltype(dh)> d32;
-  const Vec512<uint32_t> v0 = PromoteTo(d32, LowerHalf(v));
-  const Vec512<uint32_t> v1 = PromoteTo(d32, UpperHalf(dh, v));
-  const Mask512<uint32_t> m0{static_cast<__mmask16>(mask_bits & 0xFFFFu)};
-  const Mask512<uint32_t> m1{static_cast<__mmask16>(mask_bits >> 16)};
-  const Vec128<uint8_t> c0 = TruncateTo(dh, NativeCompress(v0, m0));
-  const Vec128<uint8_t> c1 = TruncateTo(dh, NativeCompress(v1, m1));
-  uint8_t* HWY_RESTRICT pos = unaligned;
-  StoreU(c0, dh, pos);
-  StoreU(c1, dh, pos + CountTrue(d32, m0));
-}
-
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U8_D(D)>
-HWY_INLINE void EmuCompressStore(VFromD<D> v, MFromD<D> mask, D d,
-                                 TFromD<D>* HWY_RESTRICT unaligned) {
-  const uint64_t mask_bits{mask.raw};
-  const Half<Half<decltype(d)>> dq;
-  const Rebind<uint32_t, decltype(dq)> d32;
-  alignas(64) uint8_t lanes[64];
-  Store(v, d, lanes);
-  const Vec512<uint32_t> v0 = PromoteTo(d32, LowerHalf(LowerHalf(v)));
-  const Vec512<uint32_t> v1 = PromoteTo(d32, Load(dq, lanes + 16));
-  const Vec512<uint32_t> v2 = PromoteTo(d32, Load(dq, lanes + 32));
-  const Vec512<uint32_t> v3 = PromoteTo(d32, Load(dq, lanes + 48));
-  const Mask512<uint32_t> m0{static_cast<__mmask16>(mask_bits & 0xFFFFu)};
-  const Mask512<uint32_t> m1{
-      static_cast<uint16_t>((mask_bits >> 16) & 0xFFFFu)};
-  const Mask512<uint32_t> m2{
-      static_cast<uint16_t>((mask_bits >> 32) & 0xFFFFu)};
-  const Mask512<uint32_t> m3{static_cast<__mmask16>(mask_bits >> 48)};
-  const Vec128<uint8_t> c0 = TruncateTo(dq, NativeCompress(v0, m0));
-  const Vec128<uint8_t> c1 = TruncateTo(dq, NativeCompress(v1, m1));
-  const Vec128<uint8_t> c2 = TruncateTo(dq, NativeCompress(v2, m2));
-  const Vec128<uint8_t> c3 = TruncateTo(dq, NativeCompress(v3, m3));
-  uint8_t* HWY_RESTRICT pos = unaligned;
-  StoreU(c0, dq, pos);
-  pos += CountTrue(d32, m0);
-  StoreU(c1, dq, pos);
-  pos += CountTrue(d32, m1);
-  StoreU(c2, dq, pos);
-  pos += CountTrue(d32, m2);
-  StoreU(c3, dq, pos);
-}
-
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)>
-HWY_INLINE void EmuCompressStore(VFromD<D> v, MFromD<D> mask, D d,
-                                 TFromD<D>* HWY_RESTRICT unaligned) {
-  const Repartition<int32_t, decltype(d)> di32;
-  const RebindToUnsigned<decltype(di32)> du32;
-  const Half<decltype(d)> dh;
-  const Vec512<uint32_t> promoted0 =
-      BitCast(du32, PromoteTo(di32, LowerHalf(dh, v)));
-  const Vec512<uint32_t> promoted1 =
-      BitCast(du32, PromoteTo(di32, UpperHalf(dh, v)));
-
-  const uint64_t mask_bits{mask.raw};
-  const uint64_t maskL = mask_bits & 0xFFFF;
-  const uint64_t maskH = mask_bits >> 16;
-  const Mask512<uint32_t> mask0{static_cast<__mmask16>(maskL)};
-  const Mask512<uint32_t> mask1{static_cast<__mmask16>(maskH)};
-  const Vec512<uint32_t> compressed0 = NativeCompress(promoted0, mask0);
-  const Vec512<uint32_t> compressed1 = NativeCompress(promoted1, mask1);
-
-  const Vec256<uint16_t> demoted0 = DemoteTo(dh, BitCast(di32, compressed0));
-  const Vec256<uint16_t> demoted1 = DemoteTo(dh, BitCast(di32, compressed1));
-
-  // Store 256-bit halves
-  StoreU(demoted0, dh, unaligned);
-  StoreU(demoted1, dh, unaligned + PopCount(maskL));
-}
-
-// Finally, the remaining EmuCompress for wide vectors, using EmuCompressStore.
-template <typename T>  // 1 or 2 bytes
-HWY_INLINE Vec512<T> EmuCompress(Vec512<T> v, Mask512<T> mask) {
-  const DFromV<decltype(v)> d;
-  alignas(64) T buf[2 * Lanes(d)];
-  EmuCompressStore(v, mask, d, buf);
-  return Load(d, buf);
-}
-
-HWY_INLINE Vec256<uint8_t> EmuCompress(Vec256<uint8_t> v,
-                                       const Mask256<uint8_t> mask) {
-  const DFromV<decltype(v)> d;
-  alignas(32) uint8_t buf[2 * 32 / sizeof(uint8_t)];
-  EmuCompressStore(v, mask, d, buf);
-  return Load(d, buf);
-}
-
-}  // namespace detail
-
-template <class V, class M, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
-HWY_API V Compress(V v, const M mask) {
-  const DFromV<decltype(v)> d;
-  const RebindToUnsigned<decltype(d)> du;
-  const auto mu = RebindMask(du, mask);
-#if HWY_TARGET <= HWY_AVX3_DL  // VBMI2
-  return BitCast(d, detail::NativeCompress(BitCast(du, v), mu));
-#else
-  return BitCast(d, detail::EmuCompress(BitCast(du, v), mu));
-#endif
-}
-
-template <class V, class M, HWY_IF_T_SIZE_V(V, 4)>
-HWY_API V Compress(V v, const M mask) {
-  const DFromV<decltype(v)> d;
-  const RebindToUnsigned<decltype(d)> du;
-  const auto mu = RebindMask(du, mask);
-  return BitCast(d, detail::NativeCompress(BitCast(du, v), mu));
-}
-
 template <typename T, HWY_IF_T_SIZE(T, 8)>
 HWY_API Vec512<T> Compress(Vec512<T> v, Mask512<T> mask) {
   // See CompressIsPartition. u64 is faster than u32.
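The 1-, 2- and 4-byte Compress overloads and the EmuCompress fallback are removed from x86_512-inl.h here; they appear to have been consolidated elsewhere in this release, and the public API is unchanged. A minimal caller-side sketch, assuming Highway's static-dispatch mode and the documented Compress op; the function name is illustrative and not part of the package:

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Keeps the lanes of v selected by mask, packed toward lane 0; the remaining
// lanes are unspecified. Dispatches to NativeCompress (VBMI2) or a fallback.
hn::Vec<hn::ScalableTag<uint16_t>> KeepSelected(
    hn::Vec<hn::ScalableTag<uint16_t>> v,
    hn::Mask<hn::ScalableTag<uint16_t>> mask) {
  return hn::Compress(v, mask);
}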
@@ -7222,6 +7021,56 @@ HWY_API Vec512<T> Compress(Vec512<T> v, Mask512<T> mask) {
 
 // ------------------------------ Expand
 
+namespace detail {
+
+#if HWY_TARGET <= HWY_AVX3_DL  // VBMI2
+HWY_INLINE Vec512<uint8_t> NativeExpand(Vec512<uint8_t> v,
+                                        Mask512<uint8_t> mask) {
+  return Vec512<uint8_t>{_mm512_maskz_expand_epi8(mask.raw, v.raw)};
+}
+
+HWY_INLINE Vec512<uint16_t> NativeExpand(Vec512<uint16_t> v,
+                                         Mask512<uint16_t> mask) {
+  return Vec512<uint16_t>{_mm512_maskz_expand_epi16(mask.raw, v.raw)};
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U8_D(D)>
+HWY_INLINE VFromD<D> NativeLoadExpand(Mask512<uint8_t> mask, D /* d */,
+                                      const uint8_t* HWY_RESTRICT unaligned) {
+  return VFromD<D>{_mm512_maskz_expandloadu_epi8(mask.raw, unaligned)};
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)>
+HWY_INLINE VFromD<D> NativeLoadExpand(Mask512<uint16_t> mask, D /* d */,
+                                      const uint16_t* HWY_RESTRICT unaligned) {
+  return VFromD<D>{_mm512_maskz_expandloadu_epi16(mask.raw, unaligned)};
+}
+#endif  // HWY_TARGET <= HWY_AVX3_DL
+
+HWY_INLINE Vec512<uint32_t> NativeExpand(Vec512<uint32_t> v,
+                                         Mask512<uint32_t> mask) {
+  return Vec512<uint32_t>{_mm512_maskz_expand_epi32(mask.raw, v.raw)};
+}
+
+HWY_INLINE Vec512<uint64_t> NativeExpand(Vec512<uint64_t> v,
+                                         Mask512<uint64_t> mask) {
+  return Vec512<uint64_t>{_mm512_maskz_expand_epi64(mask.raw, v.raw)};
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U32_D(D)>
+HWY_INLINE VFromD<D> NativeLoadExpand(Mask512<uint32_t> mask, D /* d */,
+                                      const uint32_t* HWY_RESTRICT unaligned) {
+  return VFromD<D>{_mm512_maskz_expandloadu_epi32(mask.raw, unaligned)};
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U64_D(D)>
+HWY_INLINE VFromD<D> NativeLoadExpand(Mask512<uint64_t> mask, D /* d */,
+                                      const uint64_t* HWY_RESTRICT unaligned) {
+  return VFromD<D>{_mm512_maskz_expandloadu_epi64(mask.raw, unaligned)};
+}
+
+}  // namespace detail
+
 template <typename T, HWY_IF_T_SIZE(T, 1)>
 HWY_API Vec512<T> Expand(Vec512<T> v, const Mask512<T> mask) {
   const Full512<T> d;
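The added NativeExpand/NativeLoadExpand hooks map Expand and LoadExpand onto AVX-512 expand instructions, including the VBMI2 byte/word forms. A minimal caller-side sketch, assuming the public Expand/LoadExpand API referenced in this hunk and static dispatch; the function name is illustrative:

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Reads CountTrue(mask) packed bytes and scatters them to the lanes where
// mask is true; other lanes are zero. On a 512-bit AVX3_DL target this lowers
// to _mm512_maskz_expandloadu_epi8 via the NativeLoadExpand hook above.
void ExpandBytes(const uint8_t* HWY_RESTRICT packed,
                 hn::Mask<hn::ScalableTag<uint8_t>> mask,
                 uint8_t* HWY_RESTRICT out) {
  const hn::ScalableTag<uint8_t> d;
  const auto v = hn::LoadExpand(mask, d, packed);
  hn::StoreU(v, d, out);
}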
@@ -7233,7 +7082,7 @@ HWY_API Vec512<T> Expand(Vec512<T> v, const Mask512<T> mask) {
   // LUTs are infeasible for 2^64 possible masks, so splice together two
   // half-vector Expand.
   const Full256<T> dh;
-  constexpr size_t N =
+  constexpr size_t N = MaxLanes(d);
   // We have to shift the input by a variable number of u8. Shuffling requires
   // VBMI2, in which case we would already have NativeExpand. We instead
   // load at an offset, which may incur a store to load forwarding stall.
@@ -7261,10 +7110,10 @@ HWY_API Vec512<T> Expand(Vec512<T> v, const Mask512<T> mask) {
   // LUTs are infeasible for 2^32 possible masks, so splice together two
   // half-vector Expand.
   const Full256<T> dh;
-
+  HWY_LANES_CONSTEXPR size_t N = Lanes(d);
   using Bits = typename Mask256<T>::Raw;
   const Mask256<T> maskL{
-      static_cast<Bits>(mask.raw & Bits
+      static_cast<Bits>(mask.raw & static_cast<Bits>((1ULL << (N / 2)) - 1))};
   const Mask256<T> maskH{static_cast<Bits>(mask.raw >> (N / 2))};
   // In AVX3 we can permutevar, which avoids a potential store to load
   // forwarding stall vs. reloading the input.
@@ -7336,11 +7185,6 @@ HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
 
 // ------------------------------ CompressNot
 
-template <class V, class M, HWY_IF_NOT_T_SIZE_V(V, 8)>
-HWY_API V CompressNot(V v, const M mask) {
-  return Compress(v, Not(mask));
-}
-
 template <typename T, HWY_IF_T_SIZE(T, 8)>
 HWY_API Vec512<T> CompressNot(Vec512<T> v, Mask512<T> mask) {
   // See CompressIsPartition. u64 is faster than u32.
@@ -7403,102 +7247,6 @@ HWY_API Vec512<T> CompressNot(Vec512<T> v, Mask512<T> mask) {
   return TableLookupLanes(v, indices);
 }
 
-// uint64_t lanes. Only implement for 256 and 512-bit vectors because this is a
-// no-op for 128-bit.
-template <class V, class M, HWY_IF_V_SIZE_GT_D(DFromV<V>, 16)>
-HWY_API V CompressBlocksNot(V v, M mask) {
-  return CompressNot(v, mask);
-}
-
-// ------------------------------ CompressBits
-template <class V>
-HWY_API V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) {
-  return Compress(v, LoadMaskBits(DFromV<V>(), bits));
-}
-
-// ------------------------------ CompressStore
-
-// Generic for all vector lengths.
-
-template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
-HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d,
-                             TFromD<D>* HWY_RESTRICT unaligned) {
-#if HWY_TARGET == HWY_AVX3_ZEN4
-  StoreU(Compress(v, mask), d, unaligned);
-#else
-  const RebindToUnsigned<decltype(d)> du;
-  const auto mu = RebindMask(du, mask);
-  auto pu = reinterpret_cast<TFromD<decltype(du)> * HWY_RESTRICT>(unaligned);
-
-#if HWY_TARGET <= HWY_AVX3_DL  // VBMI2
-  detail::NativeCompressStore(BitCast(du, v), mu, pu);
-#else
-  detail::EmuCompressStore(BitCast(du, v), mu, du, pu);
-#endif
-#endif  // HWY_TARGET != HWY_AVX3_ZEN4
-  const size_t count = CountTrue(d, mask);
-  detail::MaybeUnpoison(unaligned, count);
-  return count;
-}
-
-template <class D, HWY_IF_NOT_FLOAT_D(D),
-          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
-HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d,
-                             TFromD<D>* HWY_RESTRICT unaligned) {
-#if HWY_TARGET == HWY_AVX3_ZEN4
-  StoreU(Compress(v, mask), d, unaligned);
-#else
-  const RebindToUnsigned<decltype(d)> du;
-  const auto mu = RebindMask(du, mask);
-  using TU = TFromD<decltype(du)>;
-  TU* HWY_RESTRICT pu = reinterpret_cast<TU*>(unaligned);
-  detail::NativeCompressStore(BitCast(du, v), mu, pu);
-#endif  // HWY_TARGET != HWY_AVX3_ZEN4
-  const size_t count = CountTrue(d, mask);
-  detail::MaybeUnpoison(unaligned, count);
-  return count;
-}
-
-// Additional overloads to avoid casting to uint32_t (delay?).
-template <class D, HWY_IF_FLOAT3264_D(D)>
-HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d,
-                             TFromD<D>* HWY_RESTRICT unaligned) {
-#if HWY_TARGET == HWY_AVX3_ZEN4
-  StoreU(Compress(v, mask), d, unaligned);
-#else
-  (void)d;
-  detail::NativeCompressStore(v, mask, unaligned);
-#endif  // HWY_TARGET != HWY_AVX3_ZEN4
-  const size_t count = PopCount(uint64_t{mask.raw});
-  detail::MaybeUnpoison(unaligned, count);
-  return count;
-}
-
-// ------------------------------ CompressBlendedStore
-template <class D, HWY_IF_V_SIZE_GT_D(D, 8)>
-HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
-                                    TFromD<D>* HWY_RESTRICT unaligned) {
-  // Native CompressStore already does the blending at no extra cost (latency
-  // 11, rthroughput 2 - same as compress plus store).
-  if (HWY_TARGET == HWY_AVX3_DL ||
-      (HWY_TARGET != HWY_AVX3_ZEN4 && sizeof(TFromD<D>) > 2)) {
-    return CompressStore(v, m, d, unaligned);
-  } else {
-    const size_t count = CountTrue(d, m);
-    BlendedStore(Compress(v, m), FirstN(d, count), d, unaligned);
-    detail::MaybeUnpoison(unaligned, count);
-    return count;
-  }
-}
-
-// ------------------------------ CompressBitsStore
-// Generic for all vector lengths.
-template <class D>
-HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
-                                 D d, TFromD<D>* HWY_RESTRICT unaligned) {
-  return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
-}
-
 // ------------------------------ LoadInterleaved4
 
 // Actually implemented in generic_ops, we just overload LoadTransposedBlocks4.
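The CompressStore, CompressBlendedStore, CompressBits and CompressBitsStore overloads are likewise dropped from this header, presumably also consolidated into shared AVX3 code. A minimal sketch of the kind of stream filtering these ops serve, assuming the public CompressStore API and static dispatch; names and the scalar tail handling are illustrative:

#include <stddef.h>
#include <stdint.h>

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Copies elements of `in` greater than `threshold` to `out`, packed, and
// returns how many were written. `out` must have room for `count` elements
// because CompressStore may pad its store up to a full vector.
size_t FilterGreater(const int32_t* HWY_RESTRICT in, size_t count,
                     int32_t threshold, int32_t* HWY_RESTRICT out) {
  const hn::ScalableTag<int32_t> d;
  const size_t N = hn::Lanes(d);
  size_t written = 0;
  size_t i = 0;
  for (; i + N <= count; i += N) {
    const auto v = hn::LoadU(d, in + i);
    const auto keep = hn::Gt(v, hn::Set(d, threshold));
    written += hn::CompressStore(v, keep, d, out + written);
  }
  // Remainder lanes handled by scalar code to keep the sketch short.
  for (; i < count; ++i) {
    if (in[i] > threshold) out[written++] = in[i];
  }
  return written;
}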
@@ -7532,7 +7280,7 @@ Vec512<double> Shuffle128(const Vec512<double> lo, const Vec512<double> hi) {
 template <class D, HWY_IF_V_SIZE_D(D, 64)>
 HWY_API void LoadTransposedBlocks3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                                    VFromD<D>& A, VFromD<D>& B, VFromD<D>& C) {
-
+  HWY_LANES_CONSTEXPR size_t N = Lanes(d);
   const VFromD<D> v3210 = LoadU(d, unaligned + 0 * N);
   const VFromD<D> v7654 = LoadU(d, unaligned + 1 * N);
   const VFromD<D> vba98 = LoadU(d, unaligned + 2 * N);
@@ -7559,7 +7307,7 @@ template <class D, HWY_IF_V_SIZE_D(D, 64)>
 HWY_API void LoadTransposedBlocks4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                                    VFromD<D>& vA, VFromD<D>& vB, VFromD<D>& vC,
                                    VFromD<D>& vD) {
-
+  HWY_LANES_CONSTEXPR size_t N = Lanes(d);
   const VFromD<D> v3210 = LoadU(d, unaligned + 0 * N);
   const VFromD<D> v7654 = LoadU(d, unaligned + 1 * N);
   const VFromD<D> vba98 = LoadU(d, unaligned + 2 * N);
@@ -7592,7 +7340,7 @@ namespace detail {
 template <class D, HWY_IF_V_SIZE_D(D, 64)>
 HWY_API void StoreTransposedBlocks2(const VFromD<D> i, const VFromD<D> j, D d,
                                     TFromD<D>* HWY_RESTRICT unaligned) {
-
+  HWY_LANES_CONSTEXPR size_t N = Lanes(d);
   const auto j1_j0_i1_i0 = detail::Shuffle128<_MM_PERM_BABA>(i, j);
   const auto j3_j2_i3_i2 = detail::Shuffle128<_MM_PERM_DCDC>(i, j);
   const auto j1_i1_j0_i0 =
@@ -7615,7 +7363,7 @@ template <class D, HWY_IF_V_SIZE_D(D, 64)>
 HWY_API void StoreTransposedBlocks3(const VFromD<D> i, const VFromD<D> j,
                                     const VFromD<D> k, D d,
                                     TFromD<D>* HWY_RESTRICT unaligned) {
-
+  HWY_LANES_CONSTEXPR size_t N = Lanes(d);
   const VFromD<D> j2_j0_i2_i0 = detail::Shuffle128<_MM_PERM_CACA>(i, j);
   const VFromD<D> i3_i1_k2_k0 = detail::Shuffle128<_MM_PERM_DBCA>(k, i);
   const VFromD<D> j3_j1_k3_k1 = detail::Shuffle128<_MM_PERM_DBDB>(k, j);
@@ -7646,7 +7394,7 @@ template <class D, HWY_IF_V_SIZE_D(D, 64)>
 HWY_API void StoreTransposedBlocks4(const VFromD<D> i, const VFromD<D> j,
                                     const VFromD<D> k, const VFromD<D> l, D d,
                                     TFromD<D>* HWY_RESTRICT unaligned) {
-
+  HWY_LANES_CONSTEXPR size_t N = Lanes(d);
   const VFromD<D> j1_j0_i1_i0 = detail::Shuffle128<_MM_PERM_BABA>(i, j);
   const VFromD<D> l1_l0_k1_k0 = detail::Shuffle128<_MM_PERM_BABA>(k, l);
   const VFromD<D> j3_j2_i3_i2 = detail::Shuffle128<_MM_PERM_DCDC>(i, j);
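These hunks only introduce HWY_LANES_CONSTEXPR locals for N = Lanes(d) inside the block-transpose helpers that back the public (Load|Store)InterleavedN ops. A minimal sketch of the caller-facing use, assuming the documented StoreInterleaved3 op and static dispatch; names are illustrative and the tail loop is omitted:

#include <stddef.h>
#include <stdint.h>

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Packs planar R/G/B into interleaved RGB triples, a full vector at a time.
void InterleaveRGB(const uint8_t* HWY_RESTRICT r, const uint8_t* HWY_RESTRICT g,
                   const uint8_t* HWY_RESTRICT b, size_t count,
                   uint8_t* HWY_RESTRICT rgb) {
  const hn::ScalableTag<uint8_t> d;
  const size_t N = hn::Lanes(d);
  for (size_t i = 0; i + N <= count; i += N) {
    const auto vr = hn::LoadU(d, r + i);
    const auto vg = hn::LoadU(d, g + i);
    const auto vb = hn::LoadU(d, b + i);
    hn::StoreInterleaved3(vr, vg, vb, d, rgb + 3 * i);  // writes 3 * N bytes
  }
  // A full implementation would also handle the remaining count % N pixels.
}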
@@ -7805,6 +7553,17 @@ HWY_API Vec512<int64_t> operator>>(const Vec512<int64_t> v,
 }
 
 // ------------------------------ WidenMulPairwiseAdd
+
+#if HWY_NATIVE_DOT_BF16
+template <class DF, HWY_IF_F32_D(DF), HWY_IF_V_SIZE_D(DF, 64),
+          class VBF = VFromD<Repartition<bfloat16_t, DF>>>
+HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
+  return VFromD<DF>{_mm512_dpbf16_ps(Zero(df).raw,
+                                     reinterpret_cast<__m512bh>(a.raw),
+                                     reinterpret_cast<__m512bh>(b.raw))};
+}
+#endif  // HWY_NATIVE_DOT_BF16
+
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I32_D(D)>
 HWY_API VFromD<D> WidenMulPairwiseAdd(D /*d32*/, Vec512<int16_t> a,
                                       Vec512<int16_t> b) {
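The new overload lowers WidenMulPairwiseAdd for bfloat16 inputs to _mm512_dpbf16_ps when HWY_NATIVE_DOT_BF16 is set; the pre-existing int16_t overload follows in the context lines above. A minimal sketch of the same op for the int16_t to int32_t case, assuming static dispatch; the function name is illustrative:

#include <stdint.h>

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Lane i of the result is a[2i] * b[2i] + a[2i+1] * b[2i+1], widened to i32.
hn::Vec<hn::Repartition<int32_t, hn::ScalableTag<int16_t>>> PairwiseDot(
    hn::Vec<hn::ScalableTag<int16_t>> a, hn::Vec<hn::ScalableTag<int16_t>> b) {
  const hn::ScalableTag<int16_t> d16;
  const hn::Repartition<int32_t, decltype(d16)> d32;
  return hn::WidenMulPairwiseAdd(d32, a, b);
}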
@@ -7923,117 +7682,34 @@ HWY_API V BitShuffle(V v, VI idx) {
 }
 #endif  // HWY_TARGET <= HWY_AVX3_DL
 
-//
-
-template <class V, HWY_IF_UI32(TFromV<V>), HWY_IF_V_SIZE_V(V, 64)>
-HWY_API V LeadingZeroCount(V v) {
-  return V{_mm512_lzcnt_epi32(v.raw)};
-}
+// ------------------------------ MultiRotateRight
 
-
-HWY_API V LeadingZeroCount(V v) {
-  return V{_mm512_lzcnt_epi64(v.raw)};
-}
-
-namespace detail {
-
-template <class V, HWY_IF_UNSIGNED_V(V),
-          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
-          HWY_IF_LANES_LE_D(DFromV<V>, 16)>
-HWY_INLINE V Lzcnt32ForU8OrU16(V v) {
-  const DFromV<decltype(v)> d;
-  const Rebind<int32_t, decltype(d)> di32;
-  const Rebind<uint32_t, decltype(d)> du32;
-
-  const auto v_lz_count = LeadingZeroCount(PromoteTo(du32, v));
-  return DemoteTo(d, BitCast(di32, v_lz_count));
-}
-
-template <class V, HWY_IF_UNSIGNED_V(V),
-          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
-          HWY_IF_LANES_D(DFromV<V>, 32)>
-HWY_INLINE VFromD<Rebind<uint16_t, DFromV<V>>> Lzcnt32ForU8OrU16AsU16(V v) {
-  const DFromV<decltype(v)> d;
-  const Half<decltype(d)> dh;
-  const Rebind<int32_t, decltype(dh)> di32;
-  const Rebind<uint32_t, decltype(dh)> du32;
-  const Rebind<uint16_t, decltype(d)> du16;
-
-  const auto lo_v_lz_count =
-      LeadingZeroCount(PromoteTo(du32, LowerHalf(dh, v)));
-  const auto hi_v_lz_count =
-      LeadingZeroCount(PromoteTo(du32, UpperHalf(dh, v)));
-  return OrderedDemote2To(du16, BitCast(di32, lo_v_lz_count),
-                          BitCast(di32, hi_v_lz_count));
-}
-
-HWY_INLINE Vec256<uint8_t> Lzcnt32ForU8OrU16(Vec256<uint8_t> v) {
-  const DFromV<decltype(v)> d;
-  const Rebind<int16_t, decltype(d)> di16;
-  return DemoteTo(d, BitCast(di16, Lzcnt32ForU8OrU16AsU16(v)));
-}
-
-HWY_INLINE Vec512<uint8_t> Lzcnt32ForU8OrU16(Vec512<uint8_t> v) {
-  const DFromV<decltype(v)> d;
-  const Half<decltype(d)> dh;
-  const Rebind<int16_t, decltype(dh)> di16;
+#if HWY_TARGET <= HWY_AVX3_DL
 
-
-
+#ifdef HWY_NATIVE_MULTIROTATERIGHT
+#undef HWY_NATIVE_MULTIROTATERIGHT
+#else
+#define HWY_NATIVE_MULTIROTATERIGHT
+#endif
 
-
-
-
+template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>),
+          HWY_IF_V_SIZE_V(V, 64), HWY_IF_V_SIZE_V(VI, HWY_MAX_LANES_V(V) * 8)>
+HWY_API V MultiRotateRight(V v, VI idx) {
+  return V{_mm512_multishift_epi64_epi8(idx.raw, v.raw)};
 }
 
-
-  return Lzcnt32ForU8OrU16AsU16(v);
-}
+#endif
 
-
+// -------------------- LeadingZeroCount
 
-template <class V,
-          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
+template <class V, HWY_IF_UI32(TFromV<V>), HWY_IF_V_SIZE_V(V, 64)>
 HWY_API V LeadingZeroCount(V v) {
-
-  const RebindToUnsigned<decltype(d)> du;
-  using TU = TFromD<decltype(du)>;
-
-  constexpr TU kNumOfBitsInT{sizeof(TU) * 8};
-  const auto v_lzcnt32 = detail::Lzcnt32ForU8OrU16(BitCast(du, v));
-  return BitCast(d, Min(v_lzcnt32 - Set(du, TU{32 - kNumOfBitsInT}),
-                        Set(du, TU{kNumOfBitsInT})));
-}
-
-template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
-          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
-HWY_API V HighestSetBitIndex(V v) {
-  const DFromV<decltype(v)> d;
-  const RebindToUnsigned<decltype(d)> du;
-  using TU = TFromD<decltype(du)>;
-  return BitCast(d,
-                 Set(du, TU{31}) - detail::Lzcnt32ForU8OrU16(BitCast(du, v)));
-}
-
-template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
-          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))>
-HWY_API V HighestSetBitIndex(V v) {
-  const DFromV<decltype(v)> d;
-  using T = TFromD<decltype(d)>;
-  return BitCast(d, Set(d, T{sizeof(T) * 8 - 1}) - LeadingZeroCount(v));
+  return V{_mm512_lzcnt_epi32(v.raw)};
 }
 
-template <class V,
-HWY_API V
-
-  const RebindToSigned<decltype(d)> di;
-  using T = TFromD<decltype(d)>;
-
-  const auto vi = BitCast(di, v);
-  const auto lowest_bit = BitCast(d, And(vi, Neg(vi)));
-  constexpr T kNumOfBitsInT{sizeof(T) * 8};
-  const auto bit_idx = HighestSetBitIndex(lowest_bit);
-  return IfThenElse(MaskFromVec(bit_idx), Set(d, kNumOfBitsInT), bit_idx);
+template <class V, HWY_IF_UI64(TFromV<V>), HWY_IF_V_SIZE_V(V, 64)>
+HWY_API V LeadingZeroCount(V v) {
+  return V{_mm512_lzcnt_epi64(v.raw)};
 }
 
 // NOLINTNEXTLINE(google-readability-namespace-comments)