@img/sharp-libvips-dev 1.2.0 → 1.2.2-rc.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. package/include/ffi.h +3 -3
  2. package/include/harfbuzz/hb-deprecated.h +4 -4
  3. package/include/harfbuzz/hb-font.h +120 -9
  4. package/include/harfbuzz/hb-version.h +3 -3
  5. package/include/hwy/abort.h +2 -19
  6. package/include/hwy/aligned_allocator.h +11 -7
  7. package/include/hwy/auto_tune.h +504 -0
  8. package/include/hwy/base.h +425 -104
  9. package/include/hwy/cache_control.h +16 -0
  10. package/include/hwy/detect_compiler_arch.h +32 -1
  11. package/include/hwy/detect_targets.h +251 -67
  12. package/include/hwy/foreach_target.h +35 -0
  13. package/include/hwy/highway.h +185 -76
  14. package/include/hwy/nanobenchmark.h +1 -19
  15. package/include/hwy/ops/arm_neon-inl.h +969 -458
  16. package/include/hwy/ops/arm_sve-inl.h +1137 -359
  17. package/include/hwy/ops/emu128-inl.h +97 -11
  18. package/include/hwy/ops/generic_ops-inl.h +1222 -34
  19. package/include/hwy/ops/loongarch_lasx-inl.h +4664 -0
  20. package/include/hwy/ops/loongarch_lsx-inl.h +5933 -0
  21. package/include/hwy/ops/ppc_vsx-inl.h +306 -126
  22. package/include/hwy/ops/rvv-inl.h +546 -51
  23. package/include/hwy/ops/scalar-inl.h +77 -22
  24. package/include/hwy/ops/set_macros-inl.h +138 -17
  25. package/include/hwy/ops/shared-inl.h +50 -10
  26. package/include/hwy/ops/wasm_128-inl.h +137 -92
  27. package/include/hwy/ops/x86_128-inl.h +773 -214
  28. package/include/hwy/ops/x86_256-inl.h +712 -255
  29. package/include/hwy/ops/x86_512-inl.h +429 -753
  30. package/include/hwy/ops/x86_avx3-inl.h +501 -0
  31. package/include/hwy/per_target.h +2 -1
  32. package/include/hwy/profiler.h +622 -486
  33. package/include/hwy/targets.h +62 -20
  34. package/include/hwy/timer-inl.h +8 -160
  35. package/include/hwy/timer.h +170 -3
  36. package/include/hwy/x86_cpuid.h +81 -0
  37. package/include/libheif/heif_cxx.h +25 -5
  38. package/include/libheif/heif_regions.h +5 -5
  39. package/include/libheif/heif_version.h +2 -2
  40. package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
  41. package/include/libxml2/libxml/xmlversion.h +4 -4
  42. package/include/pango-1.0/pango/pango-enum-types.h +3 -0
  43. package/include/pango-1.0/pango/pango-features.h +3 -3
  44. package/include/pango-1.0/pango/pango-font.h +30 -0
  45. package/include/pango-1.0/pango/pango-version-macros.h +26 -0
  46. package/include/pixman-1/pixman-version.h +2 -2
  47. package/include/webp/decode.h +11 -2
  48. package/include/webp/demux.h +2 -0
  49. package/include/webp/encode.h +2 -0
  50. package/include/webp/mux_types.h +1 -0
  51. package/include/webp/sharpyuv/sharpyuv.h +1 -1
  52. package/include/webp/types.h +2 -2
  53. package/include/zlib.h +3 -3
  54. package/package.json +1 -1
  55. package/versions.json +11 -11
@@ -172,6 +172,10 @@ class Vec512 {
  template <typename T>
  struct Mask512 {
  using Raw = typename detail::RawMask512<sizeof(T)>::type;
+
+ using PrivateT = T; // only for DFromM
+ static constexpr size_t kPrivateN = 64 / sizeof(T); // only for DFromM
+
  Raw raw;
  };
 
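The two new members only exist so that the element type and lane count can be recovered from a mask type at compile time. A minimal sketch of the alias that presumably consumes them (hypothetical here, mirroring how DFromV is built from the same members of Vec512):

  template <class M>
  using DFromM = Simd<typename M::PrivateT, M::kPrivateN, 0>;  // sketch only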
@@ -1338,20 +1342,7 @@ HWY_API Vec512<int64_t> ShiftLeft(const Vec512<int64_t> v) {
  return Vec512<int64_t>{_mm512_slli_epi64(v.raw, kBits)};
  }
 
- #if HWY_TARGET <= HWY_AVX3_DL
-
- // Generic for all vector lengths. Must be defined after all GaloisAffine.
- template <int kBits, class V, HWY_IF_T_SIZE_V(V, 1)>
- HWY_API V ShiftLeft(const V v) {
- const Repartition<uint64_t, DFromV<V>> du64;
- if (kBits == 0) return v;
- if (kBits == 1) return v + v;
- constexpr uint64_t kMatrix = (0x0102040810204080ULL >> kBits) &
- (0x0101010101010101ULL * (0xFF >> kBits));
- return detail::GaloisAffine(v, Set(du64, kMatrix));
- }
-
- #else // HWY_TARGET > HWY_AVX3_DL
+ #if HWY_TARGET > HWY_AVX3_DL
 
  template <int kBits, typename T, HWY_IF_T_SIZE(T, 1)>
  HWY_API Vec512<T> ShiftLeft(const Vec512<T> v) {
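The removed 8-bit ShiftLeft (and the ShiftRight/RotateRight variants in the following hunks) builds a per-qword 8x8 bit matrix for GaloisAffine; each byte of the constant is one matrix row. The constant itself is plain compile-time arithmetic; a standalone sketch that merely evaluates the removed formula (it does not emulate GF2P8AFFINEQB):

  #include <cstdint>
  #include <cstdio>

  constexpr uint64_t ShiftLeftMatrix(int kBits) {
    return (0x0102040810204080ULL >> kBits) &
           (0x0101010101010101ULL * (0xFF >> kBits));
  }

  int main() {
    for (int kBits = 0; kBits < 8; ++kBits) {
      std::printf("kBits=%d matrix=%016llx\n", kBits,
                  static_cast<unsigned long long>(ShiftLeftMatrix(kBits)));
    }
    return 0;
  }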
@@ -1397,33 +1388,7 @@ HWY_API Vec512<int64_t> ShiftRight(const Vec512<int64_t> v) {
1397
1388
  return Vec512<int64_t>{_mm512_srai_epi64(v.raw, kBits)};
1398
1389
  }
1399
1390
 
1400
- #if HWY_TARGET <= HWY_AVX3_DL
1401
-
1402
- // Generic for all vector lengths. Must be defined after all GaloisAffine.
1403
- template <int kBits, class V, HWY_IF_U8_D(DFromV<V>)>
1404
- HWY_API V ShiftRight(const V v) {
1405
- const Repartition<uint64_t, DFromV<V>> du64;
1406
- if (kBits == 0) return v;
1407
- constexpr uint64_t kMatrix =
1408
- (0x0102040810204080ULL << kBits) &
1409
- (0x0101010101010101ULL * ((0xFF << kBits) & 0xFF));
1410
- return detail::GaloisAffine(v, Set(du64, kMatrix));
1411
- }
1412
-
1413
- // Generic for all vector lengths. Must be defined after all GaloisAffine.
1414
- template <int kBits, class V, HWY_IF_I8_D(DFromV<V>)>
1415
- HWY_API V ShiftRight(const V v) {
1416
- const Repartition<uint64_t, DFromV<V>> du64;
1417
- if (kBits == 0) return v;
1418
- constexpr uint64_t kShift =
1419
- (0x0102040810204080ULL << kBits) &
1420
- (0x0101010101010101ULL * ((0xFF << kBits) & 0xFF));
1421
- constexpr uint64_t kSign =
1422
- kBits == 0 ? 0 : (0x8080808080808080ULL >> (64 - (8 * kBits)));
1423
- return detail::GaloisAffine(v, Set(du64, kShift | kSign));
1424
- }
1425
-
1426
- #else // HWY_TARGET > HWY_AVX3_DL
1391
+ #if HWY_TARGET > HWY_AVX3_DL
1427
1392
 
1428
1393
  template <int kBits>
1429
1394
  HWY_API Vec512<uint8_t> ShiftRight(const Vec512<uint8_t> v) {
@@ -1446,26 +1411,7 @@ HWY_API Vec512<int8_t> ShiftRight(const Vec512<int8_t> v) {
1446
1411
 
1447
1412
  // ------------------------------ RotateRight
1448
1413
 
1449
- #if HWY_TARGET <= HWY_AVX3_DL
1450
- // U8 RotateRight is generic for all vector lengths on AVX3_DL
1451
- template <int kBits, class V, HWY_IF_U8(TFromV<V>)>
1452
- HWY_API V RotateRight(V v) {
1453
- static_assert(0 <= kBits && kBits < 8, "Invalid shift count");
1454
-
1455
- const Repartition<uint64_t, DFromV<V>> du64;
1456
- if (kBits == 0) return v;
1457
-
1458
- constexpr uint64_t kShrMatrix =
1459
- (0x0102040810204080ULL << kBits) &
1460
- (0x0101010101010101ULL * ((0xFF << kBits) & 0xFF));
1461
- constexpr int kShlBits = (-kBits) & 7;
1462
- constexpr uint64_t kShlMatrix = (0x0102040810204080ULL >> kShlBits) &
1463
- (0x0101010101010101ULL * (0xFF >> kShlBits));
1464
- constexpr uint64_t kMatrix = kShrMatrix | kShlMatrix;
1465
-
1466
- return detail::GaloisAffine(v, Set(du64, kMatrix));
1467
- }
1468
- #else // HWY_TARGET > HWY_AVX3_DL
1414
+ #if HWY_TARGET > HWY_AVX3_DL
1469
1415
  template <int kBits>
1470
1416
  HWY_API Vec512<uint8_t> RotateRight(const Vec512<uint8_t> v) {
1471
1417
  static_assert(0 <= kBits && kBits < 8, "Invalid shift count");
@@ -1473,7 +1419,7 @@ HWY_API Vec512<uint8_t> RotateRight(const Vec512<uint8_t> v) {
  // AVX3 does not support 8-bit.
  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(7, 8 - kBits)>(v));
  }
- #endif // HWY_TARGET <= HWY_AVX3_DL
+ #endif // HWY_TARGET > HWY_AVX3_DL
 
  template <int kBits>
  HWY_API Vec512<uint16_t> RotateRight(const Vec512<uint16_t> v) {
@@ -1532,7 +1478,11 @@ HWY_API Vec512<T> Ror(Vec512<T> a, Vec512<T> b) {
  // ------------------------------ ShiftLeftSame
 
  // GCC <14 and Clang <11 do not follow the Intel documentation for AVX-512
- // shift-with-immediate: the counts should all be unsigned int.
+ // shift-with-immediate: the counts should all be unsigned int. Despite casting,
+ // we still see warnings in GCC debug builds, hence disable.
+ HWY_DIAGNOSTICS(push)
+ HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
+
  #if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100
  using Shift16Count = int;
  using Shift3264Count = int;
@@ -1696,6 +1646,8 @@ HWY_API Vec512<int8_t> ShiftRightSame(Vec512<int8_t> v, const int bits) {
  return (shifted ^ shifted_sign) - shifted_sign;
  }
 
+ HWY_DIAGNOSTICS(pop)
+
  // ------------------------------ Minimum
 
  // Unsigned
@@ -1782,15 +1734,70 @@ HWY_API Vec512<double> Max(Vec512<double> a, Vec512<double> b) {
1782
1734
  return Vec512<double>{_mm512_max_pd(a.raw, b.raw)};
1783
1735
  }
1784
1736
 
1785
- // ------------------------------ Integer multiplication
1737
+ // ------------------------------ MinNumber and MaxNumber
1738
+
1739
+ #if HWY_X86_HAVE_AVX10_2_OPS
1740
+
1741
+ #if HWY_HAVE_FLOAT16
1742
+ HWY_API Vec512<float16_t> MinNumber(Vec512<float16_t> a, Vec512<float16_t> b) {
1743
+ return Vec512<float16_t>{_mm512_minmax_ph(a.raw, b.raw, 0x14)};
1744
+ }
1745
+ #endif
1746
+ HWY_API Vec512<float> MinNumber(Vec512<float> a, Vec512<float> b) {
1747
+ return Vec512<float>{_mm512_minmax_ps(a.raw, b.raw, 0x14)};
1748
+ }
1749
+ HWY_API Vec512<double> MinNumber(Vec512<double> a, Vec512<double> b) {
1750
+ return Vec512<double>{_mm512_minmax_pd(a.raw, b.raw, 0x14)};
1751
+ }
1752
+
1753
+ #if HWY_HAVE_FLOAT16
1754
+ HWY_API Vec512<float16_t> MaxNumber(Vec512<float16_t> a, Vec512<float16_t> b) {
1755
+ return Vec512<float16_t>{_mm512_minmax_ph(a.raw, b.raw, 0x15)};
1756
+ }
1757
+ #endif
1758
+ HWY_API Vec512<float> MaxNumber(Vec512<float> a, Vec512<float> b) {
1759
+ return Vec512<float>{_mm512_minmax_ps(a.raw, b.raw, 0x15)};
1760
+ }
1761
+ HWY_API Vec512<double> MaxNumber(Vec512<double> a, Vec512<double> b) {
1762
+ return Vec512<double>{_mm512_minmax_pd(a.raw, b.raw, 0x15)};
1763
+ }
1764
+
1765
+ #endif
1766
+
1767
+ // ------------------------------ MinMagnitude and MaxMagnitude
1768
+
1769
+ #if HWY_X86_HAVE_AVX10_2_OPS
1770
+
1771
+ #if HWY_HAVE_FLOAT16
1772
+ HWY_API Vec512<float16_t> MinMagnitude(Vec512<float16_t> a,
1773
+ Vec512<float16_t> b) {
1774
+ return Vec512<float16_t>{_mm512_minmax_ph(a.raw, b.raw, 0x16)};
1775
+ }
1776
+ #endif
1777
+ HWY_API Vec512<float> MinMagnitude(Vec512<float> a, Vec512<float> b) {
1778
+ return Vec512<float>{_mm512_minmax_ps(a.raw, b.raw, 0x16)};
1779
+ }
1780
+ HWY_API Vec512<double> MinMagnitude(Vec512<double> a, Vec512<double> b) {
1781
+ return Vec512<double>{_mm512_minmax_pd(a.raw, b.raw, 0x16)};
1782
+ }
1783
+
1784
+ #if HWY_HAVE_FLOAT16
1785
+ HWY_API Vec512<float16_t> MaxMagnitude(Vec512<float16_t> a,
1786
+ Vec512<float16_t> b) {
1787
+ return Vec512<float16_t>{_mm512_minmax_ph(a.raw, b.raw, 0x17)};
1788
+ }
1789
+ #endif
1790
+ HWY_API Vec512<float> MaxMagnitude(Vec512<float> a, Vec512<float> b) {
1791
+ return Vec512<float>{_mm512_minmax_ps(a.raw, b.raw, 0x17)};
1792
+ }
1793
+ HWY_API Vec512<double> MaxMagnitude(Vec512<double> a, Vec512<double> b) {
1794
+ return Vec512<double>{_mm512_minmax_pd(a.raw, b.raw, 0x17)};
1795
+ }
1786
1796
 
1787
- // Per-target flag to prevent generic_ops-inl.h from defining 64-bit operator*.
1788
- #ifdef HWY_NATIVE_MUL_64
1789
- #undef HWY_NATIVE_MUL_64
1790
- #else
1791
- #define HWY_NATIVE_MUL_64
1792
1797
  #endif
1793
1798
 
1799
+ // ------------------------------ Integer multiplication
1800
+
1794
1801
  // Unsigned
1795
1802
  HWY_API Vec512<uint16_t> operator*(Vec512<uint16_t> a, Vec512<uint16_t> b) {
1796
1803
  return Vec512<uint16_t>{_mm512_mullo_epi16(a.raw, b.raw)};
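The new MinNumber/MaxNumber and MinMagnitude/MaxMagnitude ops differ from plain Min/Max mainly in how NaN and magnitude are handled. A rough scalar reference of the intended semantics (a sketch that ignores signed-zero ordering and signaling-NaN details):

  #include <algorithm>
  #include <cmath>

  // If exactly one operand is NaN, the other (numeric) operand is returned.
  float MinNumberRef(float a, float b) {
    if (std::isnan(a)) return b;
    if (std::isnan(b)) return a;
    return std::min(a, b);
  }

  // Picks the operand with the smaller absolute value.
  float MinMagnitudeRef(float a, float b) {
    if (std::isnan(a) || std::isnan(b)) return MinNumberRef(a, b);
    return std::fabs(b) < std::fabs(a) ? b : a;
  }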
@@ -1801,14 +1808,6 @@ HWY_API Vec512<uint32_t> operator*(Vec512<uint32_t> a, Vec512<uint32_t> b) {
1801
1808
  HWY_API Vec512<uint64_t> operator*(Vec512<uint64_t> a, Vec512<uint64_t> b) {
1802
1809
  return Vec512<uint64_t>{_mm512_mullo_epi64(a.raw, b.raw)};
1803
1810
  }
1804
- HWY_API Vec256<uint64_t> operator*(Vec256<uint64_t> a, Vec256<uint64_t> b) {
1805
- return Vec256<uint64_t>{_mm256_mullo_epi64(a.raw, b.raw)};
1806
- }
1807
- template <size_t N>
1808
- HWY_API Vec128<uint64_t, N> operator*(Vec128<uint64_t, N> a,
1809
- Vec128<uint64_t, N> b) {
1810
- return Vec128<uint64_t, N>{_mm_mullo_epi64(a.raw, b.raw)};
1811
- }
1812
1811
 
1813
1812
  // Signed
1814
1813
  HWY_API Vec512<int16_t> operator*(Vec512<int16_t> a, Vec512<int16_t> b) {
@@ -1820,14 +1819,7 @@ HWY_API Vec512<int32_t> operator*(Vec512<int32_t> a, Vec512<int32_t> b) {
1820
1819
  HWY_API Vec512<int64_t> operator*(Vec512<int64_t> a, Vec512<int64_t> b) {
1821
1820
  return Vec512<int64_t>{_mm512_mullo_epi64(a.raw, b.raw)};
1822
1821
  }
1823
- HWY_API Vec256<int64_t> operator*(Vec256<int64_t> a, Vec256<int64_t> b) {
1824
- return Vec256<int64_t>{_mm256_mullo_epi64(a.raw, b.raw)};
1825
- }
1826
- template <size_t N>
1827
- HWY_API Vec128<int64_t, N> operator*(Vec128<int64_t, N> a,
1828
- Vec128<int64_t, N> b) {
1829
- return Vec128<int64_t, N>{_mm_mullo_epi64(a.raw, b.raw)};
1830
- }
1822
+
1831
1823
  // Returns the upper 16 bits of a * b in each lane.
1832
1824
  HWY_API Vec512<uint16_t> MulHigh(Vec512<uint16_t> a, Vec512<uint16_t> b) {
1833
1825
  return Vec512<uint16_t>{_mm512_mulhi_epu16(a.raw, b.raw)};
@@ -1877,6 +1869,21 @@ HWY_API Vec512<double> operator*(Vec512<double> a, Vec512<double> b) {
1877
1869
  return Vec512<double>{_mm512_mul_pd(a.raw, b.raw)};
1878
1870
  }
1879
1871
 
1872
+ #if HWY_HAVE_FLOAT16
1873
+ HWY_API Vec512<float16_t> MulByFloorPow2(Vec512<float16_t> a,
1874
+ Vec512<float16_t> b) {
1875
+ return Vec512<float16_t>{_mm512_scalef_ph(a.raw, b.raw)};
1876
+ }
1877
+ #endif
1878
+
1879
+ HWY_API Vec512<float> MulByFloorPow2(Vec512<float> a, Vec512<float> b) {
1880
+ return Vec512<float>{_mm512_scalef_ps(a.raw, b.raw)};
1881
+ }
1882
+
1883
+ HWY_API Vec512<double> MulByFloorPow2(Vec512<double> a, Vec512<double> b) {
1884
+ return Vec512<double>{_mm512_scalef_pd(a.raw, b.raw)};
1885
+ }
1886
+
1880
1887
  #if HWY_HAVE_FLOAT16
1881
1888
  HWY_API Vec512<float16_t> operator/(Vec512<float16_t> a, Vec512<float16_t> b) {
1882
1889
  return Vec512<float16_t>{_mm512_div_ph(a.raw, b.raw)};
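MulByFloorPow2 maps to the scalef instructions, which scale a by two raised to the floor of b. A scalar sketch of that behavior for finite inputs (NaN/infinity handling omitted):

  #include <cmath>

  double MulByFloorPow2Ref(double a, double b) {
    // a * 2^floor(b); e.g. (3.0, 2.5) -> 12.0 and (3.0, -1.0) -> 1.5.
    return std::ldexp(a, static_cast<int>(std::floor(b)));
  }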
@@ -1903,6 +1910,31 @@ HWY_API Vec512<double> ApproximateReciprocal(Vec512<double> v) {
1903
1910
  return Vec512<double>{_mm512_rcp14_pd(v.raw)};
1904
1911
  }
1905
1912
 
1913
+ // ------------------------------ GetExponent
1914
+
1915
+ #if HWY_HAVE_FLOAT16
1916
+ template <class V, HWY_IF_F16(TFromV<V>), HWY_IF_V_SIZE_V(V, 64)>
1917
+ HWY_API V GetExponent(V v) {
1918
+ return V{_mm512_getexp_ph(v.raw)};
1919
+ }
1920
+ #endif
1921
+ template <class V, HWY_IF_F32(TFromV<V>), HWY_IF_V_SIZE_V(V, 64)>
1922
+ HWY_API V GetExponent(V v) {
1923
+ // Work around warnings in the intrinsic definitions (passing -1 as a mask).
1924
+ HWY_DIAGNOSTICS(push)
1925
+ HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
1926
+ return V{_mm512_getexp_ps(v.raw)};
1927
+ HWY_DIAGNOSTICS(pop)
1928
+ }
1929
+ template <class V, HWY_IF_F64(TFromV<V>), HWY_IF_V_SIZE_V(V, 64)>
1930
+ HWY_API V GetExponent(V v) {
1931
+ // Work around warnings in the intrinsic definitions (passing -1 as a mask).
1932
+ HWY_DIAGNOSTICS(push)
1933
+ HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
1934
+ return V{_mm512_getexp_pd(v.raw)};
1935
+ HWY_DIAGNOSTICS(pop)
1936
+ }
1937
+
1906
1938
  // ------------------------------ MaskedMinOr
1907
1939
 
1908
1940
  template <typename T, HWY_IF_U8(T)>
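GetExponent wraps the getexp instructions, which return each lane's unbiased exponent as a floating-point value, i.e. floor(log2(|x|)) for normal inputs. A scalar sketch:

  #include <cmath>

  double GetExponentRef(double x) {
    // e.g. 8.5 -> 3.0 and 0.75 -> -1.0; matches std::logb for normal values.
    return std::logb(x);
  }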
@@ -2625,74 +2657,54 @@ HWY_API Mask512<T> operator<=(Vec512<T> a, Vec512<T> b) {
2625
2657
 
2626
2658
  // ------------------------------ Mask
2627
2659
 
2628
- namespace detail {
2629
-
2630
- template <typename T>
2631
- HWY_INLINE Mask512<T> MaskFromVec(hwy::SizeTag<1> /*tag*/, Vec512<T> v) {
2660
+ template <typename T, HWY_IF_UI8(T)>
2661
+ HWY_API Mask512<T> MaskFromVec(Vec512<T> v) {
2632
2662
  return Mask512<T>{_mm512_movepi8_mask(v.raw)};
2633
2663
  }
2634
- template <typename T>
2635
- HWY_INLINE Mask512<T> MaskFromVec(hwy::SizeTag<2> /*tag*/, Vec512<T> v) {
2664
+ template <typename T, HWY_IF_UI16(T)>
2665
+ HWY_API Mask512<T> MaskFromVec(Vec512<T> v) {
2636
2666
  return Mask512<T>{_mm512_movepi16_mask(v.raw)};
2637
2667
  }
2638
- template <typename T>
2639
- HWY_INLINE Mask512<T> MaskFromVec(hwy::SizeTag<4> /*tag*/, Vec512<T> v) {
2668
+ template <typename T, HWY_IF_UI32(T)>
2669
+ HWY_API Mask512<T> MaskFromVec(Vec512<T> v) {
2640
2670
  return Mask512<T>{_mm512_movepi32_mask(v.raw)};
2641
2671
  }
2642
- template <typename T>
2643
- HWY_INLINE Mask512<T> MaskFromVec(hwy::SizeTag<8> /*tag*/, Vec512<T> v) {
2644
- return Mask512<T>{_mm512_movepi64_mask(v.raw)};
2645
- }
2646
-
2647
- } // namespace detail
2648
-
2649
- template <typename T, HWY_IF_NOT_FLOAT(T)>
2672
+ template <typename T, HWY_IF_UI64(T)>
2650
2673
  HWY_API Mask512<T> MaskFromVec(Vec512<T> v) {
2651
- return detail::MaskFromVec(hwy::SizeTag<sizeof(T)>(), v);
2674
+ return Mask512<T>{_mm512_movepi64_mask(v.raw)};
2652
2675
  }
2653
- template <typename T, HWY_IF_FLOAT(T)>
2676
+ template <typename T, HWY_IF_FLOAT_OR_SPECIAL(T)>
2654
2677
  HWY_API Mask512<T> MaskFromVec(Vec512<T> v) {
2655
2678
  const RebindToSigned<DFromV<decltype(v)>> di;
2656
2679
  return Mask512<T>{MaskFromVec(BitCast(di, v)).raw};
2657
2680
  }
2658
2681
 
2659
- HWY_API Vec512<uint8_t> VecFromMask(Mask512<uint8_t> v) {
2660
- return Vec512<uint8_t>{_mm512_movm_epi8(v.raw)};
2661
- }
2662
- HWY_API Vec512<int8_t> VecFromMask(Mask512<int8_t> v) {
2663
- return Vec512<int8_t>{_mm512_movm_epi8(v.raw)};
2664
- }
2665
-
2666
- HWY_API Vec512<uint16_t> VecFromMask(Mask512<uint16_t> v) {
2667
- return Vec512<uint16_t>{_mm512_movm_epi16(v.raw)};
2682
+ template <typename T, HWY_IF_UI8(T)>
2683
+ HWY_API Vec512<T> VecFromMask(Mask512<T> m) {
2684
+ return Vec512<T>{_mm512_movm_epi8(m.raw)};
2668
2685
  }
2669
- HWY_API Vec512<int16_t> VecFromMask(Mask512<int16_t> v) {
2670
- return Vec512<int16_t>{_mm512_movm_epi16(v.raw)};
2686
+ template <typename T, HWY_IF_UI16(T)>
2687
+ HWY_API Vec512<T> VecFromMask(Mask512<T> m) {
2688
+ return Vec512<T>{_mm512_movm_epi16(m.raw)};
2671
2689
  }
2672
2690
  #if HWY_HAVE_FLOAT16
2673
- HWY_API Vec512<float16_t> VecFromMask(Mask512<float16_t> v) {
2674
- return Vec512<float16_t>{_mm512_castsi512_ph(_mm512_movm_epi16(v.raw))};
2691
+ HWY_API Vec512<float16_t> VecFromMask(Mask512<float16_t> m) {
2692
+ return Vec512<float16_t>{_mm512_castsi512_ph(_mm512_movm_epi16(m.raw))};
2675
2693
  }
2676
2694
  #endif // HWY_HAVE_FLOAT16
2677
-
2678
- HWY_API Vec512<uint32_t> VecFromMask(Mask512<uint32_t> v) {
2679
- return Vec512<uint32_t>{_mm512_movm_epi32(v.raw)};
2680
- }
2681
- HWY_API Vec512<int32_t> VecFromMask(Mask512<int32_t> v) {
2682
- return Vec512<int32_t>{_mm512_movm_epi32(v.raw)};
2683
- }
2684
- HWY_API Vec512<float> VecFromMask(Mask512<float> v) {
2685
- return Vec512<float>{_mm512_castsi512_ps(_mm512_movm_epi32(v.raw))};
2686
- }
2687
-
2688
- HWY_API Vec512<uint64_t> VecFromMask(Mask512<uint64_t> v) {
2689
- return Vec512<uint64_t>{_mm512_movm_epi64(v.raw)};
2695
+ template <typename T, HWY_IF_UI32(T)>
2696
+ HWY_API Vec512<T> VecFromMask(Mask512<T> m) {
2697
+ return Vec512<T>{_mm512_movm_epi32(m.raw)};
2690
2698
  }
2691
- HWY_API Vec512<int64_t> VecFromMask(Mask512<int64_t> v) {
2692
- return Vec512<int64_t>{_mm512_movm_epi64(v.raw)};
2699
+ template <typename T, HWY_IF_UI64(T)>
2700
+ HWY_API Vec512<T> VecFromMask(Mask512<T> m) {
2701
+ return Vec512<T>{_mm512_movm_epi64(m.raw)};
2693
2702
  }
2694
- HWY_API Vec512<double> VecFromMask(Mask512<double> v) {
2695
- return Vec512<double>{_mm512_castsi512_pd(_mm512_movm_epi64(v.raw))};
2703
+ template <typename T, HWY_IF_FLOAT_OR_SPECIAL(T)>
2704
+ HWY_API Vec512<T> VecFromMask(Mask512<T> m) {
2705
+ const Full512<T> d;
2706
+ const Full512<MakeSigned<T>> di;
2707
+ return BitCast(d, VecFromMask(RebindMask(di, m)));
2696
2708
  }
2697
2709
 
2698
2710
  // ------------------------------ Mask logical
@@ -3012,9 +3024,26 @@ HWY_API Vec512<int64_t> BroadcastSignBit(Vec512<int64_t> v) {
3012
3024
 
3013
3025
  #if HWY_HAVE_FLOAT16 || HWY_IDE
3014
3026
 
3027
+ namespace detail {
3028
+
3029
+ template <int kCategories>
3030
+ __mmask32 Fix_mm512_fpclass_ph_mask(__m512h v) {
3031
+ #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1500
3032
+ // GCC's _mm512_cmp_ph_mask uses `__mmask8` instead of `__mmask32`, hence only
3033
+ // the first 8 lanes are set.
3034
+ return static_cast<__mmask32>(__builtin_ia32_fpclassph512_mask(
3035
+ static_cast<__v32hf>(v), kCategories, static_cast<__mmask32>(-1)));
3036
+ #else
3037
+ return _mm512_fpclass_ph_mask(v, kCategories);
3038
+ #endif
3039
+ }
3040
+
3041
+ } // namespace detail
3042
+
3015
3043
  HWY_API Mask512<float16_t> IsNaN(Vec512<float16_t> v) {
3016
- return Mask512<float16_t>{_mm512_fpclass_ph_mask(
3017
- v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)};
3044
+ constexpr int kCategories = HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN;
3045
+ return Mask512<float16_t>{
3046
+ detail::Fix_mm512_fpclass_ph_mask<kCategories>(v.raw)};
3018
3047
  }
3019
3048
 
3020
3049
  HWY_API Mask512<float16_t> IsEitherNaN(Vec512<float16_t> a,
@@ -3027,15 +3056,18 @@ HWY_API Mask512<float16_t> IsEitherNaN(Vec512<float16_t> a,
3027
3056
  }
3028
3057
 
3029
3058
  HWY_API Mask512<float16_t> IsInf(Vec512<float16_t> v) {
3030
- return Mask512<float16_t>{_mm512_fpclass_ph_mask(v.raw, 0x18)};
3059
+ constexpr int kCategories = HWY_X86_FPCLASS_POS_INF | HWY_X86_FPCLASS_NEG_INF;
3060
+ return Mask512<float16_t>{
3061
+ detail::Fix_mm512_fpclass_ph_mask<kCategories>(v.raw)};
3031
3062
  }
3032
3063
 
3033
3064
  // Returns whether normal/subnormal/zero. fpclass doesn't have a flag for
3034
3065
  // positive, so we have to check for inf/NaN and negate.
3035
3066
  HWY_API Mask512<float16_t> IsFinite(Vec512<float16_t> v) {
3036
- return Not(Mask512<float16_t>{_mm512_fpclass_ph_mask(
3037
- v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN |
3038
- HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)});
3067
+ constexpr int kCategories = HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN |
3068
+ HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF;
3069
+ return Not(Mask512<float16_t>{
3070
+ detail::Fix_mm512_fpclass_ph_mask<kCategories>(v.raw)});
3039
3071
  }
3040
3072
 
3041
3073
  #endif // HWY_HAVE_FLOAT16
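The category masks used above amount to simple lane-wise classification; a scalar sketch of the two wrappers (using float for readability):

  #include <cmath>

  bool IsInfRef(float x) { return std::isinf(x); }  // POS_INF | NEG_INF
  bool IsFiniteRef(float x) {                       // Not(SNAN | QNAN | NEG_INF | POS_INF)
    return !std::isnan(x) && !std::isinf(x);
  }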
@@ -3571,7 +3603,7 @@ HWY_API T ExtractLane(const Vec512<T> v, size_t i) {
  }
  #endif
 
- alignas(64) T lanes[Lanes(d)];
+ alignas(64) T lanes[MaxLanes(d)];
  Store(v, d, lanes);
  return lanes[i];
  }
@@ -4812,6 +4844,18 @@ HWY_API Vec512<double> SwapAdjacentBlocks(Vec512<double> v) {
  return Vec512<double>{_mm512_shuffle_f64x2(v.raw, v.raw, _MM_PERM_CDAB)};
  }
 
+ // ------------------------------ InterleaveEvenBlocks
+ template <typename T>
+ HWY_API Vec512<T> InterleaveEvenBlocks(Full512<T> d, Vec512<T> a, Vec512<T> b) {
+ return OddEvenBlocks(SlideUpBlocks<1>(d, b), a);
+ }
+
+ // ------------------------------ InterleaveOddBlocks (ConcatUpperUpper)
+ template <typename T>
+ HWY_API Vec512<T> InterleaveOddBlocks(Full512<T> d, Vec512<T> a, Vec512<T> b) {
+ return OddEvenBlocks(b, SlideDownBlocks<1>(d, a));
+ }
+
  // ------------------------------ ReverseBlocks
 
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
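A worked example of the two new ops, viewing hypothetical 512-bit vectors as four 128-bit blocks:

  // a = {A0, A1, A2, A3}, b = {B0, B1, B2, B3}   (Ai/Bi are 128-bit blocks)
  // InterleaveEvenBlocks(d, a, b) -> {A0, B0, A2, B2}
  //   = OddEvenBlocks(SlideUpBlocks<1>(d, b) /* {0, B0, B1, B2} */, a)
  // InterleaveOddBlocks(d, a, b)  -> {A1, B1, A3, B3}
  //   = OddEvenBlocks(b, SlideDownBlocks<1>(d, a) /* {A1, A2, A3, 0} */)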
@@ -5529,7 +5573,9 @@ HWY_API VFromD<D> PromoteTo(D /* tag */, Vec256<uint32_t> v) {
 
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I64_D(D)>
  HWY_API VFromD<D> PromoteInRangeTo(D /*di64*/, VFromD<Rebind<float, D>> v) {
- #if HWY_COMPILER_GCC_ACTUAL
+ #if HWY_X86_HAVE_AVX10_2_OPS
+ return VFromD<D>{_mm512_cvtts_ps_epi64(v.raw)};
+ #elif HWY_COMPILER_GCC_ACTUAL
  // Workaround for undefined behavior with GCC if any values of v[i] are not
  // within the range of an int64_t
 
@@ -5561,7 +5607,9 @@ HWY_API VFromD<D> PromoteInRangeTo(D /*di64*/, VFromD<Rebind<float, D>> v) {
5561
5607
  }
5562
5608
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U64_D(D)>
5563
5609
  HWY_API VFromD<D> PromoteInRangeTo(D /* tag */, VFromD<Rebind<float, D>> v) {
5564
- #if HWY_COMPILER_GCC_ACTUAL
5610
+ #if HWY_X86_HAVE_AVX10_2_OPS
5611
+ return VFromD<D>{_mm512_cvtts_ps_epu64(v.raw)};
5612
+ #elif HWY_COMPILER_GCC_ACTUAL
5565
5613
  // Workaround for undefined behavior with GCC if any values of v[i] are not
5566
5614
  // within the range of an uint64_t
5567
5615
 
@@ -5830,19 +5878,6 @@ HWY_API VFromD<D> ReorderDemote2To(D dn, Vec512<uint16_t> a,
5830
5878
  BitCast(di16, Min(b, max_i16)));
5831
5879
  }
5832
5880
 
5833
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
5834
- HWY_API VFromD<D> ReorderDemote2To(D dn, Vec512<int64_t> a, Vec512<int64_t> b) {
5835
- const Half<decltype(dn)> dnh;
5836
- return Combine(dn, DemoteTo(dnh, b), DemoteTo(dnh, a));
5837
- }
5838
-
5839
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U32_D(D)>
5840
- HWY_API VFromD<D> ReorderDemote2To(D dn, Vec512<uint64_t> a,
5841
- Vec512<uint64_t> b) {
5842
- const Half<decltype(dn)> dnh;
5843
- return Combine(dn, DemoteTo(dnh, b), DemoteTo(dnh, a));
5844
- }
5845
-
5846
5881
  template <class D, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>),
5847
5882
  HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
5848
5883
  HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
@@ -5855,15 +5890,6 @@ HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
5855
5890
  SetTableIndices(du64, kIdx)));
5856
5891
  }
5857
5892
 
5858
- template <class D, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>),
5859
- HWY_IF_V_SIZE_GT_D(D, 16), class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
5860
- HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
5861
- HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2),
5862
- HWY_IF_T_SIZE_V(V, 8)>
5863
- HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
5864
- return ReorderDemote2To(d, a, b);
5865
- }
5866
-
5867
5893
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
5868
5894
  HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<double> v) {
5869
5895
  return VFromD<D>{_mm512_cvtpd_ps(v.raw)};
@@ -5871,7 +5897,9 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<double> v) {
5871
5897
 
5872
5898
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
5873
5899
  HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, Vec512<double> v) {
5874
- #if HWY_COMPILER_GCC_ACTUAL
5900
+ #if HWY_X86_HAVE_AVX10_2_OPS
5901
+ return VFromD<D>{_mm512_cvtts_pd_epi32(v.raw)};
5902
+ #elif HWY_COMPILER_GCC_ACTUAL
5875
5903
  // Workaround for undefined behavior in _mm512_cvttpd_epi32 with GCC if any
5876
5904
  // values of v[i] are not within the range of an int32_t
5877
5905
 
@@ -5879,7 +5907,8 @@ HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, Vec512<double> v) {
5879
5907
  if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
5880
5908
  typedef double GccF64RawVectType __attribute__((__vector_size__(64)));
5881
5909
  const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
5882
- return VFromD<D>{_mm256_setr_epi32(
5910
+ return VFromD<D>{
5911
+ _mm256_setr_epi32(
5883
5912
  detail::X86ConvertScalarFromFloat<int32_t>(raw_v[0]),
5884
5913
  detail::X86ConvertScalarFromFloat<int32_t>(raw_v[1]),
5885
5914
  detail::X86ConvertScalarFromFloat<int32_t>(raw_v[2]),
@@ -5887,7 +5916,8 @@ HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, Vec512<double> v) {
5887
5916
  detail::X86ConvertScalarFromFloat<int32_t>(raw_v[4]),
5888
5917
  detail::X86ConvertScalarFromFloat<int32_t>(raw_v[5]),
5889
5918
  detail::X86ConvertScalarFromFloat<int32_t>(raw_v[6]),
5890
- detail::X86ConvertScalarFromFloat<int32_t>(raw_v[7]))};
5919
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[7]))
5920
+ };
5891
5921
  }
5892
5922
  #endif
5893
5923
 
@@ -5904,7 +5934,9 @@ HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, Vec512<double> v) {
5904
5934
 
5905
5935
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
5906
5936
  HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, Vec512<double> v) {
5907
- #if HWY_COMPILER_GCC_ACTUAL
5937
+ #if HWY_X86_HAVE_AVX10_2_OPS
5938
+ return VFromD<D>{_mm512_cvtts_pd_epu32(v.raw)};
5939
+ #elif HWY_COMPILER_GCC_ACTUAL
5908
5940
  // Workaround for undefined behavior in _mm512_cvttpd_epu32 with GCC if any
5909
5941
  // values of v[i] are not within the range of an uint32_t
5910
5942
 
@@ -6250,7 +6282,9 @@ HWY_API VFromD<D> ConvertInRangeTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
6250
6282
  #endif // HWY_HAVE_FLOAT16
6251
6283
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I32_D(D)>
6252
6284
  HWY_API VFromD<D> ConvertInRangeTo(D /*d*/, Vec512<float> v) {
6253
- #if HWY_COMPILER_GCC_ACTUAL
6285
+ #if HWY_X86_HAVE_AVX10_2_OPS
6286
+ return VFromD<D>{_mm512_cvtts_ps_epi32(v.raw)};
6287
+ #elif HWY_COMPILER_GCC_ACTUAL
6254
6288
  // Workaround for undefined behavior in _mm512_cvttps_epi32 with GCC if any
6255
6289
  // values of v[i] are not within the range of an int32_t
6256
6290
 
@@ -6290,7 +6324,9 @@ HWY_API VFromD<D> ConvertInRangeTo(D /*d*/, Vec512<float> v) {
6290
6324
  }
6291
6325
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I64_D(D)>
6292
6326
  HWY_API VFromD<D> ConvertInRangeTo(D /*di*/, Vec512<double> v) {
6293
- #if HWY_COMPILER_GCC_ACTUAL
6327
+ #if HWY_X86_HAVE_AVX10_2_OPS
6328
+ return VFromD<D>{_mm512_cvtts_pd_epi64(v.raw)};
6329
+ #elif HWY_COMPILER_GCC_ACTUAL
6294
6330
  // Workaround for undefined behavior in _mm512_cvttpd_epi64 with GCC if any
6295
6331
  // values of v[i] are not within the range of an int64_t
6296
6332
 
@@ -6322,7 +6358,9 @@ HWY_API VFromD<D> ConvertInRangeTo(D /*di*/, Vec512<double> v) {
6322
6358
  }
6323
6359
  template <class DU, HWY_IF_V_SIZE_D(DU, 64), HWY_IF_U32_D(DU)>
6324
6360
  HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
6325
- #if HWY_COMPILER_GCC_ACTUAL
6361
+ #if HWY_X86_HAVE_AVX10_2_OPS
6362
+ return VFromD<DU>{_mm512_cvtts_ps_epu32(v.raw)};
6363
+ #elif HWY_COMPILER_GCC_ACTUAL
6326
6364
  // Workaround for undefined behavior in _mm512_cvttps_epu32 with GCC if any
6327
6365
  // values of v[i] are not within the range of an uint32_t
6328
6366
 
@@ -6378,7 +6416,9 @@ HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
6378
6416
  }
6379
6417
  template <class DU, HWY_IF_V_SIZE_D(DU, 64), HWY_IF_U64_D(DU)>
6380
6418
  HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
6381
- #if HWY_COMPILER_GCC_ACTUAL
6419
+ #if HWY_X86_HAVE_AVX10_2_OPS
6420
+ return VFromD<DU>{_mm512_cvtts_pd_epu64(v.raw)};
6421
+ #elif HWY_COMPILER_GCC_ACTUAL
6382
6422
  // Workaround for undefined behavior in _mm512_cvttpd_epu64 with GCC if any
6383
6423
  // values of v[i] are not within the range of an uint64_t
6384
6424
 
@@ -6418,7 +6458,8 @@ HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
  }
 
  template <class DI, HWY_IF_V_SIZE_D(DI, 64), HWY_IF_I32_D(DI)>
- HWY_INLINE VFromD<DI> NearestIntInRange(DI, VFromD<RebindToFloat<DI>> v) {
+ static HWY_INLINE VFromD<DI> NearestIntInRange(DI,
+ VFromD<RebindToFloat<DI>> v) {
  #if HWY_COMPILER_GCC_ACTUAL
  // Workaround for undefined behavior in _mm512_cvtps_epi32 with GCC if any
  // values of v[i] are not within the range of an int32_t
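Unlike the truncating ConvertInRangeTo overloads above, NearestIntInRange rounds to the nearest integer (ties to even under the default rounding mode). A scalar sketch of the distinction:

  #include <cmath>
  #include <cstdint>

  int32_t TruncRef(float f) { return static_cast<int32_t>(f); }  // 2.7f -> 2
  int32_t NearestRef(float f) {
    return static_cast<int32_t>(std::lrint(f));  // 2.7f -> 3, 2.5f -> 2 (ties to even)
  }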
@@ -6458,6 +6499,134 @@ HWY_INLINE VFromD<DI> NearestIntInRange(DI, VFromD<RebindToFloat<DI>> v) {
6458
6499
  #endif
6459
6500
  }
6460
6501
 
6502
+ #if HWY_HAVE_FLOAT16
6503
+ template <class DI, HWY_IF_V_SIZE_D(DI, 64), HWY_IF_I16_D(DI)>
6504
+ static HWY_INLINE VFromD<DI> NearestIntInRange(DI /*d*/, Vec512<float16_t> v) {
6505
+ #if HWY_COMPILER_GCC_ACTUAL
6506
+ // Workaround for undefined behavior in _mm512_cvtph_epi16 with GCC if any
6507
+ // values of v[i] are not within the range of an int16_t
6508
+
6509
+ #if HWY_COMPILER_GCC_ACTUAL >= 1200 && !HWY_IS_DEBUG_BUILD && \
6510
+ HWY_HAVE_SCALAR_F16_TYPE
6511
+ if (detail::IsConstantX86VecForF2IConv<int16_t>(v)) {
6512
+ typedef hwy::float16_t::Native GccF16RawVectType
6513
+ __attribute__((__vector_size__(64)));
6514
+ const auto raw_v = reinterpret_cast<GccF16RawVectType>(v.raw);
6515
+ return VFromD<DI>{
6516
+ _mm512_set_epi16(detail::X86ScalarNearestInt<int16_t>(raw_v[31]),
6517
+ detail::X86ScalarNearestInt<int16_t>(raw_v[30]),
6518
+ detail::X86ScalarNearestInt<int16_t>(raw_v[29]),
6519
+ detail::X86ScalarNearestInt<int16_t>(raw_v[28]),
6520
+ detail::X86ScalarNearestInt<int16_t>(raw_v[27]),
6521
+ detail::X86ScalarNearestInt<int16_t>(raw_v[26]),
6522
+ detail::X86ScalarNearestInt<int16_t>(raw_v[25]),
6523
+ detail::X86ScalarNearestInt<int16_t>(raw_v[24]),
6524
+ detail::X86ScalarNearestInt<int16_t>(raw_v[23]),
6525
+ detail::X86ScalarNearestInt<int16_t>(raw_v[22]),
6526
+ detail::X86ScalarNearestInt<int16_t>(raw_v[21]),
6527
+ detail::X86ScalarNearestInt<int16_t>(raw_v[20]),
6528
+ detail::X86ScalarNearestInt<int16_t>(raw_v[19]),
6529
+ detail::X86ScalarNearestInt<int16_t>(raw_v[18]),
6530
+ detail::X86ScalarNearestInt<int16_t>(raw_v[17]),
6531
+ detail::X86ScalarNearestInt<int16_t>(raw_v[16]),
6532
+ detail::X86ScalarNearestInt<int16_t>(raw_v[15]),
6533
+ detail::X86ScalarNearestInt<int16_t>(raw_v[14]),
6534
+ detail::X86ScalarNearestInt<int16_t>(raw_v[13]),
6535
+ detail::X86ScalarNearestInt<int16_t>(raw_v[12]),
6536
+ detail::X86ScalarNearestInt<int16_t>(raw_v[11]),
6537
+ detail::X86ScalarNearestInt<int16_t>(raw_v[10]),
6538
+ detail::X86ScalarNearestInt<int16_t>(raw_v[9]),
6539
+ detail::X86ScalarNearestInt<int16_t>(raw_v[8]),
6540
+ detail::X86ScalarNearestInt<int16_t>(raw_v[7]),
6541
+ detail::X86ScalarNearestInt<int16_t>(raw_v[6]),
6542
+ detail::X86ScalarNearestInt<int16_t>(raw_v[5]),
6543
+ detail::X86ScalarNearestInt<int16_t>(raw_v[4]),
6544
+ detail::X86ScalarNearestInt<int16_t>(raw_v[3]),
6545
+ detail::X86ScalarNearestInt<int16_t>(raw_v[2]),
6546
+ detail::X86ScalarNearestInt<int16_t>(raw_v[1]),
6547
+ detail::X86ScalarNearestInt<int16_t>(raw_v[0]))};
6548
+ }
6549
+ #endif
6550
+
6551
+ __m512i raw_result;
6552
+ __asm__("vcvtph2w {%1, %0|%0, %1}"
6553
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
6554
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
6555
+ :);
6556
+ return VFromD<DI>{raw_result};
6557
+ #else
6558
+ return VFromD<DI>{_mm512_cvtph_epi16(v.raw)};
6559
+ #endif
6560
+ }
6561
+ #endif // HWY_HAVE_FLOAT16
6562
+
6563
+ template <class DI, HWY_IF_V_SIZE_D(DI, 64), HWY_IF_I64_D(DI)>
6564
+ static HWY_INLINE VFromD<DI> NearestIntInRange(DI /*di*/, Vec512<double> v) {
6565
+ #if HWY_COMPILER_GCC_ACTUAL
6566
+ // Workaround for undefined behavior in _mm512_cvtpd_epi64 with GCC if any
6567
+ // values of v[i] are not within the range of an int64_t
6568
+
6569
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
6570
+ if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) {
6571
+ typedef double GccF64RawVectType __attribute__((__vector_size__(64)));
6572
+ const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
6573
+ return VFromD<DI>{
6574
+ _mm512_setr_epi64(detail::X86ScalarNearestInt<int64_t>(raw_v[0]),
6575
+ detail::X86ScalarNearestInt<int64_t>(raw_v[1]),
6576
+ detail::X86ScalarNearestInt<int64_t>(raw_v[2]),
6577
+ detail::X86ScalarNearestInt<int64_t>(raw_v[3]),
6578
+ detail::X86ScalarNearestInt<int64_t>(raw_v[4]),
6579
+ detail::X86ScalarNearestInt<int64_t>(raw_v[5]),
6580
+ detail::X86ScalarNearestInt<int64_t>(raw_v[6]),
6581
+ detail::X86ScalarNearestInt<int64_t>(raw_v[7]))};
6582
+ }
6583
+ #endif
6584
+
6585
+ __m512i raw_result;
6586
+ __asm__("vcvtpd2qq {%1, %0|%0, %1}"
6587
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
6588
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
6589
+ :);
6590
+ return VFromD<DI>{raw_result};
6591
+ #else
6592
+ return VFromD<DI>{_mm512_cvtpd_epi64(v.raw)};
6593
+ #endif
6594
+ }
6595
+
6596
+ template <class DI, HWY_IF_V_SIZE_D(DI, 32), HWY_IF_I32_D(DI)>
6597
+ static HWY_INLINE VFromD<DI> DemoteToNearestIntInRange(DI /* tag */,
6598
+ Vec512<double> v) {
6599
+ #if HWY_COMPILER_GCC_ACTUAL
6600
+ // Workaround for undefined behavior in _mm512_cvtpd_epi32 with GCC if any
6601
+ // values of v[i] are not within the range of an int32_t
6602
+
6603
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
6604
+ if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
6605
+ typedef double GccF64RawVectType __attribute__((__vector_size__(64)));
6606
+ const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
6607
+ return VFromD<DI>{
6608
+ _mm256_setr_epi32(detail::X86ScalarNearestInt<int32_t>(raw_v[0]),
6609
+ detail::X86ScalarNearestInt<int32_t>(raw_v[1]),
6610
+ detail::X86ScalarNearestInt<int32_t>(raw_v[2]),
6611
+ detail::X86ScalarNearestInt<int32_t>(raw_v[3]),
6612
+ detail::X86ScalarNearestInt<int32_t>(raw_v[4]),
6613
+ detail::X86ScalarNearestInt<int32_t>(raw_v[5]),
6614
+ detail::X86ScalarNearestInt<int32_t>(raw_v[6]),
6615
+ detail::X86ScalarNearestInt<int32_t>(raw_v[7]))};
6616
+ }
6617
+ #endif
6618
+
6619
+ __m256i raw_result;
6620
+ __asm__("vcvtpd2dq {%1, %0|%0, %1}"
6621
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
6622
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
6623
+ :);
6624
+ return VFromD<DI>{raw_result};
6625
+ #else
6626
+ return VFromD<DI>{_mm512_cvtpd_epi32(v.raw)};
6627
+ #endif
6628
+ }
6629
+
6461
6630
  // ================================================== CRYPTO
6462
6631
 
6463
6632
  #if !defined(HWY_DISABLE_PCLMUL_AES)
@@ -6788,376 +6957,6 @@ HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) {
6788
6957
 
6789
6958
  // ------------------------------ Compress
6790
6959
 
6791
- // Always implement 8-bit here even if we lack VBMI2 because we can do better
6792
- // than generic_ops (8 at a time) via the native 32-bit compress (16 at a time).
6793
- #ifdef HWY_NATIVE_COMPRESS8
6794
- #undef HWY_NATIVE_COMPRESS8
6795
- #else
6796
- #define HWY_NATIVE_COMPRESS8
6797
- #endif
6798
-
6799
- namespace detail {
6800
-
6801
- #if HWY_TARGET <= HWY_AVX3_DL // VBMI2
6802
- template <size_t N>
6803
- HWY_INLINE Vec128<uint8_t, N> NativeCompress(const Vec128<uint8_t, N> v,
6804
- const Mask128<uint8_t, N> mask) {
6805
- return Vec128<uint8_t, N>{_mm_maskz_compress_epi8(mask.raw, v.raw)};
6806
- }
6807
- HWY_INLINE Vec256<uint8_t> NativeCompress(const Vec256<uint8_t> v,
6808
- const Mask256<uint8_t> mask) {
6809
- return Vec256<uint8_t>{_mm256_maskz_compress_epi8(mask.raw, v.raw)};
6810
- }
6811
- HWY_INLINE Vec512<uint8_t> NativeCompress(const Vec512<uint8_t> v,
6812
- const Mask512<uint8_t> mask) {
6813
- return Vec512<uint8_t>{_mm512_maskz_compress_epi8(mask.raw, v.raw)};
6814
- }
6815
-
6816
- template <size_t N>
6817
- HWY_INLINE Vec128<uint16_t, N> NativeCompress(const Vec128<uint16_t, N> v,
6818
- const Mask128<uint16_t, N> mask) {
6819
- return Vec128<uint16_t, N>{_mm_maskz_compress_epi16(mask.raw, v.raw)};
6820
- }
6821
- HWY_INLINE Vec256<uint16_t> NativeCompress(const Vec256<uint16_t> v,
6822
- const Mask256<uint16_t> mask) {
6823
- return Vec256<uint16_t>{_mm256_maskz_compress_epi16(mask.raw, v.raw)};
6824
- }
6825
- HWY_INLINE Vec512<uint16_t> NativeCompress(const Vec512<uint16_t> v,
6826
- const Mask512<uint16_t> mask) {
6827
- return Vec512<uint16_t>{_mm512_maskz_compress_epi16(mask.raw, v.raw)};
6828
- }
6829
-
6830
- // Slow on Zen4, do not even define these to prevent accidental usage.
6831
- #if HWY_TARGET != HWY_AVX3_ZEN4
6832
-
6833
- template <size_t N>
6834
- HWY_INLINE void NativeCompressStore(Vec128<uint8_t, N> v,
6835
- Mask128<uint8_t, N> mask,
6836
- uint8_t* HWY_RESTRICT unaligned) {
6837
- _mm_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw);
6838
- }
6839
- HWY_INLINE void NativeCompressStore(Vec256<uint8_t> v, Mask256<uint8_t> mask,
6840
- uint8_t* HWY_RESTRICT unaligned) {
6841
- _mm256_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw);
6842
- }
6843
- HWY_INLINE void NativeCompressStore(Vec512<uint8_t> v, Mask512<uint8_t> mask,
6844
- uint8_t* HWY_RESTRICT unaligned) {
6845
- _mm512_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw);
6846
- }
6847
-
6848
- template <size_t N>
6849
- HWY_INLINE void NativeCompressStore(Vec128<uint16_t, N> v,
6850
- Mask128<uint16_t, N> mask,
6851
- uint16_t* HWY_RESTRICT unaligned) {
6852
- _mm_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw);
6853
- }
6854
- HWY_INLINE void NativeCompressStore(Vec256<uint16_t> v, Mask256<uint16_t> mask,
6855
- uint16_t* HWY_RESTRICT unaligned) {
6856
- _mm256_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw);
6857
- }
6858
- HWY_INLINE void NativeCompressStore(Vec512<uint16_t> v, Mask512<uint16_t> mask,
6859
- uint16_t* HWY_RESTRICT unaligned) {
6860
- _mm512_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw);
6861
- }
6862
-
6863
- #endif // HWY_TARGET != HWY_AVX3_ZEN4
6864
-
6865
- HWY_INLINE Vec512<uint8_t> NativeExpand(Vec512<uint8_t> v,
6866
- Mask512<uint8_t> mask) {
6867
- return Vec512<uint8_t>{_mm512_maskz_expand_epi8(mask.raw, v.raw)};
6868
- }
6869
-
6870
- HWY_INLINE Vec512<uint16_t> NativeExpand(Vec512<uint16_t> v,
6871
- Mask512<uint16_t> mask) {
6872
- return Vec512<uint16_t>{_mm512_maskz_expand_epi16(mask.raw, v.raw)};
6873
- }
6874
-
6875
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U8_D(D)>
6876
- HWY_INLINE VFromD<D> NativeLoadExpand(Mask512<uint8_t> mask, D /* d */,
6877
- const uint8_t* HWY_RESTRICT unaligned) {
6878
- return VFromD<D>{_mm512_maskz_expandloadu_epi8(mask.raw, unaligned)};
6879
- }
6880
-
6881
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)>
6882
- HWY_INLINE VFromD<D> NativeLoadExpand(Mask512<uint16_t> mask, D /* d */,
6883
- const uint16_t* HWY_RESTRICT unaligned) {
6884
- return VFromD<D>{_mm512_maskz_expandloadu_epi16(mask.raw, unaligned)};
6885
- }
6886
-
6887
- #endif // HWY_TARGET <= HWY_AVX3_DL
6888
-
6889
- template <size_t N>
6890
- HWY_INLINE Vec128<uint32_t, N> NativeCompress(Vec128<uint32_t, N> v,
6891
- Mask128<uint32_t, N> mask) {
6892
- return Vec128<uint32_t, N>{_mm_maskz_compress_epi32(mask.raw, v.raw)};
6893
- }
6894
- HWY_INLINE Vec256<uint32_t> NativeCompress(Vec256<uint32_t> v,
6895
- Mask256<uint32_t> mask) {
6896
- return Vec256<uint32_t>{_mm256_maskz_compress_epi32(mask.raw, v.raw)};
6897
- }
6898
- HWY_INLINE Vec512<uint32_t> NativeCompress(Vec512<uint32_t> v,
6899
- Mask512<uint32_t> mask) {
6900
- return Vec512<uint32_t>{_mm512_maskz_compress_epi32(mask.raw, v.raw)};
6901
- }
6902
- // We use table-based compress for 64-bit lanes, see CompressIsPartition.
6903
-
6904
- // Slow on Zen4, do not even define these to prevent accidental usage.
6905
- #if HWY_TARGET != HWY_AVX3_ZEN4
6906
-
6907
- template <size_t N>
6908
- HWY_INLINE void NativeCompressStore(Vec128<uint32_t, N> v,
6909
- Mask128<uint32_t, N> mask,
6910
- uint32_t* HWY_RESTRICT unaligned) {
6911
- _mm_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
6912
- }
6913
- HWY_INLINE void NativeCompressStore(Vec256<uint32_t> v, Mask256<uint32_t> mask,
6914
- uint32_t* HWY_RESTRICT unaligned) {
6915
- _mm256_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
6916
- }
6917
- HWY_INLINE void NativeCompressStore(Vec512<uint32_t> v, Mask512<uint32_t> mask,
6918
- uint32_t* HWY_RESTRICT unaligned) {
6919
- _mm512_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
6920
- }
6921
-
6922
- template <size_t N>
6923
- HWY_INLINE void NativeCompressStore(Vec128<uint64_t, N> v,
6924
- Mask128<uint64_t, N> mask,
6925
- uint64_t* HWY_RESTRICT unaligned) {
6926
- _mm_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
6927
- }
6928
- HWY_INLINE void NativeCompressStore(Vec256<uint64_t> v, Mask256<uint64_t> mask,
6929
- uint64_t* HWY_RESTRICT unaligned) {
6930
- _mm256_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
6931
- }
6932
- HWY_INLINE void NativeCompressStore(Vec512<uint64_t> v, Mask512<uint64_t> mask,
6933
- uint64_t* HWY_RESTRICT unaligned) {
6934
- _mm512_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
6935
- }
6936
-
6937
- template <size_t N>
6938
- HWY_INLINE void NativeCompressStore(Vec128<float, N> v, Mask128<float, N> mask,
6939
- float* HWY_RESTRICT unaligned) {
6940
- _mm_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
6941
- }
6942
- HWY_INLINE void NativeCompressStore(Vec256<float> v, Mask256<float> mask,
6943
- float* HWY_RESTRICT unaligned) {
6944
- _mm256_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
6945
- }
6946
- HWY_INLINE void NativeCompressStore(Vec512<float> v, Mask512<float> mask,
6947
- float* HWY_RESTRICT unaligned) {
6948
- _mm512_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
6949
- }
6950
-
6951
- template <size_t N>
6952
- HWY_INLINE void NativeCompressStore(Vec128<double, N> v,
6953
- Mask128<double, N> mask,
6954
- double* HWY_RESTRICT unaligned) {
6955
- _mm_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
6956
- }
6957
- HWY_INLINE void NativeCompressStore(Vec256<double> v, Mask256<double> mask,
6958
- double* HWY_RESTRICT unaligned) {
6959
- _mm256_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
6960
- }
6961
- HWY_INLINE void NativeCompressStore(Vec512<double> v, Mask512<double> mask,
6962
- double* HWY_RESTRICT unaligned) {
6963
- _mm512_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
6964
- }
6965
-
6966
- #endif // HWY_TARGET != HWY_AVX3_ZEN4
6967
-
6968
- HWY_INLINE Vec512<uint32_t> NativeExpand(Vec512<uint32_t> v,
6969
- Mask512<uint32_t> mask) {
6970
- return Vec512<uint32_t>{_mm512_maskz_expand_epi32(mask.raw, v.raw)};
6971
- }
6972
-
6973
- HWY_INLINE Vec512<uint64_t> NativeExpand(Vec512<uint64_t> v,
6974
- Mask512<uint64_t> mask) {
6975
- return Vec512<uint64_t>{_mm512_maskz_expand_epi64(mask.raw, v.raw)};
6976
- }
6977
-
6978
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U32_D(D)>
6979
- HWY_INLINE VFromD<D> NativeLoadExpand(Mask512<uint32_t> mask, D /* d */,
6980
- const uint32_t* HWY_RESTRICT unaligned) {
6981
- return VFromD<D>{_mm512_maskz_expandloadu_epi32(mask.raw, unaligned)};
6982
- }
6983
-
6984
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U64_D(D)>
6985
- HWY_INLINE VFromD<D> NativeLoadExpand(Mask512<uint64_t> mask, D /* d */,
6986
- const uint64_t* HWY_RESTRICT unaligned) {
6987
- return VFromD<D>{_mm512_maskz_expandloadu_epi64(mask.raw, unaligned)};
6988
- }
6989
-
6990
- // For u8x16 and <= u16x16 we can avoid store+load for Compress because there is
6991
- // only a single compressed vector (u32x16). Other EmuCompress are implemented
6992
- // after the EmuCompressStore they build upon.
6993
- template <size_t N>
6994
- HWY_INLINE Vec128<uint8_t, N> EmuCompress(Vec128<uint8_t, N> v,
6995
- Mask128<uint8_t, N> mask) {
6996
- const DFromV<decltype(v)> d;
6997
- const Rebind<uint32_t, decltype(d)> d32;
6998
- const VFromD<decltype(d32)> v0 = PromoteTo(d32, v);
6999
-
7000
- const uint64_t mask_bits{mask.raw};
7001
- // Mask type is __mmask16 if v is full 128, else __mmask8.
7002
- using M32 = MFromD<decltype(d32)>;
7003
- const M32 m0{static_cast<typename M32::Raw>(mask_bits)};
7004
- return TruncateTo(d, Compress(v0, m0));
7005
- }
7006
-
7007
- template <size_t N>
7008
- HWY_INLINE Vec128<uint16_t, N> EmuCompress(Vec128<uint16_t, N> v,
7009
- Mask128<uint16_t, N> mask) {
7010
- const DFromV<decltype(v)> d;
7011
- const Rebind<int32_t, decltype(d)> di32;
7012
- const RebindToUnsigned<decltype(di32)> du32;
7013
- const MFromD<decltype(du32)> mask32{static_cast<__mmask8>(mask.raw)};
7014
- // DemoteTo is 2 ops, but likely lower latency than TruncateTo on SKX.
7015
- // Only i32 -> u16 is supported, whereas NativeCompress expects u32.
7016
- const VFromD<decltype(du32)> v32 = BitCast(du32, PromoteTo(di32, v));
7017
- return DemoteTo(d, BitCast(di32, NativeCompress(v32, mask32)));
7018
- }
7019
-
7020
- HWY_INLINE Vec256<uint16_t> EmuCompress(Vec256<uint16_t> v,
7021
- Mask256<uint16_t> mask) {
7022
- const DFromV<decltype(v)> d;
7023
- const Rebind<int32_t, decltype(d)> di32;
7024
- const RebindToUnsigned<decltype(di32)> du32;
7025
- const Mask512<uint32_t> mask32{static_cast<__mmask16>(mask.raw)};
7026
- const Vec512<uint32_t> v32 = BitCast(du32, PromoteTo(di32, v));
7027
- return DemoteTo(d, BitCast(di32, NativeCompress(v32, mask32)));
7028
- }
7029
-
7030
- // See above - small-vector EmuCompressStore are implemented via EmuCompress.
7031
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
7032
- HWY_INLINE void EmuCompressStore(VFromD<D> v, MFromD<D> mask, D d,
7033
- TFromD<D>* HWY_RESTRICT unaligned) {
7034
- StoreU(EmuCompress(v, mask), d, unaligned);
7035
- }
7036
-
7037
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U16_D(D)>
7038
- HWY_INLINE void EmuCompressStore(VFromD<D> v, MFromD<D> mask, D d,
7039
- TFromD<D>* HWY_RESTRICT unaligned) {
7040
- StoreU(EmuCompress(v, mask), d, unaligned);
7041
- }
7042
-
7043
- // Main emulation logic for wider vector, starting with EmuCompressStore because
7044
- // it is most convenient to merge pieces using memory (concatenating vectors at
7045
- // byte offsets is difficult).
7046
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U8_D(D)>
7047
- HWY_INLINE void EmuCompressStore(VFromD<D> v, MFromD<D> mask, D d,
7048
- TFromD<D>* HWY_RESTRICT unaligned) {
7049
- const uint64_t mask_bits{mask.raw};
7050
- const Half<decltype(d)> dh;
7051
- const Rebind<uint32_t, decltype(dh)> d32;
7052
- const Vec512<uint32_t> v0 = PromoteTo(d32, LowerHalf(v));
7053
- const Vec512<uint32_t> v1 = PromoteTo(d32, UpperHalf(dh, v));
7054
- const Mask512<uint32_t> m0{static_cast<__mmask16>(mask_bits & 0xFFFFu)};
7055
- const Mask512<uint32_t> m1{static_cast<__mmask16>(mask_bits >> 16)};
7056
- const Vec128<uint8_t> c0 = TruncateTo(dh, NativeCompress(v0, m0));
7057
- const Vec128<uint8_t> c1 = TruncateTo(dh, NativeCompress(v1, m1));
7058
- uint8_t* HWY_RESTRICT pos = unaligned;
7059
- StoreU(c0, dh, pos);
7060
- StoreU(c1, dh, pos + CountTrue(d32, m0));
7061
- }
7062
-
7063
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U8_D(D)>
7064
- HWY_INLINE void EmuCompressStore(VFromD<D> v, MFromD<D> mask, D d,
7065
- TFromD<D>* HWY_RESTRICT unaligned) {
7066
- const uint64_t mask_bits{mask.raw};
7067
- const Half<Half<decltype(d)>> dq;
7068
- const Rebind<uint32_t, decltype(dq)> d32;
7069
- alignas(64) uint8_t lanes[64];
7070
- Store(v, d, lanes);
7071
- const Vec512<uint32_t> v0 = PromoteTo(d32, LowerHalf(LowerHalf(v)));
7072
- const Vec512<uint32_t> v1 = PromoteTo(d32, Load(dq, lanes + 16));
7073
- const Vec512<uint32_t> v2 = PromoteTo(d32, Load(dq, lanes + 32));
7074
- const Vec512<uint32_t> v3 = PromoteTo(d32, Load(dq, lanes + 48));
7075
- const Mask512<uint32_t> m0{static_cast<__mmask16>(mask_bits & 0xFFFFu)};
7076
- const Mask512<uint32_t> m1{
7077
- static_cast<uint16_t>((mask_bits >> 16) & 0xFFFFu)};
7078
- const Mask512<uint32_t> m2{
7079
- static_cast<uint16_t>((mask_bits >> 32) & 0xFFFFu)};
7080
- const Mask512<uint32_t> m3{static_cast<__mmask16>(mask_bits >> 48)};
7081
- const Vec128<uint8_t> c0 = TruncateTo(dq, NativeCompress(v0, m0));
7082
- const Vec128<uint8_t> c1 = TruncateTo(dq, NativeCompress(v1, m1));
7083
- const Vec128<uint8_t> c2 = TruncateTo(dq, NativeCompress(v2, m2));
7084
- const Vec128<uint8_t> c3 = TruncateTo(dq, NativeCompress(v3, m3));
7085
- uint8_t* HWY_RESTRICT pos = unaligned;
7086
- StoreU(c0, dq, pos);
7087
- pos += CountTrue(d32, m0);
7088
- StoreU(c1, dq, pos);
7089
- pos += CountTrue(d32, m1);
7090
- StoreU(c2, dq, pos);
7091
- pos += CountTrue(d32, m2);
7092
- StoreU(c3, dq, pos);
7093
- }
7094
-
7095
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)>
7096
- HWY_INLINE void EmuCompressStore(VFromD<D> v, MFromD<D> mask, D d,
7097
- TFromD<D>* HWY_RESTRICT unaligned) {
7098
- const Repartition<int32_t, decltype(d)> di32;
7099
- const RebindToUnsigned<decltype(di32)> du32;
7100
- const Half<decltype(d)> dh;
7101
- const Vec512<uint32_t> promoted0 =
7102
- BitCast(du32, PromoteTo(di32, LowerHalf(dh, v)));
7103
- const Vec512<uint32_t> promoted1 =
7104
- BitCast(du32, PromoteTo(di32, UpperHalf(dh, v)));
7105
-
7106
- const uint64_t mask_bits{mask.raw};
7107
- const uint64_t maskL = mask_bits & 0xFFFF;
7108
- const uint64_t maskH = mask_bits >> 16;
7109
- const Mask512<uint32_t> mask0{static_cast<__mmask16>(maskL)};
7110
- const Mask512<uint32_t> mask1{static_cast<__mmask16>(maskH)};
7111
- const Vec512<uint32_t> compressed0 = NativeCompress(promoted0, mask0);
7112
- const Vec512<uint32_t> compressed1 = NativeCompress(promoted1, mask1);
7113
-
7114
- const Vec256<uint16_t> demoted0 = DemoteTo(dh, BitCast(di32, compressed0));
7115
- const Vec256<uint16_t> demoted1 = DemoteTo(dh, BitCast(di32, compressed1));
7116
-
7117
- // Store 256-bit halves
7118
- StoreU(demoted0, dh, unaligned);
7119
- StoreU(demoted1, dh, unaligned + PopCount(maskL));
7120
- }
7121
-
7122
- // Finally, the remaining EmuCompress for wide vectors, using EmuCompressStore.
7123
- template <typename T> // 1 or 2 bytes
7124
- HWY_INLINE Vec512<T> EmuCompress(Vec512<T> v, Mask512<T> mask) {
7125
- const DFromV<decltype(v)> d;
7126
- alignas(64) T buf[2 * Lanes(d)];
7127
- EmuCompressStore(v, mask, d, buf);
7128
- return Load(d, buf);
7129
- }
7130
-
7131
- HWY_INLINE Vec256<uint8_t> EmuCompress(Vec256<uint8_t> v,
7132
- const Mask256<uint8_t> mask) {
7133
- const DFromV<decltype(v)> d;
7134
- alignas(32) uint8_t buf[2 * 32 / sizeof(uint8_t)];
7135
- EmuCompressStore(v, mask, d, buf);
7136
- return Load(d, buf);
7137
- }
7138
-
7139
- } // namespace detail
7140
-
7141
- template <class V, class M, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
7142
- HWY_API V Compress(V v, const M mask) {
7143
- const DFromV<decltype(v)> d;
7144
- const RebindToUnsigned<decltype(d)> du;
7145
- const auto mu = RebindMask(du, mask);
7146
- #if HWY_TARGET <= HWY_AVX3_DL // VBMI2
7147
- return BitCast(d, detail::NativeCompress(BitCast(du, v), mu));
7148
- #else
7149
- return BitCast(d, detail::EmuCompress(BitCast(du, v), mu));
7150
- #endif
7151
- }
7152
-
7153
- template <class V, class M, HWY_IF_T_SIZE_V(V, 4)>
7154
- HWY_API V Compress(V v, const M mask) {
7155
- const DFromV<decltype(v)> d;
7156
- const RebindToUnsigned<decltype(d)> du;
7157
- const auto mu = RebindMask(du, mask);
7158
- return BitCast(d, detail::NativeCompress(BitCast(du, v), mu));
7159
- }
7160
-
7161
6960
  template <typename T, HWY_IF_T_SIZE(T, 8)>
7162
6961
  HWY_API Vec512<T> Compress(Vec512<T> v, Mask512<T> mask) {
7163
6962
  // See CompressIsPartition. u64 is faster than u32.
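Most of the 8/16/32-bit Compress machinery removed here presumably moved into the shared headers added in this release (generic_ops-inl.h / x86_avx3-inl.h); only the table-based 64-bit path remains below. As a reminder of the operation's contract, a scalar sketch of Compress (lanes whose mask bit is set are packed toward index 0, order preserved):

  #include <cstddef>
  #include <cstdint>

  // Returns the number of lanes written; the remaining output lanes are unspecified.
  size_t CompressRef(const uint64_t* in, const bool* mask, size_t n,
                     uint64_t* out) {
    size_t count = 0;
    for (size_t i = 0; i < n; ++i) {
      if (mask[i]) out[count++] = in[i];
    }
    return count;
  }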
@@ -7222,6 +7021,56 @@ HWY_API Vec512<T> Compress(Vec512<T> v, Mask512<T> mask) {

  // ------------------------------ Expand

+ namespace detail {
+
+ #if HWY_TARGET <= HWY_AVX3_DL // VBMI2
+ HWY_INLINE Vec512<uint8_t> NativeExpand(Vec512<uint8_t> v,
+ Mask512<uint8_t> mask) {
+ return Vec512<uint8_t>{_mm512_maskz_expand_epi8(mask.raw, v.raw)};
+ }
+
+ HWY_INLINE Vec512<uint16_t> NativeExpand(Vec512<uint16_t> v,
+ Mask512<uint16_t> mask) {
+ return Vec512<uint16_t>{_mm512_maskz_expand_epi16(mask.raw, v.raw)};
+ }
+
+ template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U8_D(D)>
+ HWY_INLINE VFromD<D> NativeLoadExpand(Mask512<uint8_t> mask, D /* d */,
+ const uint8_t* HWY_RESTRICT unaligned) {
+ return VFromD<D>{_mm512_maskz_expandloadu_epi8(mask.raw, unaligned)};
+ }
+
+ template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)>
+ HWY_INLINE VFromD<D> NativeLoadExpand(Mask512<uint16_t> mask, D /* d */,
+ const uint16_t* HWY_RESTRICT unaligned) {
+ return VFromD<D>{_mm512_maskz_expandloadu_epi16(mask.raw, unaligned)};
+ }
+ #endif // HWY_TARGET <= HWY_AVX3_DL
+
+ HWY_INLINE Vec512<uint32_t> NativeExpand(Vec512<uint32_t> v,
+ Mask512<uint32_t> mask) {
+ return Vec512<uint32_t>{_mm512_maskz_expand_epi32(mask.raw, v.raw)};
+ }
+
+ HWY_INLINE Vec512<uint64_t> NativeExpand(Vec512<uint64_t> v,
+ Mask512<uint64_t> mask) {
+ return Vec512<uint64_t>{_mm512_maskz_expand_epi64(mask.raw, v.raw)};
+ }
+
+ template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U32_D(D)>
+ HWY_INLINE VFromD<D> NativeLoadExpand(Mask512<uint32_t> mask, D /* d */,
+ const uint32_t* HWY_RESTRICT unaligned) {
+ return VFromD<D>{_mm512_maskz_expandloadu_epi32(mask.raw, unaligned)};
+ }
+
+ template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U64_D(D)>
+ HWY_INLINE VFromD<D> NativeLoadExpand(Mask512<uint64_t> mask, D /* d */,
+ const uint64_t* HWY_RESTRICT unaligned) {
+ return VFromD<D>{_mm512_maskz_expandloadu_epi64(mask.raw, unaligned)};
+ }
+
+ } // namespace detail
+
  template <typename T, HWY_IF_T_SIZE(T, 1)>
  HWY_API Vec512<T> Expand(Vec512<T> v, const Mask512<T> mask) {
  const Full512<T> d;
@@ -7233,7 +7082,7 @@ HWY_API Vec512<T> Expand(Vec512<T> v, const Mask512<T> mask) {
  // LUTs are infeasible for 2^64 possible masks, so splice together two
  // half-vector Expand.
  const Full256<T> dh;
- constexpr size_t N = Lanes(d);
+ constexpr size_t N = MaxLanes(d);
  // We have to shift the input by a variable number of u8. Shuffling requires
  // VBMI2, in which case we would already have NativeExpand. We instead
  // load at an offset, which may incur a store to load forwarding stall.
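Reviewer aside (not part of the diff): Expand is the inverse of Compress — each active mask lane receives the next contiguous input lane, and inactive lanes are zeroed, which is what the _mm512_maskz_expand_* path above provides natively. A scalar sketch of that contract, with illustrative names only:

#include <cstddef>

// Scalar sketch of Expand semantics (my reading of the contract; inactive
// lanes are assumed to be zeroed, matching the maskz expand intrinsics).
template <typename T, size_t N>
void ExpandScalar(const T (&in)[N], const bool (&mask)[N], T (&out)[N]) {
  size_t next = 0;  // index of the next input lane to distribute
  for (size_t i = 0; i < N; ++i) {
    out[i] = mask[i] ? in[next++] : T{0};
  }
}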
@@ -7261,10 +7110,10 @@ HWY_API Vec512<T> Expand(Vec512<T> v, const Mask512<T> mask) {
  // LUTs are infeasible for 2^32 possible masks, so splice together two
  // half-vector Expand.
  const Full256<T> dh;
- constexpr size_t N = Lanes(d);
+ HWY_LANES_CONSTEXPR size_t N = Lanes(d);
  using Bits = typename Mask256<T>::Raw;
  const Mask256<T> maskL{
- static_cast<Bits>(mask.raw & Bits{(1ULL << (N / 2)) - 1})};
+ static_cast<Bits>(mask.raw & static_cast<Bits>((1ULL << (N / 2)) - 1))};
  const Mask256<T> maskH{static_cast<Bits>(mask.raw >> (N / 2))};
  // In AVX3 we can permutevar, which avoids a potential store to load
  // forwarding stall vs. reloading the input.
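Reviewer aside (not part of the diff): the cast change above appears to follow from N no longer being declared constexpr; a brace-initialization such as Bits{(1ULL << (N / 2)) - 1} would then be a narrowing conversion from a non-constant unsigned long long and be rejected, whereas static_cast truncates explicitly. A minimal repro under that assumption (LowHalfMaskBits is a made-up name):

#include <cstddef>
#include <cstdint>

using Bits = uint32_t;  // e.g. the raw word of a 32-lane mask

Bits LowHalfMaskBits(size_t n) {
  // return Bits{(1ULL << (n / 2)) - 1};            // narrowing from a
  //                                                // non-constant: rejected
  return static_cast<Bits>((1ULL << (n / 2)) - 1);  // explicit truncation: OK
}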
@@ -7336,11 +7185,6 @@ HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,

  // ------------------------------ CompressNot

- template <class V, class M, HWY_IF_NOT_T_SIZE_V(V, 8)>
- HWY_API V CompressNot(V v, const M mask) {
- return Compress(v, Not(mask));
- }
-
  template <typename T, HWY_IF_T_SIZE(T, 8)>
  HWY_API Vec512<T> CompressNot(Vec512<T> v, Mask512<T> mask) {
  // See CompressIsPartition. u64 is faster than u32.
@@ -7403,102 +7247,6 @@ HWY_API Vec512<T> CompressNot(Vec512<T> v, Mask512<T> mask) {
  return TableLookupLanes(v, indices);
  }

- // uint64_t lanes. Only implement for 256 and 512-bit vectors because this is a
- // no-op for 128-bit.
- template <class V, class M, HWY_IF_V_SIZE_GT_D(DFromV<V>, 16)>
- HWY_API V CompressBlocksNot(V v, M mask) {
- return CompressNot(v, mask);
- }
-
- // ------------------------------ CompressBits
- template <class V>
- HWY_API V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) {
- return Compress(v, LoadMaskBits(DFromV<V>(), bits));
- }
-
- // ------------------------------ CompressStore
-
- // Generic for all vector lengths.
-
- template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
- HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d,
- TFromD<D>* HWY_RESTRICT unaligned) {
- #if HWY_TARGET == HWY_AVX3_ZEN4
- StoreU(Compress(v, mask), d, unaligned);
- #else
- const RebindToUnsigned<decltype(d)> du;
- const auto mu = RebindMask(du, mask);
- auto pu = reinterpret_cast<TFromD<decltype(du)> * HWY_RESTRICT>(unaligned);
-
- #if HWY_TARGET <= HWY_AVX3_DL // VBMI2
- detail::NativeCompressStore(BitCast(du, v), mu, pu);
- #else
- detail::EmuCompressStore(BitCast(du, v), mu, du, pu);
- #endif
- #endif // HWY_TARGET != HWY_AVX3_ZEN4
- const size_t count = CountTrue(d, mask);
- detail::MaybeUnpoison(unaligned, count);
- return count;
- }
-
- template <class D, HWY_IF_NOT_FLOAT_D(D),
- HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
- HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d,
- TFromD<D>* HWY_RESTRICT unaligned) {
- #if HWY_TARGET == HWY_AVX3_ZEN4
- StoreU(Compress(v, mask), d, unaligned);
- #else
- const RebindToUnsigned<decltype(d)> du;
- const auto mu = RebindMask(du, mask);
- using TU = TFromD<decltype(du)>;
- TU* HWY_RESTRICT pu = reinterpret_cast<TU*>(unaligned);
- detail::NativeCompressStore(BitCast(du, v), mu, pu);
- #endif // HWY_TARGET != HWY_AVX3_ZEN4
- const size_t count = CountTrue(d, mask);
- detail::MaybeUnpoison(unaligned, count);
- return count;
- }
-
- // Additional overloads to avoid casting to uint32_t (delay?).
- template <class D, HWY_IF_FLOAT3264_D(D)>
- HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d,
- TFromD<D>* HWY_RESTRICT unaligned) {
- #if HWY_TARGET == HWY_AVX3_ZEN4
- StoreU(Compress(v, mask), d, unaligned);
- #else
- (void)d;
- detail::NativeCompressStore(v, mask, unaligned);
- #endif // HWY_TARGET != HWY_AVX3_ZEN4
- const size_t count = PopCount(uint64_t{mask.raw});
- detail::MaybeUnpoison(unaligned, count);
- return count;
- }
-
- // ------------------------------ CompressBlendedStore
- template <class D, HWY_IF_V_SIZE_GT_D(D, 8)>
- HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
- TFromD<D>* HWY_RESTRICT unaligned) {
- // Native CompressStore already does the blending at no extra cost (latency
- // 11, rthroughput 2 - same as compress plus store).
- if (HWY_TARGET == HWY_AVX3_DL ||
- (HWY_TARGET != HWY_AVX3_ZEN4 && sizeof(TFromD<D>) > 2)) {
- return CompressStore(v, m, d, unaligned);
- } else {
- const size_t count = CountTrue(d, m);
- BlendedStore(Compress(v, m), FirstN(d, count), d, unaligned);
- detail::MaybeUnpoison(unaligned, count);
- return count;
- }
- }
-
- // ------------------------------ CompressBitsStore
- // Generic for all vector lengths.
- template <class D>
- HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
- D d, TFromD<D>* HWY_RESTRICT unaligned) {
- return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
- }
-
  // ------------------------------ LoadInterleaved4

  // Actually implemented in generic_ops, we just overload LoadTransposedBlocks4.
@@ -7532,7 +7280,7 @@ Vec512<double> Shuffle128(const Vec512<double> lo, const Vec512<double> hi) {
  template <class D, HWY_IF_V_SIZE_D(D, 64)>
  HWY_API void LoadTransposedBlocks3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
  VFromD<D>& A, VFromD<D>& B, VFromD<D>& C) {
- constexpr size_t N = Lanes(d);
+ HWY_LANES_CONSTEXPR size_t N = Lanes(d);
  const VFromD<D> v3210 = LoadU(d, unaligned + 0 * N);
  const VFromD<D> v7654 = LoadU(d, unaligned + 1 * N);
  const VFromD<D> vba98 = LoadU(d, unaligned + 2 * N);
@@ -7559,7 +7307,7 @@ template <class D, HWY_IF_V_SIZE_D(D, 64)>
  HWY_API void LoadTransposedBlocks4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
  VFromD<D>& vA, VFromD<D>& vB, VFromD<D>& vC,
  VFromD<D>& vD) {
- constexpr size_t N = Lanes(d);
+ HWY_LANES_CONSTEXPR size_t N = Lanes(d);
  const VFromD<D> v3210 = LoadU(d, unaligned + 0 * N);
  const VFromD<D> v7654 = LoadU(d, unaligned + 1 * N);
  const VFromD<D> vba98 = LoadU(d, unaligned + 2 * N);
@@ -7592,7 +7340,7 @@ namespace detail {
  template <class D, HWY_IF_V_SIZE_D(D, 64)>
  HWY_API void StoreTransposedBlocks2(const VFromD<D> i, const VFromD<D> j, D d,
  TFromD<D>* HWY_RESTRICT unaligned) {
- constexpr size_t N = Lanes(d);
+ HWY_LANES_CONSTEXPR size_t N = Lanes(d);
  const auto j1_j0_i1_i0 = detail::Shuffle128<_MM_PERM_BABA>(i, j);
  const auto j3_j2_i3_i2 = detail::Shuffle128<_MM_PERM_DCDC>(i, j);
  const auto j1_i1_j0_i0 =
@@ -7615,7 +7363,7 @@ template <class D, HWY_IF_V_SIZE_D(D, 64)>
  HWY_API void StoreTransposedBlocks3(const VFromD<D> i, const VFromD<D> j,
  const VFromD<D> k, D d,
  TFromD<D>* HWY_RESTRICT unaligned) {
- constexpr size_t N = Lanes(d);
+ HWY_LANES_CONSTEXPR size_t N = Lanes(d);
  const VFromD<D> j2_j0_i2_i0 = detail::Shuffle128<_MM_PERM_CACA>(i, j);
  const VFromD<D> i3_i1_k2_k0 = detail::Shuffle128<_MM_PERM_DBCA>(k, i);
  const VFromD<D> j3_j1_k3_k1 = detail::Shuffle128<_MM_PERM_DBDB>(k, j);
@@ -7646,7 +7394,7 @@ template <class D, HWY_IF_V_SIZE_D(D, 64)>
  HWY_API void StoreTransposedBlocks4(const VFromD<D> i, const VFromD<D> j,
  const VFromD<D> k, const VFromD<D> l, D d,
  TFromD<D>* HWY_RESTRICT unaligned) {
- constexpr size_t N = Lanes(d);
+ HWY_LANES_CONSTEXPR size_t N = Lanes(d);
  const VFromD<D> j1_j0_i1_i0 = detail::Shuffle128<_MM_PERM_BABA>(i, j);
  const VFromD<D> l1_l0_k1_k0 = detail::Shuffle128<_MM_PERM_BABA>(k, l);
  const VFromD<D> j3_j2_i3_i2 = detail::Shuffle128<_MM_PERM_DCDC>(i, j);
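Reviewer aside (not part of the diff): the repeated constexpr → HWY_LANES_CONSTEXPR substitution presumably lets the same code compile on scalable targets (SVE, RVV), where Lanes(d) is a runtime value; on fixed-width targets such as AVX-512 the qualifier can stay constexpr. The assumed shape of the macro is shown below; the real definition lives elsewhere in Highway and may differ.

// Assumption only -- not copied from the diff:
#if HWY_HAVE_SCALABLE
#define HWY_LANES_CONSTEXPR            // Lanes(d) is not a compile-time constant
#else
#define HWY_LANES_CONSTEXPR constexpr  // fixed-width targets keep constexpr N
#endif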
@@ -7805,6 +7553,17 @@ HWY_API Vec512<int64_t> operator>>(const Vec512<int64_t> v,
  }

  // ------------------------------ WidenMulPairwiseAdd
+
+ #if HWY_NATIVE_DOT_BF16
+ template <class DF, HWY_IF_F32_D(DF), HWY_IF_V_SIZE_D(DF, 64),
+ class VBF = VFromD<Repartition<bfloat16_t, DF>>>
+ HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
+ return VFromD<DF>{_mm512_dpbf16_ps(Zero(df).raw,
+ reinterpret_cast<__m512bh>(a.raw),
+ reinterpret_cast<__m512bh>(b.raw))};
+ }
+ #endif // HWY_NATIVE_DOT_BF16
+
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I32_D(D)>
  HWY_API VFromD<D> WidenMulPairwiseAdd(D /*d32*/, Vec512<int16_t> a,
  Vec512<int16_t> b) {
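Reviewer aside (not part of the diff): the new bf16 overload maps WidenMulPairwiseAdd onto the AVX-512 BF16 dot-product instruction with a zero accumulator, so each f32 output lane holds, up to the rounding behavior of _mm512_dpbf16_ps, the sum of products of one adjacent bf16 pair from a and b. A scalar sketch with illustrative names:

#include <cstddef>
#include <cstdint>
#include <cstring>

// Convert one bfloat16 (stored as uint16_t) to float: bf16 is the upper
// 16 bits of an IEEE-754 binary32 value.
static float BF16ToF32(uint16_t b) {
  const uint32_t bits = static_cast<uint32_t>(b) << 16;
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

// Scalar sketch: output lane i is the dot product of the bf16 pair
// (a[2i], a[2i+1]) with (b[2i], b[2i+1]).
static void WidenMulPairwiseAddRef(const uint16_t* a, const uint16_t* b,
                                   float* out, size_t n_out) {
  for (size_t i = 0; i < n_out; ++i) {
    out[i] = BF16ToF32(a[2 * i]) * BF16ToF32(b[2 * i]) +
             BF16ToF32(a[2 * i + 1]) * BF16ToF32(b[2 * i + 1]);
  }
}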
@@ -7923,117 +7682,34 @@ HWY_API V BitShuffle(V v, VI idx) {
  }
  #endif // HWY_TARGET <= HWY_AVX3_DL

- // -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex
-
- template <class V, HWY_IF_UI32(TFromV<V>), HWY_IF_V_SIZE_V(V, 64)>
- HWY_API V LeadingZeroCount(V v) {
- return V{_mm512_lzcnt_epi32(v.raw)};
- }
+ // ------------------------------ MultiRotateRight

- template <class V, HWY_IF_UI64(TFromV<V>), HWY_IF_V_SIZE_V(V, 64)>
- HWY_API V LeadingZeroCount(V v) {
- return V{_mm512_lzcnt_epi64(v.raw)};
- }
-
- namespace detail {
-
- template <class V, HWY_IF_UNSIGNED_V(V),
- HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
- HWY_IF_LANES_LE_D(DFromV<V>, 16)>
- HWY_INLINE V Lzcnt32ForU8OrU16(V v) {
- const DFromV<decltype(v)> d;
- const Rebind<int32_t, decltype(d)> di32;
- const Rebind<uint32_t, decltype(d)> du32;
-
- const auto v_lz_count = LeadingZeroCount(PromoteTo(du32, v));
- return DemoteTo(d, BitCast(di32, v_lz_count));
- }
-
- template <class V, HWY_IF_UNSIGNED_V(V),
- HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
- HWY_IF_LANES_D(DFromV<V>, 32)>
- HWY_INLINE VFromD<Rebind<uint16_t, DFromV<V>>> Lzcnt32ForU8OrU16AsU16(V v) {
- const DFromV<decltype(v)> d;
- const Half<decltype(d)> dh;
- const Rebind<int32_t, decltype(dh)> di32;
- const Rebind<uint32_t, decltype(dh)> du32;
- const Rebind<uint16_t, decltype(d)> du16;
-
- const auto lo_v_lz_count =
- LeadingZeroCount(PromoteTo(du32, LowerHalf(dh, v)));
- const auto hi_v_lz_count =
- LeadingZeroCount(PromoteTo(du32, UpperHalf(dh, v)));
- return OrderedDemote2To(du16, BitCast(di32, lo_v_lz_count),
- BitCast(di32, hi_v_lz_count));
- }
-
- HWY_INLINE Vec256<uint8_t> Lzcnt32ForU8OrU16(Vec256<uint8_t> v) {
- const DFromV<decltype(v)> d;
- const Rebind<int16_t, decltype(d)> di16;
- return DemoteTo(d, BitCast(di16, Lzcnt32ForU8OrU16AsU16(v)));
- }
-
- HWY_INLINE Vec512<uint8_t> Lzcnt32ForU8OrU16(Vec512<uint8_t> v) {
- const DFromV<decltype(v)> d;
- const Half<decltype(d)> dh;
- const Rebind<int16_t, decltype(dh)> di16;
+ #if HWY_TARGET <= HWY_AVX3_DL

- const auto lo_half = LowerHalf(dh, v);
- const auto hi_half = UpperHalf(dh, v);
+ #ifdef HWY_NATIVE_MULTIROTATERIGHT
+ #undef HWY_NATIVE_MULTIROTATERIGHT
+ #else
+ #define HWY_NATIVE_MULTIROTATERIGHT
+ #endif

- const auto lo_v_lz_count = BitCast(di16, Lzcnt32ForU8OrU16AsU16(lo_half));
- const auto hi_v_lz_count = BitCast(di16, Lzcnt32ForU8OrU16AsU16(hi_half));
- return OrderedDemote2To(d, lo_v_lz_count, hi_v_lz_count);
+ template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>),
+ HWY_IF_V_SIZE_V(V, 64), HWY_IF_V_SIZE_V(VI, HWY_MAX_LANES_V(V) * 8)>
+ HWY_API V MultiRotateRight(V v, VI idx) {
+ return V{_mm512_multishift_epi64_epi8(idx.raw, v.raw)};
  }

- HWY_INLINE Vec512<uint16_t> Lzcnt32ForU8OrU16(Vec512<uint16_t> v) {
- return Lzcnt32ForU8OrU16AsU16(v);
- }
+ #endif

- } // namespace detail
+ // -------------------- LeadingZeroCount

- template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
- HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
+ template <class V, HWY_IF_UI32(TFromV<V>), HWY_IF_V_SIZE_V(V, 64)>
  HWY_API V LeadingZeroCount(V v) {
- const DFromV<decltype(v)> d;
- const RebindToUnsigned<decltype(d)> du;
- using TU = TFromD<decltype(du)>;
-
- constexpr TU kNumOfBitsInT{sizeof(TU) * 8};
- const auto v_lzcnt32 = detail::Lzcnt32ForU8OrU16(BitCast(du, v));
- return BitCast(d, Min(v_lzcnt32 - Set(du, TU{32 - kNumOfBitsInT}),
- Set(du, TU{kNumOfBitsInT})));
- }
-
- template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
- HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
- HWY_API V HighestSetBitIndex(V v) {
- const DFromV<decltype(v)> d;
- const RebindToUnsigned<decltype(d)> du;
- using TU = TFromD<decltype(du)>;
- return BitCast(d,
- Set(du, TU{31}) - detail::Lzcnt32ForU8OrU16(BitCast(du, v)));
- }
-
- template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
- HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))>
- HWY_API V HighestSetBitIndex(V v) {
- const DFromV<decltype(v)> d;
- using T = TFromD<decltype(d)>;
- return BitCast(d, Set(d, T{sizeof(T) * 8 - 1}) - LeadingZeroCount(v));
+ return V{_mm512_lzcnt_epi32(v.raw)};
  }

- template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
- HWY_API V TrailingZeroCount(V v) {
- const DFromV<decltype(v)> d;
- const RebindToSigned<decltype(d)> di;
- using T = TFromD<decltype(d)>;
-
- const auto vi = BitCast(di, v);
- const auto lowest_bit = BitCast(d, And(vi, Neg(vi)));
- constexpr T kNumOfBitsInT{sizeof(T) * 8};
- const auto bit_idx = HighestSetBitIndex(lowest_bit);
- return IfThenElse(MaskFromVec(bit_idx), Set(d, kNumOfBitsInT), bit_idx);
+ template <class V, HWY_IF_UI64(TFromV<V>), HWY_IF_V_SIZE_V(V, 64)>
+ HWY_API V LeadingZeroCount(V v) {
+ return V{_mm512_lzcnt_epi64(v.raw)};
  }

  // NOLINTNEXTLINE(google-readability-namespace-comments)
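Reviewer aside (not part of the diff): the new MultiRotateRight above wraps _mm512_multishift_epi64_epi8 (VPMULTISHIFTQB). As I read that instruction, each result byte within a 64-bit lane is an 8-bit field of that lane, read with wraparound starting at the bit offset given by the low 6 bits of the corresponding control byte — effectively a per-byte rotate-right. A scalar sketch of one lane, with illustrative names:

#include <cstddef>
#include <cstdint>

// For each of the 8 result bytes, rotate the 64-bit lane right by the offset
// held in the matching control byte (low 6 bits) and keep the low 8 bits.
static uint64_t MultiRotateRightLane(uint64_t v, uint64_t idx) {
  uint64_t result = 0;
  for (size_t j = 0; j < 8; ++j) {
    const unsigned shift = static_cast<unsigned>((idx >> (8 * j)) & 63);
    const uint64_t rotated = (v >> shift) | (v << ((64 - shift) & 63));
    result |= (rotated & 0xFFu) << (8 * j);
  }
  return result;
}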