@img/sharp-libvips-dev 1.2.1 → 1.2.2-rc.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. package/include/ffi.h +3 -3
  2. package/include/harfbuzz/hb-version.h +3 -3
  3. package/include/hwy/abort.h +2 -19
  4. package/include/hwy/aligned_allocator.h +11 -7
  5. package/include/hwy/auto_tune.h +504 -0
  6. package/include/hwy/base.h +425 -104
  7. package/include/hwy/cache_control.h +16 -0
  8. package/include/hwy/detect_compiler_arch.h +32 -1
  9. package/include/hwy/detect_targets.h +251 -67
  10. package/include/hwy/foreach_target.h +35 -0
  11. package/include/hwy/highway.h +185 -76
  12. package/include/hwy/nanobenchmark.h +1 -19
  13. package/include/hwy/ops/arm_neon-inl.h +969 -458
  14. package/include/hwy/ops/arm_sve-inl.h +1137 -359
  15. package/include/hwy/ops/emu128-inl.h +97 -11
  16. package/include/hwy/ops/generic_ops-inl.h +1222 -34
  17. package/include/hwy/ops/loongarch_lasx-inl.h +4664 -0
  18. package/include/hwy/ops/loongarch_lsx-inl.h +5933 -0
  19. package/include/hwy/ops/ppc_vsx-inl.h +306 -126
  20. package/include/hwy/ops/rvv-inl.h +546 -51
  21. package/include/hwy/ops/scalar-inl.h +77 -22
  22. package/include/hwy/ops/set_macros-inl.h +138 -17
  23. package/include/hwy/ops/shared-inl.h +50 -10
  24. package/include/hwy/ops/wasm_128-inl.h +137 -92
  25. package/include/hwy/ops/x86_128-inl.h +773 -214
  26. package/include/hwy/ops/x86_256-inl.h +712 -255
  27. package/include/hwy/ops/x86_512-inl.h +429 -753
  28. package/include/hwy/ops/x86_avx3-inl.h +501 -0
  29. package/include/hwy/per_target.h +2 -1
  30. package/include/hwy/profiler.h +622 -486
  31. package/include/hwy/targets.h +62 -20
  32. package/include/hwy/timer-inl.h +8 -160
  33. package/include/hwy/timer.h +170 -3
  34. package/include/hwy/x86_cpuid.h +81 -0
  35. package/include/libheif/heif_cxx.h +25 -5
  36. package/include/libheif/heif_regions.h +5 -5
  37. package/include/libheif/heif_version.h +2 -2
  38. package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
  39. package/include/pango-1.0/pango/pango-enum-types.h +3 -0
  40. package/include/pango-1.0/pango/pango-features.h +3 -3
  41. package/include/pango-1.0/pango/pango-font.h +30 -0
  42. package/include/pango-1.0/pango/pango-version-macros.h +26 -0
  43. package/include/zlib.h +3 -3
  44. package/package.json +1 -1
  45. package/versions.json +8 -8
@@ -878,10 +878,47 @@ HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) {
878
878
  }
879
879
 
880
880
  // ------------------------------ Reverse
881
+ #if HWY_S390X_HAVE_Z14 && HWY_COMPILER_GCC_ACTUAL && \
882
+ HWY_COMPILER_GCC_ACTUAL < 900
883
+ // Workaround for missing vec_reve on Z14 with GCC 8 or earlier
884
+ template <class D, typename T = TFromD<D>, HWY_IF_LANES_GT_D(D, 1),
885
+ HWY_IF_T_SIZE_D(D, 1)>
886
+ HWY_API Vec128<T> Reverse(D d, Vec128<T> v) {
887
+ const Repartition<uint8_t, decltype(d)> du8;
888
+ return TableLookupBytes(
889
+ v, BitCast(d, Dup128VecFromValues(du8, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
890
+ 5, 4, 3, 2, 1, 0)));
891
+ }
892
+
893
+ template <class D, typename T = TFromD<D>, HWY_IF_LANES_GT_D(D, 1),
894
+ HWY_IF_T_SIZE_D(D, 2)>
895
+ HWY_API Vec128<T> Reverse(D d, Vec128<T> v) {
896
+ const Repartition<uint8_t, decltype(d)> du8;
897
+ return TableLookupBytes(
898
+ v, BitCast(d, Dup128VecFromValues(du8, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7,
899
+ 4, 5, 2, 3, 0, 1)));
900
+ }
901
+
902
+ template <class D, typename T = TFromD<D>, HWY_IF_LANES_GT_D(D, 1),
903
+ HWY_IF_T_SIZE_D(D, 4)>
904
+ HWY_API Vec128<T> Reverse(D d, Vec128<T> v) {
905
+ const Repartition<uint8_t, decltype(d)> du8;
906
+ return TableLookupBytes(
907
+ v, BitCast(d, Dup128VecFromValues(du8, 12, 13, 14, 15, 8, 9, 10, 11, 4, 5,
908
+ 6, 7, 0, 1, 2, 3)));
909
+ }
910
+
911
+ template <class D, typename T = TFromD<D>, HWY_IF_LANES_GT_D(D, 1),
912
+ HWY_IF_T_SIZE_D(D, 8)>
913
+ HWY_API Vec128<T> Reverse(D /* tag */, Vec128<T> v) {
914
+ return Vec128<T>{vec_sld(v.raw, v.raw, 8)};
915
+ }
916
+ #else
881
917
  template <class D, typename T = TFromD<D>, HWY_IF_LANES_GT_D(D, 1)>
882
918
  HWY_API Vec128<T> Reverse(D /* tag */, Vec128<T> v) {
883
919
  return Vec128<T>{vec_reve(v.raw)};
884
920
  }
921
+ #endif
885
922
 
886
923
  // ------------------------------ Shuffles (Reverse)
887
924
 
@@ -1554,12 +1591,33 @@ HWY_API V SaturatedSub(V a, V b) {
1554
1591
 
1555
1592
  // Returns (a + b + 1) / 2
1556
1593
 
1557
- template <typename T, size_t N, HWY_IF_UNSIGNED(T),
1558
- HWY_IF_T_SIZE_ONE_OF(T, 0x6)>
1594
+ #ifdef HWY_NATIVE_AVERAGE_ROUND_UI32
1595
+ #undef HWY_NATIVE_AVERAGE_ROUND_UI32
1596
+ #else
1597
+ #define HWY_NATIVE_AVERAGE_ROUND_UI32
1598
+ #endif
1599
+
1600
+ #if HWY_S390X_HAVE_Z14
1601
+ #ifdef HWY_NATIVE_AVERAGE_ROUND_UI64
1602
+ #undef HWY_NATIVE_AVERAGE_ROUND_UI64
1603
+ #else
1604
+ #define HWY_NATIVE_AVERAGE_ROUND_UI64
1605
+ #endif
1606
+
1607
+ #define HWY_PPC_IF_AVERAGE_ROUND_T(T) void* = nullptr
1608
+ #else // !HWY_S390X_HAVE_Z14
1609
+ #define HWY_PPC_IF_AVERAGE_ROUND_T(T) \
1610
+ HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))
1611
+ #endif // HWY_S390X_HAVE_Z14
1612
+
1613
+ template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
1614
+ HWY_PPC_IF_AVERAGE_ROUND_T(T)>
1559
1615
  HWY_API Vec128<T, N> AverageRound(Vec128<T, N> a, Vec128<T, N> b) {
1560
1616
  return Vec128<T, N>{vec_avg(a.raw, b.raw)};
1561
1617
  }
1562
1618
 
1619
+ #undef HWY_PPC_IF_AVERAGE_ROUND_T
1620
+
1563
1621
  // ------------------------------ Multiplication
1564
1622
 
1565
1623
  // Per-target flags to prevent generic_ops-inl.h defining 8/64-bit operator*.
@@ -1918,6 +1976,23 @@ HWY_API Vec128<T, N> Sqrt(Vec128<T, N> v) {
1918
1976
  return Vec128<T, N>{vec_sqrt(v.raw)};
1919
1977
  }
1920
1978
 
1979
+ // ------------------------------ GetBiasedExponent
1980
+
1981
+ #if HWY_PPC_HAVE_9
1982
+
1983
+ #ifdef HWY_NATIVE_GET_BIASED_EXPONENT
1984
+ #undef HWY_NATIVE_GET_BIASED_EXPONENT
1985
+ #else
1986
+ #define HWY_NATIVE_GET_BIASED_EXPONENT
1987
+ #endif
1988
+
1989
+ template <class V, HWY_IF_FLOAT3264_V(V)>
1990
+ HWY_API VFromD<RebindToUnsigned<DFromV<V>>> GetBiasedExponent(V v) {
1991
+ return VFromD<RebindToUnsigned<DFromV<V>>>{vec_extract_exp(v.raw)};
1992
+ }
1993
+
1994
+ #endif // HWY_PPC_HAVE_9
1995
+
1921
1996
  // ------------------------------ Min (Gt, IfThenElse)
1922
1997
 
1923
1998
  template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
@@ -2522,8 +2597,10 @@ HWY_API Vec32<T> Reverse(D d, Vec32<T> v) {
2522
2597
 
2523
2598
  // ------------------------------- ReverseLaneBytes
2524
2599
 
2525
- #if (HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14) && \
2526
- (HWY_COMPILER_GCC_ACTUAL >= 710 || HWY_COMPILER_CLANG >= 400)
2600
+ #if (HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14) && \
2601
+ ((!HWY_S390X_HAVE_Z14 && HWY_COMPILER_GCC_ACTUAL >= 710) || \
2602
+ (HWY_S390X_HAVE_Z14 && HWY_COMPILER_GCC_ACTUAL >= 900) || \
2603
+ HWY_COMPILER_CLANG >= 400)
2527
2604
 
2528
2605
  // Per-target flag to prevent generic_ops-inl.h defining 8-bit ReverseLaneBytes.
2529
2606
  #ifdef HWY_NATIVE_REVERSE_LANE_BYTES
@@ -3279,12 +3356,22 @@ HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
3279
3356
  }
3280
3357
 
3281
3358
  // ------------------------------ SwapAdjacentBlocks
3282
-
3283
3359
  template <typename T, size_t N>
3284
3360
  HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
3285
3361
  return v;
3286
3362
  }
3287
3363
 
3364
+ // ------------------------------ InterleaveEvenBlocks
3365
+ template <class D, class V = VFromD<D>>
3366
+ HWY_API V InterleaveEvenBlocks(D, V a, V /*b*/) {
3367
+ return a;
3368
+ }
3369
+ // ------------------------------ InterleaveOddBlocks
3370
+ template <class D, class V = VFromD<D>>
3371
+ HWY_API V InterleaveOddBlocks(D, V a, V /*b*/) {
3372
+ return a;
3373
+ }
3374
+
3288
3375
  // ------------------------------ MulFixedPoint15 (OddEven)
3289
3376
 
3290
3377
  #if HWY_S390X_HAVE_Z14
@@ -3630,6 +3717,10 @@ HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
3630
3717
  const __vector float raw_v = InterleaveLower(v, v).raw;
3631
3718
  #if HWY_IS_LITTLE_ENDIAN
3632
3719
  return VFromD<D>{vec_doubleo(raw_v)};
3720
+ #elif HWY_S390X_HAVE_Z14 && HWY_COMPILER_GCC_ACTUAL && \
3721
+ HWY_COMPILER_GCC_ACTUAL < 1000
3722
+ // Workaround for compiler errors with GCC 9 or earlier on Z14
3723
+ return VFromD<D>{__builtin_s390_vflls(raw_v)};
3633
3724
  #else
3634
3725
  return VFromD<D>{vec_doublee(raw_v)};
3635
3726
  #endif
@@ -3680,16 +3771,73 @@ static HWY_INLINE V VsxF2INormalizeSrcVals(V v) {
3680
3771
  #endif
3681
3772
  }
3682
3773
 
3774
+ template <class VF32>
3775
+ static HWY_INLINE HWY_MAYBE_UNUSED VFromD<Repartition<int64_t, DFromV<VF32>>>
3776
+ VsxXvcvspsxds(VF32 vf32) {
3777
+ using VI64 = VFromD<Repartition<int64_t, DFromV<VF32>>>;
3778
+ #if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1500) || \
3779
+ HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds)
3780
+ // Use __builtin_vsx_xvcvspsxds if it is available (which is the case with
3781
+ // GCC 4.8 through GCC 14 or Clang 13 or later on PPC8/PPC9/PPC10)
3782
+ return VI64{__builtin_vsx_xvcvspsxds(vf32.raw)};
3783
+ #elif HWY_COMPILER_GCC_ACTUAL >= 1500 && HWY_IS_LITTLE_ENDIAN
3784
+ // On little-endian PPC8/PPC9/PPC10 with GCC 15 or later, use the F32->I64
3785
+ // vec_signedo intrinsic as the __builtin_vsx_xvcvspsxds intrinsic has been
3786
+ // removed from GCC in GCC 15
3787
+ return VI64{vec_signedo(vf32.raw)};
3788
+ #elif HWY_COMPILER_GCC_ACTUAL >= 1500 && HWY_IS_BIG_ENDIAN
3789
+ // On big-endian PPC8/PPC9/PPC10 with GCC 15 or later, use the F32->I64
3790
+ // vec_signede intrinsic as the __builtin_vsx_xvcvspsxds intrinsic has been
3791
+ // removed from GCC in GCC 15
3792
+ return VI64{vec_signede(vf32.raw)};
3793
+ #else
3794
+ // Inline assembly fallback for older versions of Clang that do not have the
3795
+ // __builtin_vsx_xvcvspsxds intrinsic
3796
+ __vector signed long long raw_result;
3797
+ __asm__("xvcvspsxds %x0, %x1" : "=wa"(raw_result) : "wa"(vf32.raw) :);
3798
+ return VI64{raw_result};
3799
+ #endif
3800
+ }
3801
+
3802
+ template <class VF32>
3803
+ static HWY_INLINE HWY_MAYBE_UNUSED VFromD<Repartition<uint64_t, DFromV<VF32>>>
3804
+ VsxXvcvspuxds(VF32 vf32) {
3805
+ using VU64 = VFromD<Repartition<uint64_t, DFromV<VF32>>>;
3806
+ #if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1500) || \
3807
+ HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds)
3808
+ // Use __builtin_vsx_xvcvspuxds if it is available (which is the case with
3809
+ // GCC 4.8 through GCC 14 or Clang 13 or later on PPC8/PPC9/PPC10)
3810
+ return VU64{reinterpret_cast<__vector unsigned long long>(
3811
+ __builtin_vsx_xvcvspuxds(vf32.raw))};
3812
+ #elif HWY_COMPILER_GCC_ACTUAL >= 1500 && HWY_IS_LITTLE_ENDIAN
3813
+ // On little-endian PPC8/PPC9/PPC10 with GCC 15 or later, use the F32->U64
3814
+ // vec_unsignedo intrinsic as the __builtin_vsx_xvcvspuxds intrinsic has been
3815
+ // removed from GCC in GCC 15
3816
+ return VU64{vec_unsignedo(vf32.raw)};
3817
+ #elif HWY_COMPILER_GCC_ACTUAL >= 1500 && HWY_IS_BIG_ENDIAN
3818
+ // On big-endian PPC8/PPC9/PPC10 with GCC 15 or later, use the F32->U64
3819
+ // vec_unsignede intrinsic as the __builtin_vsx_xvcvspuxds intrinsic has been
3820
+ // removed from GCC in GCC 15
3821
+ return VU64{vec_unsignede(vf32.raw)};
3822
+ #else
3823
+ // Inline assembly fallback for older versions of Clang that do not have the
3824
+ // __builtin_vsx_xvcvspuxds intrinsic
3825
+ __vector unsigned long long raw_result;
3826
+ __asm__("xvcvspuxds %x0, %x1" : "=wa"(raw_result) : "wa"(vf32.raw) :);
3827
+ return VU64{raw_result};
3828
+ #endif
3829
+ }
3830
+
3683
3831
  } // namespace detail
3684
3832
  #endif // !HWY_S390X_HAVE_Z14
3685
3833
 
3686
3834
  template <class D, HWY_IF_I64_D(D)>
3687
3835
  HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
3688
- #if !HWY_S390X_HAVE_Z14 && \
3689
- (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds))
3690
- const __vector float raw_v =
3691
- detail::VsxF2INormalizeSrcVals(InterleaveLower(v, v)).raw;
3692
- return VFromD<decltype(di64)>{__builtin_vsx_xvcvspsxds(raw_v)};
3836
+ #if !HWY_S390X_HAVE_Z14
3837
+ const Repartition<float, decltype(di64)> dt_f32;
3838
+ const auto vt_f32 = ResizeBitCast(dt_f32, v);
3839
+ return detail::VsxXvcvspsxds(
3840
+ detail::VsxF2INormalizeSrcVals(InterleaveLower(vt_f32, vt_f32)));
3693
3841
  #else
3694
3842
  const RebindToFloat<decltype(di64)> df64;
3695
3843
  return ConvertTo(di64, PromoteTo(df64, v));
@@ -3698,12 +3846,11 @@ HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
3698
3846
 
3699
3847
  template <class D, HWY_IF_U64_D(D)>
3700
3848
  HWY_API VFromD<D> PromoteTo(D du64, VFromD<Rebind<float, D>> v) {
3701
- #if !HWY_S390X_HAVE_Z14 && \
3702
- (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds))
3703
- const __vector float raw_v =
3704
- detail::VsxF2INormalizeSrcVals(InterleaveLower(v, v)).raw;
3705
- return VFromD<decltype(du64)>{reinterpret_cast<__vector unsigned long long>(
3706
- __builtin_vsx_xvcvspuxds(raw_v))};
3849
+ #if !HWY_S390X_HAVE_Z14
3850
+ const Repartition<float, decltype(du64)> dt_f32;
3851
+ const auto vt_f32 = ResizeBitCast(dt_f32, v);
3852
+ return detail::VsxXvcvspuxds(
3853
+ detail::VsxF2INormalizeSrcVals(InterleaveLower(vt_f32, vt_f32)));
3707
3854
  #else
3708
3855
  const RebindToFloat<decltype(du64)> df64;
3709
3856
  return ConvertTo(du64, PromoteTo(df64, v));
@@ -3767,6 +3914,10 @@ HWY_API VFromD<D> PromoteUpperTo(D /*tag*/, Vec128<float> v) {
3767
3914
  const __vector float raw_v = InterleaveUpper(Full128<float>(), v, v).raw;
3768
3915
  #if HWY_IS_LITTLE_ENDIAN
3769
3916
  return VFromD<D>{vec_doubleo(raw_v)};
3917
+ #elif HWY_S390X_HAVE_Z14 && HWY_COMPILER_GCC_ACTUAL && \
3918
+ HWY_COMPILER_GCC_ACTUAL < 1000
3919
+ // Workaround for compiler error with GCC 9 or earlier on Z14
3920
+ return VFromD<D>{__builtin_s390_vflls(raw_v)};
3770
3921
  #else
3771
3922
  return VFromD<D>{vec_doublee(raw_v)};
3772
3923
  #endif
@@ -3808,12 +3959,10 @@ HWY_API VFromD<D> PromoteUpperTo(D df64, Vec128<uint32_t> v) {
3808
3959
 
3809
3960
  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)>
3810
3961
  HWY_API VFromD<D> PromoteUpperTo(D di64, Vec128<float> v) {
3811
- #if !HWY_S390X_HAVE_Z14 && \
3812
- (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds))
3813
- const __vector float raw_v =
3814
- detail::VsxF2INormalizeSrcVals(InterleaveUpper(Full128<float>(), v, v))
3815
- .raw;
3816
- return VFromD<decltype(di64)>{__builtin_vsx_xvcvspsxds(raw_v)};
3962
+ #if !HWY_S390X_HAVE_Z14
3963
+ (void)di64;
3964
+ return detail::VsxXvcvspsxds(
3965
+ detail::VsxF2INormalizeSrcVals(InterleaveUpper(Full128<float>(), v, v)));
3817
3966
  #else
3818
3967
  const RebindToFloat<decltype(di64)> df64;
3819
3968
  return ConvertTo(di64, PromoteUpperTo(df64, v));
@@ -3822,13 +3971,10 @@ HWY_API VFromD<D> PromoteUpperTo(D di64, Vec128<float> v) {
3822
3971
 
3823
3972
  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U64_D(D)>
3824
3973
  HWY_API VFromD<D> PromoteUpperTo(D du64, Vec128<float> v) {
3825
- #if !HWY_S390X_HAVE_Z14 && \
3826
- (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds))
3827
- const __vector float raw_v =
3828
- detail::VsxF2INormalizeSrcVals(InterleaveUpper(Full128<float>(), v, v))
3829
- .raw;
3830
- return VFromD<decltype(du64)>{reinterpret_cast<__vector unsigned long long>(
3831
- __builtin_vsx_xvcvspuxds(raw_v))};
3974
+ #if !HWY_S390X_HAVE_Z14
3975
+ (void)du64;
3976
+ return detail::VsxXvcvspuxds(
3977
+ detail::VsxF2INormalizeSrcVals(InterleaveUpper(Full128<float>(), v, v)));
3832
3978
  #else
3833
3979
  const RebindToFloat<decltype(du64)> df64;
3834
3980
  return ConvertTo(du64, PromoteUpperTo(df64, v));
@@ -3916,20 +4062,18 @@ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
3916
4062
  hwy::SizeTag<8> /*to_lane_size_tag*/,
3917
4063
  hwy::FloatTag /*from_type_tag*/, D d_to,
3918
4064
  V v) {
3919
- #if !HWY_S390X_HAVE_Z14 && \
3920
- (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds))
4065
+ #if !HWY_S390X_HAVE_Z14
3921
4066
  (void)d_to;
3922
4067
  const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
3923
4068
  #if HWY_IS_LITTLE_ENDIAN
3924
- // __builtin_vsx_xvcvspsxds expects the source values to be in the odd lanes
3925
- // on little-endian PPC, and the vec_sld operation below will shift the even
4069
+ // VsxXvcvspsxds expects the source values to be in the odd lanes on
4070
+ // little-endian PPC, and the Shuffle2103 operation below will shift the even
3926
4071
  // lanes of normalized_v into the odd lanes.
3927
- return VFromD<D>{
3928
- __builtin_vsx_xvcvspsxds(vec_sld(normalized_v.raw, normalized_v.raw, 4))};
4072
+ return VsxXvcvspsxds(Shuffle2103(normalized_v));
3929
4073
  #else
3930
- // __builtin_vsx_xvcvspsxds expects the source values to be in the even lanes
3931
- // on big-endian PPC.
3932
- return VFromD<D>{__builtin_vsx_xvcvspsxds(normalized_v.raw)};
4074
+ // VsxXvcvspsxds expects the source values to be in the even lanes on
4075
+ // big-endian PPC.
4076
+ return VsxXvcvspsxds(normalized_v);
3933
4077
  #endif
3934
4078
  #else
3935
4079
  const RebindToFloat<decltype(d_to)> df64;
@@ -3944,22 +4088,18 @@ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::UnsignedTag /*to_type_tag*/,
3944
4088
  hwy::SizeTag<8> /*to_lane_size_tag*/,
3945
4089
  hwy::FloatTag /*from_type_tag*/, D d_to,
3946
4090
  V v) {
3947
- #if !HWY_S390X_HAVE_Z14 && \
3948
- (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds))
4091
+ #if !HWY_S390X_HAVE_Z14
3949
4092
  (void)d_to;
3950
4093
  const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
3951
4094
  #if HWY_IS_LITTLE_ENDIAN
3952
- // __builtin_vsx_xvcvspuxds expects the source values to be in the odd lanes
3953
- // on little-endian PPC, and the vec_sld operation below will shift the even
3954
- // lanes of normalized_v into the odd lanes.
3955
- return VFromD<D>{
3956
- reinterpret_cast<__vector unsigned long long>(__builtin_vsx_xvcvspuxds(
3957
- vec_sld(normalized_v.raw, normalized_v.raw, 4)))};
4095
+ // VsxXvcvspuxds expects the source values to be in the odd lanes
4096
+ // on little-endian PPC, and the Shuffle2103 operation below will shift the
4097
+ // even lanes of normalized_v into the odd lanes.
4098
+ return VsxXvcvspuxds(Shuffle2103(normalized_v));
3958
4099
  #else
3959
- // __builtin_vsx_xvcvspuxds expects the source values to be in the even lanes
4100
+ // VsxXvcvspuxds expects the source values to be in the even lanes
3960
4101
  // on big-endian PPC.
3961
- return VFromD<D>{reinterpret_cast<__vector unsigned long long>(
3962
- __builtin_vsx_xvcvspuxds(normalized_v.raw))};
4102
+ return VsxXvcvspuxds(normalized_v);
3963
4103
  #endif
3964
4104
  #else
3965
4105
  const RebindToFloat<decltype(d_to)> df64;
@@ -4001,20 +4141,18 @@ HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
4001
4141
  hwy::SizeTag<8> /*to_lane_size_tag*/,
4002
4142
  hwy::FloatTag /*from_type_tag*/, D d_to,
4003
4143
  V v) {
4004
- #if !HWY_S390X_HAVE_Z14 && \
4005
- (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds))
4144
+ #if !HWY_S390X_HAVE_Z14
4006
4145
  (void)d_to;
4007
4146
  const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
4008
4147
  #if HWY_IS_LITTLE_ENDIAN
4009
- // __builtin_vsx_xvcvspsxds expects the source values to be in the odd lanes
4148
+ // VsxXvcvspsxds expects the source values to be in the odd lanes
4010
4149
  // on little-endian PPC
4011
- return VFromD<D>{__builtin_vsx_xvcvspsxds(normalized_v.raw)};
4150
+ return VsxXvcvspsxds(normalized_v);
4012
4151
  #else
4013
- // __builtin_vsx_xvcvspsxds expects the source values to be in the even lanes
4014
- // on big-endian PPC, and the vec_sld operation below will shift the odd lanes
4015
- // of normalized_v into the even lanes.
4016
- return VFromD<D>{
4017
- __builtin_vsx_xvcvspsxds(vec_sld(normalized_v.raw, normalized_v.raw, 4))};
4152
+ // VsxXvcvspsxds expects the source values to be in the even lanes
4153
+ // on big-endian PPC, and the Shuffle0321 operation below will shift the odd
4154
+ // lanes of normalized_v into the even lanes.
4155
+ return VsxXvcvspsxds(Shuffle0321(normalized_v));
4018
4156
  #endif
4019
4157
  #else
4020
4158
  const RebindToFloat<decltype(d_to)> df64;
@@ -4029,22 +4167,18 @@ HWY_INLINE VFromD<D> PromoteOddTo(hwy::UnsignedTag /*to_type_tag*/,
4029
4167
  hwy::SizeTag<8> /*to_lane_size_tag*/,
4030
4168
  hwy::FloatTag /*from_type_tag*/, D d_to,
4031
4169
  V v) {
4032
- #if !HWY_S390X_HAVE_Z14 && \
4033
- (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds))
4170
+ #if !HWY_S390X_HAVE_Z14
4034
4171
  (void)d_to;
4035
4172
  const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
4036
4173
  #if HWY_IS_LITTLE_ENDIAN
4037
- // __builtin_vsx_xvcvspuxds expects the source values to be in the odd lanes
4174
+ // VsxXvcvspuxds expects the source values to be in the odd lanes
4038
4175
  // on little-endian PPC
4039
- return VFromD<D>{reinterpret_cast<__vector unsigned long long>(
4040
- __builtin_vsx_xvcvspuxds(normalized_v.raw))};
4176
+ return VsxXvcvspuxds(normalized_v);
4041
4177
  #else
4042
- // __builtin_vsx_xvcvspuxds expects the source values to be in the even lanes
4043
- // on big-endian PPC, and the vec_sld operation below will shift the odd lanes
4044
- // of normalized_v into the even lanes.
4045
- return VFromD<D>{
4046
- reinterpret_cast<__vector unsigned long long>(__builtin_vsx_xvcvspuxds(
4047
- vec_sld(normalized_v.raw, normalized_v.raw, 4)))};
4178
+ // VsxXvcvspuxds expects the source values to be in the even lanes
4179
+ // on big-endian PPC, and the Shuffle0321 operation below will shift the odd
4180
+ // lanes of normalized_v into the even lanes.
4181
+ return VsxXvcvspuxds(Shuffle0321(normalized_v));
4048
4182
  #endif
4049
4183
  #else
4050
4184
  const RebindToFloat<decltype(d_to)> df64;
@@ -4388,12 +4522,22 @@ HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
4388
4522
 
4389
4523
  template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
4390
4524
  HWY_API Vec32<float> DemoteTo(D /* tag */, Vec64<double> v) {
4525
+ #if HWY_S390X_HAVE_Z14 && HWY_COMPILER_GCC_ACTUAL && \
4526
+ HWY_COMPILER_GCC_ACTUAL < 1000
4527
+ // Workaround for compiler error with GCC 9 or earlier on Z14
4528
+ return Vec32<float>{__builtin_s390_vflrd(v.raw, 0, 0)};
4529
+ #else
4391
4530
  return Vec32<float>{vec_floate(v.raw)};
4531
+ #endif
4392
4532
  }
4393
4533
 
4394
4534
  template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
4395
4535
  HWY_API Vec64<float> DemoteTo(D d, Vec128<double> v) {
4396
- #if HWY_S390X_HAVE_Z14 || HWY_IS_LITTLE_ENDIAN
4536
+ #if HWY_S390X_HAVE_Z14 && HWY_COMPILER_GCC_ACTUAL && \
4537
+ HWY_COMPILER_GCC_ACTUAL < 1000
4538
+ // Workaround for compiler error with GCC 9 or earlier on Z14
4539
+ const Vec128<float> f64_to_f32{__builtin_s390_vflrd(v.raw, 0, 0)};
4540
+ #elif HWY_S390X_HAVE_Z14 || HWY_IS_LITTLE_ENDIAN
4397
4541
  const Vec128<float> f64_to_f32{vec_floate(v.raw)};
4398
4542
  #else
4399
4543
  const Vec128<float> f64_to_f32{vec_floato(v.raw)};
@@ -4578,8 +4722,16 @@ template <class D, typename FromT, HWY_IF_F32_D(D), HWY_IF_UI32(FromT),
4578
4722
  HWY_API VFromD<D> ConvertTo(D df32, Vec128<FromT> v) {
4579
4723
  const RepartitionToWide<decltype(df32)> df64;
4580
4724
 
4725
+ #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1000
4726
+ // Workaround for compiler error with GCC 9 or earlier on Z14
4727
+ const VFromD<D> vf32_lo{
4728
+ __builtin_s390_vflrd(PromoteLowerTo(df64, v).raw, 0, 0)};
4729
+ const VFromD<D> vf32_hi{
4730
+ __builtin_s390_vflrd(PromoteUpperTo(df64, v).raw, 0, 0)};
4731
+ #else
4581
4732
  const VFromD<D> vf32_lo{vec_floate(PromoteLowerTo(df64, v).raw)};
4582
4733
  const VFromD<D> vf32_hi{vec_floate(PromoteUpperTo(df64, v).raw)};
4734
+ #endif
4583
4735
  return ConcatEven(df32, vf32_hi, vf32_lo);
4584
4736
  }
4585
4737
  #else // Z15 or PPC
@@ -4670,7 +4822,7 @@ HWY_API VFromD<D> ConvertTo(D /* tag */,
4670
4822
  template <class D, HWY_IF_I64_D(D)>
4671
4823
  HWY_API VFromD<D> ConvertTo(D /* tag */,
4672
4824
  Vec128<double, Rebind<double, D>().MaxLanes()> v) {
4673
- #if defined(__OPTIMIZE__)
4825
+ #if defined(__OPTIMIZE__) && (!HWY_COMPILER_CLANG || !HWY_S390X_HAVE_Z14)
4674
4826
  if (detail::IsConstantRawAltivecVect(v.raw)) {
4675
4827
  constexpr int64_t kMinI64 = LimitsMin<int64_t>();
4676
4828
  constexpr int64_t kMaxI64 = LimitsMax<int64_t>();
@@ -4769,7 +4921,7 @@ HWY_API VFromD<D> ConvertTo(D /* tag */,
4769
4921
  HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
4770
4922
  #endif
4771
4923
 
4772
- #if defined(__OPTIMIZE__)
4924
+ #if defined(__OPTIMIZE__) && (!HWY_COMPILER_CLANG || !HWY_S390X_HAVE_Z14)
4773
4925
  if (detail::IsConstantRawAltivecVect(v.raw)) {
4774
4926
  constexpr uint64_t kMaxU64 = LimitsMax<uint64_t>();
4775
4927
  return Dup128VecFromValues(
@@ -4815,13 +4967,19 @@ HWY_API Vec128<double, N> Round(Vec128<double, N> v) {
4815
4967
  #endif
4816
4968
  }
4817
4969
 
4818
- template <size_t N>
4819
- HWY_API Vec128<int32_t, N> NearestInt(Vec128<float, N> v) {
4970
+ template <typename T, size_t N, HWY_IF_FLOAT3264(T)>
4971
+ HWY_API Vec128<MakeSigned<T>, N> NearestInt(Vec128<T, N> v) {
4820
4972
  const DFromV<decltype(v)> d;
4821
4973
  const RebindToSigned<decltype(d)> di;
4822
4974
  return ConvertTo(di, Round(v));
4823
4975
  }
4824
4976
 
4977
+ template <class DI32, HWY_IF_I32_D(DI32)>
4978
+ HWY_API VFromD<DI32> DemoteToNearestInt(DI32 di32,
4979
+ VFromD<Rebind<double, DI32>> v) {
4980
+ return DemoteTo(di32, Round(v));
4981
+ }
4982
+
4825
4983
  // Toward zero, aka truncate
4826
4984
  template <typename T, size_t N, HWY_IF_FLOAT(T)>
4827
4985
  HWY_API Vec128<T, N> Trunc(Vec128<T, N> v) {
@@ -5195,6 +5353,13 @@ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
5195
5353
 
5196
5354
  namespace detail {
5197
5355
 
5356
+ // Returns the lowest d.MaxLanes() bits of mask_bits (all bits for 16-byte d).
5357
+ template <class D>
5358
+ constexpr uint64_t OnlyActive(D d, uint64_t mask_bits) {
5359
+ return (d.MaxBytes() == 16) ? mask_bits
5360
+ : mask_bits & ((1ull << d.MaxLanes()) - 1);
5361
+ }
5362
+
5198
5363
  #if !HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN
5199
5364
  // fallback for missing vec_extractm
5200
5365
  template <size_t N>
@@ -5215,31 +5380,33 @@ HWY_INLINE uint64_t ExtractSignBits(Vec128<uint8_t, N> sign_bits,
5215
5380
 
5216
5381
  #endif // !HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN
5217
5382
 
5218
- template <typename T, size_t N>
5219
- HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128<T, N> mask) {
5220
- const DFromM<decltype(mask)> d;
5383
+ } // namespace detail
5384
+
5385
+ template <class D, HWY_IF_T_SIZE_D(D, 1)>
5386
+ HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
5221
5387
  const Repartition<uint8_t, decltype(d)> du8;
5222
5388
  const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
5223
5389
 
5224
5390
  #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
5225
- return static_cast<uint64_t>(vec_extractm(sign_bits.raw));
5391
+ return detail::OnlyActive(d,
5392
+ static_cast<uint64_t>(vec_extractm(sign_bits.raw)));
5226
5393
  #else // Z14, Z15, PPC8, PPC9, or big-endian PPC10
5227
5394
  const __vector unsigned char kBitShuffle = {120, 112, 104, 96, 88, 80, 72, 64,
5228
5395
  56, 48, 40, 32, 24, 16, 8, 0};
5229
- return ExtractSignBits(sign_bits, kBitShuffle);
5396
+ return detail::OnlyActive(d, detail::ExtractSignBits(sign_bits, kBitShuffle));
5230
5397
  #endif // HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
5231
5398
  }
5232
5399
 
5233
- template <typename T, size_t N>
5234
- HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128<T, N> mask) {
5235
- const DFromM<decltype(mask)> d;
5400
+ template <class D, HWY_IF_T_SIZE_D(D, 2)>
5401
+ HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
5236
5402
  const RebindToUnsigned<decltype(d)> du;
5237
5403
 
5238
5404
  const Repartition<uint8_t, decltype(d)> du8;
5239
5405
  const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
5240
5406
 
5241
5407
  #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
5242
- return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw));
5408
+ return detail::OnlyActive(
5409
+ d, static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw)));
5243
5410
  #else // Z14, Z15, PPC8, PPC9, or big-endian PPC10
5244
5411
  (void)du;
5245
5412
  #if HWY_IS_LITTLE_ENDIAN
@@ -5249,20 +5416,20 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128<T, N> mask) {
5249
5416
  const __vector unsigned char kBitShuffle = {
5250
5417
  128, 128, 128, 128, 128, 128, 128, 128, 112, 96, 80, 64, 48, 32, 16, 0};
5251
5418
  #endif
5252
- return ExtractSignBits(sign_bits, kBitShuffle);
5419
+ return detail::OnlyActive(d, detail::ExtractSignBits(sign_bits, kBitShuffle));
5253
5420
  #endif // HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
5254
5421
  }
5255
5422
 
5256
- template <typename T, size_t N>
5257
- HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128<T, N> mask) {
5258
- const DFromM<decltype(mask)> d;
5423
+ template <class D, HWY_IF_T_SIZE_D(D, 4)>
5424
+ HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
5259
5425
  const RebindToUnsigned<decltype(d)> du;
5260
5426
 
5261
5427
  const Repartition<uint8_t, decltype(d)> du8;
5262
5428
  const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
5263
5429
 
5264
5430
  #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
5265
- return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw));
5431
+ return detail::OnlyActive(
5432
+ d, static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw)));
5266
5433
  #else // Z14, Z15, PPC8, PPC9, or big-endian PPC10
5267
5434
  (void)du;
5268
5435
  #if HWY_IS_LITTLE_ENDIAN
@@ -5274,20 +5441,20 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128<T, N> mask) {
5274
5441
  128, 128, 128, 128, 128, 128,
5275
5442
  96, 64, 32, 0};
5276
5443
  #endif
5277
- return ExtractSignBits(sign_bits, kBitShuffle);
5444
+ return detail::OnlyActive(d, detail::ExtractSignBits(sign_bits, kBitShuffle));
5278
5445
  #endif // HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
5279
5446
  }
5280
5447
 
5281
- template <typename T, size_t N>
5282
- HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128<T, N> mask) {
5283
- const DFromM<decltype(mask)> d;
5448
+ template <class D, HWY_IF_T_SIZE_D(D, 8)>
5449
+ HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
5284
5450
  const RebindToUnsigned<decltype(d)> du;
5285
5451
 
5286
5452
  const Repartition<uint8_t, decltype(d)> du8;
5287
5453
  const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
5288
5454
 
5289
5455
  #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
5290
- return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw));
5456
+ return detail::OnlyActive(
5457
+ d, static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw)));
5291
5458
  #else // Z14, Z15, PPC8, PPC9, or big-endian PPC10
5292
5459
  (void)du;
5293
5460
  #if HWY_IS_LITTLE_ENDIAN
@@ -5299,35 +5466,22 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128<T, N> mask) {
5299
5466
  128, 128, 128, 128, 128, 128,
5300
5467
  128, 128, 64, 0};
5301
5468
  #endif
5302
- return ExtractSignBits(sign_bits, kBitShuffle);
5469
+ return detail::OnlyActive(d, detail::ExtractSignBits(sign_bits, kBitShuffle));
5303
5470
  #endif // HWY_PPC_HAVE_10
5304
5471
  }
5305
5472
 
5306
- // Returns the lowest N of the mask bits.
5307
- template <typename T, size_t N>
5308
- constexpr uint64_t OnlyActive(uint64_t mask_bits) {
5309
- return ((N * sizeof(T)) == 16) ? mask_bits : mask_bits & ((1ull << N) - 1);
5310
- }
5311
-
5312
- template <typename T, size_t N>
5313
- HWY_INLINE uint64_t BitsFromMask(Mask128<T, N> mask) {
5314
- return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
5315
- }
5316
-
5317
- } // namespace detail
5318
-
5319
5473
  // `p` points to at least 8 writable bytes.
5320
5474
  template <class D, HWY_IF_LANES_LE_D(D, 8)>
5321
- HWY_API size_t StoreMaskBits(D /*d*/, MFromD<D> mask, uint8_t* bits) {
5475
+ HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
5322
5476
  // For vectors with 8 or fewer lanes, simply cast the result of BitsFromMask
5323
5477
  // to an uint8_t and store the result in bits[0].
5324
- bits[0] = static_cast<uint8_t>(detail::BitsFromMask(mask));
5478
+ bits[0] = static_cast<uint8_t>(BitsFromMask(d, mask));
5325
5479
  return sizeof(uint8_t);
5326
5480
  }
5327
5481
 
5328
5482
  template <class D, HWY_IF_LANES_D(D, 16)>
5329
- HWY_API size_t StoreMaskBits(D /*d*/, MFromD<D> mask, uint8_t* bits) {
5330
- const auto mask_bits = detail::BitsFromMask(mask);
5483
+ HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
5484
+ const auto mask_bits = BitsFromMask(d, mask);
5331
5485
 
5332
5486
  // First convert mask_bits to a uint16_t as we only want to store
5333
5487
  // the lower 16 bits of mask_bits as there are 16 lanes in mask.
@@ -5392,8 +5546,8 @@ HWY_API bool AllTrue(D d, MFromD<D> mask) {
5392
5546
  }
5393
5547
 
5394
5548
  template <class D>
5395
- HWY_API size_t CountTrue(D /* tag */, MFromD<D> mask) {
5396
- return PopCount(detail::BitsFromMask(mask));
5549
+ HWY_API size_t CountTrue(D d, MFromD<D> mask) {
5550
+ return PopCount(BitsFromMask(d, mask));
5397
5551
  }
5398
5552
 
5399
5553
  #if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
@@ -5440,8 +5594,7 @@ HWY_API size_t FindKnownFirstTrue(D d, MFromD<D> mask) {
5440
5594
  return detail::VsxCntlzLsbb(bytes) / sizeof(T);
5441
5595
  }
5442
5596
  #endif // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
5443
- (void)d;
5444
- return Num0BitsBelowLS1Bit_Nonzero64(detail::BitsFromMask(mask));
5597
+ return Num0BitsBelowLS1Bit_Nonzero64(BitsFromMask(d, mask));
5445
5598
  }
5446
5599
 
5447
5600
  template <class D, typename T = TFromD<D>>
@@ -5456,8 +5609,7 @@ HWY_API intptr_t FindFirstTrue(D d, MFromD<D> mask) {
5456
5609
  return idx == kN ? -1 : static_cast<intptr_t>(idx);
5457
5610
  }
5458
5611
  #endif // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
5459
- (void)d;
5460
- const uint64_t mask_bits = detail::BitsFromMask(mask);
5612
+ const uint64_t mask_bits = BitsFromMask(d, mask);
5461
5613
  return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask_bits)) : -1;
5462
5614
  }
5463
5615
 
@@ -5472,8 +5624,7 @@ HWY_API size_t FindKnownLastTrue(D d, MFromD<D> mask) {
5472
5624
  return 16 / sizeof(T) - 1 - idx;
5473
5625
  }
5474
5626
  #endif // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
5475
- (void)d;
5476
- return 63 - Num0BitsAboveMS1Bit_Nonzero64(detail::BitsFromMask(mask));
5627
+ return 63 - Num0BitsAboveMS1Bit_Nonzero64(BitsFromMask(d, mask));
5477
5628
  }
5478
5629
 
5479
5630
  template <class D, typename T = TFromD<D>>
@@ -5488,8 +5639,7 @@ HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) {
5488
5639
  return idx == kN ? -1 : static_cast<intptr_t>(kN - 1 - idx);
5489
5640
  }
5490
5641
  #endif // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
5491
- (void)d;
5492
- const uint64_t mask_bits = detail::BitsFromMask(mask);
5642
+ const uint64_t mask_bits = BitsFromMask(d, mask);
5493
5643
  return mask_bits ? intptr_t(63 - Num0BitsAboveMS1Bit_Nonzero64(mask_bits))
5494
5644
  : -1;
5495
5645
  }
@@ -5985,7 +6135,8 @@ HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
5985
6135
  // General case, 2 or 4 bytes
5986
6136
  template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
5987
6137
  HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
5988
- return detail::CompressBits(v, detail::BitsFromMask(mask));
6138
+ const DFromV<decltype(v)> d;
6139
+ return detail::CompressBits(v, BitsFromMask(d, mask));
5989
6140
  }
5990
6141
 
5991
6142
  // ------------------------------ CompressNot
@@ -6021,12 +6172,13 @@ HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
6021
6172
  // General case, 2 or 4 bytes
6022
6173
  template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
6023
6174
  HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
6175
+ const DFromV<decltype(v)> d;
6024
6176
  // For partial vectors, we cannot pull the Not() into the table because
6025
6177
  // BitsFromMask clears the upper bits.
6026
6178
  if (N < 16 / sizeof(T)) {
6027
- return detail::CompressBits(v, detail::BitsFromMask(Not(mask)));
6179
+ return detail::CompressBits(v, BitsFromMask(d, Not(mask)));
6028
6180
  }
6029
- return detail::CompressNotBits(v, detail::BitsFromMask(mask));
6181
+ return detail::CompressNotBits(v, BitsFromMask(d, mask));
6030
6182
  }
6031
6183
 
6032
6184
  // ------------------------------ CompressBlocksNot
@@ -6076,7 +6228,7 @@ HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> m, D d,
6076
6228
  TFromD<D>* HWY_RESTRICT unaligned) {
6077
6229
  const RebindToUnsigned<decltype(d)> du;
6078
6230
 
6079
- const uint64_t mask_bits = detail::BitsFromMask(m);
6231
+ const uint64_t mask_bits = BitsFromMask(d, m);
6080
6232
  HWY_DASSERT(mask_bits < (1ull << MaxLanes(d)));
6081
6233
  const size_t count = PopCount(mask_bits);
6082
6234
 
@@ -6103,7 +6255,7 @@ HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
6103
6255
  TFromD<D>* HWY_RESTRICT unaligned) {
6104
6256
  const RebindToUnsigned<decltype(d)> du;
6105
6257
 
6106
- const uint64_t mask_bits = detail::BitsFromMask(m);
6258
+ const uint64_t mask_bits = BitsFromMask(d, m);
6107
6259
  HWY_DASSERT(mask_bits < (1ull << MaxLanes(d)));
6108
6260
  const size_t count = PopCount(mask_bits);
6109
6261
 
@@ -6228,9 +6380,16 @@ HWY_INLINE V Per128BitBlkRevLanesOnBe(V v) {
6228
6380
  template <class V>
6229
6381
  HWY_INLINE V I128Subtract(V a, V b) {
6230
6382
  #if HWY_S390X_HAVE_Z14
6383
+ #if HWY_COMPILER_CLANG
6384
+ // Workaround for bug in vec_sub_u128 in Clang vecintrin.h
6385
+ typedef __uint128_t VU128 __attribute__((__vector_size__(16)));
6386
+ const V diff_i128{reinterpret_cast<typename detail::Raw128<TFromV<V>>::type>(
6387
+ reinterpret_cast<VU128>(a.raw) - reinterpret_cast<VU128>(b.raw))};
6388
+ #else // !HWY_COMPILER_CLANG
6231
6389
  const V diff_i128{reinterpret_cast<typename detail::Raw128<TFromV<V>>::type>(
6232
6390
  vec_sub_u128(reinterpret_cast<__vector unsigned char>(a.raw),
6233
6391
  reinterpret_cast<__vector unsigned char>(b.raw)))};
6392
+ #endif // HWY_COMPILER_CLANG
6234
6393
  #elif defined(__SIZEOF_INT128__)
6235
6394
  using VU128 = __vector unsigned __int128;
6236
6395
  const V diff_i128{reinterpret_cast<typename detail::Raw128<TFromV<V>>::type>(
@@ -6725,6 +6884,26 @@ HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
6725
6884
  #if HWY_S390X_HAVE_Z14
6726
6885
  namespace detail {
6727
6886
 
6887
+ #if HWY_COMPILER_CLANG && HWY_HAS_BUILTIN(__builtin_s390_vsumqf) && \
6888
+ HWY_HAS_BUILTIN(__builtin_s390_vsumqg)
6889
+ // Workaround for bug in vec_sum_u128 in Clang vecintrin.h
6890
+ template <class T, HWY_IF_UI32(T)>
6891
+ HWY_INLINE Vec128<T> SumOfU32OrU64LanesAsU128(Vec128<T> v) {
6892
+ typedef __uint128_t VU128 __attribute__((__vector_size__(16)));
6893
+ const DFromV<decltype(v)> d;
6894
+ const RebindToUnsigned<decltype(d)> du;
6895
+ const VU128 sum = {__builtin_s390_vsumqf(BitCast(du, v).raw, Zero(du).raw)};
6896
+ return Vec128<T>{reinterpret_cast<typename detail::Raw128<T>::type>(sum)};
6897
+ }
6898
+ template <class T, HWY_IF_UI64(T)>
6899
+ HWY_INLINE Vec128<T> SumOfU32OrU64LanesAsU128(Vec128<T> v) {
6900
+ typedef __uint128_t VU128 __attribute__((__vector_size__(16)));
6901
+ const DFromV<decltype(v)> d;
6902
+ const RebindToUnsigned<decltype(d)> du;
6903
+ const VU128 sum = {__builtin_s390_vsumqg(BitCast(du, v).raw, Zero(du).raw)};
6904
+ return Vec128<T>{reinterpret_cast<typename detail::Raw128<T>::type>(sum)};
6905
+ }
6906
+ #else
6728
6907
  template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
6729
6908
  HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))>
6730
6909
  HWY_INLINE Vec128<T> SumOfU32OrU64LanesAsU128(Vec128<T> v) {
@@ -6733,6 +6912,7 @@ HWY_INLINE Vec128<T> SumOfU32OrU64LanesAsU128(Vec128<T> v) {
6733
6912
  return BitCast(
6734
6913
  d, Vec128<uint8_t>{vec_sum_u128(BitCast(du, v).raw, Zero(du).raw)});
6735
6914
  }
6915
+ #endif
6736
6916
 
6737
6917
  } // namespace detail
6738
6918