@img/sharp-libvips-dev 1.2.1 → 1.2.2-rc.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/include/ffi.h +3 -3
- package/include/harfbuzz/hb-version.h +3 -3
- package/include/hwy/abort.h +2 -19
- package/include/hwy/aligned_allocator.h +11 -7
- package/include/hwy/auto_tune.h +504 -0
- package/include/hwy/base.h +425 -104
- package/include/hwy/cache_control.h +16 -0
- package/include/hwy/detect_compiler_arch.h +32 -1
- package/include/hwy/detect_targets.h +251 -67
- package/include/hwy/foreach_target.h +35 -0
- package/include/hwy/highway.h +185 -76
- package/include/hwy/nanobenchmark.h +1 -19
- package/include/hwy/ops/arm_neon-inl.h +969 -458
- package/include/hwy/ops/arm_sve-inl.h +1137 -359
- package/include/hwy/ops/emu128-inl.h +97 -11
- package/include/hwy/ops/generic_ops-inl.h +1222 -34
- package/include/hwy/ops/loongarch_lasx-inl.h +4664 -0
- package/include/hwy/ops/loongarch_lsx-inl.h +5933 -0
- package/include/hwy/ops/ppc_vsx-inl.h +306 -126
- package/include/hwy/ops/rvv-inl.h +546 -51
- package/include/hwy/ops/scalar-inl.h +77 -22
- package/include/hwy/ops/set_macros-inl.h +138 -17
- package/include/hwy/ops/shared-inl.h +50 -10
- package/include/hwy/ops/wasm_128-inl.h +137 -92
- package/include/hwy/ops/x86_128-inl.h +773 -214
- package/include/hwy/ops/x86_256-inl.h +712 -255
- package/include/hwy/ops/x86_512-inl.h +429 -753
- package/include/hwy/ops/x86_avx3-inl.h +501 -0
- package/include/hwy/per_target.h +2 -1
- package/include/hwy/profiler.h +622 -486
- package/include/hwy/targets.h +62 -20
- package/include/hwy/timer-inl.h +8 -160
- package/include/hwy/timer.h +170 -3
- package/include/hwy/x86_cpuid.h +81 -0
- package/include/libheif/heif_cxx.h +25 -5
- package/include/libheif/heif_regions.h +5 -5
- package/include/libheif/heif_version.h +2 -2
- package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
- package/include/pango-1.0/pango/pango-enum-types.h +3 -0
- package/include/pango-1.0/pango/pango-features.h +3 -3
- package/include/pango-1.0/pango/pango-font.h +30 -0
- package/include/pango-1.0/pango/pango-version-macros.h +26 -0
- package/include/zlib.h +3 -3
- package/package.json +1 -1
- package/versions.json +8 -8
|
@@ -878,10 +878,47 @@ HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) {
|
|
|
878
878
|
}
|
|
879
879
|
|
|
880
880
|
// ------------------------------ Reverse
|
|
881
|
+
#if HWY_S390X_HAVE_Z14 && HWY_COMPILER_GCC_ACTUAL && \
|
|
882
|
+
HWY_COMPILER_GCC_ACTUAL < 900
|
|
883
|
+
// Workaround for missing vec_reve on Z14 with GCC 8 or earlier
|
|
884
|
+
template <class D, typename T = TFromD<D>, HWY_IF_LANES_GT_D(D, 1),
|
|
885
|
+
HWY_IF_T_SIZE_D(D, 1)>
|
|
886
|
+
HWY_API Vec128<T> Reverse(D d, Vec128<T> v) {
|
|
887
|
+
const Repartition<uint8_t, decltype(d)> du8;
|
|
888
|
+
return TableLookupBytes(
|
|
889
|
+
v, BitCast(d, Dup128VecFromValues(du8, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
|
|
890
|
+
5, 4, 3, 2, 1, 0)));
|
|
891
|
+
}
|
|
892
|
+
|
|
893
|
+
template <class D, typename T = TFromD<D>, HWY_IF_LANES_GT_D(D, 1),
|
|
894
|
+
HWY_IF_T_SIZE_D(D, 2)>
|
|
895
|
+
HWY_API Vec128<T> Reverse(D d, Vec128<T> v) {
|
|
896
|
+
const Repartition<uint8_t, decltype(d)> du8;
|
|
897
|
+
return TableLookupBytes(
|
|
898
|
+
v, BitCast(d, Dup128VecFromValues(du8, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7,
|
|
899
|
+
4, 5, 2, 3, 0, 1)));
|
|
900
|
+
}
|
|
901
|
+
|
|
902
|
+
template <class D, typename T = TFromD<D>, HWY_IF_LANES_GT_D(D, 1),
|
|
903
|
+
HWY_IF_T_SIZE_D(D, 4)>
|
|
904
|
+
HWY_API Vec128<T> Reverse(D d, Vec128<T> v) {
|
|
905
|
+
const Repartition<uint8_t, decltype(d)> du8;
|
|
906
|
+
return TableLookupBytes(
|
|
907
|
+
v, BitCast(d, Dup128VecFromValues(du8, 12, 13, 14, 15, 8, 9, 10, 11, 4, 5,
|
|
908
|
+
6, 7, 0, 1, 2, 3)));
|
|
909
|
+
}
|
|
910
|
+
|
|
911
|
+
template <class D, typename T = TFromD<D>, HWY_IF_LANES_GT_D(D, 1),
|
|
912
|
+
HWY_IF_T_SIZE_D(D, 8)>
|
|
913
|
+
HWY_API Vec128<T> Reverse(D /* tag */, Vec128<T> v) {
|
|
914
|
+
return Vec128<T>{vec_sld(v.raw, v.raw, 8)};
|
|
915
|
+
}
|
|
916
|
+
#else
|
|
881
917
|
template <class D, typename T = TFromD<D>, HWY_IF_LANES_GT_D(D, 1)>
|
|
882
918
|
HWY_API Vec128<T> Reverse(D /* tag */, Vec128<T> v) {
|
|
883
919
|
return Vec128<T>{vec_reve(v.raw)};
|
|
884
920
|
}
|
|
921
|
+
#endif
|
|
885
922
|
|
|
886
923
|
// ------------------------------ Shuffles (Reverse)
|
|
887
924
|
|
|
@@ -1554,12 +1591,33 @@ HWY_API V SaturatedSub(V a, V b) {
|
|
|
1554
1591
|
|
|
1555
1592
|
// Returns (a + b + 1) / 2
|
|
1556
1593
|
|
|
1557
|
-
|
|
1558
|
-
|
|
1594
|
+
#ifdef HWY_NATIVE_AVERAGE_ROUND_UI32
|
|
1595
|
+
#undef HWY_NATIVE_AVERAGE_ROUND_UI32
|
|
1596
|
+
#else
|
|
1597
|
+
#define HWY_NATIVE_AVERAGE_ROUND_UI32
|
|
1598
|
+
#endif
|
|
1599
|
+
|
|
1600
|
+
#if HWY_S390X_HAVE_Z14
|
|
1601
|
+
#ifdef HWY_NATIVE_AVERAGE_ROUND_UI64
|
|
1602
|
+
#undef HWY_NATIVE_AVERAGE_ROUND_UI64
|
|
1603
|
+
#else
|
|
1604
|
+
#define HWY_NATIVE_AVERAGE_ROUND_UI64
|
|
1605
|
+
#endif
|
|
1606
|
+
|
|
1607
|
+
#define HWY_PPC_IF_AVERAGE_ROUND_T(T) void* = nullptr
|
|
1608
|
+
#else // !HWY_S390X_HAVE_Z14
|
|
1609
|
+
#define HWY_PPC_IF_AVERAGE_ROUND_T(T) \
|
|
1610
|
+
HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))
|
|
1611
|
+
#endif // HWY_S390X_HAVE_Z14
|
|
1612
|
+
|
|
1613
|
+
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
|
|
1614
|
+
HWY_PPC_IF_AVERAGE_ROUND_T(T)>
|
|
1559
1615
|
HWY_API Vec128<T, N> AverageRound(Vec128<T, N> a, Vec128<T, N> b) {
|
|
1560
1616
|
return Vec128<T, N>{vec_avg(a.raw, b.raw)};
|
|
1561
1617
|
}
|
|
1562
1618
|
|
|
1619
|
+
#undef HWY_PPC_IF_AVERAGE_ROUND_T
|
|
1620
|
+
|
|
1563
1621
|
// ------------------------------ Multiplication
|
|
1564
1622
|
|
|
1565
1623
|
// Per-target flags to prevent generic_ops-inl.h defining 8/64-bit operator*.
|
|
@@ -1918,6 +1976,23 @@ HWY_API Vec128<T, N> Sqrt(Vec128<T, N> v) {
|
|
|
1918
1976
|
return Vec128<T, N>{vec_sqrt(v.raw)};
|
|
1919
1977
|
}
|
|
1920
1978
|
|
|
1979
|
+
// ------------------------------ GetBiasedExponent
|
|
1980
|
+
|
|
1981
|
+
#if HWY_PPC_HAVE_9
|
|
1982
|
+
|
|
1983
|
+
#ifdef HWY_NATIVE_GET_BIASED_EXPONENT
|
|
1984
|
+
#undef HWY_NATIVE_GET_BIASED_EXPONENT
|
|
1985
|
+
#else
|
|
1986
|
+
#define HWY_NATIVE_GET_BIASED_EXPONENT
|
|
1987
|
+
#endif
|
|
1988
|
+
|
|
1989
|
+
template <class V, HWY_IF_FLOAT3264_V(V)>
|
|
1990
|
+
HWY_API VFromD<RebindToUnsigned<DFromV<V>>> GetBiasedExponent(V v) {
|
|
1991
|
+
return VFromD<RebindToUnsigned<DFromV<V>>>{vec_extract_exp(v.raw)};
|
|
1992
|
+
}
|
|
1993
|
+
|
|
1994
|
+
#endif // HWY_PPC_HAVE_9
|
|
1995
|
+
|
|
1921
1996
|
// ------------------------------ Min (Gt, IfThenElse)
|
|
1922
1997
|
|
|
1923
1998
|
template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
|
|
@@ -2522,8 +2597,10 @@ HWY_API Vec32<T> Reverse(D d, Vec32<T> v) {
|
|
|
2522
2597
|
|
|
2523
2598
|
// ------------------------------- ReverseLaneBytes
|
|
2524
2599
|
|
|
2525
|
-
#if (HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14) &&
|
|
2526
|
-
(HWY_COMPILER_GCC_ACTUAL >= 710 ||
|
|
2600
|
+
#if (HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14) && \
|
|
2601
|
+
((!HWY_S390X_HAVE_Z14 && HWY_COMPILER_GCC_ACTUAL >= 710) || \
|
|
2602
|
+
(HWY_S390X_HAVE_Z14 && HWY_COMPILER_GCC_ACTUAL >= 900) || \
|
|
2603
|
+
HWY_COMPILER_CLANG >= 400)
|
|
2527
2604
|
|
|
2528
2605
|
// Per-target flag to prevent generic_ops-inl.h defining 8-bit ReverseLaneBytes.
|
|
2529
2606
|
#ifdef HWY_NATIVE_REVERSE_LANE_BYTES
|
|
@@ -3279,12 +3356,22 @@ HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
|
|
|
3279
3356
|
}
|
|
3280
3357
|
|
|
3281
3358
|
// ------------------------------ SwapAdjacentBlocks
|
|
3282
|
-
|
|
3283
3359
|
template <typename T, size_t N>
|
|
3284
3360
|
HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
|
|
3285
3361
|
return v;
|
|
3286
3362
|
}
|
|
3287
3363
|
|
|
3364
|
+
// ------------------------------ InterleaveEvenBlocks
|
|
3365
|
+
template <class D, class V = VFromD<D>>
|
|
3366
|
+
HWY_API V InterleaveEvenBlocks(D, V a, V /*b*/) {
|
|
3367
|
+
return a;
|
|
3368
|
+
}
|
|
3369
|
+
// ------------------------------ InterleaveOddBlocks
|
|
3370
|
+
template <class D, class V = VFromD<D>>
|
|
3371
|
+
HWY_API V InterleaveOddBlocks(D, V a, V /*b*/) {
|
|
3372
|
+
return a;
|
|
3373
|
+
}
|
|
3374
|
+
|
|
3288
3375
|
// ------------------------------ MulFixedPoint15 (OddEven)
|
|
3289
3376
|
|
|
3290
3377
|
#if HWY_S390X_HAVE_Z14
|
|
@@ -3630,6 +3717,10 @@ HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
|
|
|
3630
3717
|
const __vector float raw_v = InterleaveLower(v, v).raw;
|
|
3631
3718
|
#if HWY_IS_LITTLE_ENDIAN
|
|
3632
3719
|
return VFromD<D>{vec_doubleo(raw_v)};
|
|
3720
|
+
#elif HWY_S390X_HAVE_Z14 && HWY_COMPILER_GCC_ACTUAL && \
|
|
3721
|
+
HWY_COMPILER_GCC_ACTUAL < 1000
|
|
3722
|
+
// Workaround for compiler errors with GCC 9 or earlier on Z14
|
|
3723
|
+
return VFromD<D>{__builtin_s390_vflls(raw_v)};
|
|
3633
3724
|
#else
|
|
3634
3725
|
return VFromD<D>{vec_doublee(raw_v)};
|
|
3635
3726
|
#endif
|
|
@@ -3680,16 +3771,73 @@ static HWY_INLINE V VsxF2INormalizeSrcVals(V v) {
|
|
|
3680
3771
|
#endif
|
|
3681
3772
|
}
|
|
3682
3773
|
|
|
3774
|
+
template <class VF32>
|
|
3775
|
+
static HWY_INLINE HWY_MAYBE_UNUSED VFromD<Repartition<int64_t, DFromV<VF32>>>
|
|
3776
|
+
VsxXvcvspsxds(VF32 vf32) {
|
|
3777
|
+
using VI64 = VFromD<Repartition<int64_t, DFromV<VF32>>>;
|
|
3778
|
+
#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1500) || \
|
|
3779
|
+
HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds)
|
|
3780
|
+
// Use __builtin_vsx_xvcvspsxds if it is available (which is the case with
|
|
3781
|
+
// GCC 4.8 through GCC 14 or Clang 13 or later on PPC8/PPC9/PPC10)
|
|
3782
|
+
return VI64{__builtin_vsx_xvcvspsxds(vf32.raw)};
|
|
3783
|
+
#elif HWY_COMPILER_GCC_ACTUAL >= 1500 && HWY_IS_LITTLE_ENDIAN
|
|
3784
|
+
// On little-endian PPC8/PPC9/PPC10 with GCC 15 or later, use the F32->I64
|
|
3785
|
+
// vec_signedo intrinsic as the __builtin_vsx_xvcvspsxds intrinsic has been
|
|
3786
|
+
// removed from GCC in GCC 15
|
|
3787
|
+
return VI64{vec_signedo(vf32.raw)};
|
|
3788
|
+
#elif HWY_COMPILER_GCC_ACTUAL >= 1500 && HWY_IS_BIG_ENDIAN
|
|
3789
|
+
// On big-endian PPC8/PPC9/PPC10 with GCC 15 or later, use the F32->I64
|
|
3790
|
+
// vec_signede intrinsic as the __builtin_vsx_xvcvspsxds intrinsic has been
|
|
3791
|
+
// removed from GCC in GCC 15
|
|
3792
|
+
return VI64{vec_signede(vf32.raw)};
|
|
3793
|
+
#else
|
|
3794
|
+
// Inline assembly fallback for older versions of Clang that do not have the
|
|
3795
|
+
// __builtin_vsx_xvcvspsxds intrinsic
|
|
3796
|
+
__vector signed long long raw_result;
|
|
3797
|
+
__asm__("xvcvspsxds %x0, %x1" : "=wa"(raw_result) : "wa"(vf32.raw) :);
|
|
3798
|
+
return VI64{raw_result};
|
|
3799
|
+
#endif
|
|
3800
|
+
}
|
|
3801
|
+
|
|
3802
|
+
template <class VF32>
|
|
3803
|
+
static HWY_INLINE HWY_MAYBE_UNUSED VFromD<Repartition<uint64_t, DFromV<VF32>>>
|
|
3804
|
+
VsxXvcvspuxds(VF32 vf32) {
|
|
3805
|
+
using VU64 = VFromD<Repartition<uint64_t, DFromV<VF32>>>;
|
|
3806
|
+
#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1500) || \
|
|
3807
|
+
HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds)
|
|
3808
|
+
// Use __builtin_vsx_xvcvspuxds if it is available (which is the case with
|
|
3809
|
+
// GCC 4.8 through GCC 14 or Clang 13 or later on PPC8/PPC9/PPC10)
|
|
3810
|
+
return VU64{reinterpret_cast<__vector unsigned long long>(
|
|
3811
|
+
__builtin_vsx_xvcvspuxds(vf32.raw))};
|
|
3812
|
+
#elif HWY_COMPILER_GCC_ACTUAL >= 1500 && HWY_IS_LITTLE_ENDIAN
|
|
3813
|
+
// On little-endian PPC8/PPC9/PPC10 with GCC 15 or later, use the F32->U64
|
|
3814
|
+
// vec_unsignedo intrinsic as the __builtin_vsx_xvcvspuxds intrinsic has been
|
|
3815
|
+
// removed from GCC in GCC 15
|
|
3816
|
+
return VU64{vec_unsignedo(vf32.raw)};
|
|
3817
|
+
#elif HWY_COMPILER_GCC_ACTUAL >= 1500 && HWY_IS_BIG_ENDIAN
|
|
3818
|
+
// On big-endian PPC8/PPC9/PPC10 with GCC 15 or later, use the F32->U64
|
|
3819
|
+
// vec_unsignede intrinsic as the __builtin_vsx_xvcvspuxds intrinsic has been
|
|
3820
|
+
// removed from GCC in GCC 15
|
|
3821
|
+
return VU64{vec_unsignede(vf32.raw)};
|
|
3822
|
+
#else
|
|
3823
|
+
// Inline assembly fallback for older versions of Clang that do not have the
|
|
3824
|
+
// __builtin_vsx_xvcvspuxds intrinsic
|
|
3825
|
+
__vector unsigned long long raw_result;
|
|
3826
|
+
__asm__("xvcvspuxds %x0, %x1" : "=wa"(raw_result) : "wa"(vf32.raw) :);
|
|
3827
|
+
return VU64{raw_result};
|
|
3828
|
+
#endif
|
|
3829
|
+
}
|
|
3830
|
+
|
|
3683
3831
|
} // namespace detail
|
|
3684
3832
|
#endif // !HWY_S390X_HAVE_Z14
|
|
3685
3833
|
|
|
3686
3834
|
template <class D, HWY_IF_I64_D(D)>
|
|
3687
3835
|
HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
|
|
3688
|
-
#if !HWY_S390X_HAVE_Z14
|
|
3689
|
-
|
|
3690
|
-
const
|
|
3691
|
-
|
|
3692
|
-
|
|
3836
|
+
#if !HWY_S390X_HAVE_Z14
|
|
3837
|
+
const Repartition<float, decltype(di64)> dt_f32;
|
|
3838
|
+
const auto vt_f32 = ResizeBitCast(dt_f32, v);
|
|
3839
|
+
return detail::VsxXvcvspsxds(
|
|
3840
|
+
detail::VsxF2INormalizeSrcVals(InterleaveLower(vt_f32, vt_f32)));
|
|
3693
3841
|
#else
|
|
3694
3842
|
const RebindToFloat<decltype(di64)> df64;
|
|
3695
3843
|
return ConvertTo(di64, PromoteTo(df64, v));
|
|
@@ -3698,12 +3846,11 @@ HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
|
|
|
3698
3846
|
|
|
3699
3847
|
template <class D, HWY_IF_U64_D(D)>
|
|
3700
3848
|
HWY_API VFromD<D> PromoteTo(D du64, VFromD<Rebind<float, D>> v) {
|
|
3701
|
-
#if !HWY_S390X_HAVE_Z14
|
|
3702
|
-
|
|
3703
|
-
const
|
|
3704
|
-
|
|
3705
|
-
|
|
3706
|
-
__builtin_vsx_xvcvspuxds(raw_v))};
|
|
3849
|
+
#if !HWY_S390X_HAVE_Z14
|
|
3850
|
+
const Repartition<float, decltype(du64)> dt_f32;
|
|
3851
|
+
const auto vt_f32 = ResizeBitCast(dt_f32, v);
|
|
3852
|
+
return detail::VsxXvcvspuxds(
|
|
3853
|
+
detail::VsxF2INormalizeSrcVals(InterleaveLower(vt_f32, vt_f32)));
|
|
3707
3854
|
#else
|
|
3708
3855
|
const RebindToFloat<decltype(du64)> df64;
|
|
3709
3856
|
return ConvertTo(du64, PromoteTo(df64, v));
|
|
@@ -3767,6 +3914,10 @@ HWY_API VFromD<D> PromoteUpperTo(D /*tag*/, Vec128<float> v) {
|
|
|
3767
3914
|
const __vector float raw_v = InterleaveUpper(Full128<float>(), v, v).raw;
|
|
3768
3915
|
#if HWY_IS_LITTLE_ENDIAN
|
|
3769
3916
|
return VFromD<D>{vec_doubleo(raw_v)};
|
|
3917
|
+
#elif HWY_S390X_HAVE_Z14 && HWY_COMPILER_GCC_ACTUAL && \
|
|
3918
|
+
HWY_COMPILER_GCC_ACTUAL < 1000
|
|
3919
|
+
// Workaround for compiler error with GCC 9 or earlier on Z14
|
|
3920
|
+
return VFromD<D>{__builtin_s390_vflls(raw_v)};
|
|
3770
3921
|
#else
|
|
3771
3922
|
return VFromD<D>{vec_doublee(raw_v)};
|
|
3772
3923
|
#endif
|
|
@@ -3808,12 +3959,10 @@ HWY_API VFromD<D> PromoteUpperTo(D df64, Vec128<uint32_t> v) {
|
|
|
3808
3959
|
|
|
3809
3960
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)>
|
|
3810
3961
|
HWY_API VFromD<D> PromoteUpperTo(D di64, Vec128<float> v) {
|
|
3811
|
-
#if !HWY_S390X_HAVE_Z14
|
|
3812
|
-
|
|
3813
|
-
|
|
3814
|
-
detail::VsxF2INormalizeSrcVals(InterleaveUpper(Full128<float>(), v, v))
|
|
3815
|
-
.raw;
|
|
3816
|
-
return VFromD<decltype(di64)>{__builtin_vsx_xvcvspsxds(raw_v)};
|
|
3962
|
+
#if !HWY_S390X_HAVE_Z14
|
|
3963
|
+
(void)di64;
|
|
3964
|
+
return detail::VsxXvcvspsxds(
|
|
3965
|
+
detail::VsxF2INormalizeSrcVals(InterleaveUpper(Full128<float>(), v, v)));
|
|
3817
3966
|
#else
|
|
3818
3967
|
const RebindToFloat<decltype(di64)> df64;
|
|
3819
3968
|
return ConvertTo(di64, PromoteUpperTo(df64, v));
|
|
@@ -3822,13 +3971,10 @@ HWY_API VFromD<D> PromoteUpperTo(D di64, Vec128<float> v) {
|
|
|
3822
3971
|
|
|
3823
3972
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U64_D(D)>
|
|
3824
3973
|
HWY_API VFromD<D> PromoteUpperTo(D du64, Vec128<float> v) {
|
|
3825
|
-
#if !HWY_S390X_HAVE_Z14
|
|
3826
|
-
|
|
3827
|
-
|
|
3828
|
-
detail::VsxF2INormalizeSrcVals(InterleaveUpper(Full128<float>(), v, v))
|
|
3829
|
-
.raw;
|
|
3830
|
-
return VFromD<decltype(du64)>{reinterpret_cast<__vector unsigned long long>(
|
|
3831
|
-
__builtin_vsx_xvcvspuxds(raw_v))};
|
|
3974
|
+
#if !HWY_S390X_HAVE_Z14
|
|
3975
|
+
(void)du64;
|
|
3976
|
+
return detail::VsxXvcvspuxds(
|
|
3977
|
+
detail::VsxF2INormalizeSrcVals(InterleaveUpper(Full128<float>(), v, v)));
|
|
3832
3978
|
#else
|
|
3833
3979
|
const RebindToFloat<decltype(du64)> df64;
|
|
3834
3980
|
return ConvertTo(du64, PromoteUpperTo(df64, v));
|
|
@@ -3916,20 +4062,18 @@ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
|
|
|
3916
4062
|
hwy::SizeTag<8> /*to_lane_size_tag*/,
|
|
3917
4063
|
hwy::FloatTag /*from_type_tag*/, D d_to,
|
|
3918
4064
|
V v) {
|
|
3919
|
-
#if !HWY_S390X_HAVE_Z14
|
|
3920
|
-
(HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds))
|
|
4065
|
+
#if !HWY_S390X_HAVE_Z14
|
|
3921
4066
|
(void)d_to;
|
|
3922
4067
|
const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
|
|
3923
4068
|
#if HWY_IS_LITTLE_ENDIAN
|
|
3924
|
-
//
|
|
3925
|
-
//
|
|
4069
|
+
// VsxXvcvspsxds expects the source values to be in the odd lanes on
|
|
4070
|
+
// little-endian PPC, and the Shuffle2103 operation below will shift the even
|
|
3926
4071
|
// lanes of normalized_v into the odd lanes.
|
|
3927
|
-
return
|
|
3928
|
-
__builtin_vsx_xvcvspsxds(vec_sld(normalized_v.raw, normalized_v.raw, 4))};
|
|
4072
|
+
return VsxXvcvspsxds(Shuffle2103(normalized_v));
|
|
3929
4073
|
#else
|
|
3930
|
-
//
|
|
3931
|
-
//
|
|
3932
|
-
return
|
|
4074
|
+
// VsxXvcvspsxds expects the source values to be in the even lanes on
|
|
4075
|
+
// big-endian PPC.
|
|
4076
|
+
return VsxXvcvspsxds(normalized_v);
|
|
3933
4077
|
#endif
|
|
3934
4078
|
#else
|
|
3935
4079
|
const RebindToFloat<decltype(d_to)> df64;
|
|
@@ -3944,22 +4088,18 @@ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::UnsignedTag /*to_type_tag*/,
|
|
|
3944
4088
|
hwy::SizeTag<8> /*to_lane_size_tag*/,
|
|
3945
4089
|
hwy::FloatTag /*from_type_tag*/, D d_to,
|
|
3946
4090
|
V v) {
|
|
3947
|
-
#if !HWY_S390X_HAVE_Z14
|
|
3948
|
-
(HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds))
|
|
4091
|
+
#if !HWY_S390X_HAVE_Z14
|
|
3949
4092
|
(void)d_to;
|
|
3950
4093
|
const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
|
|
3951
4094
|
#if HWY_IS_LITTLE_ENDIAN
|
|
3952
|
-
//
|
|
3953
|
-
// on little-endian PPC, and the
|
|
3954
|
-
// lanes of normalized_v into the odd lanes.
|
|
3955
|
-
return
|
|
3956
|
-
reinterpret_cast<__vector unsigned long long>(__builtin_vsx_xvcvspuxds(
|
|
3957
|
-
vec_sld(normalized_v.raw, normalized_v.raw, 4)))};
|
|
4095
|
+
// VsxXvcvspuxds expects the source values to be in the odd lanes
|
|
4096
|
+
// on little-endian PPC, and the Shuffle2103 operation below will shift the
|
|
4097
|
+
// even lanes of normalized_v into the odd lanes.
|
|
4098
|
+
return VsxXvcvspuxds(Shuffle2103(normalized_v));
|
|
3958
4099
|
#else
|
|
3959
|
-
//
|
|
4100
|
+
// VsxXvcvspuxds expects the source values to be in the even lanes
|
|
3960
4101
|
// on big-endian PPC.
|
|
3961
|
-
return
|
|
3962
|
-
__builtin_vsx_xvcvspuxds(normalized_v.raw))};
|
|
4102
|
+
return VsxXvcvspuxds(normalized_v);
|
|
3963
4103
|
#endif
|
|
3964
4104
|
#else
|
|
3965
4105
|
const RebindToFloat<decltype(d_to)> df64;
|
|
@@ -4001,20 +4141,18 @@ HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
|
|
|
4001
4141
|
hwy::SizeTag<8> /*to_lane_size_tag*/,
|
|
4002
4142
|
hwy::FloatTag /*from_type_tag*/, D d_to,
|
|
4003
4143
|
V v) {
|
|
4004
|
-
#if !HWY_S390X_HAVE_Z14
|
|
4005
|
-
(HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds))
|
|
4144
|
+
#if !HWY_S390X_HAVE_Z14
|
|
4006
4145
|
(void)d_to;
|
|
4007
4146
|
const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
|
|
4008
4147
|
#if HWY_IS_LITTLE_ENDIAN
|
|
4009
|
-
//
|
|
4148
|
+
// VsxXvcvspsxds expects the source values to be in the odd lanes
|
|
4010
4149
|
// on little-endian PPC
|
|
4011
|
-
return
|
|
4150
|
+
return VsxXvcvspsxds(normalized_v);
|
|
4012
4151
|
#else
|
|
4013
|
-
//
|
|
4014
|
-
// on big-endian PPC, and the
|
|
4015
|
-
// of normalized_v into the even lanes.
|
|
4016
|
-
return
|
|
4017
|
-
__builtin_vsx_xvcvspsxds(vec_sld(normalized_v.raw, normalized_v.raw, 4))};
|
|
4152
|
+
// VsxXvcvspsxds expects the source values to be in the even lanes
|
|
4153
|
+
// on big-endian PPC, and the Shuffle0321 operation below will shift the odd
|
|
4154
|
+
// lanes of normalized_v into the even lanes.
|
|
4155
|
+
return VsxXvcvspsxds(Shuffle0321(normalized_v));
|
|
4018
4156
|
#endif
|
|
4019
4157
|
#else
|
|
4020
4158
|
const RebindToFloat<decltype(d_to)> df64;
|
|
@@ -4029,22 +4167,18 @@ HWY_INLINE VFromD<D> PromoteOddTo(hwy::UnsignedTag /*to_type_tag*/,
|
|
|
4029
4167
|
hwy::SizeTag<8> /*to_lane_size_tag*/,
|
|
4030
4168
|
hwy::FloatTag /*from_type_tag*/, D d_to,
|
|
4031
4169
|
V v) {
|
|
4032
|
-
#if !HWY_S390X_HAVE_Z14
|
|
4033
|
-
(HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds))
|
|
4170
|
+
#if !HWY_S390X_HAVE_Z14
|
|
4034
4171
|
(void)d_to;
|
|
4035
4172
|
const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
|
|
4036
4173
|
#if HWY_IS_LITTLE_ENDIAN
|
|
4037
|
-
//
|
|
4174
|
+
// VsxXvcvspuxds expects the source values to be in the odd lanes
|
|
4038
4175
|
// on little-endian PPC
|
|
4039
|
-
return
|
|
4040
|
-
__builtin_vsx_xvcvspuxds(normalized_v.raw))};
|
|
4176
|
+
return VsxXvcvspuxds(normalized_v);
|
|
4041
4177
|
#else
|
|
4042
|
-
//
|
|
4043
|
-
// on big-endian PPC, and the
|
|
4044
|
-
// of normalized_v into the even lanes.
|
|
4045
|
-
return
|
|
4046
|
-
reinterpret_cast<__vector unsigned long long>(__builtin_vsx_xvcvspuxds(
|
|
4047
|
-
vec_sld(normalized_v.raw, normalized_v.raw, 4)))};
|
|
4178
|
+
// VsxXvcvspuxds expects the source values to be in the even lanes
|
|
4179
|
+
// on big-endian PPC, and the Shuffle0321 operation below will shift the odd
|
|
4180
|
+
// lanes of normalized_v into the even lanes.
|
|
4181
|
+
return VsxXvcvspuxds(Shuffle0321(normalized_v));
|
|
4048
4182
|
#endif
|
|
4049
4183
|
#else
|
|
4050
4184
|
const RebindToFloat<decltype(d_to)> df64;
|
|
@@ -4388,12 +4522,22 @@ HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
|
|
|
4388
4522
|
|
|
4389
4523
|
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
|
|
4390
4524
|
HWY_API Vec32<float> DemoteTo(D /* tag */, Vec64<double> v) {
|
|
4525
|
+
#if HWY_S390X_HAVE_Z14 && HWY_COMPILER_GCC_ACTUAL && \
|
|
4526
|
+
HWY_COMPILER_GCC_ACTUAL < 1000
|
|
4527
|
+
// Workaround for compiler error with GCC 9 or earlier on Z14
|
|
4528
|
+
return Vec32<float>{__builtin_s390_vflrd(v.raw, 0, 0)};
|
|
4529
|
+
#else
|
|
4391
4530
|
return Vec32<float>{vec_floate(v.raw)};
|
|
4531
|
+
#endif
|
|
4392
4532
|
}
|
|
4393
4533
|
|
|
4394
4534
|
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
|
|
4395
4535
|
HWY_API Vec64<float> DemoteTo(D d, Vec128<double> v) {
|
|
4396
|
-
#if HWY_S390X_HAVE_Z14
|
|
4536
|
+
#if HWY_S390X_HAVE_Z14 && HWY_COMPILER_GCC_ACTUAL && \
|
|
4537
|
+
HWY_COMPILER_GCC_ACTUAL < 1000
|
|
4538
|
+
// Workaround for compiler error with GCC 9 or earlier on Z14
|
|
4539
|
+
const Vec128<float> f64_to_f32{__builtin_s390_vflrd(v.raw, 0, 0)};
|
|
4540
|
+
#elif HWY_S390X_HAVE_Z14 || HWY_IS_LITTLE_ENDIAN
|
|
4397
4541
|
const Vec128<float> f64_to_f32{vec_floate(v.raw)};
|
|
4398
4542
|
#else
|
|
4399
4543
|
const Vec128<float> f64_to_f32{vec_floato(v.raw)};
|
|
@@ -4578,8 +4722,16 @@ template <class D, typename FromT, HWY_IF_F32_D(D), HWY_IF_UI32(FromT),
|
|
|
4578
4722
|
HWY_API VFromD<D> ConvertTo(D df32, Vec128<FromT> v) {
|
|
4579
4723
|
const RepartitionToWide<decltype(df32)> df64;
|
|
4580
4724
|
|
|
4725
|
+
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1000
|
|
4726
|
+
// Workaround for compiler error with GCC 9 or earlier on Z14
|
|
4727
|
+
const VFromD<D> vf32_lo{
|
|
4728
|
+
__builtin_s390_vflrd(PromoteLowerTo(df64, v).raw, 0, 0)};
|
|
4729
|
+
const VFromD<D> vf32_hi{
|
|
4730
|
+
__builtin_s390_vflrd(PromoteUpperTo(df64, v).raw, 0, 0)};
|
|
4731
|
+
#else
|
|
4581
4732
|
const VFromD<D> vf32_lo{vec_floate(PromoteLowerTo(df64, v).raw)};
|
|
4582
4733
|
const VFromD<D> vf32_hi{vec_floate(PromoteUpperTo(df64, v).raw)};
|
|
4734
|
+
#endif
|
|
4583
4735
|
return ConcatEven(df32, vf32_hi, vf32_lo);
|
|
4584
4736
|
}
|
|
4585
4737
|
#else // Z15 or PPC
|
|
@@ -4670,7 +4822,7 @@ HWY_API VFromD<D> ConvertTo(D /* tag */,
|
|
|
4670
4822
|
template <class D, HWY_IF_I64_D(D)>
|
|
4671
4823
|
HWY_API VFromD<D> ConvertTo(D /* tag */,
|
|
4672
4824
|
Vec128<double, Rebind<double, D>().MaxLanes()> v) {
|
|
4673
|
-
#if defined(__OPTIMIZE__)
|
|
4825
|
+
#if defined(__OPTIMIZE__) && (!HWY_COMPILER_CLANG || !HWY_S390X_HAVE_Z14)
|
|
4674
4826
|
if (detail::IsConstantRawAltivecVect(v.raw)) {
|
|
4675
4827
|
constexpr int64_t kMinI64 = LimitsMin<int64_t>();
|
|
4676
4828
|
constexpr int64_t kMaxI64 = LimitsMax<int64_t>();
|
|
@@ -4769,7 +4921,7 @@ HWY_API VFromD<D> ConvertTo(D /* tag */,
|
|
|
4769
4921
|
HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
|
|
4770
4922
|
#endif
|
|
4771
4923
|
|
|
4772
|
-
#if defined(__OPTIMIZE__)
|
|
4924
|
+
#if defined(__OPTIMIZE__) && (!HWY_COMPILER_CLANG || !HWY_S390X_HAVE_Z14)
|
|
4773
4925
|
if (detail::IsConstantRawAltivecVect(v.raw)) {
|
|
4774
4926
|
constexpr uint64_t kMaxU64 = LimitsMax<uint64_t>();
|
|
4775
4927
|
return Dup128VecFromValues(
|
|
@@ -4815,13 +4967,19 @@ HWY_API Vec128<double, N> Round(Vec128<double, N> v) {
|
|
|
4815
4967
|
#endif
|
|
4816
4968
|
}
|
|
4817
4969
|
|
|
4818
|
-
template <size_t N>
|
|
4819
|
-
HWY_API Vec128<
|
|
4970
|
+
template <typename T, size_t N, HWY_IF_FLOAT3264(T)>
|
|
4971
|
+
HWY_API Vec128<MakeSigned<T>, N> NearestInt(Vec128<T, N> v) {
|
|
4820
4972
|
const DFromV<decltype(v)> d;
|
|
4821
4973
|
const RebindToSigned<decltype(d)> di;
|
|
4822
4974
|
return ConvertTo(di, Round(v));
|
|
4823
4975
|
}
|
|
4824
4976
|
|
|
4977
|
+
template <class DI32, HWY_IF_I32_D(DI32)>
|
|
4978
|
+
HWY_API VFromD<DI32> DemoteToNearestInt(DI32 di32,
|
|
4979
|
+
VFromD<Rebind<double, DI32>> v) {
|
|
4980
|
+
return DemoteTo(di32, Round(v));
|
|
4981
|
+
}
|
|
4982
|
+
|
|
4825
4983
|
// Toward zero, aka truncate
|
|
4826
4984
|
template <typename T, size_t N, HWY_IF_FLOAT(T)>
|
|
4827
4985
|
HWY_API Vec128<T, N> Trunc(Vec128<T, N> v) {
|
|
@@ -5195,6 +5353,13 @@ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
|
|
|
5195
5353
|
|
|
5196
5354
|
namespace detail {
|
|
5197
5355
|
|
|
5356
|
+
// Keeps only the lowest MaxLanes(d) mask bits (all bits for 16-byte vectors).
|
|
5357
|
+
template <class D>
|
|
5358
|
+
constexpr uint64_t OnlyActive(D d, uint64_t mask_bits) {
|
|
5359
|
+
return (d.MaxBytes() == 16) ? mask_bits
|
|
5360
|
+
: mask_bits & ((1ull << d.MaxLanes()) - 1);
|
|
5361
|
+
}
|
|
5362
|
+
|
|
5198
5363
|
#if !HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN
|
|
5199
5364
|
// fallback for missing vec_extractm
|
|
5200
5365
|
template <size_t N>
|
|
@@ -5215,31 +5380,33 @@ HWY_INLINE uint64_t ExtractSignBits(Vec128<uint8_t, N> sign_bits,
|
|
|
5215
5380
|
|
|
5216
5381
|
#endif // !HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN
|
|
5217
5382
|
|
|
5218
|
-
|
|
5219
|
-
|
|
5220
|
-
|
|
5383
|
+
} // namespace detail
|
|
5384
|
+
|
|
5385
|
+
template <class D, HWY_IF_T_SIZE_D(D, 1)>
|
|
5386
|
+
HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
|
|
5221
5387
|
const Repartition<uint8_t, decltype(d)> du8;
|
|
5222
5388
|
const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
|
|
5223
5389
|
|
|
5224
5390
|
#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
|
|
5225
|
-
return
|
|
5391
|
+
return detail::OnlyActive(d,
|
|
5392
|
+
static_cast<uint64_t>(vec_extractm(sign_bits.raw)));
|
|
5226
5393
|
#else // Z14, Z15, PPC8, PPC9, or big-endian PPC10
|
|
5227
5394
|
const __vector unsigned char kBitShuffle = {120, 112, 104, 96, 88, 80, 72, 64,
|
|
5228
5395
|
56, 48, 40, 32, 24, 16, 8, 0};
|
|
5229
|
-
return ExtractSignBits(sign_bits, kBitShuffle);
|
|
5396
|
+
return detail::OnlyActive(d, detail::ExtractSignBits(sign_bits, kBitShuffle));
|
|
5230
5397
|
#endif // HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
|
|
5231
5398
|
}
|
|
5232
5399
|
|
|
5233
|
-
template <
|
|
5234
|
-
|
|
5235
|
-
const DFromM<decltype(mask)> d;
|
|
5400
|
+
template <class D, HWY_IF_T_SIZE_D(D, 2)>
|
|
5401
|
+
HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
|
|
5236
5402
|
const RebindToUnsigned<decltype(d)> du;
|
|
5237
5403
|
|
|
5238
5404
|
const Repartition<uint8_t, decltype(d)> du8;
|
|
5239
5405
|
const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
|
|
5240
5406
|
|
|
5241
5407
|
#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
|
|
5242
|
-
return
|
|
5408
|
+
return detail::OnlyActive(
|
|
5409
|
+
d, static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw)));
|
|
5243
5410
|
#else // Z14, Z15, PPC8, PPC9, or big-endian PPC10
|
|
5244
5411
|
(void)du;
|
|
5245
5412
|
#if HWY_IS_LITTLE_ENDIAN
|
|
@@ -5249,20 +5416,20 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128<T, N> mask) {
|
|
|
5249
5416
|
const __vector unsigned char kBitShuffle = {
|
|
5250
5417
|
128, 128, 128, 128, 128, 128, 128, 128, 112, 96, 80, 64, 48, 32, 16, 0};
|
|
5251
5418
|
#endif
|
|
5252
|
-
return ExtractSignBits(sign_bits, kBitShuffle);
|
|
5419
|
+
return detail::OnlyActive(d, detail::ExtractSignBits(sign_bits, kBitShuffle));
|
|
5253
5420
|
#endif // HWY_PPC_HAVE_10
|
|
5254
5421
|
}
|
|
5255
5422
|
|
|
5256
|
-
template <
|
|
5257
|
-
|
|
5258
|
-
const DFromM<decltype(mask)> d;
|
|
5423
|
+
template <class D, HWY_IF_T_SIZE_D(D, 4)>
|
|
5424
|
+
HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
|
|
5259
5425
|
const RebindToUnsigned<decltype(d)> du;
|
|
5260
5426
|
|
|
5261
5427
|
const Repartition<uint8_t, decltype(d)> du8;
|
|
5262
5428
|
const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
|
|
5263
5429
|
|
|
5264
5430
|
#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
|
|
5265
|
-
return
|
|
5431
|
+
return detail::OnlyActive(
|
|
5432
|
+
d, static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw)));
|
|
5266
5433
|
#else // Z14, Z15, PPC8, PPC9, or big-endian PPC10
|
|
5267
5434
|
(void)du;
|
|
5268
5435
|
#if HWY_IS_LITTLE_ENDIAN
|
|
@@ -5274,20 +5441,20 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128<T, N> mask) {
|
|
|
5274
5441
|
128, 128, 128, 128, 128, 128,
|
|
5275
5442
|
96, 64, 32, 0};
|
|
5276
5443
|
#endif
|
|
5277
|
-
return ExtractSignBits(sign_bits, kBitShuffle);
|
|
5444
|
+
return detail::OnlyActive(d, detail::ExtractSignBits(sign_bits, kBitShuffle));
|
|
5278
5445
|
#endif // HWY_PPC_HAVE_10
|
|
5279
5446
|
}
|
|
5280
5447
|
|
|
5281
|
-
template <
|
|
5282
|
-
|
|
5283
|
-
const DFromM<decltype(mask)> d;
|
|
5448
|
+
template <class D, HWY_IF_T_SIZE_D(D, 8)>
|
|
5449
|
+
HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
|
|
5284
5450
|
const RebindToUnsigned<decltype(d)> du;
|
|
5285
5451
|
|
|
5286
5452
|
const Repartition<uint8_t, decltype(d)> du8;
|
|
5287
5453
|
const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
|
|
5288
5454
|
|
|
5289
5455
|
#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
|
|
5290
|
-
return
|
|
5456
|
+
return detail::OnlyActive(
|
|
5457
|
+
d, static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw)));
|
|
5291
5458
|
#else // Z14, Z15, PPC8, PPC9, or big-endian PPC10
|
|
5292
5459
|
(void)du;
|
|
5293
5460
|
#if HWY_IS_LITTLE_ENDIAN
|
|
@@ -5299,35 +5466,22 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128<T, N> mask) {
|
|
|
5299
5466
|
128, 128, 128, 128, 128, 128,
|
|
5300
5467
|
128, 128, 64, 0};
|
|
5301
5468
|
#endif
|
|
5302
|
-
return ExtractSignBits(sign_bits, kBitShuffle);
|
|
5469
|
+
return detail::OnlyActive(d, detail::ExtractSignBits(sign_bits, kBitShuffle));
|
|
5303
5470
|
#endif // HWY_PPC_HAVE_10
|
|
5304
5471
|
}
|
|
5305
5472
|
|
|
5306
|
-
// Returns the lowest N of the mask bits.
|
|
5307
|
-
template <typename T, size_t N>
|
|
5308
|
-
constexpr uint64_t OnlyActive(uint64_t mask_bits) {
|
|
5309
|
-
return ((N * sizeof(T)) == 16) ? mask_bits : mask_bits & ((1ull << N) - 1);
|
|
5310
|
-
}
|
|
5311
|
-
|
|
5312
|
-
template <typename T, size_t N>
|
|
5313
|
-
HWY_INLINE uint64_t BitsFromMask(Mask128<T, N> mask) {
|
|
5314
|
-
return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
|
|
5315
|
-
}
|
|
5316
|
-
|
|
5317
|
-
} // namespace detail
|
|
5318
|
-
|
|
5319
5473
|
// `p` points to at least 8 writable bytes.
|
|
5320
5474
|
template <class D, HWY_IF_LANES_LE_D(D, 8)>
|
|
5321
|
-
HWY_API size_t StoreMaskBits(D
|
|
5475
|
+
HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
|
|
5322
5476
|
// For vectors with 8 or fewer lanes, simply cast the result of BitsFromMask
|
|
5323
5477
|
// to an uint8_t and store the result in bits[0].
|
|
5324
|
-
bits[0] = static_cast<uint8_t>(
|
|
5478
|
+
bits[0] = static_cast<uint8_t>(BitsFromMask(d, mask));
|
|
5325
5479
|
return sizeof(uint8_t);
|
|
5326
5480
|
}
|
|
5327
5481
|
|
|
5328
5482
|
template <class D, HWY_IF_LANES_D(D, 16)>
|
|
5329
|
-
HWY_API size_t StoreMaskBits(D
|
|
5330
|
-
const auto mask_bits =
|
|
5483
|
+
HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
|
|
5484
|
+
const auto mask_bits = BitsFromMask(d, mask);
|
|
5331
5485
|
|
|
5332
5486
|
// First convert mask_bits to a uint16_t as we only want to store
|
|
5333
5487
|
// the lower 16 bits of mask_bits as there are 16 lanes in mask.
|
|
@@ -5392,8 +5546,8 @@ HWY_API bool AllTrue(D d, MFromD<D> mask) {
|
|
|
5392
5546
|
}
|
|
5393
5547
|
|
|
5394
5548
|
template <class D>
|
|
5395
|
-
HWY_API size_t CountTrue(D
|
|
5396
|
-
return PopCount(
|
|
5549
|
+
HWY_API size_t CountTrue(D d, MFromD<D> mask) {
|
|
5550
|
+
return PopCount(BitsFromMask(d, mask));
|
|
5397
5551
|
}
|
|
5398
5552
|
|
|
5399
5553
|
#if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
|
|
@@ -5440,8 +5594,7 @@ HWY_API size_t FindKnownFirstTrue(D d, MFromD<D> mask) {
|
|
|
5440
5594
|
return detail::VsxCntlzLsbb(bytes) / sizeof(T);
|
|
5441
5595
|
}
|
|
5442
5596
|
#endif // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
|
|
5443
|
-
(
|
|
5444
|
-
return Num0BitsBelowLS1Bit_Nonzero64(detail::BitsFromMask(mask));
|
|
5597
|
+
return Num0BitsBelowLS1Bit_Nonzero64(BitsFromMask(d, mask));
|
|
5445
5598
|
}
|
|
5446
5599
|
|
|
5447
5600
|
template <class D, typename T = TFromD<D>>
|
|
@@ -5456,8 +5609,7 @@ HWY_API intptr_t FindFirstTrue(D d, MFromD<D> mask) {
|
|
|
5456
5609
|
return idx == kN ? -1 : static_cast<intptr_t>(idx);
|
|
5457
5610
|
}
|
|
5458
5611
|
#endif // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
|
|
5459
|
-
(
|
|
5460
|
-
const uint64_t mask_bits = detail::BitsFromMask(mask);
|
|
5612
|
+
const uint64_t mask_bits = BitsFromMask(d, mask);
|
|
5461
5613
|
return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask_bits)) : -1;
|
|
5462
5614
|
}
|
|
5463
5615
|
|
|
@@ -5472,8 +5624,7 @@ HWY_API size_t FindKnownLastTrue(D d, MFromD<D> mask) {
|
|
|
5472
5624
|
return 16 / sizeof(T) - 1 - idx;
|
|
5473
5625
|
}
|
|
5474
5626
|
#endif // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
|
|
5475
|
-
(
|
|
5476
|
-
return 63 - Num0BitsAboveMS1Bit_Nonzero64(detail::BitsFromMask(mask));
|
|
5627
|
+
return 63 - Num0BitsAboveMS1Bit_Nonzero64(BitsFromMask(d, mask));
|
|
5477
5628
|
}
|
|
5478
5629
|
|
|
5479
5630
|
template <class D, typename T = TFromD<D>>
|
|
@@ -5488,8 +5639,7 @@ HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) {
|
|
|
5488
5639
|
return idx == kN ? -1 : static_cast<intptr_t>(kN - 1 - idx);
|
|
5489
5640
|
}
|
|
5490
5641
|
#endif // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
|
|
5491
|
-
(
|
|
5492
|
-
const uint64_t mask_bits = detail::BitsFromMask(mask);
|
|
5642
|
+
const uint64_t mask_bits = BitsFromMask(d, mask);
|
|
5493
5643
|
return mask_bits ? intptr_t(63 - Num0BitsAboveMS1Bit_Nonzero64(mask_bits))
|
|
5494
5644
|
: -1;
|
|
5495
5645
|
}
|
|
@@ -5985,7 +6135,8 @@ HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
|
|
|
5985
6135
|
// General case, 2 or 4 bytes
|
|
5986
6136
|
template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
|
|
5987
6137
|
HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
|
|
5988
|
-
|
|
6138
|
+
const DFromV<decltype(v)> d;
|
|
6139
|
+
return detail::CompressBits(v, BitsFromMask(d, mask));
|
|
5989
6140
|
}
|
|
5990
6141
|
|
|
5991
6142
|
// ------------------------------ CompressNot
|
|
@@ -6021,12 +6172,13 @@ HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
|
|
|
6021
6172
|
// General case, 2 or 4 bytes
|
|
6022
6173
|
template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
|
|
6023
6174
|
HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
|
|
6175
|
+
const DFromV<decltype(v)> d;
|
|
6024
6176
|
// For partial vectors, we cannot pull the Not() into the table because
|
|
6025
6177
|
// BitsFromMask clears the upper bits.
|
|
6026
6178
|
if (N < 16 / sizeof(T)) {
|
|
6027
|
-
return detail::CompressBits(v,
|
|
6179
|
+
return detail::CompressBits(v, BitsFromMask(d, Not(mask)));
|
|
6028
6180
|
}
|
|
6029
|
-
return detail::CompressNotBits(v,
|
|
6181
|
+
return detail::CompressNotBits(v, BitsFromMask(d, mask));
|
|
6030
6182
|
}
|
|
6031
6183
|
|
|
6032
6184
|
// ------------------------------ CompressBlocksNot
|
|
@@ -6076,7 +6228,7 @@ HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> m, D d,
|
|
|
6076
6228
|
TFromD<D>* HWY_RESTRICT unaligned) {
|
|
6077
6229
|
const RebindToUnsigned<decltype(d)> du;
|
|
6078
6230
|
|
|
6079
|
-
const uint64_t mask_bits =
|
|
6231
|
+
const uint64_t mask_bits = BitsFromMask(d, m);
|
|
6080
6232
|
HWY_DASSERT(mask_bits < (1ull << MaxLanes(d)));
|
|
6081
6233
|
const size_t count = PopCount(mask_bits);
|
|
6082
6234
|
|
|
@@ -6103,7 +6255,7 @@ HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
|
|
|
6103
6255
|
TFromD<D>* HWY_RESTRICT unaligned) {
|
|
6104
6256
|
const RebindToUnsigned<decltype(d)> du;
|
|
6105
6257
|
|
|
6106
|
-
const uint64_t mask_bits =
|
|
6258
|
+
const uint64_t mask_bits = BitsFromMask(d, m);
|
|
6107
6259
|
HWY_DASSERT(mask_bits < (1ull << MaxLanes(d)));
|
|
6108
6260
|
const size_t count = PopCount(mask_bits);
|
|
6109
6261
|
|
|
@@ -6228,9 +6380,16 @@ HWY_INLINE V Per128BitBlkRevLanesOnBe(V v) {
|
|
|
6228
6380
|
template <class V>
|
|
6229
6381
|
HWY_INLINE V I128Subtract(V a, V b) {
|
|
6230
6382
|
#if HWY_S390X_HAVE_Z14
|
|
6383
|
+
#if HWY_COMPILER_CLANG
|
|
6384
|
+
// Workaround for bug in vec_sub_u128 in Clang vecintrin.h
|
|
6385
|
+
typedef __uint128_t VU128 __attribute__((__vector_size__(16)));
|
|
6386
|
+
const V diff_i128{reinterpret_cast<typename detail::Raw128<TFromV<V>>::type>(
|
|
6387
|
+
reinterpret_cast<VU128>(a.raw) - reinterpret_cast<VU128>(b.raw))};
|
|
6388
|
+
#else // !HWY_COMPILER_CLANG
|
|
6231
6389
|
const V diff_i128{reinterpret_cast<typename detail::Raw128<TFromV<V>>::type>(
|
|
6232
6390
|
vec_sub_u128(reinterpret_cast<__vector unsigned char>(a.raw),
|
|
6233
6391
|
reinterpret_cast<__vector unsigned char>(b.raw)))};
|
|
6392
|
+
#endif // HWY_COMPILER_CLANG
|
|
6234
6393
|
#elif defined(__SIZEOF_INT128__)
|
|
6235
6394
|
using VU128 = __vector unsigned __int128;
|
|
6236
6395
|
const V diff_i128{reinterpret_cast<typename detail::Raw128<TFromV<V>>::type>(
|
|
@@ -6725,6 +6884,26 @@ HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
|
|
|
6725
6884
|
#if HWY_S390X_HAVE_Z14
|
|
6726
6885
|
namespace detail {
|
|
6727
6886
|
|
|
6887
|
+
#if HWY_COMPILER_CLANG && HWY_HAS_BUILTIN(__builtin_s390_vsumqf) && \
|
|
6888
|
+
HWY_HAS_BUILTIN(__builtin_s390_vsumqg)
|
|
6889
|
+
// Workaround for bug in vec_sum_u128 in Clang vecintrin.h
|
|
6890
|
+
template <class T, HWY_IF_UI32(T)>
|
|
6891
|
+
HWY_INLINE Vec128<T> SumOfU32OrU64LanesAsU128(Vec128<T> v) {
|
|
6892
|
+
typedef __uint128_t VU128 __attribute__((__vector_size__(16)));
|
|
6893
|
+
const DFromV<decltype(v)> d;
|
|
6894
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
6895
|
+
const VU128 sum = {__builtin_s390_vsumqf(BitCast(du, v).raw, Zero(du).raw)};
|
|
6896
|
+
return Vec128<T>{reinterpret_cast<typename detail::Raw128<T>::type>(sum)};
|
|
6897
|
+
}
|
|
6898
|
+
template <class T, HWY_IF_UI64(T)>
|
|
6899
|
+
HWY_INLINE Vec128<T> SumOfU32OrU64LanesAsU128(Vec128<T> v) {
|
|
6900
|
+
typedef __uint128_t VU128 __attribute__((__vector_size__(16)));
|
|
6901
|
+
const DFromV<decltype(v)> d;
|
|
6902
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
6903
|
+
const VU128 sum = {__builtin_s390_vsumqg(BitCast(du, v).raw, Zero(du).raw)};
|
|
6904
|
+
return Vec128<T>{reinterpret_cast<typename detail::Raw128<T>::type>(sum)};
|
|
6905
|
+
}
|
|
6906
|
+
#else
|
|
6728
6907
|
template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
|
|
6729
6908
|
HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))>
|
|
6730
6909
|
HWY_INLINE Vec128<T> SumOfU32OrU64LanesAsU128(Vec128<T> v) {
|
|
@@ -6733,6 +6912,7 @@ HWY_INLINE Vec128<T> SumOfU32OrU64LanesAsU128(Vec128<T> v) {
|
|
|
6733
6912
|
return BitCast(
|
|
6734
6913
|
d, Vec128<uint8_t>{vec_sum_u128(BitCast(du, v).raw, Zero(du).raw)});
|
|
6735
6914
|
}
|
|
6915
|
+
#endif
|
|
6736
6916
|
|
|
6737
6917
|
} // namespace detail
|
|
6738
6918
|
|