@img/sharp-libvips-dev 1.2.0 → 1.2.2-rc.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/include/ffi.h +3 -3
- package/include/harfbuzz/hb-deprecated.h +4 -4
- package/include/harfbuzz/hb-font.h +120 -9
- package/include/harfbuzz/hb-version.h +3 -3
- package/include/hwy/abort.h +2 -19
- package/include/hwy/aligned_allocator.h +11 -7
- package/include/hwy/auto_tune.h +504 -0
- package/include/hwy/base.h +425 -104
- package/include/hwy/cache_control.h +16 -0
- package/include/hwy/detect_compiler_arch.h +32 -1
- package/include/hwy/detect_targets.h +251 -67
- package/include/hwy/foreach_target.h +35 -0
- package/include/hwy/highway.h +185 -76
- package/include/hwy/nanobenchmark.h +1 -19
- package/include/hwy/ops/arm_neon-inl.h +969 -458
- package/include/hwy/ops/arm_sve-inl.h +1137 -359
- package/include/hwy/ops/emu128-inl.h +97 -11
- package/include/hwy/ops/generic_ops-inl.h +1222 -34
- package/include/hwy/ops/loongarch_lasx-inl.h +4664 -0
- package/include/hwy/ops/loongarch_lsx-inl.h +5933 -0
- package/include/hwy/ops/ppc_vsx-inl.h +306 -126
- package/include/hwy/ops/rvv-inl.h +546 -51
- package/include/hwy/ops/scalar-inl.h +77 -22
- package/include/hwy/ops/set_macros-inl.h +138 -17
- package/include/hwy/ops/shared-inl.h +50 -10
- package/include/hwy/ops/wasm_128-inl.h +137 -92
- package/include/hwy/ops/x86_128-inl.h +773 -214
- package/include/hwy/ops/x86_256-inl.h +712 -255
- package/include/hwy/ops/x86_512-inl.h +429 -753
- package/include/hwy/ops/x86_avx3-inl.h +501 -0
- package/include/hwy/per_target.h +2 -1
- package/include/hwy/profiler.h +622 -486
- package/include/hwy/targets.h +62 -20
- package/include/hwy/timer-inl.h +8 -160
- package/include/hwy/timer.h +170 -3
- package/include/hwy/x86_cpuid.h +81 -0
- package/include/libheif/heif_cxx.h +25 -5
- package/include/libheif/heif_regions.h +5 -5
- package/include/libheif/heif_version.h +2 -2
- package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
- package/include/libxml2/libxml/xmlversion.h +4 -4
- package/include/pango-1.0/pango/pango-enum-types.h +3 -0
- package/include/pango-1.0/pango/pango-features.h +3 -3
- package/include/pango-1.0/pango/pango-font.h +30 -0
- package/include/pango-1.0/pango/pango-version-macros.h +26 -0
- package/include/pixman-1/pixman-version.h +2 -2
- package/include/webp/decode.h +11 -2
- package/include/webp/demux.h +2 -0
- package/include/webp/encode.h +2 -0
- package/include/webp/mux_types.h +1 -0
- package/include/webp/sharpyuv/sharpyuv.h +1 -1
- package/include/webp/types.h +2 -2
- package/include/zlib.h +3 -3
- package/package.json +1 -1
- package/versions.json +11 -11
|
@@ -538,6 +538,17 @@ HWY_API Vec128<uint16_t, N> AverageRound(const Vec128<uint16_t, N> a,
|
|
|
538
538
|
return Vec128<uint16_t, N>{wasm_u16x8_avgr(a.raw, b.raw)};
|
|
539
539
|
}
|
|
540
540
|
|
|
541
|
+
template <class V, HWY_IF_SIGNED_V(V),
|
|
542
|
+
HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
|
|
543
|
+
HWY_API V AverageRound(V a, V b) {
|
|
544
|
+
const DFromV<decltype(a)> d;
|
|
545
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
546
|
+
const V sign_bit = SignBit(d);
|
|
547
|
+
return Xor(BitCast(d, AverageRound(BitCast(du, Xor(a, sign_bit)),
|
|
548
|
+
BitCast(du, Xor(b, sign_bit)))),
|
|
549
|
+
sign_bit);
|
|
550
|
+
}
|
|
551
|
+
|
|
541
552
|
// ------------------------------ Absolute value
|
|
542
553
|
|
|
543
554
|
// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
|
|
@@ -892,6 +903,24 @@ HWY_API Vec128<double, N> Max(Vec128<double, N> a, Vec128<double, N> b) {
|
|
|
892
903
|
return Vec128<double, N>{wasm_f64x2_pmax(b.raw, a.raw)};
|
|
893
904
|
}
|
|
894
905
|
|
|
906
|
+
// ------------------------------ MinNumber and MaxNumber
|
|
907
|
+
|
|
908
|
+
#ifdef HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
|
|
909
|
+
#undef HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
|
|
910
|
+
#else
|
|
911
|
+
#define HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
|
|
912
|
+
#endif
|
|
913
|
+
|
|
914
|
+
template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
|
|
915
|
+
HWY_API V MinNumber(V a, V b) {
|
|
916
|
+
return Min(a, IfThenElse(IsNaN(b), a, b));
|
|
917
|
+
}
|
|
918
|
+
|
|
919
|
+
template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
|
|
920
|
+
HWY_API V MaxNumber(V a, V b) {
|
|
921
|
+
return Max(a, IfThenElse(IsNaN(b), a, b));
|
|
922
|
+
}
|
|
923
|
+
|
|
895
924
|
// ------------------------------ Integer multiplication
|
|
896
925
|
|
|
897
926
|
// Unsigned
|
|
@@ -1093,9 +1122,9 @@ HWY_API Vec128<double, N> operator/(const Vec128<double, N> a,
|
|
|
1093
1122
|
return Vec128<double, N>{wasm_f64x2_div(a.raw, b.raw)};
|
|
1094
1123
|
}
|
|
1095
1124
|
|
|
1096
|
-
template <
|
|
1097
|
-
HWY_API
|
|
1098
|
-
return Set(DFromV<decltype(v)>(),
|
|
1125
|
+
template <class V, HWY_IF_F32(TFromV<V>)>
|
|
1126
|
+
HWY_API V ApproximateReciprocal(const V v) {
|
|
1127
|
+
return Set(DFromV<decltype(v)>(), 1.0f) / v;
|
|
1099
1128
|
}
|
|
1100
1129
|
|
|
1101
1130
|
// Integer overload defined in generic_ops-inl.h.
|
|
@@ -1143,10 +1172,10 @@ HWY_API Vec128<double, N> Sqrt(const Vec128<double, N> v) {
|
|
|
1143
1172
|
}
|
|
1144
1173
|
|
|
1145
1174
|
// Approximate reciprocal square root
|
|
1146
|
-
template <
|
|
1147
|
-
HWY_API
|
|
1175
|
+
template <class V, HWY_IF_F32(TFromV<V>)>
|
|
1176
|
+
HWY_API V ApproximateReciprocalSqrt(V v) {
|
|
1148
1177
|
// TODO(eustas): find a cheaper way to calculate this.
|
|
1149
|
-
return Set(DFromV<decltype(v)>(),
|
|
1178
|
+
return Set(DFromV<decltype(v)>(), static_cast<TFromV<V>>(1.0)) / Sqrt(v);
|
|
1150
1179
|
}
|
|
1151
1180
|
|
|
1152
1181
|
// ------------------------------ Floating-point rounding
|
|
@@ -3185,6 +3214,19 @@ HWY_API Vec128<int64_t, N> InterleaveUpper(Vec128<int64_t, N> a,
|
|
|
3185
3214
|
return Vec128<int64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
|
|
3186
3215
|
}
|
|
3187
3216
|
|
|
3217
|
+
template <size_t N>
|
|
3218
|
+
HWY_API Vec128<float16_t, N> InterleaveUpper(Vec128<float16_t, N> a,
|
|
3219
|
+
Vec128<float16_t, N> b) {
|
|
3220
|
+
return Vec128<float16_t, N>{
|
|
3221
|
+
wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
|
|
3222
|
+
}
|
|
3223
|
+
template <size_t N>
|
|
3224
|
+
HWY_API Vec128<bfloat16_t, N> InterleaveUpper(Vec128<bfloat16_t, N> a,
|
|
3225
|
+
Vec128<bfloat16_t, N> b) {
|
|
3226
|
+
return Vec128<bfloat16_t, N>{
|
|
3227
|
+
wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
|
|
3228
|
+
}
|
|
3229
|
+
|
|
3188
3230
|
template <size_t N>
|
|
3189
3231
|
HWY_API Vec128<float, N> InterleaveUpper(Vec128<float, N> a,
|
|
3190
3232
|
Vec128<float, N> b) {
|
|
@@ -3885,18 +3927,26 @@ HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
|
|
|
3885
3927
|
}
|
|
3886
3928
|
|
|
3887
3929
|
// ------------------------------ SwapAdjacentBlocks
|
|
3888
|
-
|
|
3889
3930
|
template <typename T, size_t N>
|
|
3890
3931
|
HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
|
|
3891
3932
|
return v;
|
|
3892
3933
|
}
|
|
3893
3934
|
|
|
3894
|
-
// ------------------------------
|
|
3935
|
+
// ------------------------------ InterleaveEvenBlocks
|
|
3936
|
+
template <class D, class V = VFromD<D>, HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
3937
|
+
HWY_API V InterleaveEvenBlocks(D, V a, V /*b*/) {
|
|
3938
|
+
return a;
|
|
3939
|
+
}
|
|
3940
|
+
// ------------------------------ InterleaveOddBlocks
|
|
3941
|
+
template <class D, class V = VFromD<D>, HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
3942
|
+
HWY_API V InterleaveOddBlocks(D, V a, V /*b*/) {
|
|
3943
|
+
return a;
|
|
3944
|
+
}
|
|
3895
3945
|
|
|
3896
|
-
//
|
|
3946
|
+
// ------------------------------ ReverseBlocks
|
|
3897
3947
|
template <class D>
|
|
3898
3948
|
HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) {
|
|
3899
|
-
return v;
|
|
3949
|
+
return v; // Single block: no change
|
|
3900
3950
|
}
|
|
3901
3951
|
|
|
3902
3952
|
// ================================================== CONVERT
|
|
@@ -4791,11 +4841,19 @@ HWY_API VFromD<DU> ConvertTo(DU du, VFromD<Rebind<double, DU>> v) {
|
|
|
4791
4841
|
}
|
|
4792
4842
|
|
|
4793
4843
|
// ------------------------------ NearestInt (Round)
|
|
4794
|
-
template <size_t N>
|
|
4795
|
-
HWY_API Vec128<
|
|
4844
|
+
template <typename T, size_t N, HWY_IF_FLOAT3264(T)>
|
|
4845
|
+
HWY_API Vec128<MakeSigned<T>, N> NearestInt(const Vec128<T, N> v) {
|
|
4796
4846
|
return ConvertTo(RebindToSigned<DFromV<decltype(v)>>(), Round(v));
|
|
4797
4847
|
}
|
|
4798
4848
|
|
|
4849
|
+
// ------------------------------ DemoteToNearestInt (Round)
|
|
4850
|
+
template <class DI32, HWY_IF_I32_D(DI32)>
|
|
4851
|
+
HWY_API VFromD<DI32> DemoteToNearestInt(DI32 di32,
|
|
4852
|
+
VFromD<Rebind<double, DI32>> v) {
|
|
4853
|
+
// No single instruction, round then demote.
|
|
4854
|
+
return DemoteTo(di32, Round(v));
|
|
4855
|
+
}
|
|
4856
|
+
|
|
4799
4857
|
// ================================================== MISC
|
|
4800
4858
|
|
|
4801
4859
|
// ------------------------------ SumsOf8 (ShiftRight, Add)
|
|
@@ -4914,76 +4972,74 @@ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
|
|
|
4914
4972
|
|
|
4915
4973
|
namespace detail {
|
|
4916
4974
|
|
|
4917
|
-
//
|
|
4918
|
-
template <
|
|
4919
|
-
|
|
4920
|
-
|
|
4975
|
+
// Returns the lowest N bits for the BitsFromMask result.
|
|
4976
|
+
template <class D>
|
|
4977
|
+
constexpr uint64_t OnlyActive(D d, uint64_t bits) {
|
|
4978
|
+
return (d.MaxBytes() == 16) ? bits : bits & ((1ull << d.MaxLanes()) - 1);
|
|
4979
|
+
}
|
|
4980
|
+
|
|
4981
|
+
} // namespace detail
|
|
4982
|
+
|
|
4983
|
+
template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_D(D, 16)>
|
|
4984
|
+
HWY_API uint64_t BitsFromMask(D /*d*/, const MFromD<D> mask) {
|
|
4921
4985
|
alignas(16) uint64_t lanes[2];
|
|
4922
4986
|
wasm_v128_store(lanes, mask.raw);
|
|
4923
4987
|
|
|
4924
4988
|
constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
|
|
4925
4989
|
const uint64_t lo = ((lanes[0] * kMagic) >> 56);
|
|
4926
4990
|
const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00;
|
|
4927
|
-
return
|
|
4991
|
+
return hi + lo; // exactly 16 bits, no OnlyActive required
|
|
4928
4992
|
}
|
|
4929
4993
|
|
|
4930
|
-
|
|
4931
|
-
|
|
4932
|
-
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
|
|
4933
|
-
const Mask128<T, 8> mask) {
|
|
4994
|
+
template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_D(D, 8)>
|
|
4995
|
+
HWY_API uint64_t BitsFromMask(D /*d*/, const MFromD<D> mask) {
|
|
4934
4996
|
constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
|
|
4935
|
-
|
|
4936
|
-
|
|
4937
|
-
|
|
4997
|
+
const uint64_t bytes =
|
|
4998
|
+
static_cast<uint64_t>(wasm_i64x2_extract_lane(mask.raw, 0));
|
|
4999
|
+
return (bytes * kMagic) >> 56; // exactly 8 bits, no OnlyActive required
|
|
4938
5000
|
}
|
|
4939
5001
|
|
|
4940
5002
|
// 32-bit or less: need masking
|
|
4941
|
-
template <
|
|
4942
|
-
|
|
4943
|
-
const Mask128<T, N> mask) {
|
|
5003
|
+
template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_LE_D(D, 4)>
|
|
5004
|
+
HWY_API uint64_t BitsFromMask(D d, const MFromD<D> mask) {
|
|
4944
5005
|
uint64_t bytes = static_cast<uint64_t>(wasm_i64x2_extract_lane(mask.raw, 0));
|
|
4945
5006
|
// Clear potentially undefined bytes.
|
|
4946
|
-
bytes &= (1ULL << (
|
|
5007
|
+
bytes &= (1ULL << (Lanes(d) * 8)) - 1;
|
|
4947
5008
|
constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
|
|
4948
|
-
return (bytes * kMagic) >> 56;
|
|
5009
|
+
return detail::OnlyActive(d, (bytes * kMagic) >> 56);
|
|
4949
5010
|
}
|
|
4950
5011
|
|
|
4951
|
-
template <
|
|
4952
|
-
|
|
4953
|
-
const Mask128<T, N> mask) {
|
|
5012
|
+
template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
5013
|
+
HWY_API uint64_t BitsFromMask(D /*d*/, const MFromD<D> mask) {
|
|
4954
5014
|
// Remove useless lower half of each u16 while preserving the sign bit.
|
|
5015
|
+
const Rebind<uint8_t, D> d8;
|
|
5016
|
+
using M8 = MFromD<decltype(d8)>;
|
|
4955
5017
|
const __i16x8 zero = wasm_i16x8_splat(0);
|
|
4956
|
-
const
|
|
4957
|
-
return
|
|
5018
|
+
const M8 mask8{wasm_i8x16_narrow_i16x8(mask.raw, zero)};
|
|
5019
|
+
return detail::OnlyActive(d8, BitsFromMask(d8, mask8));
|
|
4958
5020
|
}
|
|
4959
5021
|
|
|
4960
|
-
template <
|
|
4961
|
-
|
|
4962
|
-
const Mask128<T, N> mask) {
|
|
5022
|
+
template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
5023
|
+
HWY_API uint64_t BitsFromMask(D d, const MFromD<D> mask) {
|
|
4963
5024
|
const __i32x4 mask_i = static_cast<__i32x4>(mask.raw);
|
|
4964
5025
|
const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8);
|
|
4965
5026
|
const __i32x4 sliced_mask = wasm_v128_and(mask_i, slice);
|
|
4966
5027
|
alignas(16) uint32_t lanes[4];
|
|
4967
5028
|
wasm_v128_store(lanes, sliced_mask);
|
|
4968
|
-
return lanes[0] | lanes[1] | lanes[2] | lanes[3];
|
|
5029
|
+
return detail::OnlyActive(d, lanes[0] | lanes[1] | lanes[2] | lanes[3]);
|
|
4969
5030
|
}
|
|
4970
5031
|
|
|
4971
|
-
template <
|
|
4972
|
-
|
|
4973
|
-
const Mask128<T, N> mask) {
|
|
5032
|
+
template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
5033
|
+
HWY_API uint64_t BitsFromMask(D d, const MFromD<D> mask) {
|
|
4974
5034
|
const __i64x2 mask_i = static_cast<__i64x2>(mask.raw);
|
|
4975
5035
|
const __i64x2 slice = wasm_i64x2_make(1, 2);
|
|
4976
5036
|
const __i64x2 sliced_mask = wasm_v128_and(mask_i, slice);
|
|
4977
5037
|
alignas(16) uint64_t lanes[2];
|
|
4978
5038
|
wasm_v128_store(lanes, sliced_mask);
|
|
4979
|
-
return lanes[0] | lanes[1];
|
|
5039
|
+
return detail::OnlyActive(d, lanes[0] | lanes[1]);
|
|
4980
5040
|
}
|
|
4981
5041
|
|
|
4982
|
-
|
|
4983
|
-
template <typename T, size_t N>
|
|
4984
|
-
constexpr uint64_t OnlyActive(uint64_t bits) {
|
|
4985
|
-
return ((N * sizeof(T)) == 16) ? bits : bits & ((1ull << N) - 1);
|
|
4986
|
-
}
|
|
5042
|
+
namespace detail {
|
|
4987
5043
|
|
|
4988
5044
|
// Returns 0xFF for bytes with index >= N, otherwise 0.
|
|
4989
5045
|
template <size_t N>
|
|
@@ -5015,53 +5071,40 @@ constexpr __i8x16 BytesAbove() {
|
|
|
5015
5071
|
: wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1);
|
|
5016
5072
|
}
|
|
5017
5073
|
|
|
5018
|
-
|
|
5019
|
-
HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) {
|
|
5020
|
-
return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
|
|
5021
|
-
}
|
|
5074
|
+
} // namespace detail
|
|
5022
5075
|
|
|
5023
|
-
|
|
5024
|
-
|
|
5025
|
-
|
|
5076
|
+
// `bits` points to at least 8 writable bytes.
|
|
5077
|
+
template <class D>
|
|
5078
|
+
HWY_API size_t StoreMaskBits(D d, const MFromD<D> mask, uint8_t* bits) {
|
|
5079
|
+
const uint64_t mask_bits = BitsFromMask(d, mask);
|
|
5080
|
+
const size_t kNumBytes = (d.MaxLanes() + 7) / 8;
|
|
5081
|
+
CopyBytes<kNumBytes>(&mask_bits, bits);
|
|
5082
|
+
return kNumBytes;
|
|
5026
5083
|
}
|
|
5027
5084
|
|
|
5028
|
-
template <
|
|
5029
|
-
|
|
5030
|
-
return PopCount(BitsFromMask(
|
|
5085
|
+
template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_D(D, 16)>
|
|
5086
|
+
HWY_API size_t CountTrue(D d, const MFromD<D> m) {
|
|
5087
|
+
return PopCount(BitsFromMask(d, m));
|
|
5031
5088
|
}
|
|
5032
|
-
|
|
5033
|
-
|
|
5034
|
-
|
|
5089
|
+
template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_D(D, 16)>
|
|
5090
|
+
HWY_API size_t CountTrue(D d, const MFromD<D> m) {
|
|
5091
|
+
return PopCount(BitsFromMask(d, m));
|
|
5092
|
+
}
|
|
5093
|
+
template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_D(D, 16)>
|
|
5094
|
+
HWY_API size_t CountTrue(D /*d*/, const MFromD<D> m) {
|
|
5035
5095
|
const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8);
|
|
5036
5096
|
const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift);
|
|
5037
5097
|
alignas(16) uint64_t lanes[2];
|
|
5038
5098
|
wasm_v128_store(lanes, shifted_bits);
|
|
5039
5099
|
return PopCount(lanes[0] | lanes[1]);
|
|
5040
5100
|
}
|
|
5041
|
-
|
|
5042
|
-
|
|
5043
|
-
HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128<T> m) {
|
|
5101
|
+
template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_D(D, 16)>
|
|
5102
|
+
HWY_API size_t CountTrue(D /*d*/, const MFromD<D> m) {
|
|
5044
5103
|
alignas(16) int64_t lanes[2];
|
|
5045
5104
|
wasm_v128_store(lanes, m.raw);
|
|
5046
5105
|
return static_cast<size_t>(-(lanes[0] + lanes[1]));
|
|
5047
5106
|
}
|
|
5048
5107
|
|
|
5049
|
-
} // namespace detail
|
|
5050
|
-
|
|
5051
|
-
// `bits` points to at least 8 writable bytes.
|
|
5052
|
-
template <class D>
|
|
5053
|
-
HWY_API size_t StoreMaskBits(D d, const MFromD<D> mask, uint8_t* bits) {
|
|
5054
|
-
const uint64_t mask_bits = detail::BitsFromMask(mask);
|
|
5055
|
-
const size_t kNumBytes = (d.MaxLanes() + 7) / 8;
|
|
5056
|
-
CopyBytes<kNumBytes>(&mask_bits, bits);
|
|
5057
|
-
return kNumBytes;
|
|
5058
|
-
}
|
|
5059
|
-
|
|
5060
|
-
template <class D, HWY_IF_V_SIZE_D(D, 16)>
|
|
5061
|
-
HWY_API size_t CountTrue(D /* tag */, const MFromD<D> m) {
|
|
5062
|
-
return detail::CountTrue(hwy::SizeTag<sizeof(TFromD<D>)>(), m);
|
|
5063
|
-
}
|
|
5064
|
-
|
|
5065
5108
|
// Partial
|
|
5066
5109
|
template <class D, typename T = TFromD<D>, HWY_IF_V_SIZE_LE_D(D, 8)>
|
|
5067
5110
|
HWY_API size_t CountTrue(D d, MFromD<D> m) {
|
|
@@ -5121,26 +5164,26 @@ HWY_API bool AllTrue(D d, const MFromD<D> m) {
|
|
|
5121
5164
|
}
|
|
5122
5165
|
|
|
5123
5166
|
template <class D>
|
|
5124
|
-
HWY_API size_t FindKnownFirstTrue(D
|
|
5125
|
-
const uint32_t bits = static_cast<uint32_t>(
|
|
5167
|
+
HWY_API size_t FindKnownFirstTrue(D d, const MFromD<D> mask) {
|
|
5168
|
+
const uint32_t bits = static_cast<uint32_t>(BitsFromMask(d, mask));
|
|
5126
5169
|
return Num0BitsBelowLS1Bit_Nonzero32(bits);
|
|
5127
5170
|
}
|
|
5128
5171
|
|
|
5129
5172
|
template <class D>
|
|
5130
|
-
HWY_API intptr_t FindFirstTrue(D
|
|
5131
|
-
const uint32_t bits = static_cast<uint32_t>(
|
|
5173
|
+
HWY_API intptr_t FindFirstTrue(D d, const MFromD<D> mask) {
|
|
5174
|
+
const uint32_t bits = static_cast<uint32_t>(BitsFromMask(d, mask));
|
|
5132
5175
|
return bits ? static_cast<intptr_t>(Num0BitsBelowLS1Bit_Nonzero32(bits)) : -1;
|
|
5133
5176
|
}
|
|
5134
5177
|
|
|
5135
5178
|
template <class D>
|
|
5136
|
-
HWY_API size_t FindKnownLastTrue(D
|
|
5137
|
-
const uint32_t bits = static_cast<uint32_t>(
|
|
5179
|
+
HWY_API size_t FindKnownLastTrue(D d, const MFromD<D> mask) {
|
|
5180
|
+
const uint32_t bits = static_cast<uint32_t>(BitsFromMask(d, mask));
|
|
5138
5181
|
return 31 - Num0BitsAboveMS1Bit_Nonzero32(bits);
|
|
5139
5182
|
}
|
|
5140
5183
|
|
|
5141
5184
|
template <class D>
|
|
5142
|
-
HWY_API intptr_t FindLastTrue(D
|
|
5143
|
-
const uint32_t bits = static_cast<uint32_t>(
|
|
5185
|
+
HWY_API intptr_t FindLastTrue(D d, const MFromD<D> mask) {
|
|
5186
|
+
const uint32_t bits = static_cast<uint32_t>(BitsFromMask(d, mask));
|
|
5144
5187
|
return bits
|
|
5145
5188
|
? (31 - static_cast<intptr_t>(Num0BitsAboveMS1Bit_Nonzero32(bits)))
|
|
5146
5189
|
: -1;
|
|
@@ -5586,7 +5629,8 @@ HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) {
|
|
|
5586
5629
|
// General case, 2 or 4 byte lanes
|
|
5587
5630
|
template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 2))>
|
|
5588
5631
|
HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
|
|
5589
|
-
|
|
5632
|
+
const DFromV<decltype(v)> d;
|
|
5633
|
+
return detail::Compress(v, BitsFromMask(d, mask));
|
|
5590
5634
|
}
|
|
5591
5635
|
|
|
5592
5636
|
// Single lane: no-op
|
|
@@ -5610,12 +5654,13 @@ HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
|
|
|
5610
5654
|
// General case, 2 or 4 byte lanes
|
|
5611
5655
|
template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
|
|
5612
5656
|
HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
|
|
5657
|
+
const DFromV<decltype(v)> d;
|
|
5613
5658
|
// For partial vectors, we cannot pull the Not() into the table because
|
|
5614
5659
|
// BitsFromMask clears the upper bits.
|
|
5615
5660
|
if (N < 16 / sizeof(T)) {
|
|
5616
|
-
return detail::Compress(v,
|
|
5661
|
+
return detail::Compress(v, BitsFromMask(d, Not(mask)));
|
|
5617
5662
|
}
|
|
5618
|
-
return detail::CompressNot(v,
|
|
5663
|
+
return detail::CompressNot(v, BitsFromMask(d, mask));
|
|
5619
5664
|
}
|
|
5620
5665
|
|
|
5621
5666
|
// ------------------------------ CompressBlocksNot
|
|
@@ -5642,7 +5687,7 @@ HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
|
|
|
5642
5687
|
template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
|
|
5643
5688
|
HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d,
|
|
5644
5689
|
TFromD<D>* HWY_RESTRICT unaligned) {
|
|
5645
|
-
const uint64_t mask_bits =
|
|
5690
|
+
const uint64_t mask_bits = BitsFromMask(d, mask);
|
|
5646
5691
|
const auto c = detail::Compress(v, mask_bits);
|
|
5647
5692
|
StoreU(c, d, unaligned);
|
|
5648
5693
|
return PopCount(mask_bits);
|
|
@@ -5653,7 +5698,7 @@ template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
|
|
|
5653
5698
|
HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
|
|
5654
5699
|
TFromD<D>* HWY_RESTRICT unaligned) {
|
|
5655
5700
|
const RebindToUnsigned<decltype(d)> du; // so we can support fp16/bf16
|
|
5656
|
-
const uint64_t mask_bits =
|
|
5701
|
+
const uint64_t mask_bits = BitsFromMask(d, m);
|
|
5657
5702
|
const size_t count = PopCount(mask_bits);
|
|
5658
5703
|
const VFromD<decltype(du)> compressed =
|
|
5659
5704
|
detail::Compress(BitCast(du, v), mask_bits);
|