@img/sharp-libvips-dev 1.2.1 → 1.2.2-rc.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/include/ffi.h +3 -3
- package/include/harfbuzz/hb-version.h +3 -3
- package/include/hwy/abort.h +2 -19
- package/include/hwy/aligned_allocator.h +11 -7
- package/include/hwy/auto_tune.h +504 -0
- package/include/hwy/base.h +425 -104
- package/include/hwy/cache_control.h +16 -0
- package/include/hwy/detect_compiler_arch.h +32 -1
- package/include/hwy/detect_targets.h +251 -67
- package/include/hwy/foreach_target.h +35 -0
- package/include/hwy/highway.h +185 -76
- package/include/hwy/nanobenchmark.h +1 -19
- package/include/hwy/ops/arm_neon-inl.h +969 -458
- package/include/hwy/ops/arm_sve-inl.h +1137 -359
- package/include/hwy/ops/emu128-inl.h +97 -11
- package/include/hwy/ops/generic_ops-inl.h +1222 -34
- package/include/hwy/ops/loongarch_lasx-inl.h +4664 -0
- package/include/hwy/ops/loongarch_lsx-inl.h +5933 -0
- package/include/hwy/ops/ppc_vsx-inl.h +306 -126
- package/include/hwy/ops/rvv-inl.h +546 -51
- package/include/hwy/ops/scalar-inl.h +77 -22
- package/include/hwy/ops/set_macros-inl.h +138 -17
- package/include/hwy/ops/shared-inl.h +50 -10
- package/include/hwy/ops/wasm_128-inl.h +137 -92
- package/include/hwy/ops/x86_128-inl.h +773 -214
- package/include/hwy/ops/x86_256-inl.h +712 -255
- package/include/hwy/ops/x86_512-inl.h +429 -753
- package/include/hwy/ops/x86_avx3-inl.h +501 -0
- package/include/hwy/per_target.h +2 -1
- package/include/hwy/profiler.h +622 -486
- package/include/hwy/targets.h +62 -20
- package/include/hwy/timer-inl.h +8 -160
- package/include/hwy/timer.h +170 -3
- package/include/hwy/x86_cpuid.h +81 -0
- package/include/libheif/heif_cxx.h +25 -5
- package/include/libheif/heif_regions.h +5 -5
- package/include/libheif/heif_version.h +2 -2
- package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
- package/include/pango-1.0/pango/pango-enum-types.h +3 -0
- package/include/pango-1.0/pango/pango-features.h +3 -3
- package/include/pango-1.0/pango/pango-font.h +30 -0
- package/include/pango-1.0/pango/pango-version-macros.h +26 -0
- package/include/zlib.h +3 -3
- package/package.json +1 -1
- package/versions.json +8 -8
@@ -70,6 +70,14 @@ namespace detail {
 #define HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT "x"
 #endif

+#undef HWY_X86_HAVE_AVX10_2_OPS
+#if HWY_TARGET_IS_AVX10_2 && \
+    (HWY_COMPILER_GCC_ACTUAL >= 1501 || HWY_COMPILER3_CLANG >= 200103)
+#define HWY_X86_HAVE_AVX10_2_OPS 1
+#else
+#define HWY_X86_HAVE_AVX10_2_OPS 0
+#endif
+
 template <typename T>
 struct Raw128 {
   using type = __m128i;
@@ -138,78 +146,66 @@ using Vec32 = Vec128<T, 4 / sizeof(T)>;
 template <typename T>
 using Vec16 = Vec128<T, 2 / sizeof(T)>;

-#if HWY_TARGET <= HWY_AVX3
-
 namespace detail {

+#if HWY_TARGET <= HWY_AVX3
+
 // Template arg: sizeof(lane type)
 template <size_t size>
-struct
+struct RawMask128T {};
 template <>
-struct
+struct RawMask128T<1> {
   using type = __mmask16;
 };
 template <>
-struct
+struct RawMask128T<2> {
   using type = __mmask8;
 };
 template <>
-struct
+struct RawMask128T<4> {
   using type = __mmask8;
 };
 template <>
-struct
+struct RawMask128T<8> {
   using type = __mmask8;
 };

-
+template <typename T>
+using RawMask128 = typename RawMask128T<sizeof(T)>::type;

-
-struct Mask128 {
-  using Raw = typename detail::RawMask128<sizeof(T)>::type;
+#else  // AVX2 or earlier

-
-
-}
+template <typename T>
+using RawMask128 = typename Raw128<T>::type;

-
-};
+#endif  // HWY_TARGET <= HWY_AVX3

-
+}  // namespace detail

-// FF..FF or 0.
 template <typename T, size_t N = 16 / sizeof(T)>
 struct Mask128 {
-  typename detail::
-};
-
-#endif  // AVX2 or below
+  using Raw = typename detail::RawMask128<T>;

-
-
-// Returns the lowest N of the _mm_movemask* bits.
-template <typename T, size_t N>
-constexpr uint64_t OnlyActive(uint64_t mask_bits) {
-  return ((N * sizeof(T)) == 16) ? mask_bits : mask_bits & ((1ull << N) - 1);
-}
-
-}  // namespace detail
+  using PrivateT = T;                     // only for DFromM
+  static constexpr size_t kPrivateN = N;  // only for DFromM

 #if HWY_TARGET <= HWY_AVX3
-
-
-
-
-
-
-}
+  static Mask128<T, N> FromBits(uint64_t mask_bits) {
+    return Mask128<T, N>{static_cast<Raw>(mask_bits)};
+  }
+#else
+  // Lanes are either FF..FF or 0.
+#endif

-
-
+  Raw raw;
+};

 template <class V>
 using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;

+template <class M>
+using DFromM = Simd<typename M::PrivateT, M::kPrivateN, 0>;
+
 template <class V>
 using TFromV = typename V::PrivateT;

@@ -1065,6 +1061,16 @@ HWY_API MFromD<DTo> DemoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/,
 #define HWY_NATIVE_COMBINE_MASKS
 #endif

+// For Clang and GCC, mask intrinsics (KORTEST) weren't added until recently.
+#if !defined(HWY_COMPILER_HAS_MASK_INTRINSICS)
+#if HWY_COMPILER_MSVC != 0 || HWY_COMPILER_GCC_ACTUAL >= 700 || \
+    HWY_COMPILER_CLANG >= 800
+#define HWY_COMPILER_HAS_MASK_INTRINSICS 1
+#else
+#define HWY_COMPILER_HAS_MASK_INTRINSICS 0
+#endif
+#endif  // HWY_COMPILER_HAS_MASK_INTRINSICS
+
 template <class D, HWY_IF_LANES_D(D, 2)>
 HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi,
                                MFromD<Half<D>> lo) {
@@ -1539,16 +1545,6 @@ HWY_API V IfThenZeroElse(MFromD<D> mask, V no) {

 // ------------------------------ Mask logical

-// For Clang and GCC, mask intrinsics (KORTEST) weren't added until recently.
-#if !defined(HWY_COMPILER_HAS_MASK_INTRINSICS)
-#if HWY_COMPILER_MSVC != 0 || HWY_COMPILER_GCC_ACTUAL >= 700 || \
-    HWY_COMPILER_CLANG >= 800
-#define HWY_COMPILER_HAS_MASK_INTRINSICS 1
-#else
-#define HWY_COMPILER_HAS_MASK_INTRINSICS 0
-#endif
-#endif  // HWY_COMPILER_HAS_MASK_INTRINSICS
-
 namespace detail {

 template <typename T, size_t N>
@@ -2049,13 +2045,13 @@ HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) {

 // Clang static analysis claims the memory immediately after a partial vector
 // store is uninitialized, and also flags the input to partial loads (at least
-// for loadl_pd) as "garbage".
-//
-//
+// for loadl_pd) as "garbage". Since 2025-07, MSAN began raising errors. We
+// work around this by using CopyBytes instead of intrinsics, but only for MSAN
+// and static analyzer builds to avoid potentially bad code generation.
 // Unfortunately __clang_analyzer__ was not defined for clang-tidy prior to v7.
 #ifndef HWY_SAFE_PARTIAL_LOAD_STORE
-#if defined(__clang_analyzer__) || \
-
+#if HWY_IS_MSAN || (defined(__clang_analyzer__) || \
+    (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700))
 #define HWY_SAFE_PARTIAL_LOAD_STORE 1
 #else
 #define HWY_SAFE_PARTIAL_LOAD_STORE 0
@@ -3921,6 +3917,64 @@ HWY_API Vec128<double> AddSub(Vec128<double> a, Vec128<double> b) {
 }
 #endif  // HWY_TARGET <= HWY_SSSE3

+// ------------------------------ PairwiseAdd128/PairwiseSub128
+
+// Need to use the default implementation of PairwiseAdd128/PairwiseSub128 in
+// generic_ops-inl.h for U8/I8/F16/I64/U64 vectors and 64-byte vectors
+
+#if HWY_TARGET <= HWY_SSSE3
+
+#undef HWY_IF_PAIRWISE_ADD_128_D
+#undef HWY_IF_PAIRWISE_SUB_128_D
+#define HWY_IF_PAIRWISE_ADD_128_D(D) \
+  hwy::EnableIf<( \
+      HWY_MAX_LANES_D(D) > (32 / sizeof(hwy::HWY_NAMESPACE::TFromD<D>)) || \
+      (HWY_MAX_LANES_D(D) > (8 / sizeof(hwy::HWY_NAMESPACE::TFromD<D>)) && \
+       !(hwy::IsSameEither<hwy::HWY_NAMESPACE::TFromD<D>, int16_t, \
+                           uint16_t>() || \
+         sizeof(hwy::HWY_NAMESPACE::TFromD<D>) == 4 || \
+         hwy::IsSame<hwy::HWY_NAMESPACE::TFromD<D>, double>())))>* = nullptr
+#define HWY_IF_PAIRWISE_SUB_128_D(D) HWY_IF_PAIRWISE_ADD_128_D(D)
+
+template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI16_D(D)>
+HWY_API VFromD<D> PairwiseAdd128(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return VFromD<D>{_mm_hadd_epi16(a.raw, b.raw)};
+}
+template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI16_D(D)>
+HWY_API VFromD<D> PairwiseSub128(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  const DFromV<decltype(a)> d;
+  const RebindToSigned<decltype(d)> di;
+  return BitCast(d, Neg(BitCast(di, VFromD<D>{_mm_hsub_epi16(a.raw, b.raw)})));
+}
+template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)>
+HWY_API VFromD<D> PairwiseAdd128(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return VFromD<D>{_mm_hadd_epi32(a.raw, b.raw)};
+}
+template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)>
+HWY_API VFromD<D> PairwiseSub128(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  const DFromV<decltype(a)> d;
+  const RebindToSigned<decltype(d)> di;
+  return BitCast(d, Neg(BitCast(di, VFromD<D>{_mm_hsub_epi32(a.raw, b.raw)})));
+}
+template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
+HWY_API VFromD<D> PairwiseAdd128(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return VFromD<D>{_mm_hadd_ps(a.raw, b.raw)};
+}
+template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
+HWY_API VFromD<D> PairwiseSub128(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return Neg(VFromD<D>{_mm_hsub_ps(a.raw, b.raw)});
+}
+template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
+HWY_API VFromD<D> PairwiseAdd128(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return VFromD<D>{_mm_hadd_pd(a.raw, b.raw)};
+}
+template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
+HWY_API VFromD<D> PairwiseSub128(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return Neg(VFromD<D>{_mm_hsub_pd(a.raw, b.raw)});
+}
+
+#endif  // HWY_TARGET <= HWY_SSSE3
+
 // ------------------------------ SumsOf8
 template <size_t N>
 HWY_API Vec128<uint64_t, N / 8> SumsOf8(const Vec128<uint8_t, N> v) {
@@ -4226,6 +4280,18 @@ HWY_API Vec128<uint16_t, N> AverageRound(const Vec128<uint16_t, N> a,
   return Vec128<uint16_t, N>{_mm_avg_epu16(a.raw, b.raw)};
 }

+// I8/I16 AverageRound is generic for all vector lengths
+template <class V, HWY_IF_SIGNED_V(V),
+          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
+HWY_API V AverageRound(V a, V b) {
+  const DFromV<decltype(a)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  const V sign_bit = SignBit(d);
+  return Xor(BitCast(d, AverageRound(BitCast(du, Xor(a, sign_bit)),
+                                     BitCast(du, Xor(b, sign_bit)))),
+             sign_bit);
+}
+
 // ------------------------------ Integer multiplication

 template <size_t N>
@@ -4396,6 +4462,26 @@ HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a,
   return BitCast(d, BitCast(du, a) * BitCast(du, b));
 }

+#if HWY_TARGET <= HWY_AVX3
+// Per-target flag to prevent generic_ops-inl.h from defining 64-bit operator*.
+#ifdef HWY_NATIVE_MUL_64
+#undef HWY_NATIVE_MUL_64
+#else
+#define HWY_NATIVE_MUL_64
+#endif
+
+template <size_t N>
+HWY_API Vec128<uint64_t, N> operator*(Vec128<uint64_t, N> a,
+                                      Vec128<uint64_t, N> b) {
+  return Vec128<uint64_t, N>{_mm_mullo_epi64(a.raw, b.raw)};
+}
+template <size_t N>
+HWY_API Vec128<int64_t, N> operator*(Vec128<int64_t, N> a,
+                                     Vec128<int64_t, N> b) {
+  return Vec128<int64_t, N>{_mm_mullo_epi64(a.raw, b.raw)};
+}
+#endif
+
 // ------------------------------ RotateRight (ShiftRight, Or)

 // U8 RotateRight implementation on AVX3_DL is now in x86_512-inl.h as U8
@@ -5051,6 +5137,43 @@ HWY_API Vec64<double> operator*(const Vec64<double> a, const Vec64<double> b) {
   return Vec64<double>{_mm_mul_sd(a.raw, b.raw)};
 }

+#if HWY_TARGET <= HWY_AVX3
+
+#ifdef HWY_NATIVE_MUL_BY_POW2
+#undef HWY_NATIVE_MUL_BY_POW2
+#else
+#define HWY_NATIVE_MUL_BY_POW2
+#endif
+
+#if HWY_HAVE_FLOAT16
+template <size_t N>
+HWY_API Vec128<float16_t, N> MulByFloorPow2(Vec128<float16_t, N> a,
+                                            Vec128<float16_t, N> b) {
+  return Vec128<float16_t, N>{_mm_scalef_ph(a.raw, b.raw)};
+}
+#endif
+
+template <size_t N>
+HWY_API Vec128<float, N> MulByFloorPow2(Vec128<float, N> a,
+                                        Vec128<float, N> b) {
+  return Vec128<float, N>{_mm_scalef_ps(a.raw, b.raw)};
+}
+
+template <size_t N>
+HWY_API Vec128<double, N> MulByFloorPow2(Vec128<double, N> a,
+                                         Vec128<double, N> b) {
+  return Vec128<double, N>{_mm_scalef_pd(a.raw, b.raw)};
+}
+
+// MulByPow2 is generic for all vector lengths on AVX3
+template <class V, HWY_IF_FLOAT_V(V)>
+HWY_API V MulByPow2(V v, VFromD<RebindToSigned<DFromV<V>>> exp) {
+  const DFromV<decltype(v)> d;
+  return MulByFloorPow2(v, ConvertTo(d, exp));
+}
+
+#endif  // HWY_TARGET <= HWY_AVX3
+
 #if HWY_HAVE_FLOAT16
 template <size_t N>
 HWY_API Vec128<float16_t, N> operator/(const Vec128<float16_t, N> a,
@@ -5113,6 +5236,33 @@ HWY_API V AbsDiff(V a, V b) {
   return Abs(a - b);
 }

+// ------------------------------ GetExponent
+
+#if HWY_TARGET <= HWY_AVX3
+
+#ifdef HWY_NATIVE_GET_EXPONENT
+#undef HWY_NATIVE_GET_EXPONENT
+#else
+#define HWY_NATIVE_GET_EXPONENT
+#endif
+
+#if HWY_HAVE_FLOAT16
+template <class V, HWY_IF_F16(TFromV<V>), HWY_IF_V_SIZE_LE_V(V, 16)>
+HWY_API V GetExponent(V v) {
+  return V{_mm_getexp_ph(v.raw)};
+}
+#endif
+template <class V, HWY_IF_F32(TFromV<V>), HWY_IF_V_SIZE_LE_V(V, 16)>
+HWY_API V GetExponent(V v) {
+  return V{_mm_getexp_ps(v.raw)};
+}
+template <class V, HWY_IF_F64(TFromV<V>), HWY_IF_V_SIZE_LE_V(V, 16)>
+HWY_API V GetExponent(V v) {
+  return V{_mm_getexp_pd(v.raw)};
+}
+
+#endif
+
 // ------------------------------ MaskedMinOr

 #if HWY_TARGET <= HWY_AVX3
@@ -5704,7 +5854,8 @@ HWY_API Vec128<uint8_t, N> Min(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
 template <size_t N>
 HWY_API Vec128<uint16_t, N> Min(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) {
 #if HWY_TARGET >= HWY_SSSE3
-  return
+  return Vec128<uint16_t, N>{
+      _mm_sub_epi16(a.raw, _mm_subs_epu16(a.raw, b.raw))};
 #else
   return Vec128<uint16_t, N>{_mm_min_epu16(a.raw, b.raw)};
 #endif
@@ -5797,7 +5948,8 @@ HWY_API Vec128<uint8_t, N> Max(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
 template <size_t N>
 HWY_API Vec128<uint16_t, N> Max(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) {
 #if HWY_TARGET >= HWY_SSSE3
-  return
+  return Vec128<uint16_t, N>{
+      _mm_add_epi16(a.raw, _mm_subs_epu16(b.raw, a.raw))};
 #else
   return Vec128<uint16_t, N>{_mm_max_epu16(a.raw, b.raw)};
 #endif
@@ -5866,6 +6018,110 @@ HWY_API Vec128<double, N> Max(Vec128<double, N> a, Vec128<double, N> b) {
   return Vec128<double, N>{_mm_max_pd(a.raw, b.raw)};
 }

+// ------------------------------ MinNumber and MaxNumber
+
+#ifdef HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
+#undef HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
+#else
+#define HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
+#endif
+
+#if HWY_X86_HAVE_AVX10_2_OPS
+
+#if HWY_HAVE_FLOAT16
+template <size_t N>
+HWY_API Vec128<float16_t, N> MinNumber(Vec128<float16_t, N> a,
+                                       Vec128<float16_t, N> b) {
+  return Vec128<float16_t, N>{_mm_minmax_ph(a.raw, b.raw, 0x14)};
+}
+#endif
+template <size_t N>
+HWY_API Vec128<float, N> MinNumber(Vec128<float, N> a, Vec128<float, N> b) {
+  return Vec128<float, N>{_mm_minmax_ps(a.raw, b.raw, 0x14)};
+}
+template <size_t N>
+HWY_API Vec128<double, N> MinNumber(Vec128<double, N> a, Vec128<double, N> b) {
+  return Vec128<double, N>{_mm_minmax_pd(a.raw, b.raw, 0x14)};
+}
+
+#if HWY_HAVE_FLOAT16
+template <size_t N>
+HWY_API Vec128<float16_t, N> MaxNumber(Vec128<float16_t, N> a,
+                                       Vec128<float16_t, N> b) {
+  return Vec128<float16_t, N>{_mm_minmax_ph(a.raw, b.raw, 0x15)};
+}
+#endif
+template <size_t N>
+HWY_API Vec128<float, N> MaxNumber(Vec128<float, N> a, Vec128<float, N> b) {
+  return Vec128<float, N>{_mm_minmax_ps(a.raw, b.raw, 0x15)};
+}
+template <size_t N>
+HWY_API Vec128<double, N> MaxNumber(Vec128<double, N> a, Vec128<double, N> b) {
+  return Vec128<double, N>{_mm_minmax_pd(a.raw, b.raw, 0x15)};
+}
+
+#else
+
+// MinNumber/MaxNumber are generic for all vector lengths on targets other
+// than AVX10.2
+template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
+HWY_API V MinNumber(V a, V b) {
+  return Min(a, IfThenElse(IsNaN(b), a, b));
+}
+
+template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
+HWY_API V MaxNumber(V a, V b) {
+  return Max(a, IfThenElse(IsNaN(b), a, b));
+}
+
+#endif
+
+// ------------------------------ MinMagnitude and MaxMagnitude
+
+#if HWY_X86_HAVE_AVX10_2_OPS
+
+#ifdef HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE
+#undef HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE
+#else
+#define HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE
+#endif
+
+#if HWY_HAVE_FLOAT16
+template <size_t N>
+HWY_API Vec128<float16_t, N> MinMagnitude(Vec128<float16_t, N> a,
+                                          Vec128<float16_t, N> b) {
+  return Vec128<float16_t, N>{_mm_minmax_ph(a.raw, b.raw, 0x16)};
+}
+#endif
+template <size_t N>
+HWY_API Vec128<float, N> MinMagnitude(Vec128<float, N> a, Vec128<float, N> b) {
+  return Vec128<float, N>{_mm_minmax_ps(a.raw, b.raw, 0x16)};
+}
+template <size_t N>
+HWY_API Vec128<double, N> MinMagnitude(Vec128<double, N> a,
+                                       Vec128<double, N> b) {
+  return Vec128<double, N>{_mm_minmax_pd(a.raw, b.raw, 0x16)};
+}
+
+#if HWY_HAVE_FLOAT16
+template <size_t N>
+HWY_API Vec128<float16_t, N> MaxMagnitude(Vec128<float16_t, N> a,
+                                          Vec128<float16_t, N> b) {
+  return Vec128<float16_t, N>{_mm_minmax_ph(a.raw, b.raw, 0x17)};
+}
+#endif
+template <size_t N>
+HWY_API Vec128<float, N> MaxMagnitude(Vec128<float, N> a, Vec128<float, N> b) {
+  return Vec128<float, N>{_mm_minmax_ps(a.raw, b.raw, 0x17)};
+}
+template <size_t N>
+HWY_API Vec128<double, N> MaxMagnitude(Vec128<double, N> a,
+                                       Vec128<double, N> b) {
+  return Vec128<double, N>{_mm_minmax_pd(a.raw, b.raw, 0x17)};
+}
+
+#endif
+
 // ================================================== MEMORY (3)

 // ------------------------------ Non-temporal stores
@@ -6883,52 +7139,48 @@ HWY_API Vec128<float16_t, N> TableLookupLanes(Vec128<float16_t, N> v,

 template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
 HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
-#if HWY_TARGET <= HWY_AVX2
   const DFromV<decltype(v)> d;
-  const
-  const Vec128<
-
+  const Full128<T> d_full;
+  const Vec128<T> v_full = ZeroExtendResizeBitCast(d_full, d, v);
+
+  const RebindToSigned<decltype(d)> di;
+  const Full128<MakeSigned<T>> di_full;
+  const VFromD<decltype(di_full)> vidx =
+      ZeroExtendResizeBitCast(di_full, di, VFromD<decltype(di)>{idx.raw});
+
+#if HWY_TARGET <= HWY_AVX2
+  // There is no permutevar for non-float; _mm256_permutevar8x32_epi32 is for
+  // 256-bit vectors, hence cast to float.
+  const Full128<float> df_full;
+  // Workaround for MSAN false positive.
+  HWY_IF_CONSTEXPR(HWY_IS_MSAN) PreventElision(GetLane(vidx));
+  const Vec128<float> perm{
+      _mm_permutevar_ps(BitCast(df_full, v_full).raw, vidx.raw)};
+  return ResizeBitCast(d, perm);
 #elif HWY_TARGET == HWY_SSE2
 #if HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle)
   typedef uint32_t GccU32RawVectType __attribute__((__vector_size__(16)));
   return Vec128<T, N>{reinterpret_cast<typename detail::Raw128<T>::type>(
-      __builtin_shuffle(reinterpret_cast<GccU32RawVectType>(
-          reinterpret_cast<GccU32RawVectType>(
+      __builtin_shuffle(reinterpret_cast<GccU32RawVectType>(v_full.raw),
+                        reinterpret_cast<GccU32RawVectType>(vidx.raw)))};
 #else
-  const Full128<T> d_full;
   alignas(16) T src_lanes[4];
-  alignas(16)
+  alignas(16) int32_t indices[4];
   alignas(16) T result_lanes[4];

-  Store(
-
+  Store(v_full, d_full, src_lanes);
+  Store(vidx, di_full, indices);

-  for (
-    result_lanes[i] = src_lanes[indices[i] &
+  for (size_t i = 0; i < N; i++) {
+    result_lanes[i] = src_lanes[static_cast<size_t>(indices[i] & 3)];
   }
-
-  return Vec128<T, N>{Load(d_full, result_lanes).raw};
+  return Load(d, result_lanes);
 #endif  // HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle)
 #else  // SSSE3 or SSE4
-  return TableLookupBytes(
+  return ResizeBitCast(d, TableLookupBytes(BitCast(di_full, v_full), vidx));
 #endif
 }

-#if HWY_TARGET <= HWY_SSSE3
-template <size_t N, HWY_IF_V_SIZE_GT(float, N, 4)>
-HWY_API Vec128<float, N> TableLookupLanes(Vec128<float, N> v,
-                                          Indices128<float, N> idx) {
-#if HWY_TARGET <= HWY_AVX2
-  return Vec128<float, N>{_mm_permutevar_ps(v.raw, idx.raw)};
-#else  // SSSE3 or SSE4
-  const DFromV<decltype(v)> df;
-  const RebindToSigned<decltype(df)> di;
-  return BitCast(df,
-                 TableLookupBytes(BitCast(di, v), Vec128<int32_t, N>{idx.raw}));
-#endif  // HWY_TARGET <= HWY_AVX2
-}
-#endif  // HWY_TARGET <= HWY_SSSE3
-
 // Single lane: no change
 template <typename T>
 HWY_API Vec128<T, 1> TableLookupLanes(Vec128<T, 1> v,
@@ -6936,11 +7188,15 @@ HWY_API Vec128<T, 1> TableLookupLanes(Vec128<T, 1> v,
   return v;
 }

-template <typename T,
+template <typename T, HWY_IF_T_SIZE(T, 8)>
 HWY_API Vec128<T> TableLookupLanes(Vec128<T> v, Indices128<T> idx) {
   const DFromV<decltype(v)> d;
+  // No need for ZeroExtendResizeBitCast, we have full vectors.
   Vec128<int64_t> vidx{idx.raw};
-
+
+  // Disable in MSAN builds due to false positive. Note that this affects
+  // CompressNot, which assumes upper index bits will be ignored.
+#if HWY_TARGET <= HWY_AVX2 && !HWY_IS_MSAN
   // There is no _mm_permute[x]var_epi64.
   vidx += vidx;  // bit1 is the decider (unusual)
   const RebindToFloat<decltype(d)> df;
@@ -6952,26 +7208,8 @@ HWY_API Vec128<T> TableLookupLanes(Vec128<T> v, Indices128<T> idx) {
   // to obtain an all-zero or all-one mask.
   const RebindToSigned<decltype(d)> di;
   const Vec128<int64_t> same = (vidx ^ Iota(di, 0)) - Set(di, 1);
-
-
-#endif
-}
-
-HWY_API Vec128<double> TableLookupLanes(Vec128<double> v,
-                                        Indices128<double> idx) {
-  Vec128<int64_t> vidx{idx.raw};
-#if HWY_TARGET <= HWY_AVX2
-  vidx += vidx;  // bit1 is the decider (unusual)
-  return Vec128<double>{_mm_permutevar_pd(v.raw, vidx.raw)};
-#else
-  // Only 2 lanes: can swap+blend. Choose v if vidx == iota. To avoid a 64-bit
-  // comparison (expensive on SSSE3), just invert the upper lane and subtract 1
-  // to obtain an all-zero or all-one mask.
-  const DFromV<decltype(v)> d;
-  const RebindToSigned<decltype(d)> di;
-  const Vec128<int64_t> same = (vidx ^ Iota(di, 0)) - Set(di, 1);
-  const Mask128<double> mask_same = RebindMask(d, MaskFromVec(same));
-  return IfThenElse(mask_same, v, Shuffle01(v));
+  return BitCast(
+      d, IfVecThenElse(same, BitCast(di, v), Shuffle01(BitCast(di, v))));
 #endif
 }

@@ -8861,12 +9099,22 @@ HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
 }

 // ------------------------------ SwapAdjacentBlocks
-
 template <typename T, size_t N>
 HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
   return v;
 }

+// ------------------------------ InterleaveEvenBlocks
+template <class D, class V = VFromD<D>, HWY_IF_V_SIZE_LE_D(D, 16)>
+HWY_API V InterleaveEvenBlocks(D, V a, V /*b*/) {
+  return a;
+}
+// ------------------------------ InterleaveOddBlocks
+template <class D, class V = VFromD<D>, HWY_IF_V_SIZE_LE_D(D, 16)>
+HWY_API V InterleaveOddBlocks(D, V a, V /*b*/) {
+  return a;
+}
+
 // ------------------------------ Shl (ZipLower, Mul)

 // Use AVX2/3 variable shifts where available, otherwise multiply by powers of
@@ -9588,15 +9836,28 @@ HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,

 // ------------------------------ WidenMulPairwiseAdd (PromoteEvenTo)

+#if HWY_NATIVE_DOT_BF16
+
+template <class DF, HWY_IF_F32_D(DF), HWY_IF_V_SIZE_LE_D(DF, 16),
+          class VBF = VFromD<Repartition<bfloat16_t, DF>>>
+HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
+  return VFromD<DF>{_mm_dpbf16_ps(Zero(df).raw,
+                                  reinterpret_cast<__m128bh>(a.raw),
+                                  reinterpret_cast<__m128bh>(b.raw))};
+}
+
+#else
+
 // Generic for all vector lengths.
 template <class DF, HWY_IF_F32_D(DF),
           class VBF = VFromD<Repartition<bfloat16_t, DF>>>
 HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
-  // TODO(janwas): _mm_dpbf16_ps when available
   return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b),
                 Mul(PromoteOddTo(df, a), PromoteOddTo(df, b)));
 }

+#endif  // HWY_NATIVE_DOT_BF16
+
 // Even if N=1, the input is always at least 2 lanes, hence madd_epi16 is safe.
 template <class D32, HWY_IF_I32_D(D32), HWY_IF_V_SIZE_LE_D(D32, 16),
           class V16 = VFromD<RepartitionToNarrow<D32>>>
@@ -10276,6 +10537,7 @@ X86ConvertScalarFromFloat(TF from_val) {
   return X86ConvertScalarFromFloat<TTo>(hwy::TypeTag<RemoveCvRef<TTo>>(),
                                         from_val);
 }
+
 #endif  // HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD

 }  // namespace detail
@@ -10288,7 +10550,9 @@ X86ConvertScalarFromFloat(TF from_val) {

 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
 HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, VFromD<Rebind<double, D>> v) {
-#if
+#if HWY_X86_HAVE_AVX10_2_OPS
+  return VFromD<D>{_mm_cvtts_pd_epi32(v.raw)};
+#elif HWY_COMPILER_GCC_ACTUAL
   // Workaround for undefined behavior in _mm_cvttpd_epi32 with GCC if any
   // values of v[i] are not within the range of an int32_t

@@ -10325,7 +10589,9 @@ HWY_API VFromD<D> DemoteTo(D di32, VFromD<Rebind<double, D>> v) {
 #if HWY_TARGET <= HWY_AVX3
 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
 HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, VFromD<Rebind<double, D>> v) {
-#if
+#if HWY_X86_HAVE_AVX10_2_OPS
+  return VFromD<D>{_mm_cvtts_pd_epu32(v.raw)};
+#elif HWY_COMPILER_GCC_ACTUAL
   // Workaround for undefined behavior in _mm_cvttpd_epu32 with GCC if any
   // values of v[i] are not within the range of an uint32_t

@@ -10353,8 +10619,12 @@ HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, VFromD<Rebind<double, D>> v) {

 // F64->U32 DemoteTo is generic for all vector lengths
 template <class D, HWY_IF_U32_D(D)>
-HWY_API VFromD<D> DemoteTo(D
-
+HWY_API VFromD<D> DemoteTo(D du32, VFromD<Rebind<double, D>> v) {
+#if HWY_X86_HAVE_AVX10_2_OPS
+  return DemoteInRangeTo(du32, v);
+#else
+  return DemoteInRangeTo(du32, ZeroIfNegative(v));
+#endif
 }
 #else  // HWY_TARGET > HWY_AVX3

@@ -10482,7 +10752,9 @@ HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
 #if HWY_TARGET <= HWY_AVX3
 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
 HWY_API VFromD<D> PromoteInRangeTo(D /*di64*/, VFromD<Rebind<float, D>> v) {
-#if
+#if HWY_X86_HAVE_AVX10_2_OPS
+  return VFromD<D>{_mm_cvtts_ps_epi64(v.raw)};
+#elif HWY_COMPILER_GCC_ACTUAL
   // Workaround for undefined behavior with GCC if any values of v[i] are not
   // within the range of an int64_t

@@ -10510,6 +10782,9 @@ HWY_API VFromD<D> PromoteInRangeTo(D /*di64*/, VFromD<Rebind<float, D>> v) {
 // Generic for all vector lengths.
 template <class D, HWY_IF_I64_D(D)>
 HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
+#if HWY_X86_HAVE_AVX10_2_OPS
+  return PromoteInRangeTo(di64, v);
+#else
   const Rebind<float, decltype(di64)> df32;
   const RebindToFloat<decltype(di64)> df64;
   // We now avoid GCC UB in PromoteInRangeTo via assembly, see #2189 and
@@ -10522,14 +10797,21 @@ HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
       di64, PromoteMaskTo(df64, df32, Ge(v, Set(df32, 9.223372e18f))));
   return IfThenElse(overflow, Set(di64, LimitsMax<int64_t>()),
                     PromoteInRangeTo(di64, v));
+#endif
 }
 template <class D, HWY_IF_U64_D(D)>
-HWY_API VFromD<D> PromoteTo(D
-
+HWY_API VFromD<D> PromoteTo(D du64, VFromD<Rebind<float, D>> v) {
+#if HWY_X86_HAVE_AVX10_2_OPS
+  return PromoteInRangeTo(du64, v);
+#else
+  return PromoteInRangeTo(du64, ZeroIfNegative(v));
+#endif
 }
 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
 HWY_API VFromD<D> PromoteInRangeTo(D /* tag */, VFromD<Rebind<float, D>> v) {
-#if
+#if HWY_X86_HAVE_AVX10_2_OPS
+  return VFromD<D>{_mm_cvtts_ps_epu64(v.raw)};
+#elif HWY_COMPILER_GCC_ACTUAL
   // Workaround for undefined behavior with GCC if any values of v[i] are not
   // within the range of an uint64_t

@@ -11208,7 +11490,9 @@ HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToFloat<D>> v) {

 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
 HWY_API VFromD<D> ConvertInRangeTo(D /*di*/, VFromD<RebindToFloat<D>> v) {
-#if
+#if HWY_X86_HAVE_AVX10_2_OPS
+  return VFromD<D>{_mm_cvtts_ps_epi32(v.raw)};
+#elif HWY_COMPILER_GCC_ACTUAL
   // Workaround for undefined behavior in _mm_cvttps_epi32 with GCC if any
   // values of v[i] are not within the range of an int32_t

@@ -11238,17 +11522,23 @@ HWY_API VFromD<D> ConvertInRangeTo(D /*di*/, VFromD<RebindToFloat<D>> v) {
 // F32 to I32 ConvertTo is generic for all vector lengths
 template <class D, HWY_IF_I32_D(D)>
 HWY_API VFromD<D> ConvertTo(D di, VFromD<RebindToFloat<D>> v) {
+#if HWY_X86_HAVE_AVX10_2_OPS
+  return ConvertInRangeTo(di, v);
+#else
   const RebindToFloat<decltype(di)> df;
   // See comment at the first occurrence of "IfThenElse(overflow,".
   const MFromD<D> overflow = RebindMask(di, Ge(v, Set(df, 2147483648.0f)));
   return IfThenElse(overflow, Set(di, LimitsMax<int32_t>()),
                     ConvertInRangeTo(di, v));
+#endif
 }

 #if HWY_TARGET <= HWY_AVX3
 template <class DI, HWY_IF_V_SIZE_LE_D(DI, 16), HWY_IF_I64_D(DI)>
 HWY_API VFromD<DI> ConvertInRangeTo(DI /*di*/, VFromD<RebindToFloat<DI>> v) {
-#if
+#if HWY_X86_HAVE_AVX10_2_OPS
+  return VFromD<DI>{_mm_cvtts_pd_epi64(v.raw)};
+#elif HWY_COMPILER_GCC_ACTUAL
   // Workaround for undefined behavior in _mm_cvttpd_epi64 with GCC if any
   // values of v[i] are not within the range of an int64_t

@@ -11276,17 +11566,23 @@ HWY_API VFromD<DI> ConvertInRangeTo(DI /*di*/, VFromD<RebindToFloat<DI>> v) {
 // F64 to I64 ConvertTo is generic for all vector lengths on AVX3
 template <class DI, HWY_IF_I64_D(DI)>
 HWY_API VFromD<DI> ConvertTo(DI di, VFromD<RebindToFloat<DI>> v) {
+#if HWY_X86_HAVE_AVX10_2_OPS
+  return ConvertInRangeTo(di, v);
+#else
   const RebindToFloat<decltype(di)> df;
   // See comment at the first occurrence of "IfThenElse(overflow,".
   const MFromD<DI> overflow =
       RebindMask(di, Ge(v, Set(df, 9.223372036854776e18)));
   return IfThenElse(overflow, Set(di, LimitsMax<int64_t>()),
                     ConvertInRangeTo(di, v));
+#endif
 }

 template <class DU, HWY_IF_V_SIZE_LE_D(DU, 16), HWY_IF_U32_D(DU)>
 HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
-#if
+#if HWY_X86_HAVE_AVX10_2_OPS
+  return VFromD<DU>{_mm_cvtts_ps_epu32(v.raw)};
+#elif HWY_COMPILER_GCC_ACTUAL
   // Workaround for undefined behavior in _mm_cvttps_epu32 with GCC if any
   // values of v[i] are not within the range of an uint32_t

@@ -11315,13 +11611,19 @@ HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {

 // F32->U32 ConvertTo is generic for all vector lengths
 template <class DU, HWY_IF_U32_D(DU)>
-HWY_API VFromD<DU> ConvertTo(DU
-
+HWY_API VFromD<DU> ConvertTo(DU du32, VFromD<RebindToFloat<DU>> v) {
+#if HWY_X86_HAVE_AVX10_2_OPS
+  return ConvertInRangeTo(du32, v);
+#else
+  return ConvertInRangeTo(du32, ZeroIfNegative(v));
+#endif
 }

 template <class DU, HWY_IF_V_SIZE_LE_D(DU, 16), HWY_IF_U64_D(DU)>
 HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
-#if
+#if HWY_X86_HAVE_AVX10_2_OPS
+  return VFromD<DU>{_mm_cvtts_pd_epu64(v.raw)};
+#elif HWY_COMPILER_GCC_ACTUAL
   // Workaround for undefined behavior in _mm_cvttpd_epu64 with GCC if any
   // values of v[i] are not within the range of an uint64_t

@@ -11348,8 +11650,12 @@ HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {

 // F64->U64 ConvertTo is generic for all vector lengths
 template <class DU, HWY_IF_U64_D(DU)>
-HWY_API VFromD<DU> ConvertTo(DU
-
+HWY_API VFromD<DU> ConvertTo(DU du64, VFromD<RebindToFloat<DU>> v) {
+#if HWY_X86_HAVE_AVX10_2_OPS
+  return ConvertInRangeTo(du64, v);
+#else
+  return ConvertInRangeTo(du64, ZeroIfNegative(v));
+#endif
 }

 #else  // AVX2 or below
@@ -11620,7 +11926,8 @@ X86ScalarNearestInt(TF flt_val) {

 // If these are in namespace detail, the x86_256/512 templates are not found.
 template <class DI, HWY_IF_V_SIZE_LE_D(DI, 16), HWY_IF_I32_D(DI)>
-HWY_INLINE VFromD<DI> NearestIntInRange(DI,
+static HWY_INLINE VFromD<DI> NearestIntInRange(DI,
+                                               VFromD<RebindToFloat<DI>> v) {
 #if HWY_COMPILER_GCC_ACTUAL
   // Workaround for undefined behavior in _mm_cvtps_epi32 with GCC if any values
   // of v[i] are not within the range of an int32_t
@@ -11648,17 +11955,229 @@ HWY_INLINE VFromD<DI> NearestIntInRange(DI, VFromD<RebindToFloat<DI>> v) {
 #endif
 }

-
+#if HWY_HAVE_FLOAT16
+template <class DI, HWY_IF_V_SIZE_LE_D(DI, 16), HWY_IF_I16_D(DI)>
+static HWY_INLINE VFromD<DI> NearestIntInRange(DI /*di*/,
+                                               VFromD<RebindToFloat<DI>> v) {
+#if HWY_COMPILER_GCC_ACTUAL
+  // Workaround for undefined behavior in _mm_cvtph_epi16 if any values of v[i]
+  // are not within the range of an int16_t
+
+#if HWY_COMPILER_GCC_ACTUAL >= 1200 && !HWY_IS_DEBUG_BUILD && \
+    HWY_HAVE_SCALAR_F16_TYPE
+  if (detail::IsConstantX86VecForF2IConv<int16_t>(v)) {
+    typedef hwy::float16_t::Native GccF16RawVectType
+        __attribute__((__vector_size__(16)));
+    const auto raw_v = reinterpret_cast<GccF16RawVectType>(v.raw);
+    return Dup128VecFromValues(DI(),
+                               detail::X86ScalarNearestInt<int16_t>(raw_v[0]),
+                               detail::X86ScalarNearestInt<int16_t>(raw_v[1]),
+                               detail::X86ScalarNearestInt<int16_t>(raw_v[2]),
+                               detail::X86ScalarNearestInt<int16_t>(raw_v[3]),
+                               detail::X86ScalarNearestInt<int16_t>(raw_v[4]),
+                               detail::X86ScalarNearestInt<int16_t>(raw_v[5]),
+                               detail::X86ScalarNearestInt<int16_t>(raw_v[6]),
+                               detail::X86ScalarNearestInt<int16_t>(raw_v[7]));
+  }
+#endif
+
+  __m128i raw_result;
+  __asm__("vcvtph2w {%1, %0|%0, %1}"
+          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+          :);
+  return VFromD<DI>{raw_result};
+#else  // !HWY_COMPILER_GCC_ACTUAL
+  return VFromD<DI>{_mm_cvtph_epi16(v.raw)};
+#endif
+}
+#endif  // HWY_HAVE_FLOAT16
+
+#if HWY_TARGET <= HWY_AVX3
+
+template <class DI, HWY_IF_V_SIZE_LE_D(DI, 16), HWY_IF_I64_D(DI)>
+static HWY_INLINE VFromD<DI> NearestIntInRange(DI /*di*/,
+                                               VFromD<RebindToFloat<DI>> v) {
+#if HWY_COMPILER_GCC_ACTUAL
+  // Workaround for undefined behavior in _mm_cvtpd_epi64 with GCC if any
+  // values of v[i] are not within the range of an int64_t
+
+#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+  if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) {
+    typedef double GccF64RawVectType __attribute__((__vector_size__(16)));
+    const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
+    return Dup128VecFromValues(DI(),
+                               detail::X86ScalarNearestInt<int64_t>(raw_v[0]),
+                               detail::X86ScalarNearestInt<int64_t>(raw_v[1]));
+  }
+#endif
+
+  __m128i raw_result;
+  __asm__("vcvtpd2qq {%1, %0|%0, %1}"
+          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+          :);
+  return VFromD<DI>{raw_result};
+#else  // !HWY_COMPILER_GCC_ACTUAL
+  return VFromD<DI>{_mm_cvtpd_epi64(v.raw)};
+#endif
+}
+
+#else  // HWY_TARGET > HWY_AVX3
+
+namespace detail {
+
+#if HWY_ARCH_X86_64
+template <size_t N>
+static HWY_INLINE int64_t
+SSE2ConvFirstF64LaneToNearestI64(Vec128<double, N> v) {
+#if HWY_COMPILER_GCC_ACTUAL
+  // Workaround for undefined behavior in _mm_cvtsd_si64 with GCC if v[0] is
+  // not within the range of an int64_t
+
+#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+  if (IsConstantX86Vec(hwy::SizeTag<1>(), v)) {
+    typedef double GccF64RawVectType __attribute__((__vector_size__(16)));
+    const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
+    return X86ScalarNearestInt<int64_t>(raw_v[0]);
+  }
+#endif
+
+  int64_t result;
+  __asm__("%vcvtsd2si {%1, %0|%0, %1}"
+          : "=r"(result)
+          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+          :);
+  return result;
+#else
+  return _mm_cvtsd_si64(v.raw);
+#endif
+}
+#endif  // HWY_ARCH_X86_64
+
+#if !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2
+template <class DI64, HWY_IF_I64_D(DI64)>
+static HWY_INLINE VFromD<DI64> SSE2NearestI64InRange(
+    DI64 di64, VFromD<RebindToFloat<DI64>> v) {
+  const RebindToFloat<DI64> df64;
+  const RebindToUnsigned<DI64> du64;
+  using VI64 = VFromD<decltype(di64)>;
+
+  const auto mant_end = Set(df64, MantissaEnd<double>());
+  const auto is_small = Lt(Abs(v), mant_end);
+
+  const auto adj_v = Max(v, Set(df64, -9223372036854775808.0)) +
+                     IfThenElseZero(is_small, CopySignToAbs(mant_end, v));
+  const auto adj_v_biased_exp =
+      And(BitCast(di64, ShiftRight<52>(BitCast(du64, adj_v))),
+          Set(di64, int64_t{0x7FF}));
+
+  // We can simply subtract 1075 from adj_v_biased_exp[i] to get shift_int since
+  // adj_v_biased_exp[i] is at least 1075
+  const VI64 shift_int = adj_v_biased_exp + Set(di64, int64_t{-1075});
+
+  const VI64 mantissa = BitCast(di64, adj_v) & Set(di64, (1LL << 52) - 1);
+  // Include implicit 1-bit if is_small[i] is 0. NOTE: the shift count may
+  // exceed 63; we rely on x86 returning zero in that case.
+  const VI64 int53 = mantissa | IfThenZeroElse(RebindMask(di64, is_small),
+                                               Set(di64, 1LL << 52));
+
+  const VI64 sign_mask = BroadcastSignBit(BitCast(di64, v));
+  // If the input was negative, negate the integer (two's complement).
+  return ((int53 << shift_int) ^ sign_mask) - sign_mask;
+}
+#endif  // !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2
+
+}  // namespace detail
+
+#if HWY_ARCH_X86_64
+template <class DI, HWY_IF_V_SIZE_D(DI, 8), HWY_IF_I64_D(DI)>
+static HWY_INLINE VFromD<DI> NearestIntInRange(DI /*di*/, Vec64<double> v) {
+  return VFromD<DI>{
+      _mm_cvtsi64_si128(detail::SSE2ConvFirstF64LaneToNearestI64(v))};
+}
+template <class DI, HWY_IF_V_SIZE_D(DI, 16), HWY_IF_I64_D(DI)>
+static HWY_INLINE VFromD<DI> NearestIntInRange(DI /*di*/, Vec128<double> v) {
+  const __m128i i0 =
+      _mm_cvtsi64_si128(detail::SSE2ConvFirstF64LaneToNearestI64(v));
+  const Full64<double> dd2;
+  const __m128i i1 = _mm_cvtsi64_si128(
+      detail::SSE2ConvFirstF64LaneToNearestI64(UpperHalf(dd2, v)));
+  return VFromD<DI>{_mm_unpacklo_epi64(i0, i1)};
+}
+#endif  // HWY_ARCH_X86_64
+
+#if !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2
+template <class DI, HWY_IF_V_SIZE_GT_D(DI, (HWY_ARCH_X86_64 ? 16 : 0)),
+          HWY_IF_I64_D(DI)>
+static HWY_INLINE VFromD<DI> NearestIntInRange(DI di,
+                                               VFromD<RebindToFloat<DI>> v) {
+  return detail::SSE2NearestI64InRange(di, v);
+}
+#endif  // !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2
+
+#endif  // HWY_TARGET <= HWY_AVX3
+
+template <class DI, HWY_IF_V_SIZE_LE_D(DI, 8), HWY_IF_I32_D(DI)>
+static HWY_INLINE VFromD<DI> DemoteToNearestIntInRange(
+    DI, VFromD<Rebind<double, DI>> v) {
+#if HWY_COMPILER_GCC_ACTUAL
+  // Workaround for undefined behavior in _mm_cvtpd_epi32 with GCC if any values
+  // of v[i] are not within the range of an int32_t
+
+#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+  if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
+    typedef double GccF32RawVectType __attribute__((__vector_size__(16)));
+    const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
+    return Dup128VecFromValues(
+        DI(), detail::X86ScalarNearestInt<int32_t>(raw_v[0]),
+        detail::X86ScalarNearestInt<int32_t>(raw_v[1]), int32_t{0}, int32_t{0});
+  }
+#endif
+
+  __m128i raw_result;
+  __asm__("%vcvtpd2dq {%1, %0|%0, %1}"
+          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+          :);
+  return VFromD<DI>{raw_result};
+#else  // !HWY_COMPILER_GCC_ACTUAL
+  return VFromD<DI>{_mm_cvtpd_epi32(v.raw)};
+#endif
+}
+
+// F16/F32/F64 NearestInt is generic for all vector lengths
 template <class VF, class DF = DFromV<VF>, class DI = RebindToSigned<DF>,
-
+          HWY_IF_FLOAT_D(DF),
+          HWY_IF_T_SIZE_ONE_OF_D(DF, (1 << 4) | (1 << 8) |
+                                         (HWY_HAVE_FLOAT16 ? (1 << 2) : 0))>
 HWY_API VFromD<DI> NearestInt(const VF v) {
   const DI di;
+  using TI = TFromD<DI>;
+  using TF = TFromD<DF>;
+  using TFArith = If<sizeof(TF) <= sizeof(float), float, RemoveCvRef<TF>>;
+
+  constexpr TFArith kMinOutOfRangePosVal =
+      static_cast<TFArith>(-static_cast<TFArith>(LimitsMin<TI>()));
+  static_assert(kMinOutOfRangePosVal > static_cast<TFArith>(0.0),
+                "kMinOutOfRangePosVal > 0.0 must be true");
+
   // See comment at the first occurrence of "IfThenElse(overflow,".
   // Here we are rounding, whereas previous occurrences truncate, but there is
   // no difference because the previous float value is well below the max i32.
-  const auto overflow = RebindMask(
-
-
+  const auto overflow = RebindMask(
+      di, Ge(v, Set(DF(), ConvertScalarTo<TF>(kMinOutOfRangePosVal))));
+  auto result =
+      IfThenElse(overflow, Set(di, LimitsMax<TI>()), NearestIntInRange(di, v));
+
+  return result;
+}
+
+template <class DI, HWY_IF_I32_D(DI)>
+HWY_API VFromD<DI> DemoteToNearestInt(DI, VFromD<Rebind<double, DI>> v) {
+  const DI di;
+  const Rebind<double, DI> df64;
+  return DemoteToNearestIntInRange(di, Min(v, Set(df64, 2147483647.0)));
 }

 // ------------------------------ Floating-point rounding (ConvertTo)
@@ -11724,6 +12243,25 @@ HWY_API Vec128<T, N> Ceil(const Vec128<T, N> v) {
   return IfThenElse(detail::UseInt(v), int_f - neg1, v);
 }

+#ifdef HWY_NATIVE_CEIL_FLOOR_INT
+#undef HWY_NATIVE_CEIL_FLOOR_INT
+#else
+#define HWY_NATIVE_CEIL_FLOOR_INT
+#endif
+
+template <class V, HWY_IF_FLOAT_V(V)>
+HWY_API VFromD<RebindToSigned<DFromV<V>>> CeilInt(V v) {
+  const DFromV<decltype(v)> df;
+  const RebindToSigned<decltype(df)> di;
+
+  const auto integer = ConvertTo(di, v);  // round toward 0
+  const auto int_f = ConvertTo(df, integer);
+
+  // Truncating a positive non-integer ends up smaller; if so, add 1.
+  return integer -
+         VecFromMask(di, RebindMask(di, And(detail::UseInt(v), int_f < v)));
+}
+
 // Toward -infinity, aka floor
 template <typename T, size_t N>
 HWY_API Vec128<T, N> Floor(const Vec128<T, N> v) {
@@ -11740,6 +12278,19 @@ HWY_API Vec128<T, N> Floor(const Vec128<T, N> v) {
   return IfThenElse(detail::UseInt(v), int_f + neg1, v);
 }

+template <class V, HWY_IF_FLOAT_V(V)>
+HWY_API VFromD<RebindToSigned<DFromV<V>>> FloorInt(V v) {
+  const DFromV<decltype(v)> df;
+  const RebindToSigned<decltype(df)> di;
+
+  const auto integer = ConvertTo(di, v);  // round toward 0
+  const auto int_f = ConvertTo(df, integer);
+
+  // Truncating a negative non-integer ends up larger; if so, subtract 1.
+  return integer +
+         VecFromMask(di, RebindMask(di, And(detail::UseInt(v), int_f > v)));
+}
+
 #else

 // Toward nearest integer, ties to even
@@ -12117,8 +12668,27 @@ struct CompressIsPartition {
 #endif
 };
 
+namespace detail {
+
+// Returns `mask_bits` (from movemask) with the upper bits cleared, if there
+// are 8 or fewer valid bits.
+template <class D>
+constexpr uint64_t OnlyActive(D d, uint64_t mask_bits) {
+  return (d.MaxBytes() >= 16) ? mask_bits
+                              : mask_bits & ((1ull << d.MaxLanes()) - 1);
+}
+
+}  // namespace detail
+
 #if HWY_TARGET <= HWY_AVX3
 
+// ------------------------------ BitsFromMask (MFromD, OnlyActive)
+// Generic for all vector lengths.
+template <class D>
+HWY_INLINE uint64_t BitsFromMask(D d, MFromD<D> mask) {
+  return detail::OnlyActive(d, mask.raw);
+}
+
 // ------------------------------ StoreMaskBits
 
 // `p` points to at least 8 writable bytes.
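BitsFromMask packs one bit per lane into a uint64_t; detail::OnlyActive zeroes the bits past the last lane of partial vectors so callers can rely on the upper bits being clear. A minimal sketch, assuming int32 lanes and an arbitrary threshold:

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Returns a bitmask with bit i set iff lane i exceeds `threshold`.
uint64_t LanesAbove(const int32_t* in, int32_t threshold) {
  const hn::ScalableTag<int32_t> d;
  const auto m = hn::Gt(hn::Load(d, in), hn::Set(d, threshold));
  return hn::BitsFromMask(d, m);  // bits at index >= Lanes(d) are zero
}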
@@ -12238,14 +12808,16 @@ HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
   alignas(16) static constexpr uint64_t packed_array[16] = {
       0x00000010, 0x00000001, 0x00000010, 0x00000010};
 
-  // For lane i, shift the i-th 4-bit index down to bits [0, 2)
-  // _mm_permutexvar_epi64 will ignore the upper bits.
+  // For lane i, shift the i-th 4-bit index down to bits [0, 2).
   const DFromV<decltype(v)> d;
   const RebindToUnsigned<decltype(d)> du64;
   const auto packed = Set(du64, packed_array[mask.raw]);
-  alignas(16) static constexpr uint64_t
-
-
+  alignas(16) static constexpr uint64_t kShifts[2] = {0, 4};
+  Vec128<uint64_t> indices = packed >> Load(du64, kShifts);
+  // _mm_permutevar_pd will ignore the upper bits, but TableLookupLanes uses
+  // a fallback in MSAN builds, so mask there.
+  HWY_IF_CONSTEXPR(HWY_IS_MSAN) indices &= Set(du64, 1);
+  return TableLookupLanes(v, Indices128<T>{indices.raw});
 }
 
 // ------------------------------ CompressBlocksNot
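The hunk above reroutes the two-lane 64-bit CompressNot through TableLookupLanes and masks the index bits under MSAN, where the table-lookup fallback does not ignore the upper index bits. Caller-visible behavior is unchanged; a minimal sketch with assumed values:

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Moves the lanes whose mask is false to the front: here {2, ...}.
hn::Vec128<uint64_t> CompressNotExample() {
  const hn::FixedTag<uint64_t, 2> d;
  const auto v = hn::Iota(d, 1);             // {1, 2}
  const auto keep_first = hn::FirstN(d, 1);  // true only in lane 0
  return hn::CompressNot(v, keep_first);
}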
@@ -12256,42 +12828,13 @@ HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
 
 // ------------------------------ CompressStore (defined in x86_512)
 
-// ------------------------------ CompressBlendedStore (
-template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
-HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
-                                    TFromD<D>* HWY_RESTRICT unaligned) {
-  // AVX-512 already does the blending at no extra cost (latency 11,
-  // rthroughput 2 - same as compress plus store).
-  if (HWY_TARGET == HWY_AVX3_DL ||
-      (HWY_TARGET != HWY_AVX3_ZEN4 && sizeof(TFromD<D>) > 2)) {
-    // We're relying on the mask to blend. Clear the undefined upper bits.
-    constexpr size_t kN = MaxLanes(d);
-    if (kN != 16 / sizeof(TFromD<D>)) {
-      m = And(m, FirstN(d, kN));
-    }
-    return CompressStore(v, m, d, unaligned);
-  } else {
-    const size_t count = CountTrue(d, m);
-    const VFromD<D> compressed = Compress(v, m);
-#if HWY_MEM_OPS_MIGHT_FAULT
-    // BlendedStore tests mask for each lane, but we know that the mask is
-    // FirstN, so we can just copy.
-    alignas(16) TFromD<D> buf[MaxLanes(d)];
-    Store(compressed, d, buf);
-    CopyBytes(buf, unaligned, count * sizeof(TFromD<D>));
-#else
-    BlendedStore(compressed, FirstN(d, count), d, unaligned);
-#endif
-    detail::MaybeUnpoison(unaligned, count);
-    return count;
-  }
-}
+// ------------------------------ CompressBlendedStore (defined in x86_avx3)
 
 // ------------------------------ CompressBitsStore (defined in x86_512)
 
 #else // AVX2 or below
 
-// ------------------------------
+// ------------------------------ BitsFromMask
 
 namespace detail {
 
@@ -12299,50 +12842,45 @@ constexpr HWY_INLINE uint64_t U64FromInt(int mask_bits) {
   return static_cast<uint64_t>(static_cast<unsigned>(mask_bits));
 }
 
-
-
-
-
+}  // namespace detail
+
+template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_LE_D(D, 16)>
+HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
   const auto sign_bits = BitCast(d, VecFromMask(d, mask)).raw;
-  return
+  return detail::OnlyActive(d,
+                            detail::U64FromInt(_mm_movemask_epi8(sign_bits)));
 }
 
-template <
-
-                                 const Mask128<T, N> mask) {
+template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_LE_D(D, 16)>
+HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
   // Remove useless lower half of each u16 while preserving the sign bit.
   const auto sign_bits = _mm_packs_epi16(mask.raw, _mm_setzero_si128());
-  return
+  return detail::OnlyActive(d,
+                            detail::U64FromInt(_mm_movemask_epi8(sign_bits)));
 }
 
-template <
-
-  const
-  const Simd<float, N, 0> df;
+template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_LE_D(D, 16)>
+HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
+  const RebindToFloat<decltype(d)> df;
   const auto sign_bits = BitCast(df, VecFromMask(d, mask));
-  return
+  return detail::OnlyActive(d,
+                            detail::U64FromInt(_mm_movemask_ps(sign_bits.raw)));
 }
 
-template <
-
-  const
-  const Simd<double, N, 0> df;
+template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_LE_D(D, 16)>
+HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
+  const RebindToFloat<D> df;
   const auto sign_bits = BitCast(df, VecFromMask(d, mask));
-  return
-
-
-template <typename T, size_t N>
-HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) {
-  return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
+  return detail::OnlyActive(d,
+                            detail::U64FromInt(_mm_movemask_pd(sign_bits.raw)));
 }
 
-
-
+// ------------------------------ StoreMaskBits
 // `p` points to at least 8 writable bytes.
 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
 HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
   constexpr size_t kNumBytes = (MaxLanes(d) + 7) / 8;
-  const uint64_t mask_bits =
+  const uint64_t mask_bits = BitsFromMask(d, mask);
   CopyBytes<kNumBytes>(&mask_bits, bits);
   return kNumBytes;
 }
@@ -12350,43 +12888,43 @@ HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
 // ------------------------------ Mask testing
 
 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
-HWY_API bool AllFalse(D
+HWY_API bool AllFalse(D d, MFromD<D> mask) {
   // Cheaper than PTEST, which is 2 uop / 3L.
-  return
+  return BitsFromMask(d, mask) == 0;
 }
 
 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
 HWY_API bool AllTrue(D d, MFromD<D> mask) {
   constexpr uint64_t kAllBits = (1ull << MaxLanes(d)) - 1;
-  return
+  return BitsFromMask(d, mask) == kAllBits;
 }
 
 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
-HWY_API size_t CountTrue(D
-  return PopCount(
+HWY_API size_t CountTrue(D d, MFromD<D> mask) {
+  return PopCount(BitsFromMask(d, mask));
 }
 
 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
-HWY_API size_t FindKnownFirstTrue(D
+HWY_API size_t FindKnownFirstTrue(D d, MFromD<D> mask) {
   return Num0BitsBelowLS1Bit_Nonzero32(
-      static_cast<uint32_t>(
+      static_cast<uint32_t>(BitsFromMask(d, mask)));
 }
 
 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
-HWY_API intptr_t FindFirstTrue(D
-  const uint32_t mask_bits = static_cast<uint32_t>(
+HWY_API intptr_t FindFirstTrue(D d, MFromD<D> mask) {
+  const uint32_t mask_bits = static_cast<uint32_t>(BitsFromMask(d, mask));
   return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1;
 }
 
 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
-HWY_API size_t FindKnownLastTrue(D
+HWY_API size_t FindKnownLastTrue(D d, MFromD<D> mask) {
   return 31 - Num0BitsAboveMS1Bit_Nonzero32(
-                  static_cast<uint32_t>(
+                  static_cast<uint32_t>(BitsFromMask(d, mask)));
 }
 
 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
-HWY_API intptr_t FindLastTrue(D
-  const uint32_t mask_bits = static_cast<uint32_t>(
+HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) {
+  const uint32_t mask_bits = static_cast<uint32_t>(BitsFromMask(d, mask));
   return mask_bits ? intptr_t(31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits))
                    : -1;
 }
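The mask-testing ops above are now all expressed in terms of BitsFromMask. A minimal sketch of the caller-side behavior, assuming uint32 lanes:

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Counts lanes above `threshold` and finds the first such lane (or -1).
void CountAndFind(const uint32_t* in, uint32_t threshold,
                  size_t* count, intptr_t* first) {
  const hn::ScalableTag<uint32_t> d;
  const auto m = hn::Gt(hn::Load(d, in), hn::Set(d, threshold));
  *count = hn::CountTrue(d, m);
  *first = hn::FindFirstTrue(d, m);  // -1 if AllFalse(d, m)
}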
@@ -12828,7 +13366,8 @@ HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) {
 // General case, 2 or 4 bytes
 template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
-
+  const DFromV<decltype(v)> d;
+  return detail::CompressBits(v, BitsFromMask(d, mask));
 }
 
 // ------------------------------ CompressNot
@@ -12853,12 +13392,13 @@ HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
 
 template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
 HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
+  const DFromV<decltype(v)> d;
   // For partial vectors, we cannot pull the Not() into the table because
   // BitsFromMask clears the upper bits.
   if (N < 16 / sizeof(T)) {
-    return detail::CompressBits(v,
+    return detail::CompressBits(v, BitsFromMask(d, Not(mask)));
   }
-  return detail::CompressNotBits(v,
+  return detail::CompressNotBits(v, BitsFromMask(d, mask));
 }
 
 // ------------------------------ CompressBlocksNot
@@ -12887,7 +13427,7 @@ HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> m, D d,
                              TFromD<D>* HWY_RESTRICT unaligned) {
   const RebindToUnsigned<decltype(d)> du;
 
-  const uint64_t mask_bits =
+  const uint64_t mask_bits = BitsFromMask(d, m);
   HWY_DASSERT(mask_bits < (1ull << MaxLanes(d)));
   const size_t count = PopCount(mask_bits);
 
@@ -12904,7 +13444,7 @@ HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
                                     TFromD<D>* HWY_RESTRICT unaligned) {
   const RebindToUnsigned<decltype(d)> du;
 
-  const uint64_t mask_bits =
+  const uint64_t mask_bits = BitsFromMask(d, m);
   HWY_DASSERT(mask_bits < (1ull << MaxLanes(d)));
   const size_t count = PopCount(mask_bits);
 
@@ -13331,6 +13871,25 @@ HWY_API V BitShuffle(V v, VI idx) {
 }
 #endif // HWY_TARGET <= HWY_AVX3_DL
 
+// ------------------------------ MultiRotateRight
+
+#if HWY_TARGET <= HWY_AVX3_DL
+
+#ifdef HWY_NATIVE_MULTIROTATERIGHT
+#undef HWY_NATIVE_MULTIROTATERIGHT
+#else
+#define HWY_NATIVE_MULTIROTATERIGHT
+#endif
+
+template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>),
+          HWY_IF_V_SIZE_LE_V(V, 16),
+          HWY_IF_V_SIZE_V(VI, HWY_MAX_LANES_V(V) * 8)>
+HWY_API V MultiRotateRight(V v, VI idx) {
+  return V{_mm_multishift_epi64_epi8(idx.raw, v.raw)};
+}
+
+#endif
+
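MultiRotateRight wraps VPMULTISHIFTQB: byte i of each result lane is the 8-bit field of the corresponding source u64 lane starting at the bit offset given by byte i of `idx` (low 6 bits), i.e. a per-byte rotate-right followed by truncation. A minimal sketch, assuming an AVX3_DL target and illustrative offsets:

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// With offsets 0, 8, ..., 56 each result byte is the corresponding source
// byte, so this particular call returns `v` unchanged.
hn::Vec128<uint64_t> MultiRotateRightExample(hn::Vec128<uint64_t> v) {
  const hn::Full128<uint64_t> d64;
  const hn::Repartition<uint8_t, decltype(d64)> d8;
  alignas(16) static constexpr uint8_t kOffsets[16] = {
      0, 8, 16, 24, 32, 40, 48, 56, 0, 8, 16, 24, 32, 40, 48, 56};
  return hn::MultiRotateRight(v, hn::Load(d8, kOffsets));
}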
 // ------------------------------ Lt128
 
 namespace detail {