@img/sharp-libvips-dev 1.2.0 → 1.2.2-rc.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/include/ffi.h +3 -3
- package/include/harfbuzz/hb-deprecated.h +4 -4
- package/include/harfbuzz/hb-font.h +120 -9
- package/include/harfbuzz/hb-version.h +3 -3
- package/include/hwy/abort.h +2 -19
- package/include/hwy/aligned_allocator.h +11 -7
- package/include/hwy/auto_tune.h +504 -0
- package/include/hwy/base.h +425 -104
- package/include/hwy/cache_control.h +16 -0
- package/include/hwy/detect_compiler_arch.h +32 -1
- package/include/hwy/detect_targets.h +251 -67
- package/include/hwy/foreach_target.h +35 -0
- package/include/hwy/highway.h +185 -76
- package/include/hwy/nanobenchmark.h +1 -19
- package/include/hwy/ops/arm_neon-inl.h +969 -458
- package/include/hwy/ops/arm_sve-inl.h +1137 -359
- package/include/hwy/ops/emu128-inl.h +97 -11
- package/include/hwy/ops/generic_ops-inl.h +1222 -34
- package/include/hwy/ops/loongarch_lasx-inl.h +4664 -0
- package/include/hwy/ops/loongarch_lsx-inl.h +5933 -0
- package/include/hwy/ops/ppc_vsx-inl.h +306 -126
- package/include/hwy/ops/rvv-inl.h +546 -51
- package/include/hwy/ops/scalar-inl.h +77 -22
- package/include/hwy/ops/set_macros-inl.h +138 -17
- package/include/hwy/ops/shared-inl.h +50 -10
- package/include/hwy/ops/wasm_128-inl.h +137 -92
- package/include/hwy/ops/x86_128-inl.h +773 -214
- package/include/hwy/ops/x86_256-inl.h +712 -255
- package/include/hwy/ops/x86_512-inl.h +429 -753
- package/include/hwy/ops/x86_avx3-inl.h +501 -0
- package/include/hwy/per_target.h +2 -1
- package/include/hwy/profiler.h +622 -486
- package/include/hwy/targets.h +62 -20
- package/include/hwy/timer-inl.h +8 -160
- package/include/hwy/timer.h +170 -3
- package/include/hwy/x86_cpuid.h +81 -0
- package/include/libheif/heif_cxx.h +25 -5
- package/include/libheif/heif_regions.h +5 -5
- package/include/libheif/heif_version.h +2 -2
- package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
- package/include/libxml2/libxml/xmlversion.h +4 -4
- package/include/pango-1.0/pango/pango-enum-types.h +3 -0
- package/include/pango-1.0/pango/pango-features.h +3 -3
- package/include/pango-1.0/pango/pango-font.h +30 -0
- package/include/pango-1.0/pango/pango-version-macros.h +26 -0
- package/include/pixman-1/pixman-version.h +2 -2
- package/include/webp/decode.h +11 -2
- package/include/webp/demux.h +2 -0
- package/include/webp/encode.h +2 -0
- package/include/webp/mux_types.h +1 -0
- package/include/webp/sharpyuv/sharpyuv.h +1 -1
- package/include/webp/types.h +2 -2
- package/include/zlib.h +3 -3
- package/package.json +1 -1
- package/versions.json +11 -11
@@ -46,6 +46,36 @@ HWY_DIAGNOSTICS_OFF(disable : 4701 4703 6001 26494,
 #include <f16cintrin.h>
 #include <fmaintrin.h>
 #include <smmintrin.h>
+
+#if HWY_TARGET <= HWY_AVX10_2
+#include <avx512bitalgintrin.h>
+#include <avx512bwintrin.h>
+#include <avx512cdintrin.h>
+#include <avx512dqintrin.h>
+#include <avx512fintrin.h>
+#include <avx512vbmi2intrin.h>
+#include <avx512vbmiintrin.h>
+#include <avx512vbmivlintrin.h>
+#include <avx512vlbitalgintrin.h>
+#include <avx512vlbwintrin.h>
+#include <avx512vlcdintrin.h>
+#include <avx512vldqintrin.h>
+#include <avx512vlintrin.h>
+#include <avx512vlvbmi2intrin.h>
+#include <avx512vlvnniintrin.h>
+#include <avx512vnniintrin.h>
+#include <avx512vpopcntdqintrin.h>
+#include <avx512vpopcntdqvlintrin.h>
+// Must come after avx512fintrin, else will not define 512-bit intrinsics.
+#include <avx512fp16intrin.h>
+#include <avx512vlfp16intrin.h>
+#include <gfniintrin.h>
+#include <vaesintrin.h>
+#include <vpclmulqdqintrin.h>
+
+#endif  // HWY_TARGET <= HWY_AVX10_2
+
+// clang-format on
 #endif  // HWY_COMPILER_CLANGCL
 
 // For half-width vectors. Already includes base.h.
@@ -117,67 +147,90 @@ class Vec256 {
   Raw raw;
 };
 
-#if HWY_TARGET <= HWY_AVX3
-
 namespace detail {
 
+#if HWY_TARGET <= HWY_AVX3
+
 // Template arg: sizeof(lane type)
 template <size_t size>
-struct RawMask256 {};
+struct RawMask256T {};
 template <>
-struct RawMask256<1> {
+struct RawMask256T<1> {
   using type = __mmask32;
 };
 template <>
-struct RawMask256<2> {
+struct RawMask256T<2> {
   using type = __mmask16;
 };
 template <>
-struct RawMask256<4> {
+struct RawMask256T<4> {
   using type = __mmask8;
 };
 template <>
-struct RawMask256<8> {
+struct RawMask256T<8> {
   using type = __mmask8;
 };
 
+template <typename T>
+using RawMask256 = typename RawMask256T<sizeof(T)>::type;
+
+#else  // AVX2 or earlier
+
+template <typename T>
+using RawMask256 = typename Raw256<T>::type;
+
+#endif  // HWY_TARGET <= HWY_AVX3
+
 }  // namespace detail
 
 template <typename T>
 struct Mask256 {
-  using Raw = typename detail::RawMask256<sizeof(T)>::type;
+  using Raw = typename detail::RawMask256<T>;
 
+  using PrivateT = T;                                  // only for DFromM
+  static constexpr size_t kPrivateN = 32 / sizeof(T);  // only for DFromM
+
+#if HWY_TARGET <= HWY_AVX3
   static Mask256<T> FromBits(uint64_t mask_bits) {
     return Mask256<T>{static_cast<Raw>(mask_bits)};
   }
+#else
+  // Lanes are either FF..FF or 0.
+#endif  // HWY_TARGET <= HWY_AVX3
 
   Raw raw;
 };
 
-#else  // AVX2
-
-// FF..FF or 0.
 template <typename T>
-struct Mask256 {
-  typename detail::Raw256<T>::type raw;
-};
-
-#endif  // AVX2
+using Full256 = Simd<T, 32 / sizeof(T), 0>;
 
-#if HWY_TARGET <= HWY_AVX3
-namespace detail {
+// ------------------------------ Zero
 
-// Used by Expand() emulation, which is required for both AVX3 and AVX2.
-template <typename T>
-HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
-  return mask.raw;
+// Cannot use VFromD here because it is defined in terms of Zero.
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
+HWY_API Vec256<TFromD<D>> Zero(D /* tag */) {
+  return Vec256<TFromD<D>>{_mm256_setzero_si256()};
+}
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_BF16_D(D)>
+HWY_API Vec256<bfloat16_t> Zero(D /* tag */) {
+  return Vec256<bfloat16_t>{_mm256_setzero_si256()};
+}
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
+HWY_API Vec256<float16_t> Zero(D /* tag */) {
+#if HWY_HAVE_FLOAT16
+  return Vec256<float16_t>{_mm256_setzero_ph()};
+#else
+  return Vec256<float16_t>{_mm256_setzero_si256()};
+#endif  // HWY_HAVE_FLOAT16
+}
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
+HWY_API Vec256<float> Zero(D /* tag */) {
+  return Vec256<float>{_mm256_setzero_ps()};
+}
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
+HWY_API Vec256<double> Zero(D /* tag */) {
+  return Vec256<double>{_mm256_setzero_pd()};
 }
-
-}  // namespace detail
-#endif  // HWY_TARGET <= HWY_AVX3
-
-template <typename T>
-using Full256 = Simd<T, 32 / sizeof(T), 0>;
 
 // ------------------------------ BitCast
 
@@ -250,34 +303,6 @@ HWY_API VFromD<D> BitCast(D d, Vec256<FromT> v) {
   return detail::BitCastFromByte(d, detail::BitCastToByte(v));
 }
 
-// ------------------------------ Zero
-
-// Cannot use VFromD here because it is defined in terms of Zero.
-template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
-HWY_API Vec256<TFromD<D>> Zero(D /* tag */) {
-  return Vec256<TFromD<D>>{_mm256_setzero_si256()};
-}
-template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_BF16_D(D)>
-HWY_API Vec256<bfloat16_t> Zero(D /* tag */) {
-  return Vec256<bfloat16_t>{_mm256_setzero_si256()};
-}
-template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
-HWY_API Vec256<float16_t> Zero(D /* tag */) {
-#if HWY_HAVE_FLOAT16
-  return Vec256<float16_t>{_mm256_setzero_ph()};
-#else
-  return Vec256<float16_t>{_mm256_setzero_si256()};
-#endif  // HWY_HAVE_FLOAT16
-}
-template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
-HWY_API Vec256<float> Zero(D /* tag */) {
-  return Vec256<float>{_mm256_setzero_ps()};
-}
-template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
-HWY_API Vec256<double> Zero(D /* tag */) {
-  return Vec256<double>{_mm256_setzero_pd()};
-}
-
 // ------------------------------ Set
 
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
@@ -1761,6 +1786,68 @@ HWY_API Vec256<double> Max(const Vec256<double> a, const Vec256<double> b) {
   return Vec256<double>{_mm256_max_pd(a.raw, b.raw)};
 }
 
+// ------------------------------ MinNumber and MaxNumber
+
+#if HWY_X86_HAVE_AVX10_2_OPS
+
+#if HWY_HAVE_FLOAT16
+HWY_API Vec256<float16_t> MinNumber(Vec256<float16_t> a, Vec256<float16_t> b) {
+  return Vec256<float16_t>{_mm256_minmax_ph(a.raw, b.raw, 0x14)};
+}
+#endif
+HWY_API Vec256<float> MinNumber(Vec256<float> a, Vec256<float> b) {
+  return Vec256<float>{_mm256_minmax_ps(a.raw, b.raw, 0x14)};
+}
+HWY_API Vec256<double> MinNumber(Vec256<double> a, Vec256<double> b) {
+  return Vec256<double>{_mm256_minmax_pd(a.raw, b.raw, 0x14)};
+}
+
+#if HWY_HAVE_FLOAT16
+HWY_API Vec256<float16_t> MaxNumber(Vec256<float16_t> a, Vec256<float16_t> b) {
+  return Vec256<float16_t>{_mm256_minmax_ph(a.raw, b.raw, 0x15)};
+}
+#endif
+HWY_API Vec256<float> MaxNumber(Vec256<float> a, Vec256<float> b) {
+  return Vec256<float>{_mm256_minmax_ps(a.raw, b.raw, 0x15)};
+}
+HWY_API Vec256<double> MaxNumber(Vec256<double> a, Vec256<double> b) {
+  return Vec256<double>{_mm256_minmax_pd(a.raw, b.raw, 0x15)};
+}
+
+#endif
+
+// ------------------------------ MinMagnitude and MaxMagnitude
+
+#if HWY_X86_HAVE_AVX10_2_OPS
+
+#if HWY_HAVE_FLOAT16
+HWY_API Vec256<float16_t> MinMagnitude(Vec256<float16_t> a,
+                                       Vec256<float16_t> b) {
+  return Vec256<float16_t>{_mm256_minmax_ph(a.raw, b.raw, 0x16)};
+}
+#endif
+HWY_API Vec256<float> MinMagnitude(Vec256<float> a, Vec256<float> b) {
+  return Vec256<float>{_mm256_minmax_ps(a.raw, b.raw, 0x16)};
+}
+HWY_API Vec256<double> MinMagnitude(Vec256<double> a, Vec256<double> b) {
+  return Vec256<double>{_mm256_minmax_pd(a.raw, b.raw, 0x16)};
+}
+
+#if HWY_HAVE_FLOAT16
+HWY_API Vec256<float16_t> MaxMagnitude(Vec256<float16_t> a,
+                                       Vec256<float16_t> b) {
+  return Vec256<float16_t>{_mm256_minmax_ph(a.raw, b.raw, 0x17)};
+}
+#endif
+HWY_API Vec256<float> MaxMagnitude(Vec256<float> a, Vec256<float> b) {
+  return Vec256<float>{_mm256_minmax_ps(a.raw, b.raw, 0x17)};
+}
+HWY_API Vec256<double> MaxMagnitude(Vec256<double> a, Vec256<double> b) {
+  return Vec256<double>{_mm256_minmax_pd(a.raw, b.raw, 0x17)};
+}
+
+#endif
+
 // ------------------------------ Iota
 
 namespace detail {
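Editor's note on the hunk above: the final argument to _mm256_minmax_ph/ps/pd is an imm8 operation selector (0x14/0x15 for min/max "number", 0x16/0x17 for min/max magnitude). The scalar model below is a hedged illustration only; the IEEE-754 minimumNumber behavior (a NaN operand yields the other, numeric operand) is inferred from the op names, not confirmed by this diff.

// Editor's sketch, not part of the package: assumed per-lane MinNumber.
#include <cmath>

float MinNumberModel(float a, float b) {
  if (std::isnan(a)) return b;  // prefer the numeric operand over NaN
  if (std::isnan(b)) return a;
  return a < b ? a : b;
}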
@@ -1952,6 +2039,47 @@ HWY_API Vec256<double> AddSub(Vec256<double> a, Vec256<double> b) {
   return Vec256<double>{_mm256_addsub_pd(a.raw, b.raw)};
 }
 
+// ------------------------------ PairwiseAdd128/PairwiseSub128
+
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI16_D(D)>
+HWY_API VFromD<D> PairwiseAdd128(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return VFromD<D>{_mm256_hadd_epi16(a.raw, b.raw)};
+}
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI16_D(D)>
+HWY_API VFromD<D> PairwiseSub128(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  const DFromV<decltype(a)> d;
+  const RebindToSigned<decltype(d)> di;
+  return BitCast(d,
+                 Neg(BitCast(di, VFromD<D>{_mm256_hsub_epi16(a.raw, b.raw)})));
+}
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
+HWY_API VFromD<D> PairwiseAdd128(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return VFromD<D>{_mm256_hadd_epi32(a.raw, b.raw)};
+}
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
+HWY_API VFromD<D> PairwiseSub128(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  const DFromV<decltype(a)> d;
+  const RebindToSigned<decltype(d)> di;
+  return BitCast(d,
+                 Neg(BitCast(di, VFromD<D>{_mm256_hsub_epi32(a.raw, b.raw)})));
+}
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
+HWY_API VFromD<D> PairwiseAdd128(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return VFromD<D>{_mm256_hadd_ps(a.raw, b.raw)};
+}
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
+HWY_API VFromD<D> PairwiseSub128(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return Neg(VFromD<D>{_mm256_hsub_ps(a.raw, b.raw)});
+}
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
+HWY_API VFromD<D> PairwiseAdd128(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return VFromD<D>{_mm256_hadd_pd(a.raw, b.raw)};
+}
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
+HWY_API VFromD<D> PairwiseSub128(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return Neg(VFromD<D>{_mm256_hsub_pd(a.raw, b.raw)});
+}
+
 // ------------------------------ SumsOf8
 HWY_API Vec256<uint64_t> SumsOf8(Vec256<uint8_t> v) {
   return Vec256<uint64_t>{_mm256_sad_epu8(v.raw, _mm256_setzero_si256())};
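Editor's note: the hadd/hsub intrinsics used above operate independently within each 128-bit block, taking pairs from `a` for the low results and from `b` for the high results; PairwiseSub128 negates hsub because hsub computes even-minus-odd. A minimal scalar model of the f32 PairwiseAdd128 overload:

// Editor's sketch, not part of the package: mirrors _mm256_hadd_ps semantics.
void PairwiseAdd128Model(const float a[8], const float b[8], float out[8]) {
  for (int blk = 0; blk < 8; blk += 4) {  // two independent 128-bit blocks
    out[blk + 0] = a[blk + 0] + a[blk + 1];
    out[blk + 1] = a[blk + 2] + a[blk + 3];
    out[blk + 2] = b[blk + 0] + b[blk + 1];
    out[blk + 3] = b[blk + 2] + b[blk + 3];
  }
}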
@@ -2146,6 +2274,11 @@ HWY_API Vec256<uint16_t> operator*(Vec256<uint16_t> a, Vec256<uint16_t> b) {
 HWY_API Vec256<uint32_t> operator*(Vec256<uint32_t> a, Vec256<uint32_t> b) {
   return Vec256<uint32_t>{_mm256_mullo_epi32(a.raw, b.raw)};
 }
+#if HWY_TARGET <= HWY_AVX3
+HWY_API Vec256<uint64_t> operator*(Vec256<uint64_t> a, Vec256<uint64_t> b) {
+  return Vec256<uint64_t>{_mm256_mullo_epi64(a.raw, b.raw)};
+}
+#endif
 
 // Signed
 HWY_API Vec256<int16_t> operator*(Vec256<int16_t> a, Vec256<int16_t> b) {
@@ -2154,6 +2287,11 @@ HWY_API Vec256<int16_t> operator*(Vec256<int16_t> a, Vec256<int16_t> b) {
 HWY_API Vec256<int32_t> operator*(Vec256<int32_t> a, Vec256<int32_t> b) {
   return Vec256<int32_t>{_mm256_mullo_epi32(a.raw, b.raw)};
 }
+#if HWY_TARGET <= HWY_AVX3
+HWY_API Vec256<int64_t> operator*(Vec256<int64_t> a, Vec256<int64_t> b) {
+  return Vec256<int64_t>{_mm256_mullo_epi64(a.raw, b.raw)};
+}
+#endif
 
 // Returns the upper 16 bits of a * b in each lane.
 HWY_API Vec256<uint16_t> MulHigh(Vec256<uint16_t> a, Vec256<uint16_t> b) {
@@ -2377,29 +2515,37 @@ HWY_API Vec256<int32_t> BroadcastSignBit(const Vec256<int32_t> v) {
   return ShiftRight<31>(v);
 }
 
+#if HWY_TARGET <= HWY_AVX3
+
+template <int kBits>
+HWY_API Vec256<int64_t> ShiftRight(const Vec256<int64_t> v) {
+  return Vec256<int64_t>{
+      _mm256_srai_epi64(v.raw, static_cast<Shift64Count>(kBits))};
+}
+
+HWY_API Vec256<int64_t> BroadcastSignBit(const Vec256<int64_t> v) {
+  return ShiftRight<63>(v);
+}
+
+#else  // AVX2
+
+// Unlike above, this will be used to implement int64_t ShiftRight.
 HWY_API Vec256<int64_t> BroadcastSignBit(const Vec256<int64_t> v) {
-#if HWY_TARGET == HWY_AVX2
   const DFromV<decltype(v)> d;
   return VecFromMask(v < Zero(d));
-#else
-  return Vec256<int64_t>{_mm256_srai_epi64(v.raw, 63)};
-#endif
 }
 
 template <int kBits>
 HWY_API Vec256<int64_t> ShiftRight(const Vec256<int64_t> v) {
-#if HWY_TARGET <= HWY_AVX3
-  return Vec256<int64_t>{
-      _mm256_srai_epi64(v.raw, static_cast<Shift64Count>(kBits))};
-#else
   const Full256<int64_t> di;
   const Full256<uint64_t> du;
   const auto right = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
   const auto sign = ShiftLeft<64 - kBits>(BroadcastSignBit(v));
   return right | sign;
-#endif
 }
 
+#endif  // #if HWY_TARGET <= HWY_AVX3
+
 // ------------------------------ IfNegativeThenElse (BroadcastSignBit)
 HWY_API Vec256<int8_t> IfNegativeThenElse(Vec256<int8_t> v, Vec256<int8_t> yes,
                                           Vec256<int8_t> no) {
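Editor's note: AVX2 lacks a 64-bit arithmetic right shift, so the #else branch above synthesizes it from a logical shift OR-ed with the left-shifted sign mask. The same identity in scalar form:

// Editor's sketch, not part of the package: scalar equivalent of the AVX2
// ShiftRight<kBits> fallback (logical shift OR sign-extension mask).
#include <cstdint>

template <int kBits>
int64_t ShiftRightModel(int64_t x) {
  static_assert(0 < kBits && kBits < 64, "edge shift counts are UB here");
  const uint64_t right = static_cast<uint64_t>(x) >> kBits;  // logical shift
  const uint64_t sign = (x < 0) ? (~uint64_t{0} << (64 - kBits)) : uint64_t{0};
  return static_cast<int64_t>(right | sign);
}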
@@ -2459,6 +2605,10 @@ HWY_API Vec256<int32_t> IfNegativeThenNegOrUndefIfZero(Vec256<int32_t> mask,
 
 // ------------------------------ ShiftLeftSame
 
+// Disable sign conversion warnings for GCC debug intrinsics.
+HWY_DIAGNOSTICS(push)
+HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
+
 HWY_API Vec256<uint16_t> ShiftLeftSame(const Vec256<uint16_t> v,
                                        const int bits) {
 #if HWY_COMPILER_GCC
@@ -2606,6 +2756,8 @@ HWY_API Vec256<int8_t> ShiftRightSame(Vec256<int8_t> v, const int bits) {
   return (shifted ^ shifted_sign) - shifted_sign;
 }
 
+HWY_DIAGNOSTICS(pop)
+
 // ------------------------------ Neg (Xor, Sub)
 
 // Tag dispatch instead of SFINAE for MSVC 2017 compatibility
@@ -2651,6 +2803,25 @@ HWY_API Vec256<double> operator*(Vec256<double> a, Vec256<double> b) {
   return Vec256<double>{_mm256_mul_pd(a.raw, b.raw)};
 }
 
+#if HWY_TARGET <= HWY_AVX3
+
+#if HWY_HAVE_FLOAT16
+HWY_API Vec256<float16_t> MulByFloorPow2(Vec256<float16_t> a,
+                                         Vec256<float16_t> b) {
+  return Vec256<float16_t>{_mm256_scalef_ph(a.raw, b.raw)};
+}
+#endif
+
+HWY_API Vec256<float> MulByFloorPow2(Vec256<float> a, Vec256<float> b) {
+  return Vec256<float>{_mm256_scalef_ps(a.raw, b.raw)};
+}
+
+HWY_API Vec256<double> MulByFloorPow2(Vec256<double> a, Vec256<double> b) {
+  return Vec256<double>{_mm256_scalef_pd(a.raw, b.raw)};
+}
+
+#endif  // HWY_TARGET <= HWY_AVX3
+
 #if HWY_HAVE_FLOAT16
 HWY_API Vec256<float16_t> operator/(Vec256<float16_t> a, Vec256<float16_t> b) {
   return Vec256<float16_t>{_mm256_div_ph(a.raw, b.raw)};
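Editor's note: MulByFloorPow2 maps to the AVX-512 vscalef family. A hedged scalar model of the assumed per-lane behavior (special cases such as NaN and infinities aside):

// Editor's sketch, not part of the package: vscalef computes a * 2^floor(b).
#include <cmath>

double MulByFloorPow2Model(double a, double b) {
  return a * std::exp2(std::floor(b));
}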
@@ -2680,6 +2851,27 @@ HWY_API Vec256<double> ApproximateReciprocal(Vec256<double> v) {
 }
 #endif
 
+// ------------------------------ GetExponent
+
+#if HWY_TARGET <= HWY_AVX3
+
+#if HWY_HAVE_FLOAT16
+template <class V, HWY_IF_F16(TFromV<V>), HWY_IF_V_SIZE_V(V, 32)>
+HWY_API V GetExponent(V v) {
+  return V{_mm256_getexp_ph(v.raw)};
+}
+#endif
+template <class V, HWY_IF_F32(TFromV<V>), HWY_IF_V_SIZE_V(V, 32)>
+HWY_API V GetExponent(V v) {
+  return V{_mm256_getexp_ps(v.raw)};
+}
+template <class V, HWY_IF_F64(TFromV<V>), HWY_IF_V_SIZE_V(V, 32)>
+HWY_API V GetExponent(V v) {
+  return V{_mm256_getexp_pd(v.raw)};
+}
+
+#endif
+
 // ------------------------------ MaskedMinOr
 
 #if HWY_TARGET <= HWY_AVX3
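Editor's note: GetExponent maps to the AVX-512 vgetexp family, which extracts each lane's unbiased exponent as a floating-point value. A hedged scalar model for normal finite inputs:

// Editor's sketch, not part of the package: assumed GetExponent behavior.
#include <cmath>

double GetExponentModel(double v) {
  return std::floor(std::log2(std::fabs(v)));
}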
@@ -4170,48 +4362,130 @@ HWY_API Vec256<double> Broadcast(const Vec256<double> v) {
   return Vec256<double>{_mm256_shuffle_pd(v.raw, v.raw, 15 * kLane)};
 }
 
-// ------------------------------ BroadcastBlock
-
-template <int kBlockIdx, class T>
-HWY_API Vec256<T> BroadcastBlock(Vec256<T> v) {
-  static_assert(kBlockIdx == 0 || kBlockIdx == 1, "Invalid block index");
-  const DFromV<decltype(v)> d;
-  return (kBlockIdx == 0) ? ConcatLowerLower(d, v, v)
-                          : ConcatUpperUpper(d, v, v);
-}
-
-// ------------------------------ BroadcastLane
-
-namespace detail {
-
-template <class T, HWY_IF_T_SIZE(T, 1)>
-HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
-                                   Vec256<T> v) {
-  const Half<DFromV<decltype(v)>> dh;
-  return Vec256<T>{_mm256_broadcastb_epi8(LowerHalf(dh, v).raw)};
-}
-
-template <class T, HWY_IF_T_SIZE(T, 2)>
-HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
-                                   Vec256<T> v) {
-  const DFromV<decltype(v)> d;
-  const RebindToUnsigned<decltype(d)> du;  // for float16_t
-  const Half<decltype(d)> dh;
-  const RebindToUnsigned<decltype(dh)> dh_u;
-  return BitCast(d, VFromD<decltype(du)>{_mm256_broadcastw_epi16(
-                        BitCast(dh_u, LowerHalf(dh, v)).raw)});
-}
-
-template <class T, HWY_IF_UI32(T)>
-HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
-                                   Vec256<T> v) {
-  const Half<DFromV<decltype(v)>> dh;
-  return Vec256<T>{_mm256_broadcastd_epi32(LowerHalf(dh, v).raw)};
-}
-
-template <class T, HWY_IF_UI64(T)>
-HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
-                                   Vec256<T> v) {
+// ------------------------------ Concat blocks (LowerHalf, ZeroExtendVector)
+
+// _mm256_broadcastsi128_si256 has 7 cycle latency on ICL.
+// _mm256_permute2x128_si256 is slow on Zen1 (8 uops), so we avoid it (at no
+// extra cost) for LowerLower and UpperLower.
+
+// hiH,hiL loH,loL |-> hiL,loL (= lower halves)
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
+HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  const Half<decltype(d)> d2;
+  const RebindToUnsigned<decltype(d2)> du2;  // for float16_t
+  return BitCast(
+      d, VFromD<decltype(du)>{_mm256_inserti128_si256(
+             BitCast(du, lo).raw, BitCast(du2, LowerHalf(d2, hi)).raw, 1)});
+}
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
+HWY_API Vec256<float> ConcatLowerLower(D d, Vec256<float> hi,
+                                       Vec256<float> lo) {
+  const Half<decltype(d)> d2;
+  return Vec256<float>{_mm256_insertf128_ps(lo.raw, LowerHalf(d2, hi).raw, 1)};
+}
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
+HWY_API Vec256<double> ConcatLowerLower(D d, Vec256<double> hi,
+                                        Vec256<double> lo) {
+  const Half<decltype(d)> d2;
+  return Vec256<double>{_mm256_insertf128_pd(lo.raw, LowerHalf(d2, hi).raw, 1)};
+}
+
+// hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks)
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
+HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, VFromD<decltype(du)>{_mm256_permute2x128_si256(
+                        BitCast(du, lo).raw, BitCast(du, hi).raw, 0x21)});
+}
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
+HWY_API Vec256<float> ConcatLowerUpper(D /* tag */, Vec256<float> hi,
+                                       Vec256<float> lo) {
+  return Vec256<float>{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x21)};
+}
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
+HWY_API Vec256<double> ConcatLowerUpper(D /* tag */, Vec256<double> hi,
+                                        Vec256<double> lo) {
+  return Vec256<double>{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x21)};
+}
+
+// hiH,hiL loH,loL |-> hiH,loL (= outer halves)
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
+HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  return BitCast(d, VFromD<decltype(du)>{_mm256_blend_epi32(
+                        BitCast(du, hi).raw, BitCast(du, lo).raw, 0x0F)});
+}
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
+HWY_API Vec256<float> ConcatUpperLower(D /* tag */, Vec256<float> hi,
+                                       Vec256<float> lo) {
+  return Vec256<float>{_mm256_blend_ps(hi.raw, lo.raw, 0x0F)};
+}
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
+HWY_API Vec256<double> ConcatUpperLower(D /* tag */, Vec256<double> hi,
+                                        Vec256<double> lo) {
+  return Vec256<double>{_mm256_blend_pd(hi.raw, lo.raw, 3)};
+}
+
+// hiH,hiL loH,loL |-> hiH,loH (= upper halves)
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
+HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  return BitCast(d, VFromD<decltype(du)>{_mm256_permute2x128_si256(
+                        BitCast(du, lo).raw, BitCast(du, hi).raw, 0x31)});
+}
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
+HWY_API Vec256<float> ConcatUpperUpper(D /* tag */, Vec256<float> hi,
+                                       Vec256<float> lo) {
+  return Vec256<float>{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x31)};
+}
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
+HWY_API Vec256<double> ConcatUpperUpper(D /* tag */, Vec256<double> hi,
+                                        Vec256<double> lo) {
+  return Vec256<double>{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x31)};
+}
+
+// ------------------------------ BroadcastBlock
+template <int kBlockIdx, class T>
+HWY_API Vec256<T> BroadcastBlock(Vec256<T> v) {
+  static_assert(kBlockIdx == 0 || kBlockIdx == 1, "Invalid block index");
+  const DFromV<decltype(v)> d;
+  return (kBlockIdx == 0) ? ConcatLowerLower(d, v, v)
+                          : ConcatUpperUpper(d, v, v);
+}
+
+// ------------------------------ BroadcastLane
+
+namespace detail {
+
+template <class T, HWY_IF_T_SIZE(T, 1)>
+HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
+                                   Vec256<T> v) {
+  const Half<DFromV<decltype(v)>> dh;
+  return Vec256<T>{_mm256_broadcastb_epi8(LowerHalf(dh, v).raw)};
+}
+
+template <class T, HWY_IF_T_SIZE(T, 2)>
+HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
+                                   Vec256<T> v) {
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  const Half<decltype(d)> dh;
+  const RebindToUnsigned<decltype(dh)> dh_u;
+  return BitCast(d, VFromD<decltype(du)>{_mm256_broadcastw_epi16(
+                        BitCast(dh_u, LowerHalf(dh, v)).raw)});
+}
+
+template <class T, HWY_IF_UI32(T)>
+HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
+                                   Vec256<T> v) {
+  const Half<DFromV<decltype(v)>> dh;
+  return Vec256<T>{_mm256_broadcastd_epi32(LowerHalf(dh, v).raw)};
+}
+
+template <class T, HWY_IF_UI64(T)>
+HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
+                                   Vec256<T> v) {
   const Half<DFromV<decltype(v)>> dh;
   return Vec256<T>{_mm256_broadcastq_epi64(LowerHalf(dh, v).raw)};
 }
@@ -4651,6 +4925,18 @@ HWY_API Vec256<float> SwapAdjacentBlocks(Vec256<float> v) {
   return BitCast(d, SwapAdjacentBlocks(BitCast(dw, v)));
 }
 
+// ------------------------------ InterleaveEvenBlocks (ConcatLowerLower)
+template <class D, class V = VFromD<D>, HWY_IF_V_SIZE_D(D, 32)>
+HWY_API V InterleaveEvenBlocks(D d, V a, V b) {
+  return ConcatLowerLower(d, b, a);
+}
+
+// ------------------------------ InterleaveOddBlocks (ConcatUpperUpper)
+template <class D, class V = VFromD<D>, HWY_IF_V_SIZE_D(D, 32)>
+HWY_API V InterleaveOddBlocks(D d, V a, V b) {
+  return ConcatUpperUpper(d, b, a);
+}
+
 // ------------------------------ Reverse (RotateRight)
 
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 4)>
@@ -4807,89 +5093,6 @@ HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
   return VFromD<D>{_mm256_unpackhi_pd(a.raw, b.raw)};
 }
 
-// ------------------------------ Blocks (LowerHalf, ZeroExtendVector)
-
-// _mm256_broadcastsi128_si256 has 7 cycle latency on ICL.
-// _mm256_permute2x128_si256 is slow on Zen1 (8 uops), so we avoid it (at no
-// extra cost) for LowerLower and UpperLower.
-
-// hiH,hiL loH,loL |-> hiL,loL (= lower halves)
-template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
-HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
-  const RebindToUnsigned<decltype(d)> du;  // for float16_t
-  const Half<decltype(d)> d2;
-  const RebindToUnsigned<decltype(d2)> du2;  // for float16_t
-  return BitCast(
-      d, VFromD<decltype(du)>{_mm256_inserti128_si256(
-             BitCast(du, lo).raw, BitCast(du2, LowerHalf(d2, hi)).raw, 1)});
-}
-template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
-HWY_API Vec256<float> ConcatLowerLower(D d, Vec256<float> hi,
-                                       Vec256<float> lo) {
-  const Half<decltype(d)> d2;
-  return Vec256<float>{_mm256_insertf128_ps(lo.raw, LowerHalf(d2, hi).raw, 1)};
-}
-template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
-HWY_API Vec256<double> ConcatLowerLower(D d, Vec256<double> hi,
-                                        Vec256<double> lo) {
-  const Half<decltype(d)> d2;
-  return Vec256<double>{_mm256_insertf128_pd(lo.raw, LowerHalf(d2, hi).raw, 1)};
-}
-
-// hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks)
-template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
-HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
-  const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, VFromD<decltype(du)>{_mm256_permute2x128_si256(
-                        BitCast(du, lo).raw, BitCast(du, hi).raw, 0x21)});
-}
-template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
-HWY_API Vec256<float> ConcatLowerUpper(D /* tag */, Vec256<float> hi,
-                                       Vec256<float> lo) {
-  return Vec256<float>{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x21)};
-}
-template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
-HWY_API Vec256<double> ConcatLowerUpper(D /* tag */, Vec256<double> hi,
-                                        Vec256<double> lo) {
-  return Vec256<double>{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x21)};
-}
-
-// hiH,hiL loH,loL |-> hiH,loL (= outer halves)
-template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
-HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
-  const RebindToUnsigned<decltype(d)> du;  // for float16_t
-  return BitCast(d, VFromD<decltype(du)>{_mm256_blend_epi32(
-                        BitCast(du, hi).raw, BitCast(du, lo).raw, 0x0F)});
-}
-template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
-HWY_API Vec256<float> ConcatUpperLower(D /* tag */, Vec256<float> hi,
-                                       Vec256<float> lo) {
-  return Vec256<float>{_mm256_blend_ps(hi.raw, lo.raw, 0x0F)};
-}
-template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
-HWY_API Vec256<double> ConcatUpperLower(D /* tag */, Vec256<double> hi,
-                                        Vec256<double> lo) {
-  return Vec256<double>{_mm256_blend_pd(hi.raw, lo.raw, 3)};
-}
-
-// hiH,hiL loH,loL |-> hiH,loH (= upper halves)
-template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
-HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
-  const RebindToUnsigned<decltype(d)> du;  // for float16_t
-  return BitCast(d, VFromD<decltype(du)>{_mm256_permute2x128_si256(
-                        BitCast(du, lo).raw, BitCast(du, hi).raw, 0x31)});
-}
-template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
-HWY_API Vec256<float> ConcatUpperUpper(D /* tag */, Vec256<float> hi,
-                                       Vec256<float> lo) {
-  return Vec256<float>{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x31)};
-}
-template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
-HWY_API Vec256<double> ConcatUpperUpper(D /* tag */, Vec256<double> hi,
-                                        Vec256<double> lo) {
-  return Vec256<double>{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x31)};
-}
-
 // ---------------------------- InsertBlock (ConcatLowerLower, ConcatUpperLower)
 template <int kBlockIdx, class T>
 HWY_API Vec256<T> InsertBlock(Vec256<T> v, Vec128<T> blk_to_insert) {
@@ -6133,6 +6336,19 @@ HWY_API Vec256<int64_t> operator>>(Vec256<int64_t> v, Vec256<int64_t> bits) {
 }
 
 // ------------------------------ WidenMulPairwiseAdd
+
+#if HWY_NATIVE_DOT_BF16
+
+template <class DF, HWY_IF_F32_D(DF), HWY_IF_V_SIZE_D(DF, 32),
+          class VBF = VFromD<Repartition<bfloat16_t, DF>>>
+HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
+  return VFromD<DF>{_mm256_dpbf16_ps(Zero(df).raw,
+                                     reinterpret_cast<__m256bh>(a.raw),
+                                     reinterpret_cast<__m256bh>(b.raw))};
+}
+
+#endif  // HWY_NATIVE_DOT_BF16
+
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
 HWY_API VFromD<D> WidenMulPairwiseAdd(D /*d32*/, Vec256<int16_t> a,
                                       Vec256<int16_t> b) {
@@ -6291,7 +6507,9 @@ HWY_API VFromD<D> PromoteTo(D /* tag */, Vec32<int8_t> v) {
 #if HWY_TARGET <= HWY_AVX3
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I64_D(D)>
 HWY_API VFromD<D> PromoteInRangeTo(D /*di64*/, VFromD<Rebind<float, D>> v) {
-#if HWY_COMPILER_GCC_ACTUAL
+#if HWY_X86_HAVE_AVX10_2_OPS
+  return VFromD<D>{_mm256_cvtts_ps_epi64(v.raw)};
+#elif HWY_COMPILER_GCC_ACTUAL
   // Workaround for undefined behavior with GCC if any values of v[i] are not
   // within the range of an int64_t
 
@@ -6319,7 +6537,9 @@ HWY_API VFromD<D> PromoteInRangeTo(D /*di64*/, VFromD<Rebind<float, D>> v) {
 }
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U64_D(D)>
 HWY_API VFromD<D> PromoteInRangeTo(D /* tag */, VFromD<Rebind<float, D>> v) {
-#if HWY_COMPILER_GCC_ACTUAL
+#if HWY_X86_HAVE_AVX10_2_OPS
+  return VFromD<D>{_mm256_cvtts_ps_epu64(v.raw)};
+#elif HWY_COMPILER_GCC_ACTUAL
   // Workaround for undefined behavior with GCC if any values of v[i] are not
   // within the range of an uint64_t
 #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
@@ -6666,6 +6886,31 @@ HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
                                    _MM_SHUFFLE(3, 1, 2, 0))};
 }
 
+#if HWY_TARGET <= HWY_AVX3
+template <class D, HWY_IF_V_SIZE_D(D, HWY_MAX_BYTES), HWY_IF_UI32_D(D)>
+HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<int64_t, D>> a,
+                                   VFromD<Repartition<int64_t, D>> b) {
+  const Half<decltype(dn)> dnh;
+  return Combine(dn, DemoteTo(dnh, b), DemoteTo(dnh, a));
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, HWY_MAX_BYTES), HWY_IF_U32_D(D)>
+HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint64_t, D>> a,
+                                   VFromD<Repartition<uint64_t, D>> b) {
+  const Half<decltype(dn)> dnh;
+  return Combine(dn, DemoteTo(dnh, b), DemoteTo(dnh, a));
+}
+
+template <class D, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>),
+          HWY_IF_V_SIZE_GT_D(D, 16), class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
+          HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
+          HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2),
+          HWY_IF_T_SIZE_V(V, 8)>
+HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
+  return ReorderDemote2To(d, a, b);
+}
+#endif
+
 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
 HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<double> v) {
   return VFromD<D>{_mm256_cvtpd_ps(v.raw)};
@@ -6673,7 +6918,9 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<double> v) {
 
 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
 HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, Vec256<double> v) {
-#if HWY_COMPILER_GCC_ACTUAL
+#if HWY_X86_HAVE_AVX10_2_OPS
+  return VFromD<D>{_mm256_cvtts_pd_epi32(v.raw)};
+#elif HWY_COMPILER_GCC_ACTUAL
   // Workaround for undefined behavior in _mm256_cvttpd_epi32 with GCC if any
   // values of v[i] are not within the range of an int32_t
 
@@ -6703,7 +6950,9 @@ HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, Vec256<double> v) {
 #if HWY_TARGET <= HWY_AVX3
 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
 HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, Vec256<double> v) {
-#if HWY_COMPILER_GCC_ACTUAL
+#if HWY_X86_HAVE_AVX10_2_OPS
+  return VFromD<D>{_mm256_cvtts_pd_epu32(v.raw)};
+#elif HWY_COMPILER_GCC_ACTUAL
   // Workaround for undefined behavior in _mm256_cvttpd_epu32 with GCC if any
   // values of v[i] are not within the range of an uint32_t
 
@@ -6998,7 +7247,9 @@ HWY_API VFromD<D> ConvertInRangeTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
 
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
 HWY_API VFromD<D> ConvertInRangeTo(D /*d*/, Vec256<float> v) {
-#if HWY_COMPILER_GCC_ACTUAL
+#if HWY_X86_HAVE_AVX10_2_OPS
+  return VFromD<D>{_mm256_cvtts_ps_epi32(v.raw)};
+#elif HWY_COMPILER_GCC_ACTUAL
   // Workaround for undefined behavior in _mm256_cvttps_epi32 with GCC if any
   // values of v[i] are not within the range of an int32_t
 
@@ -7032,7 +7283,9 @@ HWY_API VFromD<D> ConvertInRangeTo(D /*d*/, Vec256<float> v) {
 #if HWY_TARGET <= HWY_AVX3
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I64_D(D)>
 HWY_API VFromD<D> ConvertInRangeTo(D /*di*/, Vec256<double> v) {
-#if HWY_COMPILER_GCC_ACTUAL
+#if HWY_X86_HAVE_AVX10_2_OPS
+  return VFromD<D>{_mm256_cvtts_pd_epi64(v.raw)};
+#elif HWY_COMPILER_GCC_ACTUAL
   // Workaround for undefined behavior in _mm256_cvttpd_epi64 with GCC if any
   // values of v[i] are not within the range of an int64_t
 
@@ -7060,7 +7313,9 @@ HWY_API VFromD<D> ConvertInRangeTo(D /*di*/, Vec256<double> v) {
 }
 template <class DU, HWY_IF_V_SIZE_D(DU, 32), HWY_IF_U32_D(DU)>
 HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
-#if HWY_COMPILER_GCC_ACTUAL
+#if HWY_X86_HAVE_AVX10_2_OPS
+  return VFromD<DU>{_mm256_cvtts_ps_epu32(v.raw)};
+#elif HWY_COMPILER_GCC_ACTUAL
   // Workaround for undefined behavior in _mm256_cvttps_epu32 with GCC if any
   // values of v[i] are not within the range of an uint32_t
 
@@ -7100,7 +7355,9 @@ HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
 }
 template <class DU, HWY_IF_V_SIZE_D(DU, 32), HWY_IF_U64_D(DU)>
 HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
-#if HWY_COMPILER_GCC_ACTUAL
+#if HWY_X86_HAVE_AVX10_2_OPS
+  return VFromD<DU>{_mm256_cvtts_pd_epu64(v.raw)};
+#elif HWY_COMPILER_GCC_ACTUAL
   // Workaround for undefined behavior in _mm256_cvttpd_epu64 with GCC if any
   // values of v[i] are not within the range of an uint64_t
 
@@ -7133,7 +7390,8 @@ HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
 #endif  // HWY_TARGET <= HWY_AVX3
 
 template <class DI, HWY_IF_V_SIZE_D(DI, 32), HWY_IF_I32_D(DI)>
-HWY_INLINE VFromD<DI> NearestIntInRange(DI, VFromD<RebindToFloat<DI>> v) {
+static HWY_INLINE VFromD<DI> NearestIntInRange(DI,
+                                               VFromD<RebindToFloat<DI>> v) {
 #if HWY_COMPILER_GCC_ACTUAL
   // Workaround for undefined behavior in _mm256_cvtps_epi32 if any values of
   // v[i] are not within the range of an int32_t
@@ -7165,6 +7423,113 @@ HWY_INLINE VFromD<DI> NearestIntInRange(DI, VFromD<RebindToFloat<DI>> v) {
 #endif  // HWY_COMPILER_GCC_ACTUAL
 }
 
+#if HWY_HAVE_FLOAT16
+template <class DI, HWY_IF_V_SIZE_D(DI, 32), HWY_IF_I16_D(DI)>
+static HWY_INLINE VFromD<DI> NearestIntInRange(DI /*d*/, Vec256<float16_t> v) {
+#if HWY_COMPILER_GCC_ACTUAL
+  // Workaround for undefined behavior in _mm256_cvtph_epi16 with GCC if any
+  // values of v[i] are not within the range of an int16_t
+
+#if HWY_COMPILER_GCC_ACTUAL >= 1200 && !HWY_IS_DEBUG_BUILD && \
+    HWY_HAVE_SCALAR_F16_TYPE
+  if (detail::IsConstantX86VecForF2IConv<int16_t>(v)) {
+    typedef hwy::float16_t::Native GccF16RawVectType
+        __attribute__((__vector_size__(32)));
+    const auto raw_v = reinterpret_cast<GccF16RawVectType>(v.raw);
+    return VFromD<DI>{
+        _mm256_setr_epi16(detail::X86ScalarNearestInt<int16_t>(raw_v[0]),
+                          detail::X86ScalarNearestInt<int16_t>(raw_v[1]),
+                          detail::X86ScalarNearestInt<int16_t>(raw_v[2]),
+                          detail::X86ScalarNearestInt<int16_t>(raw_v[3]),
+                          detail::X86ScalarNearestInt<int16_t>(raw_v[4]),
+                          detail::X86ScalarNearestInt<int16_t>(raw_v[5]),
+                          detail::X86ScalarNearestInt<int16_t>(raw_v[6]),
+                          detail::X86ScalarNearestInt<int16_t>(raw_v[7]),
+                          detail::X86ScalarNearestInt<int16_t>(raw_v[8]),
+                          detail::X86ScalarNearestInt<int16_t>(raw_v[9]),
+                          detail::X86ScalarNearestInt<int16_t>(raw_v[10]),
+                          detail::X86ScalarNearestInt<int16_t>(raw_v[11]),
+                          detail::X86ScalarNearestInt<int16_t>(raw_v[12]),
+                          detail::X86ScalarNearestInt<int16_t>(raw_v[13]),
+                          detail::X86ScalarNearestInt<int16_t>(raw_v[14]),
+                          detail::X86ScalarNearestInt<int16_t>(raw_v[15]))};
+  }
+#endif
+
+  __m256i raw_result;
+  __asm__("vcvtph2w {%1, %0|%0, %1}"
+          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+          :);
+  return VFromD<DI>{raw_result};
+#else  // HWY_COMPILER_GCC_ACTUAL
+  return VFromD<DI>{_mm256_cvtph_epi16(v.raw)};
+#endif
+}
+#endif
+
+#if HWY_TARGET <= HWY_AVX3
+template <class DI, HWY_IF_V_SIZE_D(DI, 32), HWY_IF_I64_D(DI)>
+static HWY_INLINE VFromD<DI> NearestIntInRange(DI,
+                                               VFromD<RebindToFloat<DI>> v) {
+#if HWY_COMPILER_GCC_ACTUAL
+  // Workaround for undefined behavior in _mm256_cvtpd_epi64 with GCC if any
+  // values of v[i] are not within the range of an int64_t
+
+#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+  if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) {
+    typedef double GccF64RawVectType __attribute__((__vector_size__(32)));
+    const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
+    return VFromD<DI>{
+        _mm256_setr_epi64x(detail::X86ScalarNearestInt<int64_t>(raw_v[0]),
+                           detail::X86ScalarNearestInt<int64_t>(raw_v[1]),
+                           detail::X86ScalarNearestInt<int64_t>(raw_v[2]),
+                           detail::X86ScalarNearestInt<int64_t>(raw_v[3]))};
+  }
+#endif
+
+  __m256i raw_result;
+  __asm__("vcvtpd2qq {%1, %0|%0, %1}"
+          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+          :);
+  return VFromD<DI>{raw_result};
+#else  // !HWY_COMPILER_GCC_ACTUAL
+  return VFromD<DI>{_mm256_cvtpd_epi64(v.raw)};
+#endif  // HWY_COMPILER_GCC_ACTUAL
+}
+#endif  // HWY_TARGET <= HWY_AVX3
+
+template <class DI, HWY_IF_V_SIZE_D(DI, 16), HWY_IF_I32_D(DI)>
+static HWY_INLINE VFromD<DI> DemoteToNearestIntInRange(
+    DI, VFromD<Rebind<double, DI>> v) {
+#if HWY_COMPILER_GCC_ACTUAL
+  // Workaround for undefined behavior in _mm256_cvtpd_epi32 with GCC if any
+  // values of v[i] are not within the range of an int32_t
+
+#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+  if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
+    typedef double GccF32RawVectType __attribute__((__vector_size__(32)));
+    const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
+    return Dup128VecFromValues(DI(),
+                               detail::X86ScalarNearestInt<int32_t>(raw_v[0]),
+                               detail::X86ScalarNearestInt<int32_t>(raw_v[1]),
+                               detail::X86ScalarNearestInt<int32_t>(raw_v[2]),
+                               detail::X86ScalarNearestInt<int32_t>(raw_v[3]));
+  }
+#endif
+
+  __m128i raw_result;
+  __asm__("vcvtpd2dq {%1, %0|%0, %1}"
+          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+          :);
+  return VFromD<DI>{raw_result};
+#else  // !HWY_COMPILER_GCC_ACTUAL
+  return VFromD<DI>{_mm256_cvtpd_epi32(v.raw)};
+#endif
+}
+
 #ifndef HWY_DISABLE_F16C
 
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
@@ -7592,26 +7957,22 @@ HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
   return detail::LoadMaskBits256<TFromD<D>>(mask_bits);
 }
 
-// ------------------------------ Mask
-
-namespace detail {
+// ------------------------------ BitsFromMask
 
-template <typename T, HWY_IF_T_SIZE(T, 1)>
-HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
-  const Full256<T> d;
-  const Full256<uint8_t> d8;
+template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_D(D, 32)>
+HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
+  const RebindToUnsigned<D> d8;
   const auto sign_bits = BitCast(d8, VecFromMask(d, mask)).raw;
   // Prevent sign-extension of 32-bit masks because the intrinsic returns int.
   return static_cast<uint32_t>(_mm256_movemask_epi8(sign_bits));
 }
 
-template <typename T, HWY_IF_T_SIZE(T, 2)>
-HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
+template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_D(D, 32)>
+HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
 #if !defined(HWY_DISABLE_BMI2_FMA) && !defined(HWY_DISABLE_PEXT_ON_AVX2)
-  const Full256<T> d;
-  const Full256<uint8_t> d8;
+  const Repartition<uint8_t, D> d8;
   const Mask256<uint8_t> mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask)));
-  const uint64_t sign_bits8 = BitsFromMask(mask8);
+  const uint64_t sign_bits8 = BitsFromMask(d8, mask8);
   // Skip the bits from the lower byte of each u16 (better not to use the
   // same packs_epi16 as SSE4, because that requires an extra swizzle here).
   return _pext_u32(static_cast<uint32_t>(sign_bits8), 0xAAAAAAAAu);
@@ -7627,32 +7988,29 @@ HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
 #endif  // HWY_ARCH_X86_64
 }
 
-template <typename T, HWY_IF_T_SIZE(T, 4)>
-HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
-  const Full256<T> d;
-  const Full256<float> df;
+template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_D(D, 32)>
+HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
+  const RebindToFloat<D> df;
   const auto sign_bits = BitCast(df, VecFromMask(d, mask)).raw;
   return static_cast<unsigned>(_mm256_movemask_ps(sign_bits));
 }
 
-template <typename T, HWY_IF_T_SIZE(T, 8)>
-HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
-  const Full256<T> d;
-  const Full256<double> df;
+template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_D(D, 32)>
+HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
+  const RebindToFloat<D> df;
   const auto sign_bits = BitCast(df, VecFromMask(d, mask)).raw;
   return static_cast<unsigned>(_mm256_movemask_pd(sign_bits));
 }
 
-}  // namespace detail
-
+// ------------------------------ StoreMaskBits
 // `p` points to at least 8 writable bytes.
 template <class D, HWY_IF_V_SIZE_D(D, 32)>
 HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
-  constexpr size_t N = 32 / sizeof(TFromD<D>);
-  constexpr size_t kNumBytes = (N + 7) / 8;
+  HWY_LANES_CONSTEXPR size_t N = Lanes(d);
+  HWY_LANES_CONSTEXPR size_t kNumBytes = (N + 7) / 8;
 
-  const uint64_t mask_bits = detail::BitsFromMask(mask);
-  CopyBytes<kNumBytes>(&mask_bits, bits);
+  const uint64_t mask_bits = BitsFromMask(d, mask);
+  CopyBytes(&mask_bits, bits, kNumBytes);
   return kNumBytes;
 }
 
@@ -7664,59 +8022,59 @@ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
 HWY_API bool AllFalse(D d, MFromD<D> mask) {
   const Repartition<uint8_t, decltype(d)> d8;
   const Mask256<uint8_t> mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask)));
-  return
+  return BitsFromMask(d8, mask8) == 0;
 }

 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_T_SIZE_D(D, 2)>
-HWY_API bool AllFalse(D
+HWY_API bool AllFalse(D d, MFromD<D> mask) {
   // Cheaper than PTEST, which is 2 uop / 3L.
-  return
+  return BitsFromMask(d, mask) == 0;
 }

 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
 HWY_API bool AllTrue(D d, MFromD<D> mask) {
   const Repartition<uint8_t, decltype(d)> d8;
   const Mask256<uint8_t> mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask)));
-  return
+  return BitsFromMask(d8, mask8) == (1ull << 32) - 1;
 }
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_T_SIZE_D(D, 2)>
 HWY_API bool AllTrue(D d, MFromD<D> mask) {
-  constexpr uint64_t kAllBits = (1ull <<
-  return
+  constexpr uint64_t kAllBits = (1ull << MaxLanes(d)) - 1;
+  return BitsFromMask(d, mask) == kAllBits;
 }

 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
 HWY_API size_t CountTrue(D d, MFromD<D> mask) {
   const Repartition<uint8_t, decltype(d)> d8;
   const Mask256<uint8_t> mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask)));
-  return PopCount(
+  return PopCount(BitsFromMask(d8, mask8)) >> 1;
 }
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_T_SIZE_D(D, 2)>
-HWY_API size_t CountTrue(D
-  return PopCount(
+HWY_API size_t CountTrue(D d, MFromD<D> mask) {
+  return PopCount(BitsFromMask(d, mask));
 }

 template <class D, HWY_IF_V_SIZE_D(D, 32)>
-HWY_API size_t FindKnownFirstTrue(D
-  const uint32_t mask_bits = static_cast<uint32_t>(
+HWY_API size_t FindKnownFirstTrue(D d, MFromD<D> mask) {
+  const uint32_t mask_bits = static_cast<uint32_t>(BitsFromMask(d, mask));
   return Num0BitsBelowLS1Bit_Nonzero32(mask_bits);
 }

 template <class D, HWY_IF_V_SIZE_D(D, 32)>
-HWY_API intptr_t FindFirstTrue(D
-  const uint32_t mask_bits = static_cast<uint32_t>(
+HWY_API intptr_t FindFirstTrue(D d, MFromD<D> mask) {
+  const uint32_t mask_bits = static_cast<uint32_t>(BitsFromMask(d, mask));
   return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1;
 }

 template <class D, HWY_IF_V_SIZE_D(D, 32)>
-HWY_API size_t FindKnownLastTrue(D
-  const uint32_t mask_bits = static_cast<uint32_t>(
+HWY_API size_t FindKnownLastTrue(D d, MFromD<D> mask) {
+  const uint32_t mask_bits = static_cast<uint32_t>(BitsFromMask(d, mask));
   return 31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits);
 }

 template <class D, HWY_IF_V_SIZE_D(D, 32)>
-HWY_API intptr_t FindLastTrue(D
-  const uint32_t mask_bits = static_cast<uint32_t>(
+HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) {
+  const uint32_t mask_bits = static_cast<uint32_t>(BitsFromMask(d, mask));
   return mask_bits ? intptr_t(31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits))
                    : -1;
 }
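These hunks route the mask reductions and searches through a single BitsFromMask call instead of open-coded movemask sequences. Minimal usage sketch (illustrative names, not from the diff):

#include <cstdint>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Index of the first lane of `v` equal to `value`, or -1 if none.
// FindFirstTrue is among the ops rewritten above to use BitsFromMask.
template <class D>
intptr_t IndexOfValue(D d, hn::Vec<D> v, hn::TFromD<D> value) {
  return hn::FindFirstTrue(d, hn::Eq(v, hn::Set(d, value)));
}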
@@ -7969,12 +8327,14 @@ HWY_INLINE Vec256<T> CompressNot(Vec256<T> v, const uint64_t mask_bits) {

 template <typename T, HWY_IF_NOT_T_SIZE(T, 1)>
 HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> m) {
-
+  const DFromV<decltype(v)> d;
+  return detail::Compress(v, BitsFromMask(d, m));
 }

 template <typename T, HWY_IF_NOT_T_SIZE(T, 1)>
 HWY_API Vec256<T> CompressNot(Vec256<T> v, Mask256<T> m) {
-
+  const DFromV<decltype(v)> d;
+  return detail::CompressNot(v, BitsFromMask(d, m));
 }

 HWY_API Vec256<uint64_t> CompressBlocksNot(Vec256<uint64_t> v,
@@ -8002,7 +8362,7 @@ HWY_API Vec256<T> CompressBits(Vec256<T> v, const uint8_t* HWY_RESTRICT bits) {
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_T_SIZE_D(D, 1)>
 HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> m, D d,
                              TFromD<D>* HWY_RESTRICT unaligned) {
-  const uint64_t mask_bits =
+  const uint64_t mask_bits = BitsFromMask(d, m);
   const size_t count = PopCount(mask_bits);
   StoreU(detail::Compress(v, mask_bits), d, unaligned);
   detail::MaybeUnpoison(unaligned, count);
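Sketch of typical CompressStore usage (illustrative only; the hunk above changes only how mask_bits is obtained):

#include <cstddef>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Writes the non-negative lanes of one vector contiguously to `out` and
// returns how many were kept. CompressStore may overwrite up to Lanes(d)
// elements of `out`, so the buffer needs full-vector slack.
size_t KeepNonNegative(const float* in, float* out) {
  const hn::ScalableTag<float> d;
  const auto v = hn::LoadU(d, in);  // `in` must hold Lanes(d) floats
  return hn::CompressStore(v, hn::Ge(v, hn::Zero(d)), d, out);
}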
@@ -8013,7 +8373,7 @@ template <class D, HWY_IF_V_SIZE_D(D, 32),
           HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
 HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
                                     TFromD<D>* HWY_RESTRICT unaligned) {
-  const uint64_t mask_bits =
+  const uint64_t mask_bits = BitsFromMask(d, m);
   const size_t count = PopCount(mask_bits);

   const RebindToUnsigned<decltype(d)> du;
@@ -8040,7 +8400,7 @@ HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
 HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
                                     TFromD<D>* HWY_RESTRICT unaligned) {
-  const uint64_t mask_bits =
+  const uint64_t mask_bits = BitsFromMask(d, m);
   const size_t count = PopCount(mask_bits);
   const VFromD<D> compressed = detail::Compress(v, mask_bits);

@@ -8059,11 +8419,11 @@ HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_T_SIZE_D(D, 1)>
 HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
                                  D d, TFromD<D>* HWY_RESTRICT unaligned) {
-
-
+  HWY_LANES_CONSTEXPR size_t N = Lanes(d);
+  HWY_LANES_CONSTEXPR size_t kNumBytes = (N + 7) / 8;

   uint64_t mask_bits = 0;
-  CopyBytes
+  CopyBytes(bits, &mask_bits, kNumBytes);

   if (N < 8) {
     mask_bits &= (1ull << N) - 1;
@@ -8157,7 +8517,7 @@ HWY_API Vec256<T> Expand(Vec256<T> v, Mask256<T> mask) {
   // LUTs are infeasible for so many mask combinations, so Combine two
   // half-vector Expand.
   const Half<decltype(d)> dh;
-  const uint64_t mask_bits =
+  const uint64_t mask_bits = BitsFromMask(d, mask);
   constexpr size_t N = 32 / sizeof(T);
   const size_t countL = PopCount(mask_bits & ((1 << (N / 2)) - 1));
   const Mask128<T> maskL = MaskFromVec(LowerHalf(VecFromMask(d, mask)));
@@ -8211,7 +8571,7 @@ HWY_API Vec256<T> Expand(Vec256<T> v, Mask256<T> mask) {
   return BitCast(d, detail::NativeExpand(BitCast(du, v), mu));
 #else
   const RebindToUnsigned<decltype(d)> du;
-  const uint64_t mask_bits =
+  const uint64_t mask_bits = BitsFromMask(d, mask);

   alignas(16) constexpr uint32_t packed_array[256] = {
       // PrintExpand32x8Nibble.
@@ -8280,7 +8640,7 @@ HWY_API Vec256<T> Expand(Vec256<T> v, Mask256<T> mask) {
   return BitCast(d, detail::NativeExpand(BitCast(du, v), mu));
 #else
   const RebindToUnsigned<decltype(d)> du;
-  const uint64_t mask_bits =
+  const uint64_t mask_bits = BitsFromMask(d, mask);

   alignas(16) constexpr uint64_t packed_array[16] = {
       // PrintExpand64x4Nibble.
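Expand is the inverse of Compress; the three hunks above change only how its non-native paths obtain mask_bits. Illustrative round trip (not from the diff):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Compress followed by Expand restores the active lanes of `v` in place and
// zeroes the rest, i.e. the result equals IfThenElseZero(m, v).
template <class D>
hn::Vec<D> MaskedIdentity(D d, hn::Vec<D> v, hn::Mask<D> m) {
  (void)d;  // only needed to deduce the vector/mask types
  return hn::Expand(hn::Compress(v, m), m);
}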
@@ -8354,7 +8714,7 @@ namespace detail {
 template <class D, HWY_IF_V_SIZE_D(D, 32)>
 HWY_API void LoadTransposedBlocks3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                                    VFromD<D>& A, VFromD<D>& B, VFromD<D>& C) {
-
+  HWY_LANES_CONSTEXPR size_t N = Lanes(d);
   const VFromD<D> v10 = LoadU(d, unaligned + 0 * N);  // 1 0
   const VFromD<D> v32 = LoadU(d, unaligned + 1 * N);
   const VFromD<D> v54 = LoadU(d, unaligned + 2 * N);
@@ -8378,7 +8738,7 @@ template <class D, HWY_IF_V_SIZE_D(D, 32)>
 HWY_API void LoadTransposedBlocks4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                                    VFromD<D>& vA, VFromD<D>& vB, VFromD<D>& vC,
                                    VFromD<D>& vD) {
-
+  HWY_LANES_CONSTEXPR size_t N = Lanes(d);
   const VFromD<D> v10 = LoadU(d, unaligned + 0 * N);
   const VFromD<D> v32 = LoadU(d, unaligned + 1 * N);
   const VFromD<D> v54 = LoadU(d, unaligned + 2 * N);
@@ -8405,7 +8765,7 @@ namespace detail {
 template <class D, HWY_IF_V_SIZE_D(D, 32)>
 HWY_API void StoreTransposedBlocks2(VFromD<D> i, VFromD<D> j, D d,
                                     TFromD<D>* HWY_RESTRICT unaligned) {
-
+  HWY_LANES_CONSTEXPR size_t N = Lanes(d);
   const auto out0 = ConcatLowerLower(d, j, i);
   const auto out1 = ConcatUpperUpper(d, j, i);
   StoreU(out0, d, unaligned + 0 * N);
@@ -8423,7 +8783,7 @@ HWY_API void StoreTransposedBlocks2(VFromD<D> i, VFromD<D> j, D d,
 template <class D, HWY_IF_V_SIZE_D(D, 32)>
 HWY_API void StoreTransposedBlocks3(VFromD<D> i, VFromD<D> j, VFromD<D> k, D d,
                                     TFromD<D>* HWY_RESTRICT unaligned) {
-
+  HWY_LANES_CONSTEXPR size_t N = Lanes(d);
   const auto out0 = ConcatLowerLower(d, j, i);
   const auto out1 = ConcatUpperLower(d, i, k);
   const auto out2 = ConcatUpperUpper(d, k, j);
@@ -8446,7 +8806,7 @@ template <class D, HWY_IF_V_SIZE_D(D, 32)>
 HWY_API void StoreTransposedBlocks4(VFromD<D> i, VFromD<D> j, VFromD<D> k,
                                     VFromD<D> l, D d,
                                     TFromD<D>* HWY_RESTRICT unaligned) {
-
+  HWY_LANES_CONSTEXPR size_t N = Lanes(d);
   // Write lower halves, then upper.
   const auto out0 = ConcatLowerLower(d, j, i);
   const auto out1 = ConcatLowerLower(d, l, k);
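The five hunks above make the same change: N in the transposed-block helpers becomes HWY_LANES_CONSTEXPR. On this target these detail helpers back the public (de)interleaving ops; a usage sketch (illustrative, not from the diff):

#include <cstdint>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// De-interleaves packed RGB bytes into planar vectors and re-interleaves
// them; both ops route through the Load/StoreTransposedBlocks helpers.
void RgbRoundTrip(const uint8_t* rgb, uint8_t* out) {
  const hn::ScalableTag<uint8_t> d;
  hn::Vec<decltype(d)> r, g, b;
  hn::LoadInterleaved3(d, rgb, r, g, b);  // `rgb` holds 3 * Lanes(d) bytes
  hn::StoreInterleaved3(r, g, b, d, out);
}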
@@ -8464,7 +8824,7 @@ HWY_API void StoreTransposedBlocks4(VFromD<D> i, VFromD<D> j, VFromD<D> k,
 #if HWY_TARGET <= HWY_AVX3
 template <class T>
 HWY_API Mask256<T> SetAtOrAfterFirst(Mask256<T> mask) {
-  constexpr size_t N =
+  constexpr size_t N = MaxLanes(Full256<T>());
   constexpr uint32_t kActiveElemMask =
       static_cast<uint32_t>((uint64_t{1} << N) - 1);
   return Mask256<T>{static_cast<typename Mask256<T>::Raw>(
@@ -8472,7 +8832,7 @@ HWY_API Mask256<T> SetAtOrAfterFirst(Mask256<T> mask) {
 }
 template <class T>
 HWY_API Mask256<T> SetBeforeFirst(Mask256<T> mask) {
-  constexpr size_t N =
+  constexpr size_t N = MaxLanes(Full256<T>());
   constexpr uint32_t kActiveElemMask =
       static_cast<uint32_t>((uint64_t{1} << N) - 1);
   return Mask256<T>{static_cast<typename Mask256<T>::Raw>(
@@ -8480,7 +8840,7 @@ HWY_API Mask256<T> SetBeforeFirst(Mask256<T> mask) {
 }
 template <class T>
 HWY_API Mask256<T> SetAtOrBeforeFirst(Mask256<T> mask) {
-  constexpr size_t N =
+  constexpr size_t N = MaxLanes(Full256<T>());
   constexpr uint32_t kActiveElemMask =
       static_cast<uint32_t>((uint64_t{1} << N) - 1);
   return Mask256<T>{static_cast<typename Mask256<T>::Raw>(
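These three AVX3 hunks derive N via MaxLanes(Full256<T>()) instead of a spelled-out constant. A hedged sketch of what a mask-prefix op such as SetBeforeFirst is useful for (illustrative, not from the diff):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Zeroes every lane at or after the first zero byte (C-string style).
// SetBeforeFirst keeps only lanes strictly before the first true mask lane;
// if there is no zero byte, `v` is returned unchanged.
template <class D>
hn::Vec<D> TruncateAtNul(D d, hn::Vec<D> v) {
  return hn::IfThenElseZero(hn::SetBeforeFirst(hn::Eq(v, hn::Zero(d))), v);
}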
@@ -8579,6 +8939,24 @@ HWY_API V BitShuffle(V v, VI idx) {
 }
 #endif  // HWY_TARGET <= HWY_AVX3_DL

+// ------------------------------ MultiRotateRight
+
+#if HWY_TARGET <= HWY_AVX3_DL
+
+#ifdef HWY_NATIVE_MULTIROTATERIGHT
+#undef HWY_NATIVE_MULTIROTATERIGHT
+#else
+#define HWY_NATIVE_MULTIROTATERIGHT
+#endif
+
+template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>),
+          HWY_IF_V_SIZE_V(V, 32), HWY_IF_V_SIZE_V(VI, HWY_MAX_LANES_V(V) * 8)>
+HWY_API V MultiRotateRight(V v, VI idx) {
+  return V{_mm256_multishift_epi64_epi8(idx.raw, v.raw)};
+}
+
+#endif
+
 // ------------------------------ LeadingZeroCount

 #if HWY_TARGET <= HWY_AVX3
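The new MultiRotateRight op wraps the VBMI VPMULTISHIFTQB instruction. As a hedged scalar model of its semantics (my reading of the instruction, not code from the diff): byte j of each 64-bit result lane is the 8-bit field of the corresponding source lane starting at bit offset idx_j mod 64, wrapping past bit 63.

#include <cstdint>

// Models one result byte: extract the 8 bits of `lane` starting at bit
// `offset` (mod 64), with wrap-around; equivalent to taking the low byte
// of `lane` rotated right by `offset`.
uint8_t MultiRotateRightByte(uint64_t lane, uint8_t offset) {
  const unsigned o = offset & 63u;
  const uint64_t rotated = (lane >> o) | (o ? (lane << (64 - o)) : 0);
  return static_cast<uint8_t>(rotated & 0xFF);
}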
@@ -8591,6 +8969,85 @@ template <class V, HWY_IF_UI64(TFromV<V>), HWY_IF_V_SIZE_V(V, 32)>
 HWY_API V LeadingZeroCount(V v) {
   return V{_mm256_lzcnt_epi64(v.raw)};
 }
+
+namespace detail {
+
+template <class V, HWY_IF_UNSIGNED_V(V),
+          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
+          HWY_IF_LANES_LE_D(DFromV<V>, HWY_MAX_BYTES / 4)>
+static HWY_INLINE HWY_MAYBE_UNUSED V Lzcnt32ForU8OrU16OrU32(V v) {
+  const DFromV<decltype(v)> d;
+  const Rebind<int32_t, decltype(d)> di32;
+  const Rebind<uint32_t, decltype(d)> du32;
+
+  const auto v_lz_count = LeadingZeroCount(PromoteTo(du32, v));
+  return DemoteTo(d, BitCast(di32, v_lz_count));
+}
+
+template <class V, HWY_IF_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, 4)>
+static HWY_INLINE HWY_MAYBE_UNUSED V Lzcnt32ForU8OrU16OrU32(V v) {
+  return LeadingZeroCount(v);
+}
+
+template <class V, HWY_IF_UNSIGNED_V(V),
+          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
+          HWY_IF_LANES_GT_D(DFromV<V>, HWY_MAX_BYTES / 4)>
+static HWY_INLINE HWY_MAYBE_UNUSED V Lzcnt32ForU8OrU16OrU32(V v) {
+  const DFromV<decltype(v)> d;
+  const RepartitionToWide<decltype(d)> dw;
+  const RebindToSigned<decltype(dw)> dw_i;
+
+  const auto lo_v_lz_count = Lzcnt32ForU8OrU16OrU32(PromoteLowerTo(dw, v));
+  const auto hi_v_lz_count = Lzcnt32ForU8OrU16OrU32(PromoteUpperTo(dw, v));
+  return OrderedDemote2To(d, BitCast(dw_i, lo_v_lz_count),
+                          BitCast(dw_i, hi_v_lz_count));
+}
+
+}  // namespace detail
+
+template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
+          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
+HWY_API V LeadingZeroCount(V v) {
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  using TU = TFromD<decltype(du)>;
+
+  constexpr TU kNumOfBitsInT{sizeof(TU) * 8};
+  const auto v_lzcnt32 = detail::Lzcnt32ForU8OrU16OrU32(BitCast(du, v));
+  return BitCast(d, Min(v_lzcnt32 - Set(du, TU{32 - kNumOfBitsInT}),
+                        Set(du, TU{kNumOfBitsInT})));
+}
+
+template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
+          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
+HWY_API V HighestSetBitIndex(V v) {
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  using TU = TFromD<decltype(du)>;
+  return BitCast(
+      d, Set(du, TU{31}) - detail::Lzcnt32ForU8OrU16OrU32(BitCast(du, v)));
+}
+
+template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
+          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))>
+HWY_API V HighestSetBitIndex(V v) {
+  const DFromV<decltype(v)> d;
+  using T = TFromD<decltype(d)>;
+  return BitCast(d, Set(d, T{sizeof(T) * 8 - 1}) - LeadingZeroCount(v));
+}
+
+template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
+HWY_API V TrailingZeroCount(V v) {
+  const DFromV<decltype(v)> d;
+  const RebindToSigned<decltype(d)> di;
+  using T = TFromD<decltype(d)>;
+
+  const auto vi = BitCast(di, v);
+  const auto lowest_bit = BitCast(d, And(vi, Neg(vi)));
+  constexpr T kNumOfBitsInT{sizeof(T) * 8};
+  const auto bit_idx = HighestSetBitIndex(lowest_bit);
+  return IfThenElse(MaskFromVec(bit_idx), Set(d, kNumOfBitsInT), bit_idx);
+}
 #endif  // HWY_TARGET <= HWY_AVX3

 // NOLINTNEXTLINE(google-readability-namespace-comments)
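The narrow-lane LeadingZeroCount added above promotes to 32-bit lanes, counts there, subtracts the 32 - kNumOfBitsInT surplus zeros from zero-extension, and clamps to the lane width; TrailingZeroCount then reduces to HighestSetBitIndex of the isolated lowest bit. Scalar model for u8 (illustrative; __builtin_clz assumed available on GCC/Clang):

#include <cstdint>

// Mirrors the vector code's Min(lzcnt32 - 24, 8): 24 of the counted zeros
// come from zero-extending the byte, and x == 0 must yield exactly 8.
uint8_t Lzcnt8(uint8_t x) {
  const int lz32 = x ? __builtin_clz(x) : 32;  // clz(0) is undefined
  const int adjusted = lz32 - 24;
  return static_cast<uint8_t>(adjusted > 8 ? 8 : adjusted);
}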