@img/sharp-libvips-dev 1.2.0 → 1.2.2-rc.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. package/include/ffi.h +3 -3
  2. package/include/harfbuzz/hb-deprecated.h +4 -4
  3. package/include/harfbuzz/hb-font.h +120 -9
  4. package/include/harfbuzz/hb-version.h +3 -3
  5. package/include/hwy/abort.h +2 -19
  6. package/include/hwy/aligned_allocator.h +11 -7
  7. package/include/hwy/auto_tune.h +504 -0
  8. package/include/hwy/base.h +425 -104
  9. package/include/hwy/cache_control.h +16 -0
  10. package/include/hwy/detect_compiler_arch.h +32 -1
  11. package/include/hwy/detect_targets.h +251 -67
  12. package/include/hwy/foreach_target.h +35 -0
  13. package/include/hwy/highway.h +185 -76
  14. package/include/hwy/nanobenchmark.h +1 -19
  15. package/include/hwy/ops/arm_neon-inl.h +969 -458
  16. package/include/hwy/ops/arm_sve-inl.h +1137 -359
  17. package/include/hwy/ops/emu128-inl.h +97 -11
  18. package/include/hwy/ops/generic_ops-inl.h +1222 -34
  19. package/include/hwy/ops/loongarch_lasx-inl.h +4664 -0
  20. package/include/hwy/ops/loongarch_lsx-inl.h +5933 -0
  21. package/include/hwy/ops/ppc_vsx-inl.h +306 -126
  22. package/include/hwy/ops/rvv-inl.h +546 -51
  23. package/include/hwy/ops/scalar-inl.h +77 -22
  24. package/include/hwy/ops/set_macros-inl.h +138 -17
  25. package/include/hwy/ops/shared-inl.h +50 -10
  26. package/include/hwy/ops/wasm_128-inl.h +137 -92
  27. package/include/hwy/ops/x86_128-inl.h +773 -214
  28. package/include/hwy/ops/x86_256-inl.h +712 -255
  29. package/include/hwy/ops/x86_512-inl.h +429 -753
  30. package/include/hwy/ops/x86_avx3-inl.h +501 -0
  31. package/include/hwy/per_target.h +2 -1
  32. package/include/hwy/profiler.h +622 -486
  33. package/include/hwy/targets.h +62 -20
  34. package/include/hwy/timer-inl.h +8 -160
  35. package/include/hwy/timer.h +170 -3
  36. package/include/hwy/x86_cpuid.h +81 -0
  37. package/include/libheif/heif_cxx.h +25 -5
  38. package/include/libheif/heif_regions.h +5 -5
  39. package/include/libheif/heif_version.h +2 -2
  40. package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
  41. package/include/libxml2/libxml/xmlversion.h +4 -4
  42. package/include/pango-1.0/pango/pango-enum-types.h +3 -0
  43. package/include/pango-1.0/pango/pango-features.h +3 -3
  44. package/include/pango-1.0/pango/pango-font.h +30 -0
  45. package/include/pango-1.0/pango/pango-version-macros.h +26 -0
  46. package/include/pixman-1/pixman-version.h +2 -2
  47. package/include/webp/decode.h +11 -2
  48. package/include/webp/demux.h +2 -0
  49. package/include/webp/encode.h +2 -0
  50. package/include/webp/mux_types.h +1 -0
  51. package/include/webp/sharpyuv/sharpyuv.h +1 -1
  52. package/include/webp/types.h +2 -2
  53. package/include/zlib.h +3 -3
  54. package/package.json +1 -1
  55. package/versions.json +11 -11
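
The largest additions (items 19 and 20 above) are new LoongArch LSX/LASX backends for Google Highway, vendored under package/include/hwy/ops/. Application code does not call the __lasx_* intrinsic wrappers shown in the diff below directly; it goes through Highway's portable ops, which resolve to this header when the LASX target is selected. A minimal static-dispatch sketch of that usage is shown here for orientation only (Highway public API; the function name and the omitted remainder handling are illustrative and not taken from this package):

  #include "hwy/highway.h"

  namespace hn = hwy::HWY_NAMESPACE;

  // Element-wise out[i] = a[i] * b[i] over full vectors. On a LASX build,
  // Load/Mul/Store lower to the Vec256/__lasx_* wrappers added in
  // loongarch_lasx-inl.h below.
  HWY_ATTR void MulArrays(const float* HWY_RESTRICT a,
                          const float* HWY_RESTRICT b,
                          float* HWY_RESTRICT out, size_t n) {
    const hn::ScalableTag<float> d;  // 8 float lanes on 256-bit LASX
    const size_t N = hn::Lanes(d);
    size_t i = 0;
    for (; i + N <= n; i += N) {
      const auto va = hn::Load(d, a + i);
      const auto vb = hn::Load(d, b + i);
      hn::Store(hn::Mul(va, vb), d, out + i);
    }
    // Remainder lanes (i < n) omitted here for brevity.
  }
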
@@ -0,0 +1,4664 @@
1
+ // Copyright 2019 Google LLC
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ // 256-bit LASX vectors and operations.
16
+ // External include guard in highway.h - see comment there.
17
+
18
+ #include <lasxintrin.h>
19
+
20
+ #include "hwy/ops/loongarch_lsx-inl.h"
21
+ #include "hwy/ops/shared-inl.h"
22
+
23
+ HWY_BEFORE_NAMESPACE();
24
+ namespace hwy {
25
+ namespace HWY_NAMESPACE {
26
+ namespace detail {
27
+
28
+ template <typename T>
29
+ struct Raw256 {
30
+ using type = __m256i;
31
+ };
32
+ template <>
33
+ struct Raw256<float> {
34
+ using type = __m256;
35
+ };
36
+ template <>
37
+ struct Raw256<double> {
38
+ using type = __m256d;
39
+ };
40
+
41
+ } // namespace detail
42
+
43
+ template <typename T>
44
+ class Vec256 {
45
+ using Raw = typename detail::Raw256<T>::type;
46
+
47
+ public:
48
+ using PrivateT = T; // only for DFromV
49
+ static constexpr size_t kPrivateN = 32 / sizeof(T); // only for DFromV
50
+
51
+ // Compound assignment. Only usable if there is a corresponding non-member
52
+ // binary operator overload. For example, only f32 and f64 support division.
53
+ HWY_INLINE Vec256& operator*=(const Vec256 other) {
54
+ return *this = (*this * other);
55
+ }
56
+ HWY_INLINE Vec256& operator/=(const Vec256 other) {
57
+ return *this = (*this / other);
58
+ }
59
+ HWY_INLINE Vec256& operator+=(const Vec256 other) {
60
+ return *this = (*this + other);
61
+ }
62
+ HWY_INLINE Vec256& operator-=(const Vec256 other) {
63
+ return *this = (*this - other);
64
+ }
65
+ HWY_INLINE Vec256& operator%=(const Vec256 other) {
66
+ return *this = (*this % other);
67
+ }
68
+ HWY_INLINE Vec256& operator&=(const Vec256 other) {
69
+ return *this = (*this & other);
70
+ }
71
+ HWY_INLINE Vec256& operator|=(const Vec256 other) {
72
+ return *this = (*this | other);
73
+ }
74
+ HWY_INLINE Vec256& operator^=(const Vec256 other) {
75
+ return *this = (*this ^ other);
76
+ }
77
+
78
+ Raw raw;
79
+ };
80
+
81
+ namespace detail {
82
+
83
+ template <typename T>
84
+ using RawMask256 = typename Raw256<T>::type;
85
+
86
+ } // namespace detail
87
+
88
+ template <typename T>
89
+ struct Mask256 {
90
+ using Raw = typename detail::RawMask256<T>;
91
+
92
+ using PrivateT = T; // only for DFromM
93
+ static constexpr size_t kPrivateN = 32 / sizeof(T); // only for DFromM
94
+
95
+ Raw raw;
96
+ };
97
+
98
+ template <typename T>
99
+ using Full256 = Simd<T, 32 / sizeof(T), 0>;
100
+
101
+ // ------------------------------ Zero
102
+
103
+ // Cannot use VFromD here because it is defined in terms of Zero.
104
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
105
+ HWY_API Vec256<TFromD<D>> Zero(D /* tag */) {
106
+ return Vec256<TFromD<D>>{__lasx_xvreplgr2vr_d(0)};
107
+ }
108
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_BF16_D(D)>
109
+ HWY_API Vec256<bfloat16_t> Zero(D /* tag */) {
110
+ return Vec256<bfloat16_t>{__lasx_xvreplgr2vr_d(0)};
111
+ }
112
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
113
+ HWY_API Vec256<float16_t> Zero(D /* tag */) {
114
+ return Vec256<float16_t>{__lasx_xvreplgr2vr_d(0)};
115
+ }
116
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
117
+ HWY_API Vec256<float> Zero(D /* tag */) {
118
+ return Vec256<float>{reinterpret_cast<__m256>(__lasx_xvreplgr2vr_d(0))};
119
+ }
120
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
121
+ HWY_API Vec256<double> Zero(D /* tag */) {
122
+ return Vec256<double>{reinterpret_cast<__m256d>(__lasx_xvreplgr2vr_d(0))};
123
+ }
124
+
125
+ // ------------------------------ BitCast
126
+
127
+ namespace detail {
128
+
129
+ HWY_INLINE __m256i BitCastToInteger(__m256i v) { return v; }
130
+ HWY_INLINE __m256i BitCastToInteger(__m256 v) {
131
+ return reinterpret_cast<__m256i>(v);
132
+ }
133
+ HWY_INLINE __m256i BitCastToInteger(__m256d v) {
134
+ return reinterpret_cast<__m256i>(v);
135
+ }
136
+
137
+ template <typename T>
138
+ HWY_INLINE Vec256<uint8_t> BitCastToByte(Vec256<T> v) {
139
+ return Vec256<uint8_t>{BitCastToInteger(v.raw)};
140
+ }
141
+
142
+ // Cannot rely on function overloading because return types differ.
143
+ template <typename T>
144
+ struct BitCastFromInteger256 {
145
+ HWY_INLINE __m256i operator()(__m256i v) { return v; }
146
+ };
147
+ template <>
148
+ struct BitCastFromInteger256<float> {
149
+ HWY_INLINE __m256 operator()(__m256i v) {
150
+ return reinterpret_cast<__m256>(v);
151
+ }
152
+ };
153
+ template <>
154
+ struct BitCastFromInteger256<double> {
155
+ HWY_INLINE __m256d operator()(__m256i v) {
156
+ return reinterpret_cast<__m256d>(v);
157
+ }
158
+ };
159
+
160
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
161
+ HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */, Vec256<uint8_t> v) {
162
+ return VFromD<D>{BitCastFromInteger256<TFromD<D>>()(v.raw)};
163
+ }
164
+
165
+ } // namespace detail
166
+
167
+ template <class D, HWY_IF_V_SIZE_D(D, 32), typename FromT>
168
+ HWY_API VFromD<D> BitCast(D d, Vec256<FromT> v) {
169
+ return detail::BitCastFromByte(d, detail::BitCastToByte(v));
170
+ }
171
+
172
+ // ------------------------------ Set
173
+
174
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
175
+ HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
176
+ return VFromD<D>{__lasx_xvreplgr2vr_b(static_cast<char>(t))}; // NOLINT
177
+ }
178
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI16_D(D)>
179
+ HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
180
+ return VFromD<D>{__lasx_xvreplgr2vr_h(static_cast<short>(t))}; // NOLINT
181
+ }
182
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
183
+ HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
184
+ return VFromD<D>{__lasx_xvreplgr2vr_w(static_cast<int>(t))};
185
+ }
186
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
187
+ HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
188
+ return VFromD<D>{__lasx_xvreplgr2vr_d(static_cast<long long>(t))}; // NOLINT
189
+ }
190
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
191
+ HWY_API Vec256<float> Set(D /* tag */, float t) {
192
+ return BitCast(D(), Vec256<int32_t>{__lasx_xvldrepl_w(&t, 0)});
193
+ }
194
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
195
+ HWY_API Vec256<double> Set(D /* tag */, double t) {
196
+ return BitCast(D(), Vec256<int64_t>{__lasx_xvldrepl_d(&t, 0)});
197
+ }
198
+
199
+ // ------------------------------ ResizeBitCast
200
+
201
+ // 32-byte vector to 32-byte vector
202
+ template <class D, class FromV, HWY_IF_V_SIZE_GT_V(FromV, 16),
203
+ HWY_IF_V_SIZE_D(D, HWY_MAX_LANES_V(FromV) * sizeof(TFromV<FromV>))>
204
+ HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
205
+ return BitCast(d, v);
206
+ }
207
+
208
+ // 32-byte vector to 16-byte vector
209
+ template <class D, class FromV, HWY_IF_V_SIZE_GT_V(FromV, 16),
210
+ HWY_IF_V_SIZE_D(D, 16)>
211
+ HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
212
+ const DFromV<decltype(v)> d_from;
213
+ const Half<decltype(d_from)> dh_from;
214
+ return BitCast(d, LowerHalf(dh_from, v));
215
+ }
216
+
217
+ // 32-byte vector to <= 8-byte vector
218
+ template <class D, class FromV, HWY_IF_V_SIZE_GT_V(FromV, 16),
219
+ HWY_IF_V_SIZE_LE_D(D, 8)>
220
+ HWY_API VFromD<D> ResizeBitCast(D /*d*/, FromV v) {
221
+ return VFromD<D>{ResizeBitCast(Full128<TFromD<D>>(), v).raw};
222
+ }
223
+
224
+ // <= 16-byte vector to 32-byte vector
225
+ template <class D, class FromV, HWY_IF_V_SIZE_LE_V(FromV, 16),
226
+ HWY_IF_V_SIZE_D(D, 32)>
227
+ HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
228
+ typedef uint64_t GccRawU64M128Vec __attribute__((__vector_size__(16)));
229
+
230
+ const GccRawU64M128Vec raw_v0 = reinterpret_cast<GccRawU64M128Vec>(v.raw);
231
+ #if HWY_COMPILER_CLANG && HWY_HAS_BUILTIN(__builtin_nondeterministic_value)
232
+ const GccRawU64M128Vec raw_v1 = __builtin_nondeterministic_value(raw_v0);
233
+ #else
234
+ const GccRawU64M128Vec raw_v1 = raw_v0;
235
+ #endif
236
+
237
+ const Repartition<uint64_t, decltype(d)> du64;
238
+ const Half<decltype(du64)> dh_u64;
239
+ return BitCast(
240
+ d,
241
+ Combine(du64, VFromD<decltype(dh_u64)>{reinterpret_cast<__m128i>(raw_v1)},
242
+ VFromD<decltype(dh_u64)>{reinterpret_cast<__m128i>(raw_v0)}));
243
+ }
244
+
245
+ // ------------------------------ Dup128VecFromValues
246
+
247
+ template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_D(D, 32)>
248
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
249
+ TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
250
+ TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
251
+ TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
252
+ TFromD<D> t11, TFromD<D> t12,
253
+ TFromD<D> t13, TFromD<D> t14,
254
+ TFromD<D> t15) {
255
+ typedef int8_t GccI8RawVectType __attribute__((__vector_size__(32)));
256
+ GccI8RawVectType raw_i8_vec = {
257
+ static_cast<char>(t0), static_cast<char>(t1), static_cast<char>(t2),
258
+ static_cast<char>(t3), static_cast<char>(t4), static_cast<char>(t5),
259
+ static_cast<char>(t6), static_cast<char>(t7), static_cast<char>(t8),
260
+ static_cast<char>(t9), static_cast<char>(t10), static_cast<char>(t11),
261
+ static_cast<char>(t12), static_cast<char>(t13), static_cast<char>(t14),
262
+ static_cast<char>(t15), static_cast<char>(t0), static_cast<char>(t1),
263
+ static_cast<char>(t2), static_cast<char>(t3), static_cast<char>(t4),
264
+ static_cast<char>(t5), static_cast<char>(t6), static_cast<char>(t7),
265
+ static_cast<char>(t8), static_cast<char>(t9), static_cast<char>(t10),
266
+ static_cast<char>(t11), static_cast<char>(t12), static_cast<char>(t13),
267
+ static_cast<char>(t14), static_cast<char>(t15)};
268
+ return VFromD<D>{reinterpret_cast<__m256i>(raw_i8_vec)};
269
+ }
270
+
271
+ template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_D(D, 32)>
272
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
273
+ TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
274
+ TFromD<D> t5, TFromD<D> t6,
275
+ TFromD<D> t7) {
276
+ typedef int16_t GccI16RawVectType __attribute__((__vector_size__(32)));
277
+ GccI16RawVectType raw_i16_vec = {
278
+ static_cast<int16_t>(t0), static_cast<int16_t>(t1),
279
+ static_cast<int16_t>(t2), static_cast<int16_t>(t3),
280
+ static_cast<int16_t>(t4), static_cast<int16_t>(t5),
281
+ static_cast<int16_t>(t6), static_cast<int16_t>(t7),
282
+ static_cast<int16_t>(t0), static_cast<int16_t>(t1),
283
+ static_cast<int16_t>(t2), static_cast<int16_t>(t3),
284
+ static_cast<int16_t>(t4), static_cast<int16_t>(t5),
285
+ static_cast<int16_t>(t6), static_cast<int16_t>(t7)};
286
+ return VFromD<D>{reinterpret_cast<__m256i>(raw_i16_vec)};
287
+ }
288
+
289
+ template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_D(D, 32)>
290
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
291
+ TFromD<D> t2, TFromD<D> t3) {
292
+ typedef int32_t GccI32RawVectType __attribute__((__vector_size__(32)));
293
+ GccI32RawVectType raw_i32_vec = {
294
+ static_cast<int32_t>(t0), static_cast<int32_t>(t1),
295
+ static_cast<int32_t>(t2), static_cast<int32_t>(t3),
296
+ static_cast<int32_t>(t0), static_cast<int32_t>(t1),
297
+ static_cast<int32_t>(t2), static_cast<int32_t>(t3)};
298
+ return VFromD<D>{reinterpret_cast<__m256i>(raw_i32_vec)};
299
+ }
300
+
301
+ template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_D(D, 32)>
302
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
303
+ TFromD<D> t2, TFromD<D> t3) {
304
+ typedef float GccF32RawVectType __attribute__((__vector_size__(32)));
305
+ GccF32RawVectType raw_f32_vec = {t0, t1, t2, t3, t0, t1, t2, t3};
306
+ return Vec256<float>{reinterpret_cast<__m256>(raw_f32_vec)};
307
+ }
308
+
309
+ template <class D, HWY_IF_UI64_D(D), HWY_IF_V_SIZE_D(D, 32)>
310
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
311
+ typedef int64_t GccI64RawVectType __attribute__((__vector_size__(32)));
312
+ const GccI64RawVectType raw_i64_vec = {
313
+ static_cast<int64_t>(t0), static_cast<int64_t>(t1),
314
+ static_cast<int64_t>(t0), static_cast<int64_t>(t1)};
315
+ return VFromD<D>{reinterpret_cast<__m256i>(raw_i64_vec)};
316
+ }
317
+
318
+ template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_D(D, 32)>
319
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
320
+ typedef double GccF64RawVectType __attribute__((__vector_size__(32)));
321
+ const GccF64RawVectType raw_f64_vec = {t0, t1, t0, t1};
322
+ return VFromD<D>{reinterpret_cast<__m256d>(raw_f64_vec)};
323
+ }
324
+
325
+ // ------------------------------ And
326
+
327
+ template <typename T>
328
+ HWY_API Vec256<T> And(Vec256<T> a, Vec256<T> b) {
329
+ const DFromV<decltype(a)> d; // for float16_t
330
+ const RebindToUnsigned<decltype(d)> du;
331
+ return BitCast(d, VFromD<decltype(du)>{__lasx_xvand_v(BitCast(du, a).raw,
332
+ BitCast(du, b).raw)});
333
+ }
334
+
335
+ // ------------------------------ AndNot
336
+
337
+ // Returns ~not_mask & mask.
338
+ template <typename T>
339
+ HWY_API Vec256<T> AndNot(Vec256<T> not_mask, Vec256<T> mask) {
340
+ const DFromV<decltype(mask)> d; // for float16_t
341
+ const RebindToUnsigned<decltype(d)> du;
342
+ return BitCast(d, VFromD<decltype(du)>{__lasx_xvandn_v(
343
+ BitCast(du, not_mask).raw, BitCast(du, mask).raw)});
344
+ }
345
+
346
+ // ------------------------------ Or
347
+
348
+ template <typename T>
349
+ HWY_API Vec256<T> Or(Vec256<T> a, Vec256<T> b) {
350
+ const DFromV<decltype(a)> d; // for float16_t
351
+ const RebindToUnsigned<decltype(d)> du;
352
+ return BitCast(d, VFromD<decltype(du)>{
353
+ __lasx_xvor_v(BitCast(du, a).raw, BitCast(du, b).raw)});
354
+ }
355
+
356
+ // ------------------------------ Xor
357
+
358
+ template <typename T>
359
+ HWY_API Vec256<T> Xor(Vec256<T> a, Vec256<T> b) {
360
+ const DFromV<decltype(a)> d; // for float16_t
361
+ const RebindToUnsigned<decltype(d)> du;
362
+ return BitCast(d, VFromD<decltype(du)>{__lasx_xvxor_v(BitCast(du, a).raw,
363
+ BitCast(du, b).raw)});
364
+ }
365
+
366
+ // ------------------------------ Not
367
+ template <typename T>
368
+ HWY_API Vec256<T> Not(const Vec256<T> v) {
369
+ const DFromV<decltype(v)> d;
370
+ const RebindToUnsigned<decltype(d)> du;
371
+ return BitCast(d, VFromD<decltype(du)>{__lasx_xvnor_v(BitCast(du, v).raw,
372
+ BitCast(du, v).raw)});
373
+ }
374
+
375
+ // ------------------------------ Xor3
376
+ template <typename T>
377
+ HWY_API Vec256<T> Xor3(Vec256<T> x1, Vec256<T> x2, Vec256<T> x3) {
378
+ return Xor(x1, Xor(x2, x3));
379
+ }
380
+
381
+ // ------------------------------ Or3
382
+ template <typename T>
383
+ HWY_API Vec256<T> Or3(Vec256<T> o1, Vec256<T> o2, Vec256<T> o3) {
384
+ return Or(o1, Or(o2, o3));
385
+ }
386
+
387
+ // ------------------------------ OrAnd
388
+ template <typename T>
389
+ HWY_API Vec256<T> OrAnd(Vec256<T> o, Vec256<T> a1, Vec256<T> a2) {
390
+ return Or(o, And(a1, a2));
391
+ }
392
+
393
+ // ------------------------------ IfVecThenElse
394
+ template <typename T>
395
+ HWY_API Vec256<T> IfVecThenElse(Vec256<T> mask, Vec256<T> yes, Vec256<T> no) {
396
+ return IfThenElse(MaskFromVec(mask), yes, no);
397
+ }
398
+
399
+ // ------------------------------ Operator overloads (internal-only if float)
400
+
401
+ template <typename T>
402
+ HWY_API Vec256<T> operator&(const Vec256<T> a, const Vec256<T> b) {
403
+ return And(a, b);
404
+ }
405
+
406
+ template <typename T>
407
+ HWY_API Vec256<T> operator|(const Vec256<T> a, const Vec256<T> b) {
408
+ return Or(a, b);
409
+ }
410
+
411
+ template <typename T>
412
+ HWY_API Vec256<T> operator^(const Vec256<T> a, const Vec256<T> b) {
413
+ return Xor(a, b);
414
+ }
415
+
416
+ // ------------------------------ PopulationCount
417
+
418
+ namespace detail {
419
+
420
+ template <typename T>
421
+ HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<1> /* tag */, Vec256<T> v) {
422
+ return Vec256<T>{__lasx_xvpcnt_b(v.raw)};
423
+ }
424
+ template <typename T>
425
+ HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<2> /* tag */, Vec256<T> v) {
426
+ return Vec256<T>{__lasx_xvpcnt_h(v.raw)};
427
+ }
428
+ template <typename T>
429
+ HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<4> /* tag */, Vec256<T> v) {
430
+ return Vec256<T>{__lasx_xvpcnt_w(v.raw)};
431
+ }
432
+ template <typename T>
433
+ HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<8> /* tag */, Vec256<T> v) {
434
+ return Vec256<T>{__lasx_xvpcnt_d(v.raw)};
435
+ }
436
+
437
+ } // namespace detail
438
+
439
+ template <typename T>
440
+ HWY_API Vec256<T> PopulationCount(Vec256<T> v) {
441
+ return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
442
+ }
443
+
444
+ // ------------------------------ Mask
445
+
446
+ // Mask and Vec are the same (true = FF..FF).
447
+ template <typename T>
448
+ HWY_API Mask256<T> MaskFromVec(const Vec256<T> v) {
449
+ return Mask256<T>{v.raw};
450
+ }
451
+
452
+ template <typename T>
453
+ HWY_API Vec256<T> VecFromMask(const Mask256<T> v) {
454
+ return Vec256<T>{v.raw};
455
+ }
456
+
457
+ // ------------------------------ IfThenElse
458
+
459
+ // mask ? yes : no
460
+ template <typename T>
461
+ HWY_API Vec256<T> IfThenElse(Mask256<T> mask, Vec256<T> yes, Vec256<T> no) {
462
+ const DFromV<decltype(yes)> d;
463
+ RebindToSigned<decltype(d)> di;
464
+ return BitCast(d, VFromD<decltype(di)>{__lasx_xvbitsel_v(
465
+ BitCast(di, no).raw, BitCast(di, yes).raw,
466
+ RebindMask(di, mask).raw)});
467
+ }
468
+
469
+ // mask ? yes : 0
470
+ template <typename T>
471
+ HWY_API Vec256<T> IfThenElseZero(Mask256<T> mask, Vec256<T> yes) {
472
+ return yes & VecFromMask(mask);
473
+ }
474
+
475
+ // mask ? 0 : no
476
+ template <typename T>
477
+ HWY_API Vec256<T> IfThenZeroElse(Mask256<T> mask, Vec256<T> no) {
478
+ return AndNot(VecFromMask(mask), no);
479
+ }
480
+
481
+ template <typename T>
482
+ HWY_API Vec256<T> ZeroIfNegative(Vec256<T> v) {
483
+ static_assert(IsSigned<T>(), "Only for float");
484
+ const DFromV<decltype(v)> d;
485
+ const auto zero = Zero(d);
486
+ return IfThenElse(v < zero, zero, v);
487
+ }
488
+
489
+ // ------------------------------ Mask logical
490
+
491
+ template <typename T>
492
+ HWY_API Mask256<T> Not(const Mask256<T> m) {
493
+ const Full256<T> d;
494
+ return MaskFromVec(Not(VecFromMask(d, m)));
495
+ }
496
+
497
+ template <typename T>
498
+ HWY_API Mask256<T> And(const Mask256<T> a, Mask256<T> b) {
499
+ const Full256<T> d;
500
+ return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
501
+ }
502
+
503
+ template <typename T>
504
+ HWY_API Mask256<T> AndNot(const Mask256<T> a, Mask256<T> b) {
505
+ const Full256<T> d;
506
+ return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
507
+ }
508
+
509
+ template <typename T>
510
+ HWY_API Mask256<T> Or(const Mask256<T> a, Mask256<T> b) {
511
+ const Full256<T> d;
512
+ return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
513
+ }
514
+
515
+ template <typename T>
516
+ HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {
517
+ const Full256<T> d;
518
+ return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
519
+ }
520
+
521
+ template <typename T>
522
+ HWY_API Mask256<T> ExclusiveNeither(const Mask256<T> a, Mask256<T> b) {
523
+ const Full256<T> d;
524
+ return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
525
+ }
526
+
527
+ // ================================================== COMPARE
528
+
529
+ // Comparisons fill a lane with 1-bits if the condition is true, else 0.
530
+
531
+ template <class DTo, HWY_IF_V_SIZE_D(DTo, 32), typename TFrom>
532
+ HWY_API MFromD<DTo> RebindMask(DTo d_to, Mask256<TFrom> m) {
533
+ static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size");
534
+ const Full256<TFrom> dfrom;
535
+ return MaskFromVec(BitCast(d_to, VecFromMask(dfrom, m)));
536
+ }
537
+
538
+ template <typename T>
539
+ HWY_API Mask256<T> TestBit(const Vec256<T> v, const Vec256<T> bit) {
540
+ static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
541
+ return (v & bit) == bit;
542
+ }
543
+
544
+ // ------------------------------ Equality
545
+
546
+ template <typename T, HWY_IF_T_SIZE(T, 1)>
547
+ HWY_API Mask256<T> operator==(Vec256<T> a, Vec256<T> b) {
548
+ return Mask256<T>{__lasx_xvseq_b(a.raw, b.raw)};
549
+ }
550
+
551
+ template <typename T, HWY_IF_UI16(T)>
552
+ HWY_API Mask256<T> operator==(Vec256<T> a, Vec256<T> b) {
553
+ return Mask256<T>{__lasx_xvseq_h(a.raw, b.raw)};
554
+ }
555
+
556
+ template <typename T, HWY_IF_UI32(T)>
557
+ HWY_API Mask256<T> operator==(Vec256<T> a, Vec256<T> b) {
558
+ return Mask256<T>{__lasx_xvseq_w(a.raw, b.raw)};
559
+ }
560
+
561
+ template <typename T, HWY_IF_UI64(T)>
562
+ HWY_API Mask256<T> operator==(Vec256<T> a, Vec256<T> b) {
563
+ return Mask256<T>{__lasx_xvseq_d(a.raw, b.raw)};
564
+ }
565
+
566
+ HWY_API Mask256<float> operator==(Vec256<float> a, Vec256<float> b) {
567
+ const DFromV<decltype(a)> d;
568
+ const RebindToSigned<decltype(d)> di;
569
+ return RebindMask(d, MFromD<decltype(di)>{__lasx_xvfcmp_ceq_s(a.raw, b.raw)});
570
+ }
571
+
572
+ HWY_API Mask256<double> operator==(Vec256<double> a, Vec256<double> b) {
573
+ const DFromV<decltype(a)> d;
574
+ const RebindToSigned<decltype(d)> di;
575
+ return RebindMask(d, MFromD<decltype(di)>{__lasx_xvfcmp_ceq_d(a.raw, b.raw)});
576
+ }
577
+
578
+ // ------------------------------ Inequality
579
+
580
+ template <typename T, HWY_IF_NOT_FLOAT3264(T)>
581
+ HWY_API Mask256<T> operator!=(Vec256<T> a, Vec256<T> b) {
582
+ return Not(a == b);
583
+ }
584
+ HWY_API Mask256<float> operator!=(Vec256<float> a, Vec256<float> b) {
585
+ const DFromV<decltype(a)> d;
586
+ const RebindToSigned<decltype(d)> di;
587
+ return RebindMask(d, MFromD<decltype(di)>{__lasx_xvfcmp_cne_s(a.raw, b.raw)});
588
+ }
589
+ HWY_API Mask256<double> operator!=(Vec256<double> a, Vec256<double> b) {
590
+ const DFromV<decltype(a)> d;
591
+ const RebindToSigned<decltype(d)> di;
592
+ return RebindMask(d, MFromD<decltype(di)>{__lasx_xvfcmp_cne_d(a.raw, b.raw)});
593
+ }
594
+
595
+ // ------------------------------ Strict inequality
596
+
597
+ namespace detail {
598
+
599
+ HWY_API Mask256<int8_t> Gt(hwy::SignedTag /*tag*/, Vec256<int8_t> a,
600
+ Vec256<int8_t> b) {
601
+ return Mask256<int8_t>{__lasx_xvslt_b(b.raw, a.raw)};
602
+ }
603
+ HWY_API Mask256<int16_t> Gt(hwy::SignedTag /*tag*/, Vec256<int16_t> a,
604
+ Vec256<int16_t> b) {
605
+ return Mask256<int16_t>{__lasx_xvslt_h(b.raw, a.raw)};
606
+ }
607
+ HWY_API Mask256<int32_t> Gt(hwy::SignedTag /*tag*/, Vec256<int32_t> a,
608
+ Vec256<int32_t> b) {
609
+ return Mask256<int32_t>{__lasx_xvslt_w(b.raw, a.raw)};
610
+ }
611
+ HWY_API Mask256<int64_t> Gt(hwy::SignedTag /*tag*/, Vec256<int64_t> a,
612
+ Vec256<int64_t> b) {
613
+ return Mask256<int64_t>{__lasx_xvslt_d(b.raw, a.raw)};
614
+ }
615
+
616
+ HWY_API Mask256<uint8_t> Gt(hwy::UnsignedTag /*tag*/, Vec256<uint8_t> a,
617
+ Vec256<uint8_t> b) {
618
+ return Mask256<uint8_t>{__lasx_xvslt_bu(b.raw, a.raw)};
619
+ }
620
+ HWY_API Mask256<uint16_t> Gt(hwy::UnsignedTag /*tag*/, Vec256<uint16_t> a,
621
+ Vec256<uint16_t> b) {
622
+ return Mask256<uint16_t>{__lasx_xvslt_hu(b.raw, a.raw)};
623
+ }
624
+ HWY_API Mask256<uint32_t> Gt(hwy::UnsignedTag /*tag*/, Vec256<uint32_t> a,
625
+ Vec256<uint32_t> b) {
626
+ return Mask256<uint32_t>{__lasx_xvslt_wu(b.raw, a.raw)};
627
+ }
628
+ HWY_API Mask256<uint64_t> Gt(hwy::UnsignedTag /*tag*/, Vec256<uint64_t> a,
629
+ Vec256<uint64_t> b) {
630
+ return Mask256<uint64_t>{__lasx_xvslt_du(b.raw, a.raw)};
631
+ }
632
+
633
+ HWY_API Mask256<float> Gt(hwy::FloatTag /*tag*/, Vec256<float> a,
634
+ Vec256<float> b) {
635
+ const DFromV<decltype(a)> d;
636
+ const RebindToSigned<decltype(d)> di;
637
+ return RebindMask(d, MFromD<decltype(di)>{__lasx_xvfcmp_clt_s(b.raw, a.raw)});
638
+ }
639
+ HWY_API Mask256<double> Gt(hwy::FloatTag /*tag*/, Vec256<double> a,
640
+ Vec256<double> b) {
641
+ const DFromV<decltype(a)> d;
642
+ const RebindToSigned<decltype(d)> di;
643
+ return RebindMask(d, MFromD<decltype(di)>{__lasx_xvfcmp_clt_d(b.raw, a.raw)});
644
+ }
645
+
646
+ } // namespace detail
647
+
648
+ template <typename T>
649
+ HWY_API Mask256<T> operator>(Vec256<T> a, Vec256<T> b) {
650
+ return detail::Gt(hwy::TypeTag<T>(), a, b);
651
+ }
652
+
653
+ // ------------------------------ Weak inequality
654
+
655
+ namespace detail {
656
+
657
+ template <typename T>
658
+ HWY_INLINE Mask256<T> Ge(hwy::SignedTag /*tag*/, Vec256<T> a, Vec256<T> b) {
659
+ return Not(b > a);
660
+ }
661
+
662
+ template <typename T>
663
+ HWY_INLINE Mask256<T> Ge(hwy::UnsignedTag /*tag*/, Vec256<T> a, Vec256<T> b) {
664
+ return Not(b > a);
665
+ }
666
+
667
+ HWY_INLINE Mask256<float> Ge(hwy::FloatTag /*tag*/, Vec256<float> a,
668
+ Vec256<float> b) {
669
+ const DFromV<decltype(a)> d;
670
+ const RebindToSigned<decltype(d)> di;
671
+ return RebindMask(d, MFromD<decltype(di)>{__lasx_xvfcmp_cle_s(b.raw, a.raw)});
672
+ }
673
+ HWY_INLINE Mask256<double> Ge(hwy::FloatTag /*tag*/, Vec256<double> a,
674
+ Vec256<double> b) {
675
+ const DFromV<decltype(a)> d;
676
+ const RebindToSigned<decltype(d)> di;
677
+ return RebindMask(d, MFromD<decltype(di)>{__lasx_xvfcmp_cle_d(b.raw, a.raw)});
678
+ }
679
+
680
+ } // namespace detail
681
+
682
+ template <typename T>
683
+ HWY_API Mask256<T> operator>=(Vec256<T> a, Vec256<T> b) {
684
+ return detail::Ge(hwy::TypeTag<T>(), a, b);
685
+ }
686
+
687
+ // ------------------------------ Reversed comparisons
688
+
689
+ template <typename T>
690
+ HWY_API Mask256<T> operator<(const Vec256<T> a, const Vec256<T> b) {
691
+ return b > a;
692
+ }
693
+
694
+ template <typename T>
695
+ HWY_API Mask256<T> operator<=(const Vec256<T> a, const Vec256<T> b) {
696
+ return b >= a;
697
+ }
698
+
699
+ // ------------------------------ Min (Gt, IfThenElse)
700
+
701
+ // Unsigned
702
+ HWY_API Vec256<uint8_t> Min(const Vec256<uint8_t> a, const Vec256<uint8_t> b) {
703
+ return Vec256<uint8_t>{__lasx_xvmin_bu(a.raw, b.raw)};
704
+ }
705
+ HWY_API Vec256<uint16_t> Min(const Vec256<uint16_t> a,
706
+ const Vec256<uint16_t> b) {
707
+ return Vec256<uint16_t>{__lasx_xvmin_hu(a.raw, b.raw)};
708
+ }
709
+ HWY_API Vec256<uint32_t> Min(const Vec256<uint32_t> a,
710
+ const Vec256<uint32_t> b) {
711
+ return Vec256<uint32_t>{__lasx_xvmin_wu(a.raw, b.raw)};
712
+ }
713
+ HWY_API Vec256<uint64_t> Min(const Vec256<uint64_t> a,
714
+ const Vec256<uint64_t> b) {
715
+ return Vec256<uint64_t>{__lasx_xvmin_du(a.raw, b.raw)};
716
+ }
717
+
718
+ // Signed
719
+ HWY_API Vec256<int8_t> Min(const Vec256<int8_t> a, const Vec256<int8_t> b) {
720
+ return Vec256<int8_t>{__lasx_xvmin_b(a.raw, b.raw)};
721
+ }
722
+ HWY_API Vec256<int16_t> Min(const Vec256<int16_t> a, const Vec256<int16_t> b) {
723
+ return Vec256<int16_t>{__lasx_xvmin_h(a.raw, b.raw)};
724
+ }
725
+ HWY_API Vec256<int32_t> Min(const Vec256<int32_t> a, const Vec256<int32_t> b) {
726
+ return Vec256<int32_t>{__lasx_xvmin_w(a.raw, b.raw)};
727
+ }
728
+ HWY_API Vec256<int64_t> Min(const Vec256<int64_t> a, const Vec256<int64_t> b) {
729
+ return Vec256<int64_t>{__lasx_xvmin_d(a.raw, b.raw)};
730
+ }
731
+
732
+ // Float
733
+ HWY_API Vec256<float> Min(const Vec256<float> a, const Vec256<float> b) {
734
+ return Vec256<float>{__lasx_xvfmin_s(a.raw, b.raw)};
735
+ }
736
+ HWY_API Vec256<double> Min(const Vec256<double> a, const Vec256<double> b) {
737
+ return Vec256<double>{__lasx_xvfmin_d(a.raw, b.raw)};
738
+ }
739
+
740
+ // ------------------------------ Max (Gt, IfThenElse)
741
+
742
+ // Unsigned
743
+ HWY_API Vec256<uint8_t> Max(const Vec256<uint8_t> a, const Vec256<uint8_t> b) {
744
+ return Vec256<uint8_t>{__lasx_xvmax_bu(a.raw, b.raw)};
745
+ }
746
+ HWY_API Vec256<uint16_t> Max(const Vec256<uint16_t> a,
747
+ const Vec256<uint16_t> b) {
748
+ return Vec256<uint16_t>{__lasx_xvmax_hu(a.raw, b.raw)};
749
+ }
750
+ HWY_API Vec256<uint32_t> Max(const Vec256<uint32_t> a,
751
+ const Vec256<uint32_t> b) {
752
+ return Vec256<uint32_t>{__lasx_xvmax_wu(a.raw, b.raw)};
753
+ }
754
+ HWY_API Vec256<uint64_t> Max(const Vec256<uint64_t> a,
755
+ const Vec256<uint64_t> b) {
756
+ return Vec256<uint64_t>{__lasx_xvmax_du(a.raw, b.raw)};
757
+ }
758
+
759
+ // Signed
760
+ HWY_API Vec256<int8_t> Max(const Vec256<int8_t> a, const Vec256<int8_t> b) {
761
+ return Vec256<int8_t>{__lasx_xvmax_b(a.raw, b.raw)};
762
+ }
763
+ HWY_API Vec256<int16_t> Max(const Vec256<int16_t> a, const Vec256<int16_t> b) {
764
+ return Vec256<int16_t>{__lasx_xvmax_h(a.raw, b.raw)};
765
+ }
766
+ HWY_API Vec256<int32_t> Max(const Vec256<int32_t> a, const Vec256<int32_t> b) {
767
+ return Vec256<int32_t>{__lasx_xvmax_w(a.raw, b.raw)};
768
+ }
769
+ HWY_API Vec256<int64_t> Max(const Vec256<int64_t> a, const Vec256<int64_t> b) {
770
+ return Vec256<int64_t>{__lasx_xvmax_d(a.raw, b.raw)};
771
+ }
772
+
773
+ // Float
774
+ HWY_API Vec256<float> Max(const Vec256<float> a, const Vec256<float> b) {
775
+ return Vec256<float>{__lasx_xvfmax_s(a.raw, b.raw)};
776
+ }
777
+ HWY_API Vec256<double> Max(const Vec256<double> a, const Vec256<double> b) {
778
+ return Vec256<double>{__lasx_xvfmax_d(a.raw, b.raw)};
779
+ }
780
+
781
+ // ------------------------------ MinMagnitude and MaxMagnitude
782
+
783
+ HWY_API Vec256<float> MinMagnitude(Vec256<float> a, Vec256<float> b) {
784
+ return Vec256<float>{__lasx_xvfmina_s(a.raw, b.raw)};
785
+ }
786
+ HWY_API Vec256<double> MinMagnitude(Vec256<double> a, Vec256<double> b) {
787
+ return Vec256<double>{__lasx_xvfmina_d(a.raw, b.raw)};
788
+ }
789
+
790
+ HWY_API Vec256<float> MaxMagnitude(Vec256<float> a, Vec256<float> b) {
791
+ return Vec256<float>{__lasx_xvfmaxa_s(a.raw, b.raw)};
792
+ }
793
+ HWY_API Vec256<double> MaxMagnitude(Vec256<double> a, Vec256<double> b) {
794
+ return Vec256<double>{__lasx_xvfmaxa_d(a.raw, b.raw)};
795
+ }
796
+
797
+ // ------------------------------ Iota
798
+
799
+ namespace detail {
800
+
801
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
802
+ HWY_INLINE VFromD<D> Iota0(D /*d*/) {
803
+ typedef int8_t GccI8RawVectType __attribute__((__vector_size__(32)));
804
+ const GccI8RawVectType raw_i8_vec = {
805
+ static_cast<char>(0), static_cast<char>(1), static_cast<char>(2),
806
+ static_cast<char>(3), static_cast<char>(4), static_cast<char>(5),
807
+ static_cast<char>(6), static_cast<char>(7), static_cast<char>(8),
808
+ static_cast<char>(9), static_cast<char>(10), static_cast<char>(11),
809
+ static_cast<char>(12), static_cast<char>(13), static_cast<char>(14),
810
+ static_cast<char>(15), static_cast<char>(16), static_cast<char>(17),
811
+ static_cast<char>(18), static_cast<char>(19), static_cast<char>(20),
812
+ static_cast<char>(21), static_cast<char>(22), static_cast<char>(23),
813
+ static_cast<char>(24), static_cast<char>(25), static_cast<char>(26),
814
+ static_cast<char>(27), static_cast<char>(28), static_cast<char>(29),
815
+ static_cast<char>(30), static_cast<char>(31)};
816
+ return VFromD<D>{reinterpret_cast<__m256i>(raw_i8_vec)};
817
+ }
818
+
819
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI16_D(D)>
820
+ HWY_INLINE VFromD<D> Iota0(D /*d*/) {
821
+ typedef int16_t GccI16RawVectType __attribute__((__vector_size__(32)));
822
+ const GccI16RawVectType raw_i16_vec = {
823
+ static_cast<int16_t>(0), static_cast<int16_t>(1),
824
+ static_cast<int16_t>(2), static_cast<int16_t>(3),
825
+ static_cast<int16_t>(4), static_cast<int16_t>(5),
826
+ static_cast<int16_t>(6), static_cast<int16_t>(7),
827
+ static_cast<int16_t>(8), static_cast<int16_t>(9),
828
+ static_cast<int16_t>(10), static_cast<int16_t>(11),
829
+ static_cast<int16_t>(12), static_cast<int16_t>(13),
830
+ static_cast<int16_t>(14), static_cast<int16_t>(15)};
831
+ return VFromD<D>{reinterpret_cast<__m256i>(raw_i16_vec)};
832
+ }
833
+
834
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
835
+ HWY_INLINE VFromD<D> Iota0(D /*d*/) {
836
+ typedef int32_t GccI32RawVectType __attribute__((__vector_size__(32)));
837
+ const GccI32RawVectType raw_i32_vec = {
838
+ static_cast<int32_t>(0), static_cast<int32_t>(1), static_cast<int32_t>(2),
839
+ static_cast<int32_t>(3), static_cast<int32_t>(4), static_cast<int32_t>(5),
840
+ static_cast<int32_t>(6), static_cast<int32_t>(7)};
841
+ return VFromD<D>{reinterpret_cast<__m256i>(raw_i32_vec)};
842
+ }
843
+
844
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
845
+ HWY_INLINE VFromD<D> Iota0(D /*d*/) {
846
+ typedef int64_t GccI64RawVectType __attribute__((__vector_size__(32)));
847
+ const GccI64RawVectType raw_i64_vec = {
848
+ static_cast<int64_t>(0), static_cast<int64_t>(1), static_cast<int64_t>(2),
849
+ static_cast<int64_t>(3)};
850
+ return VFromD<D>{reinterpret_cast<__m256i>(raw_i64_vec)};
851
+ }
852
+
853
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
854
+ HWY_INLINE VFromD<D> Iota0(D /*d*/) {
855
+ typedef float GccF32RawVectType __attribute__((__vector_size__(32)));
856
+ const GccF32RawVectType raw_f32_vec = {0.0f, 1.0f, 2.0f, 3.0f,
857
+ 4.0f, 5.0f, 6.0f, 7.0f};
858
+ return VFromD<D>{reinterpret_cast<__m256>(raw_f32_vec)};
859
+ }
860
+
861
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
862
+ HWY_INLINE VFromD<D> Iota0(D /*d*/) {
863
+ typedef double GccF64RawVectType __attribute__((__vector_size__(32)));
864
+ const GccF64RawVectType raw_f64_vec = {0.0, 1.0, 2.0, 3.0};
865
+ return VFromD<D>{reinterpret_cast<__m256d>(raw_f64_vec)};
866
+ }
867
+
868
+ } // namespace detail
869
+
870
+ template <class D, HWY_IF_V_SIZE_D(D, 32), typename T2>
871
+ HWY_API VFromD<D> Iota(D d, const T2 first) {
872
+ return detail::Iota0(d) + Set(d, ConvertScalarTo<TFromD<D>>(first));
873
+ }
874
+
875
+ // ------------------------------ FirstN (Iota, Lt)
876
+
877
+ template <class D, HWY_IF_V_SIZE_D(D, 32), class M = MFromD<D>>
878
+ HWY_API M FirstN(const D d, size_t n) {
879
+ constexpr size_t kN = MaxLanes(d);
880
+ n = HWY_MIN(n, kN);
881
+ const RebindToSigned<decltype(d)> di; // Signed comparisons are cheaper.
882
+ using TI = TFromD<decltype(di)>;
883
+ return RebindMask(d, detail::Iota0(di) < Set(di, static_cast<TI>(n)));
884
+ }
885
+
886
+ // ================================================== ARITHMETIC
887
+
888
+ // ------------------------------ Addition
889
+
890
+ // Unsigned
891
+ HWY_API Vec256<uint8_t> operator+(Vec256<uint8_t> a, Vec256<uint8_t> b) {
892
+ return Vec256<uint8_t>{__lasx_xvadd_b(a.raw, b.raw)};
893
+ }
894
+ HWY_API Vec256<uint16_t> operator+(Vec256<uint16_t> a, Vec256<uint16_t> b) {
895
+ return Vec256<uint16_t>{__lasx_xvadd_h(a.raw, b.raw)};
896
+ }
897
+ HWY_API Vec256<uint32_t> operator+(Vec256<uint32_t> a, Vec256<uint32_t> b) {
898
+ return Vec256<uint32_t>{__lasx_xvadd_w(a.raw, b.raw)};
899
+ }
900
+ HWY_API Vec256<uint64_t> operator+(Vec256<uint64_t> a, Vec256<uint64_t> b) {
901
+ return Vec256<uint64_t>{__lasx_xvadd_d(a.raw, b.raw)};
902
+ }
903
+
904
+ // Signed
905
+ HWY_API Vec256<int8_t> operator+(Vec256<int8_t> a, Vec256<int8_t> b) {
906
+ return Vec256<int8_t>{__lasx_xvadd_b(a.raw, b.raw)};
907
+ }
908
+ HWY_API Vec256<int16_t> operator+(Vec256<int16_t> a, Vec256<int16_t> b) {
909
+ return Vec256<int16_t>{__lasx_xvadd_h(a.raw, b.raw)};
910
+ }
911
+ HWY_API Vec256<int32_t> operator+(Vec256<int32_t> a, Vec256<int32_t> b) {
912
+ return Vec256<int32_t>{__lasx_xvadd_w(a.raw, b.raw)};
913
+ }
914
+ HWY_API Vec256<int64_t> operator+(Vec256<int64_t> a, Vec256<int64_t> b) {
915
+ return Vec256<int64_t>{__lasx_xvadd_d(a.raw, b.raw)};
916
+ }
917
+
918
+ HWY_API Vec256<float> operator+(Vec256<float> a, Vec256<float> b) {
919
+ return Vec256<float>{__lasx_xvfadd_s(a.raw, b.raw)};
920
+ }
921
+ HWY_API Vec256<double> operator+(Vec256<double> a, Vec256<double> b) {
922
+ return Vec256<double>{__lasx_xvfadd_d(a.raw, b.raw)};
923
+ }
924
+
925
+ template <typename T>
926
+ HWY_API Vec256<T> Add(Vec256<T> a, Vec256<T> b) {
927
+ return a + b;
928
+ }
929
+
930
+ // ------------------------------ Subtraction
931
+
932
+ // Unsigned
933
+ HWY_API Vec256<uint8_t> operator-(Vec256<uint8_t> a, Vec256<uint8_t> b) {
934
+ return Vec256<uint8_t>{__lasx_xvsub_b(a.raw, b.raw)};
935
+ }
936
+ HWY_API Vec256<uint16_t> operator-(Vec256<uint16_t> a, Vec256<uint16_t> b) {
937
+ return Vec256<uint16_t>{__lasx_xvsub_h(a.raw, b.raw)};
938
+ }
939
+ HWY_API Vec256<uint32_t> operator-(Vec256<uint32_t> a, Vec256<uint32_t> b) {
940
+ return Vec256<uint32_t>{__lasx_xvsub_w(a.raw, b.raw)};
941
+ }
942
+ HWY_API Vec256<uint64_t> operator-(Vec256<uint64_t> a, Vec256<uint64_t> b) {
943
+ return Vec256<uint64_t>{__lasx_xvsub_d(a.raw, b.raw)};
944
+ }
945
+
946
+ // Signed
947
+ HWY_API Vec256<int8_t> operator-(Vec256<int8_t> a, Vec256<int8_t> b) {
948
+ return Vec256<int8_t>{__lasx_xvsub_b(a.raw, b.raw)};
949
+ }
950
+ HWY_API Vec256<int16_t> operator-(Vec256<int16_t> a, Vec256<int16_t> b) {
951
+ return Vec256<int16_t>{__lasx_xvsub_h(a.raw, b.raw)};
952
+ }
953
+ HWY_API Vec256<int32_t> operator-(Vec256<int32_t> a, Vec256<int32_t> b) {
954
+ return Vec256<int32_t>{__lasx_xvsub_w(a.raw, b.raw)};
955
+ }
956
+ HWY_API Vec256<int64_t> operator-(Vec256<int64_t> a, Vec256<int64_t> b) {
957
+ return Vec256<int64_t>{__lasx_xvsub_d(a.raw, b.raw)};
958
+ }
959
+
960
+ HWY_API Vec256<float> operator-(Vec256<float> a, Vec256<float> b) {
961
+ return Vec256<float>{__lasx_xvfsub_s(a.raw, b.raw)};
962
+ }
963
+ HWY_API Vec256<double> operator-(Vec256<double> a, Vec256<double> b) {
964
+ return Vec256<double>{__lasx_xvfsub_d(a.raw, b.raw)};
965
+ }
966
+
967
+ // ------------------------------ SumsOf8
968
+ HWY_API Vec256<uint64_t> SumsOf8(Vec256<uint8_t> v) {
969
+ v.raw = __lasx_xvhaddw_hu_bu(v.raw, v.raw);
970
+ v.raw = __lasx_xvhaddw_wu_hu(v.raw, v.raw);
971
+ return Vec256<uint64_t>{__lasx_xvhaddw_du_wu(v.raw, v.raw)};
972
+ }
973
+ HWY_API Vec256<int64_t> SumsOf8(Vec256<int8_t> v) {
974
+ v.raw = __lasx_xvhaddw_h_b(v.raw, v.raw);
975
+ v.raw = __lasx_xvhaddw_w_h(v.raw, v.raw);
976
+ return Vec256<int64_t>{__lasx_xvhaddw_d_w(v.raw, v.raw)};
977
+ }
978
+
979
+ // ------------------------------ SaturatedAdd
980
+
981
+ // Returns a + b clamped to the destination range.
982
+
983
+ // Unsigned
984
+ HWY_API Vec256<uint8_t> SaturatedAdd(Vec256<uint8_t> a, Vec256<uint8_t> b) {
985
+ return Vec256<uint8_t>{__lasx_xvsadd_bu(a.raw, b.raw)};
986
+ }
987
+ HWY_API Vec256<uint16_t> SaturatedAdd(Vec256<uint16_t> a, Vec256<uint16_t> b) {
988
+ return Vec256<uint16_t>{__lasx_xvsadd_hu(a.raw, b.raw)};
989
+ }
990
+ HWY_API Vec256<uint32_t> SaturatedAdd(Vec256<uint32_t> a, Vec256<uint32_t> b) {
991
+ return Vec256<uint32_t>{__lasx_xvsadd_wu(a.raw, b.raw)};
992
+ }
993
+ HWY_API Vec256<uint64_t> SaturatedAdd(Vec256<uint64_t> a, Vec256<uint64_t> b) {
994
+ return Vec256<uint64_t>{__lasx_xvsadd_du(a.raw, b.raw)};
995
+ }
996
+
997
+ // Signed
998
+ HWY_API Vec256<int8_t> SaturatedAdd(Vec256<int8_t> a, Vec256<int8_t> b) {
999
+ return Vec256<int8_t>{__lasx_xvsadd_b(a.raw, b.raw)};
1000
+ }
1001
+ HWY_API Vec256<int16_t> SaturatedAdd(Vec256<int16_t> a, Vec256<int16_t> b) {
1002
+ return Vec256<int16_t>{__lasx_xvsadd_h(a.raw, b.raw)};
1003
+ }
1004
+ HWY_API Vec256<int32_t> SaturatedAdd(Vec256<int32_t> a, Vec256<int32_t> b) {
1005
+ return Vec256<int32_t>{__lasx_xvsadd_w(a.raw, b.raw)};
1006
+ }
1007
+ HWY_API Vec256<int64_t> SaturatedAdd(Vec256<int64_t> a, Vec256<int64_t> b) {
1008
+ return Vec256<int64_t>{__lasx_xvsadd_d(a.raw, b.raw)};
1009
+ }
1010
+
1011
+ // ------------------------------ SaturatedSub
1012
+
1013
+ // Returns a - b clamped to the destination range.
1014
+
1015
+ // Unsigned
1016
+ HWY_API Vec256<uint8_t> SaturatedSub(Vec256<uint8_t> a, Vec256<uint8_t> b) {
1017
+ return Vec256<uint8_t>{__lasx_xvssub_bu(a.raw, b.raw)};
1018
+ }
1019
+ HWY_API Vec256<uint16_t> SaturatedSub(Vec256<uint16_t> a, Vec256<uint16_t> b) {
1020
+ return Vec256<uint16_t>{__lasx_xvssub_hu(a.raw, b.raw)};
1021
+ }
1022
+ HWY_API Vec256<uint32_t> SaturatedSub(Vec256<uint32_t> a, Vec256<uint32_t> b) {
1023
+ return Vec256<uint32_t>{__lasx_xvssub_wu(a.raw, b.raw)};
1024
+ }
1025
+ HWY_API Vec256<uint64_t> SaturatedSub(Vec256<uint64_t> a, Vec256<uint64_t> b) {
1026
+ return Vec256<uint64_t>{__lasx_xvssub_du(a.raw, b.raw)};
1027
+ }
1028
+
1029
+ // Signed
1030
+ HWY_API Vec256<int8_t> SaturatedSub(Vec256<int8_t> a, Vec256<int8_t> b) {
1031
+ return Vec256<int8_t>{__lasx_xvssub_b(a.raw, b.raw)};
1032
+ }
1033
+ HWY_API Vec256<int16_t> SaturatedSub(Vec256<int16_t> a, Vec256<int16_t> b) {
1034
+ return Vec256<int16_t>{__lasx_xvssub_h(a.raw, b.raw)};
1035
+ }
1036
+ HWY_API Vec256<int32_t> SaturatedSub(Vec256<int32_t> a, Vec256<int32_t> b) {
1037
+ return Vec256<int32_t>{__lasx_xvssub_w(a.raw, b.raw)};
1038
+ }
1039
+ HWY_API Vec256<int64_t> SaturatedSub(Vec256<int64_t> a, Vec256<int64_t> b) {
1040
+ return Vec256<int64_t>{__lasx_xvssub_d(a.raw, b.raw)};
1041
+ }
1042
+
1043
+ // ------------------------------ Average
1044
+
1045
+ // Returns (a + b + 1) / 2
1046
+
1047
+ // Signed and unsigned
1048
+ HWY_API Vec256<int8_t> AverageRound(Vec256<int8_t> a, Vec256<int8_t> b) {
1049
+ return Vec256<int8_t>{__lasx_xvavgr_b(a.raw, b.raw)};
1050
+ }
1051
+ HWY_API Vec256<uint8_t> AverageRound(Vec256<uint8_t> a, Vec256<uint8_t> b) {
1052
+ return Vec256<uint8_t>{__lasx_xvavgr_bu(a.raw, b.raw)};
1053
+ }
1054
+ HWY_API Vec256<int16_t> AverageRound(Vec256<int16_t> a, Vec256<int16_t> b) {
1055
+ return Vec256<int16_t>{__lasx_xvavgr_h(a.raw, b.raw)};
1056
+ }
1057
+ HWY_API Vec256<uint16_t> AverageRound(Vec256<uint16_t> a, Vec256<uint16_t> b) {
1058
+ return Vec256<uint16_t>{__lasx_xvavgr_hu(a.raw, b.raw)};
1059
+ }
1060
+ HWY_API Vec256<int32_t> AverageRound(Vec256<int32_t> a, Vec256<int32_t> b) {
1061
+ return Vec256<int32_t>{__lasx_xvavgr_w(a.raw, b.raw)};
1062
+ }
1063
+ HWY_API Vec256<uint32_t> AverageRound(Vec256<uint32_t> a, Vec256<uint32_t> b) {
1064
+ return Vec256<uint32_t>{__lasx_xvavgr_wu(a.raw, b.raw)};
1065
+ }
1066
+ HWY_API Vec256<int64_t> AverageRound(Vec256<int64_t> a, Vec256<int64_t> b) {
1067
+ return Vec256<int64_t>{__lasx_xvavgr_d(a.raw, b.raw)};
1068
+ }
1069
+ HWY_API Vec256<uint64_t> AverageRound(Vec256<uint64_t> a, Vec256<uint64_t> b) {
1070
+ return Vec256<uint64_t>{__lasx_xvavgr_du(a.raw, b.raw)};
1071
+ }
1072
+
1073
+ // ------------------------------ Abs (Sub)
1074
+
1075
+ // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
1076
+ HWY_API Vec256<int8_t> Abs(Vec256<int8_t> v) {
1077
+ return Vec256<int8_t>{__lasx_xvabsd_b(v.raw, __lasx_xvreplgr2vr_b(0))};
1078
+ }
1079
+ HWY_API Vec256<int16_t> Abs(const Vec256<int16_t> v) {
1080
+ return Vec256<int16_t>{__lasx_xvabsd_h(v.raw, __lasx_xvreplgr2vr_h(0))};
1081
+ }
1082
+ HWY_API Vec256<int32_t> Abs(const Vec256<int32_t> v) {
1083
+ return Vec256<int32_t>{__lasx_xvabsd_w(v.raw, __lasx_xvreplgr2vr_w(0))};
1084
+ }
1085
+ HWY_API Vec256<int64_t> Abs(const Vec256<int64_t> v) {
1086
+ return Vec256<int64_t>{__lasx_xvabsd_d(v.raw, __lasx_xvreplgr2vr_d(0))};
1087
+ }
1088
+
1089
+ // ------------------------------ Integer AbsDiff
1090
+ HWY_API Vec256<int8_t> AbsDiff(const Vec256<int8_t> a, Vec256<int8_t> b) {
1091
+ return Vec256<int8_t>{__lasx_xvabsd_b(a.raw, b.raw)};
1092
+ }
1093
+ HWY_API Vec256<int16_t> AbsDiff(const Vec256<int16_t> a, Vec256<int16_t> b) {
1094
+ return Vec256<int16_t>{__lasx_xvabsd_h(a.raw, b.raw)};
1095
+ }
1096
+ HWY_API Vec256<int32_t> AbsDiff(const Vec256<int32_t> a, Vec256<int32_t> b) {
1097
+ return Vec256<int32_t>{__lasx_xvabsd_w(a.raw, b.raw)};
1098
+ }
1099
+ HWY_API Vec256<int64_t> AbsDiff(const Vec256<int64_t> a, Vec256<int64_t> b) {
1100
+ return Vec256<int64_t>{__lasx_xvabsd_d(a.raw, b.raw)};
1101
+ }
1102
+
1103
+ HWY_API Vec256<uint8_t> AbsDiff(const Vec256<uint8_t> a, Vec256<uint8_t> b) {
1104
+ return Vec256<uint8_t>{__lasx_xvabsd_bu(a.raw, b.raw)};
1105
+ }
1106
+ HWY_API Vec256<uint16_t> AbsDiff(const Vec256<uint16_t> a, Vec256<uint16_t> b) {
1107
+ return Vec256<uint16_t>{__lasx_xvabsd_hu(a.raw, b.raw)};
1108
+ }
1109
+ HWY_API Vec256<uint32_t> AbsDiff(const Vec256<uint32_t> a, Vec256<uint32_t> b) {
1110
+ return Vec256<uint32_t>{__lasx_xvabsd_wu(a.raw, b.raw)};
1111
+ }
1112
+ HWY_API Vec256<uint64_t> AbsDiff(const Vec256<uint64_t> a, Vec256<uint64_t> b) {
1113
+ return Vec256<uint64_t>{__lasx_xvabsd_du(a.raw, b.raw)};
1114
+ }
1115
+
1116
+ // ------------------------------ Integer multiplication
1117
+
1118
+ // Unsigned
1119
+ HWY_API Vec256<uint8_t> operator*(Vec256<uint8_t> a, Vec256<uint8_t> b) {
1120
+ return Vec256<uint8_t>{__lasx_xvmul_b(a.raw, b.raw)};
1121
+ }
1122
+ HWY_API Vec256<uint16_t> operator*(Vec256<uint16_t> a, Vec256<uint16_t> b) {
1123
+ return Vec256<uint16_t>{__lasx_xvmul_h(a.raw, b.raw)};
1124
+ }
1125
+ HWY_API Vec256<uint32_t> operator*(Vec256<uint32_t> a, Vec256<uint32_t> b) {
1126
+ return Vec256<uint32_t>{__lasx_xvmul_w(a.raw, b.raw)};
1127
+ }
1128
+ HWY_API Vec256<uint64_t> operator*(Vec256<uint64_t> a, Vec256<uint64_t> b) {
1129
+ return Vec256<uint64_t>{__lasx_xvmul_d(a.raw, b.raw)};
1130
+ }
1131
+
1132
+ // Signed
1133
+ HWY_API Vec256<int8_t> operator*(Vec256<int8_t> a, Vec256<int8_t> b) {
1134
+ return Vec256<int8_t>{__lasx_xvmul_b(a.raw, b.raw)};
1135
+ }
1136
+ HWY_API Vec256<int16_t> operator*(Vec256<int16_t> a, Vec256<int16_t> b) {
1137
+ return Vec256<int16_t>{__lasx_xvmul_h(a.raw, b.raw)};
1138
+ }
1139
+ HWY_API Vec256<int32_t> operator*(Vec256<int32_t> a, Vec256<int32_t> b) {
1140
+ return Vec256<int32_t>{__lasx_xvmul_w(a.raw, b.raw)};
1141
+ }
1142
+ HWY_API Vec256<int64_t> operator*(Vec256<int64_t> a, Vec256<int64_t> b) {
1143
+ return Vec256<int64_t>{__lasx_xvmul_d(a.raw, b.raw)};
1144
+ }
1145
+
1146
+ HWY_API Vec256<uint8_t> MulHigh(Vec256<uint8_t> a, Vec256<uint8_t> b) {
1147
+ return Vec256<uint8_t>{__lasx_xvmuh_bu(a.raw, b.raw)};
1148
+ }
1149
+ HWY_API Vec256<int8_t> MulHigh(Vec256<int8_t> a, Vec256<int8_t> b) {
1150
+ return Vec256<int8_t>{__lasx_xvmuh_b(a.raw, b.raw)};
1151
+ }
1152
+ HWY_API Vec256<uint16_t> MulHigh(Vec256<uint16_t> a, Vec256<uint16_t> b) {
1153
+ return Vec256<uint16_t>{__lasx_xvmuh_hu(a.raw, b.raw)};
1154
+ }
1155
+ HWY_API Vec256<int16_t> MulHigh(Vec256<int16_t> a, Vec256<int16_t> b) {
1156
+ return Vec256<int16_t>{__lasx_xvmuh_h(a.raw, b.raw)};
1157
+ }
1158
+ HWY_API Vec256<uint32_t> MulHigh(Vec256<uint32_t> a, Vec256<uint32_t> b) {
1159
+ return Vec256<uint32_t>{__lasx_xvmuh_wu(a.raw, b.raw)};
1160
+ }
1161
+ HWY_API Vec256<int32_t> MulHigh(Vec256<int32_t> a, Vec256<int32_t> b) {
1162
+ return Vec256<int32_t>{__lasx_xvmuh_w(a.raw, b.raw)};
1163
+ }
1164
+ HWY_API Vec256<uint64_t> MulHigh(Vec256<uint64_t> a, Vec256<uint64_t> b) {
1165
+ return Vec256<uint64_t>{__lasx_xvmuh_du(a.raw, b.raw)};
1166
+ }
1167
+ HWY_API Vec256<int64_t> MulHigh(Vec256<int64_t> a, Vec256<int64_t> b) {
1168
+ return Vec256<int64_t>{__lasx_xvmuh_d(a.raw, b.raw)};
1169
+ }
1170
+
1171
+ // Multiplies even lanes (0, 2 ..) and places the double-wide result into
1172
+ // even and the upper half into its odd neighbor lane.
1173
+ HWY_API Vec256<int16_t> MulEven(Vec256<int8_t> a, Vec256<int8_t> b) {
1174
+ return Vec256<int16_t>{__lasx_xvmulwev_h_b(a.raw, b.raw)};
1175
+ }
1176
+ HWY_API Vec256<uint16_t> MulEven(Vec256<uint8_t> a, Vec256<uint8_t> b) {
1177
+ return Vec256<uint16_t>{__lasx_xvmulwev_h_bu(a.raw, b.raw)};
1178
+ }
1179
+ HWY_API Vec256<int32_t> MulEven(Vec256<int16_t> a, Vec256<int16_t> b) {
1180
+ return Vec256<int32_t>{__lasx_xvmulwev_w_h(a.raw, b.raw)};
1181
+ }
1182
+ HWY_API Vec256<uint32_t> MulEven(Vec256<uint16_t> a, Vec256<uint16_t> b) {
1183
+ return Vec256<uint32_t>{__lasx_xvmulwev_w_hu(a.raw, b.raw)};
1184
+ }
1185
+ HWY_API Vec256<int64_t> MulEven(Vec256<int32_t> a, Vec256<int32_t> b) {
1186
+ return Vec256<int64_t>{__lasx_xvmulwev_d_w(a.raw, b.raw)};
1187
+ }
1188
+ HWY_API Vec256<uint64_t> MulEven(Vec256<uint32_t> a, Vec256<uint32_t> b) {
1189
+ return Vec256<uint64_t>{__lasx_xvmulwev_d_wu(a.raw, b.raw)};
1190
+ }
1191
+ template <typename T, HWY_IF_I64(T)>
1192
+ HWY_API Vec256<T> MulEven(Vec256<T> a, Vec256<T> b) {
1193
+ return Vec256<T>{__lasx_xvmulwev_q_d(a.raw, b.raw)};
1194
+ }
1195
+ template <typename T, HWY_IF_U64(T)>
1196
+ HWY_API Vec256<T> MulEven(Vec256<T> a, Vec256<T> b) {
1197
+ return Vec256<T>{__lasx_xvmulwev_q_du(a.raw, b.raw)};
1198
+ }
1199
+
1200
+ HWY_API Vec256<int16_t> MulOdd(Vec256<int8_t> a, Vec256<int8_t> b) {
1201
+ return Vec256<int16_t>{__lasx_xvmulwod_h_b(a.raw, b.raw)};
1202
+ }
1203
+ HWY_API Vec256<uint16_t> MulOdd(Vec256<uint8_t> a, Vec256<uint8_t> b) {
1204
+ return Vec256<uint16_t>{__lasx_xvmulwod_h_bu(a.raw, b.raw)};
1205
+ }
1206
+ HWY_API Vec256<int32_t> MulOdd(Vec256<int16_t> a, Vec256<int16_t> b) {
1207
+ return Vec256<int32_t>{__lasx_xvmulwod_w_h(a.raw, b.raw)};
1208
+ }
1209
+ HWY_API Vec256<uint32_t> MulOdd(Vec256<uint16_t> a, Vec256<uint16_t> b) {
1210
+ return Vec256<uint32_t>{__lasx_xvmulwod_w_hu(a.raw, b.raw)};
1211
+ }
1212
+ HWY_API Vec256<int64_t> MulOdd(Vec256<int32_t> a, Vec256<int32_t> b) {
1213
+ return Vec256<int64_t>{__lasx_xvmulwod_d_w(a.raw, b.raw)};
1214
+ }
1215
+ HWY_API Vec256<uint64_t> MulOdd(Vec256<uint32_t> a, Vec256<uint32_t> b) {
1216
+ return Vec256<uint64_t>{__lasx_xvmulwod_d_wu(a.raw, b.raw)};
1217
+ }
1218
+ template <typename T, HWY_IF_I64(T)>
1219
+ HWY_API Vec256<T> MulOdd(Vec256<T> a, Vec256<T> b) {
1220
+ return Vec256<T>{__lasx_xvmulwod_q_d(a.raw, b.raw)};
1221
+ }
1222
+ template <typename T, HWY_IF_U64(T)>
1223
+ HWY_API Vec256<T> MulOdd(Vec256<T> a, Vec256<T> b) {
1224
+ return Vec256<T>{__lasx_xvmulwod_q_du(a.raw, b.raw)};
1225
+ }
1226
+
1227
+ template <typename T, HWY_IF_I16(T)>
1228
+ HWY_API Vec256<T> MulFixedPoint15(Vec256<T> a, Vec256<T> b) {
1229
+ const auto i32_ev = MulEven(a, b);
1230
+ const auto i32_od = MulOdd(a, b);
1231
+ const auto i64_lo = InterleaveLower(i32_ev, i32_od);
1232
+ const auto i64_hi = InterleaveUpper(Full256<int32_t>(), i32_ev, i32_od);
1233
+ return Vec256<T>{__lasx_xvssrarni_h_w(i64_hi.raw, i64_lo.raw, 15)};
1234
+ }
1235
+
1236
+ // ------------------------------ Integer division
1237
+
1238
+ HWY_API Vec256<int8_t> operator/(const Vec256<int8_t> a,
1239
+ const Vec256<int8_t> b) {
1240
+ // Use inline assembly to avoid undefined behavior if any lanes of b are zero
1241
+ // or a[i] == LimitsMin<int8_t>() && b[i] == -1
1242
+ __m256i raw_result;
1243
+ __asm__("xvdiv.b %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
1244
+ return Vec256<int8_t>{raw_result};
1245
+ }
1246
+
1247
+ HWY_API Vec256<uint8_t> operator/(const Vec256<uint8_t> a,
1248
+ const Vec256<uint8_t> b) {
1249
+ // Use inline assembly to avoid undefined behavior if any lanes of b are zero
1250
+ __m256i raw_result;
1251
+ __asm__("xvdiv.bu %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
1252
+ return Vec256<uint8_t>{raw_result};
1253
+ }
1254
+
1255
+ HWY_API Vec256<int16_t> operator/(const Vec256<int16_t> a,
1256
+ const Vec256<int16_t> b) {
1257
+ // Use inline assembly to avoid undefined behavior if any lanes of b are zero
1258
+ // or a[i] == LimitsMin<int16_t>() && b[i] == -1
1259
+ __m256i raw_result;
1260
+ __asm__("xvdiv.h %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
1261
+ return Vec256<int16_t>{raw_result};
1262
+ }
1263
+
1264
+ HWY_API Vec256<uint16_t> operator/(const Vec256<uint16_t> a,
1265
+ const Vec256<uint16_t> b) {
1266
+ // Use inline assembly to avoid undefined behavior if any lanes of b are zero
1267
+ __m256i raw_result;
1268
+ __asm__("xvdiv.hu %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
1269
+ return Vec256<uint16_t>{raw_result};
1270
+ }
1271
+
1272
+ HWY_API Vec256<int32_t> operator/(const Vec256<int32_t> a,
1273
+ const Vec256<int32_t> b) {
1274
+ // Use inline assembly to avoid undefined behavior if any lanes of b are zero
1275
+ // or a[i] == LimitsMin<int32_t>() && b[i] == -1
1276
+ __m256i raw_result;
1277
+ __asm__("xvdiv.w %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
1278
+ return Vec256<int32_t>{raw_result};
1279
+ }
1280
+
1281
+ HWY_API Vec256<uint32_t> operator/(const Vec256<uint32_t> a,
1282
+ const Vec256<uint32_t> b) {
1283
+ // Use inline assembly to avoid undefined behavior if any lanes of b are zero
1284
+ __m256i raw_result;
1285
+ __asm__("xvdiv.wu %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
1286
+ return Vec256<uint32_t>{raw_result};
1287
+ }
1288
+
1289
+ HWY_API Vec256<int64_t> operator/(const Vec256<int64_t> a,
1290
+ const Vec256<int64_t> b) {
1291
+ // Use inline assembly to avoid undefined behavior if any lanes of b are zero
1292
+ // or a[i] == LimitsMin<int64_t>() && b[i] == -1
1293
+ __m256i raw_result;
1294
+ __asm__("xvdiv.d %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
1295
+ return Vec256<int64_t>{raw_result};
1296
+ }
1297
+
1298
+ HWY_API Vec256<uint64_t> operator/(const Vec256<uint64_t> a,
1299
+ const Vec256<uint64_t> b) {
1300
+ // Use inline assembly to avoid undefined behavior if any lanes of b are zero
1301
+ __m256i raw_result;
1302
+ __asm__("xvdiv.du %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
1303
+ return Vec256<uint64_t>{raw_result};
1304
+ }
1305
+
1306
+ // ------------------------------ Integer modulo
1307
+
1308
+ HWY_API Vec256<int8_t> operator%(const Vec256<int8_t> a,
1309
+ const Vec256<int8_t> b) {
1310
+ // Use inline assembly to avoid undefined behavior if any lanes of b are zero
1311
+ // or a[i] == LimitsMin<int8_t>() && b[i] == -1
1312
+ __m256i raw_result;
1313
+ __asm__("xvmod.b %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
1314
+ return Vec256<int8_t>{raw_result};
1315
+ }
1316
+
1317
+ HWY_API Vec256<uint8_t> operator%(const Vec256<uint8_t> a,
1318
+ const Vec256<uint8_t> b) {
1319
+ // Use inline assembly to avoid undefined behavior if any lanes of b are zero
1320
+ __m256i raw_result;
1321
+ __asm__("xvmod.bu %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
1322
+ return Vec256<uint8_t>{raw_result};
1323
+ }
1324
+
1325
+ HWY_API Vec256<int16_t> operator%(const Vec256<int16_t> a,
1326
+ const Vec256<int16_t> b) {
1327
+ // Use inline assembly to avoid undefined behavior if any lanes of b are zero
1328
+ // or a[i] == LimitsMin<int16_t>() && b[i] == -1
1329
+ __m256i raw_result;
1330
+ __asm__("xvmod.h %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
1331
+ return Vec256<int16_t>{raw_result};
1332
+ }
1333
+
1334
+ HWY_API Vec256<uint16_t> operator%(const Vec256<uint16_t> a,
1335
+ const Vec256<uint16_t> b) {
1336
+ // Use inline assembly to avoid undefined behavior if any lanes of b are zero
1337
+ __m256i raw_result;
1338
+ __asm__("xvmod.hu %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
1339
+ return Vec256<uint16_t>{raw_result};
1340
+ }
1341
+
1342
+ HWY_API Vec256<int32_t> operator%(const Vec256<int32_t> a,
1343
+ const Vec256<int32_t> b) {
1344
+ // Use inline assembly to avoid undefined behavior if any lanes of b are zero
1345
+ // or a[i] == LimitsMin<int32_t>() && b[i] == -1
1346
+ __m256i raw_result;
1347
+ __asm__("xvmod.w %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
1348
+ return Vec256<int32_t>{raw_result};
1349
+ }
1350
+
1351
+ HWY_API Vec256<uint32_t> operator%(const Vec256<uint32_t> a,
1352
+ const Vec256<uint32_t> b) {
1353
+ // Use inline assembly to avoid undefined behavior if any lanes of b are zero
1354
+ __m256i raw_result;
1355
+ __asm__("xvmod.wu %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
1356
+ return Vec256<uint32_t>{raw_result};
1357
+ }
1358
+
1359
+ HWY_API Vec256<int64_t> operator%(const Vec256<int64_t> a,
1360
+ const Vec256<int64_t> b) {
1361
+ // Use inline assembly to avoid undefined behavior if any lanes of b are zero
1362
+ // or a[i] == LimitsMin<int64_t>() && b[i] == -1
1363
+ __m256i raw_result;
1364
+ __asm__("xvmod.d %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
1365
+ return Vec256<int64_t>{raw_result};
1366
+ }
1367
+
1368
+ HWY_API Vec256<uint64_t> operator%(const Vec256<uint64_t> a,
1369
+ const Vec256<uint64_t> b) {
1370
+ // Use inline assembly to avoid undefined behavior if any lanes of b are zero
1371
+ __m256i raw_result;
1372
+ __asm__("xvmod.du %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
1373
+ return Vec256<uint64_t>{raw_result};
1374
+ }
1375
+
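For context on how these per-lane division and modulo wrappers are reached from user code, here is a minimal static-dispatch usage sketch against Highway's public API (hwy/highway.h). It is illustrative only and not part of the diffed header; the function name VectorDiv and the assumption that n is a multiple of the lane count are made up for the example. Per the comments above, the inline assembly exists so the compiler cannot exploit C++-level undefined behavior for zero divisors; the result such lanes actually produce is whatever the LoongArch ISA defines.

  #include <cstddef>
  #include <cstdint>
  #include "hwy/highway.h"

  namespace hn = hwy::HWY_NAMESPACE;

  // Element-wise a[i] / b[i]; for brevity, assumes n is a multiple of Lanes(d).
  void VectorDiv(const int32_t* HWY_RESTRICT a, const int32_t* HWY_RESTRICT b,
                 int32_t* HWY_RESTRICT out, size_t n) {
    const hn::ScalableTag<int32_t> d;
    for (size_t i = 0; i < n; i += hn::Lanes(d)) {
      const auto va = hn::LoadU(d, a + i);
      const auto vb = hn::LoadU(d, b + i);
      // When compiled for LASX, this resolves to the operator/ overloads above.
      hn::StoreU(va / vb, d, out + i);
    }
  }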
1376
+ // ------------------------------ ShiftLeft (Compile-time constant shifts)
1377
+
1378
+ template <int kBits, typename T, HWY_IF_UI8(T)>
1379
+ HWY_API Vec256<T> ShiftLeft(Vec256<T> v) {
1380
+ return Vec256<T>{__lasx_xvslli_b(v.raw, kBits)};
1381
+ }
1382
+
1383
+ template <int kBits, typename T, HWY_IF_UI16(T)>
1384
+ HWY_API Vec256<T> ShiftLeft(Vec256<T> v) {
1385
+ return Vec256<T>{__lasx_xvslli_h(v.raw, kBits)};
1386
+ }
1387
+
1388
+ template <int kBits, typename T, HWY_IF_UI32(T)>
1389
+ HWY_API Vec256<T> ShiftLeft(Vec256<T> v) {
1390
+ return Vec256<T>{__lasx_xvslli_w(v.raw, kBits)};
1391
+ }
1392
+
1393
+ template <int kBits, typename T, HWY_IF_UI64(T)>
1394
+ HWY_API Vec256<T> ShiftLeft(Vec256<T> v) {
1395
+ return Vec256<T>{__lasx_xvslli_d(v.raw, kBits)};
1396
+ }
1397
+
1398
+ // ------------------------------ ShiftRight (Compile-time constant shifts)
1399
+
1400
+ template <int kBits>
1401
+ HWY_API Vec256<uint8_t> ShiftRight(Vec256<uint8_t> v) {
1402
+ return Vec256<uint8_t>{__lasx_xvsrli_b(v.raw, kBits)};
1403
+ }
1404
+
1405
+ template <int kBits>
1406
+ HWY_API Vec256<uint16_t> ShiftRight(Vec256<uint16_t> v) {
1407
+ return Vec256<uint16_t>{__lasx_xvsrli_h(v.raw, kBits)};
1408
+ }
1409
+
1410
+ template <int kBits>
1411
+ HWY_API Vec256<uint32_t> ShiftRight(Vec256<uint32_t> v) {
1412
+ return Vec256<uint32_t>{__lasx_xvsrli_w(v.raw, kBits)};
1413
+ }
1414
+
1415
+ template <int kBits>
1416
+ HWY_API Vec256<uint64_t> ShiftRight(Vec256<uint64_t> v) {
1417
+ return Vec256<uint64_t>{__lasx_xvsrli_d(v.raw, kBits)};
1418
+ }
1419
+
1420
+ template <int kBits>
1421
+ HWY_API Vec256<int8_t> ShiftRight(Vec256<int8_t> v) {
1422
+ return Vec256<int8_t>{__lasx_xvsrai_b(v.raw, kBits)};
1423
+ }
1424
+
1425
+ template <int kBits>
1426
+ HWY_API Vec256<int16_t> ShiftRight(Vec256<int16_t> v) {
1427
+ return Vec256<int16_t>{__lasx_xvsrai_h(v.raw, kBits)};
1428
+ }
1429
+
1430
+ template <int kBits>
1431
+ HWY_API Vec256<int32_t> ShiftRight(Vec256<int32_t> v) {
1432
+ return Vec256<int32_t>{__lasx_xvsrai_w(v.raw, kBits)};
1433
+ }
1434
+
1435
+ template <int kBits>
1436
+ HWY_API Vec256<int64_t> ShiftRight(Vec256<int64_t> v) {
1437
+ return Vec256<int64_t>{__lasx_xvsrai_d(v.raw, kBits)};
1438
+ }
1439
+
1440
+ // ------------------------------ RoundingShiftRight
1441
+
1442
+ template <int kBits>
1443
+ HWY_API Vec256<int8_t> RoundingShiftRight(Vec256<int8_t> v) {
1444
+ return Vec256<int8_t>{__lasx_xvsrari_b(v.raw, kBits)};
1445
+ }
1446
+ template <int kBits>
1447
+ HWY_API Vec256<int16_t> RoundingShiftRight(Vec256<int16_t> v) {
1448
+ return Vec256<int16_t>{__lasx_xvsrari_h(v.raw, kBits)};
1449
+ }
1450
+ template <int kBits>
1451
+ HWY_API Vec256<int32_t> RoundingShiftRight(Vec256<int32_t> v) {
1452
+ return Vec256<int32_t>{__lasx_xvsrari_w(v.raw, kBits)};
1453
+ }
1454
+ template <int kBits>
1455
+ HWY_API Vec256<int64_t> RoundingShiftRight(Vec256<int64_t> v) {
1456
+ return Vec256<int64_t>{__lasx_xvsrari_d(v.raw, kBits)};
1457
+ }
1458
+
1459
+ template <int kBits>
1460
+ HWY_API Vec256<uint8_t> RoundingShiftRight(Vec256<uint8_t> v) {
1461
+ return Vec256<uint8_t>{__lasx_xvsrlri_b(v.raw, kBits)};
1462
+ }
1463
+ template <int kBits>
1464
+ HWY_API Vec256<uint16_t> RoundingShiftRight(Vec256<uint16_t> v) {
1465
+ return Vec256<uint16_t>{__lasx_xvsrlri_h(v.raw, kBits)};
1466
+ }
1467
+ template <int kBits>
1468
+ HWY_API Vec256<uint32_t> RoundingShiftRight(Vec256<uint32_t> v) {
1469
+ return Vec256<uint32_t>{__lasx_xvsrlri_w(v.raw, kBits)};
1470
+ }
1471
+ template <int kBits>
1472
+ HWY_API Vec256<uint64_t> RoundingShiftRight(Vec256<uint64_t> v) {
1473
+ return Vec256<uint64_t>{__lasx_xvsrlri_d(v.raw, kBits)};
1474
+ }
1475
+ // ------------------------------ RoundingShr
1476
+
1477
+ HWY_API Vec256<uint8_t> RoundingShr(Vec256<uint8_t> v, Vec256<uint8_t> bits) {
1478
+ return Vec256<uint8_t>{__lasx_xvsrlr_b(v.raw, bits.raw)};
1479
+ }
1480
+ HWY_API Vec256<uint16_t> RoundingShr(Vec256<uint16_t> v,
1481
+ Vec256<uint16_t> bits) {
1482
+ return Vec256<uint16_t>{__lasx_xvsrlr_h(v.raw, bits.raw)};
1483
+ }
1484
+ HWY_API Vec256<uint32_t> RoundingShr(Vec256<uint32_t> v,
1485
+ Vec256<uint32_t> bits) {
1486
+ return Vec256<uint32_t>{__lasx_xvsrlr_w(v.raw, bits.raw)};
1487
+ }
1488
+ HWY_API Vec256<uint64_t> RoundingShr(Vec256<uint64_t> v,
1489
+ Vec256<uint64_t> bits) {
1490
+ return Vec256<uint64_t>{__lasx_xvsrlr_d(v.raw, bits.raw)};
1491
+ }
1492
+
1493
+ HWY_API Vec256<int8_t> RoundingShr(Vec256<int8_t> v, Vec256<int8_t> bits) {
1494
+ return Vec256<int8_t>{__lasx_xvsrar_b(v.raw, bits.raw)};
1495
+ }
1496
+ HWY_API Vec256<int16_t> RoundingShr(Vec256<int16_t> v, Vec256<int16_t> bits) {
1497
+ return Vec256<int16_t>{__lasx_xvsrar_h(v.raw, bits.raw)};
1498
+ }
1499
+ HWY_API Vec256<int32_t> RoundingShr(Vec256<int32_t> v, Vec256<int32_t> bits) {
1500
+ return Vec256<int32_t>{__lasx_xvsrar_w(v.raw, bits.raw)};
1501
+ }
1502
+ HWY_API Vec256<int64_t> RoundingShr(Vec256<int64_t> v, Vec256<int64_t> bits) {
1503
+ return Vec256<int64_t>{__lasx_xvsrar_d(v.raw, bits.raw)};
1504
+ }
1505
+
1506
+ // ------------------------------ RoundingShiftRightSame (RoundingShr)
1507
+
1508
+ template <typename T>
1509
+ HWY_API Vec256<T> RoundingShiftRightSame(const Vec256<T> v, int bits) {
1510
+ return RoundingShr(v, Set(DFromV<decltype(v)>(), static_cast<T>(bits)));
1511
+ }
1512
+
1513
+ // ------------------------------ RotateRight (Compile-time constant shifts)
1514
+
1515
+ template <int kBits, typename T, HWY_IF_UI8(T)>
1516
+ HWY_API Vec256<T> RotateRight(const Vec256<T> v) {
1517
+ static_assert(0 <= kBits && kBits < 8, "Invalid shift count");
1518
+ if (kBits == 0) return v;
1519
+ return Vec256<T>{__lasx_xvrotri_b(v.raw, kBits)};
1520
+ }
1521
+
1522
+ template <int kBits, typename T, HWY_IF_UI16(T)>
1523
+ HWY_API Vec256<T> RotateRight(const Vec256<T> v) {
1524
+ static_assert(0 <= kBits && kBits < 16, "Invalid shift count");
1525
+ if (kBits == 0) return v;
1526
+ return Vec256<T>{__lasx_xvrotri_h(v.raw, kBits)};
1527
+ }
1528
+
1529
+ template <int kBits, typename T, HWY_IF_UI32(T)>
1530
+ HWY_API Vec256<T> RotateRight(const Vec256<T> v) {
1531
+ static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
1532
+ if (kBits == 0) return v;
1533
+ return Vec256<T>{__lasx_xvrotri_w(v.raw, kBits)};
1534
+ }
1535
+
1536
+ template <int kBits, typename T, HWY_IF_UI64(T)>
1537
+ HWY_API Vec256<T> RotateRight(const Vec256<T> v) {
1538
+ static_assert(0 <= kBits && kBits < 64, "Invalid shift count");
1539
+ if (kBits == 0) return v;
1540
+ return Vec256<T>{__lasx_xvrotri_d(v.raw, kBits)};
1541
+ }
1542
+
1543
+ // ------------------------------ Rol/Ror
1544
+ template <class T, HWY_IF_UI8(T)>
1545
+ HWY_API Vec256<T> Ror(Vec256<T> a, Vec256<T> b) {
1546
+ return Vec256<T>{__lasx_xvrotr_b(a.raw, b.raw)};
1547
+ }
1548
+
1549
+ template <class T, HWY_IF_UI16(T)>
1550
+ HWY_API Vec256<T> Ror(Vec256<T> a, Vec256<T> b) {
1551
+ return Vec256<T>{__lasx_xvrotr_h(a.raw, b.raw)};
1552
+ }
1553
+
1554
+ template <class T, HWY_IF_UI32(T)>
1555
+ HWY_API Vec256<T> Ror(Vec256<T> a, Vec256<T> b) {
1556
+ return Vec256<T>{__lasx_xvrotr_w(a.raw, b.raw)};
1557
+ }
1558
+
1559
+ template <class T, HWY_IF_UI64(T)>
1560
+ HWY_API Vec256<T> Ror(Vec256<T> a, Vec256<T> b) {
1561
+ return Vec256<T>{__lasx_xvrotr_d(a.raw, b.raw)};
1562
+ }
1563
+
1564
+ // ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
1565
+
1566
+ HWY_API Vec256<int8_t> BroadcastSignBit(const Vec256<int8_t> v) {
1567
+ return Vec256<int8_t>{__lasx_xvsrai_b(v.raw, 7)};
1568
+ }
1569
+
1570
+ HWY_API Vec256<int16_t> BroadcastSignBit(const Vec256<int16_t> v) {
1571
+ return Vec256<int16_t>{__lasx_xvsrai_h(v.raw, 15)};
1572
+ }
1573
+
1574
+ HWY_API Vec256<int32_t> BroadcastSignBit(const Vec256<int32_t> v) {
1575
+ return Vec256<int32_t>{__lasx_xvsrai_w(v.raw, 31)};
1576
+ }
1577
+
1578
+ HWY_API Vec256<int64_t> BroadcastSignBit(const Vec256<int64_t> v) {
1579
+ return Vec256<int64_t>{__lasx_xvsrai_d(v.raw, 63)};
1580
+ }
1581
+
1582
+ // ------------------------------ IfNegativeThenElse (BroadcastSignBit)
1583
+ template <typename T>
1584
+ HWY_API Vec256<T> IfNegativeThenElse(Vec256<T> v, Vec256<T> yes, Vec256<T> no) {
1585
+ static_assert(IsSigned<T>(), "Only works for signed/float");
1586
+ const DFromV<decltype(v)> d;
1587
+ const RebindToSigned<decltype(d)> di;
1588
+ const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
1589
+ return IfThenElse(mask, yes, no);
1590
+ }
1591
+
1592
+ // ------------------------------ IfNegativeThenNegOrUndefIfZero
1593
+
1594
+ HWY_API Vec256<int8_t> IfNegativeThenNegOrUndefIfZero(Vec256<int8_t> mask,
1595
+ Vec256<int8_t> v) {
1596
+ return Vec256<int8_t>{__lasx_xvsigncov_b(mask.raw, v.raw)};
1597
+ }
1598
+
1599
+ HWY_API Vec256<int16_t> IfNegativeThenNegOrUndefIfZero(Vec256<int16_t> mask,
1600
+ Vec256<int16_t> v) {
1601
+ return Vec256<int16_t>{__lasx_xvsigncov_h(mask.raw, v.raw)};
1602
+ }
1603
+
1604
+ HWY_API Vec256<int32_t> IfNegativeThenNegOrUndefIfZero(Vec256<int32_t> mask,
1605
+ Vec256<int32_t> v) {
1606
+ return Vec256<int32_t>{__lasx_xvsigncov_w(mask.raw, v.raw)};
1607
+ }
1608
+
1609
+ HWY_API Vec256<int64_t> IfNegativeThenNegOrUndefIfZero(Vec256<int64_t> mask,
1610
+ Vec256<int64_t> v) {
1611
+ return Vec256<int64_t>{__lasx_xvsigncov_d(mask.raw, v.raw)};
1612
+ }
1613
+
1614
+ // ------------------------------ ShiftLeftSame
1615
+
1616
+ template <typename T>
1617
+ HWY_API Vec256<T> ShiftLeftSame(const Vec256<T> v, const int bits) {
1618
+ return Shl(v, Set(DFromV<decltype(v)>(), static_cast<T>(bits)));
1619
+ }
1620
+
1621
+ // ------------------------------ ShiftRightSame (BroadcastSignBit)
1622
+
1623
+ HWY_API Vec256<uint8_t> ShiftRightSame(const Vec256<uint8_t> v,
1624
+ const int bits) {
1625
+ return Vec256<uint8_t>{__lasx_xvsrl_b(v.raw, __lasx_xvreplgr2vr_b(bits))};
1626
+ }
1627
+
1628
+ HWY_API Vec256<uint16_t> ShiftRightSame(const Vec256<uint16_t> v,
1629
+ const int bits) {
1630
+ return Vec256<uint16_t>{__lasx_xvsrl_h(v.raw, __lasx_xvreplgr2vr_h(bits))};
1631
+ }
1632
+
1633
+ HWY_API Vec256<uint32_t> ShiftRightSame(const Vec256<uint32_t> v,
1634
+ const int bits) {
1635
+ return Vec256<uint32_t>{__lasx_xvsrl_w(v.raw, __lasx_xvreplgr2vr_w(bits))};
1636
+ }
1637
+
1638
+ HWY_API Vec256<uint64_t> ShiftRightSame(const Vec256<uint64_t> v,
1639
+ const int bits) {
1640
+ return Vec256<uint64_t>{__lasx_xvsrl_d(v.raw, __lasx_xvreplgr2vr_d(bits))};
1641
+ }
1642
+
1643
+ HWY_API Vec256<int8_t> ShiftRightSame(const Vec256<int8_t> v, const int bits) {
1644
+ return Vec256<int8_t>{__lasx_xvsra_b(v.raw, __lasx_xvreplgr2vr_b(bits))};
1645
+ }
1646
+
1647
+ HWY_API Vec256<int16_t> ShiftRightSame(const Vec256<int16_t> v,
1648
+ const int bits) {
1649
+ return Vec256<int16_t>{__lasx_xvsra_h(v.raw, __lasx_xvreplgr2vr_h(bits))};
1650
+ }
1651
+
1652
+ HWY_API Vec256<int32_t> ShiftRightSame(const Vec256<int32_t> v,
1653
+ const int bits) {
1654
+ return Vec256<int32_t>{__lasx_xvsra_w(v.raw, __lasx_xvreplgr2vr_w(bits))};
1655
+ }
1656
+
1657
+ HWY_API Vec256<int64_t> ShiftRightSame(const Vec256<int64_t> v,
1658
+ const int bits) {
1659
+ return Vec256<int64_t>{__lasx_xvsra_d(v.raw, __lasx_xvreplgr2vr_d(bits))};
1660
+ }
1661
+
1662
+ // ------------------------------ Neg (Xor, Sub)
1663
+
1664
+ namespace detail {
1665
+
1666
+ template <typename T>
1667
+ HWY_INLINE Vec256<T> Neg(hwy::FloatTag /*tag*/, const Vec256<T> v) {
1668
+ const DFromV<decltype(v)> d;
1669
+ return Xor(v, SignBit(d));
1670
+ }
1671
+
1672
+ template <typename T>
1673
+ HWY_INLINE Vec256<T> Neg(hwy::SpecialTag /*tag*/, const Vec256<T> v) {
1674
+ const DFromV<decltype(v)> d;
1675
+ return Xor(v, SignBit(d));
1676
+ }
1677
+
1678
+ // Not floating-point
1679
+ template <typename T, HWY_IF_UI8(T)>
1680
+ HWY_INLINE Vec256<T> Neg(hwy::SignedTag /*tag*/, const Vec256<T> v) {
1681
+ return Vec256<T>{__lasx_xvneg_b(v.raw)};
1682
+ }
1683
+
1684
+ template <typename T, HWY_IF_UI16(T)>
1685
+ HWY_INLINE Vec256<T> Neg(hwy::SignedTag /*tag*/, const Vec256<T> v) {
1686
+ return Vec256<T>{__lasx_xvneg_h(v.raw)};
1687
+ }
1688
+
1689
+ template <typename T, HWY_IF_UI32(T)>
1690
+ HWY_INLINE Vec256<T> Neg(hwy::SignedTag /*tag*/, const Vec256<T> v) {
1691
+ return Vec256<T>{__lasx_xvneg_w(v.raw)};
1692
+ }
1693
+
1694
+ template <typename T, HWY_IF_UI64(T)>
1695
+ HWY_INLINE Vec256<T> Neg(hwy::SignedTag /*tag*/, const Vec256<T> v) {
1696
+ return Vec256<T>{__lasx_xvneg_d(v.raw)};
1697
+ }
1698
+
1699
+ } // namespace detail
1700
+
1701
+ template <typename T>
1702
+ HWY_API Vec256<T> Neg(const Vec256<T> v) {
1703
+ return detail::Neg(hwy::TypeTag<T>(), v);
1704
+ }
1705
+
1706
+ // ------------------------------ Floating-point mul / div
1707
+
1708
+ HWY_API Vec256<float> operator*(Vec256<float> a, Vec256<float> b) {
1709
+ return Vec256<float>{__lasx_xvfmul_s(a.raw, b.raw)};
1710
+ }
1711
+ HWY_API Vec256<double> operator*(Vec256<double> a, Vec256<double> b) {
1712
+ return Vec256<double>{__lasx_xvfmul_d(a.raw, b.raw)};
1713
+ }
1714
+
1715
+ HWY_API Vec256<float> operator/(Vec256<float> a, Vec256<float> b) {
1716
+ return Vec256<float>{__lasx_xvfdiv_s(a.raw, b.raw)};
1717
+ }
1718
+ HWY_API Vec256<double> operator/(Vec256<double> a, Vec256<double> b) {
1719
+ return Vec256<double>{__lasx_xvfdiv_d(a.raw, b.raw)};
1720
+ }
1721
+
1722
+ // Approximate reciprocal
1723
+
1724
+ HWY_API Vec256<float> ApproximateReciprocal(Vec256<float> v) {
1725
+ return Vec256<float>{__lasx_xvfrecip_s(v.raw)};
1726
+ }
1727
+
1728
+ HWY_API Vec256<double> ApproximateReciprocal(Vec256<double> v) {
1729
+ return Vec256<double>{__lasx_xvfrecip_d(v.raw)};
1730
+ }
1731
+
1732
+ // Integer multiply-add variants
1733
+
1734
+ // signed
1735
+ HWY_API Vec256<int8_t> MulAdd(Vec256<int8_t> mul, Vec256<int8_t> x,
1736
+ Vec256<int8_t> add) {
1737
+ return Vec256<int8_t>{__lasx_xvmadd_b(add.raw, mul.raw, x.raw)};
1738
+ }
1739
+ HWY_API Vec256<int16_t> MulAdd(Vec256<int16_t> mul, Vec256<int16_t> x,
1740
+ Vec256<int16_t> add) {
1741
+ return Vec256<int16_t>{__lasx_xvmadd_h(add.raw, mul.raw, x.raw)};
1742
+ }
1743
+ HWY_API Vec256<int32_t> MulAdd(Vec256<int32_t> mul, Vec256<int32_t> x,
1744
+ Vec256<int32_t> add) {
1745
+ return Vec256<int32_t>{__lasx_xvmadd_w(add.raw, mul.raw, x.raw)};
1746
+ }
1747
+ HWY_API Vec256<int64_t> MulAdd(Vec256<int64_t> mul, Vec256<int64_t> x,
1748
+ Vec256<int64_t> add) {
1749
+ return Vec256<int64_t>{__lasx_xvmadd_d(add.raw, mul.raw, x.raw)};
1750
+ }
1751
+
1752
+ // unsigned
1753
+ HWY_API Vec256<uint8_t> MulAdd(Vec256<uint8_t> mul, Vec256<uint8_t> x,
1754
+ Vec256<uint8_t> add) {
1755
+ return Vec256<uint8_t>{__lasx_xvmadd_b(add.raw, mul.raw, x.raw)};
1756
+ }
1757
+ HWY_API Vec256<uint16_t> MulAdd(Vec256<uint16_t> mul, Vec256<uint16_t> x,
1758
+ Vec256<uint16_t> add) {
1759
+ return Vec256<uint16_t>{__lasx_xvmadd_h(add.raw, mul.raw, x.raw)};
1760
+ }
1761
+ HWY_API Vec256<uint32_t> MulAdd(Vec256<uint32_t> mul, Vec256<uint32_t> x,
1762
+ Vec256<uint32_t> add) {
1763
+ return Vec256<uint32_t>{__lasx_xvmadd_w(add.raw, mul.raw, x.raw)};
1764
+ }
1765
+ HWY_API Vec256<uint64_t> MulAdd(Vec256<uint64_t> mul, Vec256<uint64_t> x,
1766
+ Vec256<uint64_t> add) {
1767
+ return Vec256<uint64_t>{__lasx_xvmadd_d(add.raw, mul.raw, x.raw)};
1768
+ }
1769
+
1770
+ // signed
1771
+ HWY_API Vec256<int8_t> NegMulAdd(Vec256<int8_t> mul, Vec256<int8_t> x,
1772
+ Vec256<int8_t> add) {
1773
+ return Vec256<int8_t>{__lasx_xvmsub_b(add.raw, mul.raw, x.raw)};
1774
+ }
1775
+ HWY_API Vec256<int16_t> NegMulAdd(Vec256<int16_t> mul, Vec256<int16_t> x,
1776
+ Vec256<int16_t> add) {
1777
+ return Vec256<int16_t>{__lasx_xvmsub_h(add.raw, mul.raw, x.raw)};
1778
+ }
1779
+ HWY_API Vec256<int32_t> NegMulAdd(Vec256<int32_t> mul, Vec256<int32_t> x,
1780
+ Vec256<int32_t> add) {
1781
+ return Vec256<int32_t>{__lasx_xvmsub_w(add.raw, mul.raw, x.raw)};
1782
+ }
1783
+ HWY_API Vec256<int64_t> NegMulAdd(Vec256<int64_t> mul, Vec256<int64_t> x,
1784
+ Vec256<int64_t> add) {
1785
+ return Vec256<int64_t>{__lasx_xvmsub_d(add.raw, mul.raw, x.raw)};
1786
+ }
1787
+
1788
+ // unsigned
1789
+ HWY_API Vec256<uint8_t> NegMulAdd(Vec256<uint8_t> mul, Vec256<uint8_t> x,
1790
+ Vec256<uint8_t> add) {
1791
+ return Vec256<uint8_t>{__lasx_xvmsub_b(add.raw, mul.raw, x.raw)};
1792
+ }
1793
+ HWY_API Vec256<uint16_t> NegMulAdd(Vec256<uint16_t> mul, Vec256<uint16_t> x,
1794
+ Vec256<uint16_t> add) {
1795
+ return Vec256<uint16_t>{__lasx_xvmsub_h(add.raw, mul.raw, x.raw)};
1796
+ }
1797
+ HWY_API Vec256<uint32_t> NegMulAdd(Vec256<uint32_t> mul, Vec256<uint32_t> x,
1798
+ Vec256<uint32_t> add) {
1799
+ return Vec256<uint32_t>{__lasx_xvmsub_w(add.raw, mul.raw, x.raw)};
1800
+ }
1801
+ HWY_API Vec256<uint64_t> NegMulAdd(Vec256<uint64_t> mul, Vec256<uint64_t> x,
1802
+ Vec256<uint64_t> add) {
1803
+ return Vec256<uint64_t>{__lasx_xvmsub_d(add.raw, mul.raw, x.raw)};
1804
+ }
1805
+
1806
+ // ------------------------------ Floating-point multiply-add variants
1807
+
1808
+ HWY_API Vec256<float> MulAdd(Vec256<float> mul, Vec256<float> x,
1809
+ Vec256<float> add) {
1810
+ return Vec256<float>{__lasx_xvfmadd_s(mul.raw, x.raw, add.raw)};
1811
+ }
1812
+ HWY_API Vec256<double> MulAdd(Vec256<double> mul, Vec256<double> x,
1813
+ Vec256<double> add) {
1814
+ return Vec256<double>{__lasx_xvfmadd_d(mul.raw, x.raw, add.raw)};
1815
+ }
1816
+
1817
+ HWY_API Vec256<float> NegMulAdd(Vec256<float> mul, Vec256<float> x,
1818
+ Vec256<float> add) {
1819
+ return add - mul * x;
1820
+ }
1821
+ HWY_API Vec256<double> NegMulAdd(Vec256<double> mul, Vec256<double> x,
1822
+ Vec256<double> add) {
1823
+ return add - mul * x;
1824
+ }
1825
+
1826
+ HWY_API Vec256<float> MulSub(Vec256<float> mul, Vec256<float> x,
1827
+ Vec256<float> sub) {
1828
+ return Vec256<float>{__lasx_xvfmsub_s(mul.raw, x.raw, sub.raw)};
1829
+ }
1830
+ HWY_API Vec256<double> MulSub(Vec256<double> mul, Vec256<double> x,
1831
+ Vec256<double> sub) {
1832
+ return Vec256<double>{__lasx_xvfmsub_d(mul.raw, x.raw, sub.raw)};
1833
+ }
1834
+
1835
+ HWY_API Vec256<float> NegMulSub(Vec256<float> mul, Vec256<float> x,
1836
+ Vec256<float> sub) {
1837
+ return Vec256<float>{__lasx_xvfnmadd_s(mul.raw, x.raw, sub.raw)};
1838
+ }
1839
+ HWY_API Vec256<double> NegMulSub(Vec256<double> mul, Vec256<double> x,
1840
+ Vec256<double> sub) {
1841
+ return Vec256<double>{__lasx_xvfnmadd_d(mul.raw, x.raw, sub.raw)};
1842
+ }
1843
+
1844
+ // ------------------------------ MulAddSub(Float)
1845
+
1846
+ template <typename T, HWY_IF_FLOAT3264(T)>
1847
+ HWY_API Vec256<T> MulAddSub(Vec256<T> mul, Vec256<T> x, Vec256<T> sub_or_add) {
1848
+ return OddEven(MulAdd(mul, x, sub_or_add), MulSub(mul, x, sub_or_add));
1849
+ }
1850
+
1851
+ // ------------------------------ Floating-point square root
1852
+
1853
+ // Full precision square root
1854
+ HWY_API Vec256<float> Sqrt(Vec256<float> v) {
1855
+ return Vec256<float>{__lasx_xvfsqrt_s(v.raw)};
1856
+ }
1857
+
1858
+ HWY_API Vec256<double> Sqrt(Vec256<double> v) {
1859
+ return Vec256<double>{__lasx_xvfsqrt_d(v.raw)};
1860
+ }
1861
+
1862
+ // Approximate reciprocal square root
1863
+ HWY_API Vec256<float> ApproximateReciprocalSqrt(Vec256<float> v) {
1864
+ return Vec256<float>{__lasx_xvfrsqrt_s(v.raw)};
1865
+ }
1866
+
1867
+ HWY_API Vec256<double> ApproximateReciprocalSqrt(Vec256<double> v) {
1868
+ return Vec256<double>{__lasx_xvfrsqrt_d(v.raw)};
1869
+ }
1870
+
1871
+ // ------------------------------ Floating-point rounding
1872
+
1873
+ // Toward nearest integer, tie to even
1874
+ HWY_API Vec256<float> Round(Vec256<float> v) {
1875
+ return Vec256<float>{__lasx_xvfrintrne_s(v.raw)};
1876
+ }
1877
+
1878
+ HWY_API Vec256<double> Round(Vec256<double> v) {
1879
+ return Vec256<double>{__lasx_xvfrintrne_d(v.raw)};
1880
+ }
1881
+
1882
+ // Toward zero, aka truncate
1883
+ HWY_API Vec256<float> Trunc(Vec256<float> v) {
1884
+ return Vec256<float>{__lasx_xvfrintrz_s(v.raw)};
1885
+ }
1886
+
1887
+ HWY_API Vec256<double> Trunc(Vec256<double> v) {
1888
+ return Vec256<double>{__lasx_xvfrintrz_d(v.raw)};
1889
+ }
1890
+
1891
+ // Toward +infinity, aka ceiling
1892
+ HWY_API Vec256<float> Ceil(Vec256<float> v) {
1893
+ return Vec256<float>{__lasx_xvfrintrp_s(v.raw)};
1894
+ }
1895
+
1896
+ HWY_API Vec256<double> Ceil(Vec256<double> v) {
1897
+ return Vec256<double>{__lasx_xvfrintrp_d(v.raw)};
1898
+ }
1899
+
1900
+ // Toward -infinity, aka floor
1901
+ HWY_API Vec256<float> Floor(Vec256<float> v) {
1902
+ return Vec256<float>{__lasx_xvfrintrm_s(v.raw)};
1903
+ }
1904
+
1905
+ HWY_API Vec256<double> Floor(Vec256<double> v) {
1906
+ return Vec256<double>{__lasx_xvfrintrm_d(v.raw)};
1907
+ }
1908
+
1909
+ // ------------------------------ Floating-point classification
1910
+
1911
+ // FIXME: disable gcc-14 tree-based loop optimizations to prevent
1912
+ // 'HighwayTestGroup/HighwayTest.TestAllIsNaN/LASX' failures
1913
+ #if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG
1914
+ #pragma GCC push_options
1915
+ #pragma GCC optimize("-fno-tree-loop-optimize")
1916
+ #endif
1917
+
1918
+ HWY_API Mask256<float> IsNaN(Vec256<float> v) {
1919
+ const DFromV<decltype(v)> d;
1920
+ const RebindToSigned<decltype(d)> di;
1921
+ return RebindMask(d,
1922
+ MFromD<decltype(di)>{__lasx_xvfcmp_cune_s(v.raw, v.raw)});
1923
+ }
1924
+
1925
+ HWY_API Mask256<double> IsNaN(Vec256<double> v) {
1926
+ const DFromV<decltype(v)> d;
1927
+ const RebindToSigned<decltype(d)> di;
1928
+ return RebindMask(d,
1929
+ MFromD<decltype(di)>{__lasx_xvfcmp_cune_d(v.raw, v.raw)});
1930
+ }
1931
+
1932
+ #if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG
1933
+ #pragma GCC pop_options
1934
+ #endif
1935
+
1936
+ HWY_API Mask256<float> IsEitherNaN(Vec256<float> a, Vec256<float> b) {
1937
+ const DFromV<decltype(a)> d;
1938
+ const RebindToSigned<decltype(d)> di;
1939
+ return RebindMask(d, MFromD<decltype(di)>{__lasx_xvfcmp_cun_s(a.raw, b.raw)});
1940
+ }
1941
+
1942
+ HWY_API Mask256<double> IsEitherNaN(Vec256<double> a, Vec256<double> b) {
1943
+ const DFromV<decltype(a)> d;
1944
+ const RebindToSigned<decltype(d)> di;
1945
+ return RebindMask(d, MFromD<decltype(di)>{__lasx_xvfcmp_cun_d(a.raw, b.raw)});
1946
+ }
1947
+
1948
+ // ================================================== MEMORY
1949
+
1950
+ // ------------------------------ Load
1951
+
1952
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
1953
+ HWY_API VFromD<D> Load(D /* tag */, const TFromD<D>* HWY_RESTRICT aligned) {
1954
+ const RebindToSigned<D> di;
1955
+ return BitCast(D(), VFromD<decltype(di)>{__lasx_xvld(aligned, 0)});
1956
+ }
1957
+
1958
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
1959
+ HWY_API VFromD<D> LoadU(D /* tag */, const TFromD<D>* HWY_RESTRICT p) {
1960
+ const RebindToSigned<D> di;
1961
+ return BitCast(D(), VFromD<decltype(di)>{__lasx_xvld(p, 0)});
1962
+ }
1963
+
1964
+ // ------------------------------ MaskedLoad
1965
+
1966
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
1967
+ HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
1968
+ const TFromD<D>* HWY_RESTRICT p) {
1969
+ return IfThenElseZero(m, LoadU(d, p));
1970
+ }
1971
+
1972
+ // ------------------------------ LoadDup128
1973
+
1974
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
1975
+ HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
1976
+ VFromD<Half<D>> vec_tmp;
1977
+ vec_tmp = Load(Half<D>(), p);
1978
+ return Combine(d, vec_tmp, vec_tmp);
1979
+ }
1980
+
1981
+ // ------------------------------ Store
1982
+
1983
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
1984
+ HWY_API void Store(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT aligned) {
1985
+ __lasx_xvst(v.raw, aligned, 0);
1986
+ }
1987
+
1988
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
1989
+ HWY_API void StoreU(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT p) {
1990
+ __lasx_xvst(v.raw, p, 0);
1991
+ }
1992
+
1993
+ // ------------------------------ BlendedStore
1994
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
1995
+ HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
1996
+ TFromD<D>* HWY_RESTRICT p) {
1997
+ const RebindToUnsigned<decltype(d)> du;
1998
+ const auto blended =
1999
+ IfThenElse(RebindMask(du, m), BitCast(du, v), BitCast(du, LoadU(d, p)));
2000
+ StoreU(BitCast(d, blended), d, p);
2001
+ }
2002
+
2003
+ // ================================================== SWIZZLE
2004
+ // ------------------------------ LowerHalf
2005
+
2006
+ template <class D, HWY_IF_V_SIZE_D(D, 16)>
2007
+ HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) {
2008
+ #if HWY_HAS_BUILTIN(__builtin_shufflevector)
2009
+ typedef uint32_t U32RawVectType __attribute__((__vector_size__(32)));
2010
+ return VFromD<D>{reinterpret_cast<typename detail::Raw128<TFromD<D>>::type>(
2011
+ __builtin_shufflevector(reinterpret_cast<U32RawVectType>(v.raw),
2012
+ reinterpret_cast<U32RawVectType>(v.raw), 0, 1, 2,
2013
+ 3))};
2014
+ #else
2015
+ const RebindToUnsigned<D> du;
2016
+ const Twice<decltype(du)> dut;
2017
+ alignas(32) __m128i vec_tmp[2];
2018
+ __m256i vec_result = BitCast(dut, v).raw;
2019
+ CopyBytes<32>(&vec_result, vec_tmp);
2020
+ return BitCast(D(), VFromD<decltype(du)>{vec_tmp[0]});
2021
+ #endif
2022
+ }
2023
+
2024
+ template <typename T>
2025
+ HWY_API Vec128<T> LowerHalf(Vec256<T> v) {
2026
+ const Full128<T> dh;
2027
+ return LowerHalf(dh, v);
2028
+ }
2029
+
2030
+ // ------------------------------ UpperHalf
2031
+
2032
+ template <class D, HWY_IF_V_SIZE_D(D, 16)>
2033
+ HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
2034
+ #if HWY_HAS_BUILTIN(__builtin_shufflevector)
2035
+ (void)d;
2036
+ typedef uint32_t U32RawVectType __attribute__((__vector_size__(32)));
2037
+ return VFromD<D>{reinterpret_cast<typename detail::Raw128<TFromD<D>>::type>(
2038
+ __builtin_shufflevector(reinterpret_cast<U32RawVectType>(v.raw),
2039
+ reinterpret_cast<U32RawVectType>(v.raw), 4, 5, 6,
2040
+ 7))};
2041
+ #else
2042
+ const RebindToUnsigned<decltype(d)> du;
2043
+ const Twice<decltype(du)> dut;
2044
+ alignas(32) __m128i vec_tmp[2];
2045
+ __m256i vec_result = BitCast(dut, v).raw;
2046
+ CopyBytes<32>(&vec_result, vec_tmp);
2047
+ return BitCast(d, VFromD<decltype(du)>{vec_tmp[1]});
2048
+ #endif
2049
+ }
2050
+
2051
+ // ------------------------------ ExtractLane (Store)
2052
+ template <typename T>
2053
+ HWY_API T ExtractLane(const Vec256<T> v, size_t i) {
2054
+ const DFromV<decltype(v)> d;
2055
+ HWY_DASSERT(i < Lanes(d));
2056
+
2057
+ #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
2058
+ constexpr size_t kLanesPerBlock = 16 / sizeof(T);
2059
+ if (__builtin_constant_p(i < kLanesPerBlock) && (i < kLanesPerBlock)) {
2060
+ return ExtractLane(LowerHalf(Half<decltype(d)>(), v), i);
2061
+ }
2062
+ #endif
2063
+
2064
+ alignas(32) T lanes[32 / sizeof(T)];
2065
+ Store(v, d, lanes);
2066
+ return lanes[i];
2067
+ }
2068
+
2069
+ // ------------------------------ InsertLane (Store)
2070
+ template <typename T>
2071
+ HWY_API Vec256<T> InsertLane(const Vec256<T> v, size_t i, T t) {
2072
+ return detail::InsertLaneUsingBroadcastAndBlend(v, i, t);
2073
+ }
2074
+
2075
+ // ------------------------------ GetLane (LowerHalf)
2076
+ template <typename T>
2077
+ HWY_API T GetLane(const Vec256<T> v) {
2078
+ return GetLane(LowerHalf(v));
2079
+ }
2080
+
2081
+ // ------------------------------ ExtractBlock (LowerHalf, UpperHalf)
2082
+
2083
+ template <int kBlockIdx, class T>
2084
+ HWY_API Vec128<T> ExtractBlock(Vec256<T> v) {
2085
+ static_assert(kBlockIdx == 0 || kBlockIdx == 1, "Invalid block index");
2086
+ const Half<DFromV<decltype(v)>> dh;
2087
+ return (kBlockIdx == 0) ? LowerHalf(dh, v) : UpperHalf(dh, v);
2088
+ }
2089
+
2090
+ // ------------------------------ ZeroExtendVector
2091
+
2092
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
2093
+ HWY_API VFromD<D> ZeroExtendVector(D /* tag */, VFromD<Half<D>> lo) {
2094
+ #if HWY_HAS_BUILTIN(__builtin_shufflevector)
2095
+ typedef uint32_t U32RawVectType __attribute__((__vector_size__(16)));
2096
+ U32RawVectType zero = {0, 0, 0, 0};
2097
+ return VFromD<D>{reinterpret_cast<typename detail::Raw256<TFromD<D>>::type>(
2098
+ __builtin_shufflevector(reinterpret_cast<U32RawVectType>(lo.raw), zero, 0,
2099
+ 1, 2, 3, 4, 5, 6, 7))};
2100
+ #else
2101
+ return Combine(D(), Zero(Half<D>()), lo);
2102
+ #endif
2103
+ }
2104
+
2105
+ // ------------------------------ ZeroExtendResizeBitCast
2106
+
2107
+ namespace detail {
2108
+
2109
+ template <class DTo, class DFrom>
2110
+ HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
2111
+ hwy::SizeTag<8> /* from_size_tag */, hwy::SizeTag<32> /* to_size_tag */,
2112
+ DTo d_to, DFrom d_from, VFromD<DFrom> v) {
2113
+ const Twice<decltype(d_from)> dt_from;
2114
+ const Twice<decltype(dt_from)> dq_from;
2115
+ return BitCast(d_to, ZeroExtendVector(dq_from, ZeroExtendVector(dt_from, v)));
2116
+ }
2117
+
2118
+ } // namespace detail
2119
+
2120
+ // ------------------------------ Combine
2121
+
2122
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
2123
+ HWY_API VFromD<D> Combine(D d, VFromD<Half<D>> hi, VFromD<Half<D>> lo) {
2124
+ #if HWY_HAS_BUILTIN(__builtin_shufflevector)
2125
+ (void)d;
2126
+ typedef uint32_t U32RawVectType __attribute__((__vector_size__(16)));
2127
+ return VFromD<D>{reinterpret_cast<typename detail::Raw256<TFromD<D>>::type>(
2128
+ __builtin_shufflevector(reinterpret_cast<U32RawVectType>(lo.raw),
2129
+ reinterpret_cast<U32RawVectType>(hi.raw), 0, 1, 2,
2130
+ 3, 4, 5, 6, 7))};
2131
+ #else
2132
+ const RebindToUnsigned<decltype(d)> du;
2133
+ const Half<decltype(du)> du128;
2134
+ alignas(32) __m128i vec_tmp[2];
2135
+ __m256i vec_result;
2136
+ vec_tmp[0] = BitCast(du128, lo).raw;
2137
+ vec_tmp[1] = BitCast(du128, hi).raw;
2138
+ CopyBytes<32>(vec_tmp, &vec_result);
2139
+ return BitCast(d, VFromD<decltype(du)>{vec_result});
2140
+ #endif
2141
+ }
2142
+
2143
+ // ------------------------------ ShiftLeftBytes
2144
+ template <int kBytes, class D, HWY_IF_V_SIZE_D(D, 32)>
2145
+ HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) {
2146
+ static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2147
+ if (kBytes == 0) return v;
2148
+ const RebindToUnsigned<decltype(d)> du;
2149
+ return BitCast(
2150
+ d, VFromD<decltype(du)>{__lasx_xvbsll_v(BitCast(du, v).raw, kBytes)});
2151
+ }
2152
+
2153
+ // ------------------------------ ShiftRightBytes
2154
+ template <int kBytes, class D, HWY_IF_V_SIZE_D(D, 32)>
2155
+ HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) {
2156
+ static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2157
+ if (kBytes == 0) return v;
2158
+ const RebindToUnsigned<decltype(d)> du;
2159
+ return BitCast(
2160
+ d, VFromD<decltype(du)>{__lasx_xvbsrl_v(BitCast(du, v).raw, kBytes)});
2161
+ }
2162
+
2163
+ // ------------------------------ CombineShiftRightBytes
2164
+ template <int kBytes, class D, HWY_IF_V_SIZE_D(D, 32)>
2165
+ HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
2166
+ return Or(ShiftRightBytes<kBytes>(d, lo), ShiftLeftBytes<16 - kBytes>(d, hi));
2167
+ }
2168
+
2169
+ // ------------------------------ Broadcast
2170
+
2171
+ template <int kLane, class T, HWY_IF_T_SIZE(T, 1)>
2172
+ HWY_API Vec256<T> Broadcast(const Vec256<T> v) {
2173
+ static_assert(0 <= kLane && kLane < 16, "Invalid lane");
2174
+ return Vec256<T>{__lasx_xvreplve_b(v.raw, kLane)};
2175
+ }
2176
+
2177
+ template <int kLane, typename T, HWY_IF_T_SIZE(T, 2)>
2178
+ HWY_API Vec256<T> Broadcast(const Vec256<T> v) {
2179
+ static_assert(0 <= kLane && kLane < 8, "Invalid lane");
2180
+ const DFromV<decltype(v)> d;
2181
+ const RebindToUnsigned<decltype(d)> du;
2182
+ return BitCast(
2183
+ d, VFromD<decltype(du)>{__lasx_xvreplve_h(BitCast(du, v).raw, kLane)});
2184
+ }
2185
+
2186
+ template <int kLane, typename T, HWY_IF_UI32(T)>
2187
+ HWY_API Vec256<T> Broadcast(const Vec256<T> v) {
2188
+ static_assert(0 <= kLane && kLane < 4, "Invalid lane");
2189
+ return Vec256<T>{__lasx_xvreplve_w(v.raw, kLane)};
2190
+ }
2191
+
2192
+ template <int kLane, typename T, HWY_IF_UI64(T)>
2193
+ HWY_API Vec256<T> Broadcast(const Vec256<T> v) {
2194
+ static_assert(0 <= kLane && kLane < 2, "Invalid lane");
2195
+ return Vec256<T>{__lasx_xvreplve_d(v.raw, kLane)};
2196
+ }
2197
+
2198
+ template <int kLane>
2199
+ HWY_API Vec256<float> Broadcast(Vec256<float> v) {
2200
+ static_assert(0 <= kLane && kLane < 4, "Invalid lane");
2201
+ const DFromV<decltype(v)> d;
2202
+ const RebindToUnsigned<decltype(d)> du;
2203
+ return BitCast(
2204
+ d, VFromD<decltype(du)>{__lasx_xvreplve_w(BitCast(du, v).raw, kLane)});
2205
+ }
2206
+
2207
+ template <int kLane>
2208
+ HWY_API Vec256<double> Broadcast(const Vec256<double> v) {
2209
+ static_assert(0 <= kLane && kLane < 2, "Invalid lane");
2210
+ const DFromV<decltype(v)> d;
2211
+ const RebindToUnsigned<decltype(d)> du;
2212
+ return BitCast(
2213
+ d, VFromD<decltype(du)>{__lasx_xvreplve_d(BitCast(du, v).raw, kLane)});
2214
+ }
2215
+
2216
+ // ------------------------------ BroadcastBlock
2217
+
2218
+ template <int kBlockIdx, class T>
2219
+ HWY_API Vec256<T> BroadcastBlock(Vec256<T> v) {
2220
+ static_assert(kBlockIdx == 0 || kBlockIdx == 1, "Invalid block index");
2221
+ const DFromV<decltype(v)> d;
2222
+ return (kBlockIdx == 0) ? ConcatLowerLower(d, v, v)
2223
+ : ConcatUpperUpper(d, v, v);
2224
+ }
2225
+
2226
+ // ------------------------------ BroadcastLane
2227
+
2228
+ namespace detail {
2229
+
2230
+ template <class T, HWY_IF_T_SIZE(T, 1)>
2231
+ HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
2232
+ Vec256<T> v) {
2233
+ return Vec256<T>{__lasx_xvreplve0_b(v.raw)};
2234
+ }
2235
+
2236
+ template <class T, HWY_IF_T_SIZE(T, 2)>
2237
+ HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
2238
+ Vec256<T> v) {
2239
+ const DFromV<decltype(v)> d;
2240
+ const RebindToUnsigned<decltype(d)> du; // for float16_t
2241
+ return BitCast(d,
2242
+ VFromD<decltype(du)>{__lasx_xvreplve0_h(BitCast(du, v).raw)});
2243
+ }
2244
+
2245
+ template <class T, HWY_IF_UI32(T)>
2246
+ HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
2247
+ Vec256<T> v) {
2248
+ return Vec256<T>{__lasx_xvreplve0_w(v.raw)};
2249
+ }
2250
+
2251
+ template <class T, HWY_IF_UI64(T)>
2252
+ HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
2253
+ Vec256<T> v) {
2254
+ return Vec256<T>{__lasx_xvreplve0_d(v.raw)};
2255
+ }
2256
+
2257
+ HWY_INLINE Vec256<float> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
2258
+ Vec256<float> v) {
2259
+ const DFromV<decltype(v)> d;
2260
+ const RebindToUnsigned<decltype(d)> du;
2261
+ return BitCast(d,
2262
+ VFromD<decltype(du)>{__lasx_xvreplve0_w(BitCast(du, v).raw)});
2263
+ }
2264
+
2265
+ HWY_INLINE Vec256<double> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
2266
+ Vec256<double> v) {
2267
+ const DFromV<decltype(v)> d;
2268
+ const RebindToUnsigned<decltype(d)> du;
2269
+ return BitCast(d,
2270
+ VFromD<decltype(du)>{__lasx_xvreplve0_d(BitCast(du, v).raw)});
2271
+ }
2272
+
2273
+ template <size_t kLaneIdx, class T, hwy::EnableIf<kLaneIdx != 0>* = nullptr>
2274
+ HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<kLaneIdx> /* lane_idx_tag */,
2275
+ Vec256<T> v) {
2276
+ constexpr size_t kLanesPerBlock = 16 / sizeof(T);
2277
+ constexpr int kBlockIdx = static_cast<int>(kLaneIdx / kLanesPerBlock);
2278
+ constexpr int kLaneInBlkIdx =
2279
+ static_cast<int>(kLaneIdx) & (kLanesPerBlock - 1);
2280
+ return Broadcast<kLaneInBlkIdx>(BroadcastBlock<kBlockIdx>(v));
2281
+ }
2282
+ } // namespace detail
2283
+
2284
+ template <int kLaneIdx, class T>
2285
+ HWY_API Vec256<T> BroadcastLane(Vec256<T> v) {
2286
+ static_assert(kLaneIdx >= 0, "Invalid lane");
2287
+ return detail::BroadcastLane(hwy::SizeTag<static_cast<size_t>(kLaneIdx)>(),
2288
+ v);
2289
+ }
2290
+
2291
+ // ------------------------------ Hard-coded shuffles
2292
+
2293
+ // Notation: let Vec256<int32_t> have lanes 7,6,5,4,3,2,1,0 (0 is
2294
+ // least-significant). Shuffle0321 rotates four-lane blocks one lane to the
2295
+ // right (the previous least-significant lane is now most-significant =>
2296
+ // 47650321). These could also be implemented via CombineShiftRightBytes but
2297
+ // the shuffle_abcd notation is more convenient.
2298
+
2299
+ // Swap 32-bit halves in 64-bit halves.
2300
+ template <typename T, HWY_IF_UI32(T)>
2301
+ HWY_API Vec256<T> Shuffle2301(const Vec256<T> v) {
2302
+ return Vec256<T>{__lasx_xvshuf4i_w(v.raw, 0xb1)};
2303
+ }
2304
+ HWY_API Vec256<float> Shuffle2301(const Vec256<float> v) {
2305
+ const DFromV<decltype(v)> d;
2306
+ const RebindToUnsigned<decltype(d)> du;
2307
+ return BitCast(
2308
+ d, VFromD<decltype(du)>{__lasx_xvshuf4i_w(BitCast(du, v).raw, 0xb1)});
2309
+ }
2310
+
2311
+ // Used by generic_ops-inl.h
2312
+ namespace detail {
2313
+
2314
+ template <typename T, HWY_IF_T_SIZE(T, 4)>
2315
+ HWY_API Vec256<T> ShuffleTwo2301(const Vec256<T> a, const Vec256<T> b) {
2316
+ const DFromV<decltype(a)> d;
2317
+ const RebindToUnsigned<decltype(d)> du;
2318
+ return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_w(
2319
+ BitCast(du, b).raw, BitCast(du, a).raw, 0xb1)});
2320
+ }
2321
+ template <typename T, HWY_IF_T_SIZE(T, 4)>
2322
+ HWY_API Vec256<T> ShuffleTwo1230(const Vec256<T> a, const Vec256<T> b) {
2323
+ const DFromV<decltype(a)> d;
2324
+ const RebindToUnsigned<decltype(d)> du;
2325
+ return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_w(
2326
+ BitCast(du, b).raw, BitCast(du, a).raw, 0x6c)});
2327
+ }
2328
+ template <typename T, HWY_IF_T_SIZE(T, 4)>
2329
+ HWY_API Vec256<T> ShuffleTwo3012(const Vec256<T> a, const Vec256<T> b) {
2330
+ const DFromV<decltype(a)> d;
2331
+ const RebindToUnsigned<decltype(d)> du;
2332
+ return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_w(
2333
+ BitCast(du, b).raw, BitCast(du, a).raw, 0xc6)});
2334
+ }
2335
+
2336
+ } // namespace detail
2337
+
2338
+ // Swap 64-bit halves
2339
+ HWY_API Vec256<uint32_t> Shuffle1032(const Vec256<uint32_t> v) {
2340
+ return Vec256<uint32_t>{__lasx_xvshuf4i_w(v.raw, 0x4e)};
2341
+ }
2342
+ HWY_API Vec256<int32_t> Shuffle1032(const Vec256<int32_t> v) {
2343
+ return Vec256<int32_t>{__lasx_xvshuf4i_w(v.raw, 0x4e)};
2344
+ }
2345
+ HWY_API Vec256<float> Shuffle1032(const Vec256<float> v) {
2346
+ const DFromV<decltype(v)> d;
2347
+ const RebindToUnsigned<decltype(d)> du;
2348
+ return BitCast(
2349
+ d, VFromD<decltype(du)>{__lasx_xvshuf4i_w(BitCast(du, v).raw, 0x4e)});
2350
+ }
2351
+ HWY_API Vec256<uint64_t> Shuffle01(const Vec256<uint64_t> v) {
2352
+ return Vec256<uint64_t>{__lasx_xvshuf4i_w(v.raw, 0x4e)};
2353
+ }
2354
+ HWY_API Vec256<int64_t> Shuffle01(const Vec256<int64_t> v) {
2355
+ return Vec256<int64_t>{__lasx_xvshuf4i_w(v.raw, 0x4e)};
2356
+ }
2357
+ HWY_API Vec256<double> Shuffle01(const Vec256<double> v) {
2358
+ const DFromV<decltype(v)> d;
2359
+ const RebindToUnsigned<decltype(d)> du;
2360
+ return BitCast(
2361
+ d, VFromD<decltype(du)>{__lasx_xvshuf4i_w(BitCast(du, v).raw, 0x4e)});
2362
+ }
2363
+
2364
+ // Rotate right 32 bits
2365
+ HWY_API Vec256<uint32_t> Shuffle0321(const Vec256<uint32_t> v) {
2366
+ return Vec256<uint32_t>{__lasx_xvshuf4i_w(v.raw, 0x39)};
2367
+ }
2368
+ HWY_API Vec256<int32_t> Shuffle0321(const Vec256<int32_t> v) {
2369
+ return Vec256<int32_t>{__lasx_xvshuf4i_w(v.raw, 0x39)};
2370
+ }
2371
+ HWY_API Vec256<float> Shuffle0321(const Vec256<float> v) {
2372
+ const DFromV<decltype(v)> d;
2373
+ const RebindToUnsigned<decltype(d)> du;
2374
+ return BitCast(
2375
+ d, VFromD<decltype(du)>{__lasx_xvshuf4i_w(BitCast(du, v).raw, 0x39)});
2376
+ }
2377
+ // Rotate left 32 bits
2378
+ HWY_API Vec256<uint32_t> Shuffle2103(const Vec256<uint32_t> v) {
2379
+ return Vec256<uint32_t>{__lasx_xvshuf4i_w(v.raw, 0x93)};
2380
+ }
2381
+ HWY_API Vec256<int32_t> Shuffle2103(const Vec256<int32_t> v) {
2382
+ return Vec256<int32_t>{__lasx_xvshuf4i_w(v.raw, 0x93)};
2383
+ }
2384
+ HWY_API Vec256<float> Shuffle2103(const Vec256<float> v) {
2385
+ const DFromV<decltype(v)> d;
2386
+ const RebindToUnsigned<decltype(d)> du;
2387
+ return BitCast(
2388
+ d, VFromD<decltype(du)>{__lasx_xvshuf4i_w(BitCast(du, v).raw, 0x93)});
2389
+ }
2390
+
2391
+ // Reverse
2392
+ HWY_API Vec256<uint32_t> Shuffle0123(const Vec256<uint32_t> v) {
2393
+ return Vec256<uint32_t>{__lasx_xvshuf4i_w(v.raw, 0x1B)};
2394
+ }
2395
+ HWY_API Vec256<int32_t> Shuffle0123(const Vec256<int32_t> v) {
2396
+ return Vec256<int32_t>{__lasx_xvshuf4i_w(v.raw, 0x1B)};
2397
+ }
2398
+ HWY_API Vec256<float> Shuffle0123(const Vec256<float> v) {
2399
+ const DFromV<decltype(v)> d;
2400
+ const RebindToUnsigned<decltype(d)> du;
2401
+ return BitCast(
2402
+ d, VFromD<decltype(du)>{__lasx_xvshuf4i_w(BitCast(du, v).raw, 0x1b)});
2403
+ }
2404
+
2405
+ // ------------------------------ TableLookupLanes
2406
+
2407
+ // Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes.
2408
+ template <typename T>
2409
+ struct Indices256 {
2410
+ __m256i raw;
2411
+ };
2412
+
2413
+ template <class D, HWY_IF_V_SIZE_D(D, 32), typename TI>
2414
+ HWY_API Indices256<TFromD<D>> IndicesFromVec(D /* tag */, Vec256<TI> vec) {
2415
+ static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index size must match lane");
2416
+ #if HWY_IS_DEBUG_BUILD
2417
+ const Full256<TI> di;
2418
+ HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
2419
+ AllTrue(di, Lt(vec, Set(di, static_cast<TI>(2 * Lanes(di))))));
2420
+ #endif
2421
+ return Indices256<TFromD<D>>{vec.raw};
2422
+ }
2423
+
2424
+ template <class D, HWY_IF_V_SIZE_D(D, 32), typename TI>
2425
+ HWY_API Indices256<TFromD<D>> SetTableIndices(D d, const TI* idx) {
2426
+ const Rebind<TI, decltype(d)> di;
2427
+ return IndicesFromVec(d, LoadU(di, idx));
2428
+ }
2429
+
2430
+ template <typename T, HWY_IF_T_SIZE(T, 1)>
2431
+ HWY_API Vec256<T> TableLookupLanes(Vec256<T> v, Indices256<T> idx) {
2432
+ const DFromV<decltype(v)> d;
2433
+ const auto a = ConcatLowerLower(d, v, v);
2434
+ const auto b = ConcatUpperUpper(d, v, v);
2435
+ return Vec256<T>{__lasx_xvshuf_b(b.raw, a.raw, idx.raw)};
2436
+ }
2437
+
2438
+ template <typename T, HWY_IF_T_SIZE(T, 2)>
2439
+ HWY_API Vec256<T> TableLookupLanes(Vec256<T> v, Indices256<T> idx) {
2440
+ const DFromV<decltype(v)> d;
2441
+ const RebindToUnsigned<decltype(d)> du;
2442
+ const auto a = ConcatLowerLower(d, v, v);
2443
+ const auto b = ConcatUpperUpper(d, v, v);
2444
+ return BitCast(d, VFromD<decltype(du)>{__lasx_xvshuf_h(
2445
+ idx.raw, BitCast(du, b).raw, BitCast(du, a).raw)});
2446
+ }
2447
+
2448
+ template <typename T, HWY_IF_T_SIZE(T, 4)>
2449
+ HWY_API Vec256<T> TableLookupLanes(Vec256<T> v, Indices256<T> idx) {
2450
+ const DFromV<decltype(v)> d;
2451
+ const RebindToSigned<decltype(d)> di;
2452
+ return BitCast(d,
2453
+ Vec256<int32_t>{__lasx_xvperm_w(BitCast(di, v).raw, idx.raw)});
2454
+ }
2455
+
2456
+ template <typename T, HWY_IF_T_SIZE(T, 8)>
2457
+ HWY_API Vec256<T> TableLookupLanes(Vec256<T> v, Indices256<T> idx) {
2458
+ using TI = MakeSigned<T>;
2459
+ const DFromV<decltype(v)> d;
2460
+ const RebindToSigned<decltype(d)> di64;
2461
+ const Repartition<int32_t, decltype(d)> di32;
2462
+ // Replicate 64-bit index into upper 32 bits
2463
+ const Vec256<TI> dup{__lasx_xvpackev_w(idx.raw, idx.raw)};
2464
+ // For each idx64 i, idx32 are 2*i and 2*i+1.
2465
+ const Vec256<TI> idx32 = dup + dup + Set(di64, int64_t(1) << 32);
2466
+ return BitCast(
2467
+ d, TableLookupLanes(BitCast(di32, v), Indices256<int32_t>{idx32.raw}));
2468
+ }
2469
+
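The 64-bit TableLookupLanes above reduces to the 32-bit variant by widening each 64-bit lane index i into the pair of 32-bit indices (2*i, 2*i+1), which select the low and high halves of 64-bit lane i. A standalone scalar sketch of just that index arithmetic (illustrative, not part of the diffed header):

  #include <cassert>
  #include <cstdint>

  int main() {
    // Two-table lookups allow 64-bit indices up to 2 * 4 lanes - 1 = 7.
    for (uint64_t i = 0; i < 8; ++i) {
      const uint64_t dup = i | (i << 32);               // xvpackev_w: low half copied up
      const uint64_t idx32 = dup + dup + (1ULL << 32);  // dup + dup + (1 << 32)
      assert((idx32 & 0xFFFFFFFFu) == 2 * i);           // low 32 bits select half 2*i
      assert((idx32 >> 32) == 2 * i + 1);               // high 32 bits select half 2*i+1
    }
    return 0;
  }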
2470
+ template <typename T, HWY_IF_T_SIZE(T, 1)>
2471
+ HWY_API Vec256<T> TwoTablesLookupLanes(Vec256<T> a, Vec256<T> b,
2472
+ Indices256<T> idx) {
2473
+ const auto idx2 = Indices256<T>{__lasx_xvandi_b(idx.raw, 31)};
2474
+ const Vec256<T> idx_vec{idx.raw};
2475
+ const auto sel_hi_mask = ShiftLeft<2>(idx_vec);
2476
+ const auto mask0or1 = __lasx_xvslti_b(sel_hi_mask.raw, 0);
2477
+ const auto lo_lookup_result = TableLookupLanes(a, idx);
2478
+ const auto hi_lookup_result = TableLookupLanes(b, idx2);
2479
+ return IfThenElse(Mask256<T>{mask0or1}, hi_lookup_result, lo_lookup_result);
2480
+ }
2481
+
2482
+ template <typename T, HWY_IF_NOT_T_SIZE(T, 1)>
2483
+ HWY_API Vec256<T> TwoTablesLookupLanes(Vec256<T> a, Vec256<T> b,
2484
+ Indices256<T> idx) {
2485
+ const DFromV<decltype(a)> d;
2486
+ const RebindToSigned<decltype(d)> di;
2487
+ const Vec256<TFromD<decltype(di)>> idx_vec{idx.raw};
2488
+ constexpr int shift_count = 8 * sizeof(T) - 6 + CeilLog2(sizeof(T));
2489
+ const auto sel_hi_mask = BitCast(di, ShiftLeft<shift_count>(idx_vec));
2490
+ const auto lo_lookup_result = BitCast(di, TableLookupLanes(a, idx));
2491
+ const auto hi_lookup_result = BitCast(di, TableLookupLanes(b, idx));
2492
+ return BitCast(
2493
+ d, IfNegativeThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result));
2494
+ }
2495
+
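In the second TwoTablesLookupLanes overload, shift_count is chosen so that the index bit which selects between tables a and b (bit log2(32/sizeof(T)), since a 256-bit vector holds 32/sizeof(T) lanes) lands in the lane's sign bit, where IfNegativeThenElse can test it. A compile-time sanity check of that arithmetic (illustrative, not part of the diffed header):

  #include <cstddef>

  constexpr int ShiftCount(size_t size, int ceil_log2_size) {
    return static_cast<int>(8 * size) - 6 + ceil_log2_size;  // formula from the header
  }
  // table-select bit + shift == sign-bit position of the lane type
  static_assert(4 + ShiftCount(2, 1) == 15, "16-bit lanes: bit 4 -> bit 15");
  static_assert(3 + ShiftCount(4, 2) == 31, "32-bit lanes: bit 3 -> bit 31");
  static_assert(2 + ShiftCount(8, 3) == 63, "64-bit lanes: bit 2 -> bit 63");

  int main() { return 0; }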
2496
+ // ------------------------------ SwapAdjacentBlocks
2497
+
2498
+ template <typename T>
2499
+ HWY_API Vec256<T> SwapAdjacentBlocks(Vec256<T> v) {
2500
+ const DFromV<decltype(v)> d;
2501
+ const RebindToUnsigned<decltype(d)> du;
2502
+ return BitCast(d, Vec256<uint8_t>{__lasx_xvpermi_q(
2503
+ BitCast(du, v).raw, BitCast(du, v).raw, 0x01)});
2504
+ }
2505
+
2506
+ // ------------------------------ InterleaveEvenBlocks (ConcatLowerLower)
2507
+ template <class D, class V = VFromD<D>, HWY_IF_V_SIZE_D(D, 32)>
2508
+ HWY_API V InterleaveEvenBlocks(D d, V a, V b) {
2509
+ return ConcatLowerLower(d, b, a);
2510
+ }
2511
+
2512
+ // ------------------------------ InterleaveOddBlocks (ConcatUpperUpper)
2513
+ template <class D, class V = VFromD<D>, HWY_IF_V_SIZE_D(D, 32)>
2514
+ HWY_API V InterleaveOddBlocks(D d, V a, V b) {
2515
+ return ConcatUpperUpper(d, b, a);
2516
+ }
2517
+
2518
+ // ------------------------------ Reverse (RotateRight)
2519
+
2520
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 4)>
2521
+ HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
2522
+ alignas(32) static constexpr int32_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};
2523
+ return TableLookupLanes(v, SetTableIndices(d, kReverse));
2524
+ }
2525
+
2526
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 8)>
2527
+ HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
2528
+ const RebindToUnsigned<decltype(d)> du;
2529
+ return BitCast(
2530
+ d, VFromD<decltype(du)>{__lasx_xvpermi_d(BitCast(du, v).raw, 0x1b)});
2531
+ }
2532
+
2533
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
2534
+ HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
2535
+ alignas(32) static constexpr int16_t kReverse[16] = {
2536
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
2537
+ return TableLookupLanes(v, SetTableIndices(d, kReverse));
2538
+ }
2539
+
2540
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
2541
+ HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
2542
+ alignas(32) static constexpr TFromD<D> kReverse[32] = {
2543
+ 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
2544
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
2545
+ return TableLookupLanes(v, SetTableIndices(d, kReverse));
2546
+ }
2547
+
2548
+ // ------------------------------ Reverse4 (SwapAdjacentBlocks)
2549
+
2550
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
2551
+ HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) {
2552
+ const RebindToUnsigned<decltype(d)> du;
2553
+ return BitCast(
2554
+ d, VFromD<decltype(du)>{__lasx_xvshuf4i_h(BitCast(du, v).raw, 0x1b)});
2555
+ }
2556
+
2557
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 8)>
2558
+ HWY_API VFromD<D> Reverse4(D /* tag */, const VFromD<D> v) {
2559
+ const RebindToUnsigned<D> du;
2560
+ return BitCast(
2561
+ D(), VFromD<decltype(du)>{__lasx_xvpermi_d(BitCast(du, v).raw, 0x1b)});
2562
+ }
2563
+
2564
+ // ------------------------------ Reverse8
2565
+
2566
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
2567
+ HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
2568
+ const RebindToSigned<decltype(d)> di;
2569
+ const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
2570
+ di, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
2571
+ return BitCast(d, TableLookupBytes(v, shuffle));
2572
+ }
2573
+
2574
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 4)>
2575
+ HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
2576
+ return Reverse(d, v);
2577
+ }
2578
+
2579
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 8)>
2580
+ HWY_API VFromD<D> Reverse8(D /* tag */, const VFromD<D> /* v */) {
2581
+ HWY_ASSERT(0);
2582
+ }
2583
+
2584
+ // ------------------------------ InterleaveLower
2585
+
2586
+ // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
2587
+ // the least-significant lane) and "b". To concatenate two half-width integers
2588
+ // into one, use ZipLower/Upper instead (also works with scalar).
2589
+
2590
+ template <typename T, HWY_IF_T_SIZE(T, 1)>
2591
+ HWY_API Vec256<T> InterleaveLower(Vec256<T> a, Vec256<T> b) {
2592
+ return Vec256<T>{__lasx_xvilvl_b(b.raw, a.raw)};
2593
+ }
2594
+ template <typename T, HWY_IF_T_SIZE(T, 2)>
2595
+ HWY_API Vec256<T> InterleaveLower(Vec256<T> a, Vec256<T> b) {
2596
+ const DFromV<decltype(a)> d;
2597
+ const RebindToUnsigned<decltype(d)> du;
2598
+ using VU = VFromD<decltype(du)>; // for float16_t
2599
+ return BitCast(d,
2600
+ VU{__lasx_xvilvl_h(BitCast(du, b).raw, BitCast(du, a).raw)});
2601
+ }
2602
+ template <typename T, HWY_IF_UI32(T)>
2603
+ HWY_API Vec256<T> InterleaveLower(Vec256<T> a, Vec256<T> b) {
2604
+ return Vec256<T>{__lasx_xvilvl_w(b.raw, a.raw)};
2605
+ }
2606
+ template <typename T, HWY_IF_UI64(T)>
2607
+ HWY_API Vec256<T> InterleaveLower(Vec256<T> a, Vec256<T> b) {
2608
+ return Vec256<T>{__lasx_xvilvl_d(b.raw, a.raw)};
2609
+ }
2610
+
2611
+ HWY_API Vec256<float> InterleaveLower(Vec256<float> a, Vec256<float> b) {
2612
+ const Full256<uint32_t> du;
2613
+ const Full256<float> df;
2614
+ return BitCast(df, Vec256<uint32_t>{__lasx_xvilvl_w(BitCast(du, b).raw,
2615
+ BitCast(du, a).raw)});
2616
+ }
2617
+ HWY_API Vec256<double> InterleaveLower(Vec256<double> a, Vec256<double> b) {
2618
+ const Full256<uint64_t> du;
2619
+ const Full256<double> df;
2620
+ return BitCast(df, Vec256<uint64_t>{__lasx_xvilvl_d(BitCast(du, b).raw,
2621
+ BitCast(du, a).raw)});
2622
+ }
2623
+
2624
+ // ------------------------------ InterleaveUpper
2625
+
2626
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
2627
+ HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
2628
+ return VFromD<D>{__lasx_xvilvh_b(b.raw, a.raw)};
2629
+ }
2630
+
2631
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
2632
+ HWY_API VFromD<D> InterleaveUpper(D d, VFromD<D> a, VFromD<D> b) {
2633
+ const RebindToUnsigned<decltype(d)> du;
2634
+ using VU = VFromD<decltype(du)>; // for float16_t
2635
+ return BitCast(d,
2636
+ VU{__lasx_xvilvh_h(BitCast(du, b).raw, BitCast(du, a).raw)});
2637
+ }
2638
+
2639
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
2640
+ HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
2641
+ return VFromD<D>{__lasx_xvilvh_w(b.raw, a.raw)};
2642
+ }
2643
+
2644
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
2645
+ HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
2646
+ return VFromD<D>{__lasx_xvilvh_d(b.raw, a.raw)};
2647
+ }
2648
+
2649
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
2650
+ HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
2651
+ const RebindToUnsigned<D> du;
2652
+ return BitCast(D(), VFromD<decltype(du)>{__lasx_xvilvh_w(
2653
+ BitCast(du, b).raw, BitCast(du, a).raw)});
2654
+ }
2655
+
2656
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
2657
+ HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
2658
+ const RebindToUnsigned<D> du;
2659
+ return BitCast(D(), VFromD<decltype(du)>{__lasx_xvilvh_d(
2660
+ BitCast(du, b).raw, BitCast(du, a).raw)});
2661
+ }
2662
+
2663
+ // ------------------------------ Blocks (LowerHalf, ZeroExtendVector)
2664
+
2665
+ // hiH,hiL loH,loL |-> hiL,loL (= lower halves)
2666
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
2667
+ HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
2668
+ const RebindToUnsigned<decltype(d)> du;
2669
+ return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_q(
2670
+ BitCast(du, hi).raw, BitCast(du, lo).raw, 0x20)});
2671
+ }
2672
+
2673
+ // hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks)
2674
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
2675
+ HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
2676
+ const RebindToUnsigned<decltype(d)> du;
2677
+ return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_q(
2678
+ BitCast(du, hi).raw, BitCast(du, lo).raw, 0x21)});
2679
+ }
2680
+
2681
+ // hiH,hiL loH,loL |-> hiH,loL (= outer halves)
2682
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
2683
+ HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
2684
+ const RebindToUnsigned<decltype(d)> du; // for float16_t
2685
+ return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_q(
2686
+ BitCast(du, hi).raw, BitCast(du, lo).raw, 0x30)});
2687
+ }
2688
+
2689
+ // hiH,hiL loH,loL |-> hiH,loH (= upper halves)
2690
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
2691
+ HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
2692
+ const RebindToUnsigned<decltype(d)> du; // for float16_t
2693
+ return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_q(
2694
+ BitCast(du, hi).raw, BitCast(du, lo).raw, 0x31)});
2695
+ }
2696
+
2697
+ // ---------------------------- InsertBlock (ConcatLowerLower, ConcatUpperLower)
2698
+ template <int kBlockIdx, class T>
2699
+ HWY_API Vec256<T> InsertBlock(Vec256<T> v, Vec128<T> blk_to_insert) {
2700
+ static_assert(kBlockIdx == 0 || kBlockIdx == 1, "Invalid block index");
2701
+
2702
+ const DFromV<decltype(v)> d;
2703
+ const auto vec_to_insert = ResizeBitCast(d, blk_to_insert);
2704
+ return (kBlockIdx == 0) ? ConcatUpperLower(d, v, vec_to_insert)
2705
+ : ConcatLowerLower(d, vec_to_insert, v);
2706
+ }
2707
+
2708
+ // ------------------------------ ConcatOdd
2709
+
2710
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
2711
+ HWY_API VFromD<D> ConcatOdd(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
2712
+ __m256i od = __lasx_xvpickod_b(hi.raw, lo.raw);
2713
+ return VFromD<D>{__lasx_xvpermi_d(od, 0xd8)};
2714
+ }
2715
+
2716
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
2717
+ HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
2718
+ const RebindToUnsigned<decltype(d)> du;
2719
+ __m256i od = __lasx_xvpickod_h(BitCast(du, hi).raw, BitCast(du, lo).raw);
2720
+ return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_d(od, 0xd8)});
2721
+ }
2722
+
2723
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
2724
+ HWY_API VFromD<D> ConcatOdd(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
2725
+ __m256i od = __lasx_xvpickod_w(hi.raw, lo.raw);
2726
+ return VFromD<D>{__lasx_xvpermi_d(od, 0xd8)};
2727
+ }
2728
+
2729
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
2730
+ HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
2731
+ const RebindToUnsigned<decltype(d)> du;
2732
+ __m256i od = __lasx_xvpickod_w(BitCast(du, hi).raw, BitCast(du, lo).raw);
2733
+ return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_d(od, 0xd8)});
2734
+ }
2735
+
2736
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
2737
+ HWY_API VFromD<D> ConcatOdd(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
2738
+ __m256i od = __lasx_xvpickod_d(hi.raw, lo.raw);
2739
+ return VFromD<D>{__lasx_xvpermi_d(od, 0xd8)};
2740
+ }
2741
+
2742
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
2743
+ HWY_API Vec256<double> ConcatOdd(D d, Vec256<double> hi, Vec256<double> lo) {
2744
+ const RebindToUnsigned<decltype(d)> du;
2745
+ __m256i od = __lasx_xvpickod_d(BitCast(du, hi).raw, BitCast(du, lo).raw);
2746
+ return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_d(od, 0xd8)});
2747
+ }
2748
+
2749
+ // ------------------------------ ConcatEven
2750
+
2751
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
2752
+ HWY_API VFromD<D> ConcatEven(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
2753
+ __m256i ev = __lasx_xvpickev_b(hi.raw, lo.raw);
2754
+ return VFromD<D>{__lasx_xvpermi_d(ev, 0xd8)};
2755
+ }
2756
+
2757
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
2758
+ HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
2759
+ const RebindToUnsigned<decltype(d)> du;
2760
+ __m256i ev = __lasx_xvpickev_h(BitCast(du, hi).raw, BitCast(du, lo).raw);
2761
+ return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_d(ev, 0xd8)});
2762
+ }
2763
+
2764
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
2765
+ HWY_API VFromD<D> ConcatEven(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
2766
+ __m256i ev = __lasx_xvpickev_w(hi.raw, lo.raw);
2767
+ return VFromD<D>{__lasx_xvpermi_d(ev, 0xd8)};
2768
+ }
2769
+
2770
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
2771
+ HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
2772
+ const RebindToUnsigned<decltype(d)> du;
2773
+ __m256i ev = __lasx_xvpickev_w(BitCast(du, hi).raw, BitCast(du, lo).raw);
2774
+ return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_d(ev, 0xd8)});
2775
+ }
2776
+
2777
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
2778
+ HWY_API VFromD<D> ConcatEven(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
2779
+ __m256i ev = __lasx_xvpickev_d(hi.raw, lo.raw);
2780
+ return VFromD<D>{__lasx_xvpermi_d(ev, 0xd8)};
2781
+ }
2782
+
2783
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
2784
+ HWY_API Vec256<double> ConcatEven(D d, Vec256<double> hi, Vec256<double> lo) {
2785
+ const RebindToUnsigned<decltype(d)> du;
2786
+ __m256i ev = __lasx_xvpickev_d(BitCast(du, hi).raw, BitCast(du, lo).raw);
2787
+ return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_d(ev, 0xd8)});
2788
+ }
2789
+
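Aside: ConcatOdd/ConcatEven take the odd/even lanes of the (hi:lo) concatenation; the trailing __lasx_xvpermi_d(..., 0xd8) undoes the per-128-bit-block grouping of xvpickod/xvpickev. A hedged sketch of the usual de-interleave idiom (function and buffer names are illustrative only):

// Sketch: splitting interleaved re,im pairs into two planar arrays.
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// in holds 2*N floats: re0,im0,re1,im1,...; writes N re values and N im values.
void SplitComplex(const float* in, float* re, float* im) {
  const hn::ScalableTag<float> d;
  const size_t N = hn::Lanes(d);
  const auto v_lo = hn::LoadU(d, in);      // first half of the pairs
  const auto v_hi = hn::LoadU(d, in + N);  // second half
  hn::StoreU(hn::ConcatEven(d, v_hi, v_lo), d, re);  // lanes 0,2,4,... of (hi:lo)
  hn::StoreU(hn::ConcatOdd(d, v_hi, v_lo), d, im);   // lanes 1,3,5,... of (hi:lo)
}
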
2790
+ // ------------------------------ InterleaveWholeLower
2791
+
2792
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
2793
+ HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
2794
+ return ConcatLowerLower(d, InterleaveUpper(d, a, b), InterleaveLower(a, b));
2795
+ }
2796
+
2797
+ // ------------------------------ InterleaveWholeUpper
2798
+
2799
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
2800
+ HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
2801
+ return ConcatUpperUpper(d, InterleaveUpper(d, a, b), InterleaveLower(a, b));
2802
+ }
2803
+
2804
+ // ------------------------------ DupEven (InterleaveLower)
2805
+
2806
+ template <typename T, HWY_IF_UI8(T)>
2807
+ HWY_API Vec256<T> DupEven(Vec256<T> v) {
2808
+ return Vec256<T>{__lasx_xvpackev_b(v.raw, v.raw)};
2809
+ }
2810
+
2811
+ template <typename T, HWY_IF_UI16(T)>
2812
+ HWY_API Vec256<T> DupEven(Vec256<T> v) {
2813
+ return Vec256<T>{__lasx_xvpackev_h(v.raw, v.raw)};
2814
+ }
2815
+
2816
+ template <typename T, HWY_IF_UI32(T)>
2817
+ HWY_API Vec256<T> DupEven(Vec256<T> v) {
2818
+ return Vec256<T>{__lasx_xvpackev_w(v.raw, v.raw)};
2819
+ }
2820
+
2821
+ HWY_API Vec256<float> DupEven(Vec256<float> v) {
2822
+ const Full256<uint32_t> du;
2823
+ const DFromV<decltype(v)> d;
2824
+ return BitCast(d, Vec256<uint32_t>{__lasx_xvpackev_w(BitCast(du, v).raw,
2825
+ BitCast(du, v).raw)});
2826
+ }
2827
+
2828
+ template <typename T, HWY_IF_T_SIZE(T, 8)>
2829
+ HWY_API Vec256<T> DupEven(const Vec256<T> v) {
2830
+ const DFromV<decltype(v)> d;
2831
+ return InterleaveLower(d, v, v);
2832
+ }
2833
+
2834
+ // ------------------------------ DupOdd (InterleaveUpper)
2835
+
2836
+ template <typename T, HWY_IF_UI8(T)>
2837
+ HWY_API Vec256<T> DupOdd(Vec256<T> v) {
2838
+ return Vec256<T>{__lasx_xvpackod_b(v.raw, v.raw)};
2839
+ }
2840
+
2841
+ template <typename T, HWY_IF_UI16(T)>
2842
+ HWY_API Vec256<T> DupOdd(Vec256<T> v) {
2843
+ return Vec256<T>{__lasx_xvpackod_h(v.raw, v.raw)};
2844
+ }
2845
+
2846
+ template <typename T, HWY_IF_UI32(T)>
2847
+ HWY_API Vec256<T> DupOdd(Vec256<T> v) {
2848
+ return Vec256<T>{__lasx_xvpackod_w(v.raw, v.raw)};
2849
+ }
2850
+
2851
+ HWY_API Vec256<float> DupOdd(Vec256<float> v) {
2852
+ const Full256<uint32_t> du;
2853
+ const DFromV<decltype(v)> d;
2854
+ return BitCast(d, Vec256<uint32_t>{__lasx_xvpackod_w(BitCast(du, v).raw,
2855
+ BitCast(du, v).raw)});
2856
+ }
2857
+
2858
+ template <typename T, HWY_IF_T_SIZE(T, 8)>
2859
+ HWY_API Vec256<T> DupOdd(const Vec256<T> v) {
2860
+ const DFromV<decltype(v)> d;
2861
+ return InterleaveUpper(d, v, v);
2862
+ }
2863
+
2864
+ // ------------------------------ OddEven
2865
+
2866
+ template <typename T, HWY_IF_T_SIZE(T, 1)>
2867
+ HWY_INLINE Vec256<T> OddEven(Vec256<T> a, Vec256<T> b) {
2868
+ __m256i c = __lasx_xvpackod_b(a.raw, a.raw);
2869
+ return Vec256<T>{__lasx_xvpackev_b(c, b.raw)};
2870
+ }
2871
+
2872
+ template <typename T, HWY_IF_UI16(T)>
2873
+ HWY_INLINE Vec256<T> OddEven(Vec256<T> a, Vec256<T> b) {
2874
+ __m256i c = __lasx_xvpackod_h(a.raw, a.raw);
2875
+ return Vec256<T>{__lasx_xvpackev_h(c, b.raw)};
2876
+ }
2877
+
2878
+ template <typename T, HWY_IF_UI32(T)>
2879
+ HWY_INLINE Vec256<T> OddEven(Vec256<T> a, Vec256<T> b) {
2880
+ __m256i c = __lasx_xvpackod_w(a.raw, a.raw);
2881
+ return Vec256<T>{__lasx_xvpackev_w(c, b.raw)};
2882
+ }
2883
+
2884
+ template <typename T, HWY_IF_UI64(T)>
2885
+ HWY_INLINE Vec256<T> OddEven(Vec256<T> a, Vec256<T> b) {
2886
+ return Vec256<T>{__lasx_xvextrins_d(b.raw, a.raw, 0x11)};
2887
+ }
2888
+
2889
+ HWY_API Vec256<float> OddEven(Vec256<float> a, Vec256<float> b) {
2890
+ const DFromV<decltype(a)> d;
2891
+ const RebindToUnsigned<decltype(d)> du;
2892
+ __m256i c = __lasx_xvpackod_w(BitCast(du, a).raw, BitCast(du, a).raw);
2893
+ return BitCast(
2894
+ d, VFromD<decltype(du)>{__lasx_xvpackev_w(c, BitCast(du, b).raw)});
2895
+ }
2896
+
2897
+ HWY_API Vec256<double> OddEven(Vec256<double> a, Vec256<double> b) {
2898
+ const DFromV<decltype(a)> d;
2899
+ const RebindToUnsigned<decltype(d)> du;
2900
+ return BitCast(d, VFromD<decltype(du)>{__lasx_xvextrins_d(
2901
+ BitCast(du, b).raw, BitCast(du, a).raw, 0x11)});
2902
+ }
2903
+
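Aside: OddEven(a, b) keeps the odd lanes of a and the even lanes of b, while DupEven/DupOdd broadcast the even/odd member of each adjacent lane pair into both positions. A small illustrative sketch (values chosen only to make the lane pattern visible):

// Sketch: OddEven as a per-pair blend; DupEven/DupOdd as pairwise broadcasts.
#include <cstdint>
#include <vector>
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

int main() {
  const hn::ScalableTag<int32_t> d;
  const size_t N = hn::Lanes(d);
  const auto a = hn::Set(d, 7);    // supplies the odd lanes below
  const auto b = hn::Iota(d, 0);   // supplies the even lanes below
  std::vector<int32_t> out(N);
  hn::StoreU(hn::OddEven(a, b), d, out.data());  // 0,7,2,7,4,7,...
  hn::StoreU(hn::DupEven(b), d, out.data());     // 0,0,2,2,4,4,...
  hn::StoreU(hn::DupOdd(b), d, out.data());      // 1,1,3,3,5,5,...
  return static_cast<int>(out[0]);
}
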
2904
+ // -------------------------- InterleaveEven
2905
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
2906
+ HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
2907
+ return VFromD<D>{__lasx_xvpackev_b(b.raw, a.raw)};
2908
+ }
2909
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
2910
+ HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
2911
+ return VFromD<D>{__lasx_xvpackev_h(b.raw, a.raw)};
2912
+ }
2913
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 4)>
2914
+ HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
2915
+ const RebindToUnsigned<decltype(d)> du;
2916
+ return BitCast(d, VFromD<decltype(du)>{__lasx_xvpackev_w(
2917
+ BitCast(du, b).raw, BitCast(du, a).raw)});
2918
+ }
2919
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 8)>
2920
+ HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
2921
+ return InterleaveLower(a, b);
2922
+ }
2923
+
2924
+ // -------------------------- InterleaveOdd
2925
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
2926
+ HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
2927
+ return VFromD<D>{__lasx_xvpackod_b(b.raw, a.raw)};
2928
+ }
2929
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
2930
+ HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
2931
+ return VFromD<D>{__lasx_xvpackod_h(b.raw, a.raw)};
2932
+ }
2933
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 4)>
2934
+ HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
2935
+ const RebindToUnsigned<decltype(d)> du;
2936
+ return BitCast(d, VFromD<decltype(du)>{__lasx_xvpackod_w(
2937
+ BitCast(du, b).raw, BitCast(du, a).raw)});
2938
+ }
2939
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 8)>
2940
+ HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
2941
+ return InterleaveUpper(d, a, b);
2942
+ }
2943
+
2944
+ // ------------------------------ OddEvenBlocks
2945
+
2946
+ template <typename T>
2947
+ Vec256<T> OddEvenBlocks(Vec256<T> odd, Vec256<T> even) {
2948
+ const DFromV<decltype(odd)> d;
2949
+ const RebindToUnsigned<decltype(d)> du;
2950
+ return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_q(
2951
+ BitCast(du, odd).raw, BitCast(du, even).raw, 0x30)});
2952
+ }
2953
+
2954
+ // ------------------------------ ReverseBlocks (SwapAdjacentBlocks)
2955
+
2956
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
2957
+ HWY_API VFromD<D> ReverseBlocks(D /*d*/, VFromD<D> v) {
2958
+ return SwapAdjacentBlocks(v);
2959
+ }
2960
+
2961
+ // ------------------------------ TableLookupBytes (ZeroExtendVector)
2962
+
2963
+ // Both full
2964
+ template <typename T, typename TI>
2965
+ HWY_API Vec256<TI> TableLookupBytes(Vec256<T> bytes, Vec256<TI> from) {
2966
+ const DFromV<decltype(from)> d;
2967
+ return BitCast(d, Vec256<uint8_t>{__lasx_xvshuf_b(
2968
+ BitCast(Full256<uint8_t>(), bytes).raw,
2969
+ BitCast(Full256<uint8_t>(), bytes).raw,
2970
+ BitCast(Full256<uint8_t>(), from).raw)});
2971
+ }
2972
+
2973
+ // Partial index vector
2974
+ template <typename T, typename TI, size_t NI>
2975
+ HWY_API Vec128<TI, NI> TableLookupBytes(Vec256<T> bytes, Vec128<TI, NI> from) {
2976
+ const Full256<TI> di;
2977
+ const Half<decltype(di)> dih;
2978
+ // First expand to full 128, then 256.
2979
+ const auto from_256 = ZeroExtendVector(di, Vec128<TI>{from.raw});
2980
+ const auto tbl_full = TableLookupBytes(bytes, from_256);
2981
+ // Shrink to 128, then partial.
2982
+ return Vec128<TI, NI>{LowerHalf(dih, tbl_full).raw};
2983
+ }
2984
+
2985
+ // Partial table vector
2986
+ template <typename T, size_t N, typename TI>
2987
+ HWY_API Vec256<TI> TableLookupBytes(Vec128<T, N> bytes, Vec256<TI> from) {
2988
+ const Full256<T> d;
2989
+ // First expand to full 128, then 256.
2990
+ const auto bytes_256 = ZeroExtendVector(d, Vec128<T>{bytes.raw});
2991
+ return TableLookupBytes(bytes_256, from);
2992
+ }
2993
+
2994
+ // ------------------------------ Per4LaneBlockShuffle
2995
+
2996
+ namespace detail {
2997
+
2998
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
2999
+ HWY_INLINE VFromD<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3,
3000
+ const uint32_t x2,
3001
+ const uint32_t x1,
3002
+ const uint32_t x0) {
3003
+ alignas(32) uint32_t rawU32[8] = {x0, x1, x2, x3, x0, x1, x2, x3};
3004
+ return BitCast(d, Vec256<uint32_t>{__lasx_xvld(rawU32, 0)});
3005
+ }
3006
+
3007
+ template <size_t kIdx3210, class V, HWY_IF_NOT_FLOAT(TFromV<V>)>
3008
+ HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
3009
+ hwy::SizeTag<4> /*lane_size_tag*/,
3010
+ hwy::SizeTag<32> /*vect_size_tag*/, V v) {
3011
+ const DFromV<decltype(v)> d;
3012
+ V idx =
3013
+ Per4LaneBlkShufDupSet4xU32(d, (kIdx3210 >> 6) & 3, (kIdx3210 >> 4) & 3,
3014
+ (kIdx3210 >> 2) & 3, kIdx3210 & 3);
3015
+ return V{__lasx_xvshuf_w(idx.raw, v.raw, v.raw)};
3016
+ }
3017
+
3018
+ template <size_t kIdx3210, class V, HWY_IF_FLOAT(TFromV<V>)>
3019
+ HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
3020
+ hwy::SizeTag<4> /*lane_size_tag*/,
3021
+ hwy::SizeTag<32> /*vect_size_tag*/, V v) {
3022
+ const DFromV<decltype(v)> d;
3023
+ const RebindToUnsigned<decltype(d)> du;
3024
+ const auto idx =
3025
+ Per4LaneBlkShufDupSet4xU32(du, (kIdx3210 >> 6) & 3, (kIdx3210 >> 4) & 3,
3026
+ (kIdx3210 >> 2) & 3, kIdx3210 & 3);
3027
+ return BitCast(d, VFromD<decltype(du)>{__lasx_xvshuf_w(
3028
+ idx.raw, BitCast(du, v).raw, BitCast(du, v).raw)});
3029
+ }
3030
+
3031
+ template <class V>
3032
+ HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x44> /*idx_3210_tag*/,
3033
+ hwy::SizeTag<8> /*lane_size_tag*/,
3034
+ hwy::SizeTag<32> /*vect_size_tag*/, V v) {
3035
+ const DFromV<decltype(v)> d;
3036
+ return ConcatLowerLower(d, v, v);
3037
+ }
3038
+
3039
+ template <class V>
3040
+ HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xEE> /*idx_3210_tag*/,
3041
+ hwy::SizeTag<8> /*lane_size_tag*/,
3042
+ hwy::SizeTag<32> /*vect_size_tag*/, V v) {
3043
+ const DFromV<decltype(v)> d;
3044
+ return ConcatUpperUpper(d, v, v);
3045
+ }
3046
+
3047
+ template <size_t kIdx3210, class V, HWY_IF_NOT_FLOAT(TFromV<V>)>
3048
+ HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
3049
+ hwy::SizeTag<8> /*lane_size_tag*/,
3050
+ hwy::SizeTag<32> /*vect_size_tag*/, V v) {
3051
+ const DFromV<decltype(v)> d;
3052
+ const RebindToUnsigned<decltype(d)> du;
3053
+ using VU = VFromD<decltype(du)>;
3054
+
3055
+ const VU vu = BitCast(du, v);
3056
+ return BitCast(
3057
+ d, VU{__lasx_xvpermi_d(vu.raw, static_cast<int>(kIdx3210 & 0xFF))});
3058
+ }
3059
+
3060
+ } // namespace detail
3061
+
3062
+ // ------------------------------ SlideUpLanes
3063
+
3064
+ namespace detail {
3065
+
3066
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
3067
+ HWY_INLINE VFromD<D> TableLookupSlideUpLanes(D d, VFromD<D> v, size_t amt) {
3068
+ const RebindToUnsigned<D> du;
3069
+ using TU = TFromD<decltype(du)>;
3070
+ const auto idx = Iota(du, static_cast<TU>(size_t{0} - amt));
3071
+ const auto masked_idx = And(idx, Set(du, static_cast<TU>(MaxLanes(d) - 1)));
3072
+ return BitCast(
3073
+ d, IfThenElseZero(
3074
+ idx == masked_idx,
3075
+ TableLookupLanes(BitCast(du, v), IndicesFromVec(du, masked_idx))));
3076
+ }
3077
+
3078
+ } // namespace detail
3079
+
3080
+ template <int kBlocks, class D, HWY_IF_V_SIZE_D(D, 32)>
3081
+ HWY_API VFromD<D> SlideUpBlocks(D d, VFromD<D> v) {
3082
+ static_assert(0 <= kBlocks && kBlocks <= 1,
3083
+ "kBlocks must be between 0 and 1");
3084
+ return (kBlocks == 1) ? ConcatLowerLower(d, v, Zero(d)) : v;
3085
+ }
3086
+
3087
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
3088
+ HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
3089
+ #if !HWY_IS_DEBUG_BUILD
3090
+ constexpr size_t kLanesPerBlock = 16 / sizeof(TFromD<D>);
3091
+ if (__builtin_constant_p(amt)) {
3092
+ const auto v_lo = ConcatLowerLower(d, v, Zero(d));
3093
+ switch (amt * sizeof(TFromD<D>)) {
3094
+ case 0:
3095
+ return v;
3096
+ case 1:
3097
+ return CombineShiftRightBytes<15>(d, v, v_lo);
3098
+ case 2:
3099
+ return CombineShiftRightBytes<14>(d, v, v_lo);
3100
+ case 3:
3101
+ return CombineShiftRightBytes<13>(d, v, v_lo);
3102
+ case 4:
3103
+ return CombineShiftRightBytes<12>(d, v, v_lo);
3104
+ case 5:
3105
+ return CombineShiftRightBytes<11>(d, v, v_lo);
3106
+ case 6:
3107
+ return CombineShiftRightBytes<10>(d, v, v_lo);
3108
+ case 7:
3109
+ return CombineShiftRightBytes<9>(d, v, v_lo);
3110
+ case 8:
3111
+ return CombineShiftRightBytes<8>(d, v, v_lo);
3112
+ case 9:
3113
+ return CombineShiftRightBytes<7>(d, v, v_lo);
3114
+ case 10:
3115
+ return CombineShiftRightBytes<6>(d, v, v_lo);
3116
+ case 11:
3117
+ return CombineShiftRightBytes<5>(d, v, v_lo);
3118
+ case 12:
3119
+ return CombineShiftRightBytes<4>(d, v, v_lo);
3120
+ case 13:
3121
+ return CombineShiftRightBytes<3>(d, v, v_lo);
3122
+ case 14:
3123
+ return CombineShiftRightBytes<2>(d, v, v_lo);
3124
+ case 15:
3125
+ return CombineShiftRightBytes<1>(d, v, v_lo);
3126
+ }
3127
+ }
3128
+
3129
+ if (__builtin_constant_p(amt >= kLanesPerBlock) && amt >= kLanesPerBlock) {
3130
+ const Half<decltype(d)> dh;
3131
+ return Combine(d, SlideUpLanes(dh, LowerHalf(dh, v), amt - kLanesPerBlock),
3132
+ Zero(dh));
3133
+ }
3134
+ #endif
3135
+
3136
+ return detail::TableLookupSlideUpLanes(d, v, amt);
3137
+ }
3138
+
3139
+ // ------------------------------ Slide1Up
3140
+
3141
+ template <typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
3142
+ HWY_API VFromD<D> Slide1Up(D d, VFromD<D> v) {
3143
+ const auto v_lo = ConcatLowerLower(d, v, Zero(d));
3144
+ return CombineShiftRightBytes<15>(d, v, v_lo);
3145
+ }
3146
+
3147
+ template <typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
3148
+ HWY_API VFromD<D> Slide1Up(D d, VFromD<D> v) {
3149
+ const auto v_lo = ConcatLowerLower(d, v, Zero(d));
3150
+ return CombineShiftRightBytes<14>(d, v, v_lo);
3151
+ }
3152
+
3153
+ template <typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 4)>
3154
+ HWY_API VFromD<D> Slide1Up(D d, VFromD<D> v) {
3155
+ const auto v_lo = ConcatLowerLower(d, v, Zero(d));
3156
+ return CombineShiftRightBytes<12>(d, v, v_lo);
3157
+ }
3158
+
3159
+ template <typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 8)>
3160
+ HWY_API VFromD<D> Slide1Up(D d, VFromD<D> v) {
3161
+ const auto v_lo = ConcatLowerLower(d, v, Zero(d));
3162
+ return CombineShiftRightBytes<8>(d, v, v_lo);
3163
+ }
3164
+
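Aside: SlideUpLanes shifts lanes toward higher indices and zero-fills the bottom; the __builtin_constant_p branches above select a cheap CombineShiftRightBytes when the amount is a compile-time constant, otherwise the general TableLookupSlideUpLanes path runs. Slide1Up is the fixed amount-1 case. A hedged usage sketch (the runtime amount is illustrative):

// Sketch: Slide1Up / SlideUpLanes. Constant amounts take the switch-based
// fast path above; runtime amounts fall back to TableLookupSlideUpLanes.
#include <cstdint>
#include <vector>
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

int main(int argc, char**) {
  const hn::ScalableTag<uint16_t> d;
  const size_t N = hn::Lanes(d);
  const auto v = hn::Iota(d, 1);  // 1,2,3,...
  std::vector<uint16_t> out(N);
  hn::StoreU(hn::Slide1Up(d, v), d, out.data());  // 0,1,2,...
  const size_t amt = static_cast<size_t>(argc) % N;  // runtime shift amount
  hn::StoreU(hn::SlideUpLanes(d, v, amt), d, out.data());  // amt zeros, then 1,2,...
  return static_cast<int>(out[0]);
}
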
3165
+ // ------------------------------ SlideDownLanes
3166
+
3167
+ namespace detail {
3168
+
3169
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
3170
+ HWY_INLINE VFromD<D> TableLookupSlideDownLanes(D d, VFromD<D> v, size_t amt) {
3171
+ const RebindToUnsigned<decltype(d)> du;
3172
+ using TU = TFromD<decltype(du)>;
3173
+ const auto idx = Iota(du, static_cast<TU>(amt));
3174
+ const auto masked_idx = And(idx, Set(du, static_cast<TU>(MaxLanes(d) - 1)));
3175
+ return IfThenElseZero(RebindMask(d, idx == masked_idx),
3176
+ TableLookupLanes(v, IndicesFromVec(d, masked_idx)));
3177
+ }
3178
+
3179
+ } // namespace detail
3180
+
3181
+ template <int kBlocks, class D, HWY_IF_V_SIZE_D(D, 32)>
3182
+ HWY_API VFromD<D> SlideDownBlocks(D d, VFromD<D> v) {
3183
+ static_assert(0 <= kBlocks && kBlocks <= 1,
3184
+ "kBlocks must be between 0 and 1");
3185
+ const Half<decltype(d)> dh;
3186
+ return (kBlocks == 1) ? ZeroExtendVector(d, UpperHalf(dh, v)) : v;
3187
+ }
3188
+
3189
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
3190
+ HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
3191
+ #if !HWY_IS_DEBUG_BUILD
3192
+ constexpr size_t kLanesPerBlock = 16 / sizeof(TFromD<D>);
3193
+ const Half<decltype(d)> dh;
3194
+ if (__builtin_constant_p(amt)) {
3195
+ const auto v_hi = ZeroExtendVector(d, UpperHalf(dh, v));
3196
+ switch (amt * sizeof(TFromD<D>)) {
3197
+ case 0:
3198
+ return v;
3199
+ case 1:
3200
+ return CombineShiftRightBytes<1>(d, v_hi, v);
3201
+ case 2:
3202
+ return CombineShiftRightBytes<2>(d, v_hi, v);
3203
+ case 3:
3204
+ return CombineShiftRightBytes<3>(d, v_hi, v);
3205
+ case 4:
3206
+ return CombineShiftRightBytes<4>(d, v_hi, v);
3207
+ case 5:
3208
+ return CombineShiftRightBytes<5>(d, v_hi, v);
3209
+ case 6:
3210
+ return CombineShiftRightBytes<6>(d, v_hi, v);
3211
+ case 7:
3212
+ return CombineShiftRightBytes<7>(d, v_hi, v);
3213
+ case 8:
3214
+ return CombineShiftRightBytes<8>(d, v_hi, v);
3215
+ case 9:
3216
+ return CombineShiftRightBytes<9>(d, v_hi, v);
3217
+ case 10:
3218
+ return CombineShiftRightBytes<10>(d, v_hi, v);
3219
+ case 11:
3220
+ return CombineShiftRightBytes<11>(d, v_hi, v);
3221
+ case 12:
3222
+ return CombineShiftRightBytes<12>(d, v_hi, v);
3223
+ case 13:
3224
+ return CombineShiftRightBytes<13>(d, v_hi, v);
3225
+ case 14:
3226
+ return CombineShiftRightBytes<14>(d, v_hi, v);
3227
+ case 15:
3228
+ return CombineShiftRightBytes<15>(d, v_hi, v);
3229
+ }
3230
+ }
3231
+
3232
+ if (__builtin_constant_p(amt >= kLanesPerBlock) && amt >= kLanesPerBlock) {
3233
+ return ZeroExtendVector(
3234
+ d, SlideDownLanes(dh, UpperHalf(dh, v), amt - kLanesPerBlock));
3235
+ }
3236
+ #endif
3237
+
3238
+ return detail::TableLookupSlideDownLanes(d, v, amt);
3239
+ }
3240
+
3241
+ // ------------------------------ Slide1Down
3242
+
3243
+ template <typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
3244
+ HWY_API VFromD<D> Slide1Down(D d, VFromD<D> v) {
3245
+ const Half<decltype(d)> dh;
3246
+ const auto v_hi = ZeroExtendVector(d, UpperHalf(dh, v));
3247
+ return CombineShiftRightBytes<1>(d, v_hi, v);
3248
+ }
3249
+
3250
+ template <typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
3251
+ HWY_API VFromD<D> Slide1Down(D d, VFromD<D> v) {
3252
+ const Half<decltype(d)> dh;
3253
+ const auto v_hi = ZeroExtendVector(d, UpperHalf(dh, v));
3254
+ return CombineShiftRightBytes<2>(d, v_hi, v);
3255
+ }
3256
+
3257
+ template <typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 4)>
3258
+ HWY_API VFromD<D> Slide1Down(D d, VFromD<D> v) {
3259
+ const Half<decltype(d)> dh;
3260
+ const auto v_hi = ZeroExtendVector(d, UpperHalf(dh, v));
3261
+ return CombineShiftRightBytes<4>(d, v_hi, v);
3262
+ }
3263
+
3264
+ template <typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 8)>
3265
+ HWY_API VFromD<D> Slide1Down(D d, VFromD<D> v) {
3266
+ const Half<decltype(d)> dh;
3267
+ const auto v_hi = ZeroExtendVector(d, UpperHalf(dh, v));
3268
+ return CombineShiftRightBytes<8>(d, v_hi, v);
3269
+ }
3270
+
3271
+ // ------------------------------ Shl (Mul, ZipLower)
3272
+ namespace detail {
3273
+
3274
+ HWY_INLINE Vec256<uint8_t> Shl(hwy::UnsignedTag /*tag*/, Vec256<uint8_t> v,
3275
+ Vec256<uint8_t> bits) {
3276
+ return Vec256<uint8_t>{__lasx_xvsll_b(v.raw, bits.raw)};
3277
+ }
3278
+
3279
+ HWY_INLINE Vec256<uint16_t> Shl(hwy::UnsignedTag /*tag*/, Vec256<uint16_t> v,
3280
+ Vec256<uint16_t> bits) {
3281
+ return Vec256<uint16_t>{__lasx_xvsll_h(v.raw, bits.raw)};
3282
+ }
3283
+
3284
+ HWY_INLINE Vec256<uint32_t> Shl(hwy::UnsignedTag /*tag*/, Vec256<uint32_t> v,
3285
+ Vec256<uint32_t> bits) {
3286
+ return Vec256<uint32_t>{__lasx_xvsll_w(v.raw, bits.raw)};
3287
+ }
3288
+
3289
+ HWY_INLINE Vec256<uint64_t> Shl(hwy::UnsignedTag /*tag*/, Vec256<uint64_t> v,
3290
+ Vec256<uint64_t> bits) {
3291
+ return Vec256<uint64_t>{__lasx_xvsll_d(v.raw, bits.raw)};
3292
+ }
3293
+
3294
+ template <typename T>
3295
+ HWY_INLINE Vec256<T> Shl(hwy::SignedTag /*tag*/, Vec256<T> v, Vec256<T> bits) {
3296
+ // Signed left shifts are the same as unsigned.
3297
+ const Full256<T> di;
3298
+ const Full256<MakeUnsigned<T>> du;
3299
+ return BitCast(di,
3300
+ Shl(hwy::UnsignedTag(), BitCast(du, v), BitCast(du, bits)));
3301
+ }
3302
+
3303
+ } // namespace detail
3304
+
3305
+ template <typename T>
3306
+ HWY_API Vec256<T> operator<<(Vec256<T> v, Vec256<T> bits) {
3307
+ return detail::Shl(hwy::TypeTag<T>(), v, bits);
3308
+ }
3309
+
3310
+ // ------------------------------ Shr (MulHigh, IfThenElse, Not)
3311
+
3312
+ HWY_API Vec256<uint8_t> operator>>(Vec256<uint8_t> v, Vec256<uint8_t> bits) {
3313
+ return Vec256<uint8_t>{__lasx_xvsrl_b(v.raw, bits.raw)};
3314
+ }
3315
+
3316
+ HWY_API Vec256<uint16_t> operator>>(Vec256<uint16_t> v, Vec256<uint16_t> bits) {
3317
+ return Vec256<uint16_t>{__lasx_xvsrl_h(v.raw, bits.raw)};
3318
+ }
3319
+
3320
+ HWY_API Vec256<uint32_t> operator>>(Vec256<uint32_t> v, Vec256<uint32_t> bits) {
3321
+ return Vec256<uint32_t>{__lasx_xvsrl_w(v.raw, bits.raw)};
3322
+ }
3323
+
3324
+ HWY_API Vec256<uint64_t> operator>>(Vec256<uint64_t> v, Vec256<uint64_t> bits) {
3325
+ return Vec256<uint64_t>{__lasx_xvsrl_d(v.raw, bits.raw)};
3326
+ }
3327
+
3328
+ HWY_API Vec256<int8_t> operator>>(Vec256<int8_t> v, Vec256<int8_t> bits) {
3329
+ return Vec256<int8_t>{__lasx_xvsra_b(v.raw, bits.raw)};
3330
+ }
3331
+
3332
+ HWY_API Vec256<int16_t> operator>>(Vec256<int16_t> v, Vec256<int16_t> bits) {
3333
+ return Vec256<int16_t>{__lasx_xvsra_h(v.raw, bits.raw)};
3334
+ }
3335
+
3336
+ HWY_API Vec256<int32_t> operator>>(Vec256<int32_t> v, Vec256<int32_t> bits) {
3337
+ return Vec256<int32_t>{__lasx_xvsra_w(v.raw, bits.raw)};
3338
+ }
3339
+
3340
+ HWY_API Vec256<int64_t> operator>>(Vec256<int64_t> v, Vec256<int64_t> bits) {
3341
+ return Vec256<int64_t>{__lasx_xvsra_d(v.raw, bits.raw)};
3342
+ }
3343
+
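Aside: these operators are per-lane variable shifts (xvsll/xvsrl/xvsra); operator>> is logical for unsigned lanes and arithmetic (sign-extending) for signed lanes. A brief illustrative sketch:

// Sketch: per-lane variable shifts. >> is arithmetic for signed lanes,
// logical for unsigned lanes; << is the same for both.
#include <cstdint>
#include <vector>
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

int main() {
  const hn::ScalableTag<int32_t> di;
  const size_t N = hn::Lanes(di);
  const auto counts = hn::And(hn::Iota(di, 0), hn::Set(di, 7));  // 0..7 per lane
  std::vector<int32_t> out(N);
  hn::StoreU(hn::Set(di, -256) >> counts, di, out.data());  // stays negative (xvsra)
  hn::StoreU(hn::Set(di, 1) << counts, di, out.data());     // 1,2,4,...,128 repeating
  return static_cast<int>(out[0]);
}
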
3344
+ // ------------------------------ WidenMulPairwiseAdd
3345
+
3346
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
3347
+ HWY_API VFromD<D> WidenMulPairwiseAdd(D /*d32*/, Vec256<int16_t> a,
3348
+ Vec256<int16_t> b) {
3349
+ __m256i ev = __lasx_xvmulwev_w_h(b.raw, a.raw);
3350
+ return VFromD<D>{__lasx_xvmaddwod_w_h(ev, b.raw, a.raw)};
3351
+ }
3352
+
3353
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
3354
+ HWY_API VFromD<D> WidenMulPairwiseAdd(D /*d32*/, Vec256<uint16_t> a,
3355
+ Vec256<uint16_t> b) {
3356
+ __m256i ev = __lasx_xvmulwev_w_hu(b.raw, a.raw);
3357
+ return VFromD<D>{__lasx_xvmaddwod_w_hu(ev, b.raw, a.raw)};
3358
+ }
3359
+
3360
+ // ------------------------------ ReorderWidenMulAccumulate
3361
+
3362
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
3363
+ HWY_API VFromD<D> ReorderWidenMulAccumulate(D /*tag*/, Vec256<int16_t> a,
3364
+ Vec256<int16_t> b,
3365
+ const VFromD<D> sum0,
3366
+ VFromD<D>& /*sum1*/) {
3367
+ return VFromD<D>{__lasx_xvmaddwev_w_h(
3368
+ __lasx_xvmaddwod_w_h(sum0.raw, a.raw, b.raw), a.raw, b.raw)};
3369
+ }
3370
+
3371
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
3372
+ HWY_API VFromD<D> ReorderWidenMulAccumulate(D /*tag*/, Vec256<uint16_t> a,
3373
+ Vec256<uint16_t> b,
3374
+ const VFromD<D> sum0,
3375
+ VFromD<D>& /*sum1*/) {
3376
+ return VFromD<D>{__lasx_xvmaddwev_w_hu(
3377
+ __lasx_xvmaddwod_w_hu(sum0.raw, a.raw, b.raw), a.raw, b.raw)};
3378
+ }
3379
+
3380
+ // ------------------------------ RearrangeToOddPlusEven
3381
+ HWY_API Vec256<int32_t> RearrangeToOddPlusEven(const Vec256<int32_t> sum0,
3382
+ Vec256<int32_t> /*sum1*/) {
3383
+ return sum0; // invariant already holds
3384
+ }
3385
+
3386
+ HWY_API Vec256<uint32_t> RearrangeToOddPlusEven(const Vec256<uint32_t> sum0,
3387
+ Vec256<uint32_t> /*sum1*/) {
3388
+ return sum0; // invariant already holds
3389
+ }
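Aside: on LASX, sum0 alone accumulates both the even and odd widened products (xvmaddwev plus xvmaddwod), which is why RearrangeToOddPlusEven above is the identity. A hedged sketch of the portable int16 dot-product pattern this is designed for (DotI16 and the remainder-free loop are illustrative; ReduceSum does the final horizontal add):

// Sketch: int16 dot product. On LASX, sum0 already holds odd+even products,
// so RearrangeToOddPlusEven simply returns it; other targets may need both sums.
#include <cstdint>
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

int64_t DotI16(const int16_t* a, const int16_t* b, size_t n) {
  const hn::ScalableTag<int32_t> d32;
  const hn::Repartition<int16_t, decltype(d32)> d16;
  const size_t N16 = hn::Lanes(d16);
  auto sum0 = hn::Zero(d32);
  auto sum1 = hn::Zero(d32);
  for (size_t i = 0; i + N16 <= n; i += N16) {  // remainder handling omitted
    const auto va = hn::LoadU(d16, a + i);
    const auto vb = hn::LoadU(d16, b + i);
    sum0 = hn::ReorderWidenMulAccumulate(d32, va, vb, sum0, sum1);
  }
  const auto sums = hn::RearrangeToOddPlusEven(sum0, sum1);
  return static_cast<int64_t>(hn::ReduceSum(d32, sums));  // horizontal add
}
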
3390
+
3391
+ // ================================================== CONVERT
3392
+
3393
+ // ------------------------------ Promotions (part w/ narrow lanes -> full)
3394
+
3395
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
3396
+ HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<hwy::float16_t> v) {
3397
+ const Repartition<hwy::float16_t, D> df16;
3398
+ const auto from_128 = ZeroExtendVector(df16, v);
3399
+ const VFromD<decltype(df16)> f16_concat{__lasx_xvpermi_d(from_128.raw, 0xd8)};
3400
+ return VFromD<D>{__lasx_xvfcvtl_s_h(f16_concat.raw)};
3401
+ }
3402
+
3403
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
3404
+ HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<float> v) {
3405
+ const Repartition<float, D> df;
3406
+ const RebindToSigned<decltype(df)> di;
3407
+ const auto from_128 = ZeroExtendVector(df, v);
3408
+ const auto f32_concat = BitCast(
3409
+ df, Vec256<uint32_t>{__lasx_xvpermi_d(BitCast(di, from_128).raw, 0xd8)});
3410
+ return VFromD<D>{__lasx_xvfcvtl_d_s(f32_concat.raw)};
3411
+ }
3412
+
3413
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I64_D(D)>
3414
+ HWY_API VFromD<D> PromoteTo(D /*di64*/, Vec128<float> v) {
3415
+ const Repartition<float, D> df;
3416
+ const RebindToSigned<decltype(df)> di;
3417
+ const auto from_128 = ZeroExtendVector(df, v);
3418
+ const auto f32_concat = BitCast(
3419
+ df, Vec256<uint32_t>{__lasx_xvpermi_d(BitCast(di, from_128).raw, 0xd8)});
3420
+ return VFromD<D>{__lasx_xvftintrzl_l_s(f32_concat.raw)};
3421
+ }
3422
+
3423
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
3424
+ HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<int32_t> v) {
3425
+ alignas(32) __m128i vec_tmp[2];
3426
+ __m256i vec_temp;
3427
+ vec_tmp[0] = v.raw;
3428
+ CopyBytes<32>(vec_tmp, &vec_temp);
3429
+ vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
3430
+ vec_temp = __lasx_xvsllwil_d_w(vec_temp, 0);
3431
+ return VFromD<D>{__lasx_xvffint_d_l(vec_temp)};
3432
+ }
3433
+
3434
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
3435
+ HWY_API Vec256<double> PromoteTo(D /* tag */, Vec128<uint32_t> v) {
3436
+ alignas(32) __m128i vec_tmp[2];
3437
+ __m256i vec_temp;
3438
+ vec_tmp[0] = v.raw;
3439
+ CopyBytes<32>(vec_tmp, &vec_temp);
3440
+ vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
3441
+ vec_temp = __lasx_xvsllwil_du_wu(vec_temp, 0);
3442
+ return VFromD<D>{__lasx_xvffint_d_lu(vec_temp)};
3443
+ }
3444
+
3445
+ // Unsigned: zero-extend.
3446
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U16_D(D)>
3447
+ HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<uint8_t> v) {
3448
+ alignas(32) __m128i vec_tmp[2];
3449
+ __m256i vec_temp;
3450
+ vec_tmp[0] = v.raw;
3451
+ CopyBytes<32>(vec_tmp, &vec_temp);
3452
+ vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
3453
+ return VFromD<D>{__lasx_xvsllwil_hu_bu(vec_temp, 0)};
3454
+ }
3455
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
3456
+ HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<uint8_t, 8> v) {
3457
+ alignas(32) __m128i vec_tmp[2];
3458
+ __m256i vec_temp;
3459
+ vec_tmp[0] = v.raw;
3460
+ CopyBytes<32>(vec_tmp, &vec_temp);
3461
+ vec_temp = __lasx_xvsllwil_hu_bu(vec_temp, 0);
3462
+ vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
3463
+ return VFromD<D>{__lasx_xvsllwil_wu_hu(vec_temp, 0)};
3464
+ }
3465
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
3466
+ HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<uint16_t> v) {
3467
+ alignas(32) __m128i vec_tmp[2];
3468
+ __m256i vec_temp;
3469
+ vec_tmp[0] = v.raw;
3470
+ CopyBytes<32>(vec_tmp, &vec_temp);
3471
+ vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
3472
+ return VFromD<D>{__lasx_xvsllwil_wu_hu(vec_temp, 0)};
3473
+ }
3474
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U64_D(D)>
3475
+ HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<uint32_t> v) {
3476
+ alignas(32) __m128i vec_tmp[2];
3477
+ __m256i vec_temp;
3478
+ vec_tmp[0] = v.raw;
3479
+ CopyBytes<32>(vec_tmp, &vec_temp);
3480
+ vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
3481
+ return VFromD<D>{__lasx_xvsllwil_du_wu(vec_temp, 0)};
3482
+ }
3483
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U64_D(D)>
3484
+ HWY_API VFromD<D> PromoteTo(D /* tag */, Vec64<uint16_t> v) {
3485
+ alignas(32) __m128i vec_tmp[2];
3486
+ __m256i vec_temp;
3487
+ vec_tmp[0] = v.raw;
3488
+ CopyBytes<32>(vec_tmp, &vec_temp);
3489
+ vec_temp = __lasx_xvsllwil_wu_hu(vec_temp, 0);
3490
+ vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
3491
+ return VFromD<D>{__lasx_xvsllwil_du_wu(vec_temp, 0)};
3492
+ }
3493
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U64_D(D)>
3494
+ HWY_API VFromD<D> PromoteTo(D /* tag */, Vec32<uint8_t> v) {
3495
+ alignas(32) __m128i vec_tmp[2];
3496
+ __m256i vec_temp;
3497
+ vec_tmp[0] = v.raw;
3498
+ CopyBytes<32>(vec_tmp, &vec_temp);
3499
+ vec_temp = __lasx_xvsllwil_hu_bu(vec_temp, 0);
3500
+ vec_temp = __lasx_xvsllwil_wu_hu(vec_temp, 0);
3501
+ vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
3502
+ return VFromD<D>{__lasx_xvsllwil_du_wu(vec_temp, 0)};
3503
+ }
3504
+
3505
+ // Signed: replicate sign bit.
3506
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I16_D(D)>
3507
+ HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<int8_t> v) {
3508
+ alignas(32) __m128i vec_tmp[2];
3509
+ __m256i vec_temp;
3510
+ vec_tmp[0] = v.raw;
3511
+ CopyBytes<32>(vec_tmp, &vec_temp);
3512
+ vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
3513
+ return VFromD<D>{__lasx_xvsllwil_h_b(vec_temp, 0)};
3514
+ }
3515
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
3516
+ HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<int8_t, 8> v) {
3517
+ alignas(32) __m128i vec_tmp[2];
3518
+ __m256i vec_temp;
3519
+ vec_tmp[0] = v.raw;
3520
+ CopyBytes<32>(vec_tmp, &vec_temp);
3521
+ vec_temp = __lasx_xvsllwil_h_b(vec_temp, 0);
3522
+ vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
3523
+ return VFromD<D>{__lasx_xvsllwil_w_h(vec_temp, 0)};
3524
+ }
3525
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
3526
+ HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<int16_t> v) {
3527
+ alignas(32) __m128i vec_tmp[2];
3528
+ __m256i vec_temp;
3529
+ vec_tmp[0] = v.raw;
3530
+ CopyBytes<32>(vec_tmp, &vec_temp);
3531
+ vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
3532
+ return VFromD<D>{__lasx_xvsllwil_w_h(vec_temp, 0)};
3533
+ }
3534
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I64_D(D)>
3535
+ HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<int32_t> v) {
3536
+ alignas(32) __m128i vec_tmp[2];
3537
+ __m256i vec_temp;
3538
+ vec_tmp[0] = v.raw;
3539
+ CopyBytes<32>(vec_tmp, &vec_temp);
3540
+ vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
3541
+ return VFromD<D>{__lasx_xvsllwil_d_w(vec_temp, 0)};
3542
+ }
3543
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I64_D(D)>
3544
+ HWY_API VFromD<D> PromoteTo(D /* tag */, Vec64<int16_t> v) {
3545
+ alignas(32) __m128i vec_tmp[2];
3546
+ __m256i vec_temp;
3547
+ vec_tmp[0] = v.raw;
3548
+ CopyBytes<32>(vec_tmp, &vec_temp);
3549
+ vec_temp = __lasx_xvsllwil_w_h(vec_temp, 0);
3550
+ vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
3551
+ return VFromD<D>{__lasx_xvsllwil_d_w(vec_temp, 0)};
3552
+ }
3553
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I64_D(D)>
3554
+ HWY_API VFromD<D> PromoteTo(D /* tag */, Vec32<int8_t> v) {
3555
+ alignas(32) __m128i vec_tmp[2];
3556
+ __m256i vec_temp;
3557
+ vec_tmp[0] = v.raw;
3558
+ CopyBytes<32>(vec_tmp, &vec_temp);
3559
+ vec_temp = __lasx_xvsllwil_h_b(vec_temp, 0);
3560
+ vec_temp = __lasx_xvsllwil_w_h(vec_temp, 0);
3561
+ vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
3562
+ return VFromD<D>{__lasx_xvsllwil_d_w(vec_temp, 0)};
3563
+ }
3564
+
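Aside: each PromoteTo above widens a Vec128 (or smaller) into a full 256-bit vector, zero-extending unsigned and sign-extending signed lanes via the xvsllwil forms. Portably, the narrow and wide descriptors are tied together with Rebind so the lane counts match; a hedged sketch (U8ToU32 and the remainder-free loop are illustrative):

// Sketch: zero-extending uint8_t lanes to uint32_t. Rebind gives a uint8_t
// descriptor with the same lane count as d32 (a quarter-width vector).
#include <cstdint>
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

void U8ToU32(const uint8_t* in, uint32_t* out, size_t n) {
  const hn::ScalableTag<uint32_t> d32;
  const hn::Rebind<uint8_t, decltype(d32)> d8;
  const size_t N = hn::Lanes(d32);
  for (size_t i = 0; i + N <= n; i += N) {  // remainder handling omitted
    const auto v8 = hn::LoadU(d8, in + i);             // N uint8_t lanes
    hn::StoreU(hn::PromoteTo(d32, v8), d32, out + i);  // N uint32_t lanes
  }
}
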
3565
+ // ------------------------------ PromoteEvenTo/PromoteOddTo
3566
+ namespace detail {
3567
+
3568
+ // I32->I64 PromoteEvenTo/PromoteOddTo
3569
+
3570
+ template <class D, HWY_IF_LANES_D(D, 4)>
3571
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
3572
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
3573
+ hwy::SignedTag /*from_type_tag*/, D d_to,
3574
+ Vec256<int32_t> v) {
3575
+ return BitCast(d_to, OddEven(DupEven(BroadcastSignBit(v)), v));
3576
+ }
3577
+
3578
+ template <class D, HWY_IF_LANES_D(D, 4)>
3579
+ HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
3580
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
3581
+ hwy::SignedTag /*from_type_tag*/, D d_to,
3582
+ Vec256<int32_t> v) {
3583
+ return BitCast(d_to, OddEven(BroadcastSignBit(v), DupOdd(v)));
3584
+ }
3585
+
3586
+ } // namespace detail
3587
+
3588
+ // ------------------------------ Demotions (full -> part w/ narrow lanes)
3589
+
3590
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I8_D(D)>
3591
+ HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec256<int16_t> a,
3592
+ Vec256<int16_t> b) {
3593
+ return VFromD<D>{__lasx_xvssrani_b_h(b.raw, a.raw, 0)};
3594
+ }
3595
+
3596
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U8_D(D)>
3597
+ HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec256<int16_t> a,
3598
+ Vec256<int16_t> b) {
3599
+ return VFromD<D>{__lasx_xvssrani_bu_h(b.raw, a.raw, 0)};
3600
+ }
3601
+
3602
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I8_D(D)>
3603
+ HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec256<uint16_t> a,
3604
+ Vec256<uint16_t> b) {
3605
+ return VFromD<D>{__lasx_xvssrlni_b_h(b.raw, a.raw, 0)};
3606
+ }
3607
+
3608
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U8_D(D)>
3609
+ HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec256<uint16_t> a,
3610
+ Vec256<uint16_t> b) {
3611
+ return VFromD<D>{__lasx_xvssrlni_bu_h(b.raw, a.raw, 0)};
3612
+ }
3613
+
3614
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I16_D(D)>
3615
+ HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec256<int32_t> a,
3616
+ Vec256<int32_t> b) {
3617
+ return VFromD<D>{__lasx_xvssrani_h_w(b.raw, a.raw, 0)};
3618
+ }
3619
+
3620
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U16_D(D)>
3621
+ HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec256<int32_t> a,
3622
+ Vec256<int32_t> b) {
3623
+ return VFromD<D>{__lasx_xvssrani_hu_w(b.raw, a.raw, 0)};
3624
+ }
3625
+
3626
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I16_D(D)>
3627
+ HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec256<uint32_t> a,
3628
+ Vec256<uint32_t> b) {
3629
+ return VFromD<D>{__lasx_xvssrlni_h_w(b.raw, a.raw, 0)};
3630
+ }
3631
+
3632
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U16_D(D)>
3633
+ HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec256<uint32_t> a,
3634
+ Vec256<uint32_t> b) {
3635
+ return VFromD<D>{__lasx_xvssrlni_hu_w(b.raw, a.raw, 0)};
3636
+ }
3637
+
3638
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
3639
+ HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec256<int64_t> a,
3640
+ Vec256<int64_t> b) {
3641
+ return VFromD<D>{__lasx_xvssrani_w_d(b.raw, a.raw, 0)};
3642
+ }
3643
+
3644
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
3645
+ HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec256<int64_t> a,
3646
+ Vec256<int64_t> b) {
3647
+ return VFromD<D>{__lasx_xvssrani_wu_d(b.raw, a.raw, 0)};
3648
+ }
3649
+
3650
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
3651
+ HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec256<uint64_t> a,
3652
+ Vec256<uint64_t> b) {
3653
+ return VFromD<D>{__lasx_xvssrlni_w_d(b.raw, a.raw, 0)};
3654
+ }
3655
+
3656
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
3657
+ HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec256<uint64_t> a,
3658
+ Vec256<uint64_t> b) {
3659
+ return VFromD<D>{__lasx_xvssrlni_wu_d(b.raw, a.raw, 0)};
3660
+ }
3661
+
3662
+ template <class D, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>),
3663
+ HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
3664
+ HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
3665
+ HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)>
3666
+ HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
3667
+ return VFromD<D>{__lasx_xvpermi_d(ReorderDemote2To(d, a, b).raw, 0xd8)};
3668
+ }
3669
+
3670
+ template <class D, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D),
3671
+ HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
3672
+ HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
3673
+ HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>))>
3674
+ HWY_API VFromD<D> DemoteTo(D d, V v) {
3675
+ return LowerHalf(OrderedDemote2To(Twice<decltype(d)>(), v, v));
3676
+ }
3677
+
3678
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
3679
+ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<float> v) {
3680
+ const Full256<int16_t> di;
3681
+ const Vec256<hwy::float16_t> f16_blocks{__lasx_xvfcvt_h_s(v.raw, v.raw)};
3682
+ const auto f16_concat =
3683
+ BitCast(Twice<D>(), VFromD<decltype(di)>{__lasx_xvpermi_d(
3684
+ BitCast(di, f16_blocks).raw, 0xd8)});
3685
+ return LowerHalf(f16_concat);
3686
+ }
3687
+
3688
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
3689
+ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<double> v) {
3690
+ const Full256<int32_t> di;
3691
+ const Vec256<float> f32_blocks{__lasx_xvfcvt_s_d(v.raw, v.raw)};
3692
+ const auto f32_concat =
3693
+ BitCast(Twice<D>(), VFromD<decltype(di)>{__lasx_xvpermi_d(
3694
+ BitCast(di, f32_blocks).raw, 0xd8)});
3695
+ return LowerHalf(f32_concat);
3696
+ }
3697
+
3698
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
3699
+ HWY_API VFromD<D> DemoteTo(D dn, Vec256<double> v) {
3700
+ const __m256i i32_blocks = __lasx_xvftintrz_w_d(v.raw, v.raw);
3701
+ return LowerHalf(dn, VFromD<Twice<D>>{__lasx_xvpermi_d(i32_blocks, 0xd8)});
3702
+ }
3703
+
3704
+ // For already range-limited input [0, 255].
3705
+ HWY_API Vec128<uint8_t, 8> U8FromU32(const Vec256<uint32_t> v) {
3706
+ const Full256<uint32_t> d32;
3707
+ const Full64<uint8_t> d8;
3708
+ alignas(32) static constexpr uint32_t k8From32[8] = {
3709
+ 0x0C080400u, 0x13121110u, 0, 0, 0x13121110u, 0x0C080400u, 0, 0};
3710
+ // Place first four bytes in lo[0], remaining 4 in hi[1].
3711
+ const auto quad = VFromD<decltype(d32)>{
3712
+ __lasx_xvshuf_b(Zero(d32).raw, v.raw, Load(d32, k8From32).raw)};
3713
+ // Interleave both quadruplets - OR instead of unpack reduces port5 pressure.
3714
+ const auto lo = LowerHalf(quad);
3715
+ const auto hi = UpperHalf(Half<decltype(d32)>(), quad);
3716
+ return BitCast(d8, LowerHalf(lo | hi));
3717
+ }
3718
+
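Aside: the DemoteTo/ReorderDemote2To ops above narrow with saturation (xvssrani/xvssrlni), whereas U8FromU32 assumes values already in [0, 255] and simply gathers the low bytes. A hedged sketch of saturating int32 -> uint8 narrowing (function name and remainder-free loop are illustrative):

// Sketch: saturating narrowing of int32 to uint8 (values are clamped to
// [0, 255]); U8FromU32 above skips the clamp for already-limited inputs.
#include <cstdint>
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

void I32ToU8(const int32_t* in, uint8_t* out, size_t n) {
  const hn::ScalableTag<int32_t> d32;
  const hn::Rebind<uint8_t, decltype(d32)> d8;  // same lane count, 1-byte lanes
  const size_t N = hn::Lanes(d32);
  for (size_t i = 0; i + N <= n; i += N) {  // remainder handling omitted
    const auto v = hn::LoadU(d32, in + i);
    hn::StoreU(hn::DemoteTo(d8, v), d8, out + i);
  }
}
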
3719
+ // ------------------------------ Truncations
3720
+
3721
+ template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U8_D(D)>
3722
+ HWY_API VFromD<D> TruncateTo(D /* tag */, Vec256<uint64_t> v) {
3723
+ const Full256<uint8_t> d8;
3724
+ alignas(32) static constexpr uint8_t kMap[32] = {0, 8, 16, 24};
3725
+ const auto i8 = TableLookupLanes(BitCast(d8, v), SetTableIndices(d8, kMap));
3726
+ return LowerHalf(LowerHalf(LowerHalf(Vec256<uint8_t>{i8.raw})));
3727
+ }
3728
+
3729
+ template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)>
3730
+ HWY_API VFromD<D> TruncateTo(D /* tag */, Vec256<uint64_t> v) {
3731
+ const __m256i i32_blocks = __lasx_xvpickev_w(v.raw, v.raw);
3732
+ const __m256i i32_concat = __lasx_xvpermi_d(i32_blocks, 0xd8);
3733
+ const __m256i i16 = __lasx_xvpickev_h(i32_concat, i32_concat);
3734
+ return LowerHalf(LowerHalf(Vec256<uint16_t>{i16}));
3735
+ }
3736
+
3737
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
3738
+ HWY_API VFromD<D> TruncateTo(D /* tag */, Vec256<uint64_t> v) {
3739
+ const Full256<uint32_t> d32;
3740
+ alignas(32) static constexpr uint32_t kEven[8] = {0, 2, 4, 6, 0, 2, 4, 6};
3741
+ const auto v32 =
3742
+ TableLookupLanes(BitCast(d32, v), SetTableIndices(d32, kEven));
3743
+ return LowerHalf(Vec256<uint32_t>{v32.raw});
3744
+ }
3745
+
3746
+ template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
3747
+ HWY_API VFromD<D> TruncateTo(D /* tag */, Vec256<uint32_t> v) {
3748
+ const Full256<uint8_t> d8;
3749
+ alignas(32) static constexpr uint8_t kEven[32] = {0, 4, 8, 12,
3750
+ 16, 20, 24, 28};
3751
+ const auto i8 = TableLookupLanes(BitCast(d8, v), SetTableIndices(d8, kEven));
3752
+ return LowerHalf(LowerHalf(Vec256<uint8_t>{i8.raw}));
3753
+ }
3754
+
3755
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
3756
+ HWY_API VFromD<D> TruncateTo(D /* tag */, Vec256<uint32_t> v) {
3757
+ const __m256i i16_blocks = __lasx_xvpickev_h(v.raw, v.raw);
3758
+ const __m256i i16_concat = __lasx_xvpermi_d(i16_blocks, 0xd8);
3759
+ return LowerHalf(Vec256<uint16_t>{i16_concat});
3760
+ }
3761
+
3762
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
3763
+ HWY_API VFromD<D> TruncateTo(D /* tag */, Vec256<uint16_t> v) {
3764
+ const __m256i i8_blocks = __lasx_xvpickev_b(v.raw, v.raw);
3765
+ const __m256i i8_concat = __lasx_xvpermi_d(i8_blocks, 0xd8);
3766
+ return LowerHalf(Vec256<uint8_t>{i8_concat});
3767
+ }
3768
+
3769
+ // ------------------------------ Integer <=> fp (ShiftRight, OddEven)
3770
+
3771
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
3772
+ HWY_API VFromD<D> ConvertTo(D /* tag */, Vec256<int32_t> v) {
3773
+ return VFromD<D>{__lasx_xvffint_s_w(v.raw)};
3774
+ }
3775
+
3776
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
3777
+ HWY_API VFromD<D> ConvertTo(D /*df*/, Vec256<uint32_t> v) {
3778
+ return VFromD<D>{__lasx_xvffint_s_wu(v.raw)};
3779
+ }
3780
+
3781
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
3782
+ HWY_API VFromD<D> ConvertTo(D /*dd*/, Vec256<int64_t> v) {
3783
+ return VFromD<D>{__lasx_xvffint_d_l(v.raw)};
3784
+ }
3785
+
3786
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
3787
+ HWY_API VFromD<D> ConvertTo(D /*dd*/, Vec256<uint64_t> v) {
3788
+ return VFromD<D>{__lasx_xvffint_d_lu(v.raw)};
3789
+ }
3790
+
3791
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
3792
+ HWY_API VFromD<D> ConvertTo(D /*d*/, Vec256<float> v) {
3793
+ return VFromD<D>{__lasx_xvftintrz_w_s(v.raw)};
3794
+ }
3795
+
3796
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I64_D(D)>
3797
+ HWY_API VFromD<D> ConvertTo(D /*di*/, Vec256<double> v) {
3798
+ return VFromD<D>{__lasx_xvftintrz_l_d(v.raw)};
3799
+ }
3800
+
3801
+ template <class DU, HWY_IF_V_SIZE_D(DU, 32), HWY_IF_U32_D(DU)>
3802
+ HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
3803
+ return VFromD<DU>{__lasx_xvftintrz_wu_s(v.raw)};
3804
+ }
3805
+
3806
+ template <class DU, HWY_IF_V_SIZE_D(DU, 32), HWY_IF_U64_D(DU)>
3807
+ HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
3808
+ return VFromD<DU>{__lasx_xvftintrz_lu_d(v.raw)};
3809
+ }
3810
+
3811
+ template <typename T, HWY_IF_FLOAT3264(T)>
3812
+ HWY_API Vec256<MakeSigned<T>> NearestInt(const Vec256<T> v) {
3813
+ return ConvertTo(Full256<MakeSigned<T>>(), Round(v));
3814
+ }
3815
+
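Aside: ConvertTo truncates toward zero (the xvftintrz forms), while NearestInt rounds before converting. A short illustrative sketch:

// Sketch: ConvertTo truncates toward zero; NearestInt rounds to nearest first.
#include <cstdint>
#include <vector>
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

int main() {
  const hn::ScalableTag<float> df;
  const hn::RebindToSigned<decltype(df)> di;
  const size_t N = hn::Lanes(df);
  const auto v = hn::Set(df, 2.7f);
  std::vector<int32_t> out(N);
  hn::StoreU(hn::ConvertTo(di, v), di, out.data());  // 2 in every lane
  hn::StoreU(hn::NearestInt(v), di, out.data());     // 3 in every lane
  return static_cast<int>(out[0]);
}
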
3816
+ // ------------------------------ LoadMaskBits (TestBit)
3817
+
3818
+ namespace detail {
3819
+
3820
+ template <typename T, HWY_IF_T_SIZE(T, 1)>
3821
+ HWY_INLINE Mask256<T> LoadMaskBits256(uint64_t mask_bits) {
3822
+ const Full256<T> d;
3823
+ const RebindToUnsigned<decltype(d)> du;
3824
+ const Repartition<uint32_t, decltype(d)> du32;
3825
+ const auto vbits = BitCast(du, Set(du32, static_cast<uint32_t>(mask_bits)));
3826
+
3827
+ // Replicate bytes 8x such that each byte contains the bit that governs it.
3828
+ const Repartition<uint64_t, decltype(d)> du64;
3829
+ alignas(32) static constexpr uint64_t kRep8[4] = {
3830
+ 0x0000000000000000ull, 0x0101010101010101ull, 0x0202020202020202ull,
3831
+ 0x0303030303030303ull};
3832
+ const auto rep8 = TableLookupBytes(vbits, BitCast(du, Load(du64, kRep8)));
3833
+
3834
+ const VFromD<decltype(du)> bit = Dup128VecFromValues(
3835
+ du, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
3836
+ return RebindMask(d, TestBit(rep8, bit));
3837
+ }
3838
+
3839
+ template <typename T, HWY_IF_T_SIZE(T, 2)>
3840
+ HWY_INLINE Mask256<T> LoadMaskBits256(uint64_t mask_bits) {
3841
+ const Full256<T> d;
3842
+ const RebindToUnsigned<decltype(d)> du;
3843
+ alignas(32) static constexpr uint16_t kBit[16] = {
3844
+ 1, 2, 4, 8, 16, 32, 64, 128,
3845
+ 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000};
3846
+ const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));
3847
+ return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
3848
+ }
3849
+
3850
+ template <typename T, HWY_IF_T_SIZE(T, 4)>
3851
+ HWY_INLINE Mask256<T> LoadMaskBits256(uint64_t mask_bits) {
3852
+ const Full256<T> d;
3853
+ const RebindToUnsigned<decltype(d)> du;
3854
+ alignas(32) static constexpr uint32_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
3855
+ const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));
3856
+ return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
3857
+ }
3858
+
3859
+ template <typename T, HWY_IF_T_SIZE(T, 8)>
3860
+ HWY_INLINE Mask256<T> LoadMaskBits256(uint64_t mask_bits) {
3861
+ const Full256<T> d;
3862
+ const RebindToUnsigned<decltype(d)> du;
3863
+ alignas(32) static constexpr uint64_t kBit[8] = {1, 2, 4, 8};
3864
+ return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit)));
3865
+ }
3866
+
3867
+ } // namespace detail
3868
+
3869
+ // `p` points to at least 8 readable bytes, not all of which need be valid.
3870
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
3871
+ HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
3872
+ constexpr size_t kN = MaxLanes(d);
3873
+ constexpr size_t kNumBytes = (kN + 7) / 8;
3874
+
3875
+ uint64_t mask_bits = 0;
3876
+ CopyBytes<kNumBytes>(bits, &mask_bits);
3877
+
3878
+ if (kN < 8) {
3879
+ mask_bits &= (1ull << kN) - 1;
3880
+ }
3881
+
3882
+ return detail::LoadMaskBits256<TFromD<D>>(mask_bits);
3883
+ }
3884
+
3885
+ // ------------------------------ BitsFromMask
3886
+
3887
+ template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_D(D, 32)>
3888
+ HWY_API uint64_t BitsFromMask(D /*tag*/, MFromD<D> mask) {
3889
+ const auto sign_bits = __lasx_xvmskltz_b(mask.raw);
3890
+ return static_cast<uint32_t>(__lasx_xvpickve2gr_w(sign_bits, 0) |
3891
+ (__lasx_xvpickve2gr_w(sign_bits, 4) << 16));
3892
+ }
3893
+
3894
+ template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_D(D, 32)>
3895
+ HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
3896
+ const RebindToSigned<decltype(d)> di;
3897
+ const auto vec_mask = VecFromMask(mask);
3898
+ const auto sign_bits =
3899
+ __lasx_xvpickod_b(BitCast(di, vec_mask).raw, BitCast(di, vec_mask).raw);
3900
+ const auto sign_shuf = __lasx_xvpermi_d(sign_bits, 0xd8);
3901
+ const auto sign_last = __lasx_xvmskltz_b(sign_shuf);
3902
+ return static_cast<unsigned>(__lasx_xvpickve2gr_w(sign_last, 0));
3903
+ }
3904
+
3905
+ template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_D(D, 32)>
3906
+ HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
3907
+ const RebindToSigned<decltype(d)> di;
3908
+ const auto vec_mask = VecFromMask(mask);
3909
+ const auto sign_bits =
3910
+ __lasx_xvpickod_h(BitCast(di, vec_mask).raw, BitCast(di, vec_mask).raw);
3911
+ const auto sign_shuf = __lasx_xvpermi_d(sign_bits, 0xd8);
3912
+ const auto sign_last = __lasx_xvmskltz_h(sign_shuf);
3913
+ return static_cast<unsigned>(__lasx_xvpickve2gr_w(sign_last, 0));
3914
+ }
3915
+
3916
+ template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_D(D, 32)>
3917
+ HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
3918
+ const RebindToSigned<decltype(d)> di;
3919
+ const auto vec_mask = VecFromMask(mask);
3920
+ const auto sign_bits =
3921
+ __lasx_xvpickod_w(BitCast(di, vec_mask).raw, BitCast(di, vec_mask).raw);
3922
+ const auto sign_shuf = __lasx_xvpermi_d(sign_bits, 0xd8);
3923
+ const auto sign_last = __lasx_xvmskltz_w(sign_shuf);
3924
+ return static_cast<unsigned>(__lasx_xvpickve2gr_w(sign_last, 0));
3925
+ }
3926
+
3927
+ // ------------------------------ StoreMaskBits
3928
+ // `p` points to at least 8 writable bytes.
3929
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
3930
+ HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
3931
+ constexpr size_t N = MaxLanes(d);
3932
+ constexpr size_t kNumBytes = (N + 7) / 8;
3933
+
3934
+ const uint64_t mask_bits = BitsFromMask(d, mask);
3935
+ CopyBytes<kNumBytes>(&mask_bits, bits);
3936
+ return kNumBytes;
3937
+ }
3938
+
3939
+ // ------------------------------ Mask testing
3940
+
3941
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
3942
+ HWY_API bool AllFalse(D d, MFromD<D> mask) {
3943
+ return BitsFromMask(d, mask) == 0;
3944
+ }
3945
+
3946
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
3947
+ HWY_API bool AllTrue(D d, MFromD<D> mask) {
3948
+ constexpr size_t kN = MaxLanes(d);
3949
+ constexpr uint64_t kAllBits = (1ull << kN) - 1;
3950
+ return BitsFromMask(d, mask) == kAllBits;
3951
+ }
3952
+
3953
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
3954
+ HWY_API size_t CountTrue(D d, MFromD<D> mask) {
3955
+ return PopCount(BitsFromMask(d, mask));
3956
+ }
3957
+
3958
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
3959
+ HWY_API size_t FindKnownFirstTrue(D d, MFromD<D> mask) {
3960
+ const uint32_t mask_bits = static_cast<uint32_t>(BitsFromMask(d, mask));
3961
+ return Num0BitsBelowLS1Bit_Nonzero32(mask_bits);
3962
+ }
3963
+
3964
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
3965
+ HWY_API intptr_t FindFirstTrue(D d, MFromD<D> mask) {
3966
+ const uint32_t mask_bits = static_cast<uint32_t>(BitsFromMask(d, mask));
3967
+ return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1;
3968
+ }
3969
+
3970
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
3971
+ HWY_API size_t FindKnownLastTrue(D d, MFromD<D> mask) {
3972
+ const uint32_t mask_bits = static_cast<uint32_t>(BitsFromMask(d, mask));
3973
+ return 31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits);
3974
+ }
3975
+
3976
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
3977
+ HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) {
3978
+ const uint32_t mask_bits = static_cast<uint32_t>(BitsFromMask(d, mask));
3979
+ return mask_bits ? intptr_t(31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits))
3980
+ : -1;
3981
+ }
3982
+
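Aside: BitsFromMask packs one bit per lane (narrowing the mask lanes and then using xvmskltz), and the mask utilities above are thin wrappers over it. A hedged sketch of typical mask queries (printed values assume at least four lanes):

// Sketch: building a mask with a compare, then querying it.
#include <cstdint>
#include <cstdio>
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

int main() {
  const hn::ScalableTag<int32_t> d;
  const auto v = hn::Iota(d, 0);     // 0,1,2,...
  const auto m = v > hn::Set(d, 2);  // true for lanes 3,4,...
  printf("count=%zu first=%d all=%d\n", hn::CountTrue(d, m),
         static_cast<int>(hn::FindFirstTrue(d, m)),  // 3, or -1 if none true
         static_cast<int>(hn::AllTrue(d, m)));
  uint8_t bits[8] = {0};
  (void)hn::StoreMaskBits(d, m, bits);  // lane i -> bit i of the byte array
  return 0;
}
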
3983
+ // ------------------------------ Compress, CompressBits
3984
+
3985
+ namespace detail {
3986
+
3987
+ template <typename T, HWY_IF_T_SIZE(T, 4)>
3988
+ HWY_INLINE Vec256<uint32_t> IndicesFromBits256(uint64_t mask_bits) {
3989
+ const Full256<uint32_t> d32;
3990
+ // We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT
3991
+ // of SetTableIndices would require 8 KiB, a large part of L1D. We instead
3992
+ // compress each index into 4 bits, for a total of 1 KiB.
3993
+ alignas(16) static constexpr uint32_t packed_array[256] = {
3994
+ // PrintCompress32x8Tables
3995
+ 0x76543210, 0x76543218, 0x76543209, 0x76543298, 0x7654310a, 0x765431a8,
3996
+ 0x765430a9, 0x76543a98, 0x7654210b, 0x765421b8, 0x765420b9, 0x76542b98,
3997
+ 0x765410ba, 0x76541ba8, 0x76540ba9, 0x7654ba98, 0x7653210c, 0x765321c8,
3998
+ 0x765320c9, 0x76532c98, 0x765310ca, 0x76531ca8, 0x76530ca9, 0x7653ca98,
3999
+ 0x765210cb, 0x76521cb8, 0x76520cb9, 0x7652cb98, 0x76510cba, 0x7651cba8,
4000
+ 0x7650cba9, 0x765cba98, 0x7643210d, 0x764321d8, 0x764320d9, 0x76432d98,
4001
+ 0x764310da, 0x76431da8, 0x76430da9, 0x7643da98, 0x764210db, 0x76421db8,
4002
+ 0x76420db9, 0x7642db98, 0x76410dba, 0x7641dba8, 0x7640dba9, 0x764dba98,
4003
+ 0x763210dc, 0x76321dc8, 0x76320dc9, 0x7632dc98, 0x76310dca, 0x7631dca8,
4004
+ 0x7630dca9, 0x763dca98, 0x76210dcb, 0x7621dcb8, 0x7620dcb9, 0x762dcb98,
4005
+ 0x7610dcba, 0x761dcba8, 0x760dcba9, 0x76dcba98, 0x7543210e, 0x754321e8,
4006
+ 0x754320e9, 0x75432e98, 0x754310ea, 0x75431ea8, 0x75430ea9, 0x7543ea98,
4007
+ 0x754210eb, 0x75421eb8, 0x75420eb9, 0x7542eb98, 0x75410eba, 0x7541eba8,
4008
+ 0x7540eba9, 0x754eba98, 0x753210ec, 0x75321ec8, 0x75320ec9, 0x7532ec98,
4009
+ 0x75310eca, 0x7531eca8, 0x7530eca9, 0x753eca98, 0x75210ecb, 0x7521ecb8,
4010
+ 0x7520ecb9, 0x752ecb98, 0x7510ecba, 0x751ecba8, 0x750ecba9, 0x75ecba98,
4011
+ 0x743210ed, 0x74321ed8, 0x74320ed9, 0x7432ed98, 0x74310eda, 0x7431eda8,
4012
+ 0x7430eda9, 0x743eda98, 0x74210edb, 0x7421edb8, 0x7420edb9, 0x742edb98,
4013
+ 0x7410edba, 0x741edba8, 0x740edba9, 0x74edba98, 0x73210edc, 0x7321edc8,
4014
+ 0x7320edc9, 0x732edc98, 0x7310edca, 0x731edca8, 0x730edca9, 0x73edca98,
4015
+ 0x7210edcb, 0x721edcb8, 0x720edcb9, 0x72edcb98, 0x710edcba, 0x71edcba8,
4016
+ 0x70edcba9, 0x7edcba98, 0x6543210f, 0x654321f8, 0x654320f9, 0x65432f98,
4017
+ 0x654310fa, 0x65431fa8, 0x65430fa9, 0x6543fa98, 0x654210fb, 0x65421fb8,
4018
+ 0x65420fb9, 0x6542fb98, 0x65410fba, 0x6541fba8, 0x6540fba9, 0x654fba98,
4019
+ 0x653210fc, 0x65321fc8, 0x65320fc9, 0x6532fc98, 0x65310fca, 0x6531fca8,
4020
+ 0x6530fca9, 0x653fca98, 0x65210fcb, 0x6521fcb8, 0x6520fcb9, 0x652fcb98,
4021
+ 0x6510fcba, 0x651fcba8, 0x650fcba9, 0x65fcba98, 0x643210fd, 0x64321fd8,
4022
+ 0x64320fd9, 0x6432fd98, 0x64310fda, 0x6431fda8, 0x6430fda9, 0x643fda98,
4023
+ 0x64210fdb, 0x6421fdb8, 0x6420fdb9, 0x642fdb98, 0x6410fdba, 0x641fdba8,
4024
+ 0x640fdba9, 0x64fdba98, 0x63210fdc, 0x6321fdc8, 0x6320fdc9, 0x632fdc98,
4025
+ 0x6310fdca, 0x631fdca8, 0x630fdca9, 0x63fdca98, 0x6210fdcb, 0x621fdcb8,
4026
+ 0x620fdcb9, 0x62fdcb98, 0x610fdcba, 0x61fdcba8, 0x60fdcba9, 0x6fdcba98,
4027
+ 0x543210fe, 0x54321fe8, 0x54320fe9, 0x5432fe98, 0x54310fea, 0x5431fea8,
4028
+ 0x5430fea9, 0x543fea98, 0x54210feb, 0x5421feb8, 0x5420feb9, 0x542feb98,
4029
+ 0x5410feba, 0x541feba8, 0x540feba9, 0x54feba98, 0x53210fec, 0x5321fec8,
4030
+ 0x5320fec9, 0x532fec98, 0x5310feca, 0x531feca8, 0x530feca9, 0x53feca98,
4031
+ 0x5210fecb, 0x521fecb8, 0x520fecb9, 0x52fecb98, 0x510fecba, 0x51fecba8,
4032
+ 0x50fecba9, 0x5fecba98, 0x43210fed, 0x4321fed8, 0x4320fed9, 0x432fed98,
4033
+ 0x4310feda, 0x431feda8, 0x430feda9, 0x43feda98, 0x4210fedb, 0x421fedb8,
4034
+ 0x420fedb9, 0x42fedb98, 0x410fedba, 0x41fedba8, 0x40fedba9, 0x4fedba98,
4035
+ 0x3210fedc, 0x321fedc8, 0x320fedc9, 0x32fedc98, 0x310fedca, 0x31fedca8,
4036
+ 0x30fedca9, 0x3fedca98, 0x210fedcb, 0x21fedcb8, 0x20fedcb9, 0x2fedcb98,
4037
+ 0x10fedcba, 0x1fedcba8, 0x0fedcba9, 0xfedcba98};
4038
+
4039
+ // No need to mask because __lasx_xvperm_w ignores bits 3..31.
4040
+ // Just shift each copy of the 32 bit LUT to extract its 4-bit fields.
4041
+ const auto packed = Set(d32, packed_array[mask_bits]);
4042
+ alignas(32) static constexpr uint32_t shifts[8] = {0, 4, 8, 12,
4043
+ 16, 20, 24, 28};
4044
+ return packed >> Load(d32, shifts);
4045
+ }
4046
+
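To see how one packed entry is consumed, here is a minimal scalar sketch that decodes packed_array[3] (mask_bits = 0b00000011) from the table above; the bit-3 flag interpretation is the one CompressBlendedStore below relies on:

  #include <cstdint>
  #include <cstdio>

  int main() {
    const uint32_t packed = 0x76543298u;  // packed_array[0b00000011]
    for (int lane = 0; lane < 8; ++lane) {
      const uint32_t nibble = (packed >> (4 * lane)) & 0xFu;
      // __lasx_xvperm_w ignores bits 3..31, so nibble & 7 selects the source lane;
      // bit 3 is set for output lanes that receive a compressed (mask=true) element.
      std::printf("out lane %d <- src lane %u, true-lane flag %u\n", lane,
                  nibble & 7u, nibble >> 3);
    }
    return 0;
  }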
4047
+ template <typename T, HWY_IF_T_SIZE(T, 8)>
4048
+ HWY_INLINE Vec256<uint64_t> IndicesFromBits256(uint64_t mask_bits) {
4049
+ const Full256<uint64_t> d64;
4050
+
4051
+ // For 64-bit, there are only 4 lanes, so we can afford to load the
4052
+ // entire index vector directly.
4053
+ alignas(32) static constexpr uint64_t u64_indices[64] = {
4054
+ // PrintCompress64x4PairTables
4055
+ 0, 1, 2, 3, 8, 1, 2, 3, 9, 0, 2, 3, 8, 9, 2, 3,
4056
+ 10, 0, 1, 3, 8, 10, 1, 3, 9, 10, 0, 3, 8, 9, 10, 3,
4057
+ 11, 0, 1, 2, 8, 11, 1, 2, 9, 11, 0, 2, 8, 9, 11, 2,
4058
+ 10, 11, 0, 1, 8, 10, 11, 1, 9, 10, 11, 0, 8, 9, 10, 11};
4059
+ return Load(d64, u64_indices + 4 * mask_bits);
4060
+ }
4061
+
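For example, mask_bits = 0b0101 (lanes 0 and 2 true) selects u64_indices + 4 * 5 = {8, 10, 1, 3}: the two true lanes are moved to the front (offset by 8, i.e. flagged via bit 3 as in the 32-bit table), followed by the false lanes 1 and 3 in their original order.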
4062
+ template <typename T, HWY_IF_T_SIZE(T, 4)>
4063
+ HWY_INLINE Vec256<uint32_t> IndicesFromNotBits256(uint64_t mask_bits) {
4064
+ const Full256<uint32_t> d32;
4065
+ // We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT
4066
+ // of SetTableIndices would require 8 KiB, a large part of L1D. We instead
4067
+ // compress each index into 4 bits, for a total of 1 KiB.
4068
+ alignas(16) static constexpr uint32_t packed_array[256] = {
4069
+ // PrintCompressNot32x8Tables
4070
+ 0xfedcba98, 0x8fedcba9, 0x9fedcba8, 0x98fedcba, 0xafedcb98, 0xa8fedcb9,
4071
+ 0xa9fedcb8, 0xa98fedcb, 0xbfedca98, 0xb8fedca9, 0xb9fedca8, 0xb98fedca,
4072
+ 0xbafedc98, 0xba8fedc9, 0xba9fedc8, 0xba98fedc, 0xcfedba98, 0xc8fedba9,
4073
+ 0xc9fedba8, 0xc98fedba, 0xcafedb98, 0xca8fedb9, 0xca9fedb8, 0xca98fedb,
4074
+ 0xcbfeda98, 0xcb8feda9, 0xcb9feda8, 0xcb98feda, 0xcbafed98, 0xcba8fed9,
4075
+ 0xcba9fed8, 0xcba98fed, 0xdfecba98, 0xd8fecba9, 0xd9fecba8, 0xd98fecba,
4076
+ 0xdafecb98, 0xda8fecb9, 0xda9fecb8, 0xda98fecb, 0xdbfeca98, 0xdb8feca9,
4077
+ 0xdb9feca8, 0xdb98feca, 0xdbafec98, 0xdba8fec9, 0xdba9fec8, 0xdba98fec,
4078
+ 0xdcfeba98, 0xdc8feba9, 0xdc9feba8, 0xdc98feba, 0xdcafeb98, 0xdca8feb9,
4079
+ 0xdca9feb8, 0xdca98feb, 0xdcbfea98, 0xdcb8fea9, 0xdcb9fea8, 0xdcb98fea,
4080
+ 0xdcbafe98, 0xdcba8fe9, 0xdcba9fe8, 0xdcba98fe, 0xefdcba98, 0xe8fdcba9,
4081
+ 0xe9fdcba8, 0xe98fdcba, 0xeafdcb98, 0xea8fdcb9, 0xea9fdcb8, 0xea98fdcb,
4082
+ 0xebfdca98, 0xeb8fdca9, 0xeb9fdca8, 0xeb98fdca, 0xebafdc98, 0xeba8fdc9,
4083
+ 0xeba9fdc8, 0xeba98fdc, 0xecfdba98, 0xec8fdba9, 0xec9fdba8, 0xec98fdba,
4084
+ 0xecafdb98, 0xeca8fdb9, 0xeca9fdb8, 0xeca98fdb, 0xecbfda98, 0xecb8fda9,
4085
+ 0xecb9fda8, 0xecb98fda, 0xecbafd98, 0xecba8fd9, 0xecba9fd8, 0xecba98fd,
4086
+ 0xedfcba98, 0xed8fcba9, 0xed9fcba8, 0xed98fcba, 0xedafcb98, 0xeda8fcb9,
4087
+ 0xeda9fcb8, 0xeda98fcb, 0xedbfca98, 0xedb8fca9, 0xedb9fca8, 0xedb98fca,
4088
+ 0xedbafc98, 0xedba8fc9, 0xedba9fc8, 0xedba98fc, 0xedcfba98, 0xedc8fba9,
4089
+ 0xedc9fba8, 0xedc98fba, 0xedcafb98, 0xedca8fb9, 0xedca9fb8, 0xedca98fb,
4090
+ 0xedcbfa98, 0xedcb8fa9, 0xedcb9fa8, 0xedcb98fa, 0xedcbaf98, 0xedcba8f9,
4091
+ 0xedcba9f8, 0xedcba98f, 0xfedcba98, 0xf8edcba9, 0xf9edcba8, 0xf98edcba,
4092
+ 0xfaedcb98, 0xfa8edcb9, 0xfa9edcb8, 0xfa98edcb, 0xfbedca98, 0xfb8edca9,
4093
+ 0xfb9edca8, 0xfb98edca, 0xfbaedc98, 0xfba8edc9, 0xfba9edc8, 0xfba98edc,
4094
+ 0xfcedba98, 0xfc8edba9, 0xfc9edba8, 0xfc98edba, 0xfcaedb98, 0xfca8edb9,
4095
+ 0xfca9edb8, 0xfca98edb, 0xfcbeda98, 0xfcb8eda9, 0xfcb9eda8, 0xfcb98eda,
4096
+ 0xfcbaed98, 0xfcba8ed9, 0xfcba9ed8, 0xfcba98ed, 0xfdecba98, 0xfd8ecba9,
4097
+ 0xfd9ecba8, 0xfd98ecba, 0xfdaecb98, 0xfda8ecb9, 0xfda9ecb8, 0xfda98ecb,
4098
+ 0xfdbeca98, 0xfdb8eca9, 0xfdb9eca8, 0xfdb98eca, 0xfdbaec98, 0xfdba8ec9,
4099
+ 0xfdba9ec8, 0xfdba98ec, 0xfdceba98, 0xfdc8eba9, 0xfdc9eba8, 0xfdc98eba,
4100
+ 0xfdcaeb98, 0xfdca8eb9, 0xfdca9eb8, 0xfdca98eb, 0xfdcbea98, 0xfdcb8ea9,
4101
+ 0xfdcb9ea8, 0xfdcb98ea, 0xfdcbae98, 0xfdcba8e9, 0xfdcba9e8, 0xfdcba98e,
4102
+ 0xfedcba98, 0xfe8dcba9, 0xfe9dcba8, 0xfe98dcba, 0xfeadcb98, 0xfea8dcb9,
4103
+ 0xfea9dcb8, 0xfea98dcb, 0xfebdca98, 0xfeb8dca9, 0xfeb9dca8, 0xfeb98dca,
4104
+ 0xfebadc98, 0xfeba8dc9, 0xfeba9dc8, 0xfeba98dc, 0xfecdba98, 0xfec8dba9,
4105
+ 0xfec9dba8, 0xfec98dba, 0xfecadb98, 0xfeca8db9, 0xfeca9db8, 0xfeca98db,
4106
+ 0xfecbda98, 0xfecb8da9, 0xfecb9da8, 0xfecb98da, 0xfecbad98, 0xfecba8d9,
4107
+ 0xfecba9d8, 0xfecba98d, 0xfedcba98, 0xfed8cba9, 0xfed9cba8, 0xfed98cba,
4108
+ 0xfedacb98, 0xfeda8cb9, 0xfeda9cb8, 0xfeda98cb, 0xfedbca98, 0xfedb8ca9,
4109
+ 0xfedb9ca8, 0xfedb98ca, 0xfedbac98, 0xfedba8c9, 0xfedba9c8, 0xfedba98c,
4110
+ 0xfedcba98, 0xfedc8ba9, 0xfedc9ba8, 0xfedc98ba, 0xfedcab98, 0xfedca8b9,
4111
+ 0xfedca9b8, 0xfedca98b, 0xfedcba98, 0xfedcb8a9, 0xfedcb9a8, 0xfedcb98a,
4112
+ 0xfedcba98, 0xfedcba89, 0xfedcba98, 0xfedcba98};
4113
+
4114
+ // No need to mask because __lasx_xvperm_w ignores bits 3..31.
4115
+ // Just shift each copy of the 32 bit LUT to extract its 4-bit fields.
4116
+ const Vec256<uint32_t> packed = Set(d32, packed_array[mask_bits]);
4117
+ alignas(32) static constexpr uint32_t shifts[8] = {0, 4, 8, 12,
4118
+ 16, 20, 24, 28};
4119
+ return packed >> Load(d32, shifts);
4120
+ }
4121
+
4122
+ template <typename T, HWY_IF_T_SIZE(T, 8)>
4123
+ HWY_INLINE Vec256<uint64_t> IndicesFromNotBits256(uint64_t mask_bits) {
4124
+ const Full256<uint64_t> d64;
4125
+
4126
+ // For 64-bit, there are only 4 lanes, so we can afford to load
4127
+ // the entire index vector directly.
4128
+ alignas(32) static constexpr uint64_t u64_indices[64] = {
4129
+ // PrintCompressNot64x4PairTables
4130
+ 8, 9, 10, 11, 9, 10, 11, 0, 8, 10, 11, 1, 10, 11, 0, 1,
4131
+ 8, 9, 11, 2, 9, 11, 0, 2, 8, 11, 1, 2, 11, 0, 1, 2,
4132
+ 8, 9, 10, 3, 9, 10, 0, 3, 8, 10, 1, 3, 10, 0, 1, 3,
4133
+ 8, 9, 2, 3, 9, 0, 2, 3, 8, 1, 2, 3, 0, 1, 2, 3};
4134
+ return Load(d64, u64_indices + 4 * mask_bits);
4135
+ }
4136
+
4137
+ template <typename T, HWY_IF_NOT_T_SIZE(T, 2)>
4138
+ HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) {
4139
+ const DFromV<decltype(v)> d;
4140
+ const RebindToSigned<decltype(d)> di;
4141
+
4142
+ HWY_DASSERT(mask_bits < (1ull << Lanes(d)));
4143
+ const Indices256<TFromD<decltype(di)>> indices{
4144
+ IndicesFromBits256<T>(mask_bits).raw};
4145
+ return BitCast(d, TableLookupLanes(BitCast(di, v), indices));
4146
+ }
4147
+
4148
+ // LUTs are infeasible for 2^16 possible masks, so splice together two
4149
+ // half-vector Compress.
4150
+ template <typename T, HWY_IF_T_SIZE(T, 2)>
4151
+ HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) {
4152
+ const DFromV<decltype(v)> d;
4153
+ const RebindToUnsigned<decltype(d)> du;
4154
+ const auto vu16 = BitCast(du, v); // (required for float16_t inputs)
4155
+ const Half<decltype(du)> duh;
4156
+ const auto half0 = LowerHalf(duh, vu16);
4157
+ const auto half1 = UpperHalf(duh, vu16);
4158
+
4159
+ const uint64_t mask_bits0 = mask_bits & 0xFF;
4160
+ const uint64_t mask_bits1 = mask_bits >> 8;
4161
+ const auto compressed0 = detail::CompressBits(half0, mask_bits0);
4162
+ const auto compressed1 = detail::CompressBits(half1, mask_bits1);
4163
+
4164
+ alignas(32) uint16_t all_true[16] = {};
4165
+ // Store mask=true lanes, left to right.
4166
+ const size_t num_true0 = PopCount(mask_bits0);
4167
+ Store(compressed0, duh, all_true);
4168
+ StoreU(compressed1, duh, all_true + num_true0);
4169
+
4170
+ if (hwy::HWY_NAMESPACE::CompressIsPartition<T>::value) {
4171
+ // Store mask=false lanes, right to left. The second vector fills the upper
4172
+ // half with right-aligned false lanes. The first vector is shifted
4173
+ // rightwards to overwrite the true lanes of the second.
4174
+ alignas(32) uint16_t all_false[16] = {};
4175
+ const size_t num_true1 = PopCount(mask_bits1);
4176
+ Store(compressed1, duh, all_false + 8);
4177
+ StoreU(compressed0, duh, all_false + num_true1);
4178
+
4179
+ const auto mask = FirstN(du, num_true0 + num_true1);
4180
+ return BitCast(d,
4181
+ IfThenElse(mask, Load(du, all_true), Load(du, all_false)));
4182
+ } else {
4183
+ // Only care about the mask=true lanes.
4184
+ return BitCast(d, Load(du, all_true));
4185
+ }
4186
+ }
4187
+
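When CompressIsPartition<T>::value holds, the spliced result matches the following scalar model: the mask=true lanes first, in order, followed by the mask=false lanes (the splice above also keeps those in order, though callers should only rely on the true-lane prefix):

  #include <cstdint>
  #include <cstdio>

  int main() {
    uint16_t lanes[16], out[16];
    for (int i = 0; i < 16; ++i) lanes[i] = static_cast<uint16_t>(i);
    const uint32_t mask_bits = 0x0305u;  // lanes 0, 2, 8, 9 are true
    int pos = 0;
    for (int i = 0; i < 16; ++i) {
      if (mask_bits & (1u << i)) out[pos++] = lanes[i];  // true lanes, left to right
    }
    for (int i = 0; i < 16; ++i) {
      if (!(mask_bits & (1u << i))) out[pos++] = lanes[i];  // then false lanes
    }
    for (int i = 0; i < 16; ++i) std::printf("%u ", static_cast<unsigned>(out[i]));
    std::printf("\n");  // 0 2 8 9 1 3 4 5 6 7 10 11 12 13 14 15
    return 0;
  }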
4188
+ template <typename T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))>
4189
+ HWY_INLINE Vec256<T> CompressNot(Vec256<T> v, const uint64_t mask_bits) {
4190
+ const DFromV<decltype(v)> d;
4191
+ const RebindToSigned<decltype(d)> di;
4192
+
4193
+ HWY_DASSERT(mask_bits < (1ull << Lanes(d)));
4194
+ const Indices256<TFromD<decltype(di)>> indices{
4195
+ IndicesFromNotBits256<T>(mask_bits).raw};
4196
+ return BitCast(d, TableLookupLanes(BitCast(di, v), indices));
4197
+ }
4198
+
4199
+ // LUTs are infeasible for 2^16 possible masks, so splice together two
4200
+ // half-vector Compress.
4201
+ template <typename T, HWY_IF_T_SIZE(T, 2)>
4202
+ HWY_INLINE Vec256<T> CompressNot(Vec256<T> v, const uint64_t mask_bits) {
4203
+ // Compress ensures only the lower 16 bits are set, so flip those.
4204
+ return Compress(v, mask_bits ^ 0xFFFF);
4205
+ }
4206
+
4207
+ } // namespace detail
4208
+
4209
+ template <typename T, HWY_IF_NOT_T_SIZE(T, 1)>
4210
+ HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> m) {
4211
+ const DFromV<decltype(v)> d;
4212
+ return detail::Compress(v, BitsFromMask(d, m));
4213
+ }
4214
+
4215
+ template <typename T, HWY_IF_NOT_T_SIZE(T, 1)>
4216
+ HWY_API Vec256<T> CompressNot(Vec256<T> v, Mask256<T> m) {
4217
+ const DFromV<decltype(v)> d;
4218
+ return detail::CompressNot(v, BitsFromMask(d, m));
4219
+ }
4220
+
4221
+ HWY_API Vec256<uint64_t> CompressBlocksNot(Vec256<uint64_t> v,
4222
+ Mask256<uint64_t> mask) {
4223
+ return CompressNot(v, mask);
4224
+ }
4225
+
4226
+ template <typename T, HWY_IF_NOT_T_SIZE(T, 1)>
4227
+ HWY_API Vec256<T> CompressBits(Vec256<T> v, const uint8_t* HWY_RESTRICT bits) {
4228
+ constexpr size_t N = 32 / sizeof(T);
4229
+ constexpr size_t kNumBytes = (N + 7) / 8;
4230
+
4231
+ uint64_t mask_bits = 0;
4232
+ CopyBytes<kNumBytes>(bits, &mask_bits);
4233
+
4234
+ if (N < 8) {
4235
+ mask_bits &= (1ull << N) - 1;
4236
+ }
4237
+
4238
+ return detail::Compress(v, mask_bits);
4239
+ }
4240
+
4241
+ // ------------------------------ CompressStore, CompressBitsStore
4242
+
4243
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_T_SIZE_D(D, 1)>
4244
+ HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> m, D d,
4245
+ TFromD<D>* HWY_RESTRICT unaligned) {
4246
+ const uint64_t mask_bits = BitsFromMask(d, m);
4247
+ const size_t count = PopCount(mask_bits);
4248
+ StoreU(detail::Compress(v, mask_bits), d, unaligned);
4249
+ detail::MaybeUnpoison(unaligned, count);
4250
+ return count;
4251
+ }
4252
+
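A typical caller of CompressStore is a stream-filter loop. A minimal static-dispatch sketch follows; KeepPositive and the assumption that n is a multiple of Lanes(d) are illustrative, not from this header:

  #include <cstddef>
  #include "hwy/highway.h"

  namespace hn = hwy::HWY_NAMESPACE;

  // Writes the strictly positive elements of in[0, n) to out and returns the
  // count. out must have room for n elements because CompressStore may write a
  // full vector starting at out + written.
  size_t KeepPositive(const float* in, size_t n, float* out) {
    const hn::ScalableTag<float> d;
    const size_t lanes = hn::Lanes(d);
    size_t written = 0;
    for (size_t i = 0; i < n; i += lanes) {
      const auto v = hn::LoadU(d, in + i);
      const auto m = hn::Gt(v, hn::Zero(d));
      written += hn::CompressStore(v, m, d, out + written);  // number of lanes written
    }
    return written;
  }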
4253
+ template <class D, HWY_IF_V_SIZE_D(D, 32),
4254
+ HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
4255
+ HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
4256
+ TFromD<D>* HWY_RESTRICT unaligned) {
4257
+ const uint64_t mask_bits = BitsFromMask(d, m);
4258
+ const size_t count = PopCount(mask_bits);
4259
+ using TU = MakeUnsigned<TFromD<D>>;
4260
+
4261
+ const RebindToUnsigned<decltype(d)> du;
4262
+ HWY_DASSERT(mask_bits < (1ull << Lanes(d)));
4263
+ const Vec256<TU> idx_mask = detail::IndicesFromBits256<TFromD<D>>(mask_bits);
4264
+ // Shift each index's bit 3 (set only for lanes holding a compressed true lane)
+ // into the lane's sign bit.
4265
+ const auto shiftVal = sizeof(TU) == 4 ? 28 : 60;
4266
+ const Mask256<TU> mask32or64 = MaskFromVec(ShiftLeft<shiftVal>(idx_mask));
4267
+ const Mask256<TU> masku{sizeof(TU) == 4 ? __lasx_xvslti_w(mask32or64.raw, 0)
4268
+ : __lasx_xvslti_d(mask32or64.raw, 0)};
4269
+ const MFromD<D> mask = RebindMask(d, masku);
4270
+ const VFromD<D> compressed = BitCast(
4271
+ d, TableLookupLanes(BitCast(du, v), Indices256<TU>{idx_mask.raw}));
4272
+
4273
+ BlendedStore(compressed, mask, d, unaligned);
4274
+ detail::MaybeUnpoison(unaligned, count);
4275
+ return count;
4276
+ }
4277
+
4278
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
4279
+ HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
4280
+ TFromD<D>* HWY_RESTRICT unaligned) {
4281
+ const uint64_t mask_bits = BitsFromMask(d, m);
4282
+ const size_t count = PopCount(mask_bits);
4283
+ const VFromD<D> compressed = detail::Compress(v, mask_bits);
4284
+ BlendedStore(compressed, FirstN(d, count), d, unaligned);
4285
+ return count;
4286
+ }
4287
+
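Unlike CompressStore, CompressBlendedStore writes only the first CountTrue(m) lanes, which makes it suitable for the final partial vector of such a loop. A sketch; KeepPositiveTail is illustrative and assumes the LoadN tail-load helper available in recent Highway versions:

  #include <cstddef>
  #include "hwy/highway.h"

  namespace hn = hwy::HWY_NAMESPACE;

  // Handles the last `remaining` (< Lanes(d)) elements; only CountTrue(m)
  // lanes are written to out, so no slack space is required there.
  size_t KeepPositiveTail(const float* in, size_t remaining, float* out) {
    const hn::ScalableTag<float> d;
    const auto v = hn::LoadN(d, in, remaining);   // lanes >= remaining are zero
    const auto valid = hn::FirstN(d, remaining);
    const auto m = hn::And(valid, hn::Gt(v, hn::Zero(d)));
    return hn::CompressBlendedStore(v, m, d, out);
  }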
4288
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_T_SIZE_D(D, 1)>
4289
+ HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
4290
+ D d, TFromD<D>* HWY_RESTRICT unaligned) {
4291
+ constexpr size_t N = MaxLanes(d);
4292
+ constexpr size_t kNumBytes = (N + 7) / 8;
4293
+
4294
+ uint64_t mask_bits = 0;
4295
+ CopyBytes<kNumBytes>(bits, &mask_bits);
4296
+
4297
+ if (N < 8) {
4298
+ mask_bits &= (1ull << N) - 1;
4299
+ }
4300
+ const size_t count = PopCount(mask_bits);
4301
+
4302
+ StoreU(detail::Compress(v, mask_bits), d, unaligned);
4303
+ detail::MaybeUnpoison(unaligned, count);
4304
+ return count;
4305
+ }
4306
+
4307
+ // ------------------------------ Dup128MaskFromMaskBits
4308
+
4309
+ // Generic for all vector lengths >= 32 bytes
4310
+ template <class D, HWY_IF_V_SIZE_GT_D(D, 16)>
4311
+ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
4312
+ const Half<decltype(d)> dh;
4313
+ const auto mh = Dup128MaskFromMaskBits(dh, mask_bits);
4314
+ return CombineMasks(d, mh, mh);
4315
+ }
4316
+
4317
+ // ------------------------------ Expand
4318
+
4319
+ template <typename T, HWY_IF_T_SIZE(T, 1)>
4320
+ HWY_API Vec256<T> Expand(Vec256<T> v, Mask256<T> mask) {
4321
+ const DFromV<decltype(v)> d;
4322
+ // LUTs are infeasible for so many mask combinations, so Combine two
4323
+ // half-vector Expand.
4324
+ const Half<decltype(d)> dh;
4325
+ const uint64_t mask_bits = BitsFromMask(d, mask);
4326
+ constexpr size_t N = 32 / sizeof(T);
4327
+ const size_t countL = PopCount(mask_bits & ((1 << (N / 2)) - 1));
4328
+ const Mask128<T> maskL = MaskFromVec(LowerHalf(VecFromMask(d, mask)));
4329
+ const Vec128<T> expandL = Expand(LowerHalf(v), maskL);
4330
+
4331
+ alignas(32) T lanes[N];
4332
+ Store(v, d, lanes);
4333
+ const Mask128<T> maskH = MaskFromVec(UpperHalf(dh, VecFromMask(d, mask)));
4334
+ const Vec128<T> expandH = Expand(LoadU(dh, lanes + countL), maskH);
4335
+ return Combine(d, expandH, expandL);
4336
+ }
4337
+
4338
+ template <typename T, HWY_IF_T_SIZE(T, 2)>
4339
+ HWY_API Vec256<T> Expand(Vec256<T> v, Mask256<T> mask) {
4340
+ const Full256<T> d;
4341
+ // LUTs are infeasible for 2^16 possible masks, so splice together two
4342
+ // half-vector Expand.
4343
+ const Half<decltype(d)> dh;
4344
+ const Mask128<T> maskL = MaskFromVec(LowerHalf(VecFromMask(d, mask)));
4345
+ const Vec128<T> expandL = Expand(LowerHalf(v), maskL);
4346
+
4347
+ alignas(32) T lanes[32 / sizeof(T)];
4348
+ Store(v, d, lanes);
4349
+ const Vec128<T> vH = LoadU(dh, lanes + CountTrue(dh, maskL));
4350
+ const Mask128<T> maskH = MaskFromVec(UpperHalf(dh, VecFromMask(d, mask)));
4351
+ const Vec128<T> expandH = Expand(vH, maskH);
4352
+ return Combine(d, expandH, expandL);
4353
+ }
4354
+
4355
+ template <typename T, HWY_IF_T_SIZE(T, 4)>
4356
+ HWY_API Vec256<T> Expand(Vec256<T> v, Mask256<T> mask) {
4357
+ const Full256<T> d;
4358
+ const RebindToUnsigned<decltype(d)> du;
4359
+ const uint64_t mask_bits = BitsFromMask(d, mask);
4360
+ alignas(16) constexpr uint32_t packed_array[256] = {
4361
+ // PrintExpand32x8Nibble.
4362
+ 0xffffffff, 0xfffffff0, 0xffffff0f, 0xffffff10, 0xfffff0ff, 0xfffff1f0,
4363
+ 0xfffff10f, 0xfffff210, 0xffff0fff, 0xffff1ff0, 0xffff1f0f, 0xffff2f10,
4364
+ 0xffff10ff, 0xffff21f0, 0xffff210f, 0xffff3210, 0xfff0ffff, 0xfff1fff0,
4365
+ 0xfff1ff0f, 0xfff2ff10, 0xfff1f0ff, 0xfff2f1f0, 0xfff2f10f, 0xfff3f210,
4366
+ 0xfff10fff, 0xfff21ff0, 0xfff21f0f, 0xfff32f10, 0xfff210ff, 0xfff321f0,
4367
+ 0xfff3210f, 0xfff43210, 0xff0fffff, 0xff1ffff0, 0xff1fff0f, 0xff2fff10,
4368
+ 0xff1ff0ff, 0xff2ff1f0, 0xff2ff10f, 0xff3ff210, 0xff1f0fff, 0xff2f1ff0,
4369
+ 0xff2f1f0f, 0xff3f2f10, 0xff2f10ff, 0xff3f21f0, 0xff3f210f, 0xff4f3210,
4370
+ 0xff10ffff, 0xff21fff0, 0xff21ff0f, 0xff32ff10, 0xff21f0ff, 0xff32f1f0,
4371
+ 0xff32f10f, 0xff43f210, 0xff210fff, 0xff321ff0, 0xff321f0f, 0xff432f10,
4372
+ 0xff3210ff, 0xff4321f0, 0xff43210f, 0xff543210, 0xf0ffffff, 0xf1fffff0,
4373
+ 0xf1ffff0f, 0xf2ffff10, 0xf1fff0ff, 0xf2fff1f0, 0xf2fff10f, 0xf3fff210,
4374
+ 0xf1ff0fff, 0xf2ff1ff0, 0xf2ff1f0f, 0xf3ff2f10, 0xf2ff10ff, 0xf3ff21f0,
4375
+ 0xf3ff210f, 0xf4ff3210, 0xf1f0ffff, 0xf2f1fff0, 0xf2f1ff0f, 0xf3f2ff10,
4376
+ 0xf2f1f0ff, 0xf3f2f1f0, 0xf3f2f10f, 0xf4f3f210, 0xf2f10fff, 0xf3f21ff0,
4377
+ 0xf3f21f0f, 0xf4f32f10, 0xf3f210ff, 0xf4f321f0, 0xf4f3210f, 0xf5f43210,
4378
+ 0xf10fffff, 0xf21ffff0, 0xf21fff0f, 0xf32fff10, 0xf21ff0ff, 0xf32ff1f0,
4379
+ 0xf32ff10f, 0xf43ff210, 0xf21f0fff, 0xf32f1ff0, 0xf32f1f0f, 0xf43f2f10,
4380
+ 0xf32f10ff, 0xf43f21f0, 0xf43f210f, 0xf54f3210, 0xf210ffff, 0xf321fff0,
4381
+ 0xf321ff0f, 0xf432ff10, 0xf321f0ff, 0xf432f1f0, 0xf432f10f, 0xf543f210,
4382
+ 0xf3210fff, 0xf4321ff0, 0xf4321f0f, 0xf5432f10, 0xf43210ff, 0xf54321f0,
4383
+ 0xf543210f, 0xf6543210, 0x0fffffff, 0x1ffffff0, 0x1fffff0f, 0x2fffff10,
4384
+ 0x1ffff0ff, 0x2ffff1f0, 0x2ffff10f, 0x3ffff210, 0x1fff0fff, 0x2fff1ff0,
4385
+ 0x2fff1f0f, 0x3fff2f10, 0x2fff10ff, 0x3fff21f0, 0x3fff210f, 0x4fff3210,
4386
+ 0x1ff0ffff, 0x2ff1fff0, 0x2ff1ff0f, 0x3ff2ff10, 0x2ff1f0ff, 0x3ff2f1f0,
4387
+ 0x3ff2f10f, 0x4ff3f210, 0x2ff10fff, 0x3ff21ff0, 0x3ff21f0f, 0x4ff32f10,
4388
+ 0x3ff210ff, 0x4ff321f0, 0x4ff3210f, 0x5ff43210, 0x1f0fffff, 0x2f1ffff0,
4389
+ 0x2f1fff0f, 0x3f2fff10, 0x2f1ff0ff, 0x3f2ff1f0, 0x3f2ff10f, 0x4f3ff210,
4390
+ 0x2f1f0fff, 0x3f2f1ff0, 0x3f2f1f0f, 0x4f3f2f10, 0x3f2f10ff, 0x4f3f21f0,
4391
+ 0x4f3f210f, 0x5f4f3210, 0x2f10ffff, 0x3f21fff0, 0x3f21ff0f, 0x4f32ff10,
4392
+ 0x3f21f0ff, 0x4f32f1f0, 0x4f32f10f, 0x5f43f210, 0x3f210fff, 0x4f321ff0,
4393
+ 0x4f321f0f, 0x5f432f10, 0x4f3210ff, 0x5f4321f0, 0x5f43210f, 0x6f543210,
4394
+ 0x10ffffff, 0x21fffff0, 0x21ffff0f, 0x32ffff10, 0x21fff0ff, 0x32fff1f0,
4395
+ 0x32fff10f, 0x43fff210, 0x21ff0fff, 0x32ff1ff0, 0x32ff1f0f, 0x43ff2f10,
4396
+ 0x32ff10ff, 0x43ff21f0, 0x43ff210f, 0x54ff3210, 0x21f0ffff, 0x32f1fff0,
4397
+ 0x32f1ff0f, 0x43f2ff10, 0x32f1f0ff, 0x43f2f1f0, 0x43f2f10f, 0x54f3f210,
4398
+ 0x32f10fff, 0x43f21ff0, 0x43f21f0f, 0x54f32f10, 0x43f210ff, 0x54f321f0,
4399
+ 0x54f3210f, 0x65f43210, 0x210fffff, 0x321ffff0, 0x321fff0f, 0x432fff10,
4400
+ 0x321ff0ff, 0x432ff1f0, 0x432ff10f, 0x543ff210, 0x321f0fff, 0x432f1ff0,
4401
+ 0x432f1f0f, 0x543f2f10, 0x432f10ff, 0x543f21f0, 0x543f210f, 0x654f3210,
4402
+ 0x3210ffff, 0x4321fff0, 0x4321ff0f, 0x5432ff10, 0x4321f0ff, 0x5432f1f0,
4403
+ 0x5432f10f, 0x6543f210, 0x43210fff, 0x54321ff0, 0x54321f0f, 0x65432f10,
4404
+ 0x543210ff, 0x654321f0, 0x6543210f, 0x76543210,
4405
+ };
4406
+
4407
+ // For lane i, shift the i-th 4-bit index down to bits [0, 3).
4408
+ const Vec256<uint32_t> packed = Set(du, packed_array[mask_bits]);
4409
+ alignas(32) constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
4410
+ // TableLookupLanes ignores upper bits; avoid bounds-check in IndicesFromVec.
4411
+ const Indices256<uint32_t> indices{(packed >> Load(du, shifts)).raw};
4412
+ const Vec256<uint32_t> expand = TableLookupLanes(BitCast(du, v), indices);
4413
+ // TableLookupLanes cannot also zero masked-off lanes, so do that now.
4414
+ return IfThenElseZero(mask, BitCast(d, expand));
4415
+ }
4416
+
4417
+ template <typename T, HWY_IF_T_SIZE(T, 8)>
4418
+ HWY_API Vec256<T> Expand(Vec256<T> v, Mask256<T> mask) {
4419
+ const Full256<T> d;
4420
+ const RebindToUnsigned<decltype(d)> du;
4421
+ const uint64_t mask_bits = BitsFromMask(d, mask);
4422
+
4423
+ alignas(16) constexpr uint64_t packed_array[16] = {
4424
+ // PrintExpand64x4Nibble.
4425
+ 0x0000ffff, 0x0000fff0, 0x0000ff0f, 0x0000ff10, 0x0000f0ff, 0x0000f1f0,
4426
+ 0x0000f10f, 0x0000f210, 0x00000fff, 0x00001ff0, 0x00001f0f, 0x00002f10,
4427
+ 0x000010ff, 0x000021f0, 0x0000210f, 0x00003210};
4428
+
4429
+ // For lane i, shift the i-th 4-bit index down to bits [0, 2).
4430
+ const Vec256<uint64_t> packed = Set(du, packed_array[mask_bits]);
4431
+ alignas(32) constexpr uint64_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
4432
+ // 64-bit TableLookupLanes on LASX requires IndicesFromVec, which checks
4433
+ // bounds, so clear the upper bits.
4434
+ const Vec256<uint64_t> masked = And(packed >> Load(du, shifts), Set(du, 3));
4435
+ const Indices256<uint64_t> indices = IndicesFromVec(du, masked);
4436
+ const Vec256<uint64_t> expand = TableLookupLanes(BitCast(du, v), indices);
4437
+ // TableLookupLanes cannot also zero masked-off lanes, so do that now.
4438
+ return IfThenElseZero(mask, BitCast(d, expand));
4439
+ }
4440
+
4441
+ // ------------------------------ LoadExpand
4442
+
4443
+ template <class D, HWY_IF_V_SIZE_D(D, 32),
4444
+ HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
4445
+ HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
4446
+ const TFromD<D>* HWY_RESTRICT unaligned) {
4447
+ return Expand(LoadU(d, unaligned), mask);
4448
+ }
4449
+
4450
+ template <class D, HWY_IF_V_SIZE_D(D, 32),
4451
+ HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
4452
+ HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
4453
+ const TFromD<D>* HWY_RESTRICT unaligned) {
4454
+ return Expand(LoadU(d, unaligned), mask);
4455
+ }
4456
+
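LoadExpand is the inverse of the CompressStore filtering pattern: it reads CountTrue(m) contiguous elements and places them into the true lanes, zeroing the rest. A sketch; ExpandPositive is illustrative, and because this overload loads a full vector before expanding, `packed` must stay readable for one vector beyond the last element consumed:

  #include <cstddef>
  #include "hwy/highway.h"

  namespace hn = hwy::HWY_NAMESPACE;

  // Re-inserts values previously compressed out of `dense` into the lanes
  // where dense[i] > 0; all other lanes of out are zero. Assumes n is a
  // multiple of Lanes(d).
  size_t ExpandPositive(const float* packed, const float* dense, size_t n,
                        float* out) {
    const hn::ScalableTag<float> d;
    const size_t lanes = hn::Lanes(d);
    size_t read = 0;
    for (size_t i = 0; i < n; i += lanes) {
      const auto m = hn::Gt(hn::LoadU(d, dense + i), hn::Zero(d));
      const auto v = hn::LoadExpand(m, d, packed + read);
      hn::StoreU(v, d, out + i);
      read += hn::CountTrue(d, m);
    }
    return read;
  }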
4457
+ // ------------------------------ LoadInterleaved3/4
4458
+
4459
+ // Implemented in generic_ops; we just overload LoadTransposedBlocks3/4.
4460
+
4461
+ namespace detail {
4462
+ // Input:
4463
+ // 1 0 (<- first block of unaligned)
4464
+ // 3 2
4465
+ // 5 4
4466
+ // Output:
4467
+ // 3 0
4468
+ // 4 1
4469
+ // 5 2
4470
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
4471
+ HWY_API void LoadTransposedBlocks3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
4472
+ VFromD<D>& A, VFromD<D>& B, VFromD<D>& C) {
4473
+ constexpr size_t N = MaxLanes(d);
4474
+ const VFromD<D> v10 = LoadU(d, unaligned + 0 * N); // 1 0
4475
+ const VFromD<D> v32 = LoadU(d, unaligned + 1 * N);
4476
+ const VFromD<D> v54 = LoadU(d, unaligned + 2 * N);
4477
+
4478
+ A = ConcatUpperLower(d, v32, v10);
4479
+ B = ConcatLowerUpper(d, v54, v10);
4480
+ C = ConcatUpperLower(d, v54, v32);
4481
+ }
4482
+
4483
+ // Input (128-bit blocks):
4484
+ // 1 0 (first block of unaligned)
4485
+ // 3 2
4486
+ // 5 4
4487
+ // 7 6
4488
+ // Output:
4489
+ // 4 0 (LSB of vA)
4490
+ // 5 1
4491
+ // 6 2
4492
+ // 7 3
4493
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
4494
+ HWY_API void LoadTransposedBlocks4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
4495
+ VFromD<D>& vA, VFromD<D>& vB, VFromD<D>& vC,
4496
+ VFromD<D>& vD) {
4497
+ constexpr size_t N = MaxLanes(d);
4498
+ const VFromD<D> v10 = LoadU(d, unaligned + 0 * N);
4499
+ const VFromD<D> v32 = LoadU(d, unaligned + 1 * N);
4500
+ const VFromD<D> v54 = LoadU(d, unaligned + 2 * N);
4501
+ const VFromD<D> v76 = LoadU(d, unaligned + 3 * N);
4502
+
4503
+ vA = ConcatLowerLower(d, v54, v10);
4504
+ vB = ConcatUpperUpper(d, v54, v10);
4505
+ vC = ConcatLowerLower(d, v76, v32);
4506
+ vD = ConcatUpperUpper(d, v76, v32);
4507
+ }
4508
+ } // namespace detail
4509
+
4510
+ // ------------------------------ StoreInterleaved2/3/4 (ConcatUpperLower)
4511
+
4512
+ // Implemented in generic_ops; we just overload StoreTransposedBlocks2/3/4.
4513
+
4514
+ namespace detail {
4515
+ // Input (128-bit blocks):
4516
+ // 2 0 (LSB of i)
4517
+ // 3 1
4518
+ // Output:
4519
+ // 1 0
4520
+ // 3 2
4521
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
4522
+ HWY_API void StoreTransposedBlocks2(VFromD<D> i, VFromD<D> j, D d,
4523
+ TFromD<D>* HWY_RESTRICT unaligned) {
4524
+ constexpr size_t N = MaxLanes(d);
4525
+ const auto out0 = ConcatLowerLower(d, j, i);
4526
+ const auto out1 = ConcatUpperUpper(d, j, i);
4527
+ StoreU(out0, d, unaligned + 0 * N);
4528
+ StoreU(out1, d, unaligned + 1 * N);
4529
+ }
4530
+
4531
+ // Input (128-bit blocks):
4532
+ // 3 0 (LSB of i)
4533
+ // 4 1
4534
+ // 5 2
4535
+ // Output:
4536
+ // 1 0
4537
+ // 3 2
4538
+ // 5 4
4539
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
4540
+ HWY_API void StoreTransposedBlocks3(VFromD<D> i, VFromD<D> j, VFromD<D> k, D d,
4541
+ TFromD<D>* HWY_RESTRICT unaligned) {
4542
+ constexpr size_t N = MaxLanes(d);
4543
+ const auto out0 = ConcatLowerLower(d, j, i);
4544
+ const auto out1 = ConcatUpperLower(d, i, k);
4545
+ const auto out2 = ConcatUpperUpper(d, k, j);
4546
+ StoreU(out0, d, unaligned + 0 * N);
4547
+ StoreU(out1, d, unaligned + 1 * N);
4548
+ StoreU(out2, d, unaligned + 2 * N);
4549
+ }
4550
+
4551
+ // Input (128-bit blocks):
4552
+ // 4 0 (LSB of i)
4553
+ // 5 1
4554
+ // 6 2
4555
+ // 7 3
4556
+ // Output:
4557
+ // 1 0
4558
+ // 3 2
4559
+ // 5 4
4560
+ // 7 6
4561
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
4562
+ HWY_API void StoreTransposedBlocks4(VFromD<D> i, VFromD<D> j, VFromD<D> k,
4563
+ VFromD<D> l, D d,
4564
+ TFromD<D>* HWY_RESTRICT unaligned) {
4565
+ constexpr size_t N = MaxLanes(d);
4566
+ // Write lower halves, then upper.
4567
+ const auto out0 = ConcatLowerLower(d, j, i);
4568
+ const auto out1 = ConcatLowerLower(d, l, k);
4569
+ StoreU(out0, d, unaligned + 0 * N);
4570
+ StoreU(out1, d, unaligned + 1 * N);
4571
+ const auto out2 = ConcatUpperUpper(d, j, i);
4572
+ const auto out3 = ConcatUpperUpper(d, l, k);
4573
+ StoreU(out2, d, unaligned + 2 * N);
4574
+ StoreU(out3, d, unaligned + 3 * N);
4575
+ }
4576
+ } // namespace detail
4577
+
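These overloads supply only the 128-bit block transposes; the lane-level interleaving lives in generic_ops, and callers use the public LoadInterleaved3/StoreInterleaved3 wrappers. A sketch that splits packed RGB bytes into planes; SplitRGB and the multiple-of-Lanes(d) assumption are illustrative:

  #include <cstddef>
  #include <cstdint>
  #include "hwy/highway.h"

  namespace hn = hwy::HWY_NAMESPACE;

  // De-interleaves packed RGB bytes into separate planes.
  void SplitRGB(const uint8_t* rgb, size_t num_pixels, uint8_t* r, uint8_t* g,
                uint8_t* b) {
    const hn::ScalableTag<uint8_t> d;
    const size_t lanes = hn::Lanes(d);
    for (size_t i = 0; i < num_pixels; i += lanes) {
      hn::Vec<decltype(d)> vr, vg, vb;
      hn::LoadInterleaved3(d, rgb + 3 * i, vr, vg, vb);
      hn::StoreU(vr, d, r + i);
      hn::StoreU(vg, d, g + i);
      hn::StoreU(vb, d, b + i);
    }
  }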
4578
+ // ------------------------------ Additional mask logical operations
4579
+
4580
+ namespace detail {
4581
+
4582
+ template <class T>
4583
+ static HWY_INLINE HWY_MAYBE_UNUSED Vec256<T> LasxI256Neg(Vec256<T> v) {
4584
+ const Full256<T> d;
4585
+ const Repartition<uint64_t, decltype(d)> du64;
4586
+
4587
+ const auto vu64 = BitCast(du64, v);
4588
+ const auto vu64_zero = Zero(du64);
4589
+ const auto i128_ne_zero = VecFromMask(du64, Ne128(du64, vu64, vu64_zero));
4590
+ const VFromD<decltype(du64)> i128_neg_result{
4591
+ __lasx_xvsub_q(vu64_zero.raw, vu64.raw)};
4592
+ const VFromD<decltype(du64)> i256_neg_result_as_u64{
4593
+ __lasx_xvadd_q(i128_neg_result.raw,
4594
+ ConcatLowerLower(du64, i128_ne_zero, vu64_zero).raw)};
4595
+
4596
+ return BitCast(d, i256_neg_result_as_u64);
4597
+ }
4598
+
4599
+ } // namespace detail
4600
+
4601
+ template <class T>
4602
+ HWY_API Mask256<T> SetAtOrAfterFirst(Mask256<T> mask) {
4603
+ const Full256<T> d;
4604
+ return Or(mask, MaskFromVec(detail::LasxI256Neg(VecFromMask(d, mask))));
4605
+ }
4606
+
4607
+ template <class T>
4608
+ HWY_API Mask256<T> SetBeforeFirst(Mask256<T> mask) {
4609
+ return Not(SetAtOrAfterFirst(mask));
4610
+ }
4611
+
4612
+ template <class T>
4613
+ HWY_API Mask256<T> SetOnlyFirst(Mask256<T> mask) {
4614
+ const Full256<T> d;
4615
+ const RebindToSigned<decltype(d)> di;
4616
+
4617
+ const auto vmask = BitCast(di, VecFromMask(d, mask));
4618
+ const auto neg_vmask = detail::LasxI256Neg(vmask);
4619
+
4620
+ return MaskFromVec(BitCast(d, Neg(And(vmask, neg_vmask))));
4621
+ }
4622
+
4623
+ template <class T>
4624
+ HWY_API Mask256<T> SetAtOrBeforeFirst(Mask256<T> mask) {
4625
+ const Full256<T> d;
4626
+ constexpr size_t kLanesPerBlock = MaxLanes(d) / 2;
4627
+
4628
+ const auto vmask = VecFromMask(d, mask);
4629
+ const auto vmask_lo = ConcatLowerLower(d, vmask, Zero(d));
4630
+ return SetBeforeFirst(
4631
+ MaskFromVec(CombineShiftRightBytes<(kLanesPerBlock - 1) * sizeof(T)>(
4632
+ d, vmask, vmask_lo)));
4633
+ }
4634
+
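The underlying trick is the integer identity x | -x, which sets every bit at or above the lowest set bit; LasxI256Neg supplies the 256-bit negation, and because mask lanes are all-ones or all-zeros, the bit-level result coincides with the lane-level one. On an 8-lane mask (lane 0 listed first) the four operations behave as follows:

  mask                 0 0 1 0 1 1 0 0
  SetAtOrAfterFirst    0 0 1 1 1 1 1 1
  SetBeforeFirst       1 1 0 0 0 0 0 0
  SetOnlyFirst         0 0 1 0 0 0 0 0
  SetAtOrBeforeFirst   1 1 1 0 0 0 0 0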
4635
+ // ------------------------------ LeadingZeroCount
4636
+
4637
+ template <class V, HWY_IF_UI8(TFromV<V>), HWY_IF_V_SIZE_V(V, 32)>
4638
+ HWY_API V LeadingZeroCount(V v) {
4639
+ return V{__lasx_xvclz_b(v.raw)};
4640
+ }
4641
+ template <class V, HWY_IF_UI16(TFromV<V>), HWY_IF_V_SIZE_V(V, 32)>
4642
+ HWY_API V LeadingZeroCount(V v) {
4643
+ return V{__lasx_xvclz_h(v.raw)};
4644
+ }
4645
+ template <class V, HWY_IF_UI32(TFromV<V>), HWY_IF_V_SIZE_V(V, 32)>
4646
+ HWY_API V LeadingZeroCount(V v) {
4647
+ return V{__lasx_xvclz_w(v.raw)};
4648
+ }
4649
+ template <class V, HWY_IF_UI64(TFromV<V>), HWY_IF_V_SIZE_V(V, 32)>
4650
+ HWY_API V LeadingZeroCount(V v) {
4651
+ return V{__lasx_xvclz_d(v.raw)};
4652
+ }
4653
+
4654
+ template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), HWY_IF_V_SIZE_V(V, 32)>
4655
+ HWY_API V HighestSetBitIndex(V v) {
4656
+ const DFromV<decltype(v)> d;
4657
+ using T = TFromD<decltype(d)>;
4658
+ return BitCast(d, Set(d, T{sizeof(T) * 8 - 1}) - LeadingZeroCount(v));
4659
+ }
4660
+
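HighestSetBitIndex is simply (lane bits - 1) - LeadingZeroCount per lane: for a uint32_t lane holding 0x00008000, the leading zero count is 16, so the result is 31 - 16 = 15.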
4661
+ // NOLINTNEXTLINE(google-readability-namespace-comments)
4662
+ } // namespace HWY_NAMESPACE
4663
+ } // namespace hwy
4664
+ HWY_AFTER_NAMESPACE();