@img/sharp-libvips-dev 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103)
  1. package/README.md +2 -2
  2. package/cplusplus/VConnection.cpp +54 -54
  3. package/cplusplus/VError.cpp +20 -18
  4. package/cplusplus/VImage.cpp +636 -589
  5. package/cplusplus/VInterpolate.cpp +22 -22
  6. package/cplusplus/VRegion.cpp +4 -4
  7. package/cplusplus/vips-operators.cpp +2326 -2301
  8. package/include/aom/aom_codec.h +10 -6
  9. package/include/aom/aom_decoder.h +1 -1
  10. package/include/aom/aom_encoder.h +9 -2
  11. package/include/aom/aomcx.h +72 -3
  12. package/include/cairo/cairo-ft.h +1 -1
  13. package/include/cairo/cairo-gobject.h +8 -0
  14. package/include/cairo/cairo-svg.h +3 -3
  15. package/include/cairo/cairo-version.h +2 -2
  16. package/include/cairo/cairo.h +91 -24
  17. package/include/harfbuzz/hb-version.h +2 -2
  18. package/include/hwy/aligned_allocator.h +211 -0
  19. package/include/hwy/base.h +1517 -0
  20. package/include/hwy/cache_control.h +108 -0
  21. package/include/hwy/detect_compiler_arch.h +281 -0
  22. package/include/hwy/detect_targets.h +644 -0
  23. package/include/hwy/foreach_target.h +340 -0
  24. package/include/hwy/highway.h +435 -0
  25. package/include/hwy/highway_export.h +74 -0
  26. package/include/hwy/nanobenchmark.h +171 -0
  27. package/include/hwy/ops/arm_neon-inl.h +8913 -0
  28. package/include/hwy/ops/arm_sve-inl.h +5105 -0
  29. package/include/hwy/ops/emu128-inl.h +2811 -0
  30. package/include/hwy/ops/generic_ops-inl.h +4745 -0
  31. package/include/hwy/ops/ppc_vsx-inl.h +5716 -0
  32. package/include/hwy/ops/rvv-inl.h +5070 -0
  33. package/include/hwy/ops/scalar-inl.h +1995 -0
  34. package/include/hwy/ops/set_macros-inl.h +578 -0
  35. package/include/hwy/ops/shared-inl.h +539 -0
  36. package/include/hwy/ops/tuple-inl.h +125 -0
  37. package/include/hwy/ops/wasm_128-inl.h +5917 -0
  38. package/include/hwy/ops/x86_128-inl.h +11173 -0
  39. package/include/hwy/ops/x86_256-inl.h +7529 -0
  40. package/include/hwy/ops/x86_512-inl.h +6849 -0
  41. package/include/hwy/per_target.h +44 -0
  42. package/include/hwy/print-inl.h +62 -0
  43. package/include/hwy/print.h +75 -0
  44. package/include/hwy/robust_statistics.h +148 -0
  45. package/include/hwy/targets.h +338 -0
  46. package/include/hwy/timer-inl.h +200 -0
  47. package/include/hwy/timer.h +55 -0
  48. package/include/jconfig.h +2 -2
  49. package/include/jpeglib.h +3 -2
  50. package/include/libheif/heif.h +443 -377
  51. package/include/libheif/heif_cxx.h +4 -1
  52. package/include/libheif/heif_plugin.h +1 -1
  53. package/include/libheif/heif_properties.h +138 -0
  54. package/include/libheif/heif_regions.h +866 -0
  55. package/include/libheif/heif_version.h +3 -3
  56. package/include/vips/VConnection8.h +43 -49
  57. package/include/vips/VError8.h +27 -24
  58. package/include/vips/VImage8.h +4861 -4597
  59. package/include/vips/VInterpolate8.h +24 -27
  60. package/include/vips/VRegion8.h +32 -33
  61. package/include/vips/arithmetic.h +169 -169
  62. package/include/vips/basic.h +33 -33
  63. package/include/vips/buf.h +56 -54
  64. package/include/vips/colour.h +95 -95
  65. package/include/vips/connection.h +190 -193
  66. package/include/vips/conversion.h +91 -91
  67. package/include/vips/convolution.h +36 -30
  68. package/include/vips/create.h +63 -63
  69. package/include/vips/dbuf.h +35 -37
  70. package/include/vips/debug.h +65 -33
  71. package/include/vips/draw.h +41 -41
  72. package/include/vips/enumtypes.h +54 -51
  73. package/include/vips/error.h +63 -63
  74. package/include/vips/foreign.h +263 -223
  75. package/include/vips/format.h +48 -48
  76. package/include/vips/freqfilt.h +22 -22
  77. package/include/vips/gate.h +55 -47
  78. package/include/vips/generate.h +34 -34
  79. package/include/vips/header.h +111 -101
  80. package/include/vips/histogram.h +28 -28
  81. package/include/vips/image.h +213 -213
  82. package/include/vips/interpolate.h +40 -41
  83. package/include/vips/memory.h +61 -52
  84. package/include/vips/morphology.h +24 -24
  85. package/include/vips/mosaicing.h +32 -33
  86. package/include/vips/object.h +371 -357
  87. package/include/vips/operation.h +68 -67
  88. package/include/vips/private.h +76 -76
  89. package/include/vips/rect.h +26 -26
  90. package/include/vips/region.h +92 -92
  91. package/include/vips/resample.h +38 -38
  92. package/include/vips/sbuf.h +53 -54
  93. package/include/vips/semaphore.h +24 -24
  94. package/include/vips/thread.h +30 -27
  95. package/include/vips/threadpool.h +48 -49
  96. package/include/vips/transform.h +39 -39
  97. package/include/vips/type.h +90 -85
  98. package/include/vips/util.h +274 -229
  99. package/include/vips/vector.h +24 -144
  100. package/include/vips/version.h +9 -9
  101. package/include/vips/vips.h +41 -40
  102. package/package.json +1 -1
  103. package/versions.json +7 -7
package/include/hwy/ops/generic_ops-inl.h (new file)
@@ -0,0 +1,4745 @@
1
+ // Copyright 2021 Google LLC
2
+ // Copyright 2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ // SPDX-License-Identifier: BSD-3-Clause
5
+ //
6
+ // Licensed under the Apache License, Version 2.0 (the "License");
7
+ // you may not use this file except in compliance with the License.
8
+ // You may obtain a copy of the License at
9
+ //
10
+ // http://www.apache.org/licenses/LICENSE-2.0
11
+ //
12
+ // Unless required by applicable law or agreed to in writing, software
13
+ // distributed under the License is distributed on an "AS IS" BASIS,
14
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ // See the License for the specific language governing permissions and
16
+ // limitations under the License.
17
+
18
+ // Target-independent types/functions defined after target-specific ops.
19
+
20
+ #include "hwy/base.h"
21
+
22
+ // Define detail::Shuffle1230 etc, but only when viewing the current header;
23
+ // normally this is included via highway.h, which includes ops/*.h.
24
+ #if HWY_IDE && !defined(HWY_HIGHWAY_INCLUDED)
25
+ #include "hwy/detect_targets.h"
26
+ #include "hwy/ops/emu128-inl.h"
27
+ #endif // HWY_IDE
28
+
29
+ // Relies on the external include guard in highway.h.
30
+ HWY_BEFORE_NAMESPACE();
31
+ namespace hwy {
32
+ namespace HWY_NAMESPACE {
33
+
34
+ // The lane type of a vector type, e.g. float for Vec<ScalableTag<float>>.
35
+ template <class V>
36
+ using LaneType = decltype(GetLane(V()));
37
+
38
+ // Vector type, e.g. Vec128<float> for CappedTag<float, 4>. Useful as the return
39
+ // type of functions that do not take a vector argument, or as an argument type
40
+ // if the function only has a template argument for D, or for explicit type
41
+ // names instead of auto. This may be a built-in type.
42
+ template <class D>
43
+ using Vec = decltype(Zero(D()));
44
+
45
+ // Mask type. Useful as the return type of functions that do not take a mask
46
+ // argument, or as an argument type if the function only has a template argument
47
+ // for D, or for explicit type names instead of auto.
48
+ template <class D>
49
+ using Mask = decltype(MaskFromVec(Zero(D())));
50
+
51
+ // Returns the closest value to v within [lo, hi].
52
+ template <class V>
53
+ HWY_API V Clamp(const V v, const V lo, const V hi) {
54
+ return Min(Max(lo, v), hi);
55
+ }
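Clamp is simply Min(Max(lo, v), hi), so it is available for any lane type that supports Min/Max. A minimal usage sketch assuming static dispatch via hwy/highway.h; the ClampArray helper and its whole-vector loop are illustrative, not part of the package:

```cpp
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Clamps every element of x[0, n) to [lo, hi]. For brevity, n is assumed
// to be a multiple of the vector length.
void ClampArray(float* HWY_RESTRICT x, size_t n, float lo, float hi) {
  const hn::ScalableTag<float> d;
  const auto vlo = hn::Set(d, lo);
  const auto vhi = hn::Set(d, hi);
  for (size_t i = 0; i < n; i += hn::Lanes(d)) {
    hn::StoreU(hn::Clamp(hn::LoadU(d, x + i), vlo, vhi), d, x + i);
  }
}
```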
56
+
57
+ // CombineShiftRightBytes (and -Lanes) are not available for the scalar target,
58
+ // and RVV has its own implementation of -Lanes.
59
+ #if HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV
60
+
61
+ template <size_t kLanes, class D>
62
+ HWY_API VFromD<D> CombineShiftRightLanes(D d, VFromD<D> hi, VFromD<D> lo) {
63
+ constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
64
+ static_assert(kBytes < 16, "Shift count is per-block");
65
+ return CombineShiftRightBytes<kBytes>(d, hi, lo);
66
+ }
67
+
68
+ #endif
69
+
70
+ // Returns lanes with the most significant bit set and all other bits zero.
71
+ template <class D>
72
+ HWY_API Vec<D> SignBit(D d) {
73
+ const RebindToUnsigned<decltype(d)> du;
74
+ return BitCast(d, Set(du, SignMask<TFromD<D>>()));
75
+ }
76
+
77
+ // Returns quiet NaN.
78
+ template <class D>
79
+ HWY_API Vec<D> NaN(D d) {
80
+ const RebindToSigned<D> di;
81
+ // LimitsMax sets all exponent and mantissa bits to 1. The exponent plus
82
+ // mantissa MSB (to indicate quiet) would be sufficient.
83
+ return BitCast(d, Set(di, LimitsMax<TFromD<decltype(di)>>()));
84
+ }
85
+
86
+ // Returns positive infinity.
87
+ template <class D>
88
+ HWY_API Vec<D> Inf(D d) {
89
+ const RebindToUnsigned<D> du;
90
+ using T = TFromD<D>;
91
+ using TU = TFromD<decltype(du)>;
92
+ const TU max_x2 = static_cast<TU>(MaxExponentTimes2<T>());
93
+ return BitCast(d, Set(du, max_x2 >> 1));
94
+ }
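SignBit, NaN and Inf construct special values directly from bit patterns, which is convenient for sign manipulation and for comparisons against sentinels. A sketch assuming static dispatch; the NegateAndCountInf helper and the multiple-of-Lanes assumption are illustrative:

```cpp
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Negates x[i] by flipping the sign bit and counts lanes equal to +inf.
// n is assumed to be a multiple of the vector length.
size_t NegateAndCountInf(float* HWY_RESTRICT x, size_t n) {
  const hn::ScalableTag<float> d;
  size_t num_inf = 0;
  for (size_t i = 0; i < n; i += hn::Lanes(d)) {
    const auto v = hn::LoadU(d, x + i);
    num_inf += hn::CountTrue(d, hn::Eq(v, hn::Inf(d)));
    hn::StoreU(hn::Xor(v, hn::SignBit(d)), d, x + i);
  }
  return num_inf;
}
```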
95
+
96
+ // ------------------------------ ZeroExtendResizeBitCast
97
+
98
+ // The implementation of detail::ZeroExtendResizeBitCast for the HWY_EMU128
99
+ // target is in emu128-inl.h, and the implementation of
100
+ // detail::ZeroExtendResizeBitCast for the HWY_SCALAR target is in scalar-inl.h
101
+ #if HWY_TARGET != HWY_EMU128 && HWY_TARGET != HWY_SCALAR
102
+ namespace detail {
103
+
104
+ #if HWY_HAVE_SCALABLE
105
+ template <size_t kFromVectSize, size_t kToVectSize, class DTo, class DFrom>
106
+ HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
107
+ hwy::SizeTag<kFromVectSize> /* from_size_tag */,
108
+ hwy::SizeTag<kToVectSize> /* to_size_tag */, DTo d_to, DFrom d_from,
109
+ VFromD<DFrom> v) {
110
+ const Repartition<uint8_t, DTo> d_to_u8;
111
+ const auto resized = ResizeBitCast(d_to_u8, v);
112
+ // Zero the upper bytes which were not present/valid in d_from.
113
+ const size_t num_bytes = Lanes(Repartition<uint8_t, decltype(d_from)>());
114
+ return BitCast(d_to, IfThenElseZero(FirstN(d_to_u8, num_bytes), resized));
115
+ }
116
+ #else // target that uses fixed-size vectors
117
+ // Truncating or same-size resizing cast: same as ResizeBitCast
118
+ template <size_t kFromVectSize, size_t kToVectSize, class DTo, class DFrom,
119
+ HWY_IF_LANES_LE(kToVectSize, kFromVectSize)>
120
+ HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
121
+ hwy::SizeTag<kFromVectSize> /* from_size_tag */,
122
+ hwy::SizeTag<kToVectSize> /* to_size_tag */, DTo d_to, DFrom /*d_from*/,
123
+ VFromD<DFrom> v) {
124
+ return ResizeBitCast(d_to, v);
125
+ }
126
+
127
+ // Resizing cast to vector that has twice the number of lanes of the source
128
+ // vector
129
+ template <size_t kFromVectSize, size_t kToVectSize, class DTo, class DFrom,
130
+ HWY_IF_LANES(kToVectSize, kFromVectSize * 2)>
131
+ HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
132
+ hwy::SizeTag<kFromVectSize> /* from_size_tag */,
133
+ hwy::SizeTag<kToVectSize> /* to_size_tag */, DTo d_to, DFrom d_from,
134
+ VFromD<DFrom> v) {
135
+ const Twice<decltype(d_from)> dt_from;
136
+ return BitCast(d_to, ZeroExtendVector(dt_from, v));
137
+ }
138
+
139
+ // Resizing cast to vector that has more than twice the number of lanes of the
140
+ // source vector
141
+ template <size_t kFromVectSize, size_t kToVectSize, class DTo, class DFrom,
142
+ HWY_IF_LANES_GT(kToVectSize, kFromVectSize * 2)>
143
+ HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
144
+ hwy::SizeTag<kFromVectSize> /* from_size_tag */,
145
+ hwy::SizeTag<kToVectSize> /* to_size_tag */, DTo d_to, DFrom /*d_from*/,
146
+ VFromD<DFrom> v) {
147
+ using TFrom = TFromD<DFrom>;
148
+ constexpr size_t kNumOfFromLanes = kFromVectSize / sizeof(TFrom);
149
+ const Repartition<TFrom, decltype(d_to)> d_resize_to;
150
+ return BitCast(d_to, IfThenElseZero(FirstN(d_resize_to, kNumOfFromLanes),
151
+ ResizeBitCast(d_resize_to, v)));
152
+ }
153
+ #endif // HWY_HAVE_SCALABLE
154
+
155
+ } // namespace detail
156
+ #endif // HWY_TARGET != HWY_EMU128 && HWY_TARGET != HWY_SCALAR
157
+
158
+ template <class DTo, class DFrom>
159
+ HWY_API VFromD<DTo> ZeroExtendResizeBitCast(DTo d_to, DFrom d_from,
160
+ VFromD<DFrom> v) {
161
+ return detail::ZeroExtendResizeBitCast(hwy::SizeTag<d_from.MaxBytes()>(),
162
+ hwy::SizeTag<d_to.MaxBytes()>(), d_to,
163
+ d_from, v);
164
+ }
165
+
166
+ // ------------------------------ SafeFillN
167
+
168
+ template <class D, typename T = TFromD<D>>
169
+ HWY_API void SafeFillN(const size_t num, const T value, D d,
170
+ T* HWY_RESTRICT to) {
171
+ #if HWY_MEM_OPS_MIGHT_FAULT
172
+ (void)d;
173
+ for (size_t i = 0; i < num; ++i) {
174
+ to[i] = value;
175
+ }
176
+ #else
177
+ BlendedStore(Set(d, value), FirstN(d, num), d, to);
178
+ #endif
179
+ }
180
+
181
+ // ------------------------------ SafeCopyN
182
+
183
+ template <class D, typename T = TFromD<D>>
184
+ HWY_API void SafeCopyN(const size_t num, D d, const T* HWY_RESTRICT from,
185
+ T* HWY_RESTRICT to) {
186
+ #if HWY_MEM_OPS_MIGHT_FAULT
187
+ (void)d;
188
+ for (size_t i = 0; i < num; ++i) {
189
+ to[i] = from[i];
190
+ }
191
+ #else
192
+ const Mask<D> mask = FirstN(d, num);
193
+ BlendedStore(MaskedLoad(mask, d, from), mask, d, to);
194
+ #endif
195
+ }
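Because SafeFillN and SafeCopyN never touch more than `num` elements (falling back to a scalar loop on targets where masked memory ops might fault), they are a convenient way to finish a loop remainder. A hedged sketch; the CopyArray helper is an illustrative name:

```cpp
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Copies src[0, n) to dst[0, n): whole vectors first, then a safe tail of
// 0..Lanes(d)-1 elements that never reads or writes past index n.
void CopyArray(const float* HWY_RESTRICT src, float* HWY_RESTRICT dst,
               size_t n) {
  const hn::ScalableTag<float> d;
  const size_t N = hn::Lanes(d);
  size_t i = 0;
  for (; i + N <= n; i += N) {
    hn::StoreU(hn::LoadU(d, src + i), d, dst + i);
  }
  hn::SafeCopyN(n - i, d, src + i, dst + i);
}
```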
196
+
197
+ // ------------------------------ BitwiseIfThenElse
198
+ #if (defined(HWY_NATIVE_BITWISE_IF_THEN_ELSE) == defined(HWY_TARGET_TOGGLE))
199
+ #ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
200
+ #undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
201
+ #else
202
+ #define HWY_NATIVE_BITWISE_IF_THEN_ELSE
203
+ #endif
204
+
205
+ template <class V>
206
+ HWY_API V BitwiseIfThenElse(V mask, V yes, V no) {
207
+ return Or(And(mask, yes), AndNot(mask, no));
208
+ }
209
+
210
+ #endif // HWY_NATIVE_BITWISE_IF_THEN_ELSE
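BitwiseIfThenElse selects individual bits rather than whole lanes: result bits come from `yes` where `mask` has a 1 bit and from `no` elsewhere. One way to use it is a copysign-style merge; the helper below is an illustrative sketch (Highway also provides a dedicated CopySign op):

```cpp
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Returns lanes with the magnitude of mag and the sign of sgn: the sign bit
// is taken from sgn, all other bits from mag.
template <class V>
V CopySignViaBitSelect(V mag, V sgn) {
  const hn::DFromV<V> d;
  return hn::BitwiseIfThenElse(hn::SignBit(d), sgn, mag);
}
```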
211
+
212
+ // "Include guard": skip if native instructions are available. The generic
213
+ // implementation is currently shared between x86_* and wasm_*, and is too large
214
+ // to duplicate.
215
+
216
+ #if HWY_IDE || \
217
+ (defined(HWY_NATIVE_LOAD_STORE_INTERLEAVED) == defined(HWY_TARGET_TOGGLE))
218
+ #ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
219
+ #undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
220
+ #else
221
+ #define HWY_NATIVE_LOAD_STORE_INTERLEAVED
222
+ #endif
223
+
224
+ // ------------------------------ LoadInterleaved2
225
+
226
+ template <class D, HWY_IF_LANES_GT_D(D, 1)>
227
+ HWY_API void LoadInterleaved2(D d, const TFromD<D>* HWY_RESTRICT unaligned,
228
+ VFromD<D>& v0, VFromD<D>& v1) {
229
+ const VFromD<D> A = LoadU(d, unaligned); // v1[1] v0[1] v1[0] v0[0]
230
+ const VFromD<D> B = LoadU(d, unaligned + Lanes(d));
231
+ v0 = ConcatEven(d, B, A);
232
+ v1 = ConcatOdd(d, B, A);
233
+ }
234
+
235
+ template <class D, HWY_IF_LANES_D(D, 1)>
236
+ HWY_API void LoadInterleaved2(D d, const TFromD<D>* HWY_RESTRICT unaligned,
237
+ VFromD<D>& v0, VFromD<D>& v1) {
238
+ v0 = LoadU(d, unaligned + 0);
239
+ v1 = LoadU(d, unaligned + 1);
240
+ }
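LoadInterleaved2 splits interleaved pairs (e.g. complex re/im or stereo samples) into two planar vectors per iteration. A sketch assuming static dispatch; the SquaredMagnitude helper and the multiple-of-Lanes assumption are illustrative:

```cpp
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Writes |re + i*im|^2 for interleaved complex input (re, im, re, im, ...).
// num_complex is assumed to be a multiple of the vector length.
void SquaredMagnitude(const float* HWY_RESTRICT complex_in,
                      float* HWY_RESTRICT mag_out, size_t num_complex) {
  const hn::ScalableTag<float> d;
  const size_t N = hn::Lanes(d);
  for (size_t i = 0; i < num_complex; i += N) {
    hn::Vec<decltype(d)> re, im;
    hn::LoadInterleaved2(d, complex_in + 2 * i, re, im);
    hn::StoreU(hn::MulAdd(re, re, hn::Mul(im, im)), d, mag_out + i);
  }
}
```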
241
+
242
+ // ------------------------------ LoadInterleaved3 (CombineShiftRightBytes)
243
+
244
+ namespace detail {
245
+
246
+ #if HWY_IDE
247
+ template <class V>
248
+ HWY_INLINE V ShuffleTwo1230(V a, V /* b */) {
249
+ return a;
250
+ }
251
+ template <class V>
252
+ HWY_INLINE V ShuffleTwo2301(V a, V /* b */) {
253
+ return a;
254
+ }
255
+ template <class V>
256
+ HWY_INLINE V ShuffleTwo3012(V a, V /* b */) {
257
+ return a;
258
+ }
259
+ #endif // HWY_IDE
260
+
261
+ // Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
262
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
263
+ HWY_INLINE void LoadTransposedBlocks3(D d,
264
+ const TFromD<D>* HWY_RESTRICT unaligned,
265
+ VFromD<D>& A, VFromD<D>& B,
266
+ VFromD<D>& C) {
267
+ constexpr size_t kN = MaxLanes(d);
268
+ A = LoadU(d, unaligned + 0 * kN);
269
+ B = LoadU(d, unaligned + 1 * kN);
270
+ C = LoadU(d, unaligned + 2 * kN);
271
+ }
272
+
273
+ } // namespace detail
274
+
275
+ template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 16)>
276
+ HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
277
+ VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
278
+ const RebindToUnsigned<decltype(d)> du;
279
+ using V = VFromD<D>;
280
+ // Compact notation so these fit on one line: 12 := v1[2].
281
+ V A; // 05 24 14 04 23 13 03 22 12 02 21 11 01 20 10 00
282
+ V B; // 1a 0a 29 19 09 28 18 08 27 17 07 26 16 06 25 15
283
+ V C; // 2f 1f 0f 2e 1e 0e 2d 1d 0d 2c 1c 0c 2b 1b 0b 2a
284
+ detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
285
+ // Compress all lanes belonging to v0 into consecutive lanes.
286
+ constexpr uint8_t Z = 0x80;
287
+ alignas(16) static constexpr uint8_t kIdx_v0A[16] = {
288
+ 0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z};
289
+ alignas(16) static constexpr uint8_t kIdx_v0B[16] = {
290
+ Z, Z, Z, Z, Z, Z, 2, 5, 8, 11, 14, Z, Z, Z, Z, Z};
291
+ alignas(16) static constexpr uint8_t kIdx_v0C[16] = {
292
+ Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 1, 4, 7, 10, 13};
293
+ alignas(16) static constexpr uint8_t kIdx_v1A[16] = {
294
+ 1, 4, 7, 10, 13, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z};
295
+ alignas(16) static constexpr uint8_t kIdx_v1B[16] = {
296
+ Z, Z, Z, Z, Z, 0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z};
297
+ alignas(16) static constexpr uint8_t kIdx_v1C[16] = {
298
+ Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 2, 5, 8, 11, 14};
299
+ alignas(16) static constexpr uint8_t kIdx_v2A[16] = {
300
+ 2, 5, 8, 11, 14, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z};
301
+ alignas(16) static constexpr uint8_t kIdx_v2B[16] = {
302
+ Z, Z, Z, Z, Z, 1, 4, 7, 10, 13, Z, Z, Z, Z, Z, Z};
303
+ alignas(16) static constexpr uint8_t kIdx_v2C[16] = {
304
+ Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0, 3, 6, 9, 12, 15};
305
+ const V v0L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v0A)));
306
+ const V v0M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v0B)));
307
+ const V v0U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v0C)));
308
+ const V v1L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v1A)));
309
+ const V v1M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v1B)));
310
+ const V v1U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v1C)));
311
+ const V v2L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v2A)));
312
+ const V v2M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v2B)));
313
+ const V v2U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v2C)));
314
+ v0 = Xor3(v0L, v0M, v0U);
315
+ v1 = Xor3(v1L, v1M, v1U);
316
+ v2 = Xor3(v2L, v2M, v2U);
317
+ }
318
+
319
+ // 8-bit lanes x8
320
+ template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 8), HWY_IF_T_SIZE_D(D, 1)>
321
+ HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
322
+ VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
323
+ const RebindToUnsigned<decltype(d)> du;
324
+ using V = VFromD<D>;
325
+ V A; // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0]
326
+ V B; // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2]
327
+ V C; // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5]
328
+ detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
329
+ // Compress all lanes belonging to v0 into consecutive lanes.
330
+ constexpr uint8_t Z = 0x80;
331
+ alignas(16) static constexpr uint8_t kIdx_v0A[16] = {0, 3, 6, Z, Z, Z, Z, Z};
332
+ alignas(16) static constexpr uint8_t kIdx_v0B[16] = {Z, Z, Z, 1, 4, 7, Z, Z};
333
+ alignas(16) static constexpr uint8_t kIdx_v0C[16] = {Z, Z, Z, Z, Z, Z, 2, 5};
334
+ alignas(16) static constexpr uint8_t kIdx_v1A[16] = {1, 4, 7, Z, Z, Z, Z, Z};
335
+ alignas(16) static constexpr uint8_t kIdx_v1B[16] = {Z, Z, Z, 2, 5, Z, Z, Z};
336
+ alignas(16) static constexpr uint8_t kIdx_v1C[16] = {Z, Z, Z, Z, Z, 0, 3, 6};
337
+ alignas(16) static constexpr uint8_t kIdx_v2A[16] = {2, 5, Z, Z, Z, Z, Z, Z};
338
+ alignas(16) static constexpr uint8_t kIdx_v2B[16] = {Z, Z, 0, 3, 6, Z, Z, Z};
339
+ alignas(16) static constexpr uint8_t kIdx_v2C[16] = {Z, Z, Z, Z, Z, 1, 4, 7};
340
+ const V v0L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v0A)));
341
+ const V v0M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v0B)));
342
+ const V v0U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v0C)));
343
+ const V v1L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v1A)));
344
+ const V v1M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v1B)));
345
+ const V v1U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v1C)));
346
+ const V v2L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v2A)));
347
+ const V v2M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v2B)));
348
+ const V v2U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v2C)));
349
+ v0 = Xor3(v0L, v0M, v0U);
350
+ v1 = Xor3(v1L, v1M, v1U);
351
+ v2 = Xor3(v2L, v2M, v2U);
352
+ }
353
+
354
+ // 16-bit lanes x8
355
+ template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 8), HWY_IF_T_SIZE_D(D, 2)>
356
+ HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
357
+ VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
358
+ const RebindToUnsigned<decltype(d)> du;
359
+ const Repartition<uint8_t, decltype(du)> du8;
360
+ using V = VFromD<D>;
361
+ V A; // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0]
362
+ V B; // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2]
363
+ V C; // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5]
364
+ detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
365
+ // Compress all lanes belonging to v0 into consecutive lanes. Same as above,
366
+ // but each element of the array contains a byte index for a byte of a lane.
367
+ constexpr uint8_t Z = 0x80;
368
+ alignas(16) static constexpr uint8_t kIdx_v0A[16] = {
369
+ 0x00, 0x01, 0x06, 0x07, 0x0C, 0x0D, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z};
370
+ alignas(16) static constexpr uint8_t kIdx_v0B[16] = {
371
+ Z, Z, Z, Z, Z, Z, 0x02, 0x03, 0x08, 0x09, 0x0E, 0x0F, Z, Z, Z, Z};
372
+ alignas(16) static constexpr uint8_t kIdx_v0C[16] = {
373
+ Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0x04, 0x05, 0x0A, 0x0B};
374
+ alignas(16) static constexpr uint8_t kIdx_v1A[16] = {
375
+ 0x02, 0x03, 0x08, 0x09, 0x0E, 0x0F, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z};
376
+ alignas(16) static constexpr uint8_t kIdx_v1B[16] = {
377
+ Z, Z, Z, Z, Z, Z, 0x04, 0x05, 0x0A, 0x0B, Z, Z, Z, Z, Z, Z};
378
+ alignas(16) static constexpr uint8_t kIdx_v1C[16] = {
379
+ Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0x00, 0x01, 0x06, 0x07, 0x0C, 0x0D};
380
+ alignas(16) static constexpr uint8_t kIdx_v2A[16] = {
381
+ 0x04, 0x05, 0x0A, 0x0B, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z};
382
+ alignas(16) static constexpr uint8_t kIdx_v2B[16] = {
383
+ Z, Z, Z, Z, 0x00, 0x01, 0x06, 0x07, 0x0C, 0x0D, Z, Z, Z, Z, Z, Z};
384
+ alignas(16) static constexpr uint8_t kIdx_v2C[16] = {
385
+ Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0x02, 0x03, 0x08, 0x09, 0x0E, 0x0F};
386
+ const V v0L = TableLookupBytesOr0(A, BitCast(d, LoadDup128(du8, kIdx_v0A)));
387
+ const V v0M = TableLookupBytesOr0(B, BitCast(d, LoadDup128(du8, kIdx_v0B)));
388
+ const V v0U = TableLookupBytesOr0(C, BitCast(d, LoadDup128(du8, kIdx_v0C)));
389
+ const V v1L = TableLookupBytesOr0(A, BitCast(d, LoadDup128(du8, kIdx_v1A)));
390
+ const V v1M = TableLookupBytesOr0(B, BitCast(d, LoadDup128(du8, kIdx_v1B)));
391
+ const V v1U = TableLookupBytesOr0(C, BitCast(d, LoadDup128(du8, kIdx_v1C)));
392
+ const V v2L = TableLookupBytesOr0(A, BitCast(d, LoadDup128(du8, kIdx_v2A)));
393
+ const V v2M = TableLookupBytesOr0(B, BitCast(d, LoadDup128(du8, kIdx_v2B)));
394
+ const V v2U = TableLookupBytesOr0(C, BitCast(d, LoadDup128(du8, kIdx_v2C)));
395
+ v0 = Xor3(v0L, v0M, v0U);
396
+ v1 = Xor3(v1L, v1M, v1U);
397
+ v2 = Xor3(v2L, v2M, v2U);
398
+ }
399
+
400
+ template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 4)>
401
+ HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
402
+ VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
403
+ using V = VFromD<D>;
404
+ V A; // v0[1] v2[0] v1[0] v0[0]
405
+ V B; // v1[2] v0[2] v2[1] v1[1]
406
+ V C; // v2[3] v1[3] v0[3] v2[2]
407
+ detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
408
+
409
+ const V vxx_02_03_xx = OddEven(C, B);
410
+ v0 = detail::ShuffleTwo1230(A, vxx_02_03_xx);
411
+
412
+ // Shuffle2301 takes the upper/lower halves of the output from one input, so
413
+ // we cannot just combine 13 and 10 with 12 and 11 (similar to v0/v2). Use
414
+ // OddEven because it may have higher throughput than Shuffle.
415
+ const V vxx_xx_10_11 = OddEven(A, B);
416
+ const V v12_13_xx_xx = OddEven(B, C);
417
+ v1 = detail::ShuffleTwo2301(vxx_xx_10_11, v12_13_xx_xx);
418
+
419
+ const V vxx_20_21_xx = OddEven(B, A);
420
+ v2 = detail::ShuffleTwo3012(vxx_20_21_xx, C);
421
+ }
422
+
423
+ template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 2)>
424
+ HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
425
+ VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
426
+ VFromD<D> A; // v1[0] v0[0]
427
+ VFromD<D> B; // v0[1] v2[0]
428
+ VFromD<D> C; // v2[1] v1[1]
429
+ detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
430
+ v0 = OddEven(B, A);
431
+ v1 = CombineShiftRightBytes<sizeof(TFromD<D>)>(d, C, A);
432
+ v2 = OddEven(C, B);
433
+ }
434
+
435
+ template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>
436
+ HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
437
+ VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
438
+ v0 = LoadU(d, unaligned + 0);
439
+ v1 = LoadU(d, unaligned + 1);
440
+ v2 = LoadU(d, unaligned + 2);
441
+ }
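LoadInterleaved3 turns packed 3-channel data (such as RGB pixels) into three planar vectors per iteration. A hedged sketch; the RgbToGray helper, its weights and the multiple-of-Lanes assumption are illustrative:

```cpp
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Packed float RGB to grayscale: gray = 0.299 r + 0.587 g + 0.114 b.
// num_pixels is assumed to be a multiple of the vector length.
void RgbToGray(const float* HWY_RESTRICT rgb, float* HWY_RESTRICT gray,
               size_t num_pixels) {
  const hn::ScalableTag<float> d;
  const size_t N = hn::Lanes(d);
  const auto wr = hn::Set(d, 0.299f);
  const auto wg = hn::Set(d, 0.587f);
  const auto wb = hn::Set(d, 0.114f);
  for (size_t i = 0; i < num_pixels; i += N) {
    hn::Vec<decltype(d)> r, g, b;
    hn::LoadInterleaved3(d, rgb + 3 * i, r, g, b);
    hn::StoreU(hn::MulAdd(wr, r, hn::MulAdd(wg, g, hn::Mul(wb, b))), d,
               gray + i);
  }
}
```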
442
+
443
+ // ------------------------------ LoadInterleaved4
444
+
445
+ namespace detail {
446
+
447
+ // Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
448
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
449
+ HWY_INLINE void LoadTransposedBlocks4(D d,
450
+ const TFromD<D>* HWY_RESTRICT unaligned,
451
+ VFromD<D>& vA, VFromD<D>& vB,
452
+ VFromD<D>& vC, VFromD<D>& vD) {
453
+ constexpr size_t kN = MaxLanes(d);
454
+ vA = LoadU(d, unaligned + 0 * kN);
455
+ vB = LoadU(d, unaligned + 1 * kN);
456
+ vC = LoadU(d, unaligned + 2 * kN);
457
+ vD = LoadU(d, unaligned + 3 * kN);
458
+ }
459
+
460
+ } // namespace detail
461
+
462
+ template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 16)>
463
+ HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
464
+ VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
465
+ VFromD<D>& v3) {
466
+ const Repartition<uint64_t, decltype(d)> d64;
467
+ using V64 = VFromD<decltype(d64)>;
468
+ using V = VFromD<D>;
469
+ // 16 lanes per block; the lowest four blocks are at the bottom of vA..vD.
470
+ // Here int[i] means the four interleaved values of the i-th 4-tuple and
471
+ // int[3..0] indicates four consecutive 4-tuples (0 = least-significant).
472
+ V vA; // int[13..10] int[3..0]
473
+ V vB; // int[17..14] int[7..4]
474
+ V vC; // int[1b..18] int[b..8]
475
+ V vD; // int[1f..1c] int[f..c]
476
+ detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD);
477
+
478
+ // For brevity, the comments only list the lower block (upper = lower + 0x10)
479
+ const V v5140 = InterleaveLower(d, vA, vB); // int[5,1,4,0]
480
+ const V vd9c8 = InterleaveLower(d, vC, vD); // int[d,9,c,8]
481
+ const V v7362 = InterleaveUpper(d, vA, vB); // int[7,3,6,2]
482
+ const V vfbea = InterleaveUpper(d, vC, vD); // int[f,b,e,a]
483
+
484
+ const V v6420 = InterleaveLower(d, v5140, v7362); // int[6,4,2,0]
485
+ const V veca8 = InterleaveLower(d, vd9c8, vfbea); // int[e,c,a,8]
486
+ const V v7531 = InterleaveUpper(d, v5140, v7362); // int[7,5,3,1]
487
+ const V vfdb9 = InterleaveUpper(d, vd9c8, vfbea); // int[f,d,b,9]
488
+
489
+ const V64 v10L = BitCast(d64, InterleaveLower(d, v6420, v7531)); // v10[7..0]
490
+ const V64 v10U = BitCast(d64, InterleaveLower(d, veca8, vfdb9)); // v10[f..8]
491
+ const V64 v32L = BitCast(d64, InterleaveUpper(d, v6420, v7531)); // v32[7..0]
492
+ const V64 v32U = BitCast(d64, InterleaveUpper(d, veca8, vfdb9)); // v32[f..8]
493
+
494
+ v0 = BitCast(d, InterleaveLower(d64, v10L, v10U));
495
+ v1 = BitCast(d, InterleaveUpper(d64, v10L, v10U));
496
+ v2 = BitCast(d, InterleaveLower(d64, v32L, v32U));
497
+ v3 = BitCast(d, InterleaveUpper(d64, v32L, v32U));
498
+ }
499
+
500
+ template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 8)>
501
+ HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
502
+ VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
503
+ VFromD<D>& v3) {
504
+ // In the last step, we interleave by half of the block size, which is usually
505
+ // 8 bytes but half that for 8-bit x8 vectors.
506
+ using TW = hwy::UnsignedFromSize<d.MaxBytes() == 8 ? 4 : 8>;
507
+ const Repartition<TW, decltype(d)> dw;
508
+ using VW = VFromD<decltype(dw)>;
509
+
510
+ // (Comments are for 256-bit vectors.)
511
+ // 8 lanes per block; the lowest four blocks are at the bottom of vA..vD.
512
+ VFromD<D> vA; // v3210[9]v3210[8] v3210[1]v3210[0]
513
+ VFromD<D> vB; // v3210[b]v3210[a] v3210[3]v3210[2]
514
+ VFromD<D> vC; // v3210[d]v3210[c] v3210[5]v3210[4]
515
+ VFromD<D> vD; // v3210[f]v3210[e] v3210[7]v3210[6]
516
+ detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD);
517
+
518
+ const VFromD<D> va820 = InterleaveLower(d, vA, vB); // v3210[a,8] v3210[2,0]
519
+ const VFromD<D> vec64 = InterleaveLower(d, vC, vD); // v3210[e,c] v3210[6,4]
520
+ const VFromD<D> vb931 = InterleaveUpper(d, vA, vB); // v3210[b,9] v3210[3,1]
521
+ const VFromD<D> vfd75 = InterleaveUpper(d, vC, vD); // v3210[f,d] v3210[7,5]
522
+
523
+ const VW v10_b830 = // v10[b..8] v10[3..0]
524
+ BitCast(dw, InterleaveLower(d, va820, vb931));
525
+ const VW v10_fc74 = // v10[f..c] v10[7..4]
526
+ BitCast(dw, InterleaveLower(d, vec64, vfd75));
527
+ const VW v32_b830 = // v32[b..8] v32[3..0]
528
+ BitCast(dw, InterleaveUpper(d, va820, vb931));
529
+ const VW v32_fc74 = // v32[f..c] v32[7..4]
530
+ BitCast(dw, InterleaveUpper(d, vec64, vfd75));
531
+
532
+ v0 = BitCast(d, InterleaveLower(dw, v10_b830, v10_fc74));
533
+ v1 = BitCast(d, InterleaveUpper(dw, v10_b830, v10_fc74));
534
+ v2 = BitCast(d, InterleaveLower(dw, v32_b830, v32_fc74));
535
+ v3 = BitCast(d, InterleaveUpper(dw, v32_b830, v32_fc74));
536
+ }
537
+
538
+ template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 4)>
539
+ HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
540
+ VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
541
+ VFromD<D>& v3) {
542
+ using V = VFromD<D>;
543
+ V vA; // v3210[4] v3210[0]
544
+ V vB; // v3210[5] v3210[1]
545
+ V vC; // v3210[6] v3210[2]
546
+ V vD; // v3210[7] v3210[3]
547
+ detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD);
548
+ const V v10e = InterleaveLower(d, vA, vC); // v1[6,4] v0[6,4] v1[2,0] v0[2,0]
549
+ const V v10o = InterleaveLower(d, vB, vD); // v1[7,5] v0[7,5] v1[3,1] v0[3,1]
550
+ const V v32e = InterleaveUpper(d, vA, vC); // v3[6,4] v2[6,4] v3[2,0] v2[2,0]
551
+ const V v32o = InterleaveUpper(d, vB, vD); // v3[7,5] v2[7,5] v3[3,1] v2[3,1]
552
+
553
+ v0 = InterleaveLower(d, v10e, v10o);
554
+ v1 = InterleaveUpper(d, v10e, v10o);
555
+ v2 = InterleaveLower(d, v32e, v32o);
556
+ v3 = InterleaveUpper(d, v32e, v32o);
557
+ }
558
+
559
+ template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 2)>
560
+ HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
561
+ VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
562
+ VFromD<D>& v3) {
563
+ VFromD<D> vA, vB, vC, vD;
564
+ detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD);
565
+ v0 = InterleaveLower(d, vA, vC);
566
+ v1 = InterleaveUpper(d, vA, vC);
567
+ v2 = InterleaveLower(d, vB, vD);
568
+ v3 = InterleaveUpper(d, vB, vD);
569
+ }
570
+
571
+ // Any T x1
572
+ template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>
573
+ HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
574
+ VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
575
+ VFromD<D>& v3) {
576
+ v0 = LoadU(d, unaligned + 0);
577
+ v1 = LoadU(d, unaligned + 1);
578
+ v2 = LoadU(d, unaligned + 2);
579
+ v3 = LoadU(d, unaligned + 3);
580
+ }
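LoadInterleaved4 pairs naturally with StoreInterleaved4 for in-place processing of 4-channel data. A sketch; the PremultiplyAlpha helper and the multiple-of-Lanes assumption are illustrative:

```cpp
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Premultiplies alpha for packed float RGBA in place.
// num_pixels is assumed to be a multiple of the vector length.
void PremultiplyAlpha(float* HWY_RESTRICT rgba, size_t num_pixels) {
  const hn::ScalableTag<float> d;
  const size_t N = hn::Lanes(d);
  for (size_t i = 0; i < num_pixels; i += N) {
    hn::Vec<decltype(d)> r, g, b, a;
    hn::LoadInterleaved4(d, rgba + 4 * i, r, g, b, a);
    hn::StoreInterleaved4(hn::Mul(r, a), hn::Mul(g, a), hn::Mul(b, a), a, d,
                          rgba + 4 * i);
  }
}
```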
581
+
582
+ // ------------------------------ StoreInterleaved2
583
+
584
+ namespace detail {
585
+
586
+ // Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
587
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
588
+ HWY_INLINE void StoreTransposedBlocks2(VFromD<D> A, VFromD<D> B, D d,
589
+ TFromD<D>* HWY_RESTRICT unaligned) {
590
+ constexpr size_t kN = MaxLanes(d);
591
+ StoreU(A, d, unaligned + 0 * kN);
592
+ StoreU(B, d, unaligned + 1 * kN);
593
+ }
594
+
595
+ } // namespace detail
596
+
597
+ // >= 128 bit vector
598
+ template <class D, HWY_IF_V_SIZE_GT_D(D, 8)>
599
+ HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
600
+ TFromD<D>* HWY_RESTRICT unaligned) {
601
+ const auto v10L = InterleaveLower(d, v0, v1); // .. v1[0] v0[0]
602
+ const auto v10U = InterleaveUpper(d, v0, v1); // .. v1[kN/2] v0[kN/2]
603
+ detail::StoreTransposedBlocks2(v10L, v10U, d, unaligned);
604
+ }
605
+
606
+ // <= 64 bits
607
+ template <class V, class D, HWY_IF_V_SIZE_LE_D(D, 8)>
608
+ HWY_API void StoreInterleaved2(V part0, V part1, D d,
609
+ TFromD<D>* HWY_RESTRICT unaligned) {
610
+ const Twice<decltype(d)> d2;
611
+ const auto v0 = ZeroExtendVector(d2, part0);
612
+ const auto v1 = ZeroExtendVector(d2, part1);
613
+ const auto v10 = InterleaveLower(d2, v0, v1);
614
+ StoreU(v10, d2, unaligned);
615
+ }
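StoreInterleaved2 is the inverse of LoadInterleaved2: it packs two planar inputs back into alternating order. An illustrative sketch (helper name and size assumption are not from the package):

```cpp
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Packs separate real/imag planes into interleaved complex output.
// n is assumed to be a multiple of the vector length.
void InterleaveComplex(const float* HWY_RESTRICT re,
                       const float* HWY_RESTRICT im,
                       float* HWY_RESTRICT complex_out, size_t n) {
  const hn::ScalableTag<float> d;
  const size_t N = hn::Lanes(d);
  for (size_t i = 0; i < n; i += N) {
    hn::StoreInterleaved2(hn::LoadU(d, re + i), hn::LoadU(d, im + i), d,
                          complex_out + 2 * i);
  }
}
```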
616
+
617
+ // ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
618
+ // TableLookupBytes)
619
+
620
+ namespace detail {
621
+
622
+ // Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
623
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
624
+ HWY_INLINE void StoreTransposedBlocks3(VFromD<D> A, VFromD<D> B, VFromD<D> C,
625
+ D d, TFromD<D>* HWY_RESTRICT unaligned) {
626
+ constexpr size_t kN = MaxLanes(d);
627
+ StoreU(A, d, unaligned + 0 * kN);
628
+ StoreU(B, d, unaligned + 1 * kN);
629
+ StoreU(C, d, unaligned + 2 * kN);
630
+ }
631
+
632
+ } // namespace detail
633
+
634
+ // >= 128-bit vector, 8-bit lanes
635
+ template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_GT_D(D, 8)>
636
+ HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
637
+ TFromD<D>* HWY_RESTRICT unaligned) {
638
+ const RebindToUnsigned<decltype(d)> du;
639
+ using TU = TFromD<decltype(du)>;
640
+ const auto k5 = Set(du, TU{5});
641
+ const auto k6 = Set(du, TU{6});
642
+
643
+ // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
644
+ // v0[5], v2[4],v1[4],v0[4] .. v2[0],v1[0],v0[0]. We're expanding v0 lanes
645
+ // to their place, with 0x80 so lanes to be filled from other vectors are 0
646
+ // to enable blending by ORing together.
647
+ alignas(16) static constexpr uint8_t tbl_v0[16] = {
648
+ 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
649
+ 3, 0x80, 0x80, 4, 0x80, 0x80, 5};
650
+ alignas(16) static constexpr uint8_t tbl_v1[16] = {
651
+ 0x80, 0, 0x80, 0x80, 1, 0x80, //
652
+ 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
653
+ // The interleaved vectors will be named A, B, C; temporaries with suffix
654
+ // 0..2 indicate which input vector's lanes they hold.
655
+ const auto shuf_A0 = LoadDup128(du, tbl_v0);
656
+ const auto shuf_A1 = LoadDup128(du, tbl_v1); // cannot reuse shuf_A0 (has 5)
657
+ const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
658
+ const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // 5..4..3..2..1..0
659
+ const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // ..4..3..2..1..0.
660
+ const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // .4..3..2..1..0..
661
+ const VFromD<D> A = BitCast(d, A0 | A1 | A2);
662
+
663
+ // B: v1[10],v0[10], v2[9],v1[9],v0[9] .. , v2[6],v1[6],v0[6], v2[5],v1[5]
664
+ const auto shuf_B0 = shuf_A2 + k6; // .A..9..8..7..6..
665
+ const auto shuf_B1 = shuf_A0 + k5; // A..9..8..7..6..5
666
+ const auto shuf_B2 = shuf_A1 + k5; // ..9..8..7..6..5.
667
+ const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
668
+ const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
669
+ const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
670
+ const VFromD<D> B = BitCast(d, B0 | B1 | B2);
671
+
672
+ // C: v2[15],v1[15],v0[15], v2[11],v1[11],v0[11], v2[10]
673
+ const auto shuf_C0 = shuf_B2 + k6; // ..F..E..D..C..B.
674
+ const auto shuf_C1 = shuf_B0 + k5; // .F..E..D..C..B..
675
+ const auto shuf_C2 = shuf_B1 + k5; // F..E..D..C..B..A
676
+ const auto C0 = TableLookupBytesOr0(v0, shuf_C0);
677
+ const auto C1 = TableLookupBytesOr0(v1, shuf_C1);
678
+ const auto C2 = TableLookupBytesOr0(v2, shuf_C2);
679
+ const VFromD<D> C = BitCast(d, C0 | C1 | C2);
680
+
681
+ detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
682
+ }
683
+
684
+ // >= 128-bit vector, 16-bit lanes
685
+ template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_GT_D(D, 8)>
686
+ HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
687
+ TFromD<D>* HWY_RESTRICT unaligned) {
688
+ const Repartition<uint8_t, decltype(d)> du8;
689
+ const auto k2 = Set(du8, uint8_t{2 * sizeof(TFromD<D>)});
690
+ const auto k3 = Set(du8, uint8_t{3 * sizeof(TFromD<D>)});
691
+
692
+ // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
693
+ // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be
694
+ // filled from other vectors are 0 for blending. Note that these are byte
695
+ // indices for 16-bit lanes.
696
+ alignas(16) static constexpr uint8_t tbl_v1[16] = {
697
+ 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80,
698
+ 2, 3, 0x80, 0x80, 0x80, 0x80, 4, 5};
699
+ alignas(16) static constexpr uint8_t tbl_v2[16] = {
700
+ 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80,
701
+ 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80};
702
+
703
+ // The interleaved vectors will be named A, B, C; temporaries with suffix
704
+ // 0..2 indicate which input vector's lanes they hold.
705
+ const auto shuf_A1 = LoadDup128(du8, tbl_v1); // 2..1..0.
706
+ // .2..1..0
707
+ const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);
708
+ const auto shuf_A2 = LoadDup128(du8, tbl_v2); // ..1..0..
709
+
710
+ const auto A0 = TableLookupBytesOr0(v0, shuf_A0);
711
+ const auto A1 = TableLookupBytesOr0(v1, shuf_A1);
712
+ const auto A2 = TableLookupBytesOr0(v2, shuf_A2);
713
+ const VFromD<D> A = BitCast(d, A0 | A1 | A2);
714
+
715
+ // B: v0[5] v2[4],v1[4],v0[4], v2[3],v1[3],v0[3], v2[2]
716
+ const auto shuf_B0 = shuf_A1 + k3; // 5..4..3.
717
+ const auto shuf_B1 = shuf_A2 + k3; // ..4..3..
718
+ const auto shuf_B2 = shuf_A0 + k2; // .4..3..2
719
+ const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
720
+ const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
721
+ const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
722
+ const VFromD<D> B = BitCast(d, B0 | B1 | B2);
723
+
724
+ // C: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5]
725
+ const auto shuf_C0 = shuf_B1 + k3; // ..7..6..
726
+ const auto shuf_C1 = shuf_B2 + k3; // .7..6..5
727
+ const auto shuf_C2 = shuf_B0 + k2; // 7..6..5.
728
+ const auto C0 = TableLookupBytesOr0(v0, shuf_C0);
729
+ const auto C1 = TableLookupBytesOr0(v1, shuf_C1);
730
+ const auto C2 = TableLookupBytesOr0(v2, shuf_C2);
731
+ const VFromD<D> C = BitCast(d, C0 | C1 | C2);
732
+
733
+ detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
734
+ }
735
+
736
+ // >= 128-bit vector, 32-bit lanes
737
+ template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_GT_D(D, 8)>
738
+ HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
739
+ TFromD<D>* HWY_RESTRICT unaligned) {
740
+ const RepartitionToWide<decltype(d)> dw;
741
+
742
+ const VFromD<D> v10_v00 = InterleaveLower(d, v0, v1);
743
+ const VFromD<D> v01_v20 = OddEven(v0, v2);
744
+ // A: v0[1], v2[0],v1[0],v0[0] (<- lane 0)
745
+ const VFromD<D> A = BitCast(
746
+ d, InterleaveLower(dw, BitCast(dw, v10_v00), BitCast(dw, v01_v20)));
747
+
748
+ const VFromD<D> v1_321 = ShiftRightLanes<1>(d, v1);
749
+ const VFromD<D> v0_32 = ShiftRightLanes<2>(d, v0);
750
+ const VFromD<D> v21_v11 = OddEven(v2, v1_321);
751
+ const VFromD<D> v12_v02 = OddEven(v1_321, v0_32);
752
+ // B: v1[2],v0[2], v2[1],v1[1]
753
+ const VFromD<D> B = BitCast(
754
+ d, InterleaveLower(dw, BitCast(dw, v21_v11), BitCast(dw, v12_v02)));
755
+
756
+ // Notation refers to the upper 2 lanes of the vector for InterleaveUpper.
757
+ const VFromD<D> v23_v13 = OddEven(v2, v1_321);
758
+ const VFromD<D> v03_v22 = OddEven(v0, v2);
759
+ // C: v2[3],v1[3],v0[3], v2[2]
760
+ const VFromD<D> C = BitCast(
761
+ d, InterleaveUpper(dw, BitCast(dw, v03_v22), BitCast(dw, v23_v13)));
762
+
763
+ detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
764
+ }
765
+
766
+ // >= 128-bit vector, 64-bit lanes
767
+ template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_GT_D(D, 8)>
768
+ HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
769
+ TFromD<D>* HWY_RESTRICT unaligned) {
770
+ const VFromD<D> A = InterleaveLower(d, v0, v1);
771
+ const VFromD<D> B = OddEven(v0, v2);
772
+ const VFromD<D> C = InterleaveUpper(d, v1, v2);
773
+ detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
774
+ }
775
+
776
+ // 64-bit vector, 8-bit lanes
777
+ template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_D(D, 8)>
778
+ HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
779
+ VFromD<D> part2, D d,
780
+ TFromD<D>* HWY_RESTRICT unaligned) {
781
+ // Use full vectors for the shuffles and first result.
782
+ constexpr size_t kFullN = 16 / sizeof(TFromD<D>);
783
+ const Full128<uint8_t> du;
784
+ const Full128<TFromD<D>> d_full;
785
+ const auto k5 = Set(du, uint8_t{5});
786
+ const auto k6 = Set(du, uint8_t{6});
787
+
788
+ const VFromD<decltype(d_full)> v0{part0.raw};
789
+ const VFromD<decltype(d_full)> v1{part1.raw};
790
+ const VFromD<decltype(d_full)> v2{part2.raw};
791
+
792
+ // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
793
+ // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be
794
+ // filled from other vectors are 0 for blending.
795
+ alignas(16) static constexpr uint8_t tbl_v0[16] = {
796
+ 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
797
+ 3, 0x80, 0x80, 4, 0x80, 0x80, 5};
798
+ alignas(16) static constexpr uint8_t tbl_v1[16] = {
799
+ 0x80, 0, 0x80, 0x80, 1, 0x80, //
800
+ 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
801
+ // The interleaved vectors will be named A, B, C; temporaries with suffix
802
+ // 0..2 indicate which input vector's lanes they hold.
803
+ const auto shuf_A0 = Load(du, tbl_v0);
804
+ const auto shuf_A1 = Load(du, tbl_v1); // cannot reuse shuf_A0 (5 in MSB)
805
+ const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
806
+ const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // 5..4..3..2..1..0
807
+ const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // ..4..3..2..1..0.
808
+ const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // .4..3..2..1..0..
809
+ const auto A = BitCast(d_full, A0 | A1 | A2);
810
+ StoreU(A, d_full, unaligned + 0 * kFullN);
811
+
812
+ // Second (HALF) vector: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5]
813
+ const auto shuf_B0 = shuf_A2 + k6; // ..7..6..
814
+ const auto shuf_B1 = shuf_A0 + k5; // .7..6..5
815
+ const auto shuf_B2 = shuf_A1 + k5; // 7..6..5.
816
+ const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
817
+ const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
818
+ const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
819
+ const VFromD<D> B{BitCast(d_full, B0 | B1 | B2).raw};
820
+ StoreU(B, d, unaligned + 1 * kFullN);
821
+ }
822
+
823
+ // 64-bit vector, 16-bit lanes
824
+ template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_LANES_D(D, 4)>
825
+ HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
826
+ VFromD<D> part2, D dh,
827
+ TFromD<D>* HWY_RESTRICT unaligned) {
828
+ const Twice<D> d_full;
829
+ const Full128<uint8_t> du8;
830
+ const auto k2 = Set(du8, uint8_t{2 * sizeof(TFromD<D>)});
831
+ const auto k3 = Set(du8, uint8_t{3 * sizeof(TFromD<D>)});
832
+
833
+ const VFromD<decltype(d_full)> v0{part0.raw};
834
+ const VFromD<decltype(d_full)> v1{part1.raw};
835
+ const VFromD<decltype(d_full)> v2{part2.raw};
836
+
837
+ // Interleave part (v0,v1,v2) to full (MSB on left, lane 0 on right):
838
+ // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. We're expanding v0 lanes
839
+ // to their place, with 0x80 so lanes to be filled from other vectors are 0
840
+ // to enable blending by ORing together.
841
+ alignas(16) static constexpr uint8_t tbl_v1[16] = {
842
+ 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80,
843
+ 2, 3, 0x80, 0x80, 0x80, 0x80, 4, 5};
844
+ alignas(16) static constexpr uint8_t tbl_v2[16] = {
845
+ 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80,
846
+ 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80};
847
+
848
+ // The interleaved vectors will be named A, B; temporaries with suffix
849
+ // 0..2 indicate which input vector's lanes they hold.
850
+ const auto shuf_A1 = Load(du8, tbl_v1); // 2..1..0.
851
+ // .2..1..0
852
+ const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);
853
+ const auto shuf_A2 = Load(du8, tbl_v2); // ..1..0..
854
+
855
+ const auto A0 = TableLookupBytesOr0(v0, shuf_A0);
856
+ const auto A1 = TableLookupBytesOr0(v1, shuf_A1);
857
+ const auto A2 = TableLookupBytesOr0(v2, shuf_A2);
858
+ const VFromD<decltype(d_full)> A = BitCast(d_full, A0 | A1 | A2);
859
+ StoreU(A, d_full, unaligned);
860
+
861
+ // Second (HALF) vector: v2[3],v1[3],v0[3], v2[2]
862
+ const auto shuf_B0 = shuf_A1 + k3; // ..3.
863
+ const auto shuf_B1 = shuf_A2 + k3; // .3..
864
+ const auto shuf_B2 = shuf_A0 + k2; // 3..2
865
+ const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
866
+ const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
867
+ const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
868
+ const VFromD<decltype(d_full)> B = BitCast(d_full, B0 | B1 | B2);
869
+ StoreU(VFromD<D>{B.raw}, dh, unaligned + MaxLanes(d_full));
870
+ }
871
+
872
+ // 64-bit vector, 32-bit lanes
873
+ template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_LANES_D(D, 2)>
874
+ HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
875
+ TFromD<D>* HWY_RESTRICT unaligned) {
876
+ // (same code as 128-bit vector, 64-bit lanes)
877
+ const VFromD<D> v10_v00 = InterleaveLower(d, v0, v1);
878
+ const VFromD<D> v01_v20 = OddEven(v0, v2);
879
+ const VFromD<D> v21_v11 = InterleaveUpper(d, v1, v2);
880
+ constexpr size_t kN = MaxLanes(d);
881
+ StoreU(v10_v00, d, unaligned + 0 * kN);
882
+ StoreU(v01_v20, d, unaligned + 1 * kN);
883
+ StoreU(v21_v11, d, unaligned + 2 * kN);
884
+ }
885
+
886
+ // 64-bit lanes are handled by the N=1 case below.
887
+
888
+ // <= 32-bit vector, 8-bit lanes
889
+ template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_LE_D(D, 4),
890
+ HWY_IF_LANES_GT_D(D, 1)>
891
+ HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
892
+ VFromD<D> part2, D d,
893
+ TFromD<D>* HWY_RESTRICT unaligned) {
894
+ // Use full vectors for the shuffles and result.
895
+ const Full128<uint8_t> du;
896
+ const Full128<TFromD<D>> d_full;
897
+
898
+ const VFromD<decltype(d_full)> v0{part0.raw};
899
+ const VFromD<decltype(d_full)> v1{part1.raw};
900
+ const VFromD<decltype(d_full)> v2{part2.raw};
901
+
902
+ // Interleave (v0,v1,v2). We're expanding v0 lanes to their place, with 0x80
903
+ // so lanes to be filled from other vectors are 0 to enable blending by ORing
904
+ // together.
905
+ alignas(16) static constexpr uint8_t tbl_v0[16] = {
906
+ 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80,
907
+ 0x80, 3, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
908
+ // The interleaved vector will be named A; temporaries with suffix
909
+ // 0..2 indicate which input vector's lanes they hold.
910
+ const auto shuf_A0 = Load(du, tbl_v0);
911
+ const auto shuf_A1 = CombineShiftRightBytes<15>(du, shuf_A0, shuf_A0);
912
+ const auto shuf_A2 = CombineShiftRightBytes<14>(du, shuf_A0, shuf_A0);
913
+ const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // ......3..2..1..0
914
+ const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // .....3..2..1..0.
915
+ const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // ....3..2..1..0..
916
+ const VFromD<decltype(d_full)> A = BitCast(d_full, A0 | A1 | A2);
917
+ alignas(16) TFromD<D> buf[MaxLanes(d_full)];
918
+ StoreU(A, d_full, buf);
919
+ CopyBytes<d.MaxBytes() * 3>(buf, unaligned);
920
+ }
921
+
922
+ // 32-bit vector, 16-bit lanes
923
+ template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_LANES_D(D, 2)>
924
+ HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
925
+ VFromD<D> part2, D d,
926
+ TFromD<D>* HWY_RESTRICT unaligned) {
927
+ // Use full vectors for the shuffles and result.
928
+ const Full128<uint8_t> du8;
929
+ const Full128<TFromD<D>> d_full;
930
+
931
+ const VFromD<decltype(d_full)> v0{part0.raw};
932
+ const VFromD<decltype(d_full)> v1{part1.raw};
933
+ const VFromD<decltype(d_full)> v2{part2.raw};
934
+
935
+ // Interleave (v0,v1,v2). We're expanding v0 lanes to their place, with 0x80
936
+ // so lanes to be filled from other vectors are 0 to enable blending by ORing
937
+ // together.
938
+ alignas(16) static constexpr uint8_t tbl_v2[16] = {
939
+ 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80,
940
+ 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80};
941
+ // The interleaved vector will be named A; temporaries with suffix
942
+ // 0..2 indicate which input vector's lanes they hold.
943
+ const auto shuf_A2 = // ..1..0..
944
+ Load(du8, tbl_v2);
945
+ const auto shuf_A1 = // ...1..0.
946
+ CombineShiftRightBytes<2>(du8, shuf_A2, shuf_A2);
947
+ const auto shuf_A0 = // ....1..0
948
+ CombineShiftRightBytes<4>(du8, shuf_A2, shuf_A2);
949
+ const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // ..1..0
950
+ const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // .1..0.
951
+ const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // 1..0..
952
+ const auto A = BitCast(d_full, A0 | A1 | A2);
953
+ alignas(16) TFromD<D> buf[MaxLanes(d_full)];
954
+ StoreU(A, d_full, buf);
955
+ CopyBytes<d.MaxBytes() * 3>(buf, unaligned);
956
+ }
957
+
958
+ // Single-element vector, any lane size: just store directly
959
+ template <class D, HWY_IF_LANES_D(D, 1)>
960
+ HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
961
+ TFromD<D>* HWY_RESTRICT unaligned) {
962
+ StoreU(v0, d, unaligned + 0);
963
+ StoreU(v1, d, unaligned + 1);
964
+ StoreU(v2, d, unaligned + 2);
965
+ }
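StoreInterleaved3 packs three planar inputs into consecutive (v0, v1, v2) triples, e.g. planar-to-packed RGB conversion. A sketch with an illustrative helper name and size assumption:

```cpp
#include <stdint.h>

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Packs three planar channels into interleaved RGB output.
// num_pixels is assumed to be a multiple of the vector length.
void PlanarToPackedRgb(const uint8_t* HWY_RESTRICT r,
                       const uint8_t* HWY_RESTRICT g,
                       const uint8_t* HWY_RESTRICT b,
                       uint8_t* HWY_RESTRICT rgb, size_t num_pixels) {
  const hn::ScalableTag<uint8_t> d;
  const size_t N = hn::Lanes(d);
  for (size_t i = 0; i < num_pixels; i += N) {
    hn::StoreInterleaved3(hn::LoadU(d, r + i), hn::LoadU(d, g + i),
                          hn::LoadU(d, b + i), d, rgb + 3 * i);
  }
}
```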
966
+
967
+ // ------------------------------ StoreInterleaved4
968
+
969
+ namespace detail {
970
+
971
+ // Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
972
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
973
+ HWY_INLINE void StoreTransposedBlocks4(VFromD<D> vA, VFromD<D> vB, VFromD<D> vC,
974
+ VFromD<D> vD, D d,
975
+ TFromD<D>* HWY_RESTRICT unaligned) {
976
+ constexpr size_t kN = MaxLanes(d);
977
+ StoreU(vA, d, unaligned + 0 * kN);
978
+ StoreU(vB, d, unaligned + 1 * kN);
979
+ StoreU(vC, d, unaligned + 2 * kN);
980
+ StoreU(vD, d, unaligned + 3 * kN);
981
+ }
982
+
983
+ } // namespace detail
984
+
985
+ // >= 128-bit vector, 8..32-bit lanes
986
+ template <class D, HWY_IF_NOT_T_SIZE_D(D, 8), HWY_IF_V_SIZE_GT_D(D, 8)>
987
+ HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
988
+ VFromD<D> v3, D d,
989
+ TFromD<D>* HWY_RESTRICT unaligned) {
990
+ const RepartitionToWide<decltype(d)> dw;
991
+ const auto v10L = ZipLower(dw, v0, v1); // .. v1[0] v0[0]
992
+ const auto v32L = ZipLower(dw, v2, v3);
993
+ const auto v10U = ZipUpper(dw, v0, v1);
994
+ const auto v32U = ZipUpper(dw, v2, v3);
995
+ // The interleaved vectors are vA, vB, vC, vD.
996
+ const VFromD<D> vA = BitCast(d, InterleaveLower(dw, v10L, v32L)); // 3210
997
+ const VFromD<D> vB = BitCast(d, InterleaveUpper(dw, v10L, v32L));
998
+ const VFromD<D> vC = BitCast(d, InterleaveLower(dw, v10U, v32U));
999
+ const VFromD<D> vD = BitCast(d, InterleaveUpper(dw, v10U, v32U));
1000
+ detail::StoreTransposedBlocks4(vA, vB, vC, vD, d, unaligned);
1001
+ }
1002
+
1003
+ // >= 128-bit vector, 64-bit lanes
1004
+ template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_GT_D(D, 8)>
1005
+ HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
1006
+ VFromD<D> v3, D d,
1007
+ TFromD<D>* HWY_RESTRICT unaligned) {
1008
+ // The interleaved vectors are vA, vB, vC, vD.
1009
+ const VFromD<D> vA = InterleaveLower(d, v0, v1); // v1[0] v0[0]
1010
+ const VFromD<D> vB = InterleaveLower(d, v2, v3);
1011
+ const VFromD<D> vC = InterleaveUpper(d, v0, v1);
1012
+ const VFromD<D> vD = InterleaveUpper(d, v2, v3);
1013
+ detail::StoreTransposedBlocks4(vA, vB, vC, vD, d, unaligned);
1014
+ }
1015
+
1016
+ // 64-bit vector, 8..32-bit lanes
1017
+ template <class D, HWY_IF_NOT_T_SIZE_D(D, 8), HWY_IF_V_SIZE_D(D, 8)>
1018
+ HWY_API void StoreInterleaved4(VFromD<D> part0, VFromD<D> part1,
1019
+ VFromD<D> part2, VFromD<D> part3, D /* tag */,
1020
+ TFromD<D>* HWY_RESTRICT unaligned) {
1021
+ // Use full vectors to reduce the number of stores.
1022
+ const Full128<TFromD<D>> d_full;
1023
+ const RepartitionToWide<decltype(d_full)> dw;
1024
+ const VFromD<decltype(d_full)> v0{part0.raw};
1025
+ const VFromD<decltype(d_full)> v1{part1.raw};
1026
+ const VFromD<decltype(d_full)> v2{part2.raw};
1027
+ const VFromD<decltype(d_full)> v3{part3.raw};
1028
+ const auto v10 = ZipLower(dw, v0, v1); // v1[0] v0[0]
1029
+ const auto v32 = ZipLower(dw, v2, v3);
1030
+ const auto A = BitCast(d_full, InterleaveLower(dw, v10, v32));
1031
+ const auto B = BitCast(d_full, InterleaveUpper(dw, v10, v32));
1032
+ StoreU(A, d_full, unaligned);
1033
+ StoreU(B, d_full, unaligned + MaxLanes(d_full));
1034
+ }
1035
+
1036
+ // 64-bit vector, 64-bit lane
1037
+ template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_LANES_D(D, 1)>
1038
+ HWY_API void StoreInterleaved4(VFromD<D> part0, VFromD<D> part1,
1039
+ VFromD<D> part2, VFromD<D> part3, D /* tag */,
1040
+ TFromD<D>* HWY_RESTRICT unaligned) {
1041
+ // Use full vectors to reduce the number of stores.
1042
+ const Full128<TFromD<D>> d_full;
1043
+ const VFromD<decltype(d_full)> v0{part0.raw};
1044
+ const VFromD<decltype(d_full)> v1{part1.raw};
1045
+ const VFromD<decltype(d_full)> v2{part2.raw};
1046
+ const VFromD<decltype(d_full)> v3{part3.raw};
1047
+ const auto A = InterleaveLower(d_full, v0, v1); // v1[0] v0[0]
1048
+ const auto B = InterleaveLower(d_full, v2, v3);
1049
+ StoreU(A, d_full, unaligned);
1050
+ StoreU(B, d_full, unaligned + MaxLanes(d_full));
1051
+ }
1052
+
1053
+ // <= 32-bit vectors
1054
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 4)>
1055
+ HWY_API void StoreInterleaved4(VFromD<D> part0, VFromD<D> part1,
1056
+ VFromD<D> part2, VFromD<D> part3, D d,
1057
+ TFromD<D>* HWY_RESTRICT unaligned) {
1058
+ // Use full vectors to reduce the number of stores.
1059
+ const Full128<TFromD<D>> d_full;
1060
+ const RepartitionToWide<decltype(d_full)> dw;
1061
+ const VFromD<decltype(d_full)> v0{part0.raw};
1062
+ const VFromD<decltype(d_full)> v1{part1.raw};
1063
+ const VFromD<decltype(d_full)> v2{part2.raw};
1064
+ const VFromD<decltype(d_full)> v3{part3.raw};
1065
+ const auto v10 = ZipLower(dw, v0, v1); // .. v1[0] v0[0]
1066
+ const auto v32 = ZipLower(dw, v2, v3);
1067
+ const auto v3210 = BitCast(d_full, InterleaveLower(dw, v10, v32));
1068
+ alignas(16) TFromD<D> buf[MaxLanes(d_full)];
1069
+ StoreU(v3210, d_full, buf);
1070
+ CopyBytes<d.MaxBytes() * 4>(buf, unaligned);
1071
+ }
1072
+
1073
+ #endif // HWY_NATIVE_LOAD_STORE_INTERLEAVED
1074
+
1075
+ // ------------------------------ LoadN
1076
+
1077
+ #if (defined(HWY_NATIVE_LOAD_N) == defined(HWY_TARGET_TOGGLE))
1078
+
1079
+ #ifdef HWY_NATIVE_LOAD_N
1080
+ #undef HWY_NATIVE_LOAD_N
1081
+ #else
1082
+ #define HWY_NATIVE_LOAD_N
1083
+ #endif
1084
+
1085
+ #if HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE
1086
+ namespace detail {
1087
+
1088
+ template <class DTo, class DFrom>
1089
+ HWY_INLINE VFromD<DTo> LoadNResizeBitCast(DTo d_to, DFrom d_from,
1090
+ VFromD<DFrom> v) {
1091
+ #if HWY_TARGET <= HWY_SSE2
1092
+ // On SSE2/SSSE3/SSE4, the LoadU operation will zero out any lanes of v.raw
1093
+ // past the first (lowest-index) Lanes(d_from) lanes of v.raw if
1094
+ // sizeof(decltype(v.raw)) > d_from.MaxBytes() is true
1095
+ (void)d_from;
1096
+ return ResizeBitCast(d_to, v);
1097
+ #else
1098
+ // On other targets such as PPC/NEON, the contents of any lanes past the first
1099
+ // (lowest-index) Lanes(d_from) lanes of v.raw might be non-zero if
1100
+ // sizeof(decltype(v.raw)) > d_from.MaxBytes() is true.
1101
+ return ZeroExtendResizeBitCast(d_to, d_from, v);
1102
+ #endif
1103
+ }
1104
+
1105
+ } // namespace detail
1106
+
1107
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 1)>
1108
+ HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
1109
+ size_t num_lanes) {
1110
+ return (num_lanes > 0) ? LoadU(d, p) : Zero(d);
1111
+ }
1112
+
1113
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 1)>
1114
+ HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
1115
+ size_t num_lanes) {
1116
+ return (num_lanes > 0) ? LoadU(d, p) : no;
1117
+ }
1118
+
1119
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)>
1120
+ HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
1121
+ size_t num_lanes) {
1122
+ const FixedTag<TFromD<D>, 1> d1;
1123
+
1124
+ if (num_lanes >= 2) return LoadU(d, p);
1125
+ if (num_lanes == 0) return Zero(d);
1126
+ return detail::LoadNResizeBitCast(d, d1, LoadU(d1, p));
1127
+ }
1128
+
1129
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)>
1130
+ HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
1131
+ size_t num_lanes) {
1132
+ const FixedTag<TFromD<D>, 1> d1;
1133
+
1134
+ if (num_lanes >= 2) return LoadU(d, p);
1135
+ if (num_lanes == 0) return no;
1136
+ return InterleaveLower(ResizeBitCast(d, LoadU(d1, p)), no);
1137
+ }
1138
+
1139
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)>
1140
+ HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
1141
+ size_t num_lanes) {
1142
+ const FixedTag<TFromD<D>, 2> d2;
1143
+ const Half<decltype(d2)> d1;
1144
+
1145
+ if (num_lanes >= 4) return LoadU(d, p);
1146
+ if (num_lanes == 0) return Zero(d);
1147
+ if (num_lanes == 1) return detail::LoadNResizeBitCast(d, d1, LoadU(d1, p));
1148
+
1149
+ // Two or three lanes.
1150
+ const VFromD<D> v_lo = detail::LoadNResizeBitCast(d, d2, LoadU(d2, p));
1151
+ return (num_lanes == 2) ? v_lo : InsertLane(v_lo, 2, p[2]);
1152
+ }
1153
+
1154
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)>
1155
+ HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
1156
+ size_t num_lanes) {
1157
+ const FixedTag<TFromD<D>, 2> d2;
1158
+
1159
+ if (num_lanes >= 4) return LoadU(d, p);
1160
+ if (num_lanes == 0) return no;
1161
+ if (num_lanes == 1) return InsertLane(no, 0, p[0]);
1162
+
1163
+ // Two or three lanes.
1164
+ const VFromD<D> v_lo =
1165
+ ConcatUpperLower(d, no, ResizeBitCast(d, LoadU(d2, p)));
1166
+ return (num_lanes == 2) ? v_lo : InsertLane(v_lo, 2, p[2]);
1167
+ }
1168
+
1169
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)>
1170
+ HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
1171
+ size_t num_lanes) {
1172
+ const FixedTag<TFromD<D>, 4> d4;
1173
+ const Half<decltype(d4)> d2;
1174
+ const Half<decltype(d2)> d1;
1175
+
1176
+ if (num_lanes >= 8) return LoadU(d, p);
1177
+ if (num_lanes == 0) return Zero(d);
1178
+ if (num_lanes == 1) return detail::LoadNResizeBitCast(d, d1, LoadU(d1, p));
1179
+
1180
+ const size_t leading_len = num_lanes & 4;
1181
+ VFromD<decltype(d4)> v_trailing = Zero(d4);
1182
+
1183
+ if ((num_lanes & 2) != 0) {
1184
+ const VFromD<decltype(d2)> v_trailing_lo2 = LoadU(d2, p + leading_len);
1185
+ if ((num_lanes & 1) != 0) {
1186
+ v_trailing = Combine(
1187
+ d4,
1188
+ detail::LoadNResizeBitCast(d2, d1, LoadU(d1, p + leading_len + 2)),
1189
+ v_trailing_lo2);
1190
+ } else {
1191
+ v_trailing = detail::LoadNResizeBitCast(d4, d2, v_trailing_lo2);
1192
+ }
1193
+ } else if ((num_lanes & 1) != 0) {
1194
+ v_trailing = detail::LoadNResizeBitCast(d4, d1, LoadU(d1, p + leading_len));
1195
+ }
1196
+
1197
+ if (leading_len != 0) {
1198
+ return Combine(d, v_trailing, LoadU(d4, p));
1199
+ } else {
1200
+ return detail::LoadNResizeBitCast(d, d4, v_trailing);
1201
+ }
1202
+ }
1203
+
1204
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)>
1205
+ HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
1206
+ size_t num_lanes) {
1207
+ const FixedTag<TFromD<D>, 4> d4;
1208
+ const Half<decltype(d4)> d2;
1209
+ const Half<decltype(d2)> d1;
1210
+
1211
+ if (num_lanes >= 8) return LoadU(d, p);
1212
+ if (num_lanes == 0) return no;
1213
+ if (num_lanes == 1) return InsertLane(no, 0, p[0]);
1214
+
1215
+ const size_t leading_len = num_lanes & 4;
1216
+ VFromD<decltype(d4)> v_trailing = ResizeBitCast(d4, no);
1217
+
1218
+ if ((num_lanes & 2) != 0) {
1219
+ const VFromD<decltype(d2)> v_trailing_lo2 = LoadU(d2, p + leading_len);
1220
+ if ((num_lanes & 1) != 0) {
1221
+ v_trailing = Combine(
1222
+ d4,
1223
+ InterleaveLower(ResizeBitCast(d2, LoadU(d1, p + leading_len + 2)),
1224
+ ResizeBitCast(d2, no)),
1225
+ v_trailing_lo2);
1226
+ } else {
1227
+ v_trailing = ConcatUpperLower(d4, ResizeBitCast(d4, no),
1228
+ ResizeBitCast(d4, v_trailing_lo2));
1229
+ }
1230
+ } else if ((num_lanes & 1) != 0) {
1231
+ v_trailing = InsertLane(ResizeBitCast(d4, no), 0, p[leading_len]);
1232
+ }
1233
+
1234
+ if (leading_len != 0) {
1235
+ return Combine(d, v_trailing, LoadU(d4, p));
1236
+ } else {
1237
+ return ConcatUpperLower(d, no, ResizeBitCast(d, v_trailing));
1238
+ }
1239
+ }
1240
+
1241
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 16)>
1242
+ HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
1243
+ size_t num_lanes) {
1244
+ const FixedTag<TFromD<D>, 8> d8;
1245
+ const Half<decltype(d8)> d4;
1246
+ const Half<decltype(d4)> d2;
1247
+ const Half<decltype(d2)> d1;
1248
+
1249
+ if (num_lanes >= 16) return LoadU(d, p);
1250
+ if (num_lanes == 0) return Zero(d);
1251
+ if (num_lanes == 1) return detail::LoadNResizeBitCast(d, d1, LoadU(d1, p));
1252
+
1253
+ const size_t leading_len = num_lanes & 12;
1254
+ VFromD<decltype(d4)> v_trailing = Zero(d4);
1255
+
1256
+ if ((num_lanes & 2) != 0) {
1257
+ const VFromD<decltype(d2)> v_trailing_lo2 = LoadU(d2, p + leading_len);
1258
+ if ((num_lanes & 1) != 0) {
1259
+ v_trailing = Combine(
1260
+ d4,
1261
+ detail::LoadNResizeBitCast(d2, d1, LoadU(d1, p + leading_len + 2)),
1262
+ v_trailing_lo2);
1263
+ } else {
1264
+ v_trailing = detail::LoadNResizeBitCast(d4, d2, v_trailing_lo2);
1265
+ }
1266
+ } else if ((num_lanes & 1) != 0) {
1267
+ v_trailing = detail::LoadNResizeBitCast(d4, d1, LoadU(d1, p + leading_len));
1268
+ }
1269
+
1270
+ if (leading_len != 0) {
1271
+ if (leading_len >= 8) {
1272
+ const VFromD<decltype(d8)> v_hi7 =
1273
+ ((leading_len & 4) != 0)
1274
+ ? Combine(d8, v_trailing, LoadU(d4, p + 8))
1275
+ : detail::LoadNResizeBitCast(d8, d4, v_trailing);
1276
+ return Combine(d, v_hi7, LoadU(d8, p));
1277
+ } else {
1278
+ return detail::LoadNResizeBitCast(d, d8,
1279
+ Combine(d8, v_trailing, LoadU(d4, p)));
1280
+ }
1281
+ } else {
1282
+ return detail::LoadNResizeBitCast(d, d4, v_trailing);
1283
+ }
1284
+ }
1285
+
1286
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 16)>
1287
+ HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
1288
+ size_t num_lanes) {
1289
+ const FixedTag<TFromD<D>, 8> d8;
1290
+ const Half<decltype(d8)> d4;
1291
+ const Half<decltype(d4)> d2;
1292
+ const Half<decltype(d2)> d1;
1293
+
1294
+ if (num_lanes >= 16) return LoadU(d, p);
1295
+ if (num_lanes == 0) return no;
1296
+ if (num_lanes == 1) return InsertLane(no, 0, p[0]);
1297
+
1298
+ const size_t leading_len = num_lanes & 12;
1299
+ VFromD<decltype(d4)> v_trailing = ResizeBitCast(d4, no);
1300
+
1301
+ if ((num_lanes & 2) != 0) {
1302
+ const VFromD<decltype(d2)> v_trailing_lo2 = LoadU(d2, p + leading_len);
1303
+ if ((num_lanes & 1) != 0) {
1304
+ v_trailing = Combine(
1305
+ d4,
1306
+ InterleaveLower(ResizeBitCast(d2, LoadU(d1, p + leading_len + 2)),
1307
+ ResizeBitCast(d2, no)),
1308
+ v_trailing_lo2);
1309
+ } else {
1310
+ v_trailing = ConcatUpperLower(d4, ResizeBitCast(d4, no),
1311
+ ResizeBitCast(d4, v_trailing_lo2));
1312
+ }
1313
+ } else if ((num_lanes & 1) != 0) {
1314
+ v_trailing = InsertLane(ResizeBitCast(d4, no), 0, p[leading_len]);
1315
+ }
1316
+
1317
+ if (leading_len != 0) {
1318
+ if (leading_len >= 8) {
1319
+ const VFromD<decltype(d8)> v_hi7 =
1320
+ ((leading_len & 4) != 0)
1321
+ ? Combine(d8, v_trailing, LoadU(d4, p + 8))
1322
+ : ConcatUpperLower(d8, ResizeBitCast(d8, no),
1323
+ ResizeBitCast(d8, v_trailing));
1324
+ return Combine(d, v_hi7, LoadU(d8, p));
1325
+ } else {
1326
+ return ConcatUpperLower(
1327
+ d, ResizeBitCast(d, no),
1328
+ ResizeBitCast(d, Combine(d8, v_trailing, LoadU(d4, p))));
1329
+ }
1330
+ } else {
1331
+ const Repartition<uint32_t, D> du32;
1332
+ // lowest 4 bytes from v_trailing, next 4 from no.
1333
+ const VFromD<decltype(du32)> lo8 =
1334
+ InterleaveLower(ResizeBitCast(du32, v_trailing), BitCast(du32, no));
1335
+ return ConcatUpperLower(d, ResizeBitCast(d, no), ResizeBitCast(d, lo8));
1336
+ }
1337
+ }
1338
+
1339
+ #if HWY_MAX_BYTES >= 32
1340
+
1341
+ template <class D, HWY_IF_V_SIZE_GT_D(D, 16)>
1342
+ HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
1343
+ size_t num_lanes) {
1344
+ if (num_lanes >= Lanes(d)) return LoadU(d, p);
1345
+
1346
+ const Half<decltype(d)> dh;
1347
+ const size_t half_N = Lanes(dh);
1348
+ if (num_lanes <= half_N) {
1349
+ return ZeroExtendVector(d, LoadN(dh, p, num_lanes));
1350
+ } else {
1351
+ const VFromD<decltype(dh)> v_lo = LoadU(dh, p);
1352
+ const VFromD<decltype(dh)> v_hi = LoadN(dh, p + half_N, num_lanes - half_N);
1353
+ return Combine(d, v_hi, v_lo);
1354
+ }
1355
+ }
1356
+
1357
+ template <class D, HWY_IF_V_SIZE_GT_D(D, 16)>
1358
+ HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
1359
+ size_t num_lanes) {
1360
+ if (num_lanes >= Lanes(d)) return LoadU(d, p);
1361
+
1362
+ const Half<decltype(d)> dh;
1363
+ const size_t half_N = Lanes(dh);
1364
+ const VFromD<decltype(dh)> no_h = LowerHalf(no);
1365
+ if (num_lanes <= half_N) {
1366
+ return ConcatUpperLower(d, no,
1367
+ ResizeBitCast(d, LoadNOr(no_h, dh, p, num_lanes)));
1368
+ } else {
1369
+ const VFromD<decltype(dh)> v_lo = LoadU(dh, p);
1370
+ const VFromD<decltype(dh)> v_hi =
1371
+ LoadNOr(no_h, dh, p + half_N, num_lanes - half_N);
1372
+ return Combine(d, v_hi, v_lo);
1373
+ }
1374
+ }
1375
+
1376
+ #endif // HWY_MAX_BYTES >= 32
1377
+ #else // !HWY_MEM_OPS_MIGHT_FAULT || HWY_HAVE_SCALABLE
1378
+
1379
+ // For SVE and non-sanitizer AVX-512; RVV has its own specialization.
1380
+ template <class D>
1381
+ HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
1382
+ size_t num_lanes) {
1383
+ #if HWY_MEM_OPS_MIGHT_FAULT
1384
+ if (num_lanes <= 0) return Zero(d);
1385
+ #endif
1386
+
1387
+ return MaskedLoad(FirstN(d, num_lanes), d, p);
1388
+ }
1389
+
1390
+ template <class D>
1391
+ HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
1392
+ size_t num_lanes) {
1393
+ #if HWY_MEM_OPS_MIGHT_FAULT
1394
+ if (num_lanes <= 0) return no;
1395
+ #endif
1396
+
1397
+ return MaskedLoadOr(no, FirstN(d, num_lanes), d, p);
1398
+ }
1399
+
1400
+ #endif // HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE
1401
+ #endif // HWY_NATIVE_LOAD_N
1402
+
1403
+ // ------------------------------ StoreN
1404
+ #if (defined(HWY_NATIVE_STORE_N) == defined(HWY_TARGET_TOGGLE))
1405
+ #ifdef HWY_NATIVE_STORE_N
1406
+ #undef HWY_NATIVE_STORE_N
1407
+ #else
1408
+ #define HWY_NATIVE_STORE_N
1409
+ #endif
1410
+
1411
+ #if HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE
1412
+ namespace detail {
1413
+
1414
+ template <class DH, HWY_IF_V_SIZE_LE_D(DH, 4)>
1415
+ HWY_INLINE VFromD<DH> StoreNGetUpperHalf(DH dh, VFromD<Twice<DH>> v) {
1416
+ constexpr size_t kMinShrVectBytes =
1417
+ (HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES) ? 8 : 16;
1418
+ const FixedTag<uint8_t, kMinShrVectBytes> d_shift;
1419
+ return ResizeBitCast(
1420
+ dh, ShiftRightBytes<dh.MaxBytes()>(d_shift, ResizeBitCast(d_shift, v)));
1421
+ }
1422
+
1423
+ template <class DH, HWY_IF_V_SIZE_GT_D(DH, 4)>
1424
+ HWY_INLINE VFromD<DH> StoreNGetUpperHalf(DH dh, VFromD<Twice<DH>> v) {
1425
+ return UpperHalf(dh, v);
1426
+ }
1427
+
1428
+ } // namespace detail
1429
+
1430
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 1),
1431
+ typename T = TFromD<D>>
1432
+ HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
1433
+ size_t max_lanes_to_store) {
1434
+ if (max_lanes_to_store > 0) {
1435
+ StoreU(v, d, p);
1436
+ }
1437
+ }
1438
+
1439
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2),
1440
+ typename T = TFromD<D>>
1441
+ HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
1442
+ size_t max_lanes_to_store) {
1443
+ if (max_lanes_to_store > 1) {
1444
+ StoreU(v, d, p);
1445
+ } else if (max_lanes_to_store == 1) {
1446
+ const FixedTag<TFromD<D>, 1> d1;
1447
+ StoreU(LowerHalf(d1, v), d1, p);
1448
+ }
1449
+ }
1450
+
1451
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4),
1452
+ typename T = TFromD<D>>
1453
+ HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
1454
+ size_t max_lanes_to_store) {
1455
+ const FixedTag<TFromD<D>, 2> d2;
1456
+ const Half<decltype(d2)> d1;
1457
+
1458
+ if (max_lanes_to_store > 1) {
1459
+ if (max_lanes_to_store >= 4) {
1460
+ StoreU(v, d, p);
1461
+ } else {
1462
+ StoreU(ResizeBitCast(d2, v), d2, p);
1463
+ if (max_lanes_to_store == 3) {
1464
+ StoreU(ResizeBitCast(d1, detail::StoreNGetUpperHalf(d2, v)), d1, p + 2);
1465
+ }
1466
+ }
1467
+ } else if (max_lanes_to_store == 1) {
1468
+ StoreU(ResizeBitCast(d1, v), d1, p);
1469
+ }
1470
+ }
1471
+
1472
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8),
1473
+ typename T = TFromD<D>>
1474
+ HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
1475
+ size_t max_lanes_to_store) {
1476
+ const FixedTag<TFromD<D>, 4> d4;
1477
+ const Half<decltype(d4)> d2;
1478
+ const Half<decltype(d2)> d1;
1479
+
1480
+ if (max_lanes_to_store <= 1) {
1481
+ if (max_lanes_to_store == 1) {
1482
+ StoreU(ResizeBitCast(d1, v), d1, p);
1483
+ }
1484
+ } else if (max_lanes_to_store >= 8) {
1485
+ StoreU(v, d, p);
1486
+ } else if (max_lanes_to_store >= 4) {
1487
+ StoreU(LowerHalf(d4, v), d4, p);
1488
+ StoreN(detail::StoreNGetUpperHalf(d4, v), d4, p + 4,
1489
+ max_lanes_to_store - 4);
1490
+ } else {
1491
+ StoreN(LowerHalf(d4, v), d4, p, max_lanes_to_store);
1492
+ }
1493
+ }
1494
+
1495
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 16),
1496
+ typename T = TFromD<D>>
1497
+ HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
1498
+ size_t max_lanes_to_store) {
1499
+ const FixedTag<TFromD<D>, 8> d8;
1500
+ const Half<decltype(d8)> d4;
1501
+ const Half<decltype(d4)> d2;
1502
+ const Half<decltype(d2)> d1;
1503
+
1504
+ if (max_lanes_to_store <= 1) {
1505
+ if (max_lanes_to_store == 1) {
1506
+ StoreU(ResizeBitCast(d1, v), d1, p);
1507
+ }
1508
+ } else if (max_lanes_to_store >= 16) {
1509
+ StoreU(v, d, p);
1510
+ } else if (max_lanes_to_store >= 8) {
1511
+ StoreU(LowerHalf(d8, v), d8, p);
1512
+ StoreN(detail::StoreNGetUpperHalf(d8, v), d8, p + 8,
1513
+ max_lanes_to_store - 8);
1514
+ } else {
1515
+ StoreN(LowerHalf(d8, v), d8, p, max_lanes_to_store);
1516
+ }
1517
+ }
1518
+
1519
+ #if HWY_MAX_BYTES >= 32
1520
+ template <class D, HWY_IF_V_SIZE_GT_D(D, 16), typename T = TFromD<D>>
1521
+ HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
1522
+ size_t max_lanes_to_store) {
1523
+ const size_t N = Lanes(d);
1524
+ if (max_lanes_to_store >= N) {
1525
+ StoreU(v, d, p);
1526
+ return;
1527
+ }
1528
+
1529
+ const Half<decltype(d)> dh;
1530
+ const size_t half_N = Lanes(dh);
1531
+ if (max_lanes_to_store <= half_N) {
1532
+ StoreN(LowerHalf(dh, v), dh, p, max_lanes_to_store);
1533
+ } else {
1534
+ StoreU(LowerHalf(dh, v), dh, p);
1535
+ StoreN(UpperHalf(dh, v), dh, p + half_N, max_lanes_to_store - half_N);
1536
+ }
1537
+ }
1538
+ #endif // HWY_MAX_BYTES >= 32
1539
+
1540
+ #else // !HWY_MEM_OPS_MIGHT_FAULT || HWY_HAVE_SCALABLE
1541
+ template <class D, typename T = TFromD<D>>
1542
+ HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
1543
+ size_t max_lanes_to_store) {
1544
+ const size_t N = Lanes(d);
1545
+ const size_t clamped_max_lanes_to_store = HWY_MIN(max_lanes_to_store, N);
1546
+ #if HWY_MEM_OPS_MIGHT_FAULT
1547
+ if (clamped_max_lanes_to_store == 0) return;
1548
+ #endif
1549
+
1550
+ BlendedStore(v, FirstN(d, clamped_max_lanes_to_store), d, p);
1551
+
1552
+ #if HWY_MEM_OPS_MIGHT_FAULT
1553
+ detail::MaybeUnpoison(p, clamped_max_lanes_to_store);
1554
+ #endif
1555
+ }
1556
+ #endif // HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE
1557
+
1558
+ #endif // (defined(HWY_NATIVE_STORE_N) == defined(HWY_TARGET_TOGGLE))
1559
+
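LoadN/LoadNOr and StoreN above exist mainly so that loop remainders need no scalar fallback: LoadN reads at most num_lanes valid elements (zeroing the rest, or filling them from `no` in the LoadNOr variant) and StoreN writes at most max_lanes_to_store lanes without touching memory past them. A hedged usage sketch, assuming the usual Highway boilerplate and a caller-supplied array:

// Sketch: add 1 to every element, including a partial final vector.
void AddOne(int32_t* data, size_t count) {
  namespace hn = hwy::HWY_NAMESPACE;
  const hn::ScalableTag<int32_t> d;
  const size_t N = hn::Lanes(d);
  for (size_t i = 0; i < count; i += N) {
    const size_t remaining = count - i;  // may be < N on the last iteration
    const auto v = hn::LoadN(d, data + i, remaining);  // extra lanes are zero
    // Only `remaining` lanes are written back; no out-of-bounds access.
    hn::StoreN(hn::Add(v, hn::Set(d, 1)), d, data + i, remaining);
  }
}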
1560
+ // ------------------------------ Scatter
1561
+
1562
+ #if (defined(HWY_NATIVE_SCATTER) == defined(HWY_TARGET_TOGGLE))
1563
+ #ifdef HWY_NATIVE_SCATTER
1564
+ #undef HWY_NATIVE_SCATTER
1565
+ #else
1566
+ #define HWY_NATIVE_SCATTER
1567
+ #endif
1568
+
1569
+ template <class D, typename T = TFromD<D>>
1570
+ HWY_API void ScatterOffset(VFromD<D> v, D d, T* HWY_RESTRICT base,
1571
+ VFromD<RebindToSigned<D>> offset) {
1572
+ const RebindToSigned<decltype(d)> di;
1573
+ using TI = TFromD<decltype(di)>;
1574
+ static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
1575
+
1576
+ HWY_ALIGN T lanes[MaxLanes(d)];
1577
+ Store(v, d, lanes);
1578
+
1579
+ HWY_ALIGN TI offset_lanes[MaxLanes(d)];
1580
+ Store(offset, di, offset_lanes);
1581
+
1582
+ uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
1583
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
1584
+ CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
1585
+ }
1586
+ }
1587
+
1588
+ template <class D, typename T = TFromD<D>>
1589
+ HWY_API void ScatterIndex(VFromD<D> v, D d, T* HWY_RESTRICT base,
1590
+ VFromD<RebindToSigned<D>> index) {
1591
+ const RebindToSigned<decltype(d)> di;
1592
+ using TI = TFromD<decltype(di)>;
1593
+ static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
1594
+
1595
+ HWY_ALIGN T lanes[MaxLanes(d)];
1596
+ Store(v, d, lanes);
1597
+
1598
+ HWY_ALIGN TI index_lanes[MaxLanes(d)];
1599
+ Store(index, di, index_lanes);
1600
+
1601
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
1602
+ base[index_lanes[i]] = lanes[i];
1603
+ }
1604
+ }
1605
+
1606
+ template <class D, typename T = TFromD<D>>
1607
+ HWY_API void MaskedScatterIndex(VFromD<D> v, MFromD<D> m, D d,
1608
+ T* HWY_RESTRICT base,
1609
+ VFromD<RebindToSigned<D>> index) {
1610
+ const RebindToSigned<decltype(d)> di;
1611
+ using TI = TFromD<decltype(di)>;
1612
+ static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
1613
+
1614
+ HWY_ALIGN T lanes[MaxLanes(d)];
1615
+ Store(v, d, lanes);
1616
+
1617
+ HWY_ALIGN TI index_lanes[MaxLanes(d)];
1618
+ Store(index, di, index_lanes);
1619
+
1620
+ HWY_ALIGN TI mask_lanes[MaxLanes(di)];
1621
+ Store(BitCast(di, VecFromMask(d, m)), di, mask_lanes);
1622
+
1623
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
1624
+ if (mask_lanes[i]) base[index_lanes[i]] = lanes[i];
1625
+ }
1626
+ }
1627
+
1628
+ #endif // (defined(HWY_NATIVE_SCATTER) == defined(HWY_TARGET_TOGGLE))
1629
+
1630
+ // ------------------------------ Gather
1631
+
1632
+ #if (defined(HWY_NATIVE_GATHER) == defined(HWY_TARGET_TOGGLE))
1633
+ #ifdef HWY_NATIVE_GATHER
1634
+ #undef HWY_NATIVE_GATHER
1635
+ #else
1636
+ #define HWY_NATIVE_GATHER
1637
+ #endif
1638
+
1639
+ template <class D, typename T = TFromD<D>>
1640
+ HWY_API VFromD<D> GatherOffset(D d, const T* HWY_RESTRICT base,
1641
+ VFromD<RebindToSigned<D>> offset) {
1642
+ const RebindToSigned<D> di;
1643
+ using TI = TFromD<decltype(di)>;
1644
+ static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
1645
+
1646
+ HWY_ALIGN TI offset_lanes[MaxLanes(d)];
1647
+ Store(offset, di, offset_lanes);
1648
+
1649
+ HWY_ALIGN T lanes[MaxLanes(d)];
1650
+ const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
1651
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
1652
+ CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
1653
+ }
1654
+ return Load(d, lanes);
1655
+ }
1656
+
1657
+ template <class D, typename T = TFromD<D>>
1658
+ HWY_API VFromD<D> GatherIndex(D d, const T* HWY_RESTRICT base,
1659
+ VFromD<RebindToSigned<D>> index) {
1660
+ const RebindToSigned<D> di;
1661
+ using TI = TFromD<decltype(di)>;
1662
+ static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
1663
+
1664
+ HWY_ALIGN TI index_lanes[MaxLanes(d)];
1665
+ Store(index, di, index_lanes);
1666
+
1667
+ HWY_ALIGN T lanes[MaxLanes(d)];
1668
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
1669
+ lanes[i] = base[index_lanes[i]];
1670
+ }
1671
+ return Load(d, lanes);
1672
+ }
1673
+
1674
+ template <class D, typename T = TFromD<D>>
1675
+ HWY_API VFromD<D> MaskedGatherIndex(MFromD<D> m, D d,
1676
+ const T* HWY_RESTRICT base,
1677
+ VFromD<RebindToSigned<D>> index) {
1678
+ const RebindToSigned<D> di;
1679
+ using TI = TFromD<decltype(di)>;
1680
+ static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
1681
+
1682
+ HWY_ALIGN TI index_lanes[MaxLanes(di)];
1683
+ Store(index, di, index_lanes);
1684
+
1685
+ HWY_ALIGN TI mask_lanes[MaxLanes(di)];
1686
+ Store(BitCast(di, VecFromMask(d, m)), di, mask_lanes);
1687
+
1688
+ HWY_ALIGN T lanes[MaxLanes(d)];
1689
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
1690
+ lanes[i] = mask_lanes[i] ? base[index_lanes[i]] : T{0};
1691
+ }
1692
+ return Load(d, lanes);
1693
+ }
1694
+
1695
+ #endif // (defined(HWY_NATIVE_GATHER) == defined(HWY_TARGET_TOGGLE))
1696
+
1697
+ // ------------------------------ ScatterN/GatherN
1698
+
1699
+ template <class D, typename T = TFromD<D>>
1700
+ HWY_API void ScatterIndexN(VFromD<D> v, D d, T* HWY_RESTRICT base,
1701
+ VFromD<RebindToSigned<D>> index,
1702
+ const size_t max_lanes_to_store) {
1703
+ MaskedScatterIndex(v, FirstN(d, max_lanes_to_store), d, base, index);
1704
+ }
1705
+
1706
+ template <class D, typename T = TFromD<D>>
1707
+ HWY_API VFromD<D> GatherIndexN(D d, const T* HWY_RESTRICT base,
1708
+ VFromD<RebindToSigned<D>> index,
1709
+ const size_t max_lanes_to_load) {
1710
+ return MaskedGatherIndex(FirstN(d, max_lanes_to_load), d, base, index);
1711
+ }
1712
+
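ScatterIndexN/GatherIndexN are thin wrappers that mask the full scatter/gather with FirstN, so only the first max_lanes entries of the index vector take effect. A small sketch (same boilerplate assumptions as above) gathering a possibly-partial set of table entries, with masked-off lanes left at zero:

namespace hn = hwy::HWY_NAMESPACE;
// Sketch: gather n floats from table[idx[0..n)]; n may be < Lanes(d).
hn::Vec<hn::ScalableTag<float>> GatherSome(const float* table,
                                           const int32_t* idx, size_t n) {
  const hn::ScalableTag<float> d;
  const hn::RebindToSigned<decltype(d)> di;
  const auto indices = hn::LoadN(di, idx, n);      // out-of-range lanes = 0
  return hn::GatherIndexN(d, table, indices, n);   // masked-off lanes = 0
}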
1713
+ // ------------------------------ Integer AbsDiff and SumsOf8AbsDiff
1714
+
1715
+ #if (defined(HWY_NATIVE_INTEGER_ABS_DIFF) == defined(HWY_TARGET_TOGGLE))
1716
+ #ifdef HWY_NATIVE_INTEGER_ABS_DIFF
1717
+ #undef HWY_NATIVE_INTEGER_ABS_DIFF
1718
+ #else
1719
+ #define HWY_NATIVE_INTEGER_ABS_DIFF
1720
+ #endif
1721
+
1722
+ template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
1723
+ HWY_API V AbsDiff(V a, V b) {
1724
+ return Sub(Max(a, b), Min(a, b));
1725
+ }
1726
+
1727
+ #endif // HWY_NATIVE_INTEGER_ABS_DIFF
1728
+
1729
+ #if (defined(HWY_NATIVE_SUMS_OF_8_ABS_DIFF) == defined(HWY_TARGET_TOGGLE))
1730
+ #ifdef HWY_NATIVE_SUMS_OF_8_ABS_DIFF
1731
+ #undef HWY_NATIVE_SUMS_OF_8_ABS_DIFF
1732
+ #else
1733
+ #define HWY_NATIVE_SUMS_OF_8_ABS_DIFF
1734
+ #endif
1735
+
1736
+ template <class V, HWY_IF_U8_D(DFromV<V>),
1737
+ HWY_IF_V_SIZE_GT_D(DFromV<V>, (HWY_TARGET == HWY_SCALAR ? 0 : 4))>
1738
+ HWY_API Vec<Repartition<uint64_t, DFromV<V>>> SumsOf8AbsDiff(V a, V b) {
1739
+ return SumsOf8(AbsDiff(a, b));
1740
+ }
1741
+
1742
+ #endif // HWY_NATIVE_SUMS_OF_8_ABS_DIFF
1743
+
1744
+ // ------------------------------ SaturatedAdd/SaturatedSub for UI32/UI64
1745
+
1746
+ #if (defined(HWY_NATIVE_I32_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE))
1747
+ #ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
1748
+ #undef HWY_NATIVE_I32_SATURATED_ADDSUB
1749
+ #else
1750
+ #define HWY_NATIVE_I32_SATURATED_ADDSUB
1751
+ #endif
1752
+
1753
+ template <class V, HWY_IF_I32_D(DFromV<V>)>
1754
+ HWY_API V SaturatedAdd(V a, V b) {
1755
+ const DFromV<decltype(a)> d;
1756
+ const auto sum = Add(a, b);
1757
+ const auto overflow_mask = AndNot(Xor(a, b), Xor(a, sum));
1758
+ const auto overflow_result =
1759
+ Xor(BroadcastSignBit(a), Set(d, LimitsMax<int32_t>()));
1760
+ return IfNegativeThenElse(overflow_mask, overflow_result, sum);
1761
+ }
1762
+
1763
+ template <class V, HWY_IF_I32_D(DFromV<V>)>
1764
+ HWY_API V SaturatedSub(V a, V b) {
1765
+ const DFromV<decltype(a)> d;
1766
+ const auto diff = Sub(a, b);
1767
+ const auto overflow_mask = And(Xor(a, b), Xor(a, diff));
1768
+ const auto overflow_result =
1769
+ Xor(BroadcastSignBit(a), Set(d, LimitsMax<int32_t>()));
1770
+ return IfNegativeThenElse(overflow_mask, overflow_result, diff);
1771
+ }
1772
+
1773
+ #endif // HWY_NATIVE_I32_SATURATED_ADDSUB
1774
+
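The overflow test above is the classic two's-complement identity: signed addition overflows exactly when both operands have the same sign and the sum's sign differs, i.e. the sign bit of ~(a ^ b) & (a ^ sum) is set, and the saturated result then takes the sign of a. A scalar equivalent of the vector logic, for illustration only:

#include <cstdint>

int32_t SaturatedAddI32(int32_t a, int32_t b) {
  const uint32_t ua = static_cast<uint32_t>(a);
  const uint32_t ub = static_cast<uint32_t>(b);
  const uint32_t sum = ua + ub;  // wraps, like the vector Add
  // Overflow iff a and b share a sign that differs from the sum's sign.
  const bool overflow = static_cast<int32_t>(~(ua ^ ub) & (ua ^ sum)) < 0;
  // BroadcastSignBit(a) ^ LimitsMax: INT32_MAX if a >= 0, INT32_MIN if a < 0.
  const uint32_t saturated = (a < 0 ? 0xFFFFFFFFu : 0u) ^ 0x7FFFFFFFu;
  return static_cast<int32_t>(overflow ? saturated : sum);
}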
1775
+ #if (defined(HWY_NATIVE_I64_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE))
1776
+ #ifdef HWY_NATIVE_I64_SATURATED_ADDSUB
1777
+ #undef HWY_NATIVE_I64_SATURATED_ADDSUB
1778
+ #else
1779
+ #define HWY_NATIVE_I64_SATURATED_ADDSUB
1780
+ #endif
1781
+
1782
+ template <class V, HWY_IF_I64_D(DFromV<V>)>
1783
+ HWY_API V SaturatedAdd(V a, V b) {
1784
+ const DFromV<decltype(a)> d;
1785
+ const auto sum = Add(a, b);
1786
+ const auto overflow_mask = AndNot(Xor(a, b), Xor(a, sum));
1787
+ const auto overflow_result =
1788
+ Xor(BroadcastSignBit(a), Set(d, LimitsMax<int64_t>()));
1789
+ return IfNegativeThenElse(overflow_mask, overflow_result, sum);
1790
+ }
1791
+
1792
+ template <class V, HWY_IF_I64_D(DFromV<V>)>
1793
+ HWY_API V SaturatedSub(V a, V b) {
1794
+ const DFromV<decltype(a)> d;
1795
+ const auto diff = Sub(a, b);
1796
+ const auto overflow_mask = And(Xor(a, b), Xor(a, diff));
1797
+ const auto overflow_result =
1798
+ Xor(BroadcastSignBit(a), Set(d, LimitsMax<int64_t>()));
1799
+ return IfNegativeThenElse(overflow_mask, overflow_result, diff);
1800
+ }
1801
+
1802
+ #endif // HWY_NATIVE_I64_SATURATED_ADDSUB
1803
+
1804
+ #if (defined(HWY_NATIVE_U32_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE))
1805
+ #ifdef HWY_NATIVE_U32_SATURATED_ADDSUB
1806
+ #undef HWY_NATIVE_U32_SATURATED_ADDSUB
1807
+ #else
1808
+ #define HWY_NATIVE_U32_SATURATED_ADDSUB
1809
+ #endif
1810
+
1811
+ template <class V, HWY_IF_U32_D(DFromV<V>)>
1812
+ HWY_API V SaturatedAdd(V a, V b) {
1813
+ return Add(a, Min(b, Not(a)));
1814
+ }
1815
+
1816
+ template <class V, HWY_IF_U32_D(DFromV<V>)>
1817
+ HWY_API V SaturatedSub(V a, V b) {
1818
+ return Sub(a, Min(a, b));
1819
+ }
1820
+
1821
+ #endif // HWY_NATIVE_U32_SATURATED_ADDSUB
1822
+
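For unsigned lanes the clamp is simpler: Not(a) is exactly the headroom UINT_MAX - a, so adding Min(b, Not(a)) can never wrap, and subtracting Min(a, b) can never go below zero. Scalar equivalents:

uint32_t SaturatedAddU32(uint32_t a, uint32_t b) {
  // min(b, UINT32_MAX - a) keeps the sum <= UINT32_MAX.
  return a + ((b < ~a) ? b : ~a);
}
uint32_t SaturatedSubU32(uint32_t a, uint32_t b) {
  // min(a, b): the result is 0 whenever b >= a.
  return a - ((a < b) ? a : b);
}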
1823
+ #if (defined(HWY_NATIVE_U64_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE))
1824
+ #ifdef HWY_NATIVE_U64_SATURATED_ADDSUB
1825
+ #undef HWY_NATIVE_U64_SATURATED_ADDSUB
1826
+ #else
1827
+ #define HWY_NATIVE_U64_SATURATED_ADDSUB
1828
+ #endif
1829
+
1830
+ template <class V, HWY_IF_U64_D(DFromV<V>)>
1831
+ HWY_API V SaturatedAdd(V a, V b) {
1832
+ return Add(a, Min(b, Not(a)));
1833
+ }
1834
+
1835
+ template <class V, HWY_IF_U64_D(DFromV<V>)>
1836
+ HWY_API V SaturatedSub(V a, V b) {
1837
+ return Sub(a, Min(a, b));
1838
+ }
1839
+
1840
+ #endif // HWY_NATIVE_U64_SATURATED_ADDSUB
1841
+
1842
+ // ------------------------------ Unsigned to signed demotions
1843
+
1844
+ template <class DN, HWY_IF_SIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V),
1845
+ class V2 = VFromD<Rebind<TFromV<V>, DN>>,
1846
+ hwy::EnableIf<(sizeof(TFromD<DN>) < sizeof(TFromV<V>))>* = nullptr,
1847
+ HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_D(DFromV<V2>))>
1848
+ HWY_API VFromD<DN> DemoteTo(DN dn, V v) {
1849
+ const DFromV<decltype(v)> d;
1850
+ const RebindToSigned<decltype(d)> di;
1851
+ const RebindToUnsigned<decltype(dn)> dn_u;
1852
+
1853
+ // First, do a signed to signed demotion. This will convert any values
1854
+ // that are greater than hwy::HighestValue<MakeSigned<TFromV<V>>>() to a
1855
+ // negative value.
1856
+ const auto i2i_demote_result = DemoteTo(dn, BitCast(di, v));
1857
+
1858
+ // Second, convert any negative values to hwy::HighestValue<TFromD<DN>>()
1859
+ // using an unsigned Min operation.
1860
+ const auto max_signed_val = Set(dn, hwy::HighestValue<TFromD<DN>>());
1861
+
1862
+ return BitCast(
1863
+ dn, Min(BitCast(dn_u, i2i_demote_result), BitCast(dn_u, max_signed_val)));
1864
+ }
1865
+
1866
+ #if HWY_TARGET != HWY_SCALAR || HWY_IDE
1867
+ template <class DN, HWY_IF_SIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V),
1868
+ class V2 = VFromD<Repartition<TFromV<V>, DN>>,
1869
+ HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
1870
+ HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_D(DFromV<V2>))>
1871
+ HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
1872
+ const DFromV<decltype(a)> d;
1873
+ const RebindToSigned<decltype(d)> di;
1874
+ const RebindToUnsigned<decltype(dn)> dn_u;
1875
+
1876
+ // First, do a signed to signed demotion. This will convert any values
1877
+ // that are greater than hwy::HighestValue<MakeSigned<TFromV<V>>>() to a
1878
+ // negative value.
1879
+ const auto i2i_demote_result =
1880
+ ReorderDemote2To(dn, BitCast(di, a), BitCast(di, b));
1881
+
1882
+ // Second, convert any negative values to hwy::HighestValue<TFromD<DN>>()
1883
+ // using an unsigned Min operation.
1884
+ const auto max_signed_val = Set(dn, hwy::HighestValue<TFromD<DN>>());
1885
+
1886
+ return BitCast(
1887
+ dn, Min(BitCast(dn_u, i2i_demote_result), BitCast(dn_u, max_signed_val)));
1888
+ }
1889
+ #endif
1890
+
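The net effect of the two steps above is an unsigned clamp at the signed maximum of the narrow type. A worked scalar illustration of the uint16_t -> int8_t case (not part of the header; the casts stand in for BitCast and the saturating DemoteTo):

int8_t DemoteU16ToI8(uint16_t v) {
  // Step 1: signed->signed demotion. Inputs > 32767 reinterpret as negative
  // int16_t values and therefore end up as bytes >= 0x80.
  const int16_t as_i16 = static_cast<int16_t>(v);  // two's-complement reinterpret
  const int8_t i2i = static_cast<int8_t>(
      as_i16 > 127 ? 127 : (as_i16 < -128 ? -128 : as_i16));
  // Step 2: unsigned Min with 127 maps every such "negative" byte to 127.
  const uint8_t b = static_cast<uint8_t>(i2i);
  return static_cast<int8_t>(b < 127 ? b : 127);  // == min(v, 127)
}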
1891
+ // ------------------------------ PromoteLowerTo
1892
+
1893
+ // There is no codegen advantage for a native version of this. It is provided
1894
+ // only for convenience.
1895
+ template <class D, class V>
1896
+ HWY_API VFromD<D> PromoteLowerTo(D d, V v) {
1897
+ // Lanes(d) may differ from Lanes(DFromV<V>()). Use the lane type from V
1898
+ // because it cannot be deduced from D (could be either bf16 or f16).
1899
+ const Rebind<TFromV<V>, decltype(d)> dh;
1900
+ return PromoteTo(d, LowerHalf(dh, v));
1901
+ }
1902
+
1903
+ // ------------------------------ PromoteUpperTo
1904
+
1905
+ #if (defined(HWY_NATIVE_PROMOTE_UPPER_TO) == defined(HWY_TARGET_TOGGLE))
1906
+ #ifdef HWY_NATIVE_PROMOTE_UPPER_TO
1907
+ #undef HWY_NATIVE_PROMOTE_UPPER_TO
1908
+ #else
1909
+ #define HWY_NATIVE_PROMOTE_UPPER_TO
1910
+ #endif
1911
+
1912
+ // This requires UpperHalf.
1913
+ #if HWY_TARGET != HWY_SCALAR || HWY_IDE
1914
+
1915
+ template <class D, class V>
1916
+ HWY_API VFromD<D> PromoteUpperTo(D d, V v) {
1917
+ // Lanes(d) may differ from Lanes(DFromV<V>()). Use the lane type from V
1918
+ // because it cannot be deduced from D (could be either bf16 or f16).
1919
+ const Rebind<TFromV<V>, decltype(d)> dh;
1920
+ return PromoteTo(d, UpperHalf(dh, v));
1921
+ }
1922
+
1923
+ #endif // HWY_TARGET != HWY_SCALAR
1924
+ #endif // HWY_NATIVE_PROMOTE_UPPER_TO
1925
+
1926
+ // ------------------------------ float16_t <-> float
1927
+
1928
+ #if (defined(HWY_NATIVE_F16C) == defined(HWY_TARGET_TOGGLE))
1929
+ #ifdef HWY_NATIVE_F16C
1930
+ #undef HWY_NATIVE_F16C
1931
+ #else
1932
+ #define HWY_NATIVE_F16C
1933
+ #endif
1934
+
1935
+ template <class D, HWY_IF_F32_D(D)>
1936
+ HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<float16_t, D>> v) {
1937
+ const RebindToSigned<decltype(df32)> di32;
1938
+ const RebindToUnsigned<decltype(df32)> du32;
1939
+ const Rebind<uint16_t, decltype(df32)> du16;
1940
+ using VU32 = VFromD<decltype(du32)>;
1941
+
1942
+ const VU32 bits16 = PromoteTo(du32, BitCast(du16, v));
1943
+ const VU32 sign = ShiftRight<15>(bits16);
1944
+ const VU32 biased_exp = And(ShiftRight<10>(bits16), Set(du32, 0x1F));
1945
+ const VU32 mantissa = And(bits16, Set(du32, 0x3FF));
1946
+ const VU32 subnormal =
1947
+ BitCast(du32, Mul(ConvertTo(df32, BitCast(di32, mantissa)),
1948
+ Set(df32, 1.0f / 16384 / 1024)));
1949
+
1950
+ const VU32 biased_exp32 = Add(biased_exp, Set(du32, 127 - 15));
1951
+ const VU32 mantissa32 = ShiftLeft<23 - 10>(mantissa);
1952
+ const VU32 normal = Or(ShiftLeft<23>(biased_exp32), mantissa32);
1953
+ const VU32 bits32 = IfThenElse(Eq(biased_exp, Zero(du32)), subnormal, normal);
1954
+ return BitCast(df32, Or(ShiftLeft<31>(sign), bits32));
1955
+ }
1956
+
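A worked trace of the promotion above (illustration only):

//   Input 0x3C00: sign = 0, biased_exp = 15, mantissa = 0
//     -> biased_exp32 = 15 + (127 - 15) = 127, mantissa32 = 0
//     -> bits32 = 127 << 23 = 0x3F800000, i.e. 1.0f.
//   Input 0x0001: biased_exp = 0, so the subnormal path applies:
//     1 * (1.0f / 16384 / 1024) = 2^-24, the smallest positive half value.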
1957
+ template <class D, HWY_IF_F16_D(D)>
1958
+ HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<float, D>> v) {
1959
+ const RebindToUnsigned<decltype(df16)> du16;
1960
+ const Rebind<uint32_t, decltype(df16)> du32;
1961
+ const RebindToSigned<decltype(du32)> di32;
1962
+ using VU32 = VFromD<decltype(du32)>;
1963
+ using VI32 = VFromD<decltype(di32)>;
1964
+
1965
+ const VU32 bits32 = BitCast(du32, v);
1966
+ const VU32 sign = ShiftRight<31>(bits32);
1967
+ const VU32 biased_exp32 = And(ShiftRight<23>(bits32), Set(du32, 0xFF));
1968
+ const VU32 mantissa32 = And(bits32, Set(du32, 0x7FFFFF));
1969
+
1970
+ const VI32 k15 = Set(di32, 15);
1971
+ const VI32 exp = Min(Sub(BitCast(di32, biased_exp32), Set(di32, 127)), k15);
1972
+ const MFromD<decltype(di32)> is_tiny = Lt(exp, Set(di32, -24));
1973
+
1974
+ const MFromD<decltype(di32)> is_subnormal = Lt(exp, Set(di32, -14));
1975
+ const VU32 biased_exp16 =
1976
+ BitCast(du32, IfThenZeroElse(is_subnormal, Add(exp, k15)));
1977
+ const VU32 sub_exp = BitCast(du32, Sub(Set(di32, -14), exp)); // [1, 11)
1978
+ // Clamp shift counts to prevent warnings in emu_128 Shr.
1979
+ const VU32 k31 = Set(du32, 31);
1980
+ const VU32 shift_m = Min(Add(Set(du32, 13), sub_exp), k31);
1981
+ const VU32 shift_1 = Min(Sub(Set(du32, 10), sub_exp), k31);
1982
+ const VU32 sub_m = Add(Shl(Set(du32, 1), shift_1), Shr(mantissa32, shift_m));
1983
+ const VU32 mantissa16 = IfThenElse(RebindMask(du32, is_subnormal), sub_m,
1984
+ ShiftRight<13>(mantissa32)); // <1024
1985
+
1986
+ const VU32 sign16 = ShiftLeft<15>(sign);
1987
+ const VU32 normal16 = Or3(sign16, ShiftLeft<10>(biased_exp16), mantissa16);
1988
+ const VI32 bits16 = IfThenZeroElse(is_tiny, BitCast(di32, normal16));
1989
+ return BitCast(df16, DemoteTo(du16, bits16));
1990
+ }
1991
+
1992
+ #endif // HWY_NATIVE_F16C
1993
+
1994
+ // ------------------------------ OrderedTruncate2To
1995
+
1996
+ #if HWY_IDE || \
1997
+ (defined(HWY_NATIVE_ORDERED_TRUNCATE_2_TO) == defined(HWY_TARGET_TOGGLE))
1998
+
1999
+ #ifdef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
2000
+ #undef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
2001
+ #else
2002
+ #define HWY_NATIVE_ORDERED_TRUNCATE_2_TO
2003
+ #endif
2004
+
2005
+ // (Must come after HWY_TARGET_TOGGLE, else we don't reset it for scalar)
2006
+ #if HWY_TARGET != HWY_SCALAR || HWY_IDE
2007
+ template <class DN, HWY_IF_UNSIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V),
2008
+ HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
2009
+ HWY_IF_LANES_D(DFromV<VFromD<DN>>, HWY_MAX_LANES_D(DFromV<V>) * 2)>
2010
+ HWY_API VFromD<DN> OrderedTruncate2To(DN dn, V a, V b) {
2011
+ return ConcatEven(dn, BitCast(dn, b), BitCast(dn, a));
2012
+ }
2013
+ #endif // HWY_TARGET != HWY_SCALAR
2014
+ #endif // HWY_NATIVE_ORDERED_TRUNCATE_2_TO
2015
+
2016
+ // -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex
2017
+
2018
+ #if (defined(HWY_NATIVE_LEADING_ZERO_COUNT) == defined(HWY_TARGET_TOGGLE))
2019
+ #ifdef HWY_NATIVE_LEADING_ZERO_COUNT
2020
+ #undef HWY_NATIVE_LEADING_ZERO_COUNT
2021
+ #else
2022
+ #define HWY_NATIVE_LEADING_ZERO_COUNT
2023
+ #endif
2024
+
2025
+ namespace detail {
2026
+
2027
+ template <class D, HWY_IF_U32_D(D)>
2028
+ HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
2029
+ const RebindToFloat<decltype(d)> df;
2030
+ #if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE2
2031
+ const RebindToSigned<decltype(d)> di;
2032
+ const Repartition<int16_t, decltype(d)> di16;
2033
+
2034
+ // On SSE2/SSSE3/SSE4/AVX2, do an int32_t to float conversion, followed
2035
+ // by an unsigned right shift of the uint32_t bit representation of the
2036
+ // floating point values by 23, followed by an int16_t Min
2037
+ // operation as we are only interested in the biased exponent that would
2038
+ // result from a uint32_t to float conversion.
2039
+
2040
+ // An int32_t to float vector conversion is also much more efficient on
2041
+ // SSE2/SSSE3/SSE4/AVX2 than a uint32_t vector to float vector conversion
2042
+ // as a uint32_t vector to float vector conversion on SSE2/SSSE3/SSE4/AVX2
2043
+ // requires multiple instructions whereas an int32_t to float vector
2044
+ // conversion can be carried out using a single instruction on
2045
+ // SSE2/SSSE3/SSE4/AVX2.
2046
+
2047
+ const auto f32_bits = BitCast(d, ConvertTo(df, BitCast(di, v)));
2048
+ return BitCast(d, Min(BitCast(di16, ShiftRight<23>(f32_bits)),
2049
+ BitCast(di16, Set(d, 158))));
2050
+ #else
2051
+ const auto f32_bits = BitCast(d, ConvertTo(df, v));
2052
+ return BitCast(d, ShiftRight<23>(f32_bits));
2053
+ #endif
2054
+ }
2055
+
2056
+ template <class V, HWY_IF_U32_D(DFromV<V>)>
2057
+ HWY_INLINE V I32RangeU32ToF32BiasedExp(V v) {
2058
+ // I32RangeU32ToF32BiasedExp is similar to UIntToF32BiasedExp, but
2059
+ // I32RangeU32ToF32BiasedExp assumes that v[i] is between 0 and 2147483647.
2060
+ const DFromV<decltype(v)> d;
2061
+ const RebindToFloat<decltype(d)> df;
2062
+ #if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE2
2063
+ const RebindToSigned<decltype(d)> d_src;
2064
+ #else
2065
+ const RebindToUnsigned<decltype(d)> d_src;
2066
+ #endif
2067
+ const auto f32_bits = BitCast(d, ConvertTo(df, BitCast(d_src, v)));
2068
+ return ShiftRight<23>(f32_bits);
2069
+ }
2070
+
2071
+ template <class D, HWY_IF_U16_D(D), HWY_IF_LANES_LE_D(D, HWY_MAX_BYTES / 4)>
2072
+ HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
2073
+ const Rebind<uint32_t, decltype(d)> du32;
2074
+ const auto f32_biased_exp_as_u32 =
2075
+ I32RangeU32ToF32BiasedExp(PromoteTo(du32, v));
2076
+ return TruncateTo(d, f32_biased_exp_as_u32);
2077
+ }
2078
+
2079
+ #if HWY_TARGET != HWY_SCALAR
2080
+ template <class D, HWY_IF_U16_D(D), HWY_IF_LANES_GT_D(D, HWY_MAX_BYTES / 4)>
2081
+ HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
2082
+ const Half<decltype(d)> dh;
2083
+ const Rebind<uint32_t, decltype(dh)> du32;
2084
+
2085
+ const auto lo_u32 = PromoteTo(du32, LowerHalf(dh, v));
2086
+ const auto hi_u32 = PromoteTo(du32, UpperHalf(dh, v));
2087
+
2088
+ const auto lo_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(lo_u32);
2089
+ const auto hi_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(hi_u32);
2090
+ #if HWY_TARGET <= HWY_SSE2
2091
+ const RebindToSigned<decltype(du32)> di32;
2092
+ const RebindToSigned<decltype(d)> di;
2093
+ return BitCast(d,
2094
+ OrderedDemote2To(di, BitCast(di32, lo_f32_biased_exp_as_u32),
2095
+ BitCast(di32, hi_f32_biased_exp_as_u32)));
2096
+ #else
2097
+ return OrderedTruncate2To(d, lo_f32_biased_exp_as_u32,
2098
+ hi_f32_biased_exp_as_u32);
2099
+ #endif
2100
+ }
2101
+ #endif // HWY_TARGET != HWY_SCALAR
2102
+
2103
+ template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_LE_D(D, HWY_MAX_BYTES / 4)>
2104
+ HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
2105
+ const Rebind<uint32_t, decltype(d)> du32;
2106
+ const auto f32_biased_exp_as_u32 =
2107
+ I32RangeU32ToF32BiasedExp(PromoteTo(du32, v));
2108
+ return U8FromU32(f32_biased_exp_as_u32);
2109
+ }
2110
+
2111
+ #if HWY_TARGET != HWY_SCALAR
2112
+ template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_GT_D(D, HWY_MAX_BYTES / 4),
2113
+ HWY_IF_LANES_LE_D(D, HWY_MAX_BYTES / 2)>
2114
+ HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
2115
+ const Half<decltype(d)> dh;
2116
+ const Rebind<uint32_t, decltype(dh)> du32;
2117
+ const Repartition<uint16_t, decltype(du32)> du16;
2118
+
2119
+ const auto lo_u32 = PromoteTo(du32, LowerHalf(dh, v));
2120
+ const auto hi_u32 = PromoteTo(du32, UpperHalf(dh, v));
2121
+
2122
+ const auto lo_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(lo_u32);
2123
+ const auto hi_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(hi_u32);
2124
+
2125
+ #if HWY_TARGET <= HWY_SSE2
2126
+ const RebindToSigned<decltype(du32)> di32;
2127
+ const RebindToSigned<decltype(du16)> di16;
2128
+ const auto f32_biased_exp_as_i16 =
2129
+ OrderedDemote2To(di16, BitCast(di32, lo_f32_biased_exp_as_u32),
2130
+ BitCast(di32, hi_f32_biased_exp_as_u32));
2131
+ return DemoteTo(d, f32_biased_exp_as_i16);
2132
+ #else
2133
+ const auto f32_biased_exp_as_u16 = OrderedTruncate2To(
2134
+ du16, lo_f32_biased_exp_as_u32, hi_f32_biased_exp_as_u32);
2135
+ return TruncateTo(d, f32_biased_exp_as_u16);
2136
+ #endif
2137
+ }
2138
+
2139
+ template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_GT_D(D, HWY_MAX_BYTES / 2)>
2140
+ HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
2141
+ const Half<decltype(d)> dh;
2142
+ const Half<decltype(dh)> dq;
2143
+ const Rebind<uint32_t, decltype(dq)> du32;
2144
+ const Repartition<uint16_t, decltype(du32)> du16;
2145
+
2146
+ const auto lo_half = LowerHalf(dh, v);
2147
+ const auto hi_half = UpperHalf(dh, v);
2148
+
2149
+ const auto u32_q0 = PromoteTo(du32, LowerHalf(dq, lo_half));
2150
+ const auto u32_q1 = PromoteTo(du32, UpperHalf(dq, lo_half));
2151
+ const auto u32_q2 = PromoteTo(du32, LowerHalf(dq, hi_half));
2152
+ const auto u32_q3 = PromoteTo(du32, UpperHalf(dq, hi_half));
2153
+
2154
+ const auto f32_biased_exp_as_u32_q0 = I32RangeU32ToF32BiasedExp(u32_q0);
2155
+ const auto f32_biased_exp_as_u32_q1 = I32RangeU32ToF32BiasedExp(u32_q1);
2156
+ const auto f32_biased_exp_as_u32_q2 = I32RangeU32ToF32BiasedExp(u32_q2);
2157
+ const auto f32_biased_exp_as_u32_q3 = I32RangeU32ToF32BiasedExp(u32_q3);
2158
+
2159
+ #if HWY_TARGET <= HWY_SSE2
2160
+ const RebindToSigned<decltype(du32)> di32;
2161
+ const RebindToSigned<decltype(du16)> di16;
2162
+
2163
+ const auto lo_f32_biased_exp_as_i16 =
2164
+ OrderedDemote2To(di16, BitCast(di32, f32_biased_exp_as_u32_q0),
2165
+ BitCast(di32, f32_biased_exp_as_u32_q1));
2166
+ const auto hi_f32_biased_exp_as_i16 =
2167
+ OrderedDemote2To(di16, BitCast(di32, f32_biased_exp_as_u32_q2),
2168
+ BitCast(di32, f32_biased_exp_as_u32_q3));
2169
+ return OrderedDemote2To(d, lo_f32_biased_exp_as_i16,
2170
+ hi_f32_biased_exp_as_i16);
2171
+ #else
2172
+ const auto lo_f32_biased_exp_as_u16 = OrderedTruncate2To(
2173
+ du16, f32_biased_exp_as_u32_q0, f32_biased_exp_as_u32_q1);
2174
+ const auto hi_f32_biased_exp_as_u16 = OrderedTruncate2To(
2175
+ du16, f32_biased_exp_as_u32_q2, f32_biased_exp_as_u32_q3);
2176
+ return OrderedTruncate2To(d, lo_f32_biased_exp_as_u16,
2177
+ hi_f32_biased_exp_as_u16);
2178
+ #endif
2179
+ }
2180
+ #endif // HWY_TARGET != HWY_SCALAR
2181
+
2182
+ #if HWY_TARGET == HWY_SCALAR
2183
+ template <class D>
2184
+ using F32ExpLzcntMinMaxRepartition = RebindToUnsigned<D>;
2185
+ #elif HWY_TARGET >= HWY_SSSE3 && HWY_TARGET <= HWY_SSE2
2186
+ template <class D>
2187
+ using F32ExpLzcntMinMaxRepartition = Repartition<uint8_t, D>;
2188
+ #else
2189
+ template <class D>
2190
+ using F32ExpLzcntMinMaxRepartition =
2191
+ Repartition<UnsignedFromSize<HWY_MIN(sizeof(TFromD<D>), 4)>, D>;
2192
+ #endif
2193
+
2194
+ template <class V>
2195
+ using F32ExpLzcntMinMaxCmpV = VFromD<F32ExpLzcntMinMaxRepartition<DFromV<V>>>;
2196
+
2197
+ template <class V>
2198
+ HWY_INLINE F32ExpLzcntMinMaxCmpV<V> F32ExpLzcntMinMaxBitCast(V v) {
2199
+ const DFromV<decltype(v)> d;
2200
+ const F32ExpLzcntMinMaxRepartition<decltype(d)> d2;
2201
+ return BitCast(d2, v);
2202
+ }
2203
+
2204
+ template <class D, HWY_IF_U64_D(D)>
2205
+ HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
2206
+ #if HWY_TARGET == HWY_SCALAR
2207
+ const uint64_t u64_val = GetLane(v);
2208
+ const float f32_val = static_cast<float>(u64_val);
2209
+ uint32_t f32_bits;
2210
+ CopySameSize(&f32_val, &f32_bits);
2211
+ return Set(d, static_cast<uint64_t>(f32_bits >> 23));
2212
+ #else
2213
+ const Repartition<uint32_t, decltype(d)> du32;
2214
+ const auto f32_biased_exp = UIntToF32BiasedExp(du32, BitCast(du32, v));
2215
+ const auto f32_biased_exp_adj =
2216
+ IfThenZeroElse(Eq(f32_biased_exp, Zero(du32)),
2217
+ BitCast(du32, Set(d, 0x0000002000000000u)));
2218
+ const auto adj_f32_biased_exp = Add(f32_biased_exp, f32_biased_exp_adj);
2219
+
2220
+ return ShiftRight<32>(BitCast(
2221
+ d, Max(F32ExpLzcntMinMaxBitCast(adj_f32_biased_exp),
2222
+ F32ExpLzcntMinMaxBitCast(Reverse2(du32, adj_f32_biased_exp)))));
2223
+ #endif
2224
+ }
2225
+
2226
+ template <class V, HWY_IF_UNSIGNED_V(V)>
2227
+ HWY_INLINE V UIntToF32BiasedExp(V v) {
2228
+ const DFromV<decltype(v)> d;
2229
+ return UIntToF32BiasedExp(d, v);
2230
+ }
2231
+
2232
+ template <class V, HWY_IF_UNSIGNED_V(V),
2233
+ HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
2234
+ HWY_INLINE V NormalizeForUIntTruncConvToF32(V v) {
2235
+ return v;
2236
+ }
2237
+
2238
+ template <class V, HWY_IF_UNSIGNED_V(V),
2239
+ HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))>
2240
+ HWY_INLINE V NormalizeForUIntTruncConvToF32(V v) {
2241
+ // If v[i] >= 16777216 is true, make sure that the bit at
2242
+ // HighestSetBitIndex(v[i]) - 24 is zeroed out to ensure that any inexact
2243
+ // conversion to single-precision floating point is rounded down.
2244
+
2245
+ // This zeroing-out can be accomplished through the AndNot operation below.
2246
+ return AndNot(ShiftRight<24>(v), v);
2247
+ }
2248
+
2249
+ } // namespace detail
2250
+
2251
+ template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
2252
+ HWY_API V HighestSetBitIndex(V v) {
2253
+ const DFromV<decltype(v)> d;
2254
+ const RebindToUnsigned<decltype(d)> du;
2255
+ using TU = TFromD<decltype(du)>;
2256
+
2257
+ const auto f32_biased_exp = detail::UIntToF32BiasedExp(
2258
+ detail::NormalizeForUIntTruncConvToF32(BitCast(du, v)));
2259
+ return BitCast(d, Sub(f32_biased_exp, Set(du, TU{127})));
2260
+ }
2261
+
2262
+ template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
2263
+ HWY_API V LeadingZeroCount(V v) {
2264
+ const DFromV<decltype(v)> d;
2265
+ const RebindToUnsigned<decltype(d)> du;
2266
+ using TU = TFromD<decltype(du)>;
2267
+
2268
+ constexpr TU kNumOfBitsInT{sizeof(TU) * 8};
2269
+ const auto f32_biased_exp = detail::UIntToF32BiasedExp(
2270
+ detail::NormalizeForUIntTruncConvToF32(BitCast(du, v)));
2271
+ const auto lz_count = Sub(Set(du, TU{kNumOfBitsInT + 126}), f32_biased_exp);
2272
+
2273
+ return BitCast(d,
2274
+ Min(detail::F32ExpLzcntMinMaxBitCast(lz_count),
2275
+ detail::F32ExpLzcntMinMaxBitCast(Set(du, kNumOfBitsInT))));
2276
+ }
2277
+
2278
+ template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
2279
+ HWY_API V TrailingZeroCount(V v) {
2280
+ const DFromV<decltype(v)> d;
2281
+ const RebindToUnsigned<decltype(d)> du;
2282
+ const RebindToSigned<decltype(d)> di;
2283
+ using TU = TFromD<decltype(du)>;
2284
+
2285
+ const auto vi = BitCast(di, v);
2286
+ const auto lowest_bit = BitCast(du, And(vi, Neg(vi)));
2287
+
2288
+ constexpr TU kNumOfBitsInT{sizeof(TU) * 8};
2289
+ const auto f32_biased_exp = detail::UIntToF32BiasedExp(lowest_bit);
2290
+ const auto tz_count = Sub(f32_biased_exp, Set(du, TU{127}));
2291
+
2292
+ return BitCast(d,
2293
+ Min(detail::F32ExpLzcntMinMaxBitCast(tz_count),
2294
+ detail::F32ExpLzcntMinMaxBitCast(Set(du, kNumOfBitsInT))));
2295
+ }
2296
+ #endif // HWY_NATIVE_LEADING_ZERO_COUNT
2297
+
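All three operations above lean on one trick: converting an integer to float and extracting the biased exponent yields floor(log2(v)) + 127 for nonzero v, from which HighestSetBitIndex, LeadingZeroCount and TrailingZeroCount follow by subtraction (TrailingZeroCount first isolates the lowest set bit via v & -v). A scalar sketch of the same idea for uint32_t, including the round-down normalization:

#include <cstdint>
#include <cstring>

uint32_t HighestSetBitIndexScalar(uint32_t v) {  // v != 0, like the vector op
  // NormalizeForUIntTruncConvToF32: clear the bit 24 positions below the MSB
  // so the conversion to float cannot round the exponent upward.
  v &= ~(v >> 24);
  const float f = static_cast<float>(v);
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  return (bits >> 23) - 127;  // biased exponent is 127 + floor(log2(v))
}

uint32_t LeadingZeroCountScalar(uint32_t v) {
  return (v == 0) ? 32 : 31 - HighestSetBitIndexScalar(v);
}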
2298
+ // ------------------------------ AESRound
2299
+
2300
+ // Cannot implement on scalar: need at least 16 bytes for TableLookupBytes.
2301
+ #if HWY_TARGET != HWY_SCALAR || HWY_IDE
2302
+
2303
+ // Define for white-box testing, even if native instructions are available.
2304
+ namespace detail {
2305
+
2306
+ // Constant-time: computes inverse in GF(2^4) based on "Accelerating AES with
2307
+ // Vector Permute Instructions" and the accompanying assembly language
2308
+ // implementation: https://crypto.stanford.edu/vpaes/vpaes.tgz. See also Botan:
2309
+ // https://botan.randombit.net/doxygen/aes__vperm_8cpp_source.html .
2310
+ //
2311
+ // A brute-force 256-byte table lookup can also be made constant-time, and

2312
+ // possibly competitive on NEON, but this is more performance-portable,
2313
+ // especially for x86 and large vectors.
2314
+
2315
+ template <class V> // u8
2316
+ HWY_INLINE V SubBytesMulInverseAndAffineLookup(V state, V affine_tblL,
2317
+ V affine_tblU) {
2318
+ const DFromV<V> du;
2319
+ const auto mask = Set(du, uint8_t{0xF});
2320
+
2321
+ // Change polynomial basis to GF(2^4)
2322
+ {
2323
+ alignas(16) static constexpr uint8_t basisL[16] = {
2324
+ 0x00, 0x70, 0x2A, 0x5A, 0x98, 0xE8, 0xB2, 0xC2,
2325
+ 0x08, 0x78, 0x22, 0x52, 0x90, 0xE0, 0xBA, 0xCA};
2326
+ alignas(16) static constexpr uint8_t basisU[16] = {
2327
+ 0x00, 0x4D, 0x7C, 0x31, 0x7D, 0x30, 0x01, 0x4C,
2328
+ 0x81, 0xCC, 0xFD, 0xB0, 0xFC, 0xB1, 0x80, 0xCD};
2329
+ const auto sL = And(state, mask);
2330
+ const auto sU = ShiftRight<4>(state); // byte shift => upper bits are zero
2331
+ const auto gf4L = TableLookupBytes(LoadDup128(du, basisL), sL);
2332
+ const auto gf4U = TableLookupBytes(LoadDup128(du, basisU), sU);
2333
+ state = Xor(gf4L, gf4U);
2334
+ }
2335
+
2336
+ // Inversion in GF(2^4). Elements 0 represent "infinity" (division by 0) and
2337
+ // cause TableLookupBytesOr0 to return 0.
2338
+ alignas(16) static constexpr uint8_t kZetaInv[16] = {
2339
+ 0x80, 7, 11, 15, 6, 10, 4, 1, 9, 8, 5, 2, 12, 14, 13, 3};
2340
+ alignas(16) static constexpr uint8_t kInv[16] = {
2341
+ 0x80, 1, 8, 13, 15, 6, 5, 14, 2, 12, 11, 10, 9, 3, 7, 4};
2342
+ const auto tbl = LoadDup128(du, kInv);
2343
+ const auto sL = And(state, mask); // L=low nibble, U=upper
2344
+ const auto sU = ShiftRight<4>(state); // byte shift => upper bits are zero
2345
+ const auto sX = Xor(sU, sL);
2346
+ const auto invL = TableLookupBytes(LoadDup128(du, kZetaInv), sL);
2347
+ const auto invU = TableLookupBytes(tbl, sU);
2348
+ const auto invX = TableLookupBytes(tbl, sX);
2349
+ const auto outL = Xor(sX, TableLookupBytesOr0(tbl, Xor(invL, invU)));
2350
+ const auto outU = Xor(sU, TableLookupBytesOr0(tbl, Xor(invL, invX)));
2351
+
2352
+ const auto affL = TableLookupBytesOr0(affine_tblL, outL);
2353
+ const auto affU = TableLookupBytesOr0(affine_tblU, outU);
2354
+ return Xor(affL, affU);
2355
+ }
2356
+
2357
+ template <class V> // u8
2358
+ HWY_INLINE V SubBytes(V state) {
2359
+ const DFromV<V> du;
2360
+ // Linear skew (cannot bake 0x63 bias into the table because out* indices
2361
+ // may have the infinity flag set).
2362
+ alignas(16) static constexpr uint8_t kAffineL[16] = {
2363
+ 0x00, 0xC7, 0xBD, 0x6F, 0x17, 0x6D, 0xD2, 0xD0,
2364
+ 0x78, 0xA8, 0x02, 0xC5, 0x7A, 0xBF, 0xAA, 0x15};
2365
+ alignas(16) static constexpr uint8_t kAffineU[16] = {
2366
+ 0x00, 0x6A, 0xBB, 0x5F, 0xA5, 0x74, 0xE4, 0xCF,
2367
+ 0xFA, 0x35, 0x2B, 0x41, 0xD1, 0x90, 0x1E, 0x8E};
2368
+ return Xor(SubBytesMulInverseAndAffineLookup(state, LoadDup128(du, kAffineL),
2369
+ LoadDup128(du, kAffineU)),
2370
+ Set(du, uint8_t{0x63}));
2371
+ }
2372
+
2373
+ template <class V> // u8
2374
+ HWY_INLINE V InvSubBytes(V state) {
2375
+ const DFromV<V> du;
2376
+ alignas(16) static constexpr uint8_t kGF2P4InvToGF2P8InvL[16]{
2377
+ 0x00, 0x40, 0xF9, 0x7E, 0x53, 0xEA, 0x87, 0x13,
2378
+ 0x2D, 0x3E, 0x94, 0xD4, 0xB9, 0x6D, 0xAA, 0xC7};
2379
+ alignas(16) static constexpr uint8_t kGF2P4InvToGF2P8InvU[16]{
2380
+ 0x00, 0x1D, 0x44, 0x93, 0x0F, 0x56, 0xD7, 0x12,
2381
+ 0x9C, 0x8E, 0xC5, 0xD8, 0x59, 0x81, 0x4B, 0xCA};
2382
+
2383
+ // Apply the inverse affine transformation
2384
+ const auto b = Xor(Xor3(Or(ShiftLeft<1>(state), ShiftRight<7>(state)),
2385
+ Or(ShiftLeft<3>(state), ShiftRight<5>(state)),
2386
+ Or(ShiftLeft<6>(state), ShiftRight<2>(state))),
2387
+ Set(du, uint8_t{0x05}));
2388
+
2389
+ // The GF(2^8) multiplicative inverse is computed as follows:
2390
+ // - Changing the polynomial basis to GF(2^4)
2391
+ // - Computing the GF(2^4) multiplicative inverse
2392
+ // - Converting the GF(2^4) multiplicative inverse to the GF(2^8)
2393
+ // multiplicative inverse through table lookups using the
2394
+ // kGF2P4InvToGF2P8InvL and kGF2P4InvToGF2P8InvU tables
2395
+ return SubBytesMulInverseAndAffineLookup(
2396
+ b, LoadDup128(du, kGF2P4InvToGF2P8InvL),
2397
+ LoadDup128(du, kGF2P4InvToGF2P8InvU));
2398
+ }
2399
+
2400
+ } // namespace detail
2401
+
2402
+ #endif // HWY_TARGET != HWY_SCALAR
2403
+
2404
+ // "Include guard": skip if native AES instructions are available.
2405
+ #if (defined(HWY_NATIVE_AES) == defined(HWY_TARGET_TOGGLE))
2406
+ #ifdef HWY_NATIVE_AES
2407
+ #undef HWY_NATIVE_AES
2408
+ #else
2409
+ #define HWY_NATIVE_AES
2410
+ #endif
2411
+
2412
+ // (Must come after HWY_TARGET_TOGGLE, else we don't reset it for scalar)
2413
+ #if HWY_TARGET != HWY_SCALAR
2414
+
2415
+ namespace detail {
2416
+
2417
+ template <class V> // u8
2418
+ HWY_INLINE V ShiftRows(const V state) {
2419
+ const DFromV<V> du;
2420
+ alignas(16) static constexpr uint8_t kShiftRow[16] = {
2421
+ 0, 5, 10, 15, // transposed: state is column major
2422
+ 4, 9, 14, 3, //
2423
+ 8, 13, 2, 7, //
2424
+ 12, 1, 6, 11};
2425
+ const auto shift_row = LoadDup128(du, kShiftRow);
2426
+ return TableLookupBytes(state, shift_row);
2427
+ }
2428
+
2429
+ template <class V> // u8
2430
+ HWY_INLINE V InvShiftRows(const V state) {
2431
+ const DFromV<V> du;
2432
+ alignas(16) static constexpr uint8_t kShiftRow[16] = {
2433
+ 0, 13, 10, 7, // transposed: state is column major
2434
+ 4, 1, 14, 11, //
2435
+ 8, 5, 2, 15, //
2436
+ 12, 9, 6, 3};
2437
+ const auto shift_row = LoadDup128(du, kShiftRow);
2438
+ return TableLookupBytes(state, shift_row);
2439
+ }
2440
+
2441
+ template <class V> // u8
2442
+ HWY_INLINE V GF2P8Mod11BMulBy2(V v) {
2443
+ const DFromV<V> du;
2444
+ const RebindToSigned<decltype(du)> di; // can only do signed comparisons
2445
+ const auto msb = Lt(BitCast(di, v), Zero(di));
2446
+ const auto overflow = BitCast(du, IfThenElseZero(msb, Set(di, int8_t{0x1B})));
2447
+ return Xor(Add(v, v), overflow); // = v*2 in GF(2^8).
2448
+ }
2449
+
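GF2P8Mod11BMulBy2 is the vector form of the AES "xtime" operation: multiplication by x in GF(2^8) modulo the AES polynomial x^8 + x^4 + x^3 + x + 1 (0x11B), i.e. shift left by one and XOR in 0x1B whenever the top bit carried out. Scalar reference:

uint8_t GF2P8MulBy2Scalar(uint8_t v) {
  const uint8_t overflow = (v & 0x80) ? 0x1B : 0x00;  // reduce if bit 7 carries out
  return static_cast<uint8_t>((v << 1) ^ overflow);
}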
2450
+ template <class V> // u8
2451
+ HWY_INLINE V MixColumns(const V state) {
2452
+ const DFromV<V> du;
2453
+ // For each column, the rows are the sum of GF(2^8) matrix multiplication by:
2454
+ // 2 3 1 1 // Let s := state*1, d := state*2, t := state*3.
2455
+ // 1 2 3 1 // d are on diagonal, no permutation needed.
2456
+ // 1 1 2 3 // t1230 indicates column indices of threes for the 4 rows.
2457
+ // 3 1 1 2 // We also need to compute s2301 and s3012 (=1230 o 2301).
2458
+ alignas(16) static constexpr uint8_t k2301[16] = {
2459
+ 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13};
2460
+ alignas(16) static constexpr uint8_t k1230[16] = {
2461
+ 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12};
2462
+ const auto d = GF2P8Mod11BMulBy2(state); // = state*2 in GF(2^8).
2463
+ const auto s2301 = TableLookupBytes(state, LoadDup128(du, k2301));
2464
+ const auto d_s2301 = Xor(d, s2301);
2465
+ const auto t_s2301 = Xor(state, d_s2301); // t(s*3) = XOR-sum {s, d(s*2)}
2466
+ const auto t1230_s3012 = TableLookupBytes(t_s2301, LoadDup128(du, k1230));
2467
+ return Xor(d_s2301, t1230_s3012); // XOR-sum of 4 terms
2468
+ }
2469
+
2470
+ template <class V> // u8
2471
+ HWY_INLINE V InvMixColumns(const V state) {
2472
+ const DFromV<V> du;
2473
+ // For each column, the rows are the sum of GF(2^8) matrix multiplication by:
2474
+ // 14 11 13 9
2475
+ // 9 14 11 13
2476
+ // 13 9 14 11
2477
+ // 11 13 9 14
2478
+ alignas(16) static constexpr uint8_t k2301[16] = {
2479
+ 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13};
2480
+ alignas(16) static constexpr uint8_t k1230[16] = {
2481
+ 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12};
2482
+ const auto v1230 = LoadDup128(du, k1230);
2483
+
2484
+ const auto sx2 = GF2P8Mod11BMulBy2(state); /* = state*2 in GF(2^8) */
2485
+ const auto sx4 = GF2P8Mod11BMulBy2(sx2); /* = state*4 in GF(2^8) */
2486
+ const auto sx8 = GF2P8Mod11BMulBy2(sx4); /* = state*8 in GF(2^8) */
2487
+ const auto sx9 = Xor(sx8, state); /* = state*9 in GF(2^8) */
2488
+ const auto sx11 = Xor(sx9, sx2); /* = state*11 in GF(2^8) */
2489
+ const auto sx13 = Xor(sx9, sx4); /* = state*13 in GF(2^8) */
2490
+ const auto sx14 = Xor3(sx8, sx4, sx2); /* = state*14 in GF(2^8) */
2491
+
2492
+ const auto sx13_0123_sx9_1230 = Xor(sx13, TableLookupBytes(sx9, v1230));
2493
+ const auto sx14_0123_sx11_1230 = Xor(sx14, TableLookupBytes(sx11, v1230));
2494
+ const auto sx13_2301_sx9_3012 =
2495
+ TableLookupBytes(sx13_0123_sx9_1230, LoadDup128(du, k2301));
2496
+ return Xor(sx14_0123_sx11_1230, sx13_2301_sx9_3012);
2497
+ }
2498
+
2499
+ } // namespace detail
2500
+
2501
+ template <class V> // u8
2502
+ HWY_API V AESRound(V state, const V round_key) {
2503
+ // Intel docs swap the first two steps, but it does not matter because
2504
+ // ShiftRows is a permutation and SubBytes is independent of lane index.
2505
+ state = detail::SubBytes(state);
2506
+ state = detail::ShiftRows(state);
2507
+ state = detail::MixColumns(state);
2508
+ state = Xor(state, round_key); // AddRoundKey
2509
+ return state;
2510
+ }
2511
+
2512
+ template <class V> // u8
2513
+ HWY_API V AESLastRound(V state, const V round_key) {
2514
+ // Like AESRound, but without MixColumns.
2515
+ state = detail::SubBytes(state);
2516
+ state = detail::ShiftRows(state);
2517
+ state = Xor(state, round_key); // AddRoundKey
2518
+ return state;
2519
+ }
2520
+
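Given these helpers, a full AES-128 block encryption is the usual 10-round schedule: AddRoundKey with round key 0, nine AESRound calls, then AESLastRound. A hedged usage sketch (not part of the header), assuming a non-scalar target, the usual Highway boilerplate, and that the caller has already expanded the key into round_keys[0..10]; key expansion itself is not shown:

namespace hn = hwy::HWY_NAMESPACE;
// V is a vector of uint8_t holding one or more 16-byte blocks.
template <class V>
V AES128Encrypt(V block, const V (&round_keys)[11]) {
  V state = hn::Xor(block, round_keys[0]);         // initial AddRoundKey
  for (int r = 1; r <= 9; ++r) {
    state = hn::AESRound(state, round_keys[r]);    // SubBytes, ShiftRows, MixColumns, AddRoundKey
  }
  return hn::AESLastRound(state, round_keys[10]);  // final round omits MixColumns
}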
2521
+ template <class V>
2522
+ HWY_API V AESInvMixColumns(V state) {
2523
+ return detail::InvMixColumns(state);
2524
+ }
2525
+
2526
+ template <class V> // u8
2527
+ HWY_API V AESRoundInv(V state, const V round_key) {
2528
+ state = detail::InvSubBytes(state);
2529
+ state = detail::InvShiftRows(state);
2530
+ state = detail::InvMixColumns(state);
2531
+ state = Xor(state, round_key); // AddRoundKey
2532
+ return state;
2533
+ }
2534
+
2535
+ template <class V> // u8
2536
+ HWY_API V AESLastRoundInv(V state, const V round_key) {
2537
+ // Like AESRoundInv, but without InvMixColumns.
2538
+ state = detail::InvSubBytes(state);
2539
+ state = detail::InvShiftRows(state);
2540
+ state = Xor(state, round_key); // AddRoundKey
2541
+ return state;
2542
+ }
2543
+
2544
+ template <uint8_t kRcon, class V, HWY_IF_U8_D(DFromV<V>)>
2545
+ HWY_API V AESKeyGenAssist(V v) {
2546
+ alignas(16) static constexpr uint8_t kRconXorMask[16] = {
2547
+ 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0};
2548
+ alignas(16) static constexpr uint8_t kRotWordShuffle[16] = {
2549
+ 4, 5, 6, 7, 5, 6, 7, 4, 12, 13, 14, 15, 13, 14, 15, 12};
2550
+ const DFromV<decltype(v)> d;
2551
+ const auto sub_word_result = detail::SubBytes(v);
2552
+ const auto rot_word_result =
2553
+ TableLookupBytes(sub_word_result, LoadDup128(d, kRotWordShuffle));
2554
+ return Xor(rot_word_result, LoadDup128(d, kRconXorMask));
2555
+ }
2556
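This mirrors the x86 AESKEYGENASSIST semantics: word 1 holds RotWord(SubWord(X1)) ^ rcon and word 3 holds RotWord(SubWord(X3)) ^ rcon. A hypothetical sketch of one AES-128 key-expansion step built on it, for a single 128-bit key and written as if inside HWY_NAMESPACE (the helper name is an assumption):

template <uint8_t kRcon, class V>  // u8, single 128-bit round key
V NextAes128RoundKey(V key) {
  const DFromV<V> d8;
  const Repartition<uint32_t, decltype(d8)> d32;
  // Broadcast word 3 of the assist result, i.e. RotWord(SubWord(w3)) ^ rcon.
  const V t =
      BitCast(d8, Broadcast<3>(BitCast(d32, AESKeyGenAssist<kRcon>(key))));
  // Prefix-XOR the previous key words; after three steps, word i = w0 ^ .. ^ wi.
  key = Xor(key, ShiftLeftBytes<4>(d8, key));
  key = Xor(key, ShiftLeftBytes<4>(d8, key));
  key = Xor(key, ShiftLeftBytes<4>(d8, key));
  return Xor(key, t);  // new word i = (w0 ^ .. ^ wi) ^ t
}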
+
2557
+ // Constant-time implementation inspired by
2558
+ // https://www.bearssl.org/constanttime.html, but about half the cost because we
2559
+ // use 64x64 multiplies and 128-bit XORs.
2560
+ template <class V>
2561
+ HWY_API V CLMulLower(V a, V b) {
2562
+ const DFromV<V> d;
2563
+ static_assert(IsSame<TFromD<decltype(d)>, uint64_t>(), "V must be u64");
2564
+ const auto k1 = Set(d, 0x1111111111111111ULL);
2565
+ const auto k2 = Set(d, 0x2222222222222222ULL);
2566
+ const auto k4 = Set(d, 0x4444444444444444ULL);
2567
+ const auto k8 = Set(d, 0x8888888888888888ULL);
2568
+ const auto a0 = And(a, k1);
2569
+ const auto a1 = And(a, k2);
2570
+ const auto a2 = And(a, k4);
2571
+ const auto a3 = And(a, k8);
2572
+ const auto b0 = And(b, k1);
2573
+ const auto b1 = And(b, k2);
2574
+ const auto b2 = And(b, k4);
2575
+ const auto b3 = And(b, k8);
2576
+
2577
+ auto m0 = Xor(MulEven(a0, b0), MulEven(a1, b3));
2578
+ auto m1 = Xor(MulEven(a0, b1), MulEven(a1, b0));
2579
+ auto m2 = Xor(MulEven(a0, b2), MulEven(a1, b1));
2580
+ auto m3 = Xor(MulEven(a0, b3), MulEven(a1, b2));
2581
+ m0 = Xor(m0, Xor(MulEven(a2, b2), MulEven(a3, b1)));
2582
+ m1 = Xor(m1, Xor(MulEven(a2, b3), MulEven(a3, b2)));
2583
+ m2 = Xor(m2, Xor(MulEven(a2, b0), MulEven(a3, b3)));
2584
+ m3 = Xor(m3, Xor(MulEven(a2, b1), MulEven(a3, b0)));
2585
+ return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8)));
2586
+ }
2587
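A small usage sketch of the semantics, written as if inside HWY_NAMESPACE: the even u64 lanes are multiplied as GF(2) polynomials, and each lane pair receives the 128-bit product.

const ScalableTag<uint64_t> du64;
// (x^7 + x^2 + x + 1) * x = x^8 + x^3 + x^2 + x, i.e. clmul(0x87, 0x2) = 0x10E.
const auto prod = CLMulLower(Set(du64, uint64_t{0x87}), Set(du64, uint64_t{0x02}));
// Even lanes hold the low 64 bits (0x10E); odd lanes hold the high 64 bits (0 here).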
+
2588
+ template <class V>
2589
+ HWY_API V CLMulUpper(V a, V b) {
2590
+ const DFromV<V> d;
2591
+ static_assert(IsSame<TFromD<decltype(d)>, uint64_t>(), "V must be u64");
2592
+ const auto k1 = Set(d, 0x1111111111111111ULL);
2593
+ const auto k2 = Set(d, 0x2222222222222222ULL);
2594
+ const auto k4 = Set(d, 0x4444444444444444ULL);
2595
+ const auto k8 = Set(d, 0x8888888888888888ULL);
2596
+ const auto a0 = And(a, k1);
2597
+ const auto a1 = And(a, k2);
2598
+ const auto a2 = And(a, k4);
2599
+ const auto a3 = And(a, k8);
2600
+ const auto b0 = And(b, k1);
2601
+ const auto b1 = And(b, k2);
2602
+ const auto b2 = And(b, k4);
2603
+ const auto b3 = And(b, k8);
2604
+
2605
+ auto m0 = Xor(MulOdd(a0, b0), MulOdd(a1, b3));
2606
+ auto m1 = Xor(MulOdd(a0, b1), MulOdd(a1, b0));
2607
+ auto m2 = Xor(MulOdd(a0, b2), MulOdd(a1, b1));
2608
+ auto m3 = Xor(MulOdd(a0, b3), MulOdd(a1, b2));
2609
+ m0 = Xor(m0, Xor(MulOdd(a2, b2), MulOdd(a3, b1)));
2610
+ m1 = Xor(m1, Xor(MulOdd(a2, b3), MulOdd(a3, b2)));
2611
+ m2 = Xor(m2, Xor(MulOdd(a2, b0), MulOdd(a3, b3)));
2612
+ m3 = Xor(m3, Xor(MulOdd(a2, b1), MulOdd(a3, b0)));
2613
+ return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8)));
2614
+ }
2615
+
2616
+ #endif // HWY_NATIVE_AES
2617
+ #endif // HWY_TARGET != HWY_SCALAR
2618
+
2619
+ // ------------------------------ PopulationCount
2620
+
2621
+ // "Include guard": skip if native POPCNT-related instructions are available.
2622
+ #if (defined(HWY_NATIVE_POPCNT) == defined(HWY_TARGET_TOGGLE))
2623
+ #ifdef HWY_NATIVE_POPCNT
2624
+ #undef HWY_NATIVE_POPCNT
2625
+ #else
2626
+ #define HWY_NATIVE_POPCNT
2627
+ #endif
2628
+
2629
+ // This overload requires vectors to be at least 16 bytes, which is the case
2630
+ // for LMUL >= 2.
2631
+ #undef HWY_IF_POPCNT
2632
+ #if HWY_TARGET == HWY_RVV
2633
+ #define HWY_IF_POPCNT(D) \
2634
+ hwy::EnableIf<D().Pow2() >= 1 && D().MaxLanes() >= 16>* = nullptr
2635
+ #else
2636
+ // Other targets only have these two overloads which are mutually exclusive, so
2637
+ // no further conditions are required.
2638
+ #define HWY_IF_POPCNT(D) void* = nullptr
2639
+ #endif // HWY_TARGET == HWY_RVV
2640
+
2641
+ template <class V, class D = DFromV<V>, HWY_IF_U8_D(D),
2642
+ HWY_IF_V_SIZE_GT_D(D, 8), HWY_IF_POPCNT(D)>
2643
+ HWY_API V PopulationCount(V v) {
2644
+ const D d;
2645
+ HWY_ALIGN constexpr uint8_t kLookup[16] = {
2646
+ 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
2647
+ };
2648
+ const auto lo = And(v, Set(d, uint8_t{0xF}));
2649
+ const auto hi = ShiftRight<4>(v);
2650
+ const auto lookup = LoadDup128(d, kLookup);
2651
+ return Add(TableLookupBytes(lookup, hi), TableLookupBytes(lookup, lo));
2652
+ }
2653
+
2654
+ // RVV has a specialization that avoids the Set().
2655
+ #if HWY_TARGET != HWY_RVV
2656
+ // Slower fallback for capped vectors.
2657
+ template <class V, class D = DFromV<V>, HWY_IF_U8_D(D),
2658
+ HWY_IF_V_SIZE_LE_D(D, 8)>
2659
+ HWY_API V PopulationCount(V v) {
2660
+ const D d;
2661
+ // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3
2662
+ const V k33 = Set(d, uint8_t{0x33});
2663
+ v = Sub(v, And(ShiftRight<1>(v), Set(d, uint8_t{0x55})));
2664
+ v = Add(And(ShiftRight<2>(v), k33), And(v, k33));
2665
+ return And(Add(v, ShiftRight<4>(v)), Set(d, uint8_t{0x0F}));
2666
+ }
2667
+ #endif // HWY_TARGET != HWY_RVV
2668
+
2669
+ template <class V, class D = DFromV<V>, HWY_IF_U16_D(D)>
2670
+ HWY_API V PopulationCount(V v) {
2671
+ const D d;
2672
+ const Repartition<uint8_t, decltype(d)> d8;
2673
+ const auto vals = BitCast(d, PopulationCount(BitCast(d8, v)));
2674
+ return Add(ShiftRight<8>(vals), And(vals, Set(d, uint16_t{0xFF})));
2675
+ }
2676
+
2677
+ template <class V, class D = DFromV<V>, HWY_IF_U32_D(D)>
2678
+ HWY_API V PopulationCount(V v) {
2679
+ const D d;
2680
+ Repartition<uint16_t, decltype(d)> d16;
2681
+ auto vals = BitCast(d, PopulationCount(BitCast(d16, v)));
2682
+ return Add(ShiftRight<16>(vals), And(vals, Set(d, uint32_t{0xFF})));
2683
+ }
2684
+
2685
+ #if HWY_HAVE_INTEGER64
2686
+ template <class V, class D = DFromV<V>, HWY_IF_U64_D(D)>
2687
+ HWY_API V PopulationCount(V v) {
2688
+ const D d;
2689
+ Repartition<uint32_t, decltype(d)> d32;
2690
+ auto vals = BitCast(d, PopulationCount(BitCast(d32, v)));
2691
+ return Add(ShiftRight<32>(vals), And(vals, Set(d, 0xFFULL)));
2692
+ }
2693
+ #endif
2694
+
2695
+ #endif // HWY_NATIVE_POPCNT
2696
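A short usage sketch for these fallbacks, written as if inside HWY_NAMESPACE; the results match the native POPCNT paths lane for lane.

const ScalableTag<uint8_t> d8;
const auto v = Set(d8, uint8_t{0xB1});   // 0b1011'0001 has four bits set
const auto counts = PopulationCount(v);  // every lane holds 4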
+
2697
+ // ------------------------------ 8-bit multiplication
2698
+
2699
+ // "Include guard": skip if native 8-bit mul instructions are available.
2700
+ #if (defined(HWY_NATIVE_MUL_8) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
2701
+ #ifdef HWY_NATIVE_MUL_8
2702
+ #undef HWY_NATIVE_MUL_8
2703
+ #else
2704
+ #define HWY_NATIVE_MUL_8
2705
+ #endif
2706
+
2707
+ // 8 bit and fits in wider reg: promote
2708
+ template <class V, HWY_IF_T_SIZE_V(V, 1),
2709
+ HWY_IF_V_SIZE_LE_V(V, HWY_MAX_BYTES / 2)>
2710
+ HWY_API V operator*(const V a, const V b) {
2711
+ const DFromV<decltype(a)> d;
2712
+ const Rebind<MakeWide<TFromV<V>>, decltype(d)> dw;
2713
+ const RebindToUnsigned<decltype(d)> du; // TruncateTo result
2714
+ const RebindToUnsigned<decltype(dw)> dwu; // TruncateTo input
2715
+ const VFromD<decltype(dw)> mul = PromoteTo(dw, a) * PromoteTo(dw, b);
2716
+ // TruncateTo is cheaper than ConcatEven.
2717
+ return BitCast(d, TruncateTo(du, BitCast(dwu, mul)));
2718
+ }
2719
+
2720
+ // 8 bit full reg: promote halves
2721
+ template <class V, HWY_IF_T_SIZE_V(V, 1),
2722
+ HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
2723
+ HWY_API V operator*(const V a, const V b) {
2724
+ const DFromV<decltype(a)> d;
2725
+ const Half<decltype(d)> dh;
2726
+ const Twice<RepartitionToWide<decltype(dh)>> dw;
2727
+ const VFromD<decltype(dw)> a0 = PromoteTo(dw, LowerHalf(dh, a));
2728
+ const VFromD<decltype(dw)> a1 = PromoteTo(dw, UpperHalf(dh, a));
2729
+ const VFromD<decltype(dw)> b0 = PromoteTo(dw, LowerHalf(dh, b));
2730
+ const VFromD<decltype(dw)> b1 = PromoteTo(dw, UpperHalf(dh, b));
2731
+ const VFromD<decltype(dw)> m0 = a0 * b0;
2732
+ const VFromD<decltype(dw)> m1 = a1 * b1;
2733
+ return ConcatEven(d, BitCast(d, m1), BitCast(d, m0));
2734
+ }
2735
+
2736
+ #endif // HWY_NATIVE_MUL_8
2737
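Both fallbacks preserve the usual modulo-256 wrap-around because the widened product is truncated back to 8 bits. For instance, written as if inside HWY_NAMESPACE:

const ScalableTag<uint8_t> d8;
const auto prod = Mul(Set(d8, uint8_t{200}), Set(d8, uint8_t{3}));
// Each lane holds 88, i.e. (200 * 3) & 0xFF, matching a native 8-bit multiply.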
+
2738
+ // ------------------------------ 64-bit multiplication
2739
+
2740
+ // "Include guard": skip if native 64-bit mul instructions are available.
2741
+ #if (defined(HWY_NATIVE_MUL_64) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
2742
+ #ifdef HWY_NATIVE_MUL_64
2743
+ #undef HWY_NATIVE_MUL_64
2744
+ #else
2745
+ #define HWY_NATIVE_MUL_64
2746
+ #endif
2747
+
2748
+ // Single-lane i64 or u64
2749
+ template <class V, HWY_IF_T_SIZE_V(V, 8), HWY_IF_V_SIZE_V(V, 8),
2750
+ HWY_IF_NOT_FLOAT_V(V)>
2751
+ HWY_API V operator*(V x, V y) {
2752
+ const DFromV<V> d;
2753
+ using T = TFromD<decltype(d)>;
2754
+ using TU = MakeUnsigned<T>;
2755
+ const TU xu = static_cast<TU>(GetLane(x));
2756
+ const TU yu = static_cast<TU>(GetLane(y));
2757
+ return Set(d, static_cast<T>(xu * yu));
2758
+ }
2759
+
2760
+ template <class V, class D64 = DFromV<V>, HWY_IF_U64_D(D64),
2761
+ HWY_IF_V_SIZE_GT_D(D64, 8)>
2762
+ HWY_API V operator*(V x, V y) {
2763
+ RepartitionToNarrow<D64> d32;
2764
+ auto x32 = BitCast(d32, x);
2765
+ auto y32 = BitCast(d32, y);
2766
+ auto lolo = BitCast(d32, MulEven(x32, y32));
2767
+ auto lohi = BitCast(d32, MulEven(x32, BitCast(d32, ShiftRight<32>(y))));
2768
+ auto hilo = BitCast(d32, MulEven(BitCast(d32, ShiftRight<32>(x)), y32));
2769
+ auto hi = BitCast(d32, ShiftLeft<32>(BitCast(D64{}, lohi + hilo)));
2770
+ return BitCast(D64{}, lolo + hi);
2771
+ }
2772
+ template <class V, class DI64 = DFromV<V>, HWY_IF_I64_D(DI64),
2773
+ HWY_IF_V_SIZE_GT_D(DI64, 8)>
2774
+ HWY_API V operator*(V x, V y) {
2775
+ RebindToUnsigned<DI64> du64;
2776
+ return BitCast(DI64{}, BitCast(du64, x) * BitCast(du64, y));
2777
+ }
2778
+
2779
+ #endif // HWY_NATIVE_MUL_64
2780
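The vector path above is the standard 32x32-bit schoolbook decomposition of a 64-bit product. A scalar sketch of the same identity (illustrative only; the function name is hypothetical):

#include <cstdint>

uint64_t Mul64ViaHalves(uint64_t x, uint64_t y) {
  const uint64_t xl = x & 0xFFFFFFFFu, xh = x >> 32;
  const uint64_t yl = y & 0xFFFFFFFFu, yh = y >> 32;
  const uint64_t lolo = xl * yl;             // full 64-bit low product
  const uint64_t cross = xl * yh + xh * yl;  // xh * yh shifts entirely out of range
  return lolo + (cross << 32);               // == x * y mod 2^64
}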
+
2781
+ // ------------------------------ MulAdd / NegMulAdd
2782
+
2783
+ // "Include guard": skip if native int MulAdd instructions are available.
2784
+ #if (defined(HWY_NATIVE_INT_FMA) == defined(HWY_TARGET_TOGGLE))
2785
+ #ifdef HWY_NATIVE_INT_FMA
2786
+ #undef HWY_NATIVE_INT_FMA
2787
+ #else
2788
+ #define HWY_NATIVE_INT_FMA
2789
+ #endif
2790
+
2791
+ template <class V, HWY_IF_NOT_FLOAT_V(V)>
2792
+ HWY_API V MulAdd(V mul, V x, V add) {
2793
+ return Add(Mul(mul, x), add);
2794
+ }
2795
+
2796
+ template <class V, HWY_IF_NOT_FLOAT_V(V)>
2797
+ HWY_API V NegMulAdd(V mul, V x, V add) {
2798
+ return Sub(add, Mul(mul, x));
2799
+ }
2800
+
2801
+ #endif // HWY_NATIVE_INT_FMA
2802
+
2803
+ // ------------------------------ SatWidenMulPairwiseAdd
2804
+
2805
+ #if (defined(HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD) == \
2806
+ defined(HWY_TARGET_TOGGLE))
2807
+
2808
+ #ifdef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
2809
+ #undef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
2810
+ #else
2811
+ #define HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
2812
+ #endif
2813
+
2814
+ template <class DI16, class VU8, class VI8,
2815
+ class VU8_2 = Vec<Repartition<uint8_t, DI16>>, HWY_IF_I16_D(DI16),
2816
+ HWY_IF_U8_D(DFromV<VU8>), HWY_IF_I8_D(DFromV<VI8>),
2817
+ HWY_IF_LANES_D(DFromV<VU8>, HWY_MAX_LANES_V(VI8)),
2818
+ HWY_IF_LANES_D(DFromV<VU8>, HWY_MAX_LANES_V(VU8_2))>
2819
+ HWY_API Vec<DI16> SatWidenMulPairwiseAdd(DI16 di16, VU8 a, VI8 b) {
2820
+ const RebindToUnsigned<decltype(di16)> du16;
2821
+
2822
+ const auto a0 = And(BitCast(di16, a), Set(di16, int16_t{0x00FF}));
2823
+ const auto b0 = ShiftRight<8>(ShiftLeft<8>(BitCast(di16, b)));
2824
+
2825
+ const auto a1 = BitCast(di16, ShiftRight<8>(BitCast(du16, a)));
2826
+ const auto b1 = ShiftRight<8>(BitCast(di16, b));
2827
+
2828
+ return SaturatedAdd(Mul(a0, b0), Mul(a1, b1));
2829
+ }
2830
+
2831
+ #endif
2832
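A scalar reference for the per-lane semantics this fallback implements (illustrative; the function name is hypothetical): each i16 output lane is the saturated sum of two adjacent u8*i8 products, matching pmaddubsw-style behavior.

#include <algorithm>
#include <cstdint>

int16_t SatWidenMulPair(uint8_t a0, int8_t b0, uint8_t a1, int8_t b1) {
  const int32_t sum = int32_t{a0} * b0 + int32_t{a1} * b1;
  return static_cast<int16_t>(
      std::min<int32_t>(std::max<int32_t>(sum, -32768), 32767));
}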
+
2833
+ // ------------------------------ SumOfMulQuadAccumulate
2834
+
2835
+ #if (defined(HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE) == \
2836
+ defined(HWY_TARGET_TOGGLE))
2837
+
2838
+ #ifdef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
2839
+ #undef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
2840
+ #else
2841
+ #define HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
2842
+ #endif
2843
+
2844
+ template <class DI32, HWY_IF_I32_D(DI32)>
2845
+ HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 di32,
2846
+ VFromD<Repartition<int8_t, DI32>> a,
2847
+ VFromD<Repartition<int8_t, DI32>> b,
2848
+ VFromD<DI32> sum) {
2849
+ const Repartition<int16_t, decltype(di32)> di16;
2850
+
2851
+ const auto a0 = ShiftRight<8>(ShiftLeft<8>(BitCast(di16, a)));
2852
+ const auto b0 = ShiftRight<8>(ShiftLeft<8>(BitCast(di16, b)));
2853
+
2854
+ const auto a1 = ShiftRight<8>(BitCast(di16, a));
2855
+ const auto b1 = ShiftRight<8>(BitCast(di16, b));
2856
+
2857
+ return Add(sum, Add(WidenMulPairwiseAdd(di32, a0, b0),
2858
+ WidenMulPairwiseAdd(di32, a1, b1)));
2859
+ }
2860
+
2861
+ #endif
2862
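For reference, every SumOfMulQuadAccumulate overload in this section accumulates, per output lane, the dot product of four adjacent input pairs. A scalar sketch for the i8 case (illustrative only; the function name is hypothetical):

#include <cstdint>

int32_t SumOfMulQuad(const int8_t a[4], const int8_t b[4], int32_t sum) {
  for (int i = 0; i < 4; ++i) sum += int32_t{a[i]} * int32_t{b[i]};
  return sum;
}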
+
2863
+ #if (defined(HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE) == \
2864
+ defined(HWY_TARGET_TOGGLE))
2865
+
2866
+ #ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
2867
+ #undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
2868
+ #else
2869
+ #define HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
2870
+ #endif
2871
+
2872
+ template <class DU32, HWY_IF_U32_D(DU32)>
2873
+ HWY_API VFromD<DU32> SumOfMulQuadAccumulate(
2874
+ DU32 du32, VFromD<Repartition<uint8_t, DU32>> a,
2875
+ VFromD<Repartition<uint8_t, DU32>> b, VFromD<DU32> sum) {
2876
+ const Repartition<uint16_t, decltype(du32)> du16;
2877
+ const RebindToSigned<decltype(du16)> di16;
2878
+ const RebindToSigned<decltype(du32)> di32;
2879
+
2880
+ const auto lo8_mask = Set(di16, int16_t{0x00FF});
2881
+ const auto a0 = And(BitCast(di16, a), lo8_mask);
2882
+ const auto b0 = And(BitCast(di16, b), lo8_mask);
2883
+
2884
+ const auto a1 = BitCast(di16, ShiftRight<8>(BitCast(du16, a)));
2885
+ const auto b1 = BitCast(di16, ShiftRight<8>(BitCast(du16, b)));
2886
+
2887
+ return Add(sum, Add(BitCast(du32, WidenMulPairwiseAdd(di32, a0, b0)),
2888
+ BitCast(du32, WidenMulPairwiseAdd(di32, a1, b1))));
2889
+ }
2890
+
2891
+ #endif
2892
+
2893
+ #if (defined(HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE) == \
2894
+ defined(HWY_TARGET_TOGGLE))
2895
+
2896
+ #ifdef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
2897
+ #undef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
2898
+ #else
2899
+ #define HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
2900
+ #endif
2901
+
2902
+ template <class DI32, HWY_IF_I32_D(DI32)>
2903
+ HWY_API VFromD<DI32> SumOfMulQuadAccumulate(
2904
+ DI32 di32, VFromD<Repartition<uint8_t, DI32>> a_u,
2905
+ VFromD<Repartition<int8_t, DI32>> b_i, VFromD<DI32> sum) {
2906
+ const Repartition<int16_t, decltype(di32)> di16;
2907
+ const RebindToUnsigned<decltype(di16)> du16;
2908
+
2909
+ const auto a0 = And(BitCast(di16, a_u), Set(di16, int16_t{0x00FF}));
2910
+ const auto b0 = ShiftRight<8>(ShiftLeft<8>(BitCast(di16, b_i)));
2911
+
2912
+ const auto a1 = BitCast(di16, ShiftRight<8>(BitCast(du16, a_u)));
2913
+ const auto b1 = ShiftRight<8>(BitCast(di16, b_i));
2914
+
2915
+ // NOTE: SatWidenMulPairwiseAdd(di16, a_u, b_i) cannot be used in
2916
+ // SumOfMulQuadAccumulate as it is possible for
2917
+ // a_u[0]*b_i[0]+a_u[1]*b_i[1] to overflow an int16_t if a_u[0], b_i[0],
2918
+ // a_u[1], and b_i[1] are all non-zero and b_i[0] and b_i[1] have the same
2919
+ // sign.
2920
+
2921
+ return Add(sum, Add(WidenMulPairwiseAdd(di32, a0, b0),
2922
+ WidenMulPairwiseAdd(di32, a1, b1)));
2923
+ }
2924
+
2925
+ #endif
2926
+
2927
+ #if (defined(HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE) == \
2928
+ defined(HWY_TARGET_TOGGLE))
2929
+
2930
+ #ifdef HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE
2931
+ #undef HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE
2932
+ #else
2933
+ #define HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE
2934
+ #endif
2935
+
2936
+ #if HWY_HAVE_INTEGER64
2937
+ template <class DI64, HWY_IF_I64_D(DI64)>
2938
+ HWY_API VFromD<DI64> SumOfMulQuadAccumulate(
2939
+ DI64 di64, VFromD<Repartition<int16_t, DI64>> a,
2940
+ VFromD<Repartition<int16_t, DI64>> b, VFromD<DI64> sum) {
2941
+ const Repartition<int32_t, decltype(di64)> di32;
2942
+
2943
+ // WidenMulPairwiseAdd(di32, a, b) is okay here as
2944
+ // a[0]*b[0]+a[1]*b[1] is between -2147418112 and 2147483648 and as
2945
+ // a[0]*b[0]+a[1]*b[1] can only overflow an int32_t if
2946
+ // a[0], b[0], a[1], and b[1] are all equal to -32768.
2947
+
2948
+ const auto i32_pairwise_sum = WidenMulPairwiseAdd(di32, a, b);
2949
+ const auto i32_pairwise_sum_overflow =
2950
+ VecFromMask(di32, Eq(i32_pairwise_sum, Set(di32, LimitsMin<int32_t>())));
2951
+
2952
+ // The upper 32 bits of p0 and p1 need to be zeroed out in the case of
2953
+ // overflow.
2954
+ const auto hi32_mask = Set(di64, static_cast<int64_t>(~int64_t{0xFFFFFFFF}));
2955
+ const auto p0_zero_out_mask =
2956
+ ShiftLeft<32>(BitCast(di64, i32_pairwise_sum_overflow));
2957
+ const auto p1_zero_out_mask =
2958
+ And(BitCast(di64, i32_pairwise_sum_overflow), hi32_mask);
2959
+
2960
+ const auto p0 =
2961
+ AndNot(p0_zero_out_mask,
2962
+ ShiftRight<32>(ShiftLeft<32>(BitCast(di64, i32_pairwise_sum))));
2963
+ const auto p1 =
2964
+ AndNot(p1_zero_out_mask, ShiftRight<32>(BitCast(di64, i32_pairwise_sum)));
2965
+
2966
+ return Add(sum, Add(p0, p1));
2967
+ }
2968
+ #endif // HWY_HAVE_INTEGER64
2969
+ #endif // HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE
2970
+
2971
+ #if (defined(HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE) == \
2972
+ defined(HWY_TARGET_TOGGLE))
2973
+
2974
+ #ifdef HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE
2975
+ #undef HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE
2976
+ #else
2977
+ #define HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE
2978
+ #endif
2979
+
2980
+ #if HWY_HAVE_INTEGER64
2981
+ template <class DU64, HWY_IF_U64_D(DU64)>
2982
+ HWY_API VFromD<DU64> SumOfMulQuadAccumulate(
2983
+ DU64 du64, VFromD<Repartition<uint16_t, DU64>> a,
2984
+ VFromD<Repartition<uint16_t, DU64>> b, VFromD<DU64> sum) {
2985
+ const auto u32_even_prod = MulEven(a, b);
2986
+ const auto u32_odd_prod = MulOdd(a, b);
2987
+
2988
+ const auto lo32_mask = Set(du64, uint64_t{0xFFFFFFFFu});
2989
+
2990
+ const auto p0 = Add(And(BitCast(du64, u32_even_prod), lo32_mask),
2991
+ And(BitCast(du64, u32_odd_prod), lo32_mask));
2992
+ const auto p1 = Add(ShiftRight<32>(BitCast(du64, u32_even_prod)),
2993
+ ShiftRight<32>(BitCast(du64, u32_odd_prod)));
2994
+
2995
+ return Add(sum, Add(p0, p1));
2996
+ }
2997
+ #endif // HWY_HAVE_INTEGER64
2998
+ #endif // HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE
2999
+
3000
+ // ------------------------------ F64 ApproximateReciprocal
3001
+
3002
+ #if (defined(HWY_NATIVE_F64_APPROX_RECIP) == defined(HWY_TARGET_TOGGLE))
3003
+ #ifdef HWY_NATIVE_F64_APPROX_RECIP
3004
+ #undef HWY_NATIVE_F64_APPROX_RECIP
3005
+ #else
3006
+ #define HWY_NATIVE_F64_APPROX_RECIP
3007
+ #endif
3008
+
3009
+ #if HWY_HAVE_FLOAT64
3010
+ template <class V, HWY_IF_F64_D(DFromV<V>)>
3011
+ HWY_API V ApproximateReciprocal(V v) {
3012
+ const DFromV<decltype(v)> d;
3013
+ return Div(Set(d, 1.0), v);
3014
+ }
3015
+ #endif // HWY_HAVE_FLOAT64
3016
+
3017
+ #endif // HWY_NATIVE_F64_APPROX_RECIP
3018
+
3019
+ // ------------------------------ F64 ApproximateReciprocalSqrt
3020
+
3021
+ #if (defined(HWY_NATIVE_F64_APPROX_RSQRT) == defined(HWY_TARGET_TOGGLE))
3022
+ #ifdef HWY_NATIVE_F64_APPROX_RSQRT
3023
+ #undef HWY_NATIVE_F64_APPROX_RSQRT
3024
+ #else
3025
+ #define HWY_NATIVE_F64_APPROX_RSQRT
3026
+ #endif
3027
+
3028
+ #if HWY_HAVE_FLOAT64
3029
+ template <class V, HWY_IF_F64_D(DFromV<V>)>
3030
+ HWY_API V ApproximateReciprocalSqrt(V v) {
3031
+ const DFromV<decltype(v)> d;
3032
+ const RebindToUnsigned<decltype(d)> du;
3033
+ const auto half = Mul(v, Set(d, 0.5));
3034
+ // Initial guess based on log2(f)
3035
+ const auto guess = BitCast(d, Sub(Set(du, uint64_t{0x5FE6EB50C7B537A9u}),
3036
+ ShiftRight<1>(BitCast(du, v))));
3037
+ // One Newton-Raphson iteration
3038
+ return Mul(guess, NegMulAdd(Mul(half, guess), guess, Set(d, 1.5)));
3039
+ }
3040
+ #endif // HWY_HAVE_FLOAT64
3041
+
3042
+ #endif // HWY_NATIVE_F64_APPROX_RSQRT
3043
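This is the double-precision variant of the classic bit-trick reciprocal square root: an initial guess derived from the exponent bits, refined by one Newton-Raphson step. A scalar sketch of the same procedure (illustrative only; the function name is hypothetical):

#include <cstdint>
#include <cstring>

double ApproxRecipSqrt(double x) {
  uint64_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  bits = 0x5FE6EB50C7B537A9ull - (bits >> 1);  // initial guess from the exponent
  double y;
  std::memcpy(&y, &bits, sizeof(y));
  return y * (1.5 - 0.5 * x * y * y);          // one Newton-Raphson refinement
}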
+
3044
+ // ------------------------------ Compress*
3045
+
3046
+ // "Include guard": skip if native 8-bit compress instructions are available.
3047
+ #if (defined(HWY_NATIVE_COMPRESS8) == defined(HWY_TARGET_TOGGLE))
3048
+ #ifdef HWY_NATIVE_COMPRESS8
3049
+ #undef HWY_NATIVE_COMPRESS8
3050
+ #else
3051
+ #define HWY_NATIVE_COMPRESS8
3052
+ #endif
3053
+
3054
+ template <class V, class D, typename T, HWY_IF_T_SIZE(T, 1)>
3055
+ HWY_API size_t CompressBitsStore(V v, const uint8_t* HWY_RESTRICT bits, D d,
3056
+ T* unaligned) {
3057
+ HWY_ALIGN T lanes[MaxLanes(d)];
3058
+ Store(v, d, lanes);
3059
+
3060
+ const Simd<T, HWY_MIN(MaxLanes(d), 8), 0> d8;
3061
+ T* HWY_RESTRICT pos = unaligned;
3062
+
3063
+ HWY_ALIGN constexpr T table[2048] = {
3064
+ 0, 1, 2, 3, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, //
3065
+ 1, 0, 2, 3, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, //
3066
+ 2, 0, 1, 3, 4, 5, 6, 7, /**/ 0, 2, 1, 3, 4, 5, 6, 7, //
3067
+ 1, 2, 0, 3, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, //
3068
+ 3, 0, 1, 2, 4, 5, 6, 7, /**/ 0, 3, 1, 2, 4, 5, 6, 7, //
3069
+ 1, 3, 0, 2, 4, 5, 6, 7, /**/ 0, 1, 3, 2, 4, 5, 6, 7, //
3070
+ 2, 3, 0, 1, 4, 5, 6, 7, /**/ 0, 2, 3, 1, 4, 5, 6, 7, //
3071
+ 1, 2, 3, 0, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, //
3072
+ 4, 0, 1, 2, 3, 5, 6, 7, /**/ 0, 4, 1, 2, 3, 5, 6, 7, //
3073
+ 1, 4, 0, 2, 3, 5, 6, 7, /**/ 0, 1, 4, 2, 3, 5, 6, 7, //
3074
+ 2, 4, 0, 1, 3, 5, 6, 7, /**/ 0, 2, 4, 1, 3, 5, 6, 7, //
3075
+ 1, 2, 4, 0, 3, 5, 6, 7, /**/ 0, 1, 2, 4, 3, 5, 6, 7, //
3076
+ 3, 4, 0, 1, 2, 5, 6, 7, /**/ 0, 3, 4, 1, 2, 5, 6, 7, //
3077
+ 1, 3, 4, 0, 2, 5, 6, 7, /**/ 0, 1, 3, 4, 2, 5, 6, 7, //
3078
+ 2, 3, 4, 0, 1, 5, 6, 7, /**/ 0, 2, 3, 4, 1, 5, 6, 7, //
3079
+ 1, 2, 3, 4, 0, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, //
3080
+ 5, 0, 1, 2, 3, 4, 6, 7, /**/ 0, 5, 1, 2, 3, 4, 6, 7, //
3081
+ 1, 5, 0, 2, 3, 4, 6, 7, /**/ 0, 1, 5, 2, 3, 4, 6, 7, //
3082
+ 2, 5, 0, 1, 3, 4, 6, 7, /**/ 0, 2, 5, 1, 3, 4, 6, 7, //
3083
+ 1, 2, 5, 0, 3, 4, 6, 7, /**/ 0, 1, 2, 5, 3, 4, 6, 7, //
3084
+ 3, 5, 0, 1, 2, 4, 6, 7, /**/ 0, 3, 5, 1, 2, 4, 6, 7, //
3085
+ 1, 3, 5, 0, 2, 4, 6, 7, /**/ 0, 1, 3, 5, 2, 4, 6, 7, //
3086
+ 2, 3, 5, 0, 1, 4, 6, 7, /**/ 0, 2, 3, 5, 1, 4, 6, 7, //
3087
+ 1, 2, 3, 5, 0, 4, 6, 7, /**/ 0, 1, 2, 3, 5, 4, 6, 7, //
3088
+ 4, 5, 0, 1, 2, 3, 6, 7, /**/ 0, 4, 5, 1, 2, 3, 6, 7, //
3089
+ 1, 4, 5, 0, 2, 3, 6, 7, /**/ 0, 1, 4, 5, 2, 3, 6, 7, //
3090
+ 2, 4, 5, 0, 1, 3, 6, 7, /**/ 0, 2, 4, 5, 1, 3, 6, 7, //
3091
+ 1, 2, 4, 5, 0, 3, 6, 7, /**/ 0, 1, 2, 4, 5, 3, 6, 7, //
3092
+ 3, 4, 5, 0, 1, 2, 6, 7, /**/ 0, 3, 4, 5, 1, 2, 6, 7, //
3093
+ 1, 3, 4, 5, 0, 2, 6, 7, /**/ 0, 1, 3, 4, 5, 2, 6, 7, //
3094
+ 2, 3, 4, 5, 0, 1, 6, 7, /**/ 0, 2, 3, 4, 5, 1, 6, 7, //
3095
+ 1, 2, 3, 4, 5, 0, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, //
3096
+ 6, 0, 1, 2, 3, 4, 5, 7, /**/ 0, 6, 1, 2, 3, 4, 5, 7, //
3097
+ 1, 6, 0, 2, 3, 4, 5, 7, /**/ 0, 1, 6, 2, 3, 4, 5, 7, //
3098
+ 2, 6, 0, 1, 3, 4, 5, 7, /**/ 0, 2, 6, 1, 3, 4, 5, 7, //
3099
+ 1, 2, 6, 0, 3, 4, 5, 7, /**/ 0, 1, 2, 6, 3, 4, 5, 7, //
3100
+ 3, 6, 0, 1, 2, 4, 5, 7, /**/ 0, 3, 6, 1, 2, 4, 5, 7, //
3101
+ 1, 3, 6, 0, 2, 4, 5, 7, /**/ 0, 1, 3, 6, 2, 4, 5, 7, //
3102
+ 2, 3, 6, 0, 1, 4, 5, 7, /**/ 0, 2, 3, 6, 1, 4, 5, 7, //
3103
+ 1, 2, 3, 6, 0, 4, 5, 7, /**/ 0, 1, 2, 3, 6, 4, 5, 7, //
3104
+ 4, 6, 0, 1, 2, 3, 5, 7, /**/ 0, 4, 6, 1, 2, 3, 5, 7, //
3105
+ 1, 4, 6, 0, 2, 3, 5, 7, /**/ 0, 1, 4, 6, 2, 3, 5, 7, //
3106
+ 2, 4, 6, 0, 1, 3, 5, 7, /**/ 0, 2, 4, 6, 1, 3, 5, 7, //
3107
+ 1, 2, 4, 6, 0, 3, 5, 7, /**/ 0, 1, 2, 4, 6, 3, 5, 7, //
3108
+ 3, 4, 6, 0, 1, 2, 5, 7, /**/ 0, 3, 4, 6, 1, 2, 5, 7, //
3109
+ 1, 3, 4, 6, 0, 2, 5, 7, /**/ 0, 1, 3, 4, 6, 2, 5, 7, //
3110
+ 2, 3, 4, 6, 0, 1, 5, 7, /**/ 0, 2, 3, 4, 6, 1, 5, 7, //
3111
+ 1, 2, 3, 4, 6, 0, 5, 7, /**/ 0, 1, 2, 3, 4, 6, 5, 7, //
3112
+ 5, 6, 0, 1, 2, 3, 4, 7, /**/ 0, 5, 6, 1, 2, 3, 4, 7, //
3113
+ 1, 5, 6, 0, 2, 3, 4, 7, /**/ 0, 1, 5, 6, 2, 3, 4, 7, //
3114
+ 2, 5, 6, 0, 1, 3, 4, 7, /**/ 0, 2, 5, 6, 1, 3, 4, 7, //
3115
+ 1, 2, 5, 6, 0, 3, 4, 7, /**/ 0, 1, 2, 5, 6, 3, 4, 7, //
3116
+ 3, 5, 6, 0, 1, 2, 4, 7, /**/ 0, 3, 5, 6, 1, 2, 4, 7, //
3117
+ 1, 3, 5, 6, 0, 2, 4, 7, /**/ 0, 1, 3, 5, 6, 2, 4, 7, //
3118
+ 2, 3, 5, 6, 0, 1, 4, 7, /**/ 0, 2, 3, 5, 6, 1, 4, 7, //
3119
+ 1, 2, 3, 5, 6, 0, 4, 7, /**/ 0, 1, 2, 3, 5, 6, 4, 7, //
3120
+ 4, 5, 6, 0, 1, 2, 3, 7, /**/ 0, 4, 5, 6, 1, 2, 3, 7, //
3121
+ 1, 4, 5, 6, 0, 2, 3, 7, /**/ 0, 1, 4, 5, 6, 2, 3, 7, //
3122
+ 2, 4, 5, 6, 0, 1, 3, 7, /**/ 0, 2, 4, 5, 6, 1, 3, 7, //
3123
+ 1, 2, 4, 5, 6, 0, 3, 7, /**/ 0, 1, 2, 4, 5, 6, 3, 7, //
3124
+ 3, 4, 5, 6, 0, 1, 2, 7, /**/ 0, 3, 4, 5, 6, 1, 2, 7, //
3125
+ 1, 3, 4, 5, 6, 0, 2, 7, /**/ 0, 1, 3, 4, 5, 6, 2, 7, //
3126
+ 2, 3, 4, 5, 6, 0, 1, 7, /**/ 0, 2, 3, 4, 5, 6, 1, 7, //
3127
+ 1, 2, 3, 4, 5, 6, 0, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, //
3128
+ 7, 0, 1, 2, 3, 4, 5, 6, /**/ 0, 7, 1, 2, 3, 4, 5, 6, //
3129
+ 1, 7, 0, 2, 3, 4, 5, 6, /**/ 0, 1, 7, 2, 3, 4, 5, 6, //
3130
+ 2, 7, 0, 1, 3, 4, 5, 6, /**/ 0, 2, 7, 1, 3, 4, 5, 6, //
3131
+ 1, 2, 7, 0, 3, 4, 5, 6, /**/ 0, 1, 2, 7, 3, 4, 5, 6, //
3132
+ 3, 7, 0, 1, 2, 4, 5, 6, /**/ 0, 3, 7, 1, 2, 4, 5, 6, //
3133
+ 1, 3, 7, 0, 2, 4, 5, 6, /**/ 0, 1, 3, 7, 2, 4, 5, 6, //
3134
+ 2, 3, 7, 0, 1, 4, 5, 6, /**/ 0, 2, 3, 7, 1, 4, 5, 6, //
3135
+ 1, 2, 3, 7, 0, 4, 5, 6, /**/ 0, 1, 2, 3, 7, 4, 5, 6, //
3136
+ 4, 7, 0, 1, 2, 3, 5, 6, /**/ 0, 4, 7, 1, 2, 3, 5, 6, //
3137
+ 1, 4, 7, 0, 2, 3, 5, 6, /**/ 0, 1, 4, 7, 2, 3, 5, 6, //
3138
+ 2, 4, 7, 0, 1, 3, 5, 6, /**/ 0, 2, 4, 7, 1, 3, 5, 6, //
3139
+ 1, 2, 4, 7, 0, 3, 5, 6, /**/ 0, 1, 2, 4, 7, 3, 5, 6, //
3140
+ 3, 4, 7, 0, 1, 2, 5, 6, /**/ 0, 3, 4, 7, 1, 2, 5, 6, //
3141
+ 1, 3, 4, 7, 0, 2, 5, 6, /**/ 0, 1, 3, 4, 7, 2, 5, 6, //
3142
+ 2, 3, 4, 7, 0, 1, 5, 6, /**/ 0, 2, 3, 4, 7, 1, 5, 6, //
3143
+ 1, 2, 3, 4, 7, 0, 5, 6, /**/ 0, 1, 2, 3, 4, 7, 5, 6, //
3144
+ 5, 7, 0, 1, 2, 3, 4, 6, /**/ 0, 5, 7, 1, 2, 3, 4, 6, //
3145
+ 1, 5, 7, 0, 2, 3, 4, 6, /**/ 0, 1, 5, 7, 2, 3, 4, 6, //
3146
+ 2, 5, 7, 0, 1, 3, 4, 6, /**/ 0, 2, 5, 7, 1, 3, 4, 6, //
3147
+ 1, 2, 5, 7, 0, 3, 4, 6, /**/ 0, 1, 2, 5, 7, 3, 4, 6, //
3148
+ 3, 5, 7, 0, 1, 2, 4, 6, /**/ 0, 3, 5, 7, 1, 2, 4, 6, //
3149
+ 1, 3, 5, 7, 0, 2, 4, 6, /**/ 0, 1, 3, 5, 7, 2, 4, 6, //
3150
+ 2, 3, 5, 7, 0, 1, 4, 6, /**/ 0, 2, 3, 5, 7, 1, 4, 6, //
3151
+ 1, 2, 3, 5, 7, 0, 4, 6, /**/ 0, 1, 2, 3, 5, 7, 4, 6, //
3152
+ 4, 5, 7, 0, 1, 2, 3, 6, /**/ 0, 4, 5, 7, 1, 2, 3, 6, //
3153
+ 1, 4, 5, 7, 0, 2, 3, 6, /**/ 0, 1, 4, 5, 7, 2, 3, 6, //
3154
+ 2, 4, 5, 7, 0, 1, 3, 6, /**/ 0, 2, 4, 5, 7, 1, 3, 6, //
3155
+ 1, 2, 4, 5, 7, 0, 3, 6, /**/ 0, 1, 2, 4, 5, 7, 3, 6, //
3156
+ 3, 4, 5, 7, 0, 1, 2, 6, /**/ 0, 3, 4, 5, 7, 1, 2, 6, //
3157
+ 1, 3, 4, 5, 7, 0, 2, 6, /**/ 0, 1, 3, 4, 5, 7, 2, 6, //
3158
+ 2, 3, 4, 5, 7, 0, 1, 6, /**/ 0, 2, 3, 4, 5, 7, 1, 6, //
3159
+ 1, 2, 3, 4, 5, 7, 0, 6, /**/ 0, 1, 2, 3, 4, 5, 7, 6, //
3160
+ 6, 7, 0, 1, 2, 3, 4, 5, /**/ 0, 6, 7, 1, 2, 3, 4, 5, //
3161
+ 1, 6, 7, 0, 2, 3, 4, 5, /**/ 0, 1, 6, 7, 2, 3, 4, 5, //
3162
+ 2, 6, 7, 0, 1, 3, 4, 5, /**/ 0, 2, 6, 7, 1, 3, 4, 5, //
3163
+ 1, 2, 6, 7, 0, 3, 4, 5, /**/ 0, 1, 2, 6, 7, 3, 4, 5, //
3164
+ 3, 6, 7, 0, 1, 2, 4, 5, /**/ 0, 3, 6, 7, 1, 2, 4, 5, //
3165
+ 1, 3, 6, 7, 0, 2, 4, 5, /**/ 0, 1, 3, 6, 7, 2, 4, 5, //
3166
+ 2, 3, 6, 7, 0, 1, 4, 5, /**/ 0, 2, 3, 6, 7, 1, 4, 5, //
3167
+ 1, 2, 3, 6, 7, 0, 4, 5, /**/ 0, 1, 2, 3, 6, 7, 4, 5, //
3168
+ 4, 6, 7, 0, 1, 2, 3, 5, /**/ 0, 4, 6, 7, 1, 2, 3, 5, //
3169
+ 1, 4, 6, 7, 0, 2, 3, 5, /**/ 0, 1, 4, 6, 7, 2, 3, 5, //
3170
+ 2, 4, 6, 7, 0, 1, 3, 5, /**/ 0, 2, 4, 6, 7, 1, 3, 5, //
3171
+ 1, 2, 4, 6, 7, 0, 3, 5, /**/ 0, 1, 2, 4, 6, 7, 3, 5, //
3172
+ 3, 4, 6, 7, 0, 1, 2, 5, /**/ 0, 3, 4, 6, 7, 1, 2, 5, //
3173
+ 1, 3, 4, 6, 7, 0, 2, 5, /**/ 0, 1, 3, 4, 6, 7, 2, 5, //
3174
+ 2, 3, 4, 6, 7, 0, 1, 5, /**/ 0, 2, 3, 4, 6, 7, 1, 5, //
3175
+ 1, 2, 3, 4, 6, 7, 0, 5, /**/ 0, 1, 2, 3, 4, 6, 7, 5, //
3176
+ 5, 6, 7, 0, 1, 2, 3, 4, /**/ 0, 5, 6, 7, 1, 2, 3, 4, //
3177
+ 1, 5, 6, 7, 0, 2, 3, 4, /**/ 0, 1, 5, 6, 7, 2, 3, 4, //
3178
+ 2, 5, 6, 7, 0, 1, 3, 4, /**/ 0, 2, 5, 6, 7, 1, 3, 4, //
3179
+ 1, 2, 5, 6, 7, 0, 3, 4, /**/ 0, 1, 2, 5, 6, 7, 3, 4, //
3180
+ 3, 5, 6, 7, 0, 1, 2, 4, /**/ 0, 3, 5, 6, 7, 1, 2, 4, //
3181
+ 1, 3, 5, 6, 7, 0, 2, 4, /**/ 0, 1, 3, 5, 6, 7, 2, 4, //
3182
+ 2, 3, 5, 6, 7, 0, 1, 4, /**/ 0, 2, 3, 5, 6, 7, 1, 4, //
3183
+ 1, 2, 3, 5, 6, 7, 0, 4, /**/ 0, 1, 2, 3, 5, 6, 7, 4, //
3184
+ 4, 5, 6, 7, 0, 1, 2, 3, /**/ 0, 4, 5, 6, 7, 1, 2, 3, //
3185
+ 1, 4, 5, 6, 7, 0, 2, 3, /**/ 0, 1, 4, 5, 6, 7, 2, 3, //
3186
+ 2, 4, 5, 6, 7, 0, 1, 3, /**/ 0, 2, 4, 5, 6, 7, 1, 3, //
3187
+ 1, 2, 4, 5, 6, 7, 0, 3, /**/ 0, 1, 2, 4, 5, 6, 7, 3, //
3188
+ 3, 4, 5, 6, 7, 0, 1, 2, /**/ 0, 3, 4, 5, 6, 7, 1, 2, //
3189
+ 1, 3, 4, 5, 6, 7, 0, 2, /**/ 0, 1, 3, 4, 5, 6, 7, 2, //
3190
+ 2, 3, 4, 5, 6, 7, 0, 1, /**/ 0, 2, 3, 4, 5, 6, 7, 1, //
3191
+ 1, 2, 3, 4, 5, 6, 7, 0, /**/ 0, 1, 2, 3, 4, 5, 6, 7};
3192
+
3193
+ for (size_t i = 0; i < Lanes(d); i += 8) {
3194
+ // Each byte worth of bits is the index of one of 256 8-byte ranges, and its
3195
+ // population count determines how far to advance the write position.
3196
+ const size_t bits8 = bits[i / 8];
3197
+ const auto indices = Load(d8, table + bits8 * 8);
3198
+ const auto compressed = TableLookupBytes(LoadU(d8, lanes + i), indices);
3199
+ StoreU(compressed, d8, pos);
3200
+ pos += PopCount(bits8);
3201
+ }
3202
+ return static_cast<size_t>(pos - unaligned);
3203
+ }
3204
+
3205
+ template <class V, class M, class D, typename T, HWY_IF_T_SIZE(T, 1)>
3206
+ HWY_API size_t CompressStore(V v, M mask, D d, T* HWY_RESTRICT unaligned) {
3207
+ uint8_t bits[HWY_MAX(size_t{8}, MaxLanes(d) / 8)];
3208
+ (void)StoreMaskBits(d, mask, bits);
3209
+ return CompressBitsStore(v, bits, d, unaligned);
3210
+ }
3211
+
3212
+ template <class V, class M, class D, typename T, HWY_IF_T_SIZE(T, 1)>
3213
+ HWY_API size_t CompressBlendedStore(V v, M mask, D d,
3214
+ T* HWY_RESTRICT unaligned) {
3215
+ HWY_ALIGN T buf[MaxLanes(d)];
3216
+ const size_t bytes = CompressStore(v, mask, d, buf);
3217
+ BlendedStore(Load(d, buf), FirstN(d, bytes), d, unaligned);
3218
+ return bytes;
3219
+ }
3220
+
3221
+ // For reasons unknown, HWY_IF_T_SIZE_V is a compile error in SVE.
3222
+ template <class V, class M, typename T = TFromV<V>, HWY_IF_T_SIZE(T, 1)>
3223
+ HWY_API V Compress(V v, const M mask) {
3224
+ const DFromV<V> d;
3225
+ HWY_ALIGN T lanes[MaxLanes(d)];
3226
+ (void)CompressStore(v, mask, d, lanes);
3227
+ return Load(d, lanes);
3228
+ }
3229
+
3230
+ template <class V, typename T = TFromV<V>, HWY_IF_T_SIZE(T, 1)>
3231
+ HWY_API V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) {
3232
+ const DFromV<V> d;
3233
+ HWY_ALIGN T lanes[MaxLanes(d)];
3234
+ (void)CompressBitsStore(v, bits, d, lanes);
3235
+ return Load(d, lanes);
3236
+ }
3237
+
3238
+ template <class V, class M, typename T = TFromV<V>, HWY_IF_T_SIZE(T, 1)>
3239
+ HWY_API V CompressNot(V v, M mask) {
3240
+ return Compress(v, Not(mask));
3241
+ }
3242
+
3243
+ #endif // HWY_NATIVE_COMPRESS8
3244
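A usage sketch for this section, written as if inside HWY_NAMESPACE and using a fixed 8-lane tag for brevity: lanes selected by the mask are packed to the front of the output.

const FixedTag<uint8_t, 8> d8;
const auto v = Iota(d8, 0);                                   // 0, 1, 2, ..., 7
const auto even = Eq(And(v, Set(d8, uint8_t{1})), Zero(d8));  // mask of even lanes
uint8_t out[8] = {};
const size_t count = CompressStore(v, even, d8, out);         // out = {0, 2, 4, 6}, count = 4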
+
3245
+ // ------------------------------ Expand
3246
+
3247
+ // "Include guard": skip if native 8/16-bit Expand/LoadExpand are available.
3248
+ // Note that this generic implementation assumes <= 128 bit fixed vectors;
3249
+ // the SVE and RVV targets provide their own native implementations.
3250
+ #if (defined(HWY_NATIVE_EXPAND) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
3251
+ #ifdef HWY_NATIVE_EXPAND
3252
+ #undef HWY_NATIVE_EXPAND
3253
+ #else
3254
+ #define HWY_NATIVE_EXPAND
3255
+ #endif
3256
+
3257
+ namespace detail {
3258
+
3259
+ #if HWY_IDE
3260
+ template <class M>
3261
+ HWY_INLINE uint64_t BitsFromMask(M /* mask */) {
3262
+ return 0;
3263
+ }
3264
+ #endif // HWY_IDE
3265
+
3266
+ template <size_t N>
3267
+ HWY_INLINE Vec128<uint8_t, N> IndicesForExpandFromBits(uint64_t mask_bits) {
3268
+ static_assert(N <= 8, "Should only be called for half-vectors");
3269
+ const Simd<uint8_t, N, 0> du8;
3270
+ HWY_DASSERT(mask_bits < 0x100);
3271
+ alignas(16) static constexpr uint8_t table[2048] = {
3272
+ // PrintExpand8x8Tables
3273
+ 128, 128, 128, 128, 128, 128, 128, 128, //
3274
+ 0, 128, 128, 128, 128, 128, 128, 128, //
3275
+ 128, 0, 128, 128, 128, 128, 128, 128, //
3276
+ 0, 1, 128, 128, 128, 128, 128, 128, //
3277
+ 128, 128, 0, 128, 128, 128, 128, 128, //
3278
+ 0, 128, 1, 128, 128, 128, 128, 128, //
3279
+ 128, 0, 1, 128, 128, 128, 128, 128, //
3280
+ 0, 1, 2, 128, 128, 128, 128, 128, //
3281
+ 128, 128, 128, 0, 128, 128, 128, 128, //
3282
+ 0, 128, 128, 1, 128, 128, 128, 128, //
3283
+ 128, 0, 128, 1, 128, 128, 128, 128, //
3284
+ 0, 1, 128, 2, 128, 128, 128, 128, //
3285
+ 128, 128, 0, 1, 128, 128, 128, 128, //
3286
+ 0, 128, 1, 2, 128, 128, 128, 128, //
3287
+ 128, 0, 1, 2, 128, 128, 128, 128, //
3288
+ 0, 1, 2, 3, 128, 128, 128, 128, //
3289
+ 128, 128, 128, 128, 0, 128, 128, 128, //
3290
+ 0, 128, 128, 128, 1, 128, 128, 128, //
3291
+ 128, 0, 128, 128, 1, 128, 128, 128, //
3292
+ 0, 1, 128, 128, 2, 128, 128, 128, //
3293
+ 128, 128, 0, 128, 1, 128, 128, 128, //
3294
+ 0, 128, 1, 128, 2, 128, 128, 128, //
3295
+ 128, 0, 1, 128, 2, 128, 128, 128, //
3296
+ 0, 1, 2, 128, 3, 128, 128, 128, //
3297
+ 128, 128, 128, 0, 1, 128, 128, 128, //
3298
+ 0, 128, 128, 1, 2, 128, 128, 128, //
3299
+ 128, 0, 128, 1, 2, 128, 128, 128, //
3300
+ 0, 1, 128, 2, 3, 128, 128, 128, //
3301
+ 128, 128, 0, 1, 2, 128, 128, 128, //
3302
+ 0, 128, 1, 2, 3, 128, 128, 128, //
3303
+ 128, 0, 1, 2, 3, 128, 128, 128, //
3304
+ 0, 1, 2, 3, 4, 128, 128, 128, //
3305
+ 128, 128, 128, 128, 128, 0, 128, 128, //
3306
+ 0, 128, 128, 128, 128, 1, 128, 128, //
3307
+ 128, 0, 128, 128, 128, 1, 128, 128, //
3308
+ 0, 1, 128, 128, 128, 2, 128, 128, //
3309
+ 128, 128, 0, 128, 128, 1, 128, 128, //
3310
+ 0, 128, 1, 128, 128, 2, 128, 128, //
3311
+ 128, 0, 1, 128, 128, 2, 128, 128, //
3312
+ 0, 1, 2, 128, 128, 3, 128, 128, //
3313
+ 128, 128, 128, 0, 128, 1, 128, 128, //
3314
+ 0, 128, 128, 1, 128, 2, 128, 128, //
3315
+ 128, 0, 128, 1, 128, 2, 128, 128, //
3316
+ 0, 1, 128, 2, 128, 3, 128, 128, //
3317
+ 128, 128, 0, 1, 128, 2, 128, 128, //
3318
+ 0, 128, 1, 2, 128, 3, 128, 128, //
3319
+ 128, 0, 1, 2, 128, 3, 128, 128, //
3320
+ 0, 1, 2, 3, 128, 4, 128, 128, //
3321
+ 128, 128, 128, 128, 0, 1, 128, 128, //
3322
+ 0, 128, 128, 128, 1, 2, 128, 128, //
3323
+ 128, 0, 128, 128, 1, 2, 128, 128, //
3324
+ 0, 1, 128, 128, 2, 3, 128, 128, //
3325
+ 128, 128, 0, 128, 1, 2, 128, 128, //
3326
+ 0, 128, 1, 128, 2, 3, 128, 128, //
3327
+ 128, 0, 1, 128, 2, 3, 128, 128, //
3328
+ 0, 1, 2, 128, 3, 4, 128, 128, //
3329
+ 128, 128, 128, 0, 1, 2, 128, 128, //
3330
+ 0, 128, 128, 1, 2, 3, 128, 128, //
3331
+ 128, 0, 128, 1, 2, 3, 128, 128, //
3332
+ 0, 1, 128, 2, 3, 4, 128, 128, //
3333
+ 128, 128, 0, 1, 2, 3, 128, 128, //
3334
+ 0, 128, 1, 2, 3, 4, 128, 128, //
3335
+ 128, 0, 1, 2, 3, 4, 128, 128, //
3336
+ 0, 1, 2, 3, 4, 5, 128, 128, //
3337
+ 128, 128, 128, 128, 128, 128, 0, 128, //
3338
+ 0, 128, 128, 128, 128, 128, 1, 128, //
3339
+ 128, 0, 128, 128, 128, 128, 1, 128, //
3340
+ 0, 1, 128, 128, 128, 128, 2, 128, //
3341
+ 128, 128, 0, 128, 128, 128, 1, 128, //
3342
+ 0, 128, 1, 128, 128, 128, 2, 128, //
3343
+ 128, 0, 1, 128, 128, 128, 2, 128, //
3344
+ 0, 1, 2, 128, 128, 128, 3, 128, //
3345
+ 128, 128, 128, 0, 128, 128, 1, 128, //
3346
+ 0, 128, 128, 1, 128, 128, 2, 128, //
3347
+ 128, 0, 128, 1, 128, 128, 2, 128, //
3348
+ 0, 1, 128, 2, 128, 128, 3, 128, //
3349
+ 128, 128, 0, 1, 128, 128, 2, 128, //
3350
+ 0, 128, 1, 2, 128, 128, 3, 128, //
3351
+ 128, 0, 1, 2, 128, 128, 3, 128, //
3352
+ 0, 1, 2, 3, 128, 128, 4, 128, //
3353
+ 128, 128, 128, 128, 0, 128, 1, 128, //
3354
+ 0, 128, 128, 128, 1, 128, 2, 128, //
3355
+ 128, 0, 128, 128, 1, 128, 2, 128, //
3356
+ 0, 1, 128, 128, 2, 128, 3, 128, //
3357
+ 128, 128, 0, 128, 1, 128, 2, 128, //
3358
+ 0, 128, 1, 128, 2, 128, 3, 128, //
3359
+ 128, 0, 1, 128, 2, 128, 3, 128, //
3360
+ 0, 1, 2, 128, 3, 128, 4, 128, //
3361
+ 128, 128, 128, 0, 1, 128, 2, 128, //
3362
+ 0, 128, 128, 1, 2, 128, 3, 128, //
3363
+ 128, 0, 128, 1, 2, 128, 3, 128, //
3364
+ 0, 1, 128, 2, 3, 128, 4, 128, //
3365
+ 128, 128, 0, 1, 2, 128, 3, 128, //
3366
+ 0, 128, 1, 2, 3, 128, 4, 128, //
3367
+ 128, 0, 1, 2, 3, 128, 4, 128, //
3368
+ 0, 1, 2, 3, 4, 128, 5, 128, //
3369
+ 128, 128, 128, 128, 128, 0, 1, 128, //
3370
+ 0, 128, 128, 128, 128, 1, 2, 128, //
3371
+ 128, 0, 128, 128, 128, 1, 2, 128, //
3372
+ 0, 1, 128, 128, 128, 2, 3, 128, //
3373
+ 128, 128, 0, 128, 128, 1, 2, 128, //
3374
+ 0, 128, 1, 128, 128, 2, 3, 128, //
3375
+ 128, 0, 1, 128, 128, 2, 3, 128, //
3376
+ 0, 1, 2, 128, 128, 3, 4, 128, //
3377
+ 128, 128, 128, 0, 128, 1, 2, 128, //
3378
+ 0, 128, 128, 1, 128, 2, 3, 128, //
3379
+ 128, 0, 128, 1, 128, 2, 3, 128, //
3380
+ 0, 1, 128, 2, 128, 3, 4, 128, //
3381
+ 128, 128, 0, 1, 128, 2, 3, 128, //
3382
+ 0, 128, 1, 2, 128, 3, 4, 128, //
3383
+ 128, 0, 1, 2, 128, 3, 4, 128, //
3384
+ 0, 1, 2, 3, 128, 4, 5, 128, //
3385
+ 128, 128, 128, 128, 0, 1, 2, 128, //
3386
+ 0, 128, 128, 128, 1, 2, 3, 128, //
3387
+ 128, 0, 128, 128, 1, 2, 3, 128, //
3388
+ 0, 1, 128, 128, 2, 3, 4, 128, //
3389
+ 128, 128, 0, 128, 1, 2, 3, 128, //
3390
+ 0, 128, 1, 128, 2, 3, 4, 128, //
3391
+ 128, 0, 1, 128, 2, 3, 4, 128, //
3392
+ 0, 1, 2, 128, 3, 4, 5, 128, //
3393
+ 128, 128, 128, 0, 1, 2, 3, 128, //
3394
+ 0, 128, 128, 1, 2, 3, 4, 128, //
3395
+ 128, 0, 128, 1, 2, 3, 4, 128, //
3396
+ 0, 1, 128, 2, 3, 4, 5, 128, //
3397
+ 128, 128, 0, 1, 2, 3, 4, 128, //
3398
+ 0, 128, 1, 2, 3, 4, 5, 128, //
3399
+ 128, 0, 1, 2, 3, 4, 5, 128, //
3400
+ 0, 1, 2, 3, 4, 5, 6, 128, //
3401
+ 128, 128, 128, 128, 128, 128, 128, 0, //
3402
+ 0, 128, 128, 128, 128, 128, 128, 1, //
3403
+ 128, 0, 128, 128, 128, 128, 128, 1, //
3404
+ 0, 1, 128, 128, 128, 128, 128, 2, //
3405
+ 128, 128, 0, 128, 128, 128, 128, 1, //
3406
+ 0, 128, 1, 128, 128, 128, 128, 2, //
3407
+ 128, 0, 1, 128, 128, 128, 128, 2, //
3408
+ 0, 1, 2, 128, 128, 128, 128, 3, //
3409
+ 128, 128, 128, 0, 128, 128, 128, 1, //
3410
+ 0, 128, 128, 1, 128, 128, 128, 2, //
3411
+ 128, 0, 128, 1, 128, 128, 128, 2, //
3412
+ 0, 1, 128, 2, 128, 128, 128, 3, //
3413
+ 128, 128, 0, 1, 128, 128, 128, 2, //
3414
+ 0, 128, 1, 2, 128, 128, 128, 3, //
3415
+ 128, 0, 1, 2, 128, 128, 128, 3, //
3416
+ 0, 1, 2, 3, 128, 128, 128, 4, //
3417
+ 128, 128, 128, 128, 0, 128, 128, 1, //
3418
+ 0, 128, 128, 128, 1, 128, 128, 2, //
3419
+ 128, 0, 128, 128, 1, 128, 128, 2, //
3420
+ 0, 1, 128, 128, 2, 128, 128, 3, //
3421
+ 128, 128, 0, 128, 1, 128, 128, 2, //
3422
+ 0, 128, 1, 128, 2, 128, 128, 3, //
3423
+ 128, 0, 1, 128, 2, 128, 128, 3, //
3424
+ 0, 1, 2, 128, 3, 128, 128, 4, //
3425
+ 128, 128, 128, 0, 1, 128, 128, 2, //
3426
+ 0, 128, 128, 1, 2, 128, 128, 3, //
3427
+ 128, 0, 128, 1, 2, 128, 128, 3, //
3428
+ 0, 1, 128, 2, 3, 128, 128, 4, //
3429
+ 128, 128, 0, 1, 2, 128, 128, 3, //
3430
+ 0, 128, 1, 2, 3, 128, 128, 4, //
3431
+ 128, 0, 1, 2, 3, 128, 128, 4, //
3432
+ 0, 1, 2, 3, 4, 128, 128, 5, //
3433
+ 128, 128, 128, 128, 128, 0, 128, 1, //
3434
+ 0, 128, 128, 128, 128, 1, 128, 2, //
3435
+ 128, 0, 128, 128, 128, 1, 128, 2, //
3436
+ 0, 1, 128, 128, 128, 2, 128, 3, //
3437
+ 128, 128, 0, 128, 128, 1, 128, 2, //
3438
+ 0, 128, 1, 128, 128, 2, 128, 3, //
3439
+ 128, 0, 1, 128, 128, 2, 128, 3, //
3440
+ 0, 1, 2, 128, 128, 3, 128, 4, //
3441
+ 128, 128, 128, 0, 128, 1, 128, 2, //
3442
+ 0, 128, 128, 1, 128, 2, 128, 3, //
3443
+ 128, 0, 128, 1, 128, 2, 128, 3, //
3444
+ 0, 1, 128, 2, 128, 3, 128, 4, //
3445
+ 128, 128, 0, 1, 128, 2, 128, 3, //
3446
+ 0, 128, 1, 2, 128, 3, 128, 4, //
3447
+ 128, 0, 1, 2, 128, 3, 128, 4, //
3448
+ 0, 1, 2, 3, 128, 4, 128, 5, //
3449
+ 128, 128, 128, 128, 0, 1, 128, 2, //
3450
+ 0, 128, 128, 128, 1, 2, 128, 3, //
3451
+ 128, 0, 128, 128, 1, 2, 128, 3, //
3452
+ 0, 1, 128, 128, 2, 3, 128, 4, //
3453
+ 128, 128, 0, 128, 1, 2, 128, 3, //
3454
+ 0, 128, 1, 128, 2, 3, 128, 4, //
3455
+ 128, 0, 1, 128, 2, 3, 128, 4, //
3456
+ 0, 1, 2, 128, 3, 4, 128, 5, //
3457
+ 128, 128, 128, 0, 1, 2, 128, 3, //
3458
+ 0, 128, 128, 1, 2, 3, 128, 4, //
3459
+ 128, 0, 128, 1, 2, 3, 128, 4, //
3460
+ 0, 1, 128, 2, 3, 4, 128, 5, //
3461
+ 128, 128, 0, 1, 2, 3, 128, 4, //
3462
+ 0, 128, 1, 2, 3, 4, 128, 5, //
3463
+ 128, 0, 1, 2, 3, 4, 128, 5, //
3464
+ 0, 1, 2, 3, 4, 5, 128, 6, //
3465
+ 128, 128, 128, 128, 128, 128, 0, 1, //
3466
+ 0, 128, 128, 128, 128, 128, 1, 2, //
3467
+ 128, 0, 128, 128, 128, 128, 1, 2, //
3468
+ 0, 1, 128, 128, 128, 128, 2, 3, //
3469
+ 128, 128, 0, 128, 128, 128, 1, 2, //
3470
+ 0, 128, 1, 128, 128, 128, 2, 3, //
3471
+ 128, 0, 1, 128, 128, 128, 2, 3, //
3472
+ 0, 1, 2, 128, 128, 128, 3, 4, //
3473
+ 128, 128, 128, 0, 128, 128, 1, 2, //
3474
+ 0, 128, 128, 1, 128, 128, 2, 3, //
3475
+ 128, 0, 128, 1, 128, 128, 2, 3, //
3476
+ 0, 1, 128, 2, 128, 128, 3, 4, //
3477
+ 128, 128, 0, 1, 128, 128, 2, 3, //
3478
+ 0, 128, 1, 2, 128, 128, 3, 4, //
3479
+ 128, 0, 1, 2, 128, 128, 3, 4, //
3480
+ 0, 1, 2, 3, 128, 128, 4, 5, //
3481
+ 128, 128, 128, 128, 0, 128, 1, 2, //
3482
+ 0, 128, 128, 128, 1, 128, 2, 3, //
3483
+ 128, 0, 128, 128, 1, 128, 2, 3, //
3484
+ 0, 1, 128, 128, 2, 128, 3, 4, //
3485
+ 128, 128, 0, 128, 1, 128, 2, 3, //
3486
+ 0, 128, 1, 128, 2, 128, 3, 4, //
3487
+ 128, 0, 1, 128, 2, 128, 3, 4, //
3488
+ 0, 1, 2, 128, 3, 128, 4, 5, //
3489
+ 128, 128, 128, 0, 1, 128, 2, 3, //
3490
+ 0, 128, 128, 1, 2, 128, 3, 4, //
3491
+ 128, 0, 128, 1, 2, 128, 3, 4, //
3492
+ 0, 1, 128, 2, 3, 128, 4, 5, //
3493
+ 128, 128, 0, 1, 2, 128, 3, 4, //
3494
+ 0, 128, 1, 2, 3, 128, 4, 5, //
3495
+ 128, 0, 1, 2, 3, 128, 4, 5, //
3496
+ 0, 1, 2, 3, 4, 128, 5, 6, //
3497
+ 128, 128, 128, 128, 128, 0, 1, 2, //
3498
+ 0, 128, 128, 128, 128, 1, 2, 3, //
3499
+ 128, 0, 128, 128, 128, 1, 2, 3, //
3500
+ 0, 1, 128, 128, 128, 2, 3, 4, //
3501
+ 128, 128, 0, 128, 128, 1, 2, 3, //
3502
+ 0, 128, 1, 128, 128, 2, 3, 4, //
3503
+ 128, 0, 1, 128, 128, 2, 3, 4, //
3504
+ 0, 1, 2, 128, 128, 3, 4, 5, //
3505
+ 128, 128, 128, 0, 128, 1, 2, 3, //
3506
+ 0, 128, 128, 1, 128, 2, 3, 4, //
3507
+ 128, 0, 128, 1, 128, 2, 3, 4, //
3508
+ 0, 1, 128, 2, 128, 3, 4, 5, //
3509
+ 128, 128, 0, 1, 128, 2, 3, 4, //
3510
+ 0, 128, 1, 2, 128, 3, 4, 5, //
3511
+ 128, 0, 1, 2, 128, 3, 4, 5, //
3512
+ 0, 1, 2, 3, 128, 4, 5, 6, //
3513
+ 128, 128, 128, 128, 0, 1, 2, 3, //
3514
+ 0, 128, 128, 128, 1, 2, 3, 4, //
3515
+ 128, 0, 128, 128, 1, 2, 3, 4, //
3516
+ 0, 1, 128, 128, 2, 3, 4, 5, //
3517
+ 128, 128, 0, 128, 1, 2, 3, 4, //
3518
+ 0, 128, 1, 128, 2, 3, 4, 5, //
3519
+ 128, 0, 1, 128, 2, 3, 4, 5, //
3520
+ 0, 1, 2, 128, 3, 4, 5, 6, //
3521
+ 128, 128, 128, 0, 1, 2, 3, 4, //
3522
+ 0, 128, 128, 1, 2, 3, 4, 5, //
3523
+ 128, 0, 128, 1, 2, 3, 4, 5, //
3524
+ 0, 1, 128, 2, 3, 4, 5, 6, //
3525
+ 128, 128, 0, 1, 2, 3, 4, 5, //
3526
+ 0, 128, 1, 2, 3, 4, 5, 6, //
3527
+ 128, 0, 1, 2, 3, 4, 5, 6, //
3528
+ 0, 1, 2, 3, 4, 5, 6, 7};
3529
+ return LoadU(du8, table + mask_bits * 8);
3530
+ }
3531
+
3532
+ } // namespace detail
3533
+
3534
+ // Half vector of bytes: one table lookup
3535
+ template <typename T, size_t N, HWY_IF_T_SIZE(T, 1), HWY_IF_V_SIZE_LE(T, N, 8)>
3536
+ HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) {
3537
+ const DFromV<decltype(v)> d;
3538
+
3539
+ const uint64_t mask_bits = detail::BitsFromMask(mask);
3540
+ const Vec128<uint8_t, N> indices =
3541
+ detail::IndicesForExpandFromBits<N>(mask_bits);
3542
+ return BitCast(d, TableLookupBytesOr0(v, indices));
3543
+ }
3544
+
3545
+ // Full vector of bytes: two table lookups
3546
+ template <typename T, HWY_IF_T_SIZE(T, 1)>
3547
+ HWY_API Vec128<T> Expand(Vec128<T> v, Mask128<T> mask) {
3548
+ const Full128<T> d;
3549
+ const RebindToUnsigned<decltype(d)> du;
3550
+ const Half<decltype(du)> duh;
3551
+ const Vec128<uint8_t> vu = BitCast(du, v);
3552
+
3553
+ const uint64_t mask_bits = detail::BitsFromMask(mask);
3554
+ const uint64_t maskL = mask_bits & 0xFF;
3555
+ const uint64_t maskH = mask_bits >> 8;
3556
+
3557
+ // We want to skip past the v bytes already consumed by idxL. There is no
3558
+ // instruction for shift-reg by variable bytes. Storing v itself would work
3559
+ // but would involve a store-load forwarding stall. We instead shuffle using
3560
+ // loaded indices. multishift_epi64_epi8 would also help, but if we have that,
3561
+ // we probably also have native 8-bit Expand.
3562
+ alignas(16) static constexpr uint8_t iota[32] = {
3563
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
3564
+ 11, 12, 13, 14, 15, 128, 128, 128, 128, 128, 128,
3565
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128};
3566
+ const VFromD<decltype(du)> shift = LoadU(du, iota + PopCount(maskL));
3567
+ const VFromD<decltype(duh)> vL = LowerHalf(duh, vu);
3568
+ const VFromD<decltype(duh)> vH =
3569
+ LowerHalf(duh, TableLookupBytesOr0(vu, shift));
3570
+
3571
+ const VFromD<decltype(duh)> idxL = detail::IndicesForExpandFromBits<8>(maskL);
3572
+ const VFromD<decltype(duh)> idxH = detail::IndicesForExpandFromBits<8>(maskH);
3573
+
3574
+ const VFromD<decltype(duh)> expandL = TableLookupBytesOr0(vL, idxL);
3575
+ const VFromD<decltype(duh)> expandH = TableLookupBytesOr0(vH, idxH);
3576
+ return BitCast(d, Combine(du, expandH, expandL));
3577
+ }
3578
+
3579
+ template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
3580
+ HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) {
3581
+ const DFromV<decltype(v)> d;
3582
+ const RebindToUnsigned<decltype(d)> du;
3583
+
3584
+ const Rebind<uint8_t, decltype(d)> du8;
3585
+ const uint64_t mask_bits = detail::BitsFromMask(mask);
3586
+
3587
+ // Storing as 8-bit reduces table size from 4 KiB to 2 KiB. We cannot apply
3588
+ // the nibble trick used below because not all indices fit within one lane.
3589
+ alignas(16) static constexpr uint8_t table[2048] = {
3590
+ // PrintExpand16x8ByteTables
3591
+ 128, 128, 128, 128, 128, 128, 128, 128, //
3592
+ 0, 128, 128, 128, 128, 128, 128, 128, //
3593
+ 128, 0, 128, 128, 128, 128, 128, 128, //
3594
+ 0, 2, 128, 128, 128, 128, 128, 128, //
3595
+ 128, 128, 0, 128, 128, 128, 128, 128, //
3596
+ 0, 128, 2, 128, 128, 128, 128, 128, //
3597
+ 128, 0, 2, 128, 128, 128, 128, 128, //
3598
+ 0, 2, 4, 128, 128, 128, 128, 128, //
3599
+ 128, 128, 128, 0, 128, 128, 128, 128, //
3600
+ 0, 128, 128, 2, 128, 128, 128, 128, //
3601
+ 128, 0, 128, 2, 128, 128, 128, 128, //
3602
+ 0, 2, 128, 4, 128, 128, 128, 128, //
3603
+ 128, 128, 0, 2, 128, 128, 128, 128, //
3604
+ 0, 128, 2, 4, 128, 128, 128, 128, //
3605
+ 128, 0, 2, 4, 128, 128, 128, 128, //
3606
+ 0, 2, 4, 6, 128, 128, 128, 128, //
3607
+ 128, 128, 128, 128, 0, 128, 128, 128, //
3608
+ 0, 128, 128, 128, 2, 128, 128, 128, //
3609
+ 128, 0, 128, 128, 2, 128, 128, 128, //
3610
+ 0, 2, 128, 128, 4, 128, 128, 128, //
3611
+ 128, 128, 0, 128, 2, 128, 128, 128, //
3612
+ 0, 128, 2, 128, 4, 128, 128, 128, //
3613
+ 128, 0, 2, 128, 4, 128, 128, 128, //
3614
+ 0, 2, 4, 128, 6, 128, 128, 128, //
3615
+ 128, 128, 128, 0, 2, 128, 128, 128, //
3616
+ 0, 128, 128, 2, 4, 128, 128, 128, //
3617
+ 128, 0, 128, 2, 4, 128, 128, 128, //
3618
+ 0, 2, 128, 4, 6, 128, 128, 128, //
3619
+ 128, 128, 0, 2, 4, 128, 128, 128, //
3620
+ 0, 128, 2, 4, 6, 128, 128, 128, //
3621
+ 128, 0, 2, 4, 6, 128, 128, 128, //
3622
+ 0, 2, 4, 6, 8, 128, 128, 128, //
3623
+ 128, 128, 128, 128, 128, 0, 128, 128, //
3624
+ 0, 128, 128, 128, 128, 2, 128, 128, //
3625
+ 128, 0, 128, 128, 128, 2, 128, 128, //
3626
+ 0, 2, 128, 128, 128, 4, 128, 128, //
3627
+ 128, 128, 0, 128, 128, 2, 128, 128, //
3628
+ 0, 128, 2, 128, 128, 4, 128, 128, //
3629
+ 128, 0, 2, 128, 128, 4, 128, 128, //
3630
+ 0, 2, 4, 128, 128, 6, 128, 128, //
3631
+ 128, 128, 128, 0, 128, 2, 128, 128, //
3632
+ 0, 128, 128, 2, 128, 4, 128, 128, //
3633
+ 128, 0, 128, 2, 128, 4, 128, 128, //
3634
+ 0, 2, 128, 4, 128, 6, 128, 128, //
3635
+ 128, 128, 0, 2, 128, 4, 128, 128, //
3636
+ 0, 128, 2, 4, 128, 6, 128, 128, //
3637
+ 128, 0, 2, 4, 128, 6, 128, 128, //
3638
+ 0, 2, 4, 6, 128, 8, 128, 128, //
3639
+ 128, 128, 128, 128, 0, 2, 128, 128, //
3640
+ 0, 128, 128, 128, 2, 4, 128, 128, //
3641
+ 128, 0, 128, 128, 2, 4, 128, 128, //
3642
+ 0, 2, 128, 128, 4, 6, 128, 128, //
3643
+ 128, 128, 0, 128, 2, 4, 128, 128, //
3644
+ 0, 128, 2, 128, 4, 6, 128, 128, //
3645
+ 128, 0, 2, 128, 4, 6, 128, 128, //
3646
+ 0, 2, 4, 128, 6, 8, 128, 128, //
3647
+ 128, 128, 128, 0, 2, 4, 128, 128, //
3648
+ 0, 128, 128, 2, 4, 6, 128, 128, //
3649
+ 128, 0, 128, 2, 4, 6, 128, 128, //
3650
+ 0, 2, 128, 4, 6, 8, 128, 128, //
3651
+ 128, 128, 0, 2, 4, 6, 128, 128, //
3652
+ 0, 128, 2, 4, 6, 8, 128, 128, //
3653
+ 128, 0, 2, 4, 6, 8, 128, 128, //
3654
+ 0, 2, 4, 6, 8, 10, 128, 128, //
3655
+ 128, 128, 128, 128, 128, 128, 0, 128, //
3656
+ 0, 128, 128, 128, 128, 128, 2, 128, //
3657
+ 128, 0, 128, 128, 128, 128, 2, 128, //
3658
+ 0, 2, 128, 128, 128, 128, 4, 128, //
3659
+ 128, 128, 0, 128, 128, 128, 2, 128, //
3660
+ 0, 128, 2, 128, 128, 128, 4, 128, //
3661
+ 128, 0, 2, 128, 128, 128, 4, 128, //
3662
+ 0, 2, 4, 128, 128, 128, 6, 128, //
3663
+ 128, 128, 128, 0, 128, 128, 2, 128, //
3664
+ 0, 128, 128, 2, 128, 128, 4, 128, //
3665
+ 128, 0, 128, 2, 128, 128, 4, 128, //
3666
+ 0, 2, 128, 4, 128, 128, 6, 128, //
3667
+ 128, 128, 0, 2, 128, 128, 4, 128, //
3668
+ 0, 128, 2, 4, 128, 128, 6, 128, //
3669
+ 128, 0, 2, 4, 128, 128, 6, 128, //
3670
+ 0, 2, 4, 6, 128, 128, 8, 128, //
3671
+ 128, 128, 128, 128, 0, 128, 2, 128, //
3672
+ 0, 128, 128, 128, 2, 128, 4, 128, //
3673
+ 128, 0, 128, 128, 2, 128, 4, 128, //
3674
+ 0, 2, 128, 128, 4, 128, 6, 128, //
3675
+ 128, 128, 0, 128, 2, 128, 4, 128, //
3676
+ 0, 128, 2, 128, 4, 128, 6, 128, //
3677
+ 128, 0, 2, 128, 4, 128, 6, 128, //
3678
+ 0, 2, 4, 128, 6, 128, 8, 128, //
3679
+ 128, 128, 128, 0, 2, 128, 4, 128, //
3680
+ 0, 128, 128, 2, 4, 128, 6, 128, //
3681
+ 128, 0, 128, 2, 4, 128, 6, 128, //
3682
+ 0, 2, 128, 4, 6, 128, 8, 128, //
3683
+ 128, 128, 0, 2, 4, 128, 6, 128, //
3684
+ 0, 128, 2, 4, 6, 128, 8, 128, //
3685
+ 128, 0, 2, 4, 6, 128, 8, 128, //
3686
+ 0, 2, 4, 6, 8, 128, 10, 128, //
3687
+ 128, 128, 128, 128, 128, 0, 2, 128, //
3688
+ 0, 128, 128, 128, 128, 2, 4, 128, //
3689
+ 128, 0, 128, 128, 128, 2, 4, 128, //
3690
+ 0, 2, 128, 128, 128, 4, 6, 128, //
3691
+ 128, 128, 0, 128, 128, 2, 4, 128, //
3692
+ 0, 128, 2, 128, 128, 4, 6, 128, //
3693
+ 128, 0, 2, 128, 128, 4, 6, 128, //
3694
+ 0, 2, 4, 128, 128, 6, 8, 128, //
3695
+ 128, 128, 128, 0, 128, 2, 4, 128, //
3696
+ 0, 128, 128, 2, 128, 4, 6, 128, //
3697
+ 128, 0, 128, 2, 128, 4, 6, 128, //
3698
+ 0, 2, 128, 4, 128, 6, 8, 128, //
3699
+ 128, 128, 0, 2, 128, 4, 6, 128, //
3700
+ 0, 128, 2, 4, 128, 6, 8, 128, //
3701
+ 128, 0, 2, 4, 128, 6, 8, 128, //
3702
+ 0, 2, 4, 6, 128, 8, 10, 128, //
3703
+ 128, 128, 128, 128, 0, 2, 4, 128, //
3704
+ 0, 128, 128, 128, 2, 4, 6, 128, //
3705
+ 128, 0, 128, 128, 2, 4, 6, 128, //
3706
+ 0, 2, 128, 128, 4, 6, 8, 128, //
3707
+ 128, 128, 0, 128, 2, 4, 6, 128, //
3708
+ 0, 128, 2, 128, 4, 6, 8, 128, //
3709
+ 128, 0, 2, 128, 4, 6, 8, 128, //
3710
+ 0, 2, 4, 128, 6, 8, 10, 128, //
3711
+ 128, 128, 128, 0, 2, 4, 6, 128, //
3712
+ 0, 128, 128, 2, 4, 6, 8, 128, //
3713
+ 128, 0, 128, 2, 4, 6, 8, 128, //
3714
+ 0, 2, 128, 4, 6, 8, 10, 128, //
3715
+ 128, 128, 0, 2, 4, 6, 8, 128, //
3716
+ 0, 128, 2, 4, 6, 8, 10, 128, //
3717
+ 128, 0, 2, 4, 6, 8, 10, 128, //
3718
+ 0, 2, 4, 6, 8, 10, 12, 128, //
3719
+ 128, 128, 128, 128, 128, 128, 128, 0, //
3720
+ 0, 128, 128, 128, 128, 128, 128, 2, //
3721
+ 128, 0, 128, 128, 128, 128, 128, 2, //
3722
+ 0, 2, 128, 128, 128, 128, 128, 4, //
3723
+ 128, 128, 0, 128, 128, 128, 128, 2, //
3724
+ 0, 128, 2, 128, 128, 128, 128, 4, //
3725
+ 128, 0, 2, 128, 128, 128, 128, 4, //
3726
+ 0, 2, 4, 128, 128, 128, 128, 6, //
3727
+ 128, 128, 128, 0, 128, 128, 128, 2, //
3728
+ 0, 128, 128, 2, 128, 128, 128, 4, //
3729
+ 128, 0, 128, 2, 128, 128, 128, 4, //
3730
+ 0, 2, 128, 4, 128, 128, 128, 6, //
3731
+ 128, 128, 0, 2, 128, 128, 128, 4, //
3732
+ 0, 128, 2, 4, 128, 128, 128, 6, //
3733
+ 128, 0, 2, 4, 128, 128, 128, 6, //
3734
+ 0, 2, 4, 6, 128, 128, 128, 8, //
3735
+ 128, 128, 128, 128, 0, 128, 128, 2, //
3736
+ 0, 128, 128, 128, 2, 128, 128, 4, //
3737
+ 128, 0, 128, 128, 2, 128, 128, 4, //
3738
+ 0, 2, 128, 128, 4, 128, 128, 6, //
3739
+ 128, 128, 0, 128, 2, 128, 128, 4, //
3740
+ 0, 128, 2, 128, 4, 128, 128, 6, //
3741
+ 128, 0, 2, 128, 4, 128, 128, 6, //
3742
+ 0, 2, 4, 128, 6, 128, 128, 8, //
3743
+ 128, 128, 128, 0, 2, 128, 128, 4, //
3744
+ 0, 128, 128, 2, 4, 128, 128, 6, //
3745
+ 128, 0, 128, 2, 4, 128, 128, 6, //
3746
+ 0, 2, 128, 4, 6, 128, 128, 8, //
3747
+ 128, 128, 0, 2, 4, 128, 128, 6, //
3748
+ 0, 128, 2, 4, 6, 128, 128, 8, //
3749
+ 128, 0, 2, 4, 6, 128, 128, 8, //
3750
+ 0, 2, 4, 6, 8, 128, 128, 10, //
3751
+ 128, 128, 128, 128, 128, 0, 128, 2, //
3752
+ 0, 128, 128, 128, 128, 2, 128, 4, //
3753
+ 128, 0, 128, 128, 128, 2, 128, 4, //
3754
+ 0, 2, 128, 128, 128, 4, 128, 6, //
3755
+ 128, 128, 0, 128, 128, 2, 128, 4, //
3756
+ 0, 128, 2, 128, 128, 4, 128, 6, //
3757
+ 128, 0, 2, 128, 128, 4, 128, 6, //
3758
+ 0, 2, 4, 128, 128, 6, 128, 8, //
3759
+ 128, 128, 128, 0, 128, 2, 128, 4, //
3760
+ 0, 128, 128, 2, 128, 4, 128, 6, //
3761
+ 128, 0, 128, 2, 128, 4, 128, 6, //
3762
+ 0, 2, 128, 4, 128, 6, 128, 8, //
3763
+ 128, 128, 0, 2, 128, 4, 128, 6, //
3764
+ 0, 128, 2, 4, 128, 6, 128, 8, //
3765
+ 128, 0, 2, 4, 128, 6, 128, 8, //
3766
+ 0, 2, 4, 6, 128, 8, 128, 10, //
3767
+ 128, 128, 128, 128, 0, 2, 128, 4, //
3768
+ 0, 128, 128, 128, 2, 4, 128, 6, //
3769
+ 128, 0, 128, 128, 2, 4, 128, 6, //
3770
+ 0, 2, 128, 128, 4, 6, 128, 8, //
3771
+ 128, 128, 0, 128, 2, 4, 128, 6, //
3772
+ 0, 128, 2, 128, 4, 6, 128, 8, //
3773
+ 128, 0, 2, 128, 4, 6, 128, 8, //
3774
+ 0, 2, 4, 128, 6, 8, 128, 10, //
3775
+ 128, 128, 128, 0, 2, 4, 128, 6, //
3776
+ 0, 128, 128, 2, 4, 6, 128, 8, //
3777
+ 128, 0, 128, 2, 4, 6, 128, 8, //
3778
+ 0, 2, 128, 4, 6, 8, 128, 10, //
3779
+ 128, 128, 0, 2, 4, 6, 128, 8, //
3780
+ 0, 128, 2, 4, 6, 8, 128, 10, //
3781
+ 128, 0, 2, 4, 6, 8, 128, 10, //
3782
+ 0, 2, 4, 6, 8, 10, 128, 12, //
3783
+ 128, 128, 128, 128, 128, 128, 0, 2, //
3784
+ 0, 128, 128, 128, 128, 128, 2, 4, //
3785
+ 128, 0, 128, 128, 128, 128, 2, 4, //
3786
+ 0, 2, 128, 128, 128, 128, 4, 6, //
3787
+ 128, 128, 0, 128, 128, 128, 2, 4, //
3788
+ 0, 128, 2, 128, 128, 128, 4, 6, //
3789
+ 128, 0, 2, 128, 128, 128, 4, 6, //
3790
+ 0, 2, 4, 128, 128, 128, 6, 8, //
3791
+ 128, 128, 128, 0, 128, 128, 2, 4, //
3792
+ 0, 128, 128, 2, 128, 128, 4, 6, //
3793
+ 128, 0, 128, 2, 128, 128, 4, 6, //
3794
+ 0, 2, 128, 4, 128, 128, 6, 8, //
3795
+ 128, 128, 0, 2, 128, 128, 4, 6, //
3796
+ 0, 128, 2, 4, 128, 128, 6, 8, //
3797
+ 128, 0, 2, 4, 128, 128, 6, 8, //
3798
+ 0, 2, 4, 6, 128, 128, 8, 10, //
3799
+ 128, 128, 128, 128, 0, 128, 2, 4, //
3800
+ 0, 128, 128, 128, 2, 128, 4, 6, //
3801
+ 128, 0, 128, 128, 2, 128, 4, 6, //
3802
+ 0, 2, 128, 128, 4, 128, 6, 8, //
3803
+ 128, 128, 0, 128, 2, 128, 4, 6, //
3804
+ 0, 128, 2, 128, 4, 128, 6, 8, //
3805
+ 128, 0, 2, 128, 4, 128, 6, 8, //
3806
+ 0, 2, 4, 128, 6, 128, 8, 10, //
3807
+ 128, 128, 128, 0, 2, 128, 4, 6, //
3808
+ 0, 128, 128, 2, 4, 128, 6, 8, //
3809
+ 128, 0, 128, 2, 4, 128, 6, 8, //
3810
+ 0, 2, 128, 4, 6, 128, 8, 10, //
3811
+ 128, 128, 0, 2, 4, 128, 6, 8, //
3812
+ 0, 128, 2, 4, 6, 128, 8, 10, //
3813
+ 128, 0, 2, 4, 6, 128, 8, 10, //
3814
+ 0, 2, 4, 6, 8, 128, 10, 12, //
3815
+ 128, 128, 128, 128, 128, 0, 2, 4, //
3816
+ 0, 128, 128, 128, 128, 2, 4, 6, //
3817
+ 128, 0, 128, 128, 128, 2, 4, 6, //
3818
+ 0, 2, 128, 128, 128, 4, 6, 8, //
3819
+ 128, 128, 0, 128, 128, 2, 4, 6, //
3820
+ 0, 128, 2, 128, 128, 4, 6, 8, //
3821
+ 128, 0, 2, 128, 128, 4, 6, 8, //
3822
+ 0, 2, 4, 128, 128, 6, 8, 10, //
3823
+ 128, 128, 128, 0, 128, 2, 4, 6, //
3824
+ 0, 128, 128, 2, 128, 4, 6, 8, //
3825
+ 128, 0, 128, 2, 128, 4, 6, 8, //
3826
+ 0, 2, 128, 4, 128, 6, 8, 10, //
3827
+ 128, 128, 0, 2, 128, 4, 6, 8, //
3828
+ 0, 128, 2, 4, 128, 6, 8, 10, //
3829
+ 128, 0, 2, 4, 128, 6, 8, 10, //
3830
+ 0, 2, 4, 6, 128, 8, 10, 12, //
3831
+ 128, 128, 128, 128, 0, 2, 4, 6, //
3832
+ 0, 128, 128, 128, 2, 4, 6, 8, //
3833
+ 128, 0, 128, 128, 2, 4, 6, 8, //
3834
+ 0, 2, 128, 128, 4, 6, 8, 10, //
3835
+ 128, 128, 0, 128, 2, 4, 6, 8, //
3836
+ 0, 128, 2, 128, 4, 6, 8, 10, //
3837
+ 128, 0, 2, 128, 4, 6, 8, 10, //
3838
+ 0, 2, 4, 128, 6, 8, 10, 12, //
3839
+ 128, 128, 128, 0, 2, 4, 6, 8, //
3840
+ 0, 128, 128, 2, 4, 6, 8, 10, //
3841
+ 128, 0, 128, 2, 4, 6, 8, 10, //
3842
+ 0, 2, 128, 4, 6, 8, 10, 12, //
3843
+ 128, 128, 0, 2, 4, 6, 8, 10, //
3844
+ 0, 128, 2, 4, 6, 8, 10, 12, //
3845
+ 128, 0, 2, 4, 6, 8, 10, 12, //
3846
+ 0, 2, 4, 6, 8, 10, 12, 14};
3847
+ // Extend to double length because InterleaveLower will only use the (valid)
3848
+ // lower half, and we want N u16.
3849
+ const Twice<decltype(du8)> du8x2;
3850
+ const Vec128<uint8_t, 2 * N> indices8 =
3851
+ ZeroExtendVector(du8x2, Load(du8, table + mask_bits * 8));
3852
+ const Vec128<uint16_t, N> indices16 =
3853
+ BitCast(du, InterleaveLower(du8x2, indices8, indices8));
3854
+ // TableLookupBytesOr0 operates on bytes. To convert u16 lane indices to byte
3855
+ // indices, add 0 to even and 1 to odd byte lanes.
3856
+ const Vec128<uint16_t, N> byte_indices = Add(indices16, Set(du, 0x0100));
3857
+ return BitCast(d, TableLookupBytesOr0(v, byte_indices));
3858
+ }
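
The table-driven 16-bit Expand above behaves like a simple scalar loop: lanes where the mask is true receive the next contiguous input lane, and all other lanes become zero. A minimal scalar reference sketch follows (ExpandU16Ref is a hypothetical name, not part of Highway), useful for checking the byte-index trick against expected output.

#include <cstddef>
#include <cstdint>

// Hypothetical scalar reference with the same observable behavior as the
// 16-bit Expand above.
static void ExpandU16Ref(const uint16_t* in, const bool* mask, size_t n,
                         uint16_t* out) {
  size_t next = 0;
  for (size_t i = 0; i < n; ++i) {
    out[i] = mask[i] ? in[next++] : 0;
  }
}
// Example: in = {10, 20, 30, ...}, mask = {1, 0, 1, 0, ...}
// gives out = {10, 0, 20, 0, ...}.
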
3859
+
3860
+ template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
3861
+ HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) {
3862
+ const DFromV<decltype(v)> d;
3863
+ const RebindToUnsigned<decltype(d)> du;
3864
+
3865
+ const uint64_t mask_bits = detail::BitsFromMask(mask);
3866
+
3867
+ alignas(16) static constexpr uint32_t packed_array[16] = {
3868
+ // PrintExpand64x4Nibble - same for 32x4.
3869
+ 0x0000ffff, 0x0000fff0, 0x0000ff0f, 0x0000ff10, 0x0000f0ff, 0x0000f1f0,
3870
+ 0x0000f10f, 0x0000f210, 0x00000fff, 0x00001ff0, 0x00001f0f, 0x00002f10,
3871
+ 0x000010ff, 0x000021f0, 0x0000210f, 0x00003210};
3872
+
3873
+ // For lane i, shift the i-th 4-bit index down to bits [0, 2).
3874
+ const Vec128<uint32_t, N> packed = Set(du, packed_array[mask_bits]);
3875
+ alignas(16) static constexpr uint32_t shifts[4] = {0, 4, 8, 12};
3876
+ Vec128<uint32_t, N> indices = packed >> Load(du, shifts);
3877
+ // AVX2 _mm256_permutexvar_epi32 will ignore upper bits, but IndicesFromVec
3878
+ // checks bounds, so clear the upper bits.
3879
+ indices = And(indices, Set(du, N - 1));
3880
+ const Vec128<uint32_t, N> expand =
3881
+ TableLookupLanes(BitCast(du, v), IndicesFromVec(du, indices));
3882
+ // TableLookupLanes cannot also zero masked-off lanes, so do that now.
3883
+ return IfThenElseZero(mask, BitCast(d, expand));
3884
+ }
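
To see how a single packed_array entry drives the 32-bit Expand, consider a 4-lane mask {true, false, true, false}: mask_bits is 0b0101 and the table entry is 0x0000f1f0. The sketch below (plain C++, not part of the diff) decodes that entry; nibble i is the source lane for output lane i, and 0xF nibbles mark lanes that IfThenElseZero later clears.

#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t packed = 0x0000f1f0u;  // packed_array[0b0101] from above
  for (int lane = 0; lane < 4; ++lane) {
    const uint32_t idx = (packed >> (4 * lane)) & 0xFu;
    if (idx < 4) {
      std::printf("output lane %d <- input lane %u\n", lane,
                  static_cast<unsigned>(idx));
    } else {
      std::printf("output lane %d <- zero\n", lane);
    }
  }
  // Prints: lane 0 <- input lane 0, lane 1 <- zero,
  //         lane 2 <- input lane 1, lane 3 <- zero,
  // i.e. the expanded result is {v[0], 0, v[1], 0}.
  return 0;
}
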
3885
+
3886
+ template <typename T, HWY_IF_T_SIZE(T, 8)>
3887
+ HWY_API Vec128<T> Expand(Vec128<T> v, Mask128<T> mask) {
3888
+ // Same as Compress, just zero out the mask=false lanes.
3889
+ return IfThenElseZero(mask, Compress(v, mask));
3890
+ }
3891
+
3892
+ // For single-element vectors, this is at least as fast as native.
3893
+ template <typename T>
3894
+ HWY_API Vec128<T, 1> Expand(Vec128<T, 1> v, Mask128<T, 1> mask) {
3895
+ return IfThenElseZero(mask, v);
3896
+ }
3897
+
3898
+ // ------------------------------ LoadExpand
3899
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
3900
+ HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
3901
+ const TFromD<D>* HWY_RESTRICT unaligned) {
3902
+ return Expand(LoadU(d, unaligned), mask);
3903
+ }
3904
+
3905
+ #endif // HWY_NATIVE_EXPAND
3906
+
3907
+ // ------------------------------ TwoTablesLookupLanes
3908
+
3909
+ template <class D>
3910
+ using IndicesFromD = decltype(IndicesFromVec(D(), Zero(RebindToUnsigned<D>())));
3911
+
3912
+ // RVV/SVE have their own implementations of
3913
+ // TwoTablesLookupLanes(D d, VFromD<D> a, VFromD<D> b, IndicesFromD<D> idx)
3914
+ #if HWY_TARGET != HWY_RVV && HWY_TARGET != HWY_SVE && \
3915
+ HWY_TARGET != HWY_SVE2 && HWY_TARGET != HWY_SVE_256 && \
3916
+ HWY_TARGET != HWY_SVE2_128
3917
+ template <class D>
3918
+ HWY_API VFromD<D> TwoTablesLookupLanes(D /*d*/, VFromD<D> a, VFromD<D> b,
3919
+ IndicesFromD<D> idx) {
3920
+ return TwoTablesLookupLanes(a, b, idx);
3921
+ }
3922
+ #endif
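
The overload above simply forwards to the two-argument TwoTablesLookupLanes on targets that do not provide their own d-taking version. Its semantics, modeled here as a hypothetical scalar reference: an index below N selects from the first table, and an index in [N, 2N) selects from the second.

#include <cstddef>
#include <cstdint>

// Hypothetical scalar model of TwoTablesLookupLanes for n lanes.
static void TwoTablesLookupLanesRef(const uint32_t* a, const uint32_t* b,
                                    const size_t* idx, size_t n,
                                    uint32_t* out) {
  for (size_t i = 0; i < n; ++i) {
    out[i] = (idx[i] < n) ? a[idx[i]] : b[idx[i] - n];
  }
}
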
3923
+
3924
+ // ------------------------------ Reverse2, Reverse4, Reverse8 (8-bit)
3925
+
3926
+ #if (defined(HWY_NATIVE_REVERSE2_8) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
3927
+ #ifdef HWY_NATIVE_REVERSE2_8
3928
+ #undef HWY_NATIVE_REVERSE2_8
3929
+ #else
3930
+ #define HWY_NATIVE_REVERSE2_8
3931
+ #endif
3932
+
3933
+ #undef HWY_PREFER_ROTATE
3934
+ // Platforms on which RotateRight is likely faster than TableLookupBytes.
3935
+ // RVV and SVE anyway have their own implementation of this.
3936
+ #if HWY_TARGET == HWY_SSE2 || HWY_TARGET <= HWY_AVX3 || \
3937
+ HWY_TARGET == HWY_WASM || HWY_TARGET == HWY_PPC8
3938
+ #define HWY_PREFER_ROTATE 1
3939
+ #else
3940
+ #define HWY_PREFER_ROTATE 0
3941
+ #endif
3942
+
3943
+ template <class D, HWY_IF_T_SIZE_D(D, 1)>
3944
+ HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
3945
+ // Exclude AVX3 because its 16-bit RotateRight is actually 3 instructions.
3946
+ #if HWY_PREFER_ROTATE && HWY_TARGET > HWY_AVX3
3947
+ const Repartition<uint16_t, decltype(d)> du16;
3948
+ return BitCast(d, RotateRight<8>(BitCast(du16, v)));
3949
+ #else
3950
+ alignas(16) static constexpr TFromD<D> kShuffle[16] = {
3951
+ 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
3952
+ return TableLookupBytes(v, LoadDup128(d, kShuffle));
3953
+ #endif
3954
+ }
3955
+
3956
+ template <class D, HWY_IF_T_SIZE_D(D, 1)>
3957
+ HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) {
3958
+ #if HWY_PREFER_ROTATE
3959
+ const Repartition<uint16_t, decltype(d)> du16;
3960
+ return BitCast(d, Reverse2(du16, BitCast(du16, Reverse2(d, v))));
3961
+ #else
3962
+ alignas(16) static constexpr uint8_t kShuffle[16] = {
3963
+ 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12};
3964
+ const Repartition<uint8_t, decltype(d)> du8;
3965
+ return TableLookupBytes(v, BitCast(d, LoadDup128(du8, kShuffle)));
3966
+ #endif
3967
+ }
3968
+
3969
+ template <class D, HWY_IF_T_SIZE_D(D, 1)>
3970
+ HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) {
3971
+ #if HWY_PREFER_ROTATE
3972
+ const Repartition<uint32_t, D> du32;
3973
+ return BitCast(d, Reverse2(du32, BitCast(du32, Reverse4(d, v))));
3974
+ #else
3975
+ alignas(16) static constexpr uint8_t kShuffle[16] = {
3976
+ 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8};
3977
+ const Repartition<uint8_t, decltype(d)> du8;
3978
+ return TableLookupBytes(v, BitCast(d, LoadDup128(du8, kShuffle)));
3979
+ #endif
3980
+ }
3981
+
3982
+ #endif // HWY_NATIVE_REVERSE2_8
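
The kShuffle tables above reverse bytes within fixed-size groups; the scalar sketch below (hypothetical reference, not in Highway) produces the same permutation and shows the expected results for an ascending input.

#include <cstdint>

// Reverse bytes within each aligned group of `group` lanes.
static void ReverseWithinGroups(const uint8_t* in, uint8_t* out, int n,
                                int group) {
  for (int i = 0; i < n; ++i) {
    out[i] = in[(i / group) * group + (group - 1 - i % group)];
  }
}
// With in = {0, 1, ..., 15}:
//   group = 2 -> {1, 0, 3, 2, ...}               (Reverse2's kShuffle)
//   group = 4 -> {3, 2, 1, 0, 7, 6, 5, 4, ...}   (Reverse4's kShuffle)
//   group = 8 -> {7, 6, ..., 0, 15, 14, ..., 8}  (Reverse8's kShuffle)
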
3983
+
3984
+ // ------------------------------ ReverseLaneBytes
3985
+
3986
+ #if (defined(HWY_NATIVE_REVERSE_LANE_BYTES) == defined(HWY_TARGET_TOGGLE))
3987
+ #ifdef HWY_NATIVE_REVERSE_LANE_BYTES
3988
+ #undef HWY_NATIVE_REVERSE_LANE_BYTES
3989
+ #else
3990
+ #define HWY_NATIVE_REVERSE_LANE_BYTES
3991
+ #endif
3992
+
3993
+ template <class V, HWY_IF_T_SIZE_V(V, 2)>
3994
+ HWY_API V ReverseLaneBytes(V v) {
3995
+ const DFromV<V> d;
3996
+ const Repartition<uint8_t, decltype(d)> du8;
3997
+ return BitCast(d, Reverse2(du8, BitCast(du8, v)));
3998
+ }
3999
+
4000
+ template <class V, HWY_IF_T_SIZE_V(V, 4)>
4001
+ HWY_API V ReverseLaneBytes(V v) {
4002
+ const DFromV<V> d;
4003
+ const Repartition<uint8_t, decltype(d)> du8;
4004
+ return BitCast(d, Reverse4(du8, BitCast(du8, v)));
4005
+ }
4006
+
4007
+ template <class V, HWY_IF_T_SIZE_V(V, 8)>
4008
+ HWY_API V ReverseLaneBytes(V v) {
4009
+ const DFromV<V> d;
4010
+ const Repartition<uint8_t, decltype(d)> du8;
4011
+ return BitCast(d, Reverse8(du8, BitCast(du8, v)));
4012
+ }
4013
+
4014
+ #endif // HWY_NATIVE_REVERSE_LANE_BYTES
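
ReverseLaneBytes is a per-lane byte swap built from the group reversals above. A scalar equivalent for a single 32-bit lane (hypothetical helper) and a worked value:

#include <cstdint>

static uint32_t ReverseLaneBytes32(uint32_t x) {
  return (x >> 24) | ((x >> 8) & 0x0000FF00u) | ((x << 8) & 0x00FF0000u) |
         (x << 24);
}
// Example: ReverseLaneBytes32(0x01020304u) == 0x04030201u.
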
4015
+
4016
+ // ------------------------------ ReverseBits
4017
+
4018
+ // On these targets, we emulate 8-bit shifts using 16-bit shifts and therefore
4019
+ // require at least two lanes to BitCast to 16-bit. We avoid Highway's 8-bit
4020
+ // shifts because those would add extra masking already taken care of by
4021
+ // UI8ReverseBitsStep. Note that AVX3_DL/AVX3_ZEN4 support GFNI and use it to
4022
+ // implement ReverseBits, so this code is not used there.
4023
+ #undef HWY_REVERSE_BITS_MIN_BYTES
4024
+ #if ((HWY_TARGET >= HWY_AVX3 && HWY_TARGET <= HWY_SSE2) || \
4025
+ HWY_TARGET == HWY_WASM || HWY_TARGET == HWY_WASM_EMU256)
4026
+ #define HWY_REVERSE_BITS_MIN_BYTES 2
4027
+ #else
4028
+ #define HWY_REVERSE_BITS_MIN_BYTES 1
4029
+ #endif
4030
+
4031
+ #if (defined(HWY_NATIVE_REVERSE_BITS_UI8) == defined(HWY_TARGET_TOGGLE))
4032
+ #ifdef HWY_NATIVE_REVERSE_BITS_UI8
4033
+ #undef HWY_NATIVE_REVERSE_BITS_UI8
4034
+ #else
4035
+ #define HWY_NATIVE_REVERSE_BITS_UI8
4036
+ #endif
4037
+
4038
+ namespace detail {
4039
+
4040
+ template <int kShiftAmt, int kShrResultMask, class V,
4041
+ HWY_IF_V_SIZE_GT_D(DFromV<V>, HWY_REVERSE_BITS_MIN_BYTES - 1)>
4042
+ HWY_INLINE V UI8ReverseBitsStep(V v) {
4043
+ const DFromV<decltype(v)> d;
4044
+ const RebindToUnsigned<decltype(d)> du;
4045
+ #if HWY_REVERSE_BITS_MIN_BYTES == 2
4046
+ const Repartition<uint16_t, decltype(d)> d_shift;
4047
+ #else
4048
+ const RebindToUnsigned<decltype(d)> d_shift;
4049
+ #endif
4050
+
4051
+ const auto v_to_shift = BitCast(d_shift, v);
4052
+ const auto shl_result = BitCast(d, ShiftLeft<kShiftAmt>(v_to_shift));
4053
+ const auto shr_result = BitCast(d, ShiftRight<kShiftAmt>(v_to_shift));
4054
+ const auto shr_result_mask =
4055
+ BitCast(d, Set(du, static_cast<uint8_t>(kShrResultMask)));
4056
+ return Or(And(shr_result, shr_result_mask),
4057
+ AndNot(shr_result_mask, shl_result));
4058
+ }
4059
+
4060
+ #if HWY_REVERSE_BITS_MIN_BYTES == 2
4061
+ template <int kShiftAmt, int kShrResultMask, class V,
4062
+ HWY_IF_V_SIZE_D(DFromV<V>, 1)>
4063
+ HWY_INLINE V UI8ReverseBitsStep(V v) {
4064
+ return V{UI8ReverseBitsStep<kShiftAmt, kShrResultMask>(Vec128<uint8_t>{v.raw})
4065
+ .raw};
4066
+ }
4067
+ #endif
4068
+
4069
+ } // namespace detail
4070
+
4071
+ template <class V, HWY_IF_T_SIZE_V(V, 1)>
4072
+ HWY_API V ReverseBits(V v) {
4073
+ auto result = detail::UI8ReverseBitsStep<1, 0x55>(v);
4074
+ result = detail::UI8ReverseBitsStep<2, 0x33>(result);
4075
+ result = detail::UI8ReverseBitsStep<4, 0x0F>(result);
4076
+ return result;
4077
+ }
4078
+
4079
+ #endif // HWY_NATIVE_REVERSE_BITS_UI8
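
The three UI8ReverseBitsStep calls are the classic bit reversal by swapping adjacent bits, then adjacent 2-bit pairs, then the two nibbles. A scalar equivalent for one byte (hypothetical helper, matching the <kShiftAmt, kShrResultMask> pairs used above):

#include <cstdint>

static uint8_t ReverseBits8(uint8_t v) {
  v = static_cast<uint8_t>(((v >> 1) & 0x55) | ((v << 1) & 0xAA));  // <1, 0x55>
  v = static_cast<uint8_t>(((v >> 2) & 0x33) | ((v << 2) & 0xCC));  // <2, 0x33>
  v = static_cast<uint8_t>(((v >> 4) & 0x0F) | ((v << 4) & 0xF0));  // <4, 0x0F>
  return v;
}
// Example: ReverseBits8(0x01) == 0x80, ReverseBits8(0xB0) == 0x0D.
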
4080
+
4081
+ #if (defined(HWY_NATIVE_REVERSE_BITS_UI16_32_64) == defined(HWY_TARGET_TOGGLE))
4082
+ #ifdef HWY_NATIVE_REVERSE_BITS_UI16_32_64
4083
+ #undef HWY_NATIVE_REVERSE_BITS_UI16_32_64
4084
+ #else
4085
+ #define HWY_NATIVE_REVERSE_BITS_UI16_32_64
4086
+ #endif
4087
+
4088
+ template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 2) | (1 << 4) | (1 << 8)),
4089
+ HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
4090
+ HWY_API V ReverseBits(V v) {
4091
+ const DFromV<decltype(v)> d;
4092
+ const Repartition<uint8_t, decltype(d)> du8;
4093
+ return ReverseLaneBytes(BitCast(d, ReverseBits(BitCast(du8, v))));
4094
+ }
4095
+ #endif // HWY_NATIVE_REVERSE_BITS_UI16_32_64
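
For wider lanes, ReverseBits composes the byte-wise reversal with ReverseLaneBytes. A scalar sketch for one 16-bit lane (hypothetical helper) makes the composition concrete:

#include <cstdint>

static uint16_t ReverseBits16(uint16_t x) {
  auto rev8 = [](uint8_t v) {  // same three steps as the 8-bit sketch above
    v = static_cast<uint8_t>(((v >> 1) & 0x55) | ((v << 1) & 0xAA));
    v = static_cast<uint8_t>(((v >> 2) & 0x33) | ((v << 2) & 0xCC));
    v = static_cast<uint8_t>(((v >> 4) & 0x0F) | ((v << 4) & 0xF0));
    return v;
  };
  const uint8_t lo = rev8(static_cast<uint8_t>(x & 0xFF));
  const uint8_t hi = rev8(static_cast<uint8_t>(x >> 8));
  return static_cast<uint16_t>((lo << 8) | hi);  // the ReverseLaneBytes step
}
// Example: ReverseBits16(0x0001) == 0x8000.
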
4096
+
4097
+ // ------------------------------ Per4LaneBlockShuffle
4098
+
4099
+ #if (defined(HWY_NATIVE_PER4LANEBLKSHUF_DUP32) == defined(HWY_TARGET_TOGGLE))
4100
+ #ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
4101
+ #undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
4102
+ #else
4103
+ #define HWY_NATIVE_PER4LANEBLKSHUF_DUP32
4104
+ #endif
4105
+
4106
+ #if HWY_TARGET != HWY_SCALAR
4107
+ namespace detail {
4108
+
4109
+ template <class D>
4110
+ HWY_INLINE Vec<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3,
4111
+ const uint32_t x2,
4112
+ const uint32_t x1,
4113
+ const uint32_t x0) {
4114
+ alignas(16) const uint32_t lanes[4] = {x0, x1, x2, x3};
4115
+
4116
+ #if HWY_TARGET == HWY_RVV
4117
+ constexpr int kPow2 = d.Pow2();
4118
+ constexpr int kLoadPow2 = HWY_MAX(kPow2, -1);
4119
+ const ScalableTag<uint32_t, kLoadPow2> d_load;
4120
+ #else
4121
+ constexpr size_t kMaxBytes = d.MaxBytes();
4122
+ #if HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES
4123
+ constexpr size_t kMinLanesToLoad = 2;
4124
+ #else
4125
+ constexpr size_t kMinLanesToLoad = 4;
4126
+ #endif
4127
+ constexpr size_t kNumToLoad =
4128
+ HWY_MAX(kMaxBytes / sizeof(uint32_t), kMinLanesToLoad);
4129
+ const CappedTag<uint32_t, kNumToLoad> d_load;
4130
+ #endif
4131
+
4132
+ return ResizeBitCast(d, LoadDup128(d_load, lanes));
4133
+ }
4134
+
4135
+ } // namespace detail
4136
+ #endif
4137
+
4138
+ #endif // HWY_NATIVE_PER4LANEBLKSHUF_DUP32
4139
+
4140
+ #if HWY_TARGET != HWY_SCALAR
4141
+ namespace detail {
4142
+
4143
+ template <class V>
4144
+ HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<0> /*idx_10_tag*/, V v) {
4145
+ return DupEven(v);
4146
+ }
4147
+
4148
+ template <class V>
4149
+ HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<1> /*idx_10_tag*/, V v) {
4150
+ const DFromV<decltype(v)> d;
4151
+ return Reverse2(d, v);
4152
+ }
4153
+
4154
+ template <class V>
4155
+ HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<2> /*idx_10_tag*/, V v) {
4156
+ return v;
4157
+ }
4158
+
4159
+ template <class V>
4160
+ HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<3> /*idx_10_tag*/, V v) {
4161
+ return DupOdd(v);
4162
+ }
4163
+
4164
+ HWY_INLINE uint32_t U8x4Per4LaneBlkIndices(const uint32_t idx3,
4165
+ const uint32_t idx2,
4166
+ const uint32_t idx1,
4167
+ const uint32_t idx0) {
4168
+ #if HWY_IS_LITTLE_ENDIAN
4169
+ return static_cast<uint32_t>((idx3 << 24) | (idx2 << 16) | (idx1 << 8) |
4170
+ idx0);
4171
+ #else
4172
+ return static_cast<uint32_t>(idx3 | (idx2 << 8) | (idx1 << 16) |
4173
+ (idx0 << 24));
4174
+ #endif
4175
+ }
4176
+
4177
+ template <class D>
4178
+ HWY_INLINE Vec<D> TblLookupPer4LaneBlkU8IdxInBlk(D d, const uint32_t idx3,
4179
+ const uint32_t idx2,
4180
+ const uint32_t idx1,
4181
+ const uint32_t idx0) {
4182
+ #if HWY_TARGET == HWY_RVV
4183
+ const AdjustSimdTagToMinVecPow2<Repartition<uint32_t, D>> du32;
4184
+ #else
4185
+ const Repartition<uint32_t, D> du32;
4186
+ #endif
4187
+
4188
+ return ResizeBitCast(
4189
+ d, Set(du32, U8x4Per4LaneBlkIndices(idx3, idx2, idx1, idx0)));
4190
+ }
4191
+
4192
+ #if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || \
4193
+ HWY_TARGET == HWY_SVE2_128 || HWY_TARGET == HWY_EMU128
4194
+ #define HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(D) void* = nullptr
4195
+ #else
4196
+ #define HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(D) HWY_IF_T_SIZE_D(D, 8)
4197
+
4198
+ template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4))>
4199
+ HWY_INLINE V Per4LaneBlkShufDoTblLookup(V v, V idx) {
4200
+ const DFromV<decltype(v)> d;
4201
+ const Repartition<uint8_t, decltype(d)> du8;
4202
+ return BitCast(d, TableLookupBytes(BitCast(du8, v), BitCast(du8, idx)));
4203
+ }
4204
+
4205
+ template <class D, HWY_IF_T_SIZE_D(D, 1)>
4206
+ HWY_INLINE Vec<D> TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3,
4207
+ const uint32_t idx2,
4208
+ const uint32_t idx1,
4209
+ const uint32_t idx0) {
4210
+ const Repartition<uint32_t, decltype(d)> du32;
4211
+ const uint32_t idx3210 = U8x4Per4LaneBlkIndices(idx3, idx2, idx1, idx0);
4212
+ const auto v_byte_idx = Per4LaneBlkShufDupSet4xU32(
4213
+ du32, static_cast<uint32_t>(idx3210 + 0x0C0C0C0C),
4214
+ static_cast<uint32_t>(idx3210 + 0x08080808),
4215
+ static_cast<uint32_t>(idx3210 + 0x04040404),
4216
+ static_cast<uint32_t>(idx3210));
4217
+ return ResizeBitCast(d, v_byte_idx);
4218
+ }
4219
+
4220
+ template <class D, HWY_IF_T_SIZE_D(D, 2)>
4221
+ HWY_INLINE Vec<D> TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3,
4222
+ const uint32_t idx2,
4223
+ const uint32_t idx1,
4224
+ const uint32_t idx0) {
4225
+ const Repartition<uint32_t, decltype(d)> du32;
4226
+ #if HWY_IS_LITTLE_ENDIAN
4227
+ const uint32_t idx10 = static_cast<uint32_t>((idx1 << 16) | idx0);
4228
+ const uint32_t idx32 = static_cast<uint32_t>((idx3 << 16) | idx2);
4229
+ constexpr uint32_t kLaneByteOffsets{0x01000100};
4230
+ #else
4231
+ const uint32_t idx10 = static_cast<uint32_t>(idx1 | (idx0 << 16));
4232
+ const uint32_t idx32 = static_cast<uint32_t>(idx3 | (idx2 << 16));
4233
+ constexpr uint32_t kLaneByteOffsets{0x00010001};
4234
+ #endif
4235
+ constexpr uint32_t kHiLaneByteOffsets{kLaneByteOffsets + 0x08080808u};
4236
+
4237
+ const auto v_byte_idx = Per4LaneBlkShufDupSet4xU32(
4238
+ du32, static_cast<uint32_t>(idx32 * 0x0202u + kHiLaneByteOffsets),
4239
+ static_cast<uint32_t>(idx10 * 0x0202u + kHiLaneByteOffsets),
4240
+ static_cast<uint32_t>(idx32 * 0x0202u + kLaneByteOffsets),
4241
+ static_cast<uint32_t>(idx10 * 0x0202u + kLaneByteOffsets));
4242
+ return ResizeBitCast(d, v_byte_idx);
4243
+ }
4244
+
4245
+ template <class D, HWY_IF_T_SIZE_D(D, 4)>
4246
+ HWY_INLINE Vec<D> TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3,
4247
+ const uint32_t idx2,
4248
+ const uint32_t idx1,
4249
+ const uint32_t idx0) {
4250
+ const Repartition<uint32_t, decltype(d)> du32;
4251
+ #if HWY_IS_LITTLE_ENDIAN
4252
+ constexpr uint32_t kLaneByteOffsets{0x03020100};
4253
+ #else
4254
+ constexpr uint32_t kLaneByteOffsets{0x00010203};
4255
+ #endif
4256
+
4257
+ const auto v_byte_idx = Per4LaneBlkShufDupSet4xU32(
4258
+ du32, static_cast<uint32_t>(idx3 * 0x04040404u + kLaneByteOffsets),
4259
+ static_cast<uint32_t>(idx2 * 0x04040404u + kLaneByteOffsets),
4260
+ static_cast<uint32_t>(idx1 * 0x04040404u + kLaneByteOffsets),
4261
+ static_cast<uint32_t>(idx0 * 0x04040404u + kLaneByteOffsets));
4262
+ return ResizeBitCast(d, v_byte_idx);
4263
+ }
4264
+ #endif
4265
+
4266
+ template <class D, HWY_IF_T_SIZE_D(D, 1)>
4267
+ HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3,
4268
+ const uint32_t idx2,
4269
+ const uint32_t idx1,
4270
+ const uint32_t idx0) {
4271
+ return TblLookupPer4LaneBlkU8IdxInBlk(d, idx3, idx2, idx1, idx0);
4272
+ }
4273
+
4274
+ #if HWY_TARGET == HWY_RVV
4275
+ template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
4276
+ HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3,
4277
+ const uint32_t idx2,
4278
+ const uint32_t idx1,
4279
+ const uint32_t idx0) {
4280
+ const Rebind<uint8_t, decltype(d)> du8;
4281
+ return PromoteTo(d,
4282
+ TblLookupPer4LaneBlkU8IdxInBlk(du8, idx3, idx2, idx1, idx0));
4283
+ }
4284
+ #else
4285
+ template <class D, HWY_IF_T_SIZE_D(D, 2)>
4286
+ HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3,
4287
+ const uint32_t idx2,
4288
+ const uint32_t idx1,
4289
+ const uint32_t idx0) {
4290
+ const uint16_t u16_idx0 = static_cast<uint16_t>(idx0);
4291
+ const uint16_t u16_idx1 = static_cast<uint16_t>(idx1);
4292
+ const uint16_t u16_idx2 = static_cast<uint16_t>(idx2);
4293
+ const uint16_t u16_idx3 = static_cast<uint16_t>(idx3);
4294
+ alignas(16)
4295
+ const uint16_t indices[8] = {u16_idx0, u16_idx1, u16_idx2, u16_idx3,
4296
+ u16_idx0, u16_idx1, u16_idx2, u16_idx3};
4297
+
4298
+ #if HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES
4299
+ constexpr size_t kMinLanesToLoad = 4;
4300
+ #else
4301
+ constexpr size_t kMinLanesToLoad = 8;
4302
+ #endif
4303
+ constexpr size_t kNumToLoad = HWY_MAX(HWY_MAX_LANES_D(D), kMinLanesToLoad);
4304
+ const CappedTag<uint16_t, kNumToLoad> d_load;
4305
+
4306
+ return ResizeBitCast(d, LoadDup128(d_load, indices));
4307
+ }
4308
+
4309
+ template <class D, HWY_IF_T_SIZE_D(D, 4)>
4310
+ HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3,
4311
+ const uint32_t idx2,
4312
+ const uint32_t idx1,
4313
+ const uint32_t idx0) {
4314
+ return Per4LaneBlkShufDupSet4xU32(d, idx3, idx2, idx1, idx0);
4315
+ }
4316
+
4317
+ template <class D, HWY_IF_T_SIZE_D(D, 8)>
4318
+ HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3,
4319
+ const uint32_t idx2,
4320
+ const uint32_t idx1,
4321
+ const uint32_t idx0) {
4322
+ const RebindToUnsigned<decltype(d)> du;
4323
+ const Rebind<uint32_t, decltype(d)> du32;
4324
+ return BitCast(d, PromoteTo(du, Per4LaneBlkShufDupSet4xU32(du32, idx3, idx2,
4325
+ idx1, idx0)));
4326
+ }
4327
+ #endif
4328
+
4329
+ template <class D, HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(D)>
4330
+ HWY_INLINE IndicesFromD<D> TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3,
4331
+ const uint32_t idx2,
4332
+ const uint32_t idx1,
4333
+ const uint32_t idx0) {
4334
+ const RebindToUnsigned<decltype(d)> du;
4335
+ using TU = TFromD<decltype(du)>;
4336
+ auto idx_in_blk = TblLookupPer4LaneBlkIdxInBlk(du, idx3, idx2, idx1, idx0);
4337
+
4338
+ constexpr size_t kN = HWY_MAX_LANES_D(D);
4339
+ if (kN < 4) {
4340
+ idx_in_blk = And(idx_in_blk, Set(du, static_cast<TU>(kN - 1)));
4341
+ }
4342
+
4343
+ #if HWY_TARGET == HWY_RVV
4344
+ const auto blk_offsets = AndS(Iota0(du), static_cast<TU>(~TU{3}));
4345
+ #else
4346
+ const auto blk_offsets =
4347
+ And(Iota(du, TU{0}), Set(du, static_cast<TU>(~TU{3})));
4348
+ #endif
4349
+ return IndicesFromVec(d, Add(idx_in_blk, blk_offsets));
4350
+ }
4351
+
4352
+ template <class V, HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(DFromV<V>)>
4353
+ HWY_INLINE V Per4LaneBlkShufDoTblLookup(V v, IndicesFromD<DFromV<V>> idx) {
4354
+ return TableLookupLanes(v, idx);
4355
+ }
4356
+
4357
+ #undef HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE
4358
+
4359
+ template <class V>
4360
+ HWY_INLINE V TblLookupPer4LaneBlkShuf(V v, size_t idx3210) {
4361
+ const DFromV<decltype(v)> d;
4362
+ const uint32_t idx3 = static_cast<uint32_t>((idx3210 >> 6) & 3);
4363
+ const uint32_t idx2 = static_cast<uint32_t>((idx3210 >> 4) & 3);
4364
+ const uint32_t idx1 = static_cast<uint32_t>((idx3210 >> 2) & 3);
4365
+ const uint32_t idx0 = static_cast<uint32_t>(idx3210 & 3);
4366
+ const auto idx = TblLookupPer4LaneBlkShufIdx(d, idx3, idx2, idx1, idx0);
4367
+ return Per4LaneBlkShufDoTblLookup(v, idx);
4368
+ }
4369
+
4370
+ // The detail::Per4LaneBlockShuffle overloads that have the extra lane_size_tag
4371
+ // and vect_size_tag parameters are only called for vectors that have at
4372
+ // least 4 lanes (or scalable vectors that might possibly have 4 or more lanes)
4373
+ template <size_t kIdx3210, size_t kLaneSize, size_t kVectSize, class V>
4374
+ HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
4375
+ hwy::SizeTag<kLaneSize> /*lane_size_tag*/,
4376
+ hwy::SizeTag<kVectSize> /*vect_size_tag*/,
4377
+ V v) {
4378
+ return TblLookupPer4LaneBlkShuf(v, kIdx3210);
4379
+ }
4380
+
4381
+ #if HWY_HAVE_FLOAT64
4382
+ template <class V>
4383
+ HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> Per4LaneBlockShufCastToWide(
4384
+ hwy::FloatTag /* type_tag */, hwy::SizeTag<4> /* lane_size_tag */, V v) {
4385
+ const DFromV<decltype(v)> d;
4386
+ const RepartitionToWide<decltype(d)> dw;
4387
+ return BitCast(dw, v);
4388
+ }
4389
+ #endif
4390
+
4391
+ template <size_t kLaneSize, class V>
4392
+ HWY_INLINE VFromD<RepartitionToWide<RebindToUnsigned<DFromV<V>>>>
4393
+ Per4LaneBlockShufCastToWide(hwy::FloatTag /* type_tag */,
4394
+ hwy::SizeTag<kLaneSize> /* lane_size_tag */, V v) {
4395
+ const DFromV<decltype(v)> d;
4396
+ const RebindToUnsigned<decltype(d)> du;
4397
+ const RepartitionToWide<decltype(du)> dw;
4398
+ return BitCast(dw, v);
4399
+ }
4400
+
4401
+ template <size_t kLaneSize, class V>
4402
+ HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> Per4LaneBlockShufCastToWide(
4403
+ hwy::NonFloatTag /* type_tag */,
4404
+ hwy::SizeTag<kLaneSize> /* lane_size_tag */, V v) {
4405
+ const DFromV<decltype(v)> d;
4406
+ const RepartitionToWide<decltype(d)> dw;
4407
+ return BitCast(dw, v);
4408
+ }
4409
+
4410
+ template <class V>
4411
+ HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x1B> /*idx_3210_tag*/, V v) {
4412
+ const DFromV<decltype(v)> d;
4413
+ return Reverse4(d, v);
4414
+ }
4415
+
4416
+ template <class V,
4417
+ HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) |
4418
+ (HWY_HAVE_INTEGER64 ? (1 << 4) : 0))>
4419
+ HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x44> /*idx_3210_tag*/, V v) {
4420
+ const DFromV<decltype(v)> d;
4421
+ const auto vw = Per4LaneBlockShufCastToWide(
4422
+ hwy::IsFloatTag<TFromV<V>>(), hwy::SizeTag<sizeof(TFromV<V>)>(), v);
4423
+ return BitCast(d, DupEven(vw));
4424
+ }
4425
+
4426
+ template <class V,
4427
+ HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) |
4428
+ (HWY_HAVE_INTEGER64 ? (1 << 4) : 0))>
4429
+ HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x4E> /*idx_3210_tag*/, V v) {
4430
+ const DFromV<decltype(v)> d;
4431
+ const auto vw = Per4LaneBlockShufCastToWide(
4432
+ hwy::IsFloatTag<TFromV<V>>(), hwy::SizeTag<sizeof(TFromV<V>)>(), v);
4433
+ const DFromV<decltype(vw)> dw;
4434
+ return BitCast(d, Reverse2(dw, vw));
4435
+ }
4436
+
4437
+ #if HWY_MAX_BYTES >= 32
4438
+ template <class V, HWY_IF_T_SIZE_V(V, 8)>
4439
+ HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x4E> /*idx_3210_tag*/, V v) {
4440
+ return SwapAdjacentBlocks(v);
4441
+ }
4442
+ #endif
4443
+
4444
+ template <class V, HWY_IF_LANES_D(DFromV<V>, 4),
4445
+ HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
4446
+ HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x50> /*idx_3210_tag*/, V v) {
4447
+ const DFromV<decltype(v)> d;
4448
+ return InterleaveLower(d, v, v);
4449
+ }
4450
+
4451
+ template <class V, HWY_IF_T_SIZE_V(V, 4)>
4452
+ HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x50> /*idx_3210_tag*/, V v) {
4453
+ const DFromV<decltype(v)> d;
4454
+ return InterleaveLower(d, v, v);
4455
+ }
4456
+
4457
+ template <class V, HWY_IF_LANES_D(DFromV<V>, 4)>
4458
+ HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x88> /*idx_3210_tag*/, V v) {
4459
+ const DFromV<decltype(v)> d;
4460
+ return ConcatEven(d, v, v);
4461
+ }
4462
+
4463
+ template <class V>
4464
+ HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xA0> /*idx_3210_tag*/, V v) {
4465
+ return DupEven(v);
4466
+ }
4467
+
4468
+ template <class V>
4469
+ HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xB1> /*idx_3210_tag*/, V v) {
4470
+ const DFromV<decltype(v)> d;
4471
+ return Reverse2(d, v);
4472
+ }
4473
+
4474
+ template <class V, HWY_IF_LANES_D(DFromV<V>, 4)>
4475
+ HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xDD> /*idx_3210_tag*/, V v) {
4476
+ const DFromV<decltype(v)> d;
4477
+ return ConcatOdd(d, v, v);
4478
+ }
4479
+
4480
+ template <class V>
4481
+ HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xE4> /*idx_3210_tag*/, V v) {
4482
+ return v;
4483
+ }
4484
+
4485
+ template <class V,
4486
+ HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) |
4487
+ (HWY_HAVE_INTEGER64 ? (1 << 4) : 0))>
4488
+ HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xEE> /*idx_3210_tag*/, V v) {
4489
+ const DFromV<decltype(v)> d;
4490
+ const auto vw = Per4LaneBlockShufCastToWide(
4491
+ hwy::IsFloatTag<TFromV<V>>(), hwy::SizeTag<sizeof(TFromV<V>)>(), v);
4492
+ return BitCast(d, DupOdd(vw));
4493
+ }
4494
+
4495
+ template <class V>
4496
+ HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xF5> /*idx_3210_tag*/, V v) {
4497
+ return DupOdd(v);
4498
+ }
4499
+
4500
+ template <class V, HWY_IF_T_SIZE_V(V, 4)>
4501
+ HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xFA> /*idx_3210_tag*/, V v) {
4502
+ const DFromV<decltype(v)> d;
4503
+ return InterleaveUpper(d, v, v);
4504
+ }
4505
+
4506
+ template <size_t kIdx3210, class V>
4507
+ HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> idx_3210_tag, V v) {
4508
+ const DFromV<decltype(v)> d;
4509
+ return Per4LaneBlockShuffle(idx_3210_tag, hwy::SizeTag<sizeof(TFromV<V>)>(),
4510
+ hwy::SizeTag<d.MaxBytes()>(), v);
4511
+ }
4512
+
4513
+ } // namespace detail
4514
+ #endif // HWY_TARGET != HWY_SCALAR
4515
+
4516
+ template <size_t kIdx3, size_t kIdx2, size_t kIdx1, size_t kIdx0, class V,
4517
+ HWY_IF_LANES_D(DFromV<V>, 1)>
4518
+ HWY_API V Per4LaneBlockShuffle(V v) {
4519
+ static_assert(kIdx0 <= 3, "kIdx0 <= 3 must be true");
4520
+ static_assert(kIdx1 <= 3, "kIdx1 <= 3 must be true");
4521
+ static_assert(kIdx2 <= 3, "kIdx2 <= 3 must be true");
4522
+ static_assert(kIdx3 <= 3, "kIdx3 <= 3 must be true");
4523
+
4524
+ return v;
4525
+ }
4526
+
4527
+ #if HWY_TARGET != HWY_SCALAR
4528
+ template <size_t kIdx3, size_t kIdx2, size_t kIdx1, size_t kIdx0, class V,
4529
+ HWY_IF_LANES_D(DFromV<V>, 2)>
4530
+ HWY_API V Per4LaneBlockShuffle(V v) {
4531
+ static_assert(kIdx0 <= 3, "kIdx0 <= 3 must be true");
4532
+ static_assert(kIdx1 <= 3, "kIdx1 <= 3 must be true");
4533
+ static_assert(kIdx2 <= 3, "kIdx2 <= 3 must be true");
4534
+ static_assert(kIdx3 <= 3, "kIdx3 <= 3 must be true");
4535
+
4536
+ constexpr bool isReverse2 = (kIdx0 == 1 || kIdx1 == 0) && (kIdx0 != kIdx1);
4537
+ constexpr size_t kPer2BlkIdx0 = (kIdx0 <= 1) ? kIdx0 : (isReverse2 ? 1 : 0);
4538
+ constexpr size_t kPer2BlkIdx1 = (kIdx1 <= 1) ? kIdx1 : (isReverse2 ? 0 : 1);
4539
+
4540
+ constexpr size_t kIdx10 = (kPer2BlkIdx1 << 1) | kPer2BlkIdx0;
4541
+ static_assert(kIdx10 <= 3, "kIdx10 <= 3 must be true");
4542
+ return detail::Per2LaneBlockShuffle(hwy::SizeTag<kIdx10>(), v);
4543
+ }
4544
+
4545
+ template <size_t kIdx3, size_t kIdx2, size_t kIdx1, size_t kIdx0, class V,
4546
+ HWY_IF_LANES_GT_D(DFromV<V>, 2)>
4547
+ HWY_API V Per4LaneBlockShuffle(V v) {
4548
+ static_assert(kIdx0 <= 3, "kIdx0 <= 3 must be true");
4549
+ static_assert(kIdx1 <= 3, "kIdx1 <= 3 must be true");
4550
+ static_assert(kIdx2 <= 3, "kIdx2 <= 3 must be true");
4551
+ static_assert(kIdx3 <= 3, "kIdx3 <= 3 must be true");
4552
+
4553
+ constexpr size_t kIdx3210 =
4554
+ (kIdx3 << 6) | (kIdx2 << 4) | (kIdx1 << 2) | kIdx0;
4555
+ return detail::Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210>(), v);
4556
+ }
4557
+ #endif
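
A scalar model of the public Per4LaneBlockShuffle<kIdx3, kIdx2, kIdx1, kIdx0> semantics (hypothetical reference): within every aligned group of four lanes, output lane j takes input lane kIdx_j of the same group.

#include <cstddef>
#include <cstdint>

// idx[j] holds kIdx_j, the source lane for output lane j within each block.
static void Per4LaneBlockShuffleRef(const uint32_t* in, size_t n,
                                    const int idx[4], uint32_t* out) {
  for (size_t i = 0; i < n; ++i) {
    const size_t block = (i / 4) * 4;
    out[i] = in[block + static_cast<size_t>(idx[i % 4])];
  }
}
// Example: idx = {3, 2, 1, 0}, i.e. Per4LaneBlockShuffle<0, 1, 2, 3>(v),
// reverses each 4-lane block; its packed tag (0<<6)|(1<<4)|(2<<2)|3 = 0x1B
// matches the specialization above that forwards to Reverse4.
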
4558
+
4559
+ // ------------------------------ Blocks
4560
+
4561
+ template <class D>
4562
+ HWY_API size_t Blocks(D d) {
4563
+ return (d.MaxBytes() <= 16) ? 1 : ((Lanes(d) * sizeof(TFromD<D>) + 15) / 16);
4564
+ }
4565
+
4566
+ // ------------------------------ Block insert/extract/broadcast ops
4567
+ #if (defined(HWY_NATIVE_BLK_INSERT_EXTRACT) == defined(HWY_TARGET_TOGGLE))
4568
+ #ifdef HWY_NATIVE_BLK_INSERT_EXTRACT
4569
+ #undef HWY_NATIVE_BLK_INSERT_EXTRACT
4570
+ #else
4571
+ #define HWY_NATIVE_BLK_INSERT_EXTRACT
4572
+ #endif
4573
+
4574
+ template <int kBlockIdx, class V, HWY_IF_V_SIZE_LE_V(V, 16)>
4575
+ HWY_API V InsertBlock(V /*v*/, V blk_to_insert) {
4576
+ static_assert(kBlockIdx == 0, "Invalid block index");
4577
+ return blk_to_insert;
4578
+ }
4579
+
4580
+ template <int kBlockIdx, class V, HWY_IF_V_SIZE_LE_V(V, 16)>
4581
+ HWY_API V ExtractBlock(V v) {
4582
+ static_assert(kBlockIdx == 0, "Invalid block index");
4583
+ return v;
4584
+ }
4585
+
4586
+ template <int kBlockIdx, class V, HWY_IF_V_SIZE_LE_V(V, 16)>
4587
+ HWY_API V BroadcastBlock(V v) {
4588
+ static_assert(kBlockIdx == 0, "Invalid block index");
4589
+ return v;
4590
+ }
4591
+
4592
+ #endif // HWY_NATIVE_BLK_INSERT_EXTRACT
4593
+
4594
+ // ------------------------------ BroadcastLane
4595
+ #if (defined(HWY_NATIVE_BROADCASTLANE) == defined(HWY_TARGET_TOGGLE))
4596
+ #ifdef HWY_NATIVE_BROADCASTLANE
4597
+ #undef HWY_NATIVE_BROADCASTLANE
4598
+ #else
4599
+ #define HWY_NATIVE_BROADCASTLANE
4600
+ #endif
4601
+
4602
+ template <int kLane, class V, HWY_IF_V_SIZE_LE_V(V, 16)>
4603
+ HWY_API V BroadcastLane(V v) {
4604
+ return Broadcast<kLane>(v);
4605
+ }
4606
+
4607
+ #endif // HWY_NATIVE_BROADCASTLANE
4608
+
4609
+ // ------------------------------ Slide1Up and Slide1Down
4610
+ #if (defined(HWY_NATIVE_SLIDE1_UP_DOWN) == defined(HWY_TARGET_TOGGLE))
4611
+ #ifdef HWY_NATIVE_SLIDE1_UP_DOWN
4612
+ #undef HWY_NATIVE_SLIDE1_UP_DOWN
4613
+ #else
4614
+ #define HWY_NATIVE_SLIDE1_UP_DOWN
4615
+ #endif
4616
+
4617
+ template <class D, HWY_IF_LANES_D(D, 1)>
4618
+ HWY_API VFromD<D> Slide1Up(D d, VFromD<D> /*v*/) {
4619
+ return Zero(d);
4620
+ }
4621
+ template <class D, HWY_IF_LANES_D(D, 1)>
4622
+ HWY_API VFromD<D> Slide1Down(D d, VFromD<D> /*v*/) {
4623
+ return Zero(d);
4624
+ }
4625
+
4626
+ #if HWY_TARGET != HWY_SCALAR
4627
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_GT_D(D, 1)>
4628
+ HWY_API VFromD<D> Slide1Up(D d, VFromD<D> v) {
4629
+ return ShiftLeftLanes<1>(d, v);
4630
+ }
4631
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_GT_D(D, 1)>
4632
+ HWY_API VFromD<D> Slide1Down(D d, VFromD<D> v) {
4633
+ return ShiftRightLanes<1>(d, v);
4634
+ }
4635
+ #endif // HWY_TARGET != HWY_SCALAR
4636
+
4637
+ #endif // HWY_NATIVE_SLIDE1_UP_DOWN
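
Slide1Up and Slide1Down move every lane by one position and fill the vacated lane with zero, as the ShiftLeftLanes/ShiftRightLanes fallbacks above imply. A scalar model (hypothetical reference):

#include <cstddef>
#include <cstdint>

static void Slide1UpRef(const uint32_t* in, size_t n, uint32_t* out) {
  out[0] = 0;
  for (size_t i = 1; i < n; ++i) out[i] = in[i - 1];
}
static void Slide1DownRef(const uint32_t* in, size_t n, uint32_t* out) {
  for (size_t i = 0; i + 1 < n; ++i) out[i] = in[i + 1];
  out[n - 1] = 0;
}
// Example: {1, 2, 3, 4} -> Slide1Up {0, 1, 2, 3}, Slide1Down {2, 3, 4, 0}.
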
4638
+
4639
+ // ------------------------------ SlideUpBlocks
4640
+
4641
+ template <int kBlocks, class D, HWY_IF_V_SIZE_LE_D(D, 16)>
4642
+ HWY_API VFromD<D> SlideUpBlocks(D /*d*/, VFromD<D> v) {
4643
+ static_assert(kBlocks == 0, "kBlocks == 0 must be true");
4644
+ return v;
4645
+ }
4646
+
4647
+ #if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256
4648
+ template <int kBlocks, class D, HWY_IF_V_SIZE_GT_D(D, 16)>
4649
+ HWY_API VFromD<D> SlideUpBlocks(D d, VFromD<D> v) {
4650
+ static_assert(0 <= kBlocks && static_cast<size_t>(kBlocks) < d.MaxBlocks(),
4651
+ "kBlocks must be between 0 and d.MaxBlocks() - 1");
4652
+ constexpr size_t kLanesPerBlock = 16 / sizeof(TFromD<D>);
4653
+ return SlideUpLanes(d, v, static_cast<size_t>(kBlocks) * kLanesPerBlock);
4654
+ }
4655
+ #endif
4656
+
4657
+ // ------------------------------ SlideDownBlocks
4658
+
4659
+ template <int kBlocks, class D, HWY_IF_V_SIZE_LE_D(D, 16)>
4660
+ HWY_API VFromD<D> SlideDownBlocks(D /*d*/, VFromD<D> v) {
4661
+ static_assert(kBlocks == 0, "kBlocks == 0 must be true");
4662
+ return v;
4663
+ }
4664
+
4665
+ #if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256
4666
+ template <int kBlocks, class D, HWY_IF_V_SIZE_GT_D(D, 16)>
4667
+ HWY_API VFromD<D> SlideDownBlocks(D d, VFromD<D> v) {
4668
+ static_assert(0 <= kBlocks && static_cast<size_t>(kBlocks) < d.MaxBlocks(),
4669
+ "kBlocks must be between 0 and d.MaxBlocks() - 1");
4670
+ constexpr size_t kLanesPerBlock = 16 / sizeof(TFromD<D>);
4671
+ return SlideDownLanes(d, v, static_cast<size_t>(kBlocks) * kLanesPerBlock);
4672
+ }
4673
+ #endif
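
For vectors larger than one 16-byte block, SlideUpBlocks and SlideDownBlocks translate the block count into a lane count of kBlocks * 16 / sizeof(T). A scalar sketch of the SlideUpBlocks case, assuming (as with the lane-wise slides above) that vacated lanes are zero-filled:

#include <cstddef>
#include <cstdint>

static void SlideUpBlocksRef(const uint32_t* in, size_t n, size_t blocks,
                             uint32_t* out) {
  const size_t lanes = blocks * (16 / sizeof(uint32_t));
  for (size_t i = 0; i < n; ++i) {
    out[i] = (i < lanes) ? 0 : in[i - lanes];
  }
}
// Example (8 x uint32_t, blocks = 1): {a,b,c,d,e,f,g,h} -> {0,0,0,0,a,b,c,d}.
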
4674
+
4675
+ // ================================================== Operator wrapper
4676
+
4677
+ // SVE* and RVV currently cannot define operators and have already defined
4678
+ // (only) the corresponding functions such as Add.
4679
+ #if (defined(HWY_NATIVE_OPERATOR_REPLACEMENTS) == defined(HWY_TARGET_TOGGLE))
4680
+ #ifdef HWY_NATIVE_OPERATOR_REPLACEMENTS
4681
+ #undef HWY_NATIVE_OPERATOR_REPLACEMENTS
4682
+ #else
4683
+ #define HWY_NATIVE_OPERATOR_REPLACEMENTS
4684
+ #endif
4685
+
4686
+ template <class V>
4687
+ HWY_API V Add(V a, V b) {
4688
+ return a + b;
4689
+ }
4690
+ template <class V>
4691
+ HWY_API V Sub(V a, V b) {
4692
+ return a - b;
4693
+ }
4694
+
4695
+ template <class V>
4696
+ HWY_API V Mul(V a, V b) {
4697
+ return a * b;
4698
+ }
4699
+ template <class V>
4700
+ HWY_API V Div(V a, V b) {
4701
+ return a / b;
4702
+ }
4703
+
4704
+ template <class V>
4705
+ V Shl(V a, V b) {
4706
+ return a << b;
4707
+ }
4708
+ template <class V>
4709
+ V Shr(V a, V b) {
4710
+ return a >> b;
4711
+ }
4712
+
4713
+ template <class V>
4714
+ HWY_API auto Eq(V a, V b) -> decltype(a == b) {
4715
+ return a == b;
4716
+ }
4717
+ template <class V>
4718
+ HWY_API auto Ne(V a, V b) -> decltype(a == b) {
4719
+ return a != b;
4720
+ }
4721
+ template <class V>
4722
+ HWY_API auto Lt(V a, V b) -> decltype(a == b) {
4723
+ return a < b;
4724
+ }
4725
+
4726
+ template <class V>
4727
+ HWY_API auto Gt(V a, V b) -> decltype(a == b) {
4728
+ return a > b;
4729
+ }
4730
+ template <class V>
4731
+ HWY_API auto Ge(V a, V b) -> decltype(a == b) {
4732
+ return a >= b;
4733
+ }
4734
+
4735
+ template <class V>
4736
+ HWY_API auto Le(V a, V b) -> decltype(a == b) {
4737
+ return a <= b;
4738
+ }
4739
+
4740
+ #endif // HWY_NATIVE_OPERATOR_REPLACEMENTS
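
These named wrappers let generic code spell arithmetic and comparisons uniformly, which matters on SVE and RVV where the vector types do not define operators. A minimal static-dispatch usage sketch follows; the include, namespace layout, and function names are illustrative assumptions in the style of the Highway examples, and the loop assumes n is a multiple of Lanes(d) for brevity.

#include <cstddef>
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace project {
namespace HWY_NAMESPACE {
namespace hn = hwy::HWY_NAMESPACE;

// out[i] = a[i] * b[i] + b[i], written with Add/Mul instead of operators so it
// also compiles for targets without vector operator overloads.
void MulThenAdd(const float* HWY_RESTRICT a, const float* HWY_RESTRICT b,
                float* HWY_RESTRICT out, size_t n) {
  const hn::ScalableTag<float> d;
  for (size_t i = 0; i < n; i += hn::Lanes(d)) {
    const auto va = hn::LoadU(d, a + i);
    const auto vb = hn::LoadU(d, b + i);
    hn::StoreU(hn::Add(hn::Mul(va, vb), vb), d, out + i);
  }
}

}  // namespace HWY_NAMESPACE
}  // namespace project
HWY_AFTER_NAMESPACE();
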
4741
+
4742
+ // NOLINTNEXTLINE(google-readability-namespace-comments)
4743
+ } // namespace HWY_NAMESPACE
4744
+ } // namespace hwy
4745
+ HWY_AFTER_NAMESPACE();