@img/sharp-libvips-dev 0.0.1 → 0.0.2

This diff represents the changes between publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (103)
  1. package/README.md +2 -2
  2. package/cplusplus/VConnection.cpp +54 -54
  3. package/cplusplus/VError.cpp +20 -18
  4. package/cplusplus/VImage.cpp +636 -589
  5. package/cplusplus/VInterpolate.cpp +22 -22
  6. package/cplusplus/VRegion.cpp +4 -4
  7. package/cplusplus/vips-operators.cpp +2326 -2301
  8. package/include/aom/aom_codec.h +10 -6
  9. package/include/aom/aom_decoder.h +1 -1
  10. package/include/aom/aom_encoder.h +9 -2
  11. package/include/aom/aomcx.h +72 -3
  12. package/include/cairo/cairo-ft.h +1 -1
  13. package/include/cairo/cairo-gobject.h +8 -0
  14. package/include/cairo/cairo-svg.h +3 -3
  15. package/include/cairo/cairo-version.h +2 -2
  16. package/include/cairo/cairo.h +91 -24
  17. package/include/harfbuzz/hb-version.h +2 -2
  18. package/include/hwy/aligned_allocator.h +211 -0
  19. package/include/hwy/base.h +1517 -0
  20. package/include/hwy/cache_control.h +108 -0
  21. package/include/hwy/detect_compiler_arch.h +281 -0
  22. package/include/hwy/detect_targets.h +644 -0
  23. package/include/hwy/foreach_target.h +340 -0
  24. package/include/hwy/highway.h +435 -0
  25. package/include/hwy/highway_export.h +74 -0
  26. package/include/hwy/nanobenchmark.h +171 -0
  27. package/include/hwy/ops/arm_neon-inl.h +8913 -0
  28. package/include/hwy/ops/arm_sve-inl.h +5105 -0
  29. package/include/hwy/ops/emu128-inl.h +2811 -0
  30. package/include/hwy/ops/generic_ops-inl.h +4745 -0
  31. package/include/hwy/ops/ppc_vsx-inl.h +5716 -0
  32. package/include/hwy/ops/rvv-inl.h +5070 -0
  33. package/include/hwy/ops/scalar-inl.h +1995 -0
  34. package/include/hwy/ops/set_macros-inl.h +578 -0
  35. package/include/hwy/ops/shared-inl.h +539 -0
  36. package/include/hwy/ops/tuple-inl.h +125 -0
  37. package/include/hwy/ops/wasm_128-inl.h +5917 -0
  38. package/include/hwy/ops/x86_128-inl.h +11173 -0
  39. package/include/hwy/ops/x86_256-inl.h +7529 -0
  40. package/include/hwy/ops/x86_512-inl.h +6849 -0
  41. package/include/hwy/per_target.h +44 -0
  42. package/include/hwy/print-inl.h +62 -0
  43. package/include/hwy/print.h +75 -0
  44. package/include/hwy/robust_statistics.h +148 -0
  45. package/include/hwy/targets.h +338 -0
  46. package/include/hwy/timer-inl.h +200 -0
  47. package/include/hwy/timer.h +55 -0
  48. package/include/jconfig.h +2 -2
  49. package/include/jpeglib.h +3 -2
  50. package/include/libheif/heif.h +443 -377
  51. package/include/libheif/heif_cxx.h +4 -1
  52. package/include/libheif/heif_plugin.h +1 -1
  53. package/include/libheif/heif_properties.h +138 -0
  54. package/include/libheif/heif_regions.h +866 -0
  55. package/include/libheif/heif_version.h +3 -3
  56. package/include/vips/VConnection8.h +43 -49
  57. package/include/vips/VError8.h +27 -24
  58. package/include/vips/VImage8.h +4861 -4597
  59. package/include/vips/VInterpolate8.h +24 -27
  60. package/include/vips/VRegion8.h +32 -33
  61. package/include/vips/arithmetic.h +169 -169
  62. package/include/vips/basic.h +33 -33
  63. package/include/vips/buf.h +56 -54
  64. package/include/vips/colour.h +95 -95
  65. package/include/vips/connection.h +190 -193
  66. package/include/vips/conversion.h +91 -91
  67. package/include/vips/convolution.h +36 -30
  68. package/include/vips/create.h +63 -63
  69. package/include/vips/dbuf.h +35 -37
  70. package/include/vips/debug.h +65 -33
  71. package/include/vips/draw.h +41 -41
  72. package/include/vips/enumtypes.h +54 -51
  73. package/include/vips/error.h +63 -63
  74. package/include/vips/foreign.h +263 -223
  75. package/include/vips/format.h +48 -48
  76. package/include/vips/freqfilt.h +22 -22
  77. package/include/vips/gate.h +55 -47
  78. package/include/vips/generate.h +34 -34
  79. package/include/vips/header.h +111 -101
  80. package/include/vips/histogram.h +28 -28
  81. package/include/vips/image.h +213 -213
  82. package/include/vips/interpolate.h +40 -41
  83. package/include/vips/memory.h +61 -52
  84. package/include/vips/morphology.h +24 -24
  85. package/include/vips/mosaicing.h +32 -33
  86. package/include/vips/object.h +371 -357
  87. package/include/vips/operation.h +68 -67
  88. package/include/vips/private.h +76 -76
  89. package/include/vips/rect.h +26 -26
  90. package/include/vips/region.h +92 -92
  91. package/include/vips/resample.h +38 -38
  92. package/include/vips/sbuf.h +53 -54
  93. package/include/vips/semaphore.h +24 -24
  94. package/include/vips/thread.h +30 -27
  95. package/include/vips/threadpool.h +48 -49
  96. package/include/vips/transform.h +39 -39
  97. package/include/vips/type.h +90 -85
  98. package/include/vips/util.h +274 -229
  99. package/include/vips/vector.h +24 -144
  100. package/include/vips/version.h +9 -9
  101. package/include/vips/vips.h +41 -40
  102. package/package.json +1 -1
  103. package/versions.json +7 -7
package/include/hwy/ops/emu128-inl.h
@@ -0,0 +1,2811 @@
1
+ // Copyright 2022 Google LLC
2
+ // SPDX-License-Identifier: Apache-2.0
3
+ //
4
+ // Licensed under the Apache License, Version 2.0 (the "License");
5
+ // you may not use this file except in compliance with the License.
6
+ // You may obtain a copy of the License at
7
+ //
8
+ // http://www.apache.org/licenses/LICENSE-2.0
9
+ //
10
+ // Unless required by applicable law or agreed to in writing, software
11
+ // distributed under the License is distributed on an "AS IS" BASIS,
12
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ // See the License for the specific language governing permissions and
14
+ // limitations under the License.
15
+
16
+ // Single-element vectors and operations.
17
+ // External include guard in highway.h - see comment there.
18
+
19
+ #include <cmath> // std::abs, std::isnan
20
+
21
+ #include "hwy/ops/shared-inl.h"
22
+
23
+ HWY_BEFORE_NAMESPACE();
24
+ namespace hwy {
25
+ namespace HWY_NAMESPACE {
26
+
27
+ template <typename T>
28
+ using Full128 = Simd<T, 16 / sizeof(T), 0>;
29
+
30
+ // (Wrapper class required for overloading comparison operators.)
31
+ template <typename T, size_t N = 16 / sizeof(T)>
32
+ struct Vec128 {
33
+ using PrivateT = T; // only for DFromV
34
+ static constexpr size_t kPrivateN = N; // only for DFromV
35
+
36
+ HWY_INLINE Vec128() = default;
37
+ Vec128(const Vec128&) = default;
38
+ Vec128& operator=(const Vec128&) = default;
39
+
40
+ HWY_INLINE Vec128& operator*=(const Vec128 other) {
41
+ return *this = (*this * other);
42
+ }
43
+ HWY_INLINE Vec128& operator/=(const Vec128 other) {
44
+ return *this = (*this / other);
45
+ }
46
+ HWY_INLINE Vec128& operator+=(const Vec128 other) {
47
+ return *this = (*this + other);
48
+ }
49
+ HWY_INLINE Vec128& operator-=(const Vec128 other) {
50
+ return *this = (*this - other);
51
+ }
52
+ HWY_INLINE Vec128& operator&=(const Vec128 other) {
53
+ return *this = (*this & other);
54
+ }
55
+ HWY_INLINE Vec128& operator|=(const Vec128 other) {
56
+ return *this = (*this | other);
57
+ }
58
+ HWY_INLINE Vec128& operator^=(const Vec128 other) {
59
+ return *this = (*this ^ other);
60
+ }
61
+
62
+ // Behave like wasm128 (vectors can always hold 128 bits). generic_ops-inl.h
63
+ // relies on this for LoadInterleaved*. CAVEAT: this method of padding
64
+ // prevents using range for, especially in SumOfLanes, where it would be
65
+ // incorrect. Moving padding to another field would require handling the case
66
+ // where N = 16 / sizeof(T) (i.e. there is no padding), which is also awkward.
67
+ T raw[16 / sizeof(T)] = {};
68
+ };
69
+
70
+ // 0 or FF..FF, same size as Vec128.
71
+ template <typename T, size_t N = 16 / sizeof(T)>
72
+ struct Mask128 {
73
+ using Raw = hwy::MakeUnsigned<T>;
74
+ static HWY_INLINE Raw FromBool(bool b) {
75
+ return b ? static_cast<Raw>(~Raw{0}) : 0;
76
+ }
77
+
78
+ // Must match the size of Vec128.
79
+ Raw bits[16 / sizeof(T)] = {};
80
+ };
81
+
82
+ template <class V>
83
+ using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;
84
+
85
+ template <class V>
86
+ using TFromV = typename V::PrivateT;
87
+
88
+ // ------------------------------ Zero
89
+
90
+ // Use HWY_MAX_LANES_D here because VFromD is defined in terms of Zero.
91
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
92
+ HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
93
+ Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> v; // zero-initialized
94
+ return v;
95
+ }
96
+
97
+ template <class D>
98
+ using VFromD = decltype(Zero(D()));
99
+
100
+ // ------------------------------ Tuple (VFromD)
101
+ #include "hwy/ops/tuple-inl.h"
102
+
103
+ // ------------------------------ BitCast
104
+
105
+ template <class D, class VFrom>
106
+ HWY_API VFromD<D> BitCast(D /* tag */, VFrom v) {
107
+ VFromD<D> to;
108
+ CopySameSize(&v, &to);
109
+ return to;
110
+ }
111
+
112
+ // ------------------------------ ResizeBitCast
113
+
114
+ template <class D, class VFrom>
115
+ HWY_API VFromD<D> ResizeBitCast(D d, VFrom v) {
116
+ using DFrom = DFromV<VFrom>;
117
+ using TFrom = TFromD<DFrom>;
118
+ using TTo = TFromD<D>;
119
+
120
+ constexpr size_t kFromByteLen = sizeof(TFrom) * HWY_MAX_LANES_D(DFrom);
121
+ constexpr size_t kToByteLen = sizeof(TTo) * HWY_MAX_LANES_D(D);
122
+ constexpr size_t kCopyByteLen = HWY_MIN(kFromByteLen, kToByteLen);
123
+
124
+ VFromD<D> to = Zero(d);
125
+ CopyBytes<kCopyByteLen>(&v, &to);
126
+ return to;
127
+ }
128
+
129
+ namespace detail {
130
+
131
+ // ResizeBitCast on the HWY_EMU128 target has zero-extending semantics if
132
+ // VFromD<DTo> is a larger vector than FromV
133
+ template <class FromSizeTag, class ToSizeTag, class DTo, class DFrom>
134
+ HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(FromSizeTag /* from_size_tag */,
135
+ ToSizeTag /* to_size_tag */,
136
+ DTo d_to, DFrom /* d_from */,
137
+ VFromD<DFrom> v) {
138
+ return ResizeBitCast(d_to, v);
139
+ }
140
+
141
+ } // namespace detail
142
+
143
+ // ------------------------------ Set
144
+ template <class D, typename T2>
145
+ HWY_API VFromD<D> Set(D d, const T2 t) {
146
+ VFromD<D> v;
147
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
148
+ v.raw[i] = static_cast<TFromD<D>>(t);
149
+ }
150
+ return v;
151
+ }
152
+
153
+ // ------------------------------ Undefined
154
+ template <class D>
155
+ HWY_API VFromD<D> Undefined(D d) {
156
+ return Zero(d);
157
+ }
158
+
159
+ // ------------------------------ Iota
160
+
161
+ template <class D, typename T = TFromD<D>, typename T2>
162
+ HWY_API VFromD<D> Iota(D d, T2 first) {
163
+ VFromD<D> v;
164
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
165
+ v.raw[i] =
166
+ AddWithWraparound(hwy::IsFloatTag<T>(), static_cast<T>(first), i);
167
+ }
168
+ return v;
169
+ }
170
+
171
+ // ================================================== LOGICAL
172
+
173
+ // ------------------------------ Not
174
+ template <typename T, size_t N>
175
+ HWY_API Vec128<T, N> Not(Vec128<T, N> v) {
176
+ const DFromV<decltype(v)> d;
177
+ const RebindToUnsigned<decltype(d)> du;
178
+ using TU = TFromD<decltype(du)>;
179
+ VFromD<decltype(du)> vu = BitCast(du, v);
180
+ for (size_t i = 0; i < N; ++i) {
181
+ vu.raw[i] = static_cast<TU>(~vu.raw[i]);
182
+ }
183
+ return BitCast(d, vu);
184
+ }
185
+
186
+ // ------------------------------ And
187
+ template <typename T, size_t N>
188
+ HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
189
+ const DFromV<decltype(a)> d;
190
+ const RebindToUnsigned<decltype(d)> du;
191
+ auto au = BitCast(du, a);
192
+ auto bu = BitCast(du, b);
193
+ for (size_t i = 0; i < N; ++i) {
194
+ au.raw[i] &= bu.raw[i];
195
+ }
196
+ return BitCast(d, au);
197
+ }
198
+ template <typename T, size_t N>
199
+ HWY_API Vec128<T, N> operator&(Vec128<T, N> a, Vec128<T, N> b) {
200
+ return And(a, b);
201
+ }
202
+
203
+ // ------------------------------ AndNot
204
+ template <typename T, size_t N>
205
+ HWY_API Vec128<T, N> AndNot(Vec128<T, N> a, Vec128<T, N> b) {
206
+ return And(Not(a), b);
207
+ }
208
+
209
+ // ------------------------------ Or
210
+ template <typename T, size_t N>
211
+ HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
212
+ const DFromV<decltype(a)> d;
213
+ const RebindToUnsigned<decltype(d)> du;
214
+ auto au = BitCast(du, a);
215
+ auto bu = BitCast(du, b);
216
+ for (size_t i = 0; i < N; ++i) {
217
+ au.raw[i] |= bu.raw[i];
218
+ }
219
+ return BitCast(d, au);
220
+ }
221
+ template <typename T, size_t N>
222
+ HWY_API Vec128<T, N> operator|(Vec128<T, N> a, Vec128<T, N> b) {
223
+ return Or(a, b);
224
+ }
225
+
226
+ // ------------------------------ Xor
227
+ template <typename T, size_t N>
228
+ HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
229
+ const DFromV<decltype(a)> d;
230
+ const RebindToUnsigned<decltype(d)> du;
231
+ auto au = BitCast(du, a);
232
+ auto bu = BitCast(du, b);
233
+ for (size_t i = 0; i < N; ++i) {
234
+ au.raw[i] ^= bu.raw[i];
235
+ }
236
+ return BitCast(d, au);
237
+ }
238
+ template <typename T, size_t N>
239
+ HWY_API Vec128<T, N> operator^(Vec128<T, N> a, Vec128<T, N> b) {
240
+ return Xor(a, b);
241
+ }
242
+
243
+ // ------------------------------ Xor3
244
+ template <typename T, size_t N>
245
+ HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
246
+ return Xor(x1, Xor(x2, x3));
247
+ }
248
+
249
+ // ------------------------------ Or3
250
+ template <typename T, size_t N>
251
+ HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
252
+ return Or(o1, Or(o2, o3));
253
+ }
254
+
255
+ // ------------------------------ OrAnd
256
+ template <typename T, size_t N>
257
+ HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
258
+ return Or(o, And(a1, a2));
259
+ }
260
+
261
+ // ------------------------------ IfVecThenElse
262
+ template <typename T, size_t N>
263
+ HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
264
+ Vec128<T, N> no) {
265
+ return Or(And(mask, yes), AndNot(mask, no));
266
+ }
267
+
268
+ // ------------------------------ CopySign
269
+ template <typename T, size_t N>
270
+ HWY_API Vec128<T, N> CopySign(Vec128<T, N> magn, Vec128<T, N> sign) {
271
+ static_assert(IsFloat<T>(), "Only makes sense for floating-point");
272
+ const DFromV<decltype(magn)> d;
273
+ return BitwiseIfThenElse(SignBit(d), sign, magn);
274
+ }
275
+
276
+ // ------------------------------ CopySignToAbs
277
+ template <typename T, size_t N>
278
+ HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) {
279
+ static_assert(IsFloat<T>(), "Only makes sense for floating-point");
280
+ const DFromV<decltype(abs)> d;
281
+ return OrAnd(abs, SignBit(d), sign);
282
+ }
283
+
284
+ // ------------------------------ BroadcastSignBit
285
+ template <typename T, size_t N>
286
+ HWY_API Vec128<T, N> BroadcastSignBit(Vec128<T, N> v) {
287
+ // This is used inside ShiftRight, so we cannot implement in terms of it.
288
+ for (size_t i = 0; i < N; ++i) {
289
+ v.raw[i] = v.raw[i] < 0 ? T(-1) : T(0);
290
+ }
291
+ return v;
292
+ }
293
+
294
+ // ------------------------------ Mask
295
+
296
+ // v must be 0 or FF..FF.
297
+ template <typename T, size_t N>
298
+ HWY_API Mask128<T, N> MaskFromVec(Vec128<T, N> v) {
299
+ Mask128<T, N> mask;
300
+ CopySameSize(&v, &mask);
301
+ return mask;
302
+ }
303
+
304
+ template <class D>
305
+ using MFromD = decltype(MaskFromVec(VFromD<D>()));
306
+
307
+ template <class DTo, class MFrom>
308
+ HWY_API MFromD<DTo> RebindMask(DTo /* tag */, MFrom mask) {
309
+ MFromD<DTo> to;
310
+ CopySameSize(&mask, &to);
311
+ return to;
312
+ }
313
+
314
+ template <typename T, size_t N>
315
+ Vec128<T, N> VecFromMask(Mask128<T, N> mask) {
316
+ Vec128<T, N> v;
317
+ CopySameSize(&mask, &v);
318
+ return v;
319
+ }
320
+
321
+ template <class D>
322
+ VFromD<D> VecFromMask(D /* tag */, MFromD<D> mask) {
323
+ return VecFromMask(mask);
324
+ }
325
+
326
+ template <class D>
327
+ HWY_API MFromD<D> FirstN(D d, size_t n) {
328
+ MFromD<D> m;
329
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
330
+ m.bits[i] = MFromD<D>::FromBool(i < n);
331
+ }
332
+ return m;
333
+ }
334
+
335
+ // Returns mask ? yes : no.
336
+ template <typename T, size_t N>
337
+ HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
338
+ Vec128<T, N> no) {
339
+ return IfVecThenElse(VecFromMask(mask), yes, no);
340
+ }
341
+
342
+ template <typename T, size_t N>
343
+ HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
344
+ const DFromV<decltype(yes)> d;
345
+ return IfVecThenElse(VecFromMask(mask), yes, Zero(d));
346
+ }
347
+
348
+ template <typename T, size_t N>
349
+ HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
350
+ const DFromV<decltype(no)> d;
351
+ return IfVecThenElse(VecFromMask(mask), Zero(d), no);
352
+ }
353
+
354
+ template <typename T, size_t N>
355
+ HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
356
+ Vec128<T, N> no) {
357
+ const DFromV<decltype(v)> d;
358
+ const RebindToSigned<decltype(d)> di;
359
+ const auto vi = BitCast(di, v);
360
+
361
+ for (size_t i = 0; i < N; ++i) {
362
+ v.raw[i] = vi.raw[i] < 0 ? yes.raw[i] : no.raw[i];
363
+ }
364
+ return v;
365
+ }
366
+
367
+ template <typename T, size_t N>
368
+ HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
369
+ const DFromV<decltype(v)> d;
370
+ return IfNegativeThenElse(v, Zero(d), v);
371
+ }
372
+
373
+ // ------------------------------ Mask logical
374
+
375
+ template <typename T, size_t N>
376
+ HWY_API Mask128<T, N> Not(Mask128<T, N> m) {
377
+ return MaskFromVec(Not(VecFromMask(Simd<T, N, 0>(), m)));
378
+ }
379
+
380
+ template <typename T, size_t N>
381
+ HWY_API Mask128<T, N> And(Mask128<T, N> a, Mask128<T, N> b) {
382
+ const Simd<T, N, 0> d;
383
+ return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
384
+ }
385
+
386
+ template <typename T, size_t N>
387
+ HWY_API Mask128<T, N> AndNot(Mask128<T, N> a, Mask128<T, N> b) {
388
+ const Simd<T, N, 0> d;
389
+ return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
390
+ }
391
+
392
+ template <typename T, size_t N>
393
+ HWY_API Mask128<T, N> Or(Mask128<T, N> a, Mask128<T, N> b) {
394
+ const Simd<T, N, 0> d;
395
+ return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
396
+ }
397
+
398
+ template <typename T, size_t N>
399
+ HWY_API Mask128<T, N> Xor(Mask128<T, N> a, Mask128<T, N> b) {
400
+ const Simd<T, N, 0> d;
401
+ return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
402
+ }
403
+
404
+ template <typename T, size_t N>
405
+ HWY_API Mask128<T, N> ExclusiveNeither(Mask128<T, N> a, Mask128<T, N> b) {
406
+ const Simd<T, N, 0> d;
407
+ return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
408
+ }
409
+
410
+ // ================================================== SHIFTS
411
+
412
+ // ------------------------------ ShiftLeft/ShiftRight (BroadcastSignBit)
413
+
414
+ template <int kBits, typename T, size_t N>
415
+ HWY_API Vec128<T, N> ShiftLeft(Vec128<T, N> v) {
416
+ static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
417
+ using TU = hwy::MakeUnsigned<T>;
418
+ for (size_t i = 0; i < N; ++i) {
419
+ const TU raw_u = static_cast<TU>(v.raw[i]);
420
+ const auto shifted = raw_u << kBits; // separate line to avoid MSVC warning
421
+ v.raw[i] = static_cast<T>(shifted);
422
+ }
423
+ return v;
424
+ }
425
+
426
+ template <int kBits, typename T, size_t N>
427
+ HWY_API Vec128<T, N> ShiftRight(Vec128<T, N> v) {
428
+ static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
429
+ #if __cplusplus >= 202002L
430
+ // Signed right shift is now guaranteed to be arithmetic (rounding toward
431
+ // negative infinity, i.e. shifting in the sign bit).
432
+ for (size_t i = 0; i < N; ++i) {
433
+ v.raw[i] = static_cast<T>(v.raw[i] >> kBits);
434
+ }
435
+ #else
436
+ if (IsSigned<T>()) {
437
+ // Emulate arithmetic shift using only logical (unsigned) shifts, because
438
+ // signed shifts are still implementation-defined.
439
+ using TU = hwy::MakeUnsigned<T>;
440
+ for (size_t i = 0; i < N; ++i) {
441
+ const TU shifted = static_cast<TU>(static_cast<TU>(v.raw[i]) >> kBits);
442
+ const TU sign = v.raw[i] < 0 ? static_cast<TU>(~TU{0}) : 0;
443
+ const size_t sign_shift =
444
+ static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - kBits);
445
+ const TU upper = static_cast<TU>(sign << sign_shift);
446
+ v.raw[i] = static_cast<T>(shifted | upper);
447
+ }
448
+ } else { // T is unsigned
449
+ for (size_t i = 0; i < N; ++i) {
450
+ v.raw[i] = static_cast<T>(v.raw[i] >> kBits);
451
+ }
452
+ }
453
+ #endif
454
+ return v;
455
+ }
456
+
457
+ // ------------------------------ RotateRight (ShiftRight)
458
+ template <int kBits, typename T, size_t N>
459
+ HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
460
+ constexpr size_t kSizeInBits = sizeof(T) * 8;
461
+ static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
462
+ if (kBits == 0) return v;
463
+ return Or(ShiftRight<kBits>(v),
464
+ ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
465
+ }
466
+
467
+ // ------------------------------ ShiftLeftSame
468
+
469
+ template <typename T, size_t N>
470
+ HWY_API Vec128<T, N> ShiftLeftSame(Vec128<T, N> v, int bits) {
471
+ for (size_t i = 0; i < N; ++i) {
472
+ const auto shifted = static_cast<hwy::MakeUnsigned<T>>(v.raw[i]) << bits;
473
+ v.raw[i] = static_cast<T>(shifted);
474
+ }
475
+ return v;
476
+ }
477
+
478
+ template <typename T, size_t N>
479
+ HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, int bits) {
480
+ #if __cplusplus >= 202002L
481
+ // Signed right shift is now guaranteed to be arithmetic (rounding toward
482
+ // negative infinity, i.e. shifting in the sign bit).
483
+ for (size_t i = 0; i < N; ++i) {
484
+ v.raw[i] = static_cast<T>(v.raw[i] >> bits);
485
+ }
486
+ #else
487
+ if (IsSigned<T>()) {
488
+ // Emulate arithmetic shift using only logical (unsigned) shifts, because
489
+ // signed shifts are still implementation-defined.
490
+ using TU = hwy::MakeUnsigned<T>;
491
+ for (size_t i = 0; i < N; ++i) {
492
+ const TU shifted = static_cast<TU>(static_cast<TU>(v.raw[i]) >> bits);
493
+ const TU sign = v.raw[i] < 0 ? static_cast<TU>(~TU{0}) : 0;
494
+ const size_t sign_shift =
495
+ static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - bits);
496
+ const TU upper = static_cast<TU>(sign << sign_shift);
497
+ v.raw[i] = static_cast<T>(shifted | upper);
498
+ }
499
+ } else {
500
+ for (size_t i = 0; i < N; ++i) {
501
+ v.raw[i] = static_cast<T>(v.raw[i] >> bits); // unsigned, logical shift
502
+ }
503
+ }
504
+ #endif
505
+ return v;
506
+ }
507
+
508
+ // ------------------------------ Shl
509
+
510
+ template <typename T, size_t N>
511
+ HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, Vec128<T, N> bits) {
512
+ for (size_t i = 0; i < N; ++i) {
513
+ const auto shifted = static_cast<hwy::MakeUnsigned<T>>(v.raw[i])
514
+ << bits.raw[i];
515
+ v.raw[i] = static_cast<T>(shifted);
516
+ }
517
+ return v;
518
+ }
519
+
520
+ template <typename T, size_t N>
521
+ HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, Vec128<T, N> bits) {
522
+ #if __cplusplus >= 202002L
523
+ // Signed right shift is now guaranteed to be arithmetic (rounding toward
524
+ // negative infinity, i.e. shifting in the sign bit).
525
+ for (size_t i = 0; i < N; ++i) {
526
+ v.raw[i] = static_cast<T>(v.raw[i] >> bits.raw[i]);
527
+ }
528
+ #else
529
+ if (IsSigned<T>()) {
530
+ // Emulate arithmetic shift using only logical (unsigned) shifts, because
531
+ // signed shifts are still implementation-defined.
532
+ using TU = hwy::MakeUnsigned<T>;
533
+ for (size_t i = 0; i < N; ++i) {
534
+ const TU shifted =
535
+ static_cast<TU>(static_cast<TU>(v.raw[i]) >> bits.raw[i]);
536
+ const TU sign = v.raw[i] < 0 ? static_cast<TU>(~TU{0}) : 0;
537
+ const size_t sign_shift = static_cast<size_t>(
538
+ static_cast<int>(sizeof(TU)) * 8 - 1 - bits.raw[i]);
539
+ const TU upper = static_cast<TU>(sign << sign_shift);
540
+ v.raw[i] = static_cast<T>(shifted | upper);
541
+ }
542
+ } else { // T is unsigned
543
+ for (size_t i = 0; i < N; ++i) {
544
+ v.raw[i] = static_cast<T>(v.raw[i] >> bits.raw[i]);
545
+ }
546
+ }
547
+ #endif
548
+ return v;
549
+ }
550
+
551
+ // ================================================== ARITHMETIC
552
+
553
+ // Tag dispatch instead of SFINAE for MSVC 2017 compatibility
554
+ namespace detail {
555
+
556
+ template <typename T, size_t N>
557
+ HWY_INLINE Vec128<T, N> Add(hwy::NonFloatTag /*tag*/, Vec128<T, N> a,
558
+ Vec128<T, N> b) {
559
+ for (size_t i = 0; i < N; ++i) {
560
+ const uint64_t a64 = static_cast<uint64_t>(a.raw[i]);
561
+ const uint64_t b64 = static_cast<uint64_t>(b.raw[i]);
562
+ a.raw[i] = static_cast<T>((a64 + b64) & static_cast<uint64_t>(~T(0)));
563
+ }
564
+ return a;
565
+ }
566
+ template <typename T, size_t N>
567
+ HWY_INLINE Vec128<T, N> Sub(hwy::NonFloatTag /*tag*/, Vec128<T, N> a,
568
+ Vec128<T, N> b) {
569
+ for (size_t i = 0; i < N; ++i) {
570
+ const uint64_t a64 = static_cast<uint64_t>(a.raw[i]);
571
+ const uint64_t b64 = static_cast<uint64_t>(b.raw[i]);
572
+ a.raw[i] = static_cast<T>((a64 - b64) & static_cast<uint64_t>(~T(0)));
573
+ }
574
+ return a;
575
+ }
576
+
577
+ template <typename T, size_t N>
578
+ HWY_INLINE Vec128<T, N> Add(hwy::FloatTag /*tag*/, Vec128<T, N> a,
579
+ Vec128<T, N> b) {
580
+ for (size_t i = 0; i < N; ++i) {
581
+ a.raw[i] += b.raw[i];
582
+ }
583
+ return a;
584
+ }
585
+
586
+ template <typename T, size_t N>
587
+ HWY_INLINE Vec128<T, N> Sub(hwy::FloatTag /*tag*/, Vec128<T, N> a,
588
+ Vec128<T, N> b) {
589
+ for (size_t i = 0; i < N; ++i) {
590
+ a.raw[i] -= b.raw[i];
591
+ }
592
+ return a;
593
+ }
594
+
595
+ } // namespace detail
596
+
597
+ template <typename T, size_t N>
598
+ HWY_API Vec128<T, N> operator-(Vec128<T, N> a, Vec128<T, N> b) {
599
+ return detail::Sub(hwy::IsFloatTag<T>(), a, b);
600
+ }
601
+ template <typename T, size_t N>
602
+ HWY_API Vec128<T, N> operator+(Vec128<T, N> a, Vec128<T, N> b) {
603
+ return detail::Add(hwy::IsFloatTag<T>(), a, b);
604
+ }
605
+
606
+ // ------------------------------ SumsOf8
607
+
608
+ template <size_t N>
609
+ HWY_API Vec128<uint64_t, (N + 7) / 8> SumsOf8(Vec128<uint8_t, N> v) {
610
+ Vec128<uint64_t, (N + 7) / 8> sums;
611
+ for (size_t i = 0; i < N; ++i) {
612
+ sums.raw[i / 8] += v.raw[i];
613
+ }
614
+ return sums;
615
+ }
616
+
617
+ // ------------------------------ SaturatedAdd
618
+ template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
619
+ HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
620
+ HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, Vec128<T, N> b) {
621
+ using TW = MakeSigned<MakeWide<T>>;
622
+ for (size_t i = 0; i < N; ++i) {
623
+ a.raw[i] = static_cast<T>(HWY_MIN(
624
+ HWY_MAX(hwy::LowestValue<T>(), static_cast<TW>(a.raw[i]) + b.raw[i]),
625
+ hwy::HighestValue<T>()));
626
+ }
627
+ return a;
628
+ }
629
+
630
+ // ------------------------------ SaturatedSub
631
+ template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
632
+ HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
633
+ HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, Vec128<T, N> b) {
634
+ using TW = MakeSigned<MakeWide<T>>;
635
+ for (size_t i = 0; i < N; ++i) {
636
+ a.raw[i] = static_cast<T>(HWY_MIN(
637
+ HWY_MAX(hwy::LowestValue<T>(), static_cast<TW>(a.raw[i]) - b.raw[i]),
638
+ hwy::HighestValue<T>()));
639
+ }
640
+ return a;
641
+ }
642
+
643
+ // ------------------------------ AverageRound
644
+ template <typename T, size_t N>
645
+ HWY_API Vec128<T, N> AverageRound(Vec128<T, N> a, Vec128<T, N> b) {
646
+ static_assert(!IsSigned<T>(), "Only for unsigned");
647
+ for (size_t i = 0; i < N; ++i) {
648
+ a.raw[i] = static_cast<T>((a.raw[i] + b.raw[i] + 1) / 2);
649
+ }
650
+ return a;
651
+ }
652
+
653
+ // ------------------------------ Abs
654
+
655
+ // Tag dispatch instead of SFINAE for MSVC 2017 compatibility
656
+ namespace detail {
657
+
658
+ template <typename T, size_t N>
659
+ HWY_INLINE Vec128<T, N> Abs(SignedTag /*tag*/, Vec128<T, N> a) {
660
+ for (size_t i = 0; i < N; ++i) {
661
+ const T s = a.raw[i];
662
+ const T min = hwy::LimitsMin<T>();
663
+ a.raw[i] = static_cast<T>((s >= 0 || s == min) ? a.raw[i] : -s);
664
+ }
665
+ return a;
666
+ }
667
+
668
+ template <typename T, size_t N>
669
+ HWY_INLINE Vec128<T, N> Abs(hwy::FloatTag /*tag*/, Vec128<T, N> v) {
670
+ for (size_t i = 0; i < N; ++i) {
671
+ v.raw[i] = std::abs(v.raw[i]);
672
+ }
673
+ return v;
674
+ }
675
+
676
+ } // namespace detail
677
+
678
+ template <typename T, size_t N>
679
+ HWY_API Vec128<T, N> Abs(Vec128<T, N> a) {
680
+ return detail::Abs(hwy::TypeTag<T>(), a);
681
+ }
682
+
683
+ // ------------------------------ Min/Max
684
+
685
+ // Tag dispatch instead of SFINAE for MSVC 2017 compatibility
686
+ namespace detail {
687
+
688
+ template <typename T, size_t N>
689
+ HWY_INLINE Vec128<T, N> Min(hwy::NonFloatTag /*tag*/, Vec128<T, N> a,
690
+ Vec128<T, N> b) {
691
+ for (size_t i = 0; i < N; ++i) {
692
+ a.raw[i] = HWY_MIN(a.raw[i], b.raw[i]);
693
+ }
694
+ return a;
695
+ }
696
+ template <typename T, size_t N>
697
+ HWY_INLINE Vec128<T, N> Max(hwy::NonFloatTag /*tag*/, Vec128<T, N> a,
698
+ Vec128<T, N> b) {
699
+ for (size_t i = 0; i < N; ++i) {
700
+ a.raw[i] = HWY_MAX(a.raw[i], b.raw[i]);
701
+ }
702
+ return a;
703
+ }
704
+
705
+ template <typename T, size_t N>
706
+ HWY_INLINE Vec128<T, N> Min(hwy::FloatTag /*tag*/, Vec128<T, N> a,
707
+ Vec128<T, N> b) {
708
+ for (size_t i = 0; i < N; ++i) {
709
+ if (std::isnan(a.raw[i])) {
710
+ a.raw[i] = b.raw[i];
711
+ } else if (std::isnan(b.raw[i])) {
712
+ // no change
713
+ } else {
714
+ a.raw[i] = HWY_MIN(a.raw[i], b.raw[i]);
715
+ }
716
+ }
717
+ return a;
718
+ }
719
+ template <typename T, size_t N>
720
+ HWY_INLINE Vec128<T, N> Max(hwy::FloatTag /*tag*/, Vec128<T, N> a,
721
+ Vec128<T, N> b) {
722
+ for (size_t i = 0; i < N; ++i) {
723
+ if (std::isnan(a.raw[i])) {
724
+ a.raw[i] = b.raw[i];
725
+ } else if (std::isnan(b.raw[i])) {
726
+ // no change
727
+ } else {
728
+ a.raw[i] = HWY_MAX(a.raw[i], b.raw[i]);
729
+ }
730
+ }
731
+ return a;
732
+ }
733
+
734
+ } // namespace detail
735
+
736
+ template <typename T, size_t N>
737
+ HWY_API Vec128<T, N> Min(Vec128<T, N> a, Vec128<T, N> b) {
738
+ return detail::Min(hwy::IsFloatTag<T>(), a, b);
739
+ }
740
+
741
+ template <typename T, size_t N>
742
+ HWY_API Vec128<T, N> Max(Vec128<T, N> a, Vec128<T, N> b) {
743
+ return detail::Max(hwy::IsFloatTag<T>(), a, b);
744
+ }
745
+
746
+ // ------------------------------ Neg
747
+
748
+ // Tag dispatch instead of SFINAE for MSVC 2017 compatibility
749
+ namespace detail {
750
+
751
+ template <typename T, size_t N>
752
+ HWY_API Vec128<T, N> Neg(hwy::NonFloatTag /*tag*/, Vec128<T, N> v) {
753
+ const DFromV<decltype(v)> d;
754
+ return Zero(d) - v;
755
+ }
756
+
757
+ template <typename T, size_t N>
758
+ HWY_API Vec128<T, N> Neg(hwy::FloatTag /*tag*/, Vec128<T, N> v) {
759
+ const DFromV<decltype(v)> d;
760
+ return Xor(v, SignBit(d));
761
+ }
762
+
763
+ template <typename T, size_t N>
764
+ HWY_API Vec128<T, N> Neg(hwy::SpecialTag /*tag*/, Vec128<T, N> v) {
765
+ const DFromV<decltype(v)> d;
766
+ return Xor(v, SignBit(d));
767
+ }
768
+
769
+ } // namespace detail
770
+
771
+ template <typename T, size_t N>
772
+ HWY_API Vec128<T, N> Neg(Vec128<T, N> v) {
773
+ return detail::Neg(hwy::IsFloatTag<T>(), v);
774
+ }
775
+
776
+ // ------------------------------ Mul/Div
777
+
778
+ // Tag dispatch instead of SFINAE for MSVC 2017 compatibility
779
+ namespace detail {
780
+
781
+ template <typename T, size_t N>
782
+ HWY_INLINE Vec128<T, N> Mul(hwy::FloatTag /*tag*/, Vec128<T, N> a,
783
+ Vec128<T, N> b) {
784
+ for (size_t i = 0; i < N; ++i) {
785
+ a.raw[i] *= b.raw[i];
786
+ }
787
+ return a;
788
+ }
789
+
790
+ template <typename T, size_t N>
791
+ HWY_INLINE Vec128<T, N> Mul(SignedTag /*tag*/, Vec128<T, N> a, Vec128<T, N> b) {
792
+ for (size_t i = 0; i < N; ++i) {
793
+ a.raw[i] = static_cast<T>(static_cast<uint64_t>(a.raw[i]) *
794
+ static_cast<uint64_t>(b.raw[i]));
795
+ }
796
+ return a;
797
+ }
798
+
799
+ template <typename T, size_t N>
800
+ HWY_INLINE Vec128<T, N> Mul(UnsignedTag /*tag*/, Vec128<T, N> a,
801
+ Vec128<T, N> b) {
802
+ for (size_t i = 0; i < N; ++i) {
803
+ a.raw[i] = static_cast<T>(static_cast<uint64_t>(a.raw[i]) *
804
+ static_cast<uint64_t>(b.raw[i]));
805
+ }
806
+ return a;
807
+ }
808
+
809
+ } // namespace detail
810
+
811
+ // Per-target flags to prevent generic_ops-inl.h defining 8/64-bit operator*.
812
+ #ifdef HWY_NATIVE_MUL_8
813
+ #undef HWY_NATIVE_MUL_8
814
+ #else
815
+ #define HWY_NATIVE_MUL_8
816
+ #endif
817
+ #ifdef HWY_NATIVE_MUL_64
818
+ #undef HWY_NATIVE_MUL_64
819
+ #else
820
+ #define HWY_NATIVE_MUL_64
821
+ #endif
822
+
823
+ template <typename T, size_t N>
824
+ HWY_API Vec128<T, N> operator*(Vec128<T, N> a, Vec128<T, N> b) {
825
+ return detail::Mul(hwy::TypeTag<T>(), a, b);
826
+ }
827
+
828
+ template <typename T, size_t N>
829
+ HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
830
+ for (size_t i = 0; i < N; ++i) {
831
+ a.raw[i] = (b.raw[i] == T{0}) ? 0 : a.raw[i] / b.raw[i];
832
+ }
833
+ return a;
834
+ }
835
+
836
+ // Returns the upper 16 bits of a * b in each lane.
837
+ template <size_t N>
838
+ HWY_API Vec128<int16_t, N> MulHigh(Vec128<int16_t, N> a, Vec128<int16_t, N> b) {
839
+ for (size_t i = 0; i < N; ++i) {
840
+ a.raw[i] = static_cast<int16_t>((int32_t{a.raw[i]} * b.raw[i]) >> 16);
841
+ }
842
+ return a;
843
+ }
844
+ template <size_t N>
845
+ HWY_API Vec128<uint16_t, N> MulHigh(Vec128<uint16_t, N> a,
846
+ Vec128<uint16_t, N> b) {
847
+ for (size_t i = 0; i < N; ++i) {
848
+ // Cast to uint32_t first to prevent overflow. Otherwise the result of
849
+ // uint16_t * uint16_t is in "int" which may overflow. In practice the
850
+ // result is the same but this way it is also defined.
851
+ a.raw[i] = static_cast<uint16_t>(
852
+ (static_cast<uint32_t>(a.raw[i]) * static_cast<uint32_t>(b.raw[i])) >>
853
+ 16);
854
+ }
855
+ return a;
856
+ }
857
+
858
+ template <size_t N>
859
+ HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
860
+ Vec128<int16_t, N> b) {
861
+ for (size_t i = 0; i < N; ++i) {
862
+ a.raw[i] = static_cast<int16_t>((a.raw[i] * b.raw[i] + 16384) >> 15);
863
+ }
864
+ return a;
865
+ }
866
+
867
+ // Multiplies even lanes (0, 2, ..) and returns the double-wide result.
868
+ template <class T, size_t N,
869
+ HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
870
+ HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
871
+ HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulEven(Vec128<T, N> a,
872
+ Vec128<T, N> b) {
873
+ using TW = MakeWide<T>;
874
+ Vec128<TW, (N + 1) / 2> mul;
875
+ for (size_t i = 0; i < N; i += 2) {
876
+ const TW a_wide = a.raw[i];
877
+ mul.raw[i / 2] = static_cast<TW>(a_wide * b.raw[i]);
878
+ }
879
+ return mul;
880
+ }
881
+
882
+ // Multiplies odd lanes (1, 3, ..) and returns the double-wide result.
883
+ template <class T, size_t N,
884
+ HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
885
+ HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
886
+ HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulOdd(Vec128<T, N> a,
887
+ Vec128<T, N> b) {
888
+ using TW = MakeWide<T>;
889
+ Vec128<TW, (N + 1) / 2> mul;
890
+ for (size_t i = 0; i < N; i += 2) {
891
+ const TW a_wide = a.raw[i + 1];
892
+ mul.raw[i / 2] = static_cast<TW>(a_wide * b.raw[i + 1]);
893
+ }
894
+ return mul;
895
+ }
896
+
897
+ template <size_t N>
898
+ HWY_API Vec128<float, N> ApproximateReciprocal(Vec128<float, N> v) {
899
+ for (size_t i = 0; i < N; ++i) {
900
+ // Zero inputs are allowed, but callers are responsible for replacing the
901
+ // return value with something else (typically using IfThenElse). This check
902
+ // avoids a ubsan error. The result is arbitrary.
903
+ v.raw[i] = (std::abs(v.raw[i]) == 0.0f) ? 0.0f : 1.0f / v.raw[i];
904
+ }
905
+ return v;
906
+ }
907
+
908
+ // generic_ops takes care of integer T.
909
+ template <typename T, size_t N, HWY_IF_FLOAT(T)>
910
+ HWY_API Vec128<T, N> AbsDiff(Vec128<T, N> a, Vec128<T, N> b) {
911
+ return Abs(a - b);
912
+ }
913
+
914
+ // ------------------------------ Floating-point multiply-add variants
915
+
916
+ template <typename T, size_t N>
917
+ HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x,
918
+ Vec128<T, N> add) {
919
+ return mul * x + add;
920
+ }
921
+
922
+ template <typename T, size_t N>
923
+ HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x,
924
+ Vec128<T, N> add) {
925
+ return add - mul * x;
926
+ }
927
+
928
+ template <typename T, size_t N>
929
+ HWY_API Vec128<T, N> MulSub(Vec128<T, N> mul, Vec128<T, N> x,
930
+ Vec128<T, N> sub) {
931
+ return mul * x - sub;
932
+ }
933
+
934
+ template <typename T, size_t N>
935
+ HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x,
936
+ Vec128<T, N> sub) {
937
+ return Neg(mul) * x - sub;
938
+ }
939
+
940
+ // ------------------------------ Floating-point square root
941
+
942
+ template <size_t N>
943
+ HWY_API Vec128<float, N> ApproximateReciprocalSqrt(Vec128<float, N> v) {
944
+ for (size_t i = 0; i < N; ++i) {
945
+ const float half = v.raw[i] * 0.5f;
946
+ uint32_t bits;
947
+ CopySameSize(&v.raw[i], &bits);
948
+ // Initial guess based on log2(f)
949
+ bits = 0x5F3759DF - (bits >> 1);
950
+ CopySameSize(&bits, &v.raw[i]);
951
+ // One Newton-Raphson iteration
952
+ v.raw[i] = v.raw[i] * (1.5f - (half * v.raw[i] * v.raw[i]));
953
+ }
954
+ return v;
955
+ }
956
+
957
+ template <typename T, size_t N>
958
+ HWY_API Vec128<T, N> Sqrt(Vec128<T, N> v) {
959
+ for (size_t i = 0; i < N; ++i) {
960
+ v.raw[i] = std::sqrt(v.raw[i]);
961
+ }
962
+ return v;
963
+ }
964
+
965
+ // ------------------------------ Floating-point rounding
966
+
967
+ template <typename T, size_t N>
968
+ HWY_API Vec128<T, N> Round(Vec128<T, N> v) {
969
+ using TI = MakeSigned<T>;
970
+ const Vec128<T, N> a = Abs(v);
971
+ for (size_t i = 0; i < N; ++i) {
972
+ if (!(a.raw[i] < MantissaEnd<T>())) { // Huge or NaN
973
+ continue;
974
+ }
975
+ const T bias = v.raw[i] < T(0.0) ? T(-0.5) : T(0.5);
976
+ const TI rounded = static_cast<TI>(v.raw[i] + bias);
977
+ if (rounded == 0) {
978
+ v.raw[i] = v.raw[i] < 0 ? T{-0} : T{0};
979
+ continue;
980
+ }
981
+ const T rounded_f = static_cast<T>(rounded);
982
+ // Round to even
983
+ if ((rounded & 1) && std::abs(rounded_f - v.raw[i]) == T(0.5)) {
984
+ v.raw[i] = static_cast<T>(rounded - (v.raw[i] < T(0) ? -1 : 1));
985
+ continue;
986
+ }
987
+ v.raw[i] = rounded_f;
988
+ }
989
+ return v;
990
+ }
991
+
992
+ // Round-to-nearest even.
993
+ template <size_t N>
994
+ HWY_API Vec128<int32_t, N> NearestInt(Vec128<float, N> v) {
995
+ using T = float;
996
+ using TI = int32_t;
997
+
998
+ const Vec128<float, N> abs = Abs(v);
999
+ Vec128<int32_t, N> ret;
1000
+ for (size_t i = 0; i < N; ++i) {
1001
+ const bool signbit = std::signbit(v.raw[i]);
1002
+
1003
+ if (!(abs.raw[i] < MantissaEnd<T>())) { // Huge or NaN
1004
+ // Check if too large to cast or NaN
1005
+ if (!(abs.raw[i] <= static_cast<T>(LimitsMax<TI>()))) {
1006
+ ret.raw[i] = signbit ? LimitsMin<TI>() : LimitsMax<TI>();
1007
+ continue;
1008
+ }
1009
+ ret.raw[i] = static_cast<TI>(v.raw[i]);
1010
+ continue;
1011
+ }
1012
+ const T bias = v.raw[i] < T(0.0) ? T(-0.5) : T(0.5);
1013
+ const TI rounded = static_cast<TI>(v.raw[i] + bias);
1014
+ if (rounded == 0) {
1015
+ ret.raw[i] = 0;
1016
+ continue;
1017
+ }
1018
+ const T rounded_f = static_cast<T>(rounded);
1019
+ // Round to even
1020
+ if ((rounded & 1) && std::abs(rounded_f - v.raw[i]) == T(0.5)) {
1021
+ ret.raw[i] = rounded - (signbit ? -1 : 1);
1022
+ continue;
1023
+ }
1024
+ ret.raw[i] = rounded;
1025
+ }
1026
+ return ret;
1027
+ }
1028
+
1029
+ template <typename T, size_t N>
1030
+ HWY_API Vec128<T, N> Trunc(Vec128<T, N> v) {
1031
+ using TI = MakeSigned<T>;
1032
+ const Vec128<T, N> abs = Abs(v);
1033
+ for (size_t i = 0; i < N; ++i) {
1034
+ if (!(abs.raw[i] <= MantissaEnd<T>())) { // Huge or NaN
1035
+ continue;
1036
+ }
1037
+ const TI truncated = static_cast<TI>(v.raw[i]);
1038
+ if (truncated == 0) {
1039
+ v.raw[i] = v.raw[i] < 0 ? -T{0} : T{0};
1040
+ continue;
1041
+ }
1042
+ v.raw[i] = static_cast<T>(truncated);
1043
+ }
1044
+ return v;
1045
+ }
1046
+
1047
+ // Toward +infinity, aka ceiling
1048
+ template <typename Float, size_t N>
1049
+ Vec128<Float, N> Ceil(Vec128<Float, N> v) {
1050
+ constexpr int kMantissaBits = MantissaBits<Float>();
1051
+ using Bits = MakeUnsigned<Float>;
1052
+ const Bits kExponentMask = MaxExponentField<Float>();
1053
+ const Bits kMantissaMask = MantissaMask<Float>();
1054
+ const Bits kBias = kExponentMask / 2;
1055
+
1056
+ for (size_t i = 0; i < N; ++i) {
1057
+ const bool positive = v.raw[i] > Float(0.0);
1058
+
1059
+ Bits bits;
1060
+ CopySameSize(&v.raw[i], &bits);
1061
+
1062
+ const int exponent =
1063
+ static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
1064
+ // Already an integer.
1065
+ if (exponent >= kMantissaBits) continue;
1066
+ // |v| <= 1 => 0 or 1.
1067
+ if (exponent < 0) {
1068
+ v.raw[i] = positive ? Float{1} : Float{-0.0};
1069
+ continue;
1070
+ }
1071
+
1072
+ const Bits mantissa_mask = kMantissaMask >> exponent;
1073
+ // Already an integer
1074
+ if ((bits & mantissa_mask) == 0) continue;
1075
+
1076
+ // Clear fractional bits and round up
1077
+ if (positive) bits += (kMantissaMask + 1) >> exponent;
1078
+ bits &= ~mantissa_mask;
1079
+
1080
+ CopySameSize(&bits, &v.raw[i]);
1081
+ }
1082
+ return v;
1083
+ }
1084
+
1085
+ // Toward -infinity, aka floor
1086
+ template <typename Float, size_t N>
1087
+ Vec128<Float, N> Floor(Vec128<Float, N> v) {
1088
+ constexpr int kMantissaBits = MantissaBits<Float>();
1089
+ using Bits = MakeUnsigned<Float>;
1090
+ const Bits kExponentMask = MaxExponentField<Float>();
1091
+ const Bits kMantissaMask = MantissaMask<Float>();
1092
+ const Bits kBias = kExponentMask / 2;
1093
+
1094
+ for (size_t i = 0; i < N; ++i) {
1095
+ const bool negative = v.raw[i] < Float(0.0);
1096
+
1097
+ Bits bits;
1098
+ CopySameSize(&v.raw[i], &bits);
1099
+
1100
+ const int exponent =
1101
+ static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
1102
+ // Already an integer.
1103
+ if (exponent >= kMantissaBits) continue;
1104
+ // |v| <= 1 => -1 or 0.
1105
+ if (exponent < 0) {
1106
+ v.raw[i] = negative ? Float(-1.0) : Float(0.0);
1107
+ continue;
1108
+ }
1109
+
1110
+ const Bits mantissa_mask = kMantissaMask >> exponent;
1111
+ // Already an integer
1112
+ if ((bits & mantissa_mask) == 0) continue;
1113
+
1114
+ // Clear fractional bits and round down
1115
+ if (negative) bits += (kMantissaMask + 1) >> exponent;
1116
+ bits &= ~mantissa_mask;
1117
+
1118
+ CopySameSize(&bits, &v.raw[i]);
1119
+ }
1120
+ return v;
1121
+ }
1122
+
1123
+ // ------------------------------ Floating-point classification
1124
+
1125
+ template <typename T, size_t N>
1126
+ HWY_API Mask128<T, N> IsNaN(Vec128<T, N> v) {
1127
+ Mask128<T, N> ret;
1128
+ for (size_t i = 0; i < N; ++i) {
1129
+ // std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY.
1130
+ MakeUnsigned<T> bits;
1131
+ CopySameSize(&v.raw[i], &bits);
1132
+ bits += bits;
1133
+ bits >>= 1; // clear sign bit
1134
+ // NaN if all exponent bits are set and the mantissa is not zero.
1135
+ ret.bits[i] = Mask128<T, N>::FromBool(bits > ExponentMask<T>());
1136
+ }
1137
+ return ret;
1138
+ }
1139
+
1140
+ template <typename T, size_t N>
1141
+ HWY_API Mask128<T, N> IsInf(Vec128<T, N> v) {
1142
+ static_assert(IsFloat<T>(), "Only for float");
1143
+ const DFromV<decltype(v)> d;
1144
+ const RebindToSigned<decltype(d)> di;
1145
+ const VFromD<decltype(di)> vi = BitCast(di, v);
1146
+ // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
1147
+ return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
1148
+ }
1149
+
1150
+ // Returns whether normal/subnormal/zero.
1151
+ template <typename T, size_t N>
1152
+ HWY_API Mask128<T, N> IsFinite(Vec128<T, N> v) {
1153
+ static_assert(IsFloat<T>(), "Only for float");
1154
+ const DFromV<decltype(v)> d;
1155
+ const RebindToUnsigned<decltype(d)> du;
1156
+ const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison
1157
+ using VI = VFromD<decltype(di)>;
1158
+ using VU = VFromD<decltype(du)>;
1159
+ const VU vu = BitCast(du, v);
1160
+ // 'Shift left' to clear the sign bit, then right so we can compare with the
1161
+ // max exponent (cannot compare with MaxExponentTimes2 directly because it is
1162
+ // negative and non-negative floats would be greater).
1163
+ const VI exp =
1164
+ BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
1165
+ return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
1166
+ }
1167
+
1168
+ // ================================================== COMPARE
1169
+
1170
+ template <typename T, size_t N>
1171
+ HWY_API Mask128<T, N> operator==(Vec128<T, N> a, Vec128<T, N> b) {
1172
+ Mask128<T, N> m;
1173
+ for (size_t i = 0; i < N; ++i) {
1174
+ m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] == b.raw[i]);
1175
+ }
1176
+ return m;
1177
+ }
1178
+
1179
+ template <typename T, size_t N>
1180
+ HWY_API Mask128<T, N> operator!=(Vec128<T, N> a, Vec128<T, N> b) {
1181
+ Mask128<T, N> m;
1182
+ for (size_t i = 0; i < N; ++i) {
1183
+ m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] != b.raw[i]);
1184
+ }
1185
+ return m;
1186
+ }
1187
+
1188
+ template <typename T, size_t N>
1189
+ HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
1190
+ static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
1191
+ return (v & bit) == bit;
1192
+ }
1193
+
1194
+ template <typename T, size_t N>
1195
+ HWY_API Mask128<T, N> operator<(Vec128<T, N> a, Vec128<T, N> b) {
1196
+ Mask128<T, N> m;
1197
+ for (size_t i = 0; i < N; ++i) {
1198
+ m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] < b.raw[i]);
1199
+ }
1200
+ return m;
1201
+ }
1202
+ template <typename T, size_t N>
1203
+ HWY_API Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) {
1204
+ Mask128<T, N> m;
1205
+ for (size_t i = 0; i < N; ++i) {
1206
+ m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] > b.raw[i]);
1207
+ }
1208
+ return m;
1209
+ }
1210
+
1211
+ template <typename T, size_t N>
1212
+ HWY_API Mask128<T, N> operator<=(Vec128<T, N> a, Vec128<T, N> b) {
1213
+ Mask128<T, N> m;
1214
+ for (size_t i = 0; i < N; ++i) {
1215
+ m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] <= b.raw[i]);
1216
+ }
1217
+ return m;
1218
+ }
1219
+ template <typename T, size_t N>
1220
+ HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b) {
1221
+ Mask128<T, N> m;
1222
+ for (size_t i = 0; i < N; ++i) {
1223
+ m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] >= b.raw[i]);
1224
+ }
1225
+ return m;
1226
+ }
1227
+
1228
+ // ------------------------------ Lt128
1229
+
1230
+ // Only makes sense for full vectors of u64.
1231
+ template <class D>
1232
+ HWY_API MFromD<D> Lt128(D /* tag */, Vec128<uint64_t> a, Vec128<uint64_t> b) {
1233
+ const bool lt =
1234
+ (a.raw[1] < b.raw[1]) || (a.raw[1] == b.raw[1] && a.raw[0] < b.raw[0]);
1235
+ Mask128<uint64_t> ret;
1236
+ ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(lt);
1237
+ return ret;
1238
+ }
1239
+
1240
+ template <class D>
1241
+ HWY_API MFromD<D> Lt128Upper(D /* tag */, Vec128<uint64_t> a,
1242
+ Vec128<uint64_t> b) {
1243
+ const bool lt = a.raw[1] < b.raw[1];
1244
+ Mask128<uint64_t> ret;
1245
+ ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(lt);
1246
+ return ret;
1247
+ }
1248
+
1249
+ // ------------------------------ Eq128
1250
+
1251
+ // Only makes sense for full vectors of u64.
1252
+ template <class D>
1253
+ HWY_API MFromD<D> Eq128(D /* tag */, Vec128<uint64_t> a, Vec128<uint64_t> b) {
1254
+ const bool eq = a.raw[1] == b.raw[1] && a.raw[0] == b.raw[0];
1255
+ Mask128<uint64_t> ret;
1256
+ ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(eq);
1257
+ return ret;
1258
+ }
1259
+
1260
+ template <class D>
1261
+ HWY_API Mask128<uint64_t> Ne128(D /* tag */, Vec128<uint64_t> a,
1262
+ Vec128<uint64_t> b) {
1263
+ const bool ne = a.raw[1] != b.raw[1] || a.raw[0] != b.raw[0];
1264
+ Mask128<uint64_t> ret;
1265
+ ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(ne);
1266
+ return ret;
1267
+ }
1268
+
1269
+ template <class D>
1270
+ HWY_API MFromD<D> Eq128Upper(D /* tag */, Vec128<uint64_t> a,
1271
+ Vec128<uint64_t> b) {
1272
+ const bool eq = a.raw[1] == b.raw[1];
1273
+ Mask128<uint64_t> ret;
1274
+ ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(eq);
1275
+ return ret;
1276
+ }
1277
+
1278
+ template <class D>
1279
+ HWY_API MFromD<D> Ne128Upper(D /* tag */, Vec128<uint64_t> a,
1280
+ Vec128<uint64_t> b) {
1281
+ const bool ne = a.raw[1] != b.raw[1];
1282
+ Mask128<uint64_t> ret;
1283
+ ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(ne);
1284
+ return ret;
1285
+ }
1286
+
1287
+ // ------------------------------ Min128, Max128 (Lt128)
1288
+
1289
+ template <class D>
1290
+ HWY_API VFromD<D> Min128(D d, VFromD<D> a, VFromD<D> b) {
1291
+ return IfThenElse(Lt128(d, a, b), a, b);
1292
+ }
1293
+
1294
+ template <class D>
1295
+ HWY_API VFromD<D> Max128(D d, VFromD<D> a, VFromD<D> b) {
1296
+ return IfThenElse(Lt128(d, b, a), a, b);
1297
+ }
1298
+
1299
+ template <class D>
1300
+ HWY_API VFromD<D> Min128Upper(D d, VFromD<D> a, VFromD<D> b) {
1301
+ return IfThenElse(Lt128Upper(d, a, b), a, b);
1302
+ }
1303
+
1304
+ template <class D>
1305
+ HWY_API VFromD<D> Max128Upper(D d, VFromD<D> a, VFromD<D> b) {
1306
+ return IfThenElse(Lt128Upper(d, b, a), a, b);
1307
+ }
1308
+
1309
+ // ================================================== MEMORY
1310
+
1311
+ // ------------------------------ Load
1312
+
1313
+ template <class D>
1314
+ HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT aligned) {
1315
+ VFromD<D> v;
1316
+ CopyBytes<d.MaxBytes()>(aligned, v.raw); // copy from array
1317
+ return v;
1318
+ }
1319
+
1320
+ template <class D>
1321
+ HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
1322
+ const TFromD<D>* HWY_RESTRICT p) {
1323
+ return IfThenElseZero(m, LoadU(d, p));
1324
+ }
1325
+
1326
+ template <class D>
1327
+ HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
1328
+ const TFromD<D>* HWY_RESTRICT p) {
1329
+ return IfThenElse(m, LoadU(d, p), v);
1330
+ }
1331
+
1332
+ template <class D>
1333
+ HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
1334
+ return Load(d, p);
1335
+ }
1336
+
1337
+ // In some use cases, "load single lane" is sufficient; otherwise avoid this.
1338
+ template <class D>
1339
+ HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT aligned) {
1340
+ return Load(d, aligned);
1341
+ }
1342
+
1343
+ #ifdef HWY_NATIVE_LOAD_N
1344
+ #undef HWY_NATIVE_LOAD_N
1345
+ #else
1346
+ #define HWY_NATIVE_LOAD_N
1347
+ #endif
1348
+
1349
+ template <class D>
1350
+ HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
1351
+ size_t max_lanes_to_load) {
1352
+ VFromD<D> v = Zero(d);
1353
+ const size_t N = Lanes(d);
1354
+ const size_t num_of_lanes_to_load = HWY_MIN(max_lanes_to_load, N);
1355
+ CopyBytes(p, v.raw, num_of_lanes_to_load * sizeof(TFromD<D>));
1356
+ return v;
1357
+ }
1358
+
1359
+ template <class D>
1360
+ HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
1361
+ size_t max_lanes_to_load) {
1362
+ VFromD<D> v = no;
1363
+ const size_t N = Lanes(d);
1364
+ const size_t num_of_lanes_to_load = HWY_MIN(max_lanes_to_load, N);
1365
+ CopyBytes(p, v.raw, num_of_lanes_to_load * sizeof(TFromD<D>));
1366
+ return v;
1367
+ }
1368
+
1369
+ // ------------------------------ Store
1370
+
1371
+ template <class D>
1372
+ HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
1373
+ CopyBytes<d.MaxBytes()>(v.raw, aligned); // copy to array
1374
+ }
1375
+
1376
+ template <class D>
1377
+ HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
1378
+ Store(v, d, p);
1379
+ }
1380
+
1381
+ template <class D>
1382
+ HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
1383
+ TFromD<D>* HWY_RESTRICT p) {
1384
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
1385
+ if (m.bits[i]) p[i] = v.raw[i];
1386
+ }
1387
+ }
1388
+
1389
+ #ifdef HWY_NATIVE_STORE_N
1390
+ #undef HWY_NATIVE_STORE_N
1391
+ #else
1392
+ #define HWY_NATIVE_STORE_N
1393
+ #endif
1394
+
1395
+ template <class D>
1396
+ HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p,
1397
+ size_t max_lanes_to_store) {
1398
+ const size_t N = Lanes(d);
1399
+ const size_t num_of_lanes_to_store = HWY_MIN(max_lanes_to_store, N);
1400
+ CopyBytes(v.raw, p, num_of_lanes_to_store * sizeof(TFromD<D>));
1401
+ }
1402
+
1403
+ // ------------------------------ LoadInterleaved2/3/4
1404
+
1405
+ // Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2.
1406
+ // We implement those here because scalar code is likely faster than emulation
1407
+ // via shuffles.
1408
+ #ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
1409
+ #undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
1410
+ #else
1411
+ #define HWY_NATIVE_LOAD_STORE_INTERLEAVED
1412
+ #endif
1413
+
1414
+ template <class D, typename T = TFromD<D>>
1415
+ HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
1416
+ VFromD<D>& v0, VFromD<D>& v1) {
1417
+ alignas(16) T buf0[MaxLanes(d)];
1418
+ alignas(16) T buf1[MaxLanes(d)];
1419
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
1420
+ buf0[i] = *unaligned++;
1421
+ buf1[i] = *unaligned++;
1422
+ }
1423
+ v0 = Load(d, buf0);
1424
+ v1 = Load(d, buf1);
1425
+ }
1426
+
1427
+ template <class D, typename T = TFromD<D>>
1428
+ HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
1429
+ VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
1430
+ alignas(16) T buf0[MaxLanes(d)];
1431
+ alignas(16) T buf1[MaxLanes(d)];
1432
+ alignas(16) T buf2[MaxLanes(d)];
1433
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
1434
+ buf0[i] = *unaligned++;
1435
+ buf1[i] = *unaligned++;
1436
+ buf2[i] = *unaligned++;
1437
+ }
1438
+ v0 = Load(d, buf0);
1439
+ v1 = Load(d, buf1);
1440
+ v2 = Load(d, buf2);
1441
+ }
1442
+
1443
+ template <class D, typename T = TFromD<D>>
1444
+ HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
1445
+ VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
1446
+ VFromD<D>& v3) {
1447
+ alignas(16) T buf0[MaxLanes(d)];
1448
+ alignas(16) T buf1[MaxLanes(d)];
1449
+ alignas(16) T buf2[MaxLanes(d)];
1450
+ alignas(16) T buf3[MaxLanes(d)];
1451
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
1452
+ buf0[i] = *unaligned++;
1453
+ buf1[i] = *unaligned++;
1454
+ buf2[i] = *unaligned++;
1455
+ buf3[i] = *unaligned++;
1456
+ }
1457
+ v0 = Load(d, buf0);
1458
+ v1 = Load(d, buf1);
1459
+ v2 = Load(d, buf2);
1460
+ v3 = Load(d, buf3);
1461
+ }
1462
+
1463
+ // ------------------------------ StoreInterleaved2/3/4
1464
+
1465
+ template <class D>
1466
+ HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
1467
+ TFromD<D>* HWY_RESTRICT unaligned) {
1468
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
1469
+ *unaligned++ = v0.raw[i];
1470
+ *unaligned++ = v1.raw[i];
1471
+ }
1472
+ }
1473
+
1474
+ template <class D>
1475
+ HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
1476
+ TFromD<D>* HWY_RESTRICT unaligned) {
1477
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
1478
+ *unaligned++ = v0.raw[i];
1479
+ *unaligned++ = v1.raw[i];
1480
+ *unaligned++ = v2.raw[i];
1481
+ }
1482
+ }
1483
+
1484
+ template <class D>
1485
+ HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
1486
+ VFromD<D> v3, D d,
1487
+ TFromD<D>* HWY_RESTRICT unaligned) {
1488
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
1489
+ *unaligned++ = v0.raw[i];
1490
+ *unaligned++ = v1.raw[i];
1491
+ *unaligned++ = v2.raw[i];
1492
+ *unaligned++ = v3.raw[i];
1493
+ }
1494
+ }
1495
+
1496
+ // ------------------------------ Stream
1497
+ template <class D>
1498
+ HWY_API void Stream(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
1499
+ Store(v, d, aligned);
1500
+ }
1501
+
1502
+ // ------------------------------ Scatter in generic_ops-inl.h
1503
+ // ------------------------------ Gather in generic_ops-inl.h
1504
+
1505
+ // ================================================== CONVERT
1506
+
1507
+ // ConvertTo and DemoteTo with floating-point input and integer output truncate
1508
+ // (rounding toward zero).
1509
+
1510
+ namespace detail {
1511
+
1512
+ template <class ToT, class FromT>
1513
+ HWY_INLINE ToT CastValueForF2IConv(hwy::UnsignedTag /* to_type_tag */,
1514
+ FromT val) {
1515
+ // Prevent ubsan errors when converting float to narrower integer
1516
+
1517
+ // If LimitsMax<ToT>() can be exactly represented in FromT,
1518
+ // kSmallestOutOfToTRangePosVal is equal to LimitsMax<ToT>().
1519
+
1520
+ // Otherwise, if LimitsMax<ToT>() cannot be exactly represented in FromT,
1521
+ // kSmallestOutOfToTRangePosVal is equal to LimitsMax<ToT>() + 1, which can
1522
+ // be exactly represented in FromT.
1523
+ constexpr FromT kSmallestOutOfToTRangePosVal =
1524
+ (sizeof(ToT) * 8 <= static_cast<size_t>(MantissaBits<FromT>()) + 1)
1525
+ ? static_cast<FromT>(LimitsMax<ToT>())
1526
+ : static_cast<FromT>(
1527
+ static_cast<FromT>(ToT{1} << (sizeof(ToT) * 8 - 1)) * FromT(2));
1528
+
1529
+ if (std::signbit(val)) {
1530
+ return ToT{0};
1531
+ } else if (std::isinf(val) || val >= kSmallestOutOfToTRangePosVal) {
1532
+ return LimitsMax<ToT>();
1533
+ } else {
1534
+ return static_cast<ToT>(val);
1535
+ }
1536
+ }
1537
+
1538
+ template <class ToT, class FromT>
1539
+ HWY_INLINE ToT CastValueForF2IConv(hwy::SignedTag /* to_type_tag */,
1540
+ FromT val) {
1541
+ // Prevent ubsan errors when converting float to narrower integer
1542
+
1543
+ // If LimitsMax<ToT>() can be exactly represented in FromT,
1544
+ // kSmallestOutOfToTRangePosVal is equal to LimitsMax<ToT>().
1545
+
1546
+ // Otherwise, if LimitsMax<ToT>() cannot be exactly represented in FromT,
1547
+ // kSmallestOutOfToTRangePosVal is equal to -LimitsMin<ToT>(), which can
1548
+ // be exactly represented in FromT.
1549
+ constexpr FromT kSmallestOutOfToTRangePosVal =
1550
+ (sizeof(ToT) * 8 <= static_cast<size_t>(MantissaBits<FromT>()) + 2)
1551
+ ? static_cast<FromT>(LimitsMax<ToT>())
1552
+ : static_cast<FromT>(-static_cast<FromT>(LimitsMin<ToT>()));
1553
+
1554
+ if (std::isinf(val) || std::fabs(val) >= kSmallestOutOfToTRangePosVal) {
1555
+ return std::signbit(val) ? LimitsMin<ToT>() : LimitsMax<ToT>();
1556
+ } else {
1557
+ return static_cast<ToT>(val);
1558
+ }
1559
+ }
1560
+
1561
+ template <class ToT, class ToTypeTag, class FromT>
1562
+ HWY_INLINE ToT CastValueForPromoteTo(ToTypeTag /* to_type_tag */, FromT val) {
1563
+ return static_cast<ToT>(val);
1564
+ }
1565
+
1566
+ template <class ToT>
1567
+ HWY_INLINE ToT CastValueForPromoteTo(hwy::SignedTag to_type_tag, float val) {
1568
+ return CastValueForF2IConv<ToT>(to_type_tag, val);
1569
+ }
1570
+
1571
+ template <class ToT>
1572
+ HWY_INLINE ToT CastValueForPromoteTo(hwy::UnsignedTag to_type_tag, float val) {
1573
+ return CastValueForF2IConv<ToT>(to_type_tag, val);
1574
+ }
1575
+
1576
+ } // namespace detail
1577
+
1578
+ template <class DTo, typename TFrom, HWY_IF_NOT_SPECIAL_FLOAT(TFrom)>
1579
+ HWY_API VFromD<DTo> PromoteTo(DTo d, Vec128<TFrom, HWY_MAX_LANES_D(DTo)> from) {
1580
+ static_assert(sizeof(TFromD<DTo>) > sizeof(TFrom), "Not promoting");
1581
+ VFromD<DTo> ret;
1582
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
1583
+ // For bits Y > X, floatX->floatY and intX->intY are always representable.
1584
+ ret.raw[i] = detail::CastValueForPromoteTo<TFromD<DTo>>(
1585
+ hwy::TypeTag<TFromD<DTo>>(), from.raw[i]);
1586
+ }
1587
+ return ret;
1588
+ }
1589
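A short sketch of how this widening path is typically reached, assuming `hn = hwy::HWY_NAMESPACE` (variable names are illustrative):

    // Widen int16_t lanes to int32_t, e.g. before accumulating.
    const hn::ScalableTag<int32_t> d32;
    const hn::Rebind<int16_t, decltype(d32)> d16;  // same lane count, half width
    const auto v16 = hn::Iota(d16, 1);             // 1, 2, 3, ...
    const auto v32 = hn::PromoteTo(d32, v16);      // each lane widened losslessly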
+
1590
+ // MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(TFrom) is here,
1591
+ // so we overload for TFrom=double and ToT={float,int32_t}.
1592
+ template <class D, HWY_IF_F32_D(D)>
1593
+ HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<double, D>> from) {
1594
+ VFromD<D> ret;
1595
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
1596
+ // Prevent ubsan errors when converting double to a narrower float
1597
+ if (std::isinf(from.raw[i]) ||
1598
+ std::fabs(from.raw[i]) > static_cast<double>(HighestValue<float>())) {
1599
+ ret.raw[i] = std::signbit(from.raw[i]) ? LowestValue<float>()
1600
+ : HighestValue<float>();
1601
+ continue;
1602
+ }
1603
+ ret.raw[i] = static_cast<float>(from.raw[i]);
1604
+ }
1605
+ return ret;
1606
+ }
1607
+ template <class D, HWY_IF_UI32_D(D)>
1608
+ HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<double, D>> from) {
1609
+ VFromD<D> ret;
1610
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
1611
+ // Prevent ubsan errors when converting double to a narrower integer
1612
+ ret.raw[i] = detail::CastValueForF2IConv<TFromD<D>>(
1613
+ hwy::TypeTag<TFromD<D>>(), from.raw[i]);
1614
+ }
1615
+ return ret;
1616
+ }
1617
+
1618
+ template <class DTo, typename TFrom, size_t N, HWY_IF_SIGNED(TFrom),
1619
+ HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DTo>)>
1620
+ HWY_API VFromD<DTo> DemoteTo(DTo /* tag */, Vec128<TFrom, N> from) {
1621
+ using TTo = TFromD<DTo>;
1622
+ static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting");
1623
+
1624
+ VFromD<DTo> ret;
1625
+ for (size_t i = 0; i < N; ++i) {
1626
+ // Int to int: choose closest value in ToT to `from` (avoids UB)
1627
+ from.raw[i] =
1628
+ HWY_MIN(HWY_MAX(LimitsMin<TTo>(), from.raw[i]), LimitsMax<TTo>());
1629
+ ret.raw[i] = static_cast<TTo>(from.raw[i]);
1630
+ }
1631
+ return ret;
1632
+ }
1633
+
1634
+ template <class DTo, typename TFrom, size_t N, HWY_IF_UNSIGNED(TFrom),
1635
+ HWY_IF_UNSIGNED_D(DTo)>
1636
+ HWY_API VFromD<DTo> DemoteTo(DTo /* tag */, Vec128<TFrom, N> from) {
1637
+ using TTo = TFromD<DTo>;
1638
+ static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting");
1639
+
1640
+ VFromD<DTo> ret;
1641
+ for (size_t i = 0; i < N; ++i) {
1642
+ // Int to int: choose closest value in ToT to `from` (avoids UB)
1643
+ from.raw[i] = HWY_MIN(from.raw[i], LimitsMax<TTo>());
1644
+ ret.raw[i] = static_cast<TTo>(from.raw[i]);
1645
+ }
1646
+ return ret;
1647
+ }
1648
+
1649
+ template <class DTo, typename TFrom, size_t N, HWY_IF_UI64(TFrom),
1650
+ HWY_IF_F32_D(DTo)>
1651
+ HWY_API VFromD<DTo> DemoteTo(DTo /* tag */, Vec128<TFrom, N> from) {
1652
+ using TTo = TFromD<DTo>;
1653
+ static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting");
1654
+
1655
+ VFromD<DTo> ret;
1656
+ for (size_t i = 0; i < N; ++i) {
1657
+ // int64_t/uint64_t to float: okay to cast to float as an int64_t/uint64_t
1658
+ // value is always within the range of a float
1659
+ ret.raw[i] = static_cast<TTo>(from.raw[i]);
1660
+ }
1661
+ return ret;
1662
+ }
1663
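A sketch of the saturating integer demotion implemented lane-by-lane above, assuming `hn = hwy::HWY_NAMESPACE`:

    // Demote int32_t lanes to int16_t; out-of-range values clamp rather than wrap.
    const hn::ScalableTag<int32_t> d32;
    const hn::Rebind<int16_t, decltype(d32)> d16;
    const auto big = hn::Set(d32, 100000);         // exceeds INT16_MAX
    const auto narrowed = hn::DemoteTo(d16, big);  // every lane becomes 32767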
+
1664
+ template <class DBF16, HWY_IF_BF16_D(DBF16), class VF32>
1665
+ HWY_API VFromD<DBF16> ReorderDemote2To(DBF16 dbf16, VF32 a, VF32 b) {
1666
+ const Repartition<uint32_t, decltype(dbf16)> du32;
1667
+ const VFromD<decltype(du32)> b_in_lower = ShiftRight<16>(BitCast(du32, b));
1668
+ // Avoid OddEven - we want the upper half of `a` even on big-endian systems.
1669
+ const VFromD<decltype(du32)> a_mask = Set(du32, 0xFFFF0000);
1670
+ return BitCast(dbf16, IfVecThenElse(a_mask, BitCast(du32, a), b_in_lower));
1671
+ }
1672
+
1673
+ template <class DN, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DN>), class V,
1674
+ HWY_IF_SIGNED_V(V), HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
1675
+ HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
1676
+ HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
1677
+ const RepartitionToWide<decltype(dn)> dw;
1678
+ const size_t NW = Lanes(dw);
1679
+ using TN = TFromD<DN>;
1680
+ const TN min = LimitsMin<TN>();
1681
+ const TN max = LimitsMax<TN>();
1682
+ VFromD<DN> ret;
1683
+ for (size_t i = 0; i < NW; ++i) {
1684
+ ret.raw[i] = static_cast<TN>(HWY_MIN(HWY_MAX(min, a.raw[i]), max));
1685
+ }
1686
+ for (size_t i = 0; i < NW; ++i) {
1687
+ ret.raw[NW + i] = static_cast<TN>(HWY_MIN(HWY_MAX(min, b.raw[i]), max));
1688
+ }
1689
+ return ret;
1690
+ }
1691
+
1692
+ template <class DN, HWY_IF_UNSIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V),
1693
+ HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
1694
+ HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
1695
+ HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
1696
+ const RepartitionToWide<decltype(dn)> dw;
1697
+ const size_t NW = Lanes(dw);
1698
+ using TN = TFromD<DN>;
1699
+ const TN max = LimitsMax<TN>();
1700
+ VFromD<DN> ret;
1701
+ for (size_t i = 0; i < NW; ++i) {
1702
+ ret.raw[i] = static_cast<TN>(HWY_MIN(a.raw[i], max));
1703
+ }
1704
+ for (size_t i = 0; i < NW; ++i) {
1705
+ ret.raw[NW + i] = static_cast<TN>(HWY_MIN(b.raw[i], max));
1706
+ }
1707
+ return ret;
1708
+ }
1709
+
1710
+ template <class DN, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DN>), class V,
1711
+ HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
1712
+ HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
1713
+ HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
1714
+ HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
1715
+ return ReorderDemote2To(dn, a, b);
1716
+ }
1717
+
1718
+ template <class DN, HWY_IF_BF16_D(DN), class V, HWY_IF_F32_D(DFromV<V>),
1719
+ HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
1720
+ HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
1721
+ const RebindToUnsigned<DFromV<decltype(a)>> du32;
1722
+ const size_t NW = Lanes(du32);
1723
+ VFromD<Repartition<uint16_t, DN>> ret;
1724
+
1725
+ const auto a_bits = BitCast(du32, a);
1726
+ const auto b_bits = BitCast(du32, b);
1727
+
1728
+ for (size_t i = 0; i < NW; ++i) {
1729
+ ret.raw[i] = static_cast<uint16_t>(a_bits.raw[i] >> 16);
1730
+ }
1731
+ for (size_t i = 0; i < NW; ++i) {
1732
+ ret.raw[NW + i] = static_cast<uint16_t>(b_bits.raw[i] >> 16);
1733
+ }
1734
+ return BitCast(dn, ret);
1735
+ }
1736
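A sketch of packing two float vectors into one bfloat16 vector in order, assuming `hn = hwy::HWY_NAMESPACE` (the bf16 lanes simply keep the upper 16 bits of each float, as the loops above show):

    const hn::ScalableTag<float> df;
    const hn::Repartition<hwy::bfloat16_t, decltype(df)> dbf;  // twice the lanes
    const auto lo = hn::Iota(df, 0);
    const auto hi = hn::Iota(df, 8);
    const auto packed = hn::OrderedDemote2To(dbf, lo, hi);     // lo lanes, then hi lanes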
+
1737
+ namespace detail {
1738
+
1739
+ HWY_INLINE void StoreU16ToF16(const uint16_t val,
1740
+ hwy::float16_t* HWY_RESTRICT to) {
1741
+ CopySameSize(&val, to);
1742
+ }
1743
+
1744
+ HWY_INLINE uint16_t U16FromF16(const hwy::float16_t* HWY_RESTRICT from) {
1745
+ uint16_t bits16;
1746
+ CopySameSize(from, &bits16);
1747
+ return bits16;
1748
+ }
1749
+
1750
+ } // namespace detail
1751
+
1752
+ template <class D, HWY_IF_F32_D(D), size_t N>
1753
+ HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<bfloat16_t, N> v) {
1754
+ VFromD<D> ret;
1755
+ for (size_t i = 0; i < N; ++i) {
1756
+ ret.raw[i] = F32FromBF16(v.raw[i]);
1757
+ }
1758
+ return ret;
1759
+ }
1760
+
1761
+ template <class D, HWY_IF_BF16_D(D), size_t N>
1762
+ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec128<float, N> v) {
1763
+ VFromD<D> ret;
1764
+ for (size_t i = 0; i < N; ++i) {
1765
+ ret.raw[i] = BF16FromF32(v.raw[i]);
1766
+ }
1767
+ return ret;
1768
+ }
1769
+
1770
+ // Tag dispatch instead of SFINAE for MSVC 2017 compatibility
1771
+ namespace detail {
1772
+
1773
+ template <typename TFrom, typename DTo>
1774
+ HWY_API VFromD<DTo> ConvertTo(hwy::FloatTag /*tag*/, DTo /*tag*/,
1775
+ Vec128<TFrom, HWY_MAX_LANES_D(DTo)> from) {
1776
+ using ToT = TFromD<DTo>;
1777
+ static_assert(sizeof(ToT) == sizeof(TFrom), "Should have same size");
1778
+ VFromD<DTo> ret;
1779
+ constexpr size_t N = HWY_MAX_LANES_D(DTo);
1780
+
1781
+ for (size_t i = 0; i < N; ++i) {
1782
+ // float## -> int##: return closest representable value
1783
+ ret.raw[i] = CastValueForF2IConv<ToT>(hwy::TypeTag<ToT>(), from.raw[i]);
1784
+ }
1785
+ return ret;
1786
+ }
1787
+
1788
+ template <typename TFrom, typename DTo>
1789
+ HWY_API VFromD<DTo> ConvertTo(hwy::NonFloatTag /*tag*/, DTo /* tag */,
1790
+ Vec128<TFrom, HWY_MAX_LANES_D(DTo)> from) {
1791
+ using ToT = TFromD<DTo>;
1792
+ static_assert(sizeof(ToT) == sizeof(TFrom), "Should have same size");
1793
+ VFromD<DTo> ret;
1794
+ constexpr size_t N = HWY_MAX_LANES_D(DTo);
1795
+ for (size_t i = 0; i < N; ++i) {
1796
+ // int## -> float##: no check needed
1797
+ ret.raw[i] = static_cast<ToT>(from.raw[i]);
1798
+ }
1799
+ return ret;
1800
+ }
1801
+
1802
+ } // namespace detail
1803
+
1804
+ template <class DTo, typename TFrom>
1805
+ HWY_API VFromD<DTo> ConvertTo(DTo d, Vec128<TFrom, HWY_MAX_LANES_D(DTo)> from) {
1806
+ return detail::ConvertTo(hwy::IsFloatTag<TFrom>(), d, from);
1807
+ }
1808
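A usage sketch for the public ConvertTo wrapper, assuming `hn = hwy::HWY_NAMESPACE`; as CastValueForF2IConv above implements, float-to-int conversion truncates toward zero and saturates out-of-range values:

    const hn::ScalableTag<float> df;
    const hn::RebindToSigned<decltype(df)> di;              // int32_t, same lane count
    const auto huge = hn::ConvertTo(di, hn::Set(df, 2.9e9f));   // saturates to 2147483647
    const auto trunc = hn::ConvertTo(di, hn::Set(df, -1.7f));   // truncates to -1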
+
1809
+ template <size_t N>
1810
+ HWY_API Vec128<uint8_t, N> U8FromU32(Vec128<uint32_t, N> v) {
1811
+ return DemoteTo(Simd<uint8_t, N, 0>(), v);
1812
+ }
1813
+
1814
+ // ------------------------------ Truncations
1815
+
1816
+ template <class D, HWY_IF_U8_D(D), size_t N>
1817
+ HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<uint64_t, N> v) {
1818
+ VFromD<D> ret;
1819
+ for (size_t i = 0; i < N; ++i) {
1820
+ ret.raw[i] = static_cast<uint8_t>(v.raw[i] & 0xFF);
1821
+ }
1822
+ return ret;
1823
+ }
1824
+
1825
+ template <class D, HWY_IF_U16_D(D), size_t N>
1826
+ HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<uint64_t, N> v) {
1827
+ VFromD<D> ret;
1828
+ for (size_t i = 0; i < N; ++i) {
1829
+ ret.raw[i] = static_cast<uint16_t>(v.raw[i] & 0xFFFF);
1830
+ }
1831
+ return ret;
1832
+ }
1833
+
1834
+ template <class D, HWY_IF_U32_D(D), size_t N>
1835
+ HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<uint64_t, N> v) {
1836
+ VFromD<D> ret;
1837
+ for (size_t i = 0; i < N; ++i) {
1838
+ ret.raw[i] = static_cast<uint32_t>(v.raw[i] & 0xFFFFFFFFu);
1839
+ }
1840
+ return ret;
1841
+ }
1842
+
1843
+ template <class D, HWY_IF_U8_D(D), size_t N>
1844
+ HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<uint32_t, N> v) {
1845
+ VFromD<D> ret;
1846
+ for (size_t i = 0; i < N; ++i) {
1847
+ ret.raw[i] = static_cast<uint8_t>(v.raw[i] & 0xFF);
1848
+ }
1849
+ return ret;
1850
+ }
1851
+
1852
+ template <class D, HWY_IF_U16_D(D), size_t N>
1853
+ HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<uint32_t, N> v) {
1854
+ VFromD<D> ret;
1855
+ for (size_t i = 0; i < N; ++i) {
1856
+ ret.raw[i] = static_cast<uint16_t>(v.raw[i] & 0xFFFF);
1857
+ }
1858
+ return ret;
1859
+ }
1860
+
1861
+ template <class D, HWY_IF_U8_D(D), size_t N>
1862
+ HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<uint16_t, N> v) {
1863
+ VFromD<D> ret;
1864
+ for (size_t i = 0; i < N; ++i) {
1865
+ ret.raw[i] = static_cast<uint8_t>(v.raw[i] & 0xFF);
1866
+ }
1867
+ return ret;
1868
+ }
1869
+
1870
+ #ifdef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
1871
+ #undef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
1872
+ #else
1873
+ #define HWY_NATIVE_ORDERED_TRUNCATE_2_TO
1874
+ #endif
1875
+
1876
+ template <class DN, HWY_IF_UNSIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V),
1877
+ HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
1878
+ HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
1879
+ HWY_API VFromD<DN> OrderedTruncate2To(DN dn, V a, V b) {
1880
+ const RepartitionToWide<decltype(dn)> dw;
1881
+ const size_t NW = Lanes(dw);
1882
+ using TW = TFromD<decltype(dw)>;
1883
+ using TN = TFromD<decltype(dn)>;
1884
+ VFromD<DN> ret;
1885
+ constexpr TW max_val{LimitsMax<TN>()};
1886
+
1887
+ for (size_t i = 0; i < NW; ++i) {
1888
+ ret.raw[i] = static_cast<TN>(a.raw[i] & max_val);
1889
+ }
1890
+ for (size_t i = 0; i < NW; ++i) {
1891
+ ret.raw[NW + i] = static_cast<TN>(b.raw[i] & max_val);
1892
+ }
1893
+ return ret;
1894
+ }
1895
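A sketch contrasting TruncateTo with DemoteTo, assuming `hn = hwy::HWY_NAMESPACE`: truncation keeps only the low bits of each lane and never saturates.

    const hn::ScalableTag<uint32_t> d32;
    const hn::Rebind<uint8_t, decltype(d32)> d8;
    const auto v = hn::Set(d32, 0x1234u);
    const auto low = hn::TruncateTo(d8, v);   // every lane becomes 0x34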
+
1896
+ // ================================================== COMBINE
1897
+
1898
+ template <typename T, size_t N>
1899
+ HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
1900
+ Vec128<T, N / 2> ret;
1901
+ CopyBytes<N / 2 * sizeof(T)>(v.raw, ret.raw);
1902
+ return ret;
1903
+ }
1904
+
1905
+ template <class D>
1906
+ HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) {
1907
+ return LowerHalf(v);
1908
+ }
1909
+
1910
+ template <class D>
1911
+ HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
1912
+ VFromD<D> ret;
1913
+ CopyBytes<d.MaxBytes()>(&v.raw[MaxLanes(d)], ret.raw);
1914
+ return ret;
1915
+ }
1916
+
1917
+ template <class D>
1918
+ HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> v) {
1919
+ const Half<decltype(d)> dh;
1920
+ VFromD<D> ret; // zero-initialized
1921
+ CopyBytes<dh.MaxBytes()>(v.raw, ret.raw);
1922
+ return ret;
1923
+ }
1924
+
1925
+ template <class D, class VH = VFromD<Half<D>>>
1926
+ HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) {
1927
+ const Half<decltype(d)> dh;
1928
+ VFromD<D> ret;
1929
+ CopyBytes<dh.MaxBytes()>(lo_half.raw, &ret.raw[0]);
1930
+ CopyBytes<dh.MaxBytes()>(hi_half.raw, &ret.raw[MaxLanes(dh)]);
1931
+ return ret;
1932
+ }
1933
+
1934
+ template <class D>
1935
+ HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
1936
+ const Half<decltype(d)> dh;
1937
+ VFromD<D> ret;
1938
+ CopyBytes<dh.MaxBytes()>(lo.raw, &ret.raw[0]);
1939
+ CopyBytes<dh.MaxBytes()>(hi.raw, &ret.raw[MaxLanes(dh)]);
1940
+ return ret;
1941
+ }
1942
+
1943
+ template <class D>
1944
+ HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
1945
+ const Half<decltype(d)> dh;
1946
+ VFromD<D> ret;
1947
+ CopyBytes<dh.MaxBytes()>(&lo.raw[MaxLanes(dh)], &ret.raw[0]);
1948
+ CopyBytes<dh.MaxBytes()>(&hi.raw[MaxLanes(dh)], &ret.raw[MaxLanes(dh)]);
1949
+ return ret;
1950
+ }
1951
+
1952
+ template <class D>
1953
+ HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
1954
+ const Half<decltype(d)> dh;
1955
+ VFromD<D> ret;
1956
+ CopyBytes<dh.MaxBytes()>(&lo.raw[MaxLanes(dh)], &ret.raw[0]);
1957
+ CopyBytes<dh.MaxBytes()>(hi.raw, &ret.raw[MaxLanes(dh)]);
1958
+ return ret;
1959
+ }
1960
+
1961
+ template <class D>
1962
+ HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
1963
+ const Half<decltype(d)> dh;
1964
+ VFromD<D> ret;
1965
+ CopyBytes<dh.MaxBytes()>(lo.raw, &ret.raw[0]);
1966
+ CopyBytes<dh.MaxBytes()>(&hi.raw[MaxLanes(dh)], &ret.raw[MaxLanes(dh)]);
1967
+ return ret;
1968
+ }
1969
+
1970
+ template <class D>
1971
+ HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
1972
+ const Half<decltype(d)> dh;
1973
+ VFromD<D> ret;
1974
+ for (size_t i = 0; i < MaxLanes(dh); ++i) {
1975
+ ret.raw[i] = lo.raw[2 * i];
1976
+ }
1977
+ for (size_t i = 0; i < MaxLanes(dh); ++i) {
1978
+ ret.raw[MaxLanes(dh) + i] = hi.raw[2 * i];
1979
+ }
1980
+ return ret;
1981
+ }
1982
+
1983
+ template <class D>
1984
+ HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
1985
+ const Half<decltype(d)> dh;
1986
+ VFromD<D> ret;
1987
+ for (size_t i = 0; i < MaxLanes(dh); ++i) {
1988
+ ret.raw[i] = lo.raw[2 * i + 1];
1989
+ }
1990
+ for (size_t i = 0; i < MaxLanes(dh); ++i) {
1991
+ ret.raw[MaxLanes(dh) + i] = hi.raw[2 * i + 1];
1992
+ }
1993
+ return ret;
1994
+ }
1995
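A short sketch of the COMBINE ops, assuming `hn = hwy::HWY_NAMESPACE`: Combine glues two half-vectors together, while ConcatEven/ConcatOdd select alternating lanes from two full vectors.

    const hn::ScalableTag<uint16_t> d;
    const hn::Half<decltype(d)> dh;
    const auto lo = hn::Iota(dh, 0);
    const auto hi = hn::Iota(dh, 100);
    const auto both  = hn::Combine(d, hi, lo);      // lower half = lo, upper half = hi
    const auto evens = hn::ConcatEven(d, hi, lo);   // even lanes of lo, then even lanes of hi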
+
1996
+ // ------------------------------ CombineShiftRightBytes
1997
+ template <int kBytes, class D>
1998
+ HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
1999
+ VFromD<D> ret;
2000
+ const uint8_t* HWY_RESTRICT lo8 =
2001
+ reinterpret_cast<const uint8_t * HWY_RESTRICT>(lo.raw);
2002
+ uint8_t* HWY_RESTRICT ret8 =
2003
+ reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
2004
+ CopyBytes<d.MaxBytes() - kBytes>(lo8 + kBytes, ret8);
2005
+ CopyBytes<kBytes>(hi.raw, ret8 + d.MaxBytes() - kBytes);
2006
+ return ret;
2007
+ }
2008
+
2009
+ // ------------------------------ ShiftLeftBytes
2010
+
2011
+ template <int kBytes, class D>
2012
+ HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) {
2013
+ static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2014
+ VFromD<D> ret;
2015
+ uint8_t* HWY_RESTRICT ret8 =
2016
+ reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
2017
+ ZeroBytes<kBytes>(ret8);
2018
+ CopyBytes<d.MaxBytes() - kBytes>(v.raw, ret8 + kBytes);
2019
+ return ret;
2020
+ }
2021
+
2022
+ template <int kBytes, typename T, size_t N>
2023
+ HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
2024
+ return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
2025
+ }
2026
+
2027
+ // ------------------------------ ShiftLeftLanes
2028
+
2029
+ template <int kLanes, class D, typename T = TFromD<D>>
2030
+ HWY_API VFromD<D> ShiftLeftLanes(D d, VFromD<D> v) {
2031
+ const Repartition<uint8_t, decltype(d)> d8;
2032
+ return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
2033
+ }
2034
+
2035
+ template <int kLanes, typename T, size_t N>
2036
+ HWY_API Vec128<T, N> ShiftLeftLanes(Vec128<T, N> v) {
2037
+ return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
2038
+ }
2039
+
2040
+ // ------------------------------ ShiftRightBytes
2041
+ template <int kBytes, class D>
2042
+ HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) {
2043
+ static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2044
+ VFromD<D> ret;
2045
+ const uint8_t* HWY_RESTRICT v8 =
2046
+ reinterpret_cast<const uint8_t * HWY_RESTRICT>(v.raw);
2047
+ uint8_t* HWY_RESTRICT ret8 =
2048
+ reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
2049
+ CopyBytes<d.MaxBytes() - kBytes>(v8 + kBytes, ret8);
2050
+ ZeroBytes<kBytes>(ret8 + d.MaxBytes() - kBytes);
2051
+ return ret;
2052
+ }
2053
+
2054
+ // ------------------------------ ShiftRightLanes
2055
+ template <int kLanes, class D>
2056
+ HWY_API VFromD<D> ShiftRightLanes(D d, VFromD<D> v) {
2057
+ const Repartition<uint8_t, decltype(d)> d8;
2058
+ constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
2059
+ return BitCast(d, ShiftRightBytes<kBytes>(d8, BitCast(d8, v)));
2060
+ }
2061
+
2062
+ // ================================================== SWIZZLE
2063
+
2064
+ template <typename T, size_t N>
2065
+ HWY_API T GetLane(Vec128<T, N> v) {
2066
+ return v.raw[0];
2067
+ }
2068
+
2069
+ template <typename T, size_t N>
2070
+ HWY_API Vec128<T, N> InsertLane(Vec128<T, N> v, size_t i, T t) {
2071
+ v.raw[i] = t;
2072
+ return v;
2073
+ }
2074
+
2075
+ template <typename T, size_t N>
2076
+ HWY_API T ExtractLane(Vec128<T, N> v, size_t i) {
2077
+ return v.raw[i];
2078
+ }
2079
+
2080
+ template <typename T, size_t N>
2081
+ HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
2082
+ for (size_t i = 0; i < N; i += 2) {
2083
+ v.raw[i + 1] = v.raw[i];
2084
+ }
2085
+ return v;
2086
+ }
2087
+
2088
+ template <typename T, size_t N>
2089
+ HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
2090
+ for (size_t i = 0; i < N; i += 2) {
2091
+ v.raw[i] = v.raw[i + 1];
2092
+ }
2093
+ return v;
2094
+ }
2095
+
2096
+ template <typename T, size_t N>
2097
+ HWY_API Vec128<T, N> OddEven(Vec128<T, N> odd, Vec128<T, N> even) {
2098
+ for (size_t i = 0; i < N; i += 2) {
2099
+ odd.raw[i] = even.raw[i];
2100
+ }
2101
+ return odd;
2102
+ }
2103
+
2104
+ template <typename T, size_t N>
2105
+ HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
2106
+ return even;
2107
+ }
2108
+
2109
+ // ------------------------------ SwapAdjacentBlocks
2110
+ template <typename T, size_t N>
2111
+ HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
2112
+ return v;
2113
+ }
2114
+
2115
+ // ------------------------------ TableLookupLanes
2116
+
2117
+ // Returned by SetTableIndices for use by TableLookupLanes.
2118
+ template <typename T, size_t N>
2119
+ struct Indices128 {
2120
+ MakeSigned<T> raw[N];
2121
+ };
2122
+
2123
+ template <class D, typename TI, size_t N>
2124
+ HWY_API Indices128<TFromD<D>, N> IndicesFromVec(D d, Vec128<TI, N> vec) {
2125
+ static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index/lane size must match");
2126
+ Indices128<TFromD<D>, N> ret;
2127
+ CopyBytes<d.MaxBytes()>(vec.raw, ret.raw);
2128
+ return ret;
2129
+ }
2130
+
2131
+ template <class D, typename TI>
2132
+ HWY_API Indices128<TFromD<D>, HWY_MAX_LANES_D(D)> SetTableIndices(
2133
+ D d, const TI* idx) {
2134
+ return IndicesFromVec(d, LoadU(Rebind<TI, D>(), idx));
2135
+ }
2136
+
2137
+ template <typename T, size_t N>
2138
+ HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
2139
+ Vec128<T, N> ret;
2140
+ for (size_t i = 0; i < N; ++i) {
2141
+ ret.raw[i] = v.raw[idx.raw[i]];
2142
+ }
2143
+ return ret;
2144
+ }
2145
+
2146
+ template <typename T, size_t N>
2147
+ HWY_API Vec128<T, N> TwoTablesLookupLanes(Vec128<T, N> a, Vec128<T, N> b,
2148
+ Indices128<T, N> idx) {
2149
+ using TI = MakeSigned<T>;
2150
+ Vec128<T, N> ret;
2151
+ constexpr TI kVecLaneIdxMask = static_cast<TI>(N - 1);
2152
+ for (size_t i = 0; i < N; ++i) {
2153
+ const auto src_idx = idx.raw[i];
2154
+ const auto masked_src_lane_idx = src_idx & kVecLaneIdxMask;
2155
+ ret.raw[i] = (src_idx < static_cast<TI>(N)) ? a.raw[masked_src_lane_idx]
2156
+ : b.raw[masked_src_lane_idx];
2157
+ }
2158
+ return ret;
2159
+ }
2160
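A sketch of run-time lane permutation via SetTableIndices/TableLookupLanes, assuming `hn = hwy::HWY_NAMESPACE`; the fixed 4-lane tag and index array are illustrative:

    const hn::FixedTag<float, 4> d;
    const auto v = hn::Iota(d, 0.0f);                  // 0, 1, 2, 3
    const int32_t idx[4] = {3, 2, 1, 0};               // index type matches lane size
    const auto rev = hn::TableLookupLanes(v, hn::SetTableIndices(d, idx));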
+
2161
+ // ------------------------------ ReverseBlocks
2162
+ template <class D>
2163
+ HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) {
2164
+ return v; // Single block: no change
2165
+ }
2166
+
2167
+ // ------------------------------ Reverse
2168
+
2169
+ template <class D>
2170
+ HWY_API VFromD<D> Reverse(D d, VFromD<D> v) {
2171
+ VFromD<D> ret;
2172
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
2173
+ ret.raw[i] = v.raw[MaxLanes(d) - 1 - i];
2174
+ }
2175
+ return ret;
2176
+ }
2177
+
2178
+ // Per-target flag to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8.
2179
+ #ifdef HWY_NATIVE_REVERSE2_8
2180
+ #undef HWY_NATIVE_REVERSE2_8
2181
+ #else
2182
+ #define HWY_NATIVE_REVERSE2_8
2183
+ #endif
2184
+
2185
+ template <class D>
2186
+ HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
2187
+ VFromD<D> ret;
2188
+ for (size_t i = 0; i < MaxLanes(d); i += 2) {
2189
+ ret.raw[i + 0] = v.raw[i + 1];
2190
+ ret.raw[i + 1] = v.raw[i + 0];
2191
+ }
2192
+ return ret;
2193
+ }
2194
+
2195
+ template <class D>
2196
+ HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) {
2197
+ VFromD<D> ret;
2198
+ for (size_t i = 0; i < MaxLanes(d); i += 4) {
2199
+ ret.raw[i + 0] = v.raw[i + 3];
2200
+ ret.raw[i + 1] = v.raw[i + 2];
2201
+ ret.raw[i + 2] = v.raw[i + 1];
2202
+ ret.raw[i + 3] = v.raw[i + 0];
2203
+ }
2204
+ return ret;
2205
+ }
2206
+
2207
+ template <class D>
2208
+ HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) {
2209
+ VFromD<D> ret;
2210
+ for (size_t i = 0; i < MaxLanes(d); i += 8) {
2211
+ ret.raw[i + 0] = v.raw[i + 7];
2212
+ ret.raw[i + 1] = v.raw[i + 6];
2213
+ ret.raw[i + 2] = v.raw[i + 5];
2214
+ ret.raw[i + 3] = v.raw[i + 4];
2215
+ ret.raw[i + 4] = v.raw[i + 3];
2216
+ ret.raw[i + 5] = v.raw[i + 2];
2217
+ ret.raw[i + 6] = v.raw[i + 1];
2218
+ ret.raw[i + 7] = v.raw[i + 0];
2219
+ }
2220
+ return ret;
2221
+ }
2222
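A sketch of the Reverse family, assuming `hn = hwy::HWY_NAMESPACE` (lane values shown for a four-lane vector): Reverse flips the whole vector, Reverse2/4/8 flip within fixed-size groups.

    const hn::ScalableTag<uint32_t> d;
    const auto v = hn::Iota(d, 0);            // 0, 1, 2, 3
    const auto whole = hn::Reverse(d, v);     // 3, 2, 1, 0
    const auto pairs = hn::Reverse2(d, v);    // 1, 0, 3, 2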
+
2223
+ // ------------------------------ SlideUpLanes
2224
+
2225
+ template <class D>
2226
+ HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
2227
+ VFromD<D> ret = Zero(d);
2228
+ constexpr size_t N = HWY_MAX_LANES_D(D);
2229
+ const size_t clamped_amt = HWY_MIN(amt, N);
2230
+ CopyBytes(v.raw, ret.raw + clamped_amt,
2231
+ (N - clamped_amt) * sizeof(TFromD<D>));
2232
+ return ret;
2233
+ }
2234
+
2235
+ // ------------------------------ SlideDownLanes
2236
+
2237
+ template <class D>
2238
+ HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
2239
+ VFromD<D> ret = Zero(d);
2240
+ constexpr size_t N = HWY_MAX_LANES_D(D);
2241
+ const size_t clamped_amt = HWY_MIN(amt, N);
2242
+ CopyBytes(v.raw + clamped_amt, ret.raw,
2243
+ (N - clamped_amt) * sizeof(TFromD<D>));
2244
+ return ret;
2245
+ }
2246
+
2247
+ // ================================================== BLOCKWISE
2248
+
2249
+ // ------------------------------ Shuffle*
2250
+
2251
+ // Swap 32-bit halves in 64-bit halves.
2252
+ template <typename T, size_t N>
2253
+ HWY_API Vec128<T, N> Shuffle2301(Vec128<T, N> v) {
2254
+ static_assert(sizeof(T) == 4, "Only for 32-bit");
2255
+ static_assert(N == 2 || N == 4, "Does not make sense for N=1");
2256
+ return Reverse2(DFromV<decltype(v)>(), v);
2257
+ }
2258
+
2259
+ // Swap 64-bit halves
2260
+ template <typename T>
2261
+ HWY_API Vec128<T> Shuffle1032(Vec128<T> v) {
2262
+ static_assert(sizeof(T) == 4, "Only for 32-bit");
2263
+ Vec128<T> ret;
2264
+ ret.raw[3] = v.raw[1];
2265
+ ret.raw[2] = v.raw[0];
2266
+ ret.raw[1] = v.raw[3];
2267
+ ret.raw[0] = v.raw[2];
2268
+ return ret;
2269
+ }
2270
+ template <typename T>
2271
+ HWY_API Vec128<T> Shuffle01(Vec128<T> v) {
2272
+ static_assert(sizeof(T) == 8, "Only for 64-bit");
2273
+ return Reverse2(DFromV<decltype(v)>(), v);
2274
+ }
2275
+
2276
+ // Rotate right 32 bits
2277
+ template <typename T>
2278
+ HWY_API Vec128<T> Shuffle0321(Vec128<T> v) {
2279
+ Vec128<T> ret;
2280
+ ret.raw[3] = v.raw[0];
2281
+ ret.raw[2] = v.raw[3];
2282
+ ret.raw[1] = v.raw[2];
2283
+ ret.raw[0] = v.raw[1];
2284
+ return ret;
2285
+ }
2286
+
2287
+ // Rotate left 32 bits
2288
+ template <typename T>
2289
+ HWY_API Vec128<T> Shuffle2103(Vec128<T> v) {
2290
+ Vec128<T> ret;
2291
+ ret.raw[3] = v.raw[2];
2292
+ ret.raw[2] = v.raw[1];
2293
+ ret.raw[1] = v.raw[0];
2294
+ ret.raw[0] = v.raw[3];
2295
+ return ret;
2296
+ }
2297
+
2298
+ template <typename T>
2299
+ HWY_API Vec128<T> Shuffle0123(Vec128<T> v) {
2300
+ return Reverse4(DFromV<decltype(v)>(), v);
2301
+ }
2302
+
2303
+ // ------------------------------ Broadcast
2304
+ template <int kLane, typename T, size_t N>
2305
+ HWY_API Vec128<T, N> Broadcast(Vec128<T, N> v) {
2306
+ for (size_t i = 0; i < N; ++i) {
2307
+ v.raw[i] = v.raw[kLane];
2308
+ }
2309
+ return v;
2310
+ }
2311
+
2312
+ // ------------------------------ TableLookupBytes, TableLookupBytesOr0
2313
+
2314
+ template <typename T, size_t N, typename TI, size_t NI>
2315
+ HWY_API Vec128<TI, NI> TableLookupBytes(Vec128<T, N> v,
2316
+ Vec128<TI, NI> indices) {
2317
+ const uint8_t* HWY_RESTRICT v_bytes =
2318
+ reinterpret_cast<const uint8_t * HWY_RESTRICT>(v.raw);
2319
+ const uint8_t* HWY_RESTRICT idx_bytes =
2320
+ reinterpret_cast<const uint8_t*>(indices.raw);
2321
+ Vec128<TI, NI> ret;
2322
+ uint8_t* HWY_RESTRICT ret_bytes =
2323
+ reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
2324
+ for (size_t i = 0; i < NI * sizeof(TI); ++i) {
2325
+ const size_t idx = idx_bytes[i];
2326
+ // Avoid out of bounds reads.
2327
+ ret_bytes[i] = idx < sizeof(T) * N ? v_bytes[idx] : 0;
2328
+ }
2329
+ return ret;
2330
+ }
2331
+
2332
+ template <typename T, size_t N, typename TI, size_t NI>
2333
+ HWY_API Vec128<TI, NI> TableLookupBytesOr0(Vec128<T, N> v,
2334
+ Vec128<TI, NI> indices) {
2335
+ // Same as TableLookupBytes, which already returns 0 if out of bounds.
2336
+ return TableLookupBytes(v, indices);
2337
+ }
2338
+
2339
+ // ------------------------------ InterleaveLower/InterleaveUpper
2340
+
2341
+ template <typename T, size_t N>
2342
+ HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
2343
+ Vec128<T, N> ret;
2344
+ for (size_t i = 0; i < N / 2; ++i) {
2345
+ ret.raw[2 * i + 0] = a.raw[i];
2346
+ ret.raw[2 * i + 1] = b.raw[i];
2347
+ }
2348
+ return ret;
2349
+ }
2350
+
2351
+ // Additional overload for the optional tag.
2352
+ template <class V>
2353
+ HWY_API V InterleaveLower(DFromV<V> /* tag */, V a, V b) {
2354
+ return InterleaveLower(a, b);
2355
+ }
2356
+
2357
+ template <class D>
2358
+ HWY_API VFromD<D> InterleaveUpper(D d, VFromD<D> a, VFromD<D> b) {
2359
+ const Half<decltype(d)> dh;
2360
+ VFromD<D> ret;
2361
+ for (size_t i = 0; i < MaxLanes(dh); ++i) {
2362
+ ret.raw[2 * i + 0] = a.raw[MaxLanes(dh) + i];
2363
+ ret.raw[2 * i + 1] = b.raw[MaxLanes(dh) + i];
2364
+ }
2365
+ return ret;
2366
+ }
2367
+
2368
+ // ------------------------------ ZipLower/ZipUpper (InterleaveLower)
2369
+
2370
+ // Same as Interleave*, except that the return lanes are double-width integers;
2371
+ // this is necessary because the single-lane scalar cannot return two values.
2372
+ template <class V, class DW = RepartitionToWide<DFromV<V>>>
2373
+ HWY_API VFromD<DW> ZipLower(V a, V b) {
2374
+ return BitCast(DW(), InterleaveLower(a, b));
2375
+ }
2376
+ template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
2377
+ HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
2378
+ return BitCast(dw, InterleaveLower(D(), a, b));
2379
+ }
2380
+
2381
+ template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
2382
+ HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
2383
+ return BitCast(dw, InterleaveUpper(D(), a, b));
2384
+ }
2385
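A sketch of ZipLower, assuming `hn = hwy::HWY_NAMESPACE`: two u8 vectors are interleaved and each byte pair is reinterpreted as one u16 lane, a common way to widen pixel data (on little-endian targets the first operand lands in the low byte).

    const hn::ScalableTag<uint8_t> d8;
    const hn::RepartitionToWide<decltype(d8)> d16;
    const auto lo_bytes = hn::Iota(d8, 0);
    const auto hi_bytes = hn::Zero(d8);
    const auto widened = hn::ZipLower(d16, lo_bytes, hi_bytes);  // u16 lanes: 0, 1, 2, ...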
+
2386
+ // ================================================== MASK
2387
+
2388
+ template <class D>
2389
+ HWY_API bool AllFalse(D d, MFromD<D> mask) {
2390
+ typename MFromD<D>::Raw or_sum = 0;
2391
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
2392
+ or_sum |= mask.bits[i];
2393
+ }
2394
+ return or_sum == 0;
2395
+ }
2396
+
2397
+ template <class D>
2398
+ HWY_API bool AllTrue(D d, MFromD<D> mask) {
2399
+ constexpr uint64_t kAll = LimitsMax<typename MFromD<D>::Raw>();
2400
+ uint64_t and_sum = kAll;
2401
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
2402
+ and_sum &= mask.bits[i];
2403
+ }
2404
+ return and_sum == kAll;
2405
+ }
2406
+
2407
+ // `bits` points to at least 8 readable bytes, not all of which need be valid.
2408
+ template <class D>
2409
+ HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
2410
+ MFromD<D> m;
2411
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
2412
+ const size_t bit = size_t{1} << (i & 7);
2413
+ const size_t idx_byte = i >> 3;
2414
+ m.bits[i] = MFromD<D>::FromBool((bits[idx_byte] & bit) != 0);
2415
+ }
2416
+ return m;
2417
+ }
2418
+
2419
+ // `bits` points to at least 8 writable bytes.
2420
+ template <class D>
2421
+ HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
2422
+ bits[0] = 0;
2423
+ if (MaxLanes(d) > 8) bits[1] = 0; // MaxLanes(d) <= 16, so max two bytes
2424
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
2425
+ const size_t bit = size_t{1} << (i & 7);
2426
+ const size_t idx_byte = i >> 3;
2427
+ if (mask.bits[i]) {
2428
+ bits[idx_byte] = static_cast<uint8_t>(bits[idx_byte] | bit);
2429
+ }
2430
+ }
2431
+ return MaxLanes(d) > 8 ? 2 : 1;
2432
+ }
2433
+
2434
+ template <class D>
2435
+ HWY_API size_t CountTrue(D d, MFromD<D> mask) {
2436
+ size_t count = 0;
2437
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
2438
+ count += mask.bits[i] != 0;
2439
+ }
2440
+ return count;
2441
+ }
2442
+
2443
+ template <class D>
2444
+ HWY_API size_t FindKnownFirstTrue(D d, MFromD<D> mask) {
2445
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
2446
+ if (mask.bits[i] != 0) return i;
2447
+ }
2448
+ HWY_DASSERT(false);
2449
+ return 0;
2450
+ }
2451
+
2452
+ template <class D>
2453
+ HWY_API intptr_t FindFirstTrue(D d, MFromD<D> mask) {
2454
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
2455
+ if (mask.bits[i] != 0) return static_cast<intptr_t>(i);
2456
+ }
2457
+ return intptr_t{-1};
2458
+ }
2459
+
2460
+ template <class D>
2461
+ HWY_API size_t FindKnownLastTrue(D d, MFromD<D> mask) {
2462
+ for (intptr_t i = static_cast<intptr_t>(MaxLanes(d) - 1); i >= 0; i--) {
2463
+ if (mask.bits[i] != 0) return static_cast<size_t>(i);
2464
+ }
2465
+ HWY_DASSERT(false);
2466
+ return 0;
2467
+ }
2468
+
2469
+ template <class D>
2470
+ HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) {
2471
+ for (intptr_t i = static_cast<intptr_t>(MaxLanes(d) - 1); i >= 0; i--) {
2472
+ if (mask.bits[i] != 0) return i;
2473
+ }
2474
+ return intptr_t{-1};
2475
+ }
2476
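A sketch of querying masks produced by comparisons, assuming `hn = hwy::HWY_NAMESPACE`:

    const hn::ScalableTag<int32_t> d;
    const auto v = hn::Iota(d, 0);                     // 0, 1, 2, ...
    const auto m = hn::Gt(v, hn::Set(d, 1));           // true for lanes > 1
    const size_t n = hn::CountTrue(d, m);
    const intptr_t first = hn::FindFirstTrue(d, m);    // -1 if no lane is true
    uint8_t bits[8] = {0};
    const size_t bytes_written = hn::StoreMaskBits(d, m, bits);  // 1 or 2 bytes here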
+
2477
+ // ------------------------------ Compress
2478
+
2479
+ template <typename T>
2480
+ struct CompressIsPartition {
2481
+ enum { value = (sizeof(T) != 1) };
2482
+ };
2483
+
2484
+ template <typename T, size_t N>
2485
+ HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
2486
+ size_t count = 0;
2487
+ Vec128<T, N> ret;
2488
+ for (size_t i = 0; i < N; ++i) {
2489
+ if (mask.bits[i]) {
2490
+ ret.raw[count++] = v.raw[i];
2491
+ }
2492
+ }
2493
+ for (size_t i = 0; i < N; ++i) {
2494
+ if (!mask.bits[i]) {
2495
+ ret.raw[count++] = v.raw[i];
2496
+ }
2497
+ }
2498
+ HWY_DASSERT(count == N);
2499
+ return ret;
2500
+ }
2501
+
2502
+ // ------------------------------ Expand
2503
+
2504
+ // Could also just allow generic_ops-inl.h to implement these, but use our
2505
+ // simple implementation below to ensure the test is correct.
2506
+ #ifdef HWY_NATIVE_EXPAND
2507
+ #undef HWY_NATIVE_EXPAND
2508
+ #else
2509
+ #define HWY_NATIVE_EXPAND
2510
+ #endif
2511
+
2512
+ template <typename T, size_t N>
2513
+ HWY_API Vec128<T, N> Expand(Vec128<T, N> v, const Mask128<T, N> mask) {
2514
+ size_t in_pos = 0;
2515
+ Vec128<T, N> ret;
2516
+ for (size_t i = 0; i < N; ++i) {
2517
+ if (mask.bits[i]) {
2518
+ ret.raw[i] = v.raw[in_pos++];
2519
+ } else {
2520
+ ret.raw[i] = T(); // zero, also works for float16_t
2521
+ }
2522
+ }
2523
+ return ret;
2524
+ }
2525
+
2526
+ // ------------------------------ LoadExpand
2527
+
2528
+ template <class D>
2529
+ HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
2530
+ const TFromD<D>* HWY_RESTRICT unaligned) {
2531
+ size_t in_pos = 0;
2532
+ VFromD<D> ret;
2533
+ for (size_t i = 0; i < Lanes(d); ++i) {
2534
+ if (mask.bits[i]) {
2535
+ ret.raw[i] = unaligned[in_pos++];
2536
+ } else {
2537
+ ret.raw[i] = TFromD<D>(); // zero, also works for float16_t
2538
+ }
2539
+ }
2540
+ return ret;
2541
+ }
2542
+
2543
+ // ------------------------------ CompressNot
2544
+ template <typename T, size_t N>
2545
+ HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
2546
+ size_t count = 0;
2547
+ Vec128<T, N> ret;
2548
+ for (size_t i = 0; i < N; ++i) {
2549
+ if (!mask.bits[i]) {
2550
+ ret.raw[count++] = v.raw[i];
2551
+ }
2552
+ }
2553
+ for (size_t i = 0; i < N; ++i) {
2554
+ if (mask.bits[i]) {
2555
+ ret.raw[count++] = v.raw[i];
2556
+ }
2557
+ }
2558
+ HWY_DASSERT(count == N);
2559
+ return ret;
2560
+ }
2561
+
2562
+ // ------------------------------ CompressBlocksNot
2563
+ HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
2564
+ Mask128<uint64_t> /* m */) {
2565
+ return v;
2566
+ }
2567
+
2568
+ // ------------------------------ CompressBits
2569
+ template <typename T, size_t N>
2570
+ HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
2571
+ const uint8_t* HWY_RESTRICT bits) {
2572
+ return Compress(v, LoadMaskBits(Simd<T, N, 0>(), bits));
2573
+ }
2574
+
2575
+ // ------------------------------ CompressStore
2576
+
2577
+ // generic_ops-inl defines the 8-bit versions.
2578
+ template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
2579
+ HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d,
2580
+ TFromD<D>* HWY_RESTRICT unaligned) {
2581
+ size_t count = 0;
2582
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
2583
+ if (mask.bits[i]) {
2584
+ unaligned[count++] = v.raw[i];
2585
+ }
2586
+ }
2587
+ return count;
2588
+ }
2589
+
2590
+ // ------------------------------ CompressBlendedStore
2591
+ template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
2592
+ HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> mask, D d,
2593
+ TFromD<D>* HWY_RESTRICT unaligned) {
2594
+ return CompressStore(v, mask, d, unaligned);
2595
+ }
2596
+
2597
+ // ------------------------------ CompressBitsStore
2598
+ template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
2599
+ HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
2600
+ D d, TFromD<D>* HWY_RESTRICT unaligned) {
2601
+ const MFromD<D> mask = LoadMaskBits(d, bits);
2602
+ StoreU(Compress(v, mask), d, unaligned);
2603
+ return CountTrue(d, mask);
2604
+ }
2605
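A filtering sketch built on CompressStore, assuming `hn = hwy::HWY_NAMESPACE`; the function name is illustrative and the scalar tail loop is omitted:

    // Keep only the non-negative values of `in`, packed contiguously into `out`.
    size_t FilterNonNegative(const float* HWY_RESTRICT in, size_t count,
                             float* HWY_RESTRICT out) {
      const hn::ScalableTag<float> d;
      const size_t N = hn::Lanes(d);
      size_t num_out = 0;
      for (size_t i = 0; i + N <= count; i += N) {
        const auto v = hn::LoadU(d, in + i);
        const auto keep = hn::Ge(v, hn::Zero(d));
        // CompressStore may also write to lanes past the packed values;
        // CompressBlendedStore avoids that if `out` must not be over-written.
        num_out += hn::CompressStore(v, keep, d, out + num_out);
      }
      return num_out;
    }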
+
2606
+ // ------------------------------ Additional mask logical operations
2607
+ template <class T>
2608
+ HWY_API Mask128<T, 1> SetAtOrAfterFirst(Mask128<T, 1> mask) {
2609
+ return mask;
2610
+ }
2611
+
2612
+ template <class T, size_t N, HWY_IF_LANES_GT(N, 1)>
2613
+ HWY_API Mask128<T, N> SetAtOrAfterFirst(Mask128<T, N> mask) {
2614
+ using TU = hwy::MakeUnsigned<T>;
2615
+
2616
+ Mask128<T, N> result;
2617
+ TU result_lane_mask{0};
2618
+ for (size_t i = 0; i < N; i++) {
2619
+ result_lane_mask = static_cast<TU>(result_lane_mask | mask.bits[i]);
2620
+ result.bits[i] = result_lane_mask;
2621
+ }
2622
+ return result;
2623
+ }
2624
+
2625
+ template <class T, size_t N>
2626
+ HWY_API Mask128<T, N> SetBeforeFirst(Mask128<T, N> mask) {
2627
+ return Not(SetAtOrAfterFirst(mask));
2628
+ }
2629
+
2630
+ template <class T, size_t N>
2631
+ HWY_API Mask128<T, N> SetOnlyFirst(Mask128<T, N> mask) {
2632
+ using TU = hwy::MakeUnsigned<T>;
2633
+ using TI = hwy::MakeSigned<T>;
2634
+
2635
+ Mask128<T, N> result;
2636
+ TU result_lane_mask = static_cast<TU>(~TU{0});
2637
+ for (size_t i = 0; i < N; i++) {
2638
+ const auto curr_lane_mask_bits = mask.bits[i];
2639
+ result.bits[i] = static_cast<TU>(curr_lane_mask_bits & result_lane_mask);
2640
+ result_lane_mask =
2641
+ static_cast<TU>(result_lane_mask &
2642
+ static_cast<TU>(-static_cast<TI>(mask.bits[i] == 0)));
2643
+ }
2644
+ return result;
2645
+ }
2646
+
2647
+ template <class T, size_t N>
2648
+ HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) {
2649
+ using TU = hwy::MakeUnsigned<T>;
2650
+ using TI = hwy::MakeSigned<T>;
2651
+
2652
+ Mask128<T, N> result;
2653
+ TU result_lane_mask = static_cast<TU>(~TU{0});
2654
+ for (size_t i = 0; i < N; i++) {
2655
+ result.bits[i] = result_lane_mask;
2656
+ result_lane_mask =
2657
+ static_cast<TU>(result_lane_mask &
2658
+ static_cast<TU>(-static_cast<TI>(mask.bits[i] == 0)));
2659
+ }
2660
+ return result;
2661
+ }
2662
+
2663
+ // ------------------------------ WidenMulPairwiseAdd
2664
+
2665
+ template <class D, HWY_IF_F32_D(D), class VBF16>
2666
+ HWY_API VFromD<D> WidenMulPairwiseAdd(D df32, VBF16 a, VBF16 b) {
2667
+ const Rebind<uint32_t, decltype(df32)> du32;
2668
+ using VU32 = VFromD<decltype(du32)>;
2669
+ const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32
2670
+ // Avoid ZipLower/Upper so this also works on big-endian systems.
2671
+ const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
2672
+ const VU32 ao = And(BitCast(du32, a), odd);
2673
+ const VU32 be = ShiftLeft<16>(BitCast(du32, b));
2674
+ const VU32 bo = And(BitCast(du32, b), odd);
2675
+ return Mul(BitCast(df32, ae), BitCast(df32, be)) +
2676
+ Mul(BitCast(df32, ao), BitCast(df32, bo));
2677
+ }
2678
+
2679
+ template <class D, HWY_IF_I32_D(D), class VI16>
2680
+ HWY_API VFromD<D> WidenMulPairwiseAdd(D d32, VI16 a, VI16 b) {
2681
+ using VI32 = VFromD<decltype(d32)>;
2682
+ // Manual sign extension requires two shifts for even lanes.
2683
+ const VI32 ae = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, a)));
2684
+ const VI32 be = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, b)));
2685
+ const VI32 ao = ShiftRight<16>(BitCast(d32, a));
2686
+ const VI32 bo = ShiftRight<16>(BitCast(d32, b));
2687
+ return Add(Mul(ae, be), Mul(ao, bo));
2688
+ }
2689
+
2690
+ template <class D, HWY_IF_U32_D(D), class VU16>
2691
+ HWY_API VFromD<D> WidenMulPairwiseAdd(D du32, VU16 a, VU16 b) {
2692
+ const auto lo16_mask = Set(du32, 0x0000FFFFu);
2693
+
2694
+ const auto a0 = And(BitCast(du32, a), lo16_mask);
2695
+ const auto b0 = And(BitCast(du32, b), lo16_mask);
2696
+
2697
+ const auto a1 = ShiftRight<16>(BitCast(du32, a));
2698
+ const auto b1 = ShiftRight<16>(BitCast(du32, b));
2699
+
2700
+ return Add(Mul(a0, b0), Mul(a1, b1));
2701
+ }
2702
+
2703
+ // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
2704
+
2705
+ template <class D, HWY_IF_F32_D(D), size_t N, class VBF16>
2706
+ HWY_API VFromD<D> ReorderWidenMulAccumulate(D df32, VBF16 a, VBF16 b,
2707
+ const Vec128<float, N> sum0,
2708
+ Vec128<float, N>& sum1) {
2709
+ const Rebind<uint32_t, decltype(df32)> du32;
2710
+ using VU32 = VFromD<decltype(du32)>;
2711
+ const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32
2712
+ // Avoid ZipLower/Upper so this also works on big-endian systems.
2713
+ const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
2714
+ const VU32 ao = And(BitCast(du32, a), odd);
2715
+ const VU32 be = ShiftLeft<16>(BitCast(du32, b));
2716
+ const VU32 bo = And(BitCast(du32, b), odd);
2717
+ sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
2718
+ return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
2719
+ }
2720
+
2721
+ template <class D, HWY_IF_I32_D(D), size_t N, class VI16>
2722
+ HWY_API VFromD<D> ReorderWidenMulAccumulate(D d32, VI16 a, VI16 b,
2723
+ const Vec128<int32_t, N> sum0,
2724
+ Vec128<int32_t, N>& sum1) {
2725
+ using VI32 = VFromD<decltype(d32)>;
2726
+ // Manual sign extension requires two shifts for even lanes.
2727
+ const VI32 ae = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, a)));
2728
+ const VI32 be = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, b)));
2729
+ const VI32 ao = ShiftRight<16>(BitCast(d32, a));
2730
+ const VI32 bo = ShiftRight<16>(BitCast(d32, b));
2731
+ sum1 = Add(Mul(ao, bo), sum1);
2732
+ return Add(Mul(ae, be), sum0);
2733
+ }
2734
+
2735
+ template <class D, HWY_IF_U32_D(D), size_t N, class VU16>
2736
+ HWY_API VFromD<D> ReorderWidenMulAccumulate(D du32, VU16 a, VU16 b,
2737
+ const Vec128<uint32_t, N> sum0,
2738
+ Vec128<uint32_t, N>& sum1) {
2739
+ using VU32 = VFromD<decltype(du32)>;
2740
+ const VU32 lo16_mask = Set(du32, uint32_t{0x0000FFFFu});
2741
+ const VU32 ae = And(BitCast(du32, a), lo16_mask);
2742
+ const VU32 be = And(BitCast(du32, b), lo16_mask);
2743
+ const VU32 ao = ShiftRight<16>(BitCast(du32, a));
2744
+ const VU32 bo = ShiftRight<16>(BitCast(du32, b));
2745
+ sum1 = Add(Mul(ao, bo), sum1);
2746
+ return Add(Mul(ae, be), sum0);
2747
+ }
2748
+
2749
+ // ------------------------------ RearrangeToOddPlusEven
2750
+ template <class VW>
2751
+ HWY_API VW RearrangeToOddPlusEven(VW sum0, VW sum1) {
2752
+ return Add(sum0, sum1);
2753
+ }
2754
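A sketch of an int16 dot product using the widening multiply-accumulate ops above, assuming `hn = hwy::HWY_NAMESPACE`; RearrangeToOddPlusEven folds the two accumulators before the final reduction (function name illustrative, tail handling omitted):

    int32_t DotI16(const int16_t* HWY_RESTRICT a, const int16_t* HWY_RESTRICT b,
                   size_t count) {
      const hn::ScalableTag<int16_t> d16;
      const hn::RepartitionToWide<decltype(d16)> d32;
      auto sum0 = hn::Zero(d32);
      auto sum1 = hn::Zero(d32);
      const size_t N = hn::Lanes(d16);
      for (size_t i = 0; i + N <= count; i += N) {
        const auto va = hn::LoadU(d16, a + i);
        const auto vb = hn::LoadU(d16, b + i);
        sum0 = hn::ReorderWidenMulAccumulate(d32, va, vb, sum0, sum1);
      }
      return hn::ReduceSum(d32, hn::RearrangeToOddPlusEven(sum0, sum1));
    }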
+
2755
+ // ================================================== REDUCTIONS
2756
+
2757
+ template <class D, typename T = TFromD<D>>
2758
+ HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
2759
+ T sum = T{0};
2760
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
2761
+ sum += v.raw[i];
2762
+ }
2763
+ return Set(d, sum);
2764
+ }
2765
+ template <class D, typename T = TFromD<D>>
2766
+ HWY_API T ReduceSum(D d, VFromD<D> v) {
2767
+ T sum = T{0};
2768
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
2769
+ sum += v.raw[i];
2770
+ }
2771
+ return sum;
2772
+ }
2773
+ template <class D, typename T = TFromD<D>>
2774
+ HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
2775
+ T min = HighestValue<T>();
2776
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
2777
+ min = HWY_MIN(min, v.raw[i]);
2778
+ }
2779
+ return Set(d, min);
2780
+ }
2781
+ template <class D, typename T = TFromD<D>>
2782
+ HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
2783
+ T max = LowestValue<T>();
2784
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
2785
+ max = HWY_MAX(max, v.raw[i]);
2786
+ }
2787
+ return Set(d, max);
2788
+ }
2789
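A sketch of the reductions, assuming `hn = hwy::HWY_NAMESPACE`: ReduceSum returns a scalar, while SumOfLanes/MinOfLanes/MaxOfLanes broadcast the result to every lane.

    const hn::ScalableTag<float> d;
    const auto v = hn::Iota(d, 1.0f);              // 1, 2, 3, ...
    const float total = hn::ReduceSum(d, v);
    const auto all_total = hn::SumOfLanes(d, v);   // every lane == total
    const auto mins = hn::MinOfLanes(d, v);        // every lane == 1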
+
2790
+ // ================================================== OPS WITH DEPENDENCIES
2791
+
2792
+ // ------------------------------ MulEven/Odd 64x64 (UpperHalf)
2793
+
2794
+ HWY_INLINE Vec128<uint64_t> MulEven(Vec128<uint64_t> a, Vec128<uint64_t> b) {
2795
+ alignas(16) uint64_t mul[2];
2796
+ mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]);
2797
+ return Load(Full128<uint64_t>(), mul);
2798
+ }
2799
+
2800
+ HWY_INLINE Vec128<uint64_t> MulOdd(Vec128<uint64_t> a, Vec128<uint64_t> b) {
2801
+ alignas(16) uint64_t mul[2];
2802
+ const Half<Full128<uint64_t>> d2;
2803
+ mul[0] =
2804
+ Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]);
2805
+ return Load(Full128<uint64_t>(), mul);
2806
+ }
2807
+
2808
+ // NOLINTNEXTLINE(google-readability-namespace-comments)
2809
+ } // namespace HWY_NAMESPACE
2810
+ } // namespace hwy
2811
+ HWY_AFTER_NAMESPACE();