@img/sharp-libvips-dev 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (103)
  1. package/README.md +2 -2
  2. package/cplusplus/VConnection.cpp +54 -54
  3. package/cplusplus/VError.cpp +20 -18
  4. package/cplusplus/VImage.cpp +636 -589
  5. package/cplusplus/VInterpolate.cpp +22 -22
  6. package/cplusplus/VRegion.cpp +4 -4
  7. package/cplusplus/vips-operators.cpp +2326 -2301
  8. package/include/aom/aom_codec.h +10 -6
  9. package/include/aom/aom_decoder.h +1 -1
  10. package/include/aom/aom_encoder.h +9 -2
  11. package/include/aom/aomcx.h +72 -3
  12. package/include/cairo/cairo-ft.h +1 -1
  13. package/include/cairo/cairo-gobject.h +8 -0
  14. package/include/cairo/cairo-svg.h +3 -3
  15. package/include/cairo/cairo-version.h +2 -2
  16. package/include/cairo/cairo.h +91 -24
  17. package/include/harfbuzz/hb-version.h +2 -2
  18. package/include/hwy/aligned_allocator.h +211 -0
  19. package/include/hwy/base.h +1517 -0
  20. package/include/hwy/cache_control.h +108 -0
  21. package/include/hwy/detect_compiler_arch.h +281 -0
  22. package/include/hwy/detect_targets.h +644 -0
  23. package/include/hwy/foreach_target.h +340 -0
  24. package/include/hwy/highway.h +435 -0
  25. package/include/hwy/highway_export.h +74 -0
  26. package/include/hwy/nanobenchmark.h +171 -0
  27. package/include/hwy/ops/arm_neon-inl.h +8913 -0
  28. package/include/hwy/ops/arm_sve-inl.h +5105 -0
  29. package/include/hwy/ops/emu128-inl.h +2811 -0
  30. package/include/hwy/ops/generic_ops-inl.h +4745 -0
  31. package/include/hwy/ops/ppc_vsx-inl.h +5716 -0
  32. package/include/hwy/ops/rvv-inl.h +5070 -0
  33. package/include/hwy/ops/scalar-inl.h +1995 -0
  34. package/include/hwy/ops/set_macros-inl.h +578 -0
  35. package/include/hwy/ops/shared-inl.h +539 -0
  36. package/include/hwy/ops/tuple-inl.h +125 -0
  37. package/include/hwy/ops/wasm_128-inl.h +5917 -0
  38. package/include/hwy/ops/x86_128-inl.h +11173 -0
  39. package/include/hwy/ops/x86_256-inl.h +7529 -0
  40. package/include/hwy/ops/x86_512-inl.h +6849 -0
  41. package/include/hwy/per_target.h +44 -0
  42. package/include/hwy/print-inl.h +62 -0
  43. package/include/hwy/print.h +75 -0
  44. package/include/hwy/robust_statistics.h +148 -0
  45. package/include/hwy/targets.h +338 -0
  46. package/include/hwy/timer-inl.h +200 -0
  47. package/include/hwy/timer.h +55 -0
  48. package/include/jconfig.h +2 -2
  49. package/include/jpeglib.h +3 -2
  50. package/include/libheif/heif.h +443 -377
  51. package/include/libheif/heif_cxx.h +4 -1
  52. package/include/libheif/heif_plugin.h +1 -1
  53. package/include/libheif/heif_properties.h +138 -0
  54. package/include/libheif/heif_regions.h +866 -0
  55. package/include/libheif/heif_version.h +3 -3
  56. package/include/vips/VConnection8.h +43 -49
  57. package/include/vips/VError8.h +27 -24
  58. package/include/vips/VImage8.h +4861 -4597
  59. package/include/vips/VInterpolate8.h +24 -27
  60. package/include/vips/VRegion8.h +32 -33
  61. package/include/vips/arithmetic.h +169 -169
  62. package/include/vips/basic.h +33 -33
  63. package/include/vips/buf.h +56 -54
  64. package/include/vips/colour.h +95 -95
  65. package/include/vips/connection.h +190 -193
  66. package/include/vips/conversion.h +91 -91
  67. package/include/vips/convolution.h +36 -30
  68. package/include/vips/create.h +63 -63
  69. package/include/vips/dbuf.h +35 -37
  70. package/include/vips/debug.h +65 -33
  71. package/include/vips/draw.h +41 -41
  72. package/include/vips/enumtypes.h +54 -51
  73. package/include/vips/error.h +63 -63
  74. package/include/vips/foreign.h +263 -223
  75. package/include/vips/format.h +48 -48
  76. package/include/vips/freqfilt.h +22 -22
  77. package/include/vips/gate.h +55 -47
  78. package/include/vips/generate.h +34 -34
  79. package/include/vips/header.h +111 -101
  80. package/include/vips/histogram.h +28 -28
  81. package/include/vips/image.h +213 -213
  82. package/include/vips/interpolate.h +40 -41
  83. package/include/vips/memory.h +61 -52
  84. package/include/vips/morphology.h +24 -24
  85. package/include/vips/mosaicing.h +32 -33
  86. package/include/vips/object.h +371 -357
  87. package/include/vips/operation.h +68 -67
  88. package/include/vips/private.h +76 -76
  89. package/include/vips/rect.h +26 -26
  90. package/include/vips/region.h +92 -92
  91. package/include/vips/resample.h +38 -38
  92. package/include/vips/sbuf.h +53 -54
  93. package/include/vips/semaphore.h +24 -24
  94. package/include/vips/thread.h +30 -27
  95. package/include/vips/threadpool.h +48 -49
  96. package/include/vips/transform.h +39 -39
  97. package/include/vips/type.h +90 -85
  98. package/include/vips/util.h +274 -229
  99. package/include/vips/vector.h +24 -144
  100. package/include/vips/version.h +9 -9
  101. package/include/vips/vips.h +41 -40
  102. package/package.json +1 -1
  103. package/versions.json +7 -7
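
Most of the new surface in 0.0.2 comes from the vendored Highway (hwy) headers listed above (items 18-47). As a rough orientation only — not part of the package — the sketch below shows the usual way code consumes these headers through Highway's per-target namespaces; the project namespace, function name, and buffer parameters are hypothetical, while HWY_BEFORE_NAMESPACE, HWY_NAMESPACE, ScalableTag, Lanes, Load, MulAdd, Store, and HWY_RESTRICT are Highway API provided by the added headers.

#include <stddef.h>
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace example {  // hypothetical project namespace
namespace HWY_NAMESPACE {
namespace hn = hwy::HWY_NAMESPACE;

// out[i] += a[i] * b[i]; assumes n is a multiple of Lanes(d).
// On the scalar target this lowers to the single-lane Vec1 ops defined in
// hwy/ops/scalar-inl.h, whose added content appears in the hunk below.
void MulAddTo(const float* HWY_RESTRICT a, const float* HWY_RESTRICT b,
              float* HWY_RESTRICT out, size_t n) {
  const hn::ScalableTag<float> d;
  for (size_t i = 0; i < n; i += hn::Lanes(d)) {
    const auto va = hn::Load(d, a + i);
    const auto vb = hn::Load(d, b + i);
    hn::Store(hn::MulAdd(va, vb, hn::Load(d, out + i)), d, out + i);
  }
}

}  // namespace HWY_NAMESPACE
}  // namespace example
HWY_AFTER_NAMESPACE();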
package/include/hwy/ops/scalar-inl.h (new file)
@@ -0,0 +1,1995 @@
1
+ // Copyright 2019 Google LLC
2
+ // SPDX-License-Identifier: Apache-2.0
3
+ //
4
+ // Licensed under the Apache License, Version 2.0 (the "License");
5
+ // you may not use this file except in compliance with the License.
6
+ // You may obtain a copy of the License at
7
+ //
8
+ // http://www.apache.org/licenses/LICENSE-2.0
9
+ //
10
+ // Unless required by applicable law or agreed to in writing, software
11
+ // distributed under the License is distributed on an "AS IS" BASIS,
12
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ // See the License for the specific language governing permissions and
14
+ // limitations under the License.
15
+
16
+ // Single-element vectors and operations.
17
+ // External include guard in highway.h - see comment there.
18
+
19
+ #ifndef HWY_NO_LIBCXX
20
+ #include <math.h> // sqrtf
21
+ #endif
22
+
23
+ #include "hwy/ops/shared-inl.h"
24
+
25
+ HWY_BEFORE_NAMESPACE();
26
+ namespace hwy {
27
+ namespace HWY_NAMESPACE {
28
+
29
+ // Single instruction, single data.
30
+ template <typename T>
31
+ using Sisd = Simd<T, 1, 0>;
32
+
33
+ // (Wrapper class required for overloading comparison operators.)
34
+ template <typename T>
35
+ struct Vec1 {
36
+ using PrivateT = T; // only for DFromV
37
+ static constexpr size_t kPrivateN = 1; // only for DFromV
38
+
39
+ HWY_INLINE Vec1() = default;
40
+ Vec1(const Vec1&) = default;
41
+ Vec1& operator=(const Vec1&) = default;
42
+ HWY_INLINE explicit Vec1(const T t) : raw(t) {}
43
+
44
+ HWY_INLINE Vec1& operator*=(const Vec1 other) {
45
+ return *this = (*this * other);
46
+ }
47
+ HWY_INLINE Vec1& operator/=(const Vec1 other) {
48
+ return *this = (*this / other);
49
+ }
50
+ HWY_INLINE Vec1& operator+=(const Vec1 other) {
51
+ return *this = (*this + other);
52
+ }
53
+ HWY_INLINE Vec1& operator-=(const Vec1 other) {
54
+ return *this = (*this - other);
55
+ }
56
+ HWY_INLINE Vec1& operator&=(const Vec1 other) {
57
+ return *this = (*this & other);
58
+ }
59
+ HWY_INLINE Vec1& operator|=(const Vec1 other) {
60
+ return *this = (*this | other);
61
+ }
62
+ HWY_INLINE Vec1& operator^=(const Vec1 other) {
63
+ return *this = (*this ^ other);
64
+ }
65
+
66
+ T raw;
67
+ };
68
+
69
+ // 0 or FF..FF, same size as Vec1.
70
+ template <typename T>
71
+ class Mask1 {
72
+ using Raw = hwy::MakeUnsigned<T>;
73
+
74
+ public:
75
+ static HWY_INLINE Mask1<T> FromBool(bool b) {
76
+ Mask1<T> mask;
77
+ mask.bits = b ? static_cast<Raw>(~Raw{0}) : 0;
78
+ return mask;
79
+ }
80
+
81
+ Raw bits;
82
+ };
83
+
84
+ template <class V>
85
+ using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;
86
+
87
+ template <class V>
88
+ using TFromV = typename V::PrivateT;
89
+
90
+ // ------------------------------ BitCast
91
+
92
+ template <class DTo, typename TTo = TFromD<DTo>, typename TFrom>
93
+ HWY_API Vec1<TTo> BitCast(DTo /* tag */, Vec1<TFrom> v) {
94
+ static_assert(sizeof(TTo) <= sizeof(TFrom), "Promoting is undefined");
95
+ TTo to;
96
+ CopyBytes<sizeof(TTo)>(&v.raw, &to); // not same size - ok to shrink
97
+ return Vec1<TTo>(to);
98
+ }
99
+
100
+ // ------------------------------ Zero
101
+
102
+ template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>>
103
+ HWY_API Vec1<T> Zero(D /* tag */) {
104
+ Vec1<T> v;
105
+ ZeroBytes<sizeof(v.raw)>(&v.raw);
106
+ return v;
107
+ }
108
+
109
+ template <class D>
110
+ using VFromD = decltype(Zero(D()));
111
+
112
+ // ------------------------------ Tuple (VFromD)
113
+ #include "hwy/ops/tuple-inl.h"
114
+
115
+ // ------------------------------ Set
116
+ template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>, typename T2>
117
+ HWY_API Vec1<T> Set(D /* tag */, const T2 t) {
118
+ return Vec1<T>(static_cast<T>(t));
119
+ }
120
+
121
+ // ------------------------------ Undefined
122
+ template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>>
123
+ HWY_API Vec1<T> Undefined(D d) {
124
+ return Zero(d);
125
+ }
126
+
127
+ // ------------------------------ Iota
128
+ template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>, typename T2>
129
+ HWY_API Vec1<T> Iota(const D /* tag */, const T2 first) {
130
+ return Vec1<T>(static_cast<T>(first));
131
+ }
132
+
133
+ // ------------------------------ ResizeBitCast
134
+
135
+ template <class D, typename FromV>
136
+ HWY_API VFromD<D> ResizeBitCast(D /* tag */, FromV v) {
137
+ using TFrom = TFromV<FromV>;
138
+ using TTo = TFromD<D>;
139
+ constexpr size_t kCopyLen = HWY_MIN(sizeof(TFrom), sizeof(TTo));
140
+ TTo to = TTo{0};
141
+ CopyBytes<kCopyLen>(&v.raw, &to);
142
+ return VFromD<D>(to);
143
+ }
144
+
145
+ namespace detail {
146
+
147
+ // ResizeBitCast on the HWY_SCALAR target has zero-extending semantics if
148
+ // sizeof(TFromD<DTo>) is greater than sizeof(TFromV<FromV>)
149
+ template <class FromSizeTag, class ToSizeTag, class DTo, class DFrom>
150
+ HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(FromSizeTag /* from_size_tag */,
151
+ ToSizeTag /* to_size_tag */,
152
+ DTo d_to, DFrom /*d_from*/,
153
+ VFromD<DFrom> v) {
154
+ return ResizeBitCast(d_to, v);
155
+ }
156
+
157
+ } // namespace detail
158
+
159
+ // ================================================== LOGICAL
160
+
161
+ // ------------------------------ Not
162
+
163
+ template <typename T>
164
+ HWY_API Vec1<T> Not(const Vec1<T> v) {
165
+ using TU = MakeUnsigned<T>;
166
+ const Sisd<TU> du;
167
+ return BitCast(Sisd<T>(), Vec1<TU>(static_cast<TU>(~BitCast(du, v).raw)));
168
+ }
169
+
170
+ // ------------------------------ And
171
+
172
+ template <typename T>
173
+ HWY_API Vec1<T> And(const Vec1<T> a, const Vec1<T> b) {
174
+ using TU = MakeUnsigned<T>;
175
+ const Sisd<TU> du;
176
+ return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw & BitCast(du, b).raw));
177
+ }
178
+ template <typename T>
179
+ HWY_API Vec1<T> operator&(const Vec1<T> a, const Vec1<T> b) {
180
+ return And(a, b);
181
+ }
182
+
183
+ // ------------------------------ AndNot
184
+
185
+ template <typename T>
186
+ HWY_API Vec1<T> AndNot(const Vec1<T> a, const Vec1<T> b) {
187
+ using TU = MakeUnsigned<T>;
188
+ const Sisd<TU> du;
189
+ return BitCast(Sisd<T>(), Vec1<TU>(static_cast<TU>(~BitCast(du, a).raw &
190
+ BitCast(du, b).raw)));
191
+ }
192
+
193
+ // ------------------------------ Or
194
+
195
+ template <typename T>
196
+ HWY_API Vec1<T> Or(const Vec1<T> a, const Vec1<T> b) {
197
+ using TU = MakeUnsigned<T>;
198
+ const Sisd<TU> du;
199
+ return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw | BitCast(du, b).raw));
200
+ }
201
+ template <typename T>
202
+ HWY_API Vec1<T> operator|(const Vec1<T> a, const Vec1<T> b) {
203
+ return Or(a, b);
204
+ }
205
+
206
+ // ------------------------------ Xor
207
+
208
+ template <typename T>
209
+ HWY_API Vec1<T> Xor(const Vec1<T> a, const Vec1<T> b) {
210
+ using TU = MakeUnsigned<T>;
211
+ const Sisd<TU> du;
212
+ return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw ^ BitCast(du, b).raw));
213
+ }
214
+ template <typename T>
215
+ HWY_API Vec1<T> operator^(const Vec1<T> a, const Vec1<T> b) {
216
+ return Xor(a, b);
217
+ }
218
+
219
+ // ------------------------------ Xor3
220
+
221
+ template <typename T>
222
+ HWY_API Vec1<T> Xor3(Vec1<T> x1, Vec1<T> x2, Vec1<T> x3) {
223
+ return Xor(x1, Xor(x2, x3));
224
+ }
225
+
226
+ // ------------------------------ Or3
227
+
228
+ template <typename T>
229
+ HWY_API Vec1<T> Or3(Vec1<T> o1, Vec1<T> o2, Vec1<T> o3) {
230
+ return Or(o1, Or(o2, o3));
231
+ }
232
+
233
+ // ------------------------------ OrAnd
234
+
235
+ template <typename T>
236
+ HWY_API Vec1<T> OrAnd(const Vec1<T> o, const Vec1<T> a1, const Vec1<T> a2) {
237
+ return Or(o, And(a1, a2));
238
+ }
239
+
240
+ // ------------------------------ Mask
241
+
242
+ template <class DTo, typename TTo = TFromD<DTo>, typename TFrom>
243
+ HWY_API Mask1<TTo> RebindMask(DTo /*tag*/, Mask1<TFrom> m) {
244
+ static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
245
+ return Mask1<TTo>{m.bits};
246
+ }
247
+
248
+ // v must be 0 or FF..FF.
249
+ template <typename T>
250
+ HWY_API Mask1<T> MaskFromVec(const Vec1<T> v) {
251
+ Mask1<T> mask;
252
+ CopySameSize(&v, &mask);
253
+ return mask;
254
+ }
255
+
256
+ template <class D>
257
+ using MFromD = decltype(MaskFromVec(VFromD<D>()));
258
+
259
+ template <typename T>
260
+ Vec1<T> VecFromMask(const Mask1<T> mask) {
261
+ Vec1<T> v;
262
+ CopySameSize(&mask, &v);
263
+ return v;
264
+ }
265
+
266
+ template <class D, typename T = TFromD<D>>
267
+ Vec1<T> VecFromMask(D /* tag */, const Mask1<T> mask) {
268
+ Vec1<T> v;
269
+ CopySameSize(&mask, &v);
270
+ return v;
271
+ }
272
+
273
+ template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>>
274
+ HWY_API Mask1<T> FirstN(D /*tag*/, size_t n) {
275
+ return Mask1<T>::FromBool(n != 0);
276
+ }
277
+
278
+ // ------------------------------ IfVecThenElse
279
+ template <typename T>
280
+ HWY_API Vec1<T> IfVecThenElse(Vec1<T> mask, Vec1<T> yes, Vec1<T> no) {
281
+ return IfThenElse(MaskFromVec(mask), yes, no);
282
+ }
283
+
284
+ // ------------------------------ CopySign
285
+ template <typename T>
286
+ HWY_API Vec1<T> CopySign(const Vec1<T> magn, const Vec1<T> sign) {
287
+ static_assert(IsFloat<T>(), "Only makes sense for floating-point");
288
+ const DFromV<decltype(magn)> d;
289
+ return BitwiseIfThenElse(SignBit(d), sign, magn);
290
+ }
291
+
292
+ // ------------------------------ CopySignToAbs
293
+ template <typename T>
294
+ HWY_API Vec1<T> CopySignToAbs(const Vec1<T> abs, const Vec1<T> sign) {
295
+ static_assert(IsFloat<T>(), "Only makes sense for floating-point");
296
+ const Sisd<T> d;
297
+ return OrAnd(abs, SignBit(d), sign);
298
+ }
299
+
300
+ // ------------------------------ BroadcastSignBit
301
+ template <typename T>
302
+ HWY_API Vec1<T> BroadcastSignBit(const Vec1<T> v) {
303
+ // This is used inside ShiftRight, so we cannot implement in terms of it.
304
+ return v.raw < 0 ? Vec1<T>(T(-1)) : Vec1<T>(0);
305
+ }
306
+
307
+ // ------------------------------ PopulationCount
308
+
309
+ #ifdef HWY_NATIVE_POPCNT
310
+ #undef HWY_NATIVE_POPCNT
311
+ #else
312
+ #define HWY_NATIVE_POPCNT
313
+ #endif
314
+
315
+ template <typename T>
316
+ HWY_API Vec1<T> PopulationCount(Vec1<T> v) {
317
+ return Vec1<T>(static_cast<T>(PopCount(v.raw)));
318
+ }
319
+
320
+ // ------------------------------ IfThenElse
321
+
322
+ // Returns mask ? yes : no.
323
+ template <typename T>
324
+ HWY_API Vec1<T> IfThenElse(const Mask1<T> mask, const Vec1<T> yes,
325
+ const Vec1<T> no) {
326
+ return mask.bits ? yes : no;
327
+ }
328
+
329
+ template <typename T>
330
+ HWY_API Vec1<T> IfThenElseZero(const Mask1<T> mask, const Vec1<T> yes) {
331
+ return mask.bits ? yes : Vec1<T>(0);
332
+ }
333
+
334
+ template <typename T>
335
+ HWY_API Vec1<T> IfThenZeroElse(const Mask1<T> mask, const Vec1<T> no) {
336
+ return mask.bits ? Vec1<T>(0) : no;
337
+ }
338
+
339
+ template <typename T>
340
+ HWY_API Vec1<T> IfNegativeThenElse(Vec1<T> v, Vec1<T> yes, Vec1<T> no) {
341
+ const DFromV<decltype(v)> d;
342
+ const RebindToSigned<decltype(d)> di;
343
+ const auto vi = BitCast(di, v);
344
+
345
+ return vi.raw < 0 ? yes : no;
346
+ }
347
+
348
+ template <typename T>
349
+ HWY_API Vec1<T> ZeroIfNegative(const Vec1<T> v) {
350
+ return v.raw < 0 ? Vec1<T>(0) : v;
351
+ }
352
+
353
+ // ------------------------------ Mask logical
354
+
355
+ template <typename T>
356
+ HWY_API Mask1<T> Not(const Mask1<T> m) {
357
+ return MaskFromVec(Not(VecFromMask(Sisd<T>(), m)));
358
+ }
359
+
360
+ template <typename T>
361
+ HWY_API Mask1<T> And(const Mask1<T> a, Mask1<T> b) {
362
+ const Sisd<T> d;
363
+ return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
364
+ }
365
+
366
+ template <typename T>
367
+ HWY_API Mask1<T> AndNot(const Mask1<T> a, Mask1<T> b) {
368
+ const Sisd<T> d;
369
+ return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
370
+ }
371
+
372
+ template <typename T>
373
+ HWY_API Mask1<T> Or(const Mask1<T> a, Mask1<T> b) {
374
+ const Sisd<T> d;
375
+ return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
376
+ }
377
+
378
+ template <typename T>
379
+ HWY_API Mask1<T> Xor(const Mask1<T> a, Mask1<T> b) {
380
+ const Sisd<T> d;
381
+ return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
382
+ }
383
+
384
+ template <typename T>
385
+ HWY_API Mask1<T> ExclusiveNeither(const Mask1<T> a, Mask1<T> b) {
386
+ const Sisd<T> d;
387
+ return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
388
+ }
389
+
390
+ template <class T>
391
+ HWY_API Mask1<T> SetAtOrAfterFirst(Mask1<T> mask) {
392
+ return mask;
393
+ }
394
+
395
+ template <class T>
396
+ HWY_API Mask1<T> SetBeforeFirst(Mask1<T> mask) {
397
+ return Not(mask);
398
+ }
399
+
400
+ template <class T>
401
+ HWY_API Mask1<T> SetOnlyFirst(Mask1<T> mask) {
402
+ return mask;
403
+ }
404
+
405
+ template <class T>
406
+ HWY_API Mask1<T> SetAtOrBeforeFirst(Mask1<T> /*mask*/) {
407
+ return Mask1<T>::FromBool(true);
408
+ }
409
+
410
+ // ================================================== SHIFTS
411
+
412
+ // ------------------------------ ShiftLeft/ShiftRight (BroadcastSignBit)
413
+
414
+ template <int kBits, typename T>
415
+ HWY_API Vec1<T> ShiftLeft(const Vec1<T> v) {
416
+ static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
417
+ return Vec1<T>(
418
+ static_cast<T>(static_cast<hwy::MakeUnsigned<T>>(v.raw) << kBits));
419
+ }
420
+
421
+ template <int kBits, typename T>
422
+ HWY_API Vec1<T> ShiftRight(const Vec1<T> v) {
423
+ static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
424
+ #if __cplusplus >= 202002L
425
+ // Signed right shift is now guaranteed to be arithmetic (rounding toward
426
+ // negative infinity, i.e. shifting in the sign bit).
427
+ return Vec1<T>(static_cast<T>(v.raw >> kBits));
428
+ #else
429
+ if (IsSigned<T>()) {
430
+ // Emulate arithmetic shift using only logical (unsigned) shifts, because
431
+ // signed shifts are still implementation-defined.
432
+ using TU = hwy::MakeUnsigned<T>;
433
+ const Sisd<TU> du;
434
+ const TU shifted = static_cast<TU>(BitCast(du, v).raw >> kBits);
435
+ const TU sign = BitCast(du, BroadcastSignBit(v)).raw;
436
+ const size_t sign_shift =
437
+ static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - kBits);
438
+ const TU upper = static_cast<TU>(sign << sign_shift);
439
+ return BitCast(Sisd<T>(), Vec1<TU>(shifted | upper));
440
+ } else { // T is unsigned
441
+ return Vec1<T>(static_cast<T>(v.raw >> kBits));
442
+ }
443
+ #endif
444
+ }
445
+
446
+ // ------------------------------ RotateRight (ShiftRight)
447
+ template <int kBits, typename T>
448
+ HWY_API Vec1<T> RotateRight(const Vec1<T> v) {
449
+ constexpr size_t kSizeInBits = sizeof(T) * 8;
450
+ static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift");
451
+ if (kBits == 0) return v;
452
+ return Or(ShiftRight<kBits>(v),
453
+ ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
454
+ }
455
+
456
+ // ------------------------------ ShiftLeftSame (BroadcastSignBit)
457
+
458
+ template <typename T>
459
+ HWY_API Vec1<T> ShiftLeftSame(const Vec1<T> v, int bits) {
460
+ return Vec1<T>(
461
+ static_cast<T>(static_cast<hwy::MakeUnsigned<T>>(v.raw) << bits));
462
+ }
463
+
464
+ template <typename T>
465
+ HWY_API Vec1<T> ShiftRightSame(const Vec1<T> v, int bits) {
466
+ #if __cplusplus >= 202002L
467
+ // Signed right shift is now guaranteed to be arithmetic (rounding toward
468
+ // negative infinity, i.e. shifting in the sign bit).
469
+ return Vec1<T>(static_cast<T>(v.raw >> bits));
470
+ #else
471
+ if (IsSigned<T>()) {
472
+ // Emulate arithmetic shift using only logical (unsigned) shifts, because
473
+ // signed shifts are still implementation-defined.
474
+ using TU = hwy::MakeUnsigned<T>;
475
+ const Sisd<TU> du;
476
+ const TU shifted = static_cast<TU>(BitCast(du, v).raw >> bits);
477
+ const TU sign = BitCast(du, BroadcastSignBit(v)).raw;
478
+ const size_t sign_shift =
479
+ static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - bits);
480
+ const TU upper = static_cast<TU>(sign << sign_shift);
481
+ return BitCast(Sisd<T>(), Vec1<TU>(shifted | upper));
482
+ } else { // T is unsigned
483
+ return Vec1<T>(static_cast<T>(v.raw >> bits));
484
+ }
485
+ #endif
486
+ }
487
+
488
+ // ------------------------------ Shl
489
+
490
+ // Single-lane => same as ShiftLeftSame except for the argument type.
491
+ template <typename T>
492
+ HWY_API Vec1<T> operator<<(const Vec1<T> v, const Vec1<T> bits) {
493
+ return ShiftLeftSame(v, static_cast<int>(bits.raw));
494
+ }
495
+
496
+ template <typename T>
497
+ HWY_API Vec1<T> operator>>(const Vec1<T> v, const Vec1<T> bits) {
498
+ return ShiftRightSame(v, static_cast<int>(bits.raw));
499
+ }
500
+
501
+ // ================================================== ARITHMETIC
502
+
503
+ template <typename T>
504
+ HWY_API Vec1<T> operator+(Vec1<T> a, Vec1<T> b) {
505
+ const uint64_t a64 = static_cast<uint64_t>(a.raw);
506
+ const uint64_t b64 = static_cast<uint64_t>(b.raw);
507
+ return Vec1<T>(static_cast<T>((a64 + b64) & static_cast<uint64_t>(~T(0))));
508
+ }
509
+ HWY_API Vec1<float> operator+(const Vec1<float> a, const Vec1<float> b) {
510
+ return Vec1<float>(a.raw + b.raw);
511
+ }
512
+ HWY_API Vec1<double> operator+(const Vec1<double> a, const Vec1<double> b) {
513
+ return Vec1<double>(a.raw + b.raw);
514
+ }
515
+
516
+ template <typename T>
517
+ HWY_API Vec1<T> operator-(Vec1<T> a, Vec1<T> b) {
518
+ const uint64_t a64 = static_cast<uint64_t>(a.raw);
519
+ const uint64_t b64 = static_cast<uint64_t>(b.raw);
520
+ return Vec1<T>(static_cast<T>((a64 - b64) & static_cast<uint64_t>(~T(0))));
521
+ }
522
+ HWY_API Vec1<float> operator-(const Vec1<float> a, const Vec1<float> b) {
523
+ return Vec1<float>(a.raw - b.raw);
524
+ }
525
+ HWY_API Vec1<double> operator-(const Vec1<double> a, const Vec1<double> b) {
526
+ return Vec1<double>(a.raw - b.raw);
527
+ }
528
+
529
+ // ------------------------------ SumsOf8
530
+
531
+ HWY_API Vec1<uint64_t> SumsOf8(const Vec1<uint8_t> v) {
532
+ return Vec1<uint64_t>(v.raw);
533
+ }
534
+
535
+ // ------------------------------ SaturatedAdd
536
+
537
+ // Returns a + b clamped to the destination range.
538
+
539
+ // Unsigned
540
+ HWY_API Vec1<uint8_t> SaturatedAdd(const Vec1<uint8_t> a,
541
+ const Vec1<uint8_t> b) {
542
+ return Vec1<uint8_t>(
543
+ static_cast<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 255)));
544
+ }
545
+ HWY_API Vec1<uint16_t> SaturatedAdd(const Vec1<uint16_t> a,
546
+ const Vec1<uint16_t> b) {
547
+ return Vec1<uint16_t>(static_cast<uint16_t>(
548
+ HWY_MIN(HWY_MAX(0, static_cast<int32_t>(a.raw) + b.raw), 65535)));
549
+ }
550
+
551
+ // Signed
552
+ HWY_API Vec1<int8_t> SaturatedAdd(const Vec1<int8_t> a, const Vec1<int8_t> b) {
553
+ return Vec1<int8_t>(
554
+ static_cast<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw + b.raw), 127)));
555
+ }
556
+ HWY_API Vec1<int16_t> SaturatedAdd(const Vec1<int16_t> a,
557
+ const Vec1<int16_t> b) {
558
+ return Vec1<int16_t>(static_cast<int16_t>(
559
+ HWY_MIN(HWY_MAX(-32768, static_cast<int32_t>(a.raw) + b.raw), 32767)));
560
+ }
561
+
562
+ // ------------------------------ Saturating subtraction
563
+
564
+ // Returns a - b clamped to the destination range.
565
+
566
+ // Unsigned
567
+ HWY_API Vec1<uint8_t> SaturatedSub(const Vec1<uint8_t> a,
568
+ const Vec1<uint8_t> b) {
569
+ return Vec1<uint8_t>(
570
+ static_cast<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 255)));
571
+ }
572
+ HWY_API Vec1<uint16_t> SaturatedSub(const Vec1<uint16_t> a,
573
+ const Vec1<uint16_t> b) {
574
+ return Vec1<uint16_t>(static_cast<uint16_t>(
575
+ HWY_MIN(HWY_MAX(0, static_cast<int32_t>(a.raw) - b.raw), 65535)));
576
+ }
577
+
578
+ // Signed
579
+ HWY_API Vec1<int8_t> SaturatedSub(const Vec1<int8_t> a, const Vec1<int8_t> b) {
580
+ return Vec1<int8_t>(
581
+ static_cast<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw - b.raw), 127)));
582
+ }
583
+ HWY_API Vec1<int16_t> SaturatedSub(const Vec1<int16_t> a,
584
+ const Vec1<int16_t> b) {
585
+ return Vec1<int16_t>(static_cast<int16_t>(
586
+ HWY_MIN(HWY_MAX(-32768, static_cast<int32_t>(a.raw) - b.raw), 32767)));
587
+ }
588
+
589
+ // ------------------------------ Average
590
+
591
+ // Returns (a + b + 1) / 2
592
+
593
+ HWY_API Vec1<uint8_t> AverageRound(const Vec1<uint8_t> a,
594
+ const Vec1<uint8_t> b) {
595
+ return Vec1<uint8_t>(static_cast<uint8_t>((a.raw + b.raw + 1) / 2));
596
+ }
597
+ HWY_API Vec1<uint16_t> AverageRound(const Vec1<uint16_t> a,
598
+ const Vec1<uint16_t> b) {
599
+ return Vec1<uint16_t>(static_cast<uint16_t>((a.raw + b.raw + 1) / 2));
600
+ }
601
+
602
+ // ------------------------------ Absolute value
603
+
604
+ template <typename T>
605
+ HWY_API Vec1<T> Abs(const Vec1<T> a) {
606
+ const T i = a.raw;
607
+ if (i >= 0 || i == hwy::LimitsMin<T>()) return a;
608
+ return Vec1<T>(static_cast<T>(-i & T{-1}));
609
+ }
610
+ HWY_API Vec1<float> Abs(Vec1<float> a) {
611
+ int32_t i;
612
+ CopyBytes<sizeof(i)>(&a.raw, &i);
613
+ i &= 0x7FFFFFFF;
614
+ CopyBytes<sizeof(i)>(&i, &a.raw);
615
+ return a;
616
+ }
617
+ HWY_API Vec1<double> Abs(Vec1<double> a) {
618
+ int64_t i;
619
+ CopyBytes<sizeof(i)>(&a.raw, &i);
620
+ i &= 0x7FFFFFFFFFFFFFFFL;
621
+ CopyBytes<sizeof(i)>(&i, &a.raw);
622
+ return a;
623
+ }
624
+
625
+ // ------------------------------ Min/Max
626
+
627
+ // <cmath> may be unavailable, so implement our own.
628
+ namespace detail {
629
+
630
+ static inline float Abs(float f) {
631
+ uint32_t i;
632
+ CopyBytes<4>(&f, &i);
633
+ i &= 0x7FFFFFFFu;
634
+ CopyBytes<4>(&i, &f);
635
+ return f;
636
+ }
637
+ static inline double Abs(double f) {
638
+ uint64_t i;
639
+ CopyBytes<8>(&f, &i);
640
+ i &= 0x7FFFFFFFFFFFFFFFull;
641
+ CopyBytes<8>(&i, &f);
642
+ return f;
643
+ }
644
+
645
+ static inline bool SignBit(float f) {
646
+ uint32_t i;
647
+ CopyBytes<4>(&f, &i);
648
+ return (i >> 31) != 0;
649
+ }
650
+ static inline bool SignBit(double f) {
651
+ uint64_t i;
652
+ CopyBytes<8>(&f, &i);
653
+ return (i >> 63) != 0;
654
+ }
655
+
656
+ } // namespace detail
657
+
658
+ template <typename T, HWY_IF_NOT_FLOAT(T)>
659
+ HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
660
+ return Vec1<T>(HWY_MIN(a.raw, b.raw));
661
+ }
662
+
663
+ template <typename T, HWY_IF_FLOAT(T)>
664
+ HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
665
+ if (isnan(a.raw)) return b;
666
+ if (isnan(b.raw)) return a;
667
+ return Vec1<T>(HWY_MIN(a.raw, b.raw));
668
+ }
669
+
670
+ template <typename T, HWY_IF_NOT_FLOAT(T)>
671
+ HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
672
+ return Vec1<T>(HWY_MAX(a.raw, b.raw));
673
+ }
674
+
675
+ template <typename T, HWY_IF_FLOAT(T)>
676
+ HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
677
+ if (isnan(a.raw)) return b;
678
+ if (isnan(b.raw)) return a;
679
+ return Vec1<T>(HWY_MAX(a.raw, b.raw));
680
+ }
681
+
682
+ // ------------------------------ Floating-point negate
683
+
684
+ template <typename T, HWY_IF_FLOAT_OR_SPECIAL(T)>
685
+ HWY_API Vec1<T> Neg(const Vec1<T> v) {
686
+ return Xor(v, SignBit(Sisd<T>()));
687
+ }
688
+
689
+ template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
690
+ HWY_API Vec1<T> Neg(const Vec1<T> v) {
691
+ return Zero(Sisd<T>()) - v;
692
+ }
693
+
694
+ // ------------------------------ mul/div
695
+
696
+ // Per-target flags to prevent generic_ops-inl.h defining 8/64-bit operator*.
697
+ #ifdef HWY_NATIVE_MUL_8
698
+ #undef HWY_NATIVE_MUL_8
699
+ #else
700
+ #define HWY_NATIVE_MUL_8
701
+ #endif
702
+ #ifdef HWY_NATIVE_MUL_64
703
+ #undef HWY_NATIVE_MUL_64
704
+ #else
705
+ #define HWY_NATIVE_MUL_64
706
+ #endif
707
+
708
+ template <typename T, HWY_IF_FLOAT(T)>
709
+ HWY_API Vec1<T> operator*(const Vec1<T> a, const Vec1<T> b) {
710
+ return Vec1<T>(static_cast<T>(double{a.raw} * b.raw));
711
+ }
712
+
713
+ template <typename T, HWY_IF_NOT_FLOAT(T)>
714
+ HWY_API Vec1<T> operator*(const Vec1<T> a, const Vec1<T> b) {
715
+ return Vec1<T>(static_cast<T>(static_cast<uint64_t>(a.raw) *
716
+ static_cast<uint64_t>(b.raw)));
717
+ }
718
+
719
+ template <typename T>
720
+ HWY_API Vec1<T> operator/(const Vec1<T> a, const Vec1<T> b) {
721
+ return Vec1<T>(a.raw / b.raw);
722
+ }
723
+
724
+ // Returns the upper 16 bits of a * b in each lane.
725
+ HWY_API Vec1<int16_t> MulHigh(const Vec1<int16_t> a, const Vec1<int16_t> b) {
726
+ return Vec1<int16_t>(static_cast<int16_t>((a.raw * b.raw) >> 16));
727
+ }
728
+ HWY_API Vec1<uint16_t> MulHigh(const Vec1<uint16_t> a, const Vec1<uint16_t> b) {
729
+ // Cast to uint32_t first to prevent overflow. Otherwise the result of
730
+ // uint16_t * uint16_t is in "int" which may overflow. In practice the result
731
+ // is the same but this way it is also defined.
732
+ return Vec1<uint16_t>(static_cast<uint16_t>(
733
+ (static_cast<uint32_t>(a.raw) * static_cast<uint32_t>(b.raw)) >> 16));
734
+ }
735
+
736
+ HWY_API Vec1<int16_t> MulFixedPoint15(Vec1<int16_t> a, Vec1<int16_t> b) {
737
+ return Vec1<int16_t>(static_cast<int16_t>((a.raw * b.raw + 16384) >> 15));
738
+ }
739
+
740
+ // Multiplies even lanes (0, 2 ..) and returns the double-wide result.
741
+ template <class T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
742
+ HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
743
+ HWY_API Vec1<MakeWide<T>> MulEven(const Vec1<T> a, const Vec1<T> b) {
744
+ using TW = MakeWide<T>;
745
+ const TW a_wide = a.raw;
746
+ return Vec1<TW>(static_cast<TW>(a_wide * b.raw));
747
+ }
748
+
749
+ // Approximate reciprocal
750
+ HWY_API Vec1<float> ApproximateReciprocal(const Vec1<float> v) {
751
+ // Zero inputs are allowed, but callers are responsible for replacing the
752
+ // return value with something else (typically using IfThenElse). This check
753
+ // avoids a ubsan error. The return value is arbitrary.
754
+ if (v.raw == 0.0f) return Vec1<float>(0.0f);
755
+ return Vec1<float>(1.0f / v.raw);
756
+ }
757
+
758
+ // generic_ops takes care of integer T.
759
+ template <typename T, HWY_IF_FLOAT(T)>
760
+ HWY_API Vec1<T> AbsDiff(const Vec1<T> a, const Vec1<T> b) {
761
+ return Abs(a - b);
762
+ }
763
+
764
+ // ------------------------------ Floating-point multiply-add variants
765
+
766
+ template <typename T>
767
+ HWY_API Vec1<T> MulAdd(const Vec1<T> mul, const Vec1<T> x, const Vec1<T> add) {
768
+ return mul * x + add;
769
+ }
770
+
771
+ template <typename T>
772
+ HWY_API Vec1<T> NegMulAdd(const Vec1<T> mul, const Vec1<T> x,
773
+ const Vec1<T> add) {
774
+ return add - mul * x;
775
+ }
776
+
777
+ template <typename T>
778
+ HWY_API Vec1<T> MulSub(const Vec1<T> mul, const Vec1<T> x, const Vec1<T> sub) {
779
+ return mul * x - sub;
780
+ }
781
+
782
+ template <typename T>
783
+ HWY_API Vec1<T> NegMulSub(const Vec1<T> mul, const Vec1<T> x,
784
+ const Vec1<T> sub) {
785
+ return Neg(mul) * x - sub;
786
+ }
787
+
788
+ // ------------------------------ Floating-point square root
789
+
790
+ // Approximate reciprocal square root
791
+ HWY_API Vec1<float> ApproximateReciprocalSqrt(const Vec1<float> v) {
792
+ float f = v.raw;
793
+ const float half = f * 0.5f;
794
+ uint32_t bits;
795
+ CopySameSize(&f, &bits);
796
+ // Initial guess based on log2(f)
797
+ bits = 0x5F3759DF - (bits >> 1);
798
+ CopySameSize(&bits, &f);
799
+ // One Newton-Raphson iteration
800
+ return Vec1<float>(f * (1.5f - (half * f * f)));
801
+ }
802
+
803
+ // Square root
804
+ HWY_API Vec1<float> Sqrt(Vec1<float> v) {
805
+ #if defined(HWY_NO_LIBCXX)
806
+ #if HWY_COMPILER_GCC_ACTUAL
807
+ return Vec1<float>(__builtin_sqrt(v.raw));
808
+ #else
809
+ uint32_t bits;
810
+ CopyBytes<sizeof(bits)>(&v, &bits);
811
+ // Coarse approximation, letting the exponent LSB leak into the mantissa
812
+ bits = (1 << 29) + (bits >> 1) - (1 << 22);
813
+ CopyBytes<sizeof(bits)>(&bits, &v);
814
+ return v;
815
+ #endif // !HWY_COMPILER_GCC_ACTUAL
816
+ #else
817
+ return Vec1<float>(sqrtf(v.raw));
818
+ #endif // !HWY_NO_LIBCXX
819
+ }
820
+ HWY_API Vec1<double> Sqrt(Vec1<double> v) {
821
+ #if defined(HWY_NO_LIBCXX)
822
+ #if HWY_COMPILER_GCC_ACTUAL
823
+ return Vec1<double>(__builtin_sqrt(v.raw));
824
+ #else
825
+ uint64_t bits;
826
+ CopyBytes<sizeof(bits)>(&v, &bits);
827
+ // Coarse approximation, letting the exponent LSB leak into the mantissa
828
+ bits = (1ULL << 61) + (bits >> 1) - (1ULL << 51);
829
+ CopyBytes<sizeof(bits)>(&bits, &v);
830
+ return v;
831
+ #endif // !HWY_COMPILER_GCC_ACTUAL
832
+ #else
833
+ return Vec1<double>(sqrt(v.raw));
834
+ #endif // HWY_NO_LIBCXX
835
+ }
836
+
837
+ // ------------------------------ Floating-point rounding
838
+
839
+ template <typename T>
840
+ HWY_API Vec1<T> Round(const Vec1<T> v) {
841
+ using TI = MakeSigned<T>;
842
+ if (!(Abs(v).raw < MantissaEnd<T>())) { // Huge or NaN
843
+ return v;
844
+ }
845
+ const T bias = v.raw < T(0.0) ? T(-0.5) : T(0.5);
846
+ const TI rounded = static_cast<TI>(v.raw + bias);
847
+ if (rounded == 0) return CopySignToAbs(Vec1<T>(0), v);
848
+ // Round to even
849
+ if ((rounded & 1) && detail::Abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
850
+ return Vec1<T>(static_cast<T>(rounded - (v.raw < T(0) ? -1 : 1)));
851
+ }
852
+ return Vec1<T>(static_cast<T>(rounded));
853
+ }
854
+
855
+ // Round-to-nearest even.
856
+ HWY_API Vec1<int32_t> NearestInt(const Vec1<float> v) {
857
+ using T = float;
858
+ using TI = int32_t;
859
+
860
+ const T abs = Abs(v).raw;
861
+ const bool is_sign = detail::SignBit(v.raw);
862
+
863
+ if (!(abs < MantissaEnd<T>())) { // Huge or NaN
864
+ // Check if too large to cast or NaN
865
+ if (!(abs <= static_cast<T>(LimitsMax<TI>()))) {
866
+ return Vec1<TI>(is_sign ? LimitsMin<TI>() : LimitsMax<TI>());
867
+ }
868
+ return Vec1<int32_t>(static_cast<TI>(v.raw));
869
+ }
870
+ const T bias = v.raw < T(0.0) ? T(-0.5) : T(0.5);
871
+ const TI rounded = static_cast<TI>(v.raw + bias);
872
+ if (rounded == 0) return Vec1<int32_t>(0);
873
+ // Round to even
874
+ if ((rounded & 1) && detail::Abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
875
+ return Vec1<TI>(rounded - (is_sign ? -1 : 1));
876
+ }
877
+ return Vec1<TI>(rounded);
878
+ }
879
+
880
+ template <typename T>
881
+ HWY_API Vec1<T> Trunc(const Vec1<T> v) {
882
+ using TI = MakeSigned<T>;
883
+ if (!(Abs(v).raw <= MantissaEnd<T>())) { // Huge or NaN
884
+ return v;
885
+ }
886
+ const TI truncated = static_cast<TI>(v.raw);
887
+ if (truncated == 0) return CopySignToAbs(Vec1<T>(0), v);
888
+ return Vec1<T>(static_cast<T>(truncated));
889
+ }
890
+
891
+ template <typename Float, typename Bits, int kMantissaBits, int kExponentBits,
892
+ class V>
893
+ V Ceiling(const V v) {
894
+ const Bits kExponentMask = (1ull << kExponentBits) - 1;
895
+ const Bits kMantissaMask = (1ull << kMantissaBits) - 1;
896
+ const Bits kBias = kExponentMask / 2;
897
+
898
+ Float f = v.raw;
899
+ const bool positive = f > Float(0.0);
900
+
901
+ Bits bits;
902
+ CopySameSize(&v, &bits);
903
+
904
+ const int exponent =
905
+ static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
906
+ // Already an integer.
907
+ if (exponent >= kMantissaBits) return v;
908
+ // |v| <= 1 => 0 or 1.
909
+ if (exponent < 0) return positive ? V(1) : V(-0.0);
910
+
911
+ const Bits mantissa_mask = kMantissaMask >> exponent;
912
+ // Already an integer
913
+ if ((bits & mantissa_mask) == 0) return v;
914
+
915
+ // Clear fractional bits and round up
916
+ if (positive) bits += (kMantissaMask + 1) >> exponent;
917
+ bits &= ~mantissa_mask;
918
+
919
+ CopySameSize(&bits, &f);
920
+ return V(f);
921
+ }
922
+
923
+ template <typename Float, typename Bits, int kMantissaBits, int kExponentBits,
924
+ class V>
925
+ V Floor(const V v) {
926
+ const Bits kExponentMask = (1ull << kExponentBits) - 1;
927
+ const Bits kMantissaMask = (1ull << kMantissaBits) - 1;
928
+ const Bits kBias = kExponentMask / 2;
929
+
930
+ Float f = v.raw;
931
+ const bool negative = f < Float(0.0);
932
+
933
+ Bits bits;
934
+ CopySameSize(&v, &bits);
935
+
936
+ const int exponent =
937
+ static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
938
+ // Already an integer.
939
+ if (exponent >= kMantissaBits) return v;
940
+ // |v| <= 1 => -1 or 0.
941
+ if (exponent < 0) return V(negative ? Float(-1.0) : Float(0.0));
942
+
943
+ const Bits mantissa_mask = kMantissaMask >> exponent;
944
+ // Already an integer
945
+ if ((bits & mantissa_mask) == 0) return v;
946
+
947
+ // Clear fractional bits and round down
948
+ if (negative) bits += (kMantissaMask + 1) >> exponent;
949
+ bits &= ~mantissa_mask;
950
+
951
+ CopySameSize(&bits, &f);
952
+ return V(f);
953
+ }
954
+
955
+ // Toward +infinity, aka ceiling
956
+ HWY_API Vec1<float> Ceil(const Vec1<float> v) {
957
+ return Ceiling<float, uint32_t, 23, 8>(v);
958
+ }
959
+ HWY_API Vec1<double> Ceil(const Vec1<double> v) {
960
+ return Ceiling<double, uint64_t, 52, 11>(v);
961
+ }
962
+
963
+ // Toward -infinity, aka floor
964
+ HWY_API Vec1<float> Floor(const Vec1<float> v) {
965
+ return Floor<float, uint32_t, 23, 8>(v);
966
+ }
967
+ HWY_API Vec1<double> Floor(const Vec1<double> v) {
968
+ return Floor<double, uint64_t, 52, 11>(v);
969
+ }
970
+
971
+ // ================================================== COMPARE
972
+
973
+ template <typename T>
974
+ HWY_API Mask1<T> operator==(const Vec1<T> a, const Vec1<T> b) {
975
+ return Mask1<T>::FromBool(a.raw == b.raw);
976
+ }
977
+
978
+ template <typename T>
979
+ HWY_API Mask1<T> operator!=(const Vec1<T> a, const Vec1<T> b) {
980
+ return Mask1<T>::FromBool(a.raw != b.raw);
981
+ }
982
+
983
+ template <typename T>
984
+ HWY_API Mask1<T> TestBit(const Vec1<T> v, const Vec1<T> bit) {
985
+ static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
986
+ return (v & bit) == bit;
987
+ }
988
+
989
+ template <typename T>
990
+ HWY_API Mask1<T> operator<(const Vec1<T> a, const Vec1<T> b) {
991
+ return Mask1<T>::FromBool(a.raw < b.raw);
992
+ }
993
+ template <typename T>
994
+ HWY_API Mask1<T> operator>(const Vec1<T> a, const Vec1<T> b) {
995
+ return Mask1<T>::FromBool(a.raw > b.raw);
996
+ }
997
+
998
+ template <typename T>
999
+ HWY_API Mask1<T> operator<=(const Vec1<T> a, const Vec1<T> b) {
1000
+ return Mask1<T>::FromBool(a.raw <= b.raw);
1001
+ }
1002
+ template <typename T>
1003
+ HWY_API Mask1<T> operator>=(const Vec1<T> a, const Vec1<T> b) {
1004
+ return Mask1<T>::FromBool(a.raw >= b.raw);
1005
+ }
1006
+
1007
+ // ------------------------------ Floating-point classification (==)
1008
+
1009
+ template <typename T>
1010
+ HWY_API Mask1<T> IsNaN(const Vec1<T> v) {
1011
+ // std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY.
1012
+ MakeUnsigned<T> bits;
1013
+ CopySameSize(&v, &bits);
1014
+ bits += bits;
1015
+ bits >>= 1; // clear sign bit
1016
+ // NaN if all exponent bits are set and the mantissa is not zero.
1017
+ return Mask1<T>::FromBool(bits > ExponentMask<T>());
1018
+ }
1019
+
1020
+ HWY_API Mask1<float> IsInf(const Vec1<float> v) {
1021
+ const Sisd<float> d;
1022
+ const RebindToUnsigned<decltype(d)> du;
1023
+ const Vec1<uint32_t> vu = BitCast(du, v);
1024
+ // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
1025
+ return RebindMask(d, (vu + vu) == Set(du, 0xFF000000u));
1026
+ }
1027
+ HWY_API Mask1<double> IsInf(const Vec1<double> v) {
1028
+ const Sisd<double> d;
1029
+ const RebindToUnsigned<decltype(d)> du;
1030
+ const Vec1<uint64_t> vu = BitCast(du, v);
1031
+ // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
1032
+ return RebindMask(d, (vu + vu) == Set(du, 0xFFE0000000000000ull));
1033
+ }
1034
+
1035
+ HWY_API Mask1<float> IsFinite(const Vec1<float> v) {
1036
+ const Vec1<uint32_t> vu = BitCast(Sisd<uint32_t>(), v);
1037
+ // Shift left to clear the sign bit, check whether exponent != max value.
1038
+ return Mask1<float>::FromBool((vu.raw << 1) < 0xFF000000u);
1039
+ }
1040
+ HWY_API Mask1<double> IsFinite(const Vec1<double> v) {
1041
+ const Vec1<uint64_t> vu = BitCast(Sisd<uint64_t>(), v);
1042
+ // Shift left to clear the sign bit, check whether exponent != max value.
1043
+ return Mask1<double>::FromBool((vu.raw << 1) < 0xFFE0000000000000ull);
1044
+ }
1045
+
1046
+ // ================================================== MEMORY
1047
+
1048
+ // ------------------------------ Load
1049
+
1050
+ template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>>
1051
+ HWY_API Vec1<T> Load(D /* tag */, const T* HWY_RESTRICT aligned) {
1052
+ T t;
1053
+ CopySameSize(aligned, &t);
1054
+ return Vec1<T>(t);
1055
+ }
1056
+
1057
+ template <class D, typename T = TFromD<D>>
1058
+ HWY_API Vec1<T> MaskedLoad(Mask1<T> m, D d, const T* HWY_RESTRICT aligned) {
1059
+ return IfThenElseZero(m, Load(d, aligned));
1060
+ }
1061
+
1062
+ template <class D, typename T = TFromD<D>>
1063
+ HWY_API Vec1<T> MaskedLoadOr(Vec1<T> v, Mask1<T> m, D d,
1064
+ const T* HWY_RESTRICT aligned) {
1065
+ return IfThenElse(m, Load(d, aligned), v);
1066
+ }
1067
+
1068
+ template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>>
1069
+ HWY_API Vec1<T> LoadU(D d, const T* HWY_RESTRICT p) {
1070
+ return Load(d, p);
1071
+ }
1072
+
1073
+ // In some use cases, "load single lane" is sufficient; otherwise avoid this.
1074
+ template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>>
1075
+ HWY_API Vec1<T> LoadDup128(D d, const T* HWY_RESTRICT aligned) {
1076
+ return Load(d, aligned);
1077
+ }
1078
+
1079
+ #ifdef HWY_NATIVE_LOAD_N
1080
+ #undef HWY_NATIVE_LOAD_N
1081
+ #else
1082
+ #define HWY_NATIVE_LOAD_N
1083
+ #endif
1084
+
1085
+ template <class D, typename T = TFromD<D>>
1086
+ HWY_API VFromD<D> LoadN(D d, const T* HWY_RESTRICT p,
1087
+ size_t max_lanes_to_load) {
1088
+ return (max_lanes_to_load > 0) ? Load(d, p) : Zero(d);
1089
+ }
1090
+
1091
+ template <class D, typename T = TFromD<D>>
1092
+ HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const T* HWY_RESTRICT p,
1093
+ size_t max_lanes_to_load) {
1094
+ return (max_lanes_to_load > 0) ? Load(d, p) : no;
1095
+ }
1096
+
1097
+ // ------------------------------ Store
1098
+
1099
+ template <class D, typename T = TFromD<D>>
1100
+ HWY_API void Store(const Vec1<T> v, D /* tag */, T* HWY_RESTRICT aligned) {
1101
+ CopySameSize(&v.raw, aligned);
1102
+ }
1103
+
1104
+ template <class D, typename T = TFromD<D>>
1105
+ HWY_API void StoreU(const Vec1<T> v, D d, T* HWY_RESTRICT p) {
1106
+ return Store(v, d, p);
1107
+ }
1108
+
1109
+ template <class D, typename T = TFromD<D>>
1110
+ HWY_API void BlendedStore(const Vec1<T> v, Mask1<T> m, D d, T* HWY_RESTRICT p) {
1111
+ if (!m.bits) return;
1112
+ StoreU(v, d, p);
1113
+ }
1114
+
1115
+ #ifdef HWY_NATIVE_STORE_N
1116
+ #undef HWY_NATIVE_STORE_N
1117
+ #else
1118
+ #define HWY_NATIVE_STORE_N
1119
+ #endif
1120
+
1121
+ template <class D, typename T = TFromD<D>>
1122
+ HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
1123
+ size_t max_lanes_to_store) {
1124
+ if (max_lanes_to_store > 0) {
1125
+ Store(v, d, p);
1126
+ }
1127
+ }
1128
+
1129
+ // ------------------------------ LoadInterleaved2/3/4
1130
+
1131
+ // Per-target flag to prevent generic_ops-inl.h from defining StoreInterleaved2.
1132
+ #ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
1133
+ #undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
1134
+ #else
1135
+ #define HWY_NATIVE_LOAD_STORE_INTERLEAVED
1136
+ #endif
1137
+
1138
+ template <class D, typename T = TFromD<D>>
1139
+ HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned, Vec1<T>& v0,
1140
+ Vec1<T>& v1) {
1141
+ v0 = LoadU(d, unaligned + 0);
1142
+ v1 = LoadU(d, unaligned + 1);
1143
+ }
1144
+
1145
+ template <class D, typename T = TFromD<D>>
1146
+ HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned, Vec1<T>& v0,
1147
+ Vec1<T>& v1, Vec1<T>& v2) {
1148
+ v0 = LoadU(d, unaligned + 0);
1149
+ v1 = LoadU(d, unaligned + 1);
1150
+ v2 = LoadU(d, unaligned + 2);
1151
+ }
1152
+
1153
+ template <class D, typename T = TFromD<D>>
1154
+ HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned, Vec1<T>& v0,
1155
+ Vec1<T>& v1, Vec1<T>& v2, Vec1<T>& v3) {
1156
+ v0 = LoadU(d, unaligned + 0);
1157
+ v1 = LoadU(d, unaligned + 1);
1158
+ v2 = LoadU(d, unaligned + 2);
1159
+ v3 = LoadU(d, unaligned + 3);
1160
+ }
1161
+
1162
+ // ------------------------------ StoreInterleaved2/3/4
1163
+
1164
+ template <class D, typename T = TFromD<D>>
1165
+ HWY_API void StoreInterleaved2(const Vec1<T> v0, const Vec1<T> v1, D d,
1166
+ T* HWY_RESTRICT unaligned) {
1167
+ StoreU(v0, d, unaligned + 0);
1168
+ StoreU(v1, d, unaligned + 1);
1169
+ }
1170
+
1171
+ template <class D, typename T = TFromD<D>>
1172
+ HWY_API void StoreInterleaved3(const Vec1<T> v0, const Vec1<T> v1,
1173
+ const Vec1<T> v2, D d,
1174
+ T* HWY_RESTRICT unaligned) {
1175
+ StoreU(v0, d, unaligned + 0);
1176
+ StoreU(v1, d, unaligned + 1);
1177
+ StoreU(v2, d, unaligned + 2);
1178
+ }
1179
+
1180
+ template <class D, typename T = TFromD<D>>
1181
+ HWY_API void StoreInterleaved4(const Vec1<T> v0, const Vec1<T> v1,
1182
+ const Vec1<T> v2, const Vec1<T> v3, D d,
1183
+ T* HWY_RESTRICT unaligned) {
1184
+ StoreU(v0, d, unaligned + 0);
1185
+ StoreU(v1, d, unaligned + 1);
1186
+ StoreU(v2, d, unaligned + 2);
1187
+ StoreU(v3, d, unaligned + 3);
1188
+ }
1189
+
1190
+ // ------------------------------ Stream
1191
+
1192
+ template <class D, typename T = TFromD<D>>
1193
+ HWY_API void Stream(const Vec1<T> v, D d, T* HWY_RESTRICT aligned) {
1194
+ return Store(v, d, aligned);
1195
+ }
1196
+
1197
+ // ------------------------------ Scatter
1198
+
1199
+ #ifdef HWY_NATIVE_SCATTER
1200
+ #undef HWY_NATIVE_SCATTER
1201
+ #else
1202
+ #define HWY_NATIVE_SCATTER
1203
+ #endif
1204
+
1205
+ template <class D, typename T = TFromD<D>, typename TI>
1206
+ HWY_API void ScatterOffset(Vec1<T> v, D d, T* base, Vec1<TI> offset) {
1207
+ static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
1208
+ uint8_t* const base8 = reinterpret_cast<uint8_t*>(base) + offset.raw;
1209
+ Store(v, d, reinterpret_cast<T*>(base8));
1210
+ }
1211
+
1212
+ template <class D, typename T = TFromD<D>, typename TI>
1213
+ HWY_API void ScatterIndex(Vec1<T> v, D d, T* HWY_RESTRICT base,
1214
+ Vec1<TI> index) {
1215
+ static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
1216
+ Store(v, d, base + index.raw);
1217
+ }
1218
+
1219
+ template <class D, typename T = TFromD<D>, typename TI>
1220
+ HWY_API void MaskedScatterIndex(Vec1<T> v, Mask1<T> m, D d,
1221
+ T* HWY_RESTRICT base, Vec1<TI> index) {
1222
+ static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
1223
+ if (m.bits) Store(v, d, base + index.raw);
1224
+ }
1225
+
1226
+ // ------------------------------ Gather
1227
+
1228
+ #ifdef HWY_NATIVE_GATHER
1229
+ #undef HWY_NATIVE_GATHER
1230
+ #else
1231
+ #define HWY_NATIVE_GATHER
1232
+ #endif
1233
+
1234
+ template <class D, typename T = TFromD<D>, typename TI>
1235
+ HWY_API Vec1<T> GatherOffset(D d, const T* base, Vec1<TI> offset) {
1236
+ static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
1237
+ const intptr_t addr =
1238
+ reinterpret_cast<intptr_t>(base) + static_cast<intptr_t>(offset.raw);
1239
+ return Load(d, reinterpret_cast<const T*>(addr));
1240
+ }
1241
+
1242
+ template <class D, typename T = TFromD<D>, typename TI>
1243
+ HWY_API Vec1<T> GatherIndex(D d, const T* HWY_RESTRICT base, Vec1<TI> index) {
1244
+ static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
1245
+ return Load(d, base + index.raw);
1246
+ }
1247
+
1248
+ template <class D, typename T = TFromD<D>, typename TI>
1249
+ HWY_API Vec1<T> MaskedGatherIndex(Mask1<T> m, D d, const T* HWY_RESTRICT base,
1250
+ Vec1<TI> index) {
1251
+ static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
1252
+ return MaskedLoad(m, d, base + index.raw);
1253
+ }
1254
+
1255
+ // ================================================== CONVERT
1256
+
1257
+ // ConvertTo and DemoteTo with floating-point input and integer output truncate
1258
+ // (rounding toward zero).
1259
+
1260
+ namespace detail {
1261
+
1262
+ template <class ToT, class FromT>
1263
+ HWY_INLINE ToT CastValueForF2IConv(hwy::UnsignedTag /* to_type_tag */,
1264
+ FromT val) {
1265
+ // Prevent ubsan errors when converting float to narrower integer
1266
+
1267
+ // If LimitsMax<ToT>() can be exactly represented in FromT,
1268
+ // kSmallestOutOfToTRangePosVal is equal to LimitsMax<ToT>().
1269
+
1270
+ // Otherwise, if LimitsMax<ToT>() cannot be exactly represented in FromT,
1271
+ // kSmallestOutOfToTRangePosVal is equal to LimitsMax<ToT>() + 1, which can
1272
+ // be exactly represented in FromT.
1273
+ constexpr FromT kSmallestOutOfToTRangePosVal =
1274
+ (sizeof(ToT) * 8 <= static_cast<size_t>(MantissaBits<FromT>()) + 1)
1275
+ ? static_cast<FromT>(LimitsMax<ToT>())
1276
+ : static_cast<FromT>(
1277
+ static_cast<FromT>(ToT{1} << (sizeof(ToT) * 8 - 1)) * FromT(2));
1278
+
1279
+ if (detail::SignBit(val)) {
1280
+ return ToT{0};
1281
+ } else if (IsInf(Vec1<FromT>(val)).bits ||
1282
+ val >= kSmallestOutOfToTRangePosVal) {
1283
+ return LimitsMax<ToT>();
1284
+ } else {
1285
+ return static_cast<ToT>(val);
1286
+ }
1287
+ }
1288
+
1289
+ template <class ToT, class FromT>
1290
+ HWY_INLINE ToT CastValueForF2IConv(hwy::SignedTag /* to_type_tag */,
1291
+ FromT val) {
1292
+ // Prevent ubsan errors when converting float to narrower integer
1293
+
1294
+ // If LimitsMax<ToT>() can be exactly represented in FromT,
1295
+ // kSmallestOutOfToTRangePosVal is equal to LimitsMax<ToT>().
1296
+
1297
+ // Otherwise, if LimitsMax<ToT>() cannot be exactly represented in FromT,
1298
+ // kSmallestOutOfToTRangePosVal is equal to -LimitsMin<ToT>(), which can
1299
+ // be exactly represented in FromT.
1300
+ constexpr FromT kSmallestOutOfToTRangePosVal =
1301
+ (sizeof(ToT) * 8 <= static_cast<size_t>(MantissaBits<FromT>()) + 2)
1302
+ ? static_cast<FromT>(LimitsMax<ToT>())
1303
+ : static_cast<FromT>(-static_cast<FromT>(LimitsMin<ToT>()));
1304
+
1305
+ if (IsInf(Vec1<FromT>(val)).bits ||
1306
+ detail::Abs(val) >= kSmallestOutOfToTRangePosVal) {
1307
+ return detail::SignBit(val) ? LimitsMin<ToT>() : LimitsMax<ToT>();
1308
+ } else {
1309
+ return static_cast<ToT>(val);
1310
+ }
1311
+ }
1312
+
1313
+ template <class ToT, class ToTypeTag, class FromT>
1314
+ HWY_INLINE ToT CastValueForPromoteTo(ToTypeTag /* to_type_tag */, FromT val) {
1315
+ return static_cast<ToT>(val);
1316
+ }
1317
+
1318
+ template <class ToT>
1319
+ HWY_INLINE ToT CastValueForPromoteTo(hwy::SignedTag to_type_tag, float val) {
1320
+ return CastValueForF2IConv<ToT>(to_type_tag, val);
1321
+ }
1322
+
1323
+ template <class ToT>
1324
+ HWY_INLINE ToT CastValueForPromoteTo(hwy::UnsignedTag to_type_tag, float val) {
1325
+ return CastValueForF2IConv<ToT>(to_type_tag, val);
1326
+ }
1327
+
1328
+ } // namespace detail
1329
+
1330
+ template <class DTo, typename TTo = TFromD<DTo>, typename TFrom>
1331
+ HWY_API Vec1<TTo> PromoteTo(DTo /* tag */, Vec1<TFrom> from) {
1332
+ static_assert(sizeof(TTo) > sizeof(TFrom), "Not promoting");
1333
+ // For bits Y > X, floatX->floatY and intX->intY are always representable.
1334
+ return Vec1<TTo>(
1335
+ detail::CastValueForPromoteTo<TTo>(hwy::TypeTag<TTo>(), from.raw));
1336
+ }
1337
+
1338
+ // MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(TFrom) is here,
1339
+ // so we overload for TFrom=double and TTo={float,int32_t}.
1340
+ template <class D, HWY_IF_F32_D(D)>
1341
+ HWY_API Vec1<float> DemoteTo(D /* tag */, Vec1<double> from) {
1342
+ // Prevent ubsan errors when converting float to narrower integer/float
1343
+ if (IsInf(from).bits ||
1344
+ Abs(from).raw > static_cast<double>(HighestValue<float>())) {
1345
+ return Vec1<float>(detail::SignBit(from.raw) ? LowestValue<float>()
1346
+ : HighestValue<float>());
1347
+ }
1348
+ return Vec1<float>(static_cast<float>(from.raw));
1349
+ }
1350
+ template <class D, HWY_IF_UI32_D(D)>
1351
+ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec1<double> from) {
1352
+ // Prevent ubsan errors when converting int32_t to narrower integer/int32_t
1353
+ return Vec1<TFromD<D>>(detail::CastValueForF2IConv<TFromD<D>>(
1354
+ hwy::TypeTag<TFromD<D>>(), from.raw));
1355
+ }
1356
+
1357
+ template <class DTo, typename TTo = TFromD<DTo>, typename TFrom,
1358
+ HWY_IF_SIGNED(TFrom), HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DTo>)>
1359
+ HWY_API Vec1<TTo> DemoteTo(DTo /* tag */, Vec1<TFrom> from) {
1360
+ static_assert(!IsFloat<TFrom>(), "TFrom=double are handled above");
1361
+ static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting");
1362
+
1363
+ // Int to int: choose closest value in TTo to `from` (avoids UB)
1364
+ from.raw = HWY_MIN(HWY_MAX(LimitsMin<TTo>(), from.raw), LimitsMax<TTo>());
1365
+ return Vec1<TTo>(static_cast<TTo>(from.raw));
1366
+ }
1367
+
1368
+ template <class DTo, typename TTo = TFromD<DTo>, typename TFrom,
1369
+ HWY_IF_UNSIGNED(TFrom), HWY_IF_UNSIGNED_D(DTo)>
1370
+ HWY_API Vec1<TTo> DemoteTo(DTo /* tag */, Vec1<TFrom> from) {
1371
+ static_assert(!IsFloat<TFrom>(), "TFrom=double are handled above");
1372
+ static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting");
1373
+
1374
+ // Int to int: choose closest value in TTo to `from` (avoids UB)
1375
+ from.raw = HWY_MIN(from.raw, LimitsMax<TTo>());
1376
+ return Vec1<TTo>(static_cast<TTo>(from.raw));
1377
+ }
1378
+
1379
+ template <class DTo, typename TTo = TFromD<DTo>, typename TFrom,
1380
+ HWY_IF_UI64(TFrom), HWY_IF_F32_D(DTo)>
1381
+ HWY_API Vec1<TTo> DemoteTo(DTo /* tag */, Vec1<TFrom> from) {
1382
+ // int64_t/uint64_t to float: simply cast to TTo
1383
+ return Vec1<TTo>(static_cast<TTo>(from.raw));
1384
+ }
1385
+
1386
+ // Per-target flag to prevent generic_ops-inl.h from defining f16 conversions;
1387
+ // use this scalar version to verify the vector implementation.
1388
+ #ifdef HWY_NATIVE_F16C
1389
+ #undef HWY_NATIVE_F16C
1390
+ #else
1391
+ #define HWY_NATIVE_F16C
1392
+ #endif
1393
+
1394
+ template <class D, HWY_IF_F32_D(D)>
1395
+ HWY_API Vec1<float> PromoteTo(D /* tag */, const Vec1<float16_t> v) {
1396
+ return Vec1<float>(F32FromF16(v.raw));
1397
+ }
1398
+
1399
+ template <class D, HWY_IF_F32_D(D)>
1400
+ HWY_API Vec1<float> PromoteTo(D d, const Vec1<bfloat16_t> v) {
1401
+ return Set(d, F32FromBF16(v.raw));
1402
+ }
1403
+
1404
+ template <class D, HWY_IF_F16_D(D)>
1405
+ HWY_API Vec1<float16_t> DemoteTo(D /* tag */, const Vec1<float> v) {
1406
+ return Vec1<float16_t>(F16FromF32(v.raw));
1407
+ }
1408
+
1409
+ template <class D, HWY_IF_BF16_D(D)>
1410
+ HWY_API Vec1<bfloat16_t> DemoteTo(D d, const Vec1<float> v) {
1411
+ return Set(d, BF16FromF32(v.raw));
1412
+ }
1413
+
1414
+ template <class DTo, typename TTo = TFromD<DTo>, typename TFrom,
1415
+ HWY_IF_FLOAT(TFrom)>
1416
+ HWY_API Vec1<TTo> ConvertTo(DTo /* tag */, Vec1<TFrom> from) {
1417
+ static_assert(sizeof(TTo) == sizeof(TFrom), "Should have same size");
1418
+ // float## -> int##: return closest representable value.
1419
+ return Vec1<TTo>(
1420
+ detail::CastValueForF2IConv<TTo>(hwy::TypeTag<TTo>(), from.raw));
1421
+ }
1422
+
1423
+ template <class DTo, typename TTo = TFromD<DTo>, typename TFrom,
1424
+ HWY_IF_NOT_FLOAT(TFrom)>
1425
+ HWY_API Vec1<TTo> ConvertTo(DTo /* tag */, Vec1<TFrom> from) {
1426
+ static_assert(sizeof(TTo) == sizeof(TFrom), "Should have same size");
1427
+ // int## -> float##: no check needed
1428
+ return Vec1<TTo>(static_cast<TTo>(from.raw));
1429
+ }
1430
+
1431
+ HWY_API Vec1<uint8_t> U8FromU32(const Vec1<uint32_t> v) {
1432
+ return DemoteTo(Sisd<uint8_t>(), v);
1433
+ }
1434
+
1435
+ // ------------------------------ TruncateTo
1436
+
1437
+ template <class D, HWY_IF_U8_D(D)>
1438
+ HWY_API Vec1<uint8_t> TruncateTo(D /* tag */, Vec1<uint64_t> v) {
1439
+ return Vec1<uint8_t>{static_cast<uint8_t>(v.raw & 0xFF)};
1440
+ }
1441
+
1442
+ template <class D, HWY_IF_U16_D(D)>
1443
+ HWY_API Vec1<uint16_t> TruncateTo(D /* tag */, Vec1<uint64_t> v) {
1444
+ return Vec1<uint16_t>{static_cast<uint16_t>(v.raw & 0xFFFF)};
1445
+ }
1446
+
1447
+ template <class D, HWY_IF_U32_D(D)>
1448
+ HWY_API Vec1<uint32_t> TruncateTo(D /* tag */, Vec1<uint64_t> v) {
1449
+ return Vec1<uint32_t>{static_cast<uint32_t>(v.raw & 0xFFFFFFFFu)};
1450
+ }
1451
+
1452
+ template <class D, HWY_IF_U8_D(D)>
1453
+ HWY_API Vec1<uint8_t> TruncateTo(D /* tag */, Vec1<uint32_t> v) {
1454
+ return Vec1<uint8_t>{static_cast<uint8_t>(v.raw & 0xFF)};
1455
+ }
1456
+
1457
+ template <class D, HWY_IF_U16_D(D)>
1458
+ HWY_API Vec1<uint16_t> TruncateTo(D /* tag */, Vec1<uint32_t> v) {
1459
+ return Vec1<uint16_t>{static_cast<uint16_t>(v.raw & 0xFFFF)};
1460
+ }
1461
+
1462
+ template <class D, HWY_IF_U8_D(D)>
1463
+ HWY_API Vec1<uint8_t> TruncateTo(D /* tag */, Vec1<uint16_t> v) {
1464
+ return Vec1<uint8_t>{static_cast<uint8_t>(v.raw & 0xFF)};
1465
+ }
1466
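+ // TruncateTo is modular, e.g. TruncateTo(Sisd<uint8_t>(), Vec1<uint32_t>(0x1234))
+ // keeps only the low byte and yields 0x34.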
+
1467
+ // ================================================== COMBINE
1468
+ // UpperHalf, ZeroExtendVector, Combine, Concat* are unsupported.
1469
+
1470
+ template <typename T>
1471
+ HWY_API Vec1<T> LowerHalf(Vec1<T> v) {
1472
+ return v;
1473
+ }
1474
+
1475
+ template <class D, typename T = TFromD<D>>
1476
+ HWY_API Vec1<T> LowerHalf(D /* tag */, Vec1<T> v) {
1477
+ return v;
1478
+ }
1479
+
1480
+ // ================================================== SWIZZLE
1481
+
1482
+ template <typename T>
1483
+ HWY_API T GetLane(const Vec1<T> v) {
1484
+ return v.raw;
1485
+ }
1486
+
1487
+ template <typename T>
1488
+ HWY_API T ExtractLane(const Vec1<T> v, size_t i) {
1489
+ HWY_DASSERT(i == 0);
1490
+ (void)i;
1491
+ return v.raw;
1492
+ }
1493
+
1494
+ template <typename T>
1495
+ HWY_API Vec1<T> InsertLane(Vec1<T> v, size_t i, T t) {
1496
+ HWY_DASSERT(i == 0);
1497
+ (void)i;
1498
+ v.raw = t;
1499
+ return v;
1500
+ }
1501
+
1502
+ template <typename T>
1503
+ HWY_API Vec1<T> DupEven(Vec1<T> v) {
1504
+ return v;
1505
+ }
1506
+ // DupOdd is unsupported.
1507
+
1508
+ template <typename T>
1509
+ HWY_API Vec1<T> OddEven(Vec1<T> /* odd */, Vec1<T> even) {
1510
+ return even;
1511
+ }
1512
+
1513
+ template <typename T>
1514
+ HWY_API Vec1<T> OddEvenBlocks(Vec1<T> /* odd */, Vec1<T> even) {
1515
+ return even;
1516
+ }
1517
+
1518
+ // ------------------------------ SwapAdjacentBlocks
1519
+
1520
+ template <typename T>
1521
+ HWY_API Vec1<T> SwapAdjacentBlocks(Vec1<T> v) {
1522
+ return v;
1523
+ }
1524
+
1525
+ // ------------------------------ TableLookupLanes
1526
+
1527
+ // Returned by SetTableIndices for use by TableLookupLanes.
1528
+ template <typename T>
1529
+ struct Indices1 {
1530
+ MakeSigned<T> raw;
1531
+ };
1532
+
1533
+ template <class D, typename T = TFromD<D>, typename TI>
1534
+ HWY_API Indices1<T> IndicesFromVec(D, Vec1<TI> vec) {
1535
+ static_assert(sizeof(T) == sizeof(TI), "Index size must match lane size");
1536
+ HWY_DASSERT(vec.raw <= 1);
1537
+ return Indices1<T>{static_cast<MakeSigned<T>>(vec.raw)};
1538
+ }
1539
+
1540
+ template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>, typename TI>
1541
+ HWY_API Indices1<T> SetTableIndices(D d, const TI* idx) {
1542
+ return IndicesFromVec(d, LoadU(Sisd<TI>(), idx));
1543
+ }
1544
+
1545
+ template <typename T>
1546
+ HWY_API Vec1<T> TableLookupLanes(const Vec1<T> v, const Indices1<T> /* idx */) {
1547
+ return v;
1548
+ }
1549
+
1550
+ template <typename T>
1551
+ HWY_API Vec1<T> TwoTablesLookupLanes(const Vec1<T> a, const Vec1<T> b,
1552
+ const Indices1<T> idx) {
1553
+ return (idx.raw == 0) ? a : b;
1554
+ }
1555
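+ // With a single lane, an index of 0 selects from the first table (a) and an
+ // index of 1 selects from the second (b), matching TwoTablesLookupLanes on
+ // wider targets, where indices >= N address the second vector.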
+
1556
+ // ------------------------------ ReverseBlocks
1557
+
1558
+ // Single block: no change
1559
+ template <class D, typename T = TFromD<D>>
1560
+ HWY_API Vec1<T> ReverseBlocks(D /* tag */, const Vec1<T> v) {
1561
+ return v;
1562
+ }
1563
+
1564
+ // ------------------------------ Reverse
1565
+
1566
+ template <class D, typename T = TFromD<D>>
1567
+ HWY_API Vec1<T> Reverse(D /* tag */, const Vec1<T> v) {
1568
+ return v;
1569
+ }
1570
+
1571
+ // Per-target flag to prevent generic_ops-inl.h from defining 8-bit Reverse2/4/8.
1572
+ #ifdef HWY_NATIVE_REVERSE2_8
1573
+ #undef HWY_NATIVE_REVERSE2_8
1574
+ #else
1575
+ #define HWY_NATIVE_REVERSE2_8
1576
+ #endif
1577
+
1578
+ // Must not be called:
1579
+ template <class D, typename T = TFromD<D>>
1580
+ HWY_API Vec1<T> Reverse2(D /* tag */, const Vec1<T> v) {
1581
+ return v;
1582
+ }
1583
+
1584
+ template <class D, typename T = TFromD<D>>
1585
+ HWY_API Vec1<T> Reverse4(D /* tag */, const Vec1<T> v) {
1586
+ return v;
1587
+ }
1588
+
1589
+ template <class D, typename T = TFromD<D>>
1590
+ HWY_API Vec1<T> Reverse8(D /* tag */, const Vec1<T> v) {
1591
+ return v;
1592
+ }
1593
+
1594
+ // ------------------------------ ReverseLaneBytes
1595
+
1596
+ #ifdef HWY_NATIVE_REVERSE_LANE_BYTES
1597
+ #undef HWY_NATIVE_REVERSE_LANE_BYTES
1598
+ #else
1599
+ #define HWY_NATIVE_REVERSE_LANE_BYTES
1600
+ #endif
1601
+
1602
+ HWY_API Vec1<uint16_t> ReverseLaneBytes(Vec1<uint16_t> v) {
1603
+ const uint32_t val{v.raw};
1604
+ return Vec1<uint16_t>(
1605
+ static_cast<uint16_t>(((val << 8) & 0xFF00u) | ((val >> 8) & 0x00FFu)));
1606
+ }
1607
+
1608
+ HWY_API Vec1<uint32_t> ReverseLaneBytes(Vec1<uint32_t> v) {
1609
+ const uint32_t val = v.raw;
1610
+ return Vec1<uint32_t>(static_cast<uint32_t>(
1611
+ ((val << 24) & 0xFF000000u) | ((val << 8) & 0x00FF0000u) |
1612
+ ((val >> 8) & 0x0000FF00u) | ((val >> 24) & 0x000000FFu)));
1613
+ }
1614
+
1615
+ HWY_API Vec1<uint64_t> ReverseLaneBytes(Vec1<uint64_t> v) {
1616
+ const uint64_t val = v.raw;
1617
+ return Vec1<uint64_t>(static_cast<uint64_t>(
1618
+ ((val << 56) & 0xFF00000000000000u) |
1619
+ ((val << 40) & 0x00FF000000000000u) |
1620
+ ((val << 24) & 0x0000FF0000000000u) | ((val << 8) & 0x000000FF00000000u) |
1621
+ ((val >> 8) & 0x00000000FF000000u) | ((val >> 24) & 0x0000000000FF0000u) |
1622
+ ((val >> 40) & 0x000000000000FF00u) |
1623
+ ((val >> 56) & 0x00000000000000FFu)));
1624
+ }
1625
+
1626
+ template <class V, HWY_IF_SIGNED_V(V),
1627
+ HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 2) | (1 << 4) | (1 << 8))>
1628
+ HWY_API V ReverseLaneBytes(V v) {
1629
+ const DFromV<decltype(v)> d;
1630
+ const RebindToUnsigned<decltype(d)> du;
1631
+ return BitCast(d, ReverseLaneBytes(BitCast(du, v)));
1632
+ }
1633
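+ // ReverseLaneBytes is a per-lane byte swap, e.g. uint16_t 0x1234 -> 0x3412 and
+ // uint32_t 0x01020304 -> 0x04030201.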
+
1634
+ // ------------------------------ ReverseBits
1635
+ #ifdef HWY_NATIVE_REVERSE_BITS_UI8
1636
+ #undef HWY_NATIVE_REVERSE_BITS_UI8
1637
+ #else
1638
+ #define HWY_NATIVE_REVERSE_BITS_UI8
1639
+ #endif
1640
+
1641
+ #ifdef HWY_NATIVE_REVERSE_BITS_UI16_32_64
1642
+ #undef HWY_NATIVE_REVERSE_BITS_UI16_32_64
1643
+ #else
1644
+ #define HWY_NATIVE_REVERSE_BITS_UI16_32_64
1645
+ #endif
1646
+
1647
+ namespace detail {
1648
+
1649
+ template <class T>
1650
+ HWY_INLINE T ReverseBitsOfEachByte(T val) {
1651
+ using TU = MakeUnsigned<T>;
1652
+ constexpr TU kMaxUnsignedVal{LimitsMax<TU>()};
1653
+ constexpr TU kShrMask1 =
1654
+ static_cast<TU>(0x5555555555555555u & kMaxUnsignedVal);
1655
+ constexpr TU kShrMask2 =
1656
+ static_cast<TU>(0x3333333333333333u & kMaxUnsignedVal);
1657
+ constexpr TU kShrMask3 =
1658
+ static_cast<TU>(0x0F0F0F0F0F0F0F0Fu & kMaxUnsignedVal);
1659
+
1660
+ constexpr TU kShlMask1 = static_cast<TU>(~kShrMask1);
1661
+ constexpr TU kShlMask2 = static_cast<TU>(~kShrMask2);
1662
+ constexpr TU kShlMask3 = static_cast<TU>(~kShrMask3);
1663
+
1664
+ TU result = static_cast<TU>(val);
1665
+ result = static_cast<TU>(((result << 1) & kShlMask1) |
1666
+ ((result >> 1) & kShrMask1));
1667
+ result = static_cast<TU>(((result << 2) & kShlMask2) |
1668
+ ((result >> 2) & kShrMask2));
1669
+ result = static_cast<TU>(((result << 4) & kShlMask3) |
1670
+ ((result >> 4) & kShrMask3));
1671
+ return static_cast<T>(result);
1672
+ }
1673
+
1674
+ } // namespace detail
1675
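+ // ReverseBitsOfEachByte swaps adjacent bits, then 2-bit pairs, then nibbles,
+ // which reverses the bit order within every byte; e.g. 0xB1 -> 0x8D.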
+
1676
+ template <class V, HWY_IF_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, 1)>
1677
+ HWY_API V ReverseBits(V v) {
1678
+ return V(detail::ReverseBitsOfEachByte(v.raw));
1679
+ }
1680
+
1681
+ template <class V, HWY_IF_UNSIGNED_V(V),
1682
+ HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 2) | (1 << 4) | (1 << 8))>
1683
+ HWY_API V ReverseBits(V v) {
1684
+ return ReverseLaneBytes(V(detail::ReverseBitsOfEachByte(v.raw)));
1685
+ }
1686
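+ // For 16/32/64-bit lanes, a full lane-wide bit reversal is the per-byte
+ // reversal above followed by ReverseLaneBytes to reverse the byte order.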
+
1687
+ template <class V, HWY_IF_SIGNED_V(V)>
1688
+ HWY_API V ReverseBits(V v) {
1689
+ const DFromV<decltype(v)> d;
1690
+ const RebindToUnsigned<decltype(d)> du;
1691
+ return BitCast(d, ReverseBits(BitCast(du, v)));
1692
+ }
1693
+
1694
+ // ------------------------------ SlideUpLanes
1695
+
1696
+ template <typename D>
1697
+ HWY_API VFromD<D> SlideUpLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) {
1698
+ return v;
1699
+ }
1700
+
1701
+ // ------------------------------ SlideDownLanes
1702
+
1703
+ template <typename D>
1704
+ HWY_API VFromD<D> SlideDownLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) {
1705
+ return v;
1706
+ }
1707
+
1708
+ // ================================================== BLOCKWISE
1709
+ // Shift*Bytes, CombineShiftRightBytes, Interleave*, Shuffle* are unsupported.
1710
+
1711
+ // ------------------------------ Broadcast/splat any lane
1712
+
1713
+ template <int kLane, typename T>
1714
+ HWY_API Vec1<T> Broadcast(const Vec1<T> v) {
1715
+ static_assert(kLane == 0, "Scalar only has one lane");
1716
+ return v;
1717
+ }
1718
+
1719
+ // ------------------------------ TableLookupBytes, TableLookupBytesOr0
1720
+
1721
+ template <typename T, typename TI>
1722
+ HWY_API Vec1<TI> TableLookupBytes(const Vec1<T> in, const Vec1<TI> indices) {
1723
+ uint8_t in_bytes[sizeof(T)];
1724
+ uint8_t idx_bytes[sizeof(T)];
1725
+ uint8_t out_bytes[sizeof(T)];
1726
+ CopyBytes<sizeof(T)>(&in, &in_bytes); // copy to bytes
1727
+ CopyBytes<sizeof(T)>(&indices, &idx_bytes);
1728
+ for (size_t i = 0; i < sizeof(T); ++i) {
1729
+ out_bytes[i] = in_bytes[idx_bytes[i]];
1730
+ }
1731
+ TI out;
1732
+ CopyBytes<sizeof(TI)>(&out_bytes, &out);
1733
+ return Vec1<TI>{out};
1734
+ }
1735
+
1736
+ template <typename T, typename TI>
1737
+ HWY_API Vec1<TI> TableLookupBytesOr0(const Vec1<T> in, const Vec1<TI> indices) {
1738
+ uint8_t in_bytes[sizeof(T)];
1739
+ uint8_t idx_bytes[sizeof(T)];
1740
+ uint8_t out_bytes[sizeof(T)];
1741
+ CopyBytes<sizeof(T)>(&in, &in_bytes); // copy to bytes
1742
+ CopyBytes<sizeof(T)>(&indices, &idx_bytes);
1743
+ for (size_t i = 0; i < sizeof(T); ++i) {
1744
+ out_bytes[i] = idx_bytes[i] & 0x80 ? 0 : in_bytes[idx_bytes[i]];
1745
+ }
1746
+ TI out;
1747
+ CopyBytes<sizeof(TI)>(&out_bytes, &out);
1748
+ return Vec1<TI>{out};
1749
+ }
1750
+
1751
+ // ------------------------------ ZipLower
1752
+
1753
+ HWY_API Vec1<uint16_t> ZipLower(Vec1<uint8_t> a, Vec1<uint8_t> b) {
1754
+ return Vec1<uint16_t>(static_cast<uint16_t>((uint32_t{b.raw} << 8) + a.raw));
1755
+ }
1756
+ HWY_API Vec1<uint32_t> ZipLower(Vec1<uint16_t> a, Vec1<uint16_t> b) {
1757
+ return Vec1<uint32_t>((uint32_t{b.raw} << 16) + a.raw);
1758
+ }
1759
+ HWY_API Vec1<uint64_t> ZipLower(Vec1<uint32_t> a, Vec1<uint32_t> b) {
1760
+ return Vec1<uint64_t>((uint64_t{b.raw} << 32) + a.raw);
1761
+ }
1762
+ HWY_API Vec1<int16_t> ZipLower(Vec1<int8_t> a, Vec1<int8_t> b) {
1763
+ return Vec1<int16_t>(static_cast<int16_t>((int32_t{b.raw} << 8) + a.raw));
1764
+ }
1765
+ HWY_API Vec1<int32_t> ZipLower(Vec1<int16_t> a, Vec1<int16_t> b) {
1766
+ return Vec1<int32_t>((int32_t{b.raw} << 16) + a.raw);
1767
+ }
1768
+ HWY_API Vec1<int64_t> ZipLower(Vec1<int32_t> a, Vec1<int32_t> b) {
1769
+ return Vec1<int64_t>((int64_t{b.raw} << 32) + a.raw);
1770
+ }
1771
+
1772
+ template <class DW, typename TW = TFromD<DW>, typename TN = MakeNarrow<TW>>
1773
+ HWY_API Vec1<TW> ZipLower(DW /* tag */, Vec1<TN> a, Vec1<TN> b) {
1774
+ return Vec1<TW>(static_cast<TW>((TW{b.raw} << (sizeof(TN) * 8)) + a.raw));
1775
+ }
1776
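+ // ZipLower places a in the low half and b in the high half of the widened
+ // lane, e.g. ZipLower(Vec1<uint8_t>(0x12), Vec1<uint8_t>(0x34)) yields
+ // uint16_t 0x3412.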
+
1777
+ // ================================================== MASK
1778
+
1779
+ template <class D, typename T = TFromD<D>>
1780
+ HWY_API bool AllFalse(D /* tag */, const Mask1<T> mask) {
1781
+ return mask.bits == 0;
1782
+ }
1783
+
1784
+ template <class D, typename T = TFromD<D>>
1785
+ HWY_API bool AllTrue(D /* tag */, const Mask1<T> mask) {
1786
+ return mask.bits != 0;
1787
+ }
1788
+
1789
+ // `bits` points to at least 8 readable bytes, not all of which need be valid.
1790
+ template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>>
1791
+ HWY_API Mask1<T> LoadMaskBits(D /* tag */, const uint8_t* HWY_RESTRICT bits) {
1792
+ return Mask1<T>::FromBool((bits[0] & 1) != 0);
1793
+ }
1794
+
1795
+ // `bits` points to at least 8 writable bytes.
1796
+ template <class D, typename T = TFromD<D>>
1797
+ HWY_API size_t StoreMaskBits(D d, const Mask1<T> mask, uint8_t* bits) {
1798
+ *bits = AllTrue(d, mask);
1799
+ return 1;
1800
+ }
1801
+
1802
+ template <class D, typename T = TFromD<D>>
1803
+ HWY_API size_t CountTrue(D /* tag */, const Mask1<T> mask) {
1804
+ return mask.bits == 0 ? 0 : 1;
1805
+ }
1806
+
1807
+ template <class D, typename T = TFromD<D>>
1808
+ HWY_API intptr_t FindFirstTrue(D /* tag */, const Mask1<T> mask) {
1809
+ return mask.bits == 0 ? -1 : 0;
1810
+ }
1811
+
1812
+ template <class D, typename T = TFromD<D>>
1813
+ HWY_API size_t FindKnownFirstTrue(D /* tag */, const Mask1<T> /* m */) {
1814
+ return 0; // There is only one lane and we know it is true.
1815
+ }
1816
+
1817
+ template <class D, typename T = TFromD<D>>
1818
+ HWY_API intptr_t FindLastTrue(D /* tag */, const Mask1<T> mask) {
1819
+ return mask.bits == 0 ? -1 : 0;
1820
+ }
1821
+
1822
+ template <class D, typename T = TFromD<D>>
1823
+ HWY_API size_t FindKnownLastTrue(D /* tag */, const Mask1<T> /* m */) {
1824
+ return 0; // There is only one lane and we know it is true.
1825
+ }
1826
+
1827
+ // ------------------------------ Compress, CompressBits
1828
+
1829
+ template <typename T>
1830
+ struct CompressIsPartition {
1831
+ enum { value = 1 };
1832
+ };
1833
+
1834
+ template <typename T>
1835
+ HWY_API Vec1<T> Compress(Vec1<T> v, const Mask1<T> /* mask */) {
1836
+ // A single lane is already partitioned by definition.
1837
+ return v;
1838
+ }
1839
+
1840
+ template <typename T>
1841
+ HWY_API Vec1<T> CompressNot(Vec1<T> v, const Mask1<T> /* mask */) {
1842
+ // A single lane is already partitioned by definition.
1843
+ return v;
1844
+ }
1845
+
1846
+ // ------------------------------ CompressStore
1847
+ template <class D, typename T = TFromD<D>>
1848
+ HWY_API size_t CompressStore(Vec1<T> v, const Mask1<T> mask, D d,
1849
+ T* HWY_RESTRICT unaligned) {
1850
+ StoreU(Compress(v, mask), d, unaligned);
1851
+ return CountTrue(d, mask);
1852
+ }
1853
+
1854
+ // ------------------------------ CompressBlendedStore
1855
+ template <class D, typename T = TFromD<D>>
1856
+ HWY_API size_t CompressBlendedStore(Vec1<T> v, const Mask1<T> mask, D d,
1857
+ T* HWY_RESTRICT unaligned) {
1858
+ if (!mask.bits) return 0;
1859
+ StoreU(v, d, unaligned);
1860
+ return 1;
1861
+ }
1862
+
1863
+ // ------------------------------ CompressBits
1864
+ template <typename T>
1865
+ HWY_API Vec1<T> CompressBits(Vec1<T> v, const uint8_t* HWY_RESTRICT /*bits*/) {
1866
+ return v;
1867
+ }
1868
+
1869
+ // ------------------------------ CompressBitsStore
1870
+ template <class D, typename T = TFromD<D>>
1871
+ HWY_API size_t CompressBitsStore(Vec1<T> v, const uint8_t* HWY_RESTRICT bits,
1872
+ D d, T* HWY_RESTRICT unaligned) {
1873
+ const Mask1<T> mask = LoadMaskBits(d, bits);
1874
+ StoreU(Compress(v, mask), d, unaligned);
1875
+ return CountTrue(d, mask);
1876
+ }
1877
+
1878
+ // ------------------------------ Expand
1879
+
1880
+ // generic_ops-inl.h requires Vec64/128, so implement [Load]Expand here.
1881
+ #ifdef HWY_NATIVE_EXPAND
1882
+ #undef HWY_NATIVE_EXPAND
1883
+ #else
1884
+ #define HWY_NATIVE_EXPAND
1885
+ #endif
1886
+
1887
+ template <typename T>
1888
+ HWY_API Vec1<T> Expand(Vec1<T> v, const Mask1<T> mask) {
1889
+ return IfThenElseZero(mask, v);
1890
+ }
1891
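+ // With one lane, Expand degenerates to zeroing the lane when the mask is
+ // false (IfThenElseZero).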
+
1892
+ // ------------------------------ LoadExpand
1893
+ template <class D>
1894
+ HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
1895
+ const TFromD<D>* HWY_RESTRICT unaligned) {
1896
+ return MaskedLoad(mask, d, unaligned);
1897
+ }
1898
+
1899
+ // ------------------------------ WidenMulPairwiseAdd
1900
+
1901
+ template <class D32, HWY_IF_F32_D(D32)>
1902
+ HWY_API Vec1<float> WidenMulPairwiseAdd(D32 /* tag */, Vec1<bfloat16_t> a,
1903
+ Vec1<bfloat16_t> b) {
1904
+ return Vec1<float>(F32FromBF16(a.raw)) * Vec1<float>(F32FromBF16(b.raw));
1905
+ }
1906
+
1907
+ template <class D32, HWY_IF_I32_D(D32)>
1908
+ HWY_API Vec1<int32_t> WidenMulPairwiseAdd(D32 /* tag */, Vec1<int16_t> a,
1909
+ Vec1<int16_t> b) {
1910
+ return Vec1<int32_t>(a.raw * b.raw);
1911
+ }
1912
+
1913
+ // ------------------------------ SatWidenMulPairwiseAdd
1914
+
1915
+ #ifdef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
1916
+ #undef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
1917
+ #else
1918
+ #define HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
1919
+ #endif
1920
+
1921
+ template <class DI16, HWY_IF_I16_D(DI16)>
1922
+ HWY_API Vec1<int16_t> SatWidenMulPairwiseAdd(DI16 /* tag */, Vec1<uint8_t> a,
1923
+ Vec1<int8_t> b) {
1924
+ // Saturation of a.raw * b.raw is not needed on the HWY_SCALAR target:
1925
+ // the input vectors have only a single lane here, and a.raw * b.raw is
1926
+ // always between -32640 and 32385, which is already within the range of
1927
+ // an int16_t.
1928
+
1929
+ // On other targets, a saturated addition is needed for the pairwise sum
1930
+ // a[0]*b[0] + a[1]*b[1], since that sum can overflow an int16_t when
1931
+ // a[0], a[1], b[0], and b[1] are all non-zero and b[0] and b[1] have
1932
+ // the same sign.
1933
+
1934
+ return Vec1<int16_t>(static_cast<int16_t>(a.raw) *
1935
+ static_cast<int16_t>(b.raw));
1936
+ }
1937
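+ // The -32640..32385 bounds quoted above follow from uint8_t * int8_t:
+ // 255 * -128 = -32640 and 255 * 127 = 32385, both within the int16_t range
+ // [-32768, 32767].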
+
1938
+ // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
1939
+
1940
+ template <class D32, HWY_IF_F32_D(D32)>
1941
+ HWY_API Vec1<float> ReorderWidenMulAccumulate(D32 /* tag */, Vec1<bfloat16_t> a,
1942
+ Vec1<bfloat16_t> b,
1943
+ const Vec1<float> sum0,
1944
+ Vec1<float>& /* sum1 */) {
1945
+ return MulAdd(Vec1<float>(F32FromBF16(a.raw)),
1946
+ Vec1<float>(F32FromBF16(b.raw)), sum0);
1947
+ }
1948
+
1949
+ template <class D32, HWY_IF_I32_D(D32)>
1950
+ HWY_API Vec1<int32_t> ReorderWidenMulAccumulate(D32 /* tag */, Vec1<int16_t> a,
1951
+ Vec1<int16_t> b,
1952
+ const Vec1<int32_t> sum0,
1953
+ Vec1<int32_t>& /* sum1 */) {
1954
+ return Vec1<int32_t>(a.raw * b.raw + sum0.raw);
1955
+ }
1956
+
1957
+ template <class DU32, HWY_IF_U32_D(DU32)>
1958
+ HWY_API Vec1<uint32_t> ReorderWidenMulAccumulate(DU32 /* tag */,
1959
+ Vec1<uint16_t> a,
1960
+ Vec1<uint16_t> b,
1961
+ const Vec1<uint32_t> sum0,
1962
+ Vec1<uint32_t>& /* sum1 */) {
1963
+ return Vec1<uint32_t>(static_cast<uint32_t>(a.raw) * b.raw + sum0.raw);
1964
+ }
1965
+
1966
+ // ------------------------------ RearrangeToOddPlusEven
1967
+ template <typename TW>
1968
+ HWY_API Vec1<TW> RearrangeToOddPlusEven(Vec1<TW> sum0, Vec1<TW> /* sum1 */) {
1969
+ return sum0; // invariant already holds
1970
+ }
1971
+
1972
+ // ================================================== REDUCTIONS
1973
+
1974
+ // Sum of all lanes, i.e. the only one.
1975
+ template <class D, typename T = TFromD<D>>
1976
+ HWY_API Vec1<T> SumOfLanes(D /* tag */, const Vec1<T> v) {
1977
+ return v;
1978
+ }
1979
+ template <class D, typename T = TFromD<D>>
1980
+ HWY_API T ReduceSum(D /* tag */, const Vec1<T> v) {
1981
+ return GetLane(v);
1982
+ }
1983
+ template <class D, typename T = TFromD<D>>
1984
+ HWY_API Vec1<T> MinOfLanes(D /* tag */, const Vec1<T> v) {
1985
+ return v;
1986
+ }
1987
+ template <class D, typename T = TFromD<D>>
1988
+ HWY_API Vec1<T> MaxOfLanes(D /* tag */, const Vec1<T> v) {
1989
+ return v;
1990
+ }
1991
+
1992
+ // NOLINTNEXTLINE(google-readability-namespace-comments)
1993
+ } // namespace HWY_NAMESPACE
1994
+ } // namespace hwy
1995
+ HWY_AFTER_NAMESPACE();