@img/sharp-libvips-dev 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/include/expat.h +21 -10
- package/include/expat_config.h +11 -5
- package/include/ffi.h +12 -25
- package/include/freetype2/freetype/config/ftoption.h +1 -1
- package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -2
- package/include/glib-2.0/gio/gapplication.h +6 -0
- package/include/glib-2.0/gio/giotypes.h +0 -1
- package/include/glib-2.0/girepository/giarginfo.h +23 -6
- package/include/glib-2.0/girepository/gibaseinfo.h +44 -18
- package/include/glib-2.0/girepository/gicallableinfo.h +26 -16
- package/include/glib-2.0/girepository/gicallbackinfo.h +17 -2
- package/include/glib-2.0/girepository/giconstantinfo.h +19 -4
- package/include/glib-2.0/girepository/gienuminfo.h +20 -21
- package/include/glib-2.0/girepository/gifieldinfo.h +22 -7
- package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
- package/include/glib-2.0/girepository/gifunctioninfo.h +22 -7
- package/include/glib-2.0/girepository/giinterfaceinfo.h +33 -18
- package/include/glib-2.0/girepository/giobjectinfo.h +41 -26
- package/include/glib-2.0/girepository/gipropertyinfo.h +18 -3
- package/include/glib-2.0/girepository/giregisteredtypeinfo.h +22 -11
- package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
- package/include/glib-2.0/girepository/girepository.h +53 -62
- package/include/glib-2.0/girepository/girffi.h +8 -7
- package/include/glib-2.0/girepository/gisignalinfo.h +18 -3
- package/include/glib-2.0/girepository/gistructinfo.h +26 -11
- package/include/glib-2.0/girepository/gitypeinfo.h +29 -16
- package/include/glib-2.0/girepository/gitypelib.h +9 -13
- package/include/glib-2.0/girepository/gitypes.h +52 -104
- package/include/glib-2.0/girepository/giunioninfo.h +28 -12
- package/include/glib-2.0/girepository/giunresolvedinfo.h +17 -2
- package/include/glib-2.0/girepository/givalueinfo.h +65 -0
- package/include/glib-2.0/girepository/givfuncinfo.h +23 -8
- package/include/glib-2.0/glib/deprecated/gthread.h +9 -5
- package/include/glib-2.0/glib/gbitlock.h +31 -0
- package/include/glib-2.0/glib/gmessages.h +8 -0
- package/include/glib-2.0/glib/gslice.h +2 -0
- package/include/glib-2.0/glib/gstrfuncs.h +24 -18
- package/include/glib-2.0/glib/gthread.h +191 -3
- package/include/glib-2.0/glib-unix.h +7 -1
- package/include/glib-2.0/gobject/genums.h +6 -6
- package/include/glib-2.0/gobject/glib-types.h +11 -0
- package/include/glib-2.0/gobject/gsignal.h +16 -6
- package/include/hwy/aligned_allocator.h +171 -6
- package/include/hwy/base.h +1765 -543
- package/include/hwy/cache_control.h +24 -6
- package/include/hwy/detect_compiler_arch.h +23 -2
- package/include/hwy/detect_targets.h +56 -13
- package/include/hwy/foreach_target.h +24 -0
- package/include/hwy/highway.h +20 -3
- package/include/hwy/ops/arm_neon-inl.h +1086 -667
- package/include/hwy/ops/arm_sve-inl.h +1091 -235
- package/include/hwy/ops/emu128-inl.h +271 -196
- package/include/hwy/ops/generic_ops-inl.h +2270 -399
- package/include/hwy/ops/ppc_vsx-inl.h +1786 -563
- package/include/hwy/ops/rvv-inl.h +1043 -311
- package/include/hwy/ops/scalar-inl.h +189 -159
- package/include/hwy/ops/set_macros-inl.h +66 -6
- package/include/hwy/ops/shared-inl.h +175 -56
- package/include/hwy/ops/wasm_128-inl.h +153 -136
- package/include/hwy/ops/x86_128-inl.h +1647 -646
- package/include/hwy/ops/x86_256-inl.h +1003 -370
- package/include/hwy/ops/x86_512-inl.h +948 -353
- package/include/hwy/per_target.h +4 -0
- package/include/hwy/profiler.h +648 -0
- package/include/hwy/robust_statistics.h +2 -2
- package/include/hwy/targets.h +18 -11
- package/include/hwy/timer.h +11 -0
- package/include/libpng16/png.h +32 -29
- package/include/libpng16/pngconf.h +2 -2
- package/include/libpng16/pnglibconf.h +7 -2
- package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
- package/include/libxml2/libxml/parser.h +16 -7
- package/include/libxml2/libxml/xmlIO.h +0 -1
- package/include/libxml2/libxml/xmlversion.h +4 -4
- package/include/pango-1.0/pango/pango-features.h +3 -3
- package/include/pango-1.0/pango/pango-fontmap.h +7 -0
- package/include/pixman-1/pixman-version.h +2 -2
- package/include/png.h +32 -29
- package/include/pngconf.h +2 -2
- package/include/pnglibconf.h +7 -2
- package/include/vips/connection.h +9 -3
- package/include/vips/util.h +0 -9
- package/include/vips/version.h +4 -4
- package/package.json +1 -1
- package/versions.json +11 -11
@@ -16,7 +16,10 @@
 // Single-element vectors and operations.
 // External include guard in highway.h - see comment there.
 
-#include
+#include "hwy/base.h"
+#ifndef HWY_NO_LIBCXX
+#include <math.h>  // sqrtf
+#endif
 
 #include "hwy/ops/shared-inl.h"
 
@@ -49,6 +52,9 @@ struct Vec128 {
   HWY_INLINE Vec128& operator-=(const Vec128 other) {
     return *this = (*this - other);
   }
+  HWY_INLINE Vec128& operator%=(const Vec128 other) {
+    return *this = (*this % other);
+  }
   HWY_INLINE Vec128& operator&=(const Vec128 other) {
     return *this = (*this & other);
   }
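The new operator%= member simply forwards to operator%, so lane-wise integer remainder can now be written with compound assignment. A minimal usage sketch, assuming the conventional Highway static-dispatch setup (namespace hn = hwy::HWY_NAMESPACE) and the standard FixedTag/Iota/Set/StoreU helpers, which are not part of this diff:

#include <cstdint>
#include <cstdio>

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

int main() {
  const hn::FixedTag<int32_t, 4> d;  // four int32 lanes
  auto v = hn::Iota(d, 1);           // {1, 2, 3, 4}
  v %= hn::Set(d, 3);                // lane-wise remainder: {1, 2, 0, 1}
  int32_t out[4];
  hn::StoreU(v, d, out);
  std::printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
  return 0;
}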
@@ -105,7 +111,7 @@ using VFromD = decltype(Zero(D()));
 template <class D, class VFrom>
 HWY_API VFromD<D> BitCast(D /* tag */, VFrom v) {
   VFromD<D> to;
-  CopySameSize(&v, &to);
+  CopySameSize(&v.raw, &to.raw);
   return to;
 }
 
@@ -122,7 +128,7 @@ HWY_API VFromD<D> ResizeBitCast(D d, VFrom v) {
   constexpr size_t kCopyByteLen = HWY_MIN(kFromByteLen, kToByteLen);
 
   VFromD<D> to = Zero(d);
-  CopyBytes<kCopyByteLen>(&v, &to);
+  CopyBytes<kCopyByteLen>(&v.raw, &to.raw);
   return to;
 }
 
@@ -145,7 +151,7 @@ template <class D, typename T2>
 HWY_API VFromD<D> Set(D d, const T2 t) {
   VFromD<D> v;
   for (size_t i = 0; i < MaxLanes(d); ++i) {
-    v.raw[i] =
+    v.raw[i] = ConvertScalarTo<TFromD<D>>(t);
   }
   return v;
 }
@@ -156,14 +162,79 @@ HWY_API VFromD<D> Undefined(D d) {
   return Zero(d);
 }
 
+// ------------------------------ Dup128VecFromValues
+
+template <class D, HWY_IF_T_SIZE_D(D, 1)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
+                                      TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
+                                      TFromD<D> t11, TFromD<D> t12,
+                                      TFromD<D> t13, TFromD<D> t14,
+                                      TFromD<D> t15) {
+  VFromD<D> result;
+  result.raw[0] = t0;
+  result.raw[1] = t1;
+  result.raw[2] = t2;
+  result.raw[3] = t3;
+  result.raw[4] = t4;
+  result.raw[5] = t5;
+  result.raw[6] = t6;
+  result.raw[7] = t7;
+  result.raw[8] = t8;
+  result.raw[9] = t9;
+  result.raw[10] = t10;
+  result.raw[11] = t11;
+  result.raw[12] = t12;
+  result.raw[13] = t13;
+  result.raw[14] = t14;
+  result.raw[15] = t15;
+  return result;
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 2)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6,
+                                      TFromD<D> t7) {
+  VFromD<D> result;
+  result.raw[0] = t0;
+  result.raw[1] = t1;
+  result.raw[2] = t2;
+  result.raw[3] = t3;
+  result.raw[4] = t4;
+  result.raw[5] = t5;
+  result.raw[6] = t6;
+  result.raw[7] = t7;
+  return result;
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 4)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3) {
+  VFromD<D> result;
+  result.raw[0] = t0;
+  result.raw[1] = t1;
+  result.raw[2] = t2;
+  result.raw[3] = t3;
+  return result;
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 8)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
+  VFromD<D> result;
+  result.raw[0] = t0;
+  result.raw[1] = t1;
+  return result;
+}
+
 // ------------------------------ Iota
 
 template <class D, typename T = TFromD<D>, typename T2>
 HWY_API VFromD<D> Iota(D d, T2 first) {
   VFromD<D> v;
   for (size_t i = 0; i < MaxLanes(d); ++i) {
-    v.raw[i] =
-        AddWithWraparound(hwy::IsFloatTag<T>(), static_cast<T>(first), i);
+    v.raw[i] = AddWithWraparound(static_cast<T>(first), i);
  }
  return v;
 }
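Dup128VecFromValues builds a vector from explicit per-lane constants for one 128-bit block. A minimal usage sketch, assuming the usual hn namespace alias and the standard FixedTag/StoreU helpers (not part of this diff); the reversed index pattern is only an illustrative constant:

#include <cstdint>
#include <cstdio>

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

int main() {
  const hn::FixedTag<uint8_t, 16> d;
  // One 128-bit block of byte constants, here a byte-reversal index pattern.
  const auto idx = hn::Dup128VecFromValues(d, 15, 14, 13, 12, 11, 10, 9, 8,
                                           7, 6, 5, 4, 3, 2, 1, 0);
  uint8_t out[16];
  hn::StoreU(idx, d, out);
  for (int i = 0; i < 16; ++i) std::printf("%d ", out[i]);
  std::printf("\n");
  return 0;
}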
@@ -286,7 +357,7 @@ template <typename T, size_t N>
 HWY_API Vec128<T, N> BroadcastSignBit(Vec128<T, N> v) {
   // This is used inside ShiftRight, so we cannot implement in terms of it.
   for (size_t i = 0; i < N; ++i) {
-    v.raw[i] = v.raw[i] < 0 ?
+    v.raw[i] = static_cast<T>(v.raw[i] < 0 ? -1 : 0);
   }
   return v;
 }
@@ -297,7 +368,7 @@ HWY_API Vec128<T, N> BroadcastSignBit(Vec128<T, N> v) {
 template <typename T, size_t N>
 HWY_API Mask128<T, N> MaskFromVec(Vec128<T, N> v) {
   Mask128<T, N> mask;
-  CopySameSize(&v, &mask);
+  CopySameSize(&v.raw, &mask.bits);
   return mask;
 }
 
@@ -307,20 +378,15 @@ using MFromD = decltype(MaskFromVec(VFromD<D>()));
 template <class DTo, class MFrom>
 HWY_API MFromD<DTo> RebindMask(DTo /* tag */, MFrom mask) {
   MFromD<DTo> to;
-  CopySameSize(&mask, &to);
+  CopySameSize(&mask.bits, &to.bits);
   return to;
 }
 
-template <typename T, size_t N>
-Vec128<T, N> VecFromMask(Mask128<T, N> mask) {
-  Vec128<T, N> v;
-  CopySameSize(&mask, &v);
-  return v;
-}
-
 template <class D>
 VFromD<D> VecFromMask(D /* tag */, MFromD<D> mask) {
-
+  VFromD<D> v;
+  CopySameSize(&mask.bits, &v.raw);
+  return v;
 }
 
 template <class D>
@@ -336,19 +402,20 @@ HWY_API MFromD<D> FirstN(D d, size_t n) {
 template <typename T, size_t N>
 HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
                                 Vec128<T, N> no) {
-
+  const DFromV<decltype(yes)> d;
+  return IfVecThenElse(VecFromMask(d, mask), yes, no);
 }
 
 template <typename T, size_t N>
 HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
   const DFromV<decltype(yes)> d;
-  return IfVecThenElse(VecFromMask(mask), yes, Zero(d));
+  return IfVecThenElse(VecFromMask(d, mask), yes, Zero(d));
 }
 
 template <typename T, size_t N>
 HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
   const DFromV<decltype(no)> d;
-  return IfVecThenElse(VecFromMask(mask), Zero(d), no);
+  return IfVecThenElse(VecFromMask(d, mask), Zero(d), no);
 }
 
 template <typename T, size_t N>
@@ -374,7 +441,8 @@ HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
 
 template <typename T, size_t N>
 HWY_API Mask128<T, N> Not(Mask128<T, N> m) {
-
+  const Simd<T, N, 0> d;
+  return MaskFromVec(Not(VecFromMask(d, m)));
 }
 
 template <typename T, size_t N>
@@ -614,6 +682,15 @@ HWY_API Vec128<uint64_t, (N + 7) / 8> SumsOf8(Vec128<uint8_t, N> v) {
   return sums;
 }
 
+template <size_t N>
+HWY_API Vec128<int64_t, (N + 7) / 8> SumsOf8(Vec128<int8_t, N> v) {
+  Vec128<int64_t, (N + 7) / 8> sums;
+  for (size_t i = 0; i < N; ++i) {
+    sums.raw[i / 8] += v.raw[i];
+  }
+  return sums;
+}
+
 // ------------------------------ SaturatedAdd
 template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
           HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
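The new signed overload mirrors the existing unsigned SumsOf8: each group of eight int8_t lanes is accumulated into one int64_t lane. A minimal usage sketch, assuming the usual hn alias and the standard FixedTag/Repartition/Set/StoreU helpers (not part of this diff):

#include <cstdint>
#include <cstdio>

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

int main() {
  const hn::FixedTag<int8_t, 16> d8;
  const hn::Repartition<int64_t, decltype(d8)> d64;  // two int64 lanes
  const auto v = hn::Set(d8, int8_t{-3});            // sixteen lanes of -3
  const auto sums = hn::SumsOf8(v);                  // {-24, -24}
  int64_t out[2];
  hn::StoreU(sums, d64, out);
  std::printf("%lld %lld\n", static_cast<long long>(out[0]),
              static_cast<long long>(out[1]));
  return 0;
}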
@@ -652,34 +729,14 @@ HWY_API Vec128<T, N> AverageRound(Vec128<T, N> a, Vec128<T, N> b) {
 
 // ------------------------------ Abs
 
-// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
-namespace detail {
-
 template <typename T, size_t N>
-
+HWY_API Vec128<T, N> Abs(Vec128<T, N> a) {
   for (size_t i = 0; i < N; ++i) {
-
-    const T min = hwy::LimitsMin<T>();
-    a.raw[i] = static_cast<T>((s >= 0 || s == min) ? a.raw[i] : -s);
+    a.raw[i] = ScalarAbs(a.raw[i]);
   }
   return a;
 }
 
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> Abs(hwy::FloatTag /*tag*/, Vec128<T, N> v) {
-  for (size_t i = 0; i < N; ++i) {
-    v.raw[i] = std::abs(v.raw[i]);
-  }
-  return v;
-}
-
-}  // namespace detail
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Abs(Vec128<T, N> a) {
-  return detail::Abs(hwy::TypeTag<T>(), a);
-}
-
 // ------------------------------ Min/Max
 
 // Tag dispatch instead of SFINAE for MSVC 2017 compatibility
@@ -706,9 +763,9 @@ template <typename T, size_t N>
 HWY_INLINE Vec128<T, N> Min(hwy::FloatTag /*tag*/, Vec128<T, N> a,
                             Vec128<T, N> b) {
   for (size_t i = 0; i < N; ++i) {
-    if (
+    if (ScalarIsNaN(a.raw[i])) {
       a.raw[i] = b.raw[i];
-    } else if (
+    } else if (ScalarIsNaN(b.raw[i])) {
       // no change
     } else {
       a.raw[i] = HWY_MIN(a.raw[i], b.raw[i]);
@@ -720,9 +777,9 @@ template <typename T, size_t N>
 HWY_INLINE Vec128<T, N> Max(hwy::FloatTag /*tag*/, Vec128<T, N> a,
                             Vec128<T, N> b) {
   for (size_t i = 0; i < N; ++i) {
-    if (
+    if (ScalarIsNaN(a.raw[i])) {
       a.raw[i] = b.raw[i];
-    } else if (
+    } else if (ScalarIsNaN(b.raw[i])) {
       // no change
     } else {
       a.raw[i] = HWY_MAX(a.raw[i], b.raw[i]);
@@ -825,7 +882,7 @@ HWY_API Vec128<T, N> operator*(Vec128<T, N> a, Vec128<T, N> b) {
   return detail::Mul(hwy::TypeTag<T>(), a, b);
 }
 
-template <typename T, size_t N>
+template <typename T, size_t N, HWY_IF_FLOAT(T)>
 HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
   for (size_t i = 0; i < N; ++i) {
     a.raw[i] = (b.raw[i] == T{0}) ? 0 : a.raw[i] / b.raw[i];
@@ -900,7 +957,7 @@ HWY_API Vec128<float, N> ApproximateReciprocal(Vec128<float, N> v) {
     // Zero inputs are allowed, but callers are responsible for replacing the
    // return value with something else (typically using IfThenElse). This check
    // avoids a ubsan error. The result is arbitrary.
-    v.raw[i] = (
+    v.raw[i] = (ScalarAbs(v.raw[i]) == 0.0f) ? 0.0f : 1.0f / v.raw[i];
   }
   return v;
 }
@@ -913,25 +970,25 @@ HWY_API Vec128<T, N> AbsDiff(Vec128<T, N> a, Vec128<T, N> b) {
 
 // ------------------------------ Floating-point multiply-add variants
 
-template <typename T, size_t N>
+template <typename T, size_t N, HWY_IF_FLOAT(T)>
 HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x,
                             Vec128<T, N> add) {
   return mul * x + add;
 }
 
-template <typename T, size_t N>
+template <typename T, size_t N, HWY_IF_FLOAT(T)>
 HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x,
                                Vec128<T, N> add) {
   return add - mul * x;
 }
 
-template <typename T, size_t N>
+template <typename T, size_t N, HWY_IF_FLOAT(T)>
 HWY_API Vec128<T, N> MulSub(Vec128<T, N> mul, Vec128<T, N> x,
                             Vec128<T, N> sub) {
   return mul * x - sub;
 }
 
-template <typename T, size_t N>
+template <typename T, size_t N, HWY_IF_FLOAT(T)>
 HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x,
                                Vec128<T, N> sub) {
   return Neg(mul) * x - sub;
@@ -943,21 +1000,52 @@ template <size_t N>
 HWY_API Vec128<float, N> ApproximateReciprocalSqrt(Vec128<float, N> v) {
   for (size_t i = 0; i < N; ++i) {
     const float half = v.raw[i] * 0.5f;
-    uint32_t bits;
-    CopySameSize(&v.raw[i], &bits);
     // Initial guess based on log2(f)
-
-
+    v.raw[i] = BitCastScalar<float>(static_cast<uint32_t>(
+        0x5F3759DF - (BitCastScalar<uint32_t>(v.raw[i]) >> 1)));
     // One Newton-Raphson iteration
     v.raw[i] = v.raw[i] * (1.5f - (half * v.raw[i] * v.raw[i]));
   }
   return v;
 }
 
+namespace detail {
+
+static HWY_INLINE float ScalarSqrt(float v) {
+#if defined(HWY_NO_LIBCXX)
+#if HWY_COMPILER_GCC_ACTUAL
+  return __builtin_sqrt(v);
+#else
+  uint32_t bits = BitCastScalar<uint32_t>(v);
+  // Coarse approximation, letting the exponent LSB leak into the mantissa
+  bits = (1 << 29) + (bits >> 1) - (1 << 22);
+  return BitCastScalar<float>(bits);
+#endif  // !HWY_COMPILER_GCC_ACTUAL
+#else
+  return sqrtf(v);
+#endif  // !HWY_NO_LIBCXX
+}
+static HWY_INLINE double ScalarSqrt(double v) {
+#if defined(HWY_NO_LIBCXX)
+#if HWY_COMPILER_GCC_ACTUAL
+  return __builtin_sqrt(v);
+#else
+  uint64_t bits = BitCastScalar<uint64_t>(v);
+  // Coarse approximation, letting the exponent LSB leak into the mantissa
+  bits = (1ULL << 61) + (bits >> 1) - (1ULL << 51);
+  return BitCastScalar<double>(bits);
+#endif  // !HWY_COMPILER_GCC_ACTUAL
+#else
+  return sqrt(v);
+#endif  // HWY_NO_LIBCXX
+}
+
+}  // namespace detail
+
 template <typename T, size_t N>
 HWY_API Vec128<T, N> Sqrt(Vec128<T, N> v) {
   for (size_t i = 0; i < N; ++i) {
-    v.raw[i] =
+    v.raw[i] = detail::ScalarSqrt(v.raw[i]);
   }
   return v;
 }
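ApproximateReciprocalSqrt keeps the classic 0x5F3759DF exponent trick plus one Newton-Raphson step. The standalone sketch below restates the scalar math of that loop body in plain C++ so it can be compared against 1/sqrtf; the helper name ApproxRsqrt is hypothetical and not part of Highway:

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Initial guess from the 0x5F3759DF exponent trick, refined by one
// Newton-Raphson iteration; relative error stays well under 1%.
static float ApproxRsqrt(float x) {
  const float half = x * 0.5f;
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));  // stands in for BitCastScalar<uint32_t>
  bits = 0x5F3759DF - (bits >> 1);       // guess based on log2(x)
  float y;
  std::memcpy(&y, &bits, sizeof(y));     // stands in for BitCastScalar<float>
  return y * (1.5f - half * y * y);      // one Newton-Raphson step
}

int main() {
  for (float x : {0.25f, 2.0f, 100.0f}) {
    std::printf("x=%g approx=%.6f exact=%.6f\n", x, ApproxRsqrt(x),
                1.0f / std::sqrt(x));
  }
  return 0;
}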
@@ -967,21 +1055,23 @@ HWY_API Vec128<T, N> Sqrt(Vec128<T, N> v) {
 template <typename T, size_t N>
 HWY_API Vec128<T, N> Round(Vec128<T, N> v) {
   using TI = MakeSigned<T>;
+  const T k0 = ConvertScalarTo<T>(0);
   const Vec128<T, N> a = Abs(v);
   for (size_t i = 0; i < N; ++i) {
     if (!(a.raw[i] < MantissaEnd<T>())) {  // Huge or NaN
       continue;
     }
-    const T bias = v.raw[i] <
-    const TI rounded =
+    const T bias = ConvertScalarTo<T>(v.raw[i] < k0 ? -0.5 : 0.5);
+    const TI rounded = ConvertScalarTo<TI>(v.raw[i] + bias);
     if (rounded == 0) {
-      v.raw[i] = v.raw[i] < 0 ? T
+      v.raw[i] = v.raw[i] < 0 ? ConvertScalarTo<T>(-0) : k0;
       continue;
     }
-    const T rounded_f =
+    const T rounded_f = ConvertScalarTo<T>(rounded);
     // Round to even
-    if ((rounded & 1) &&
-
+    if ((rounded & 1) &&
+        ScalarAbs(rounded_f - v.raw[i]) == ConvertScalarTo<T>(0.5)) {
+      v.raw[i] = ConvertScalarTo<T>(rounded - (v.raw[i] < k0 ? -1 : 1));
       continue;
     }
     v.raw[i] = rounded_f;
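As the tie-breaking branch above shows, Round resolves exact .5 cases to the nearest even integer. A minimal sketch of the resulting behavior, assuming the usual hn alias and the standard FixedTag/LoadU/StoreU helpers (not part of this diff):

#include <cstdio>

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

int main() {
  const hn::FixedTag<float, 4> d;
  const float in[4] = {0.5f, 1.5f, 2.5f, -2.5f};
  float out[4];
  // Ties round to even: expect {0, 2, 2, -2}.
  hn::StoreU(hn::Round(hn::LoadU(d, in)), d, out);
  std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
  return 0;
}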
@@ -994,30 +1084,32 @@ template <size_t N>
 HWY_API Vec128<int32_t, N> NearestInt(Vec128<float, N> v) {
   using T = float;
   using TI = int32_t;
+  const T k0 = ConvertScalarTo<T>(0);
 
   const Vec128<float, N> abs = Abs(v);
   Vec128<int32_t, N> ret;
   for (size_t i = 0; i < N; ++i) {
-    const bool signbit =
+    const bool signbit = ScalarSignBit(v.raw[i]);
 
     if (!(abs.raw[i] < MantissaEnd<T>())) {  // Huge or NaN
       // Check if too large to cast or NaN
-      if (!(abs.raw[i] <=
+      if (!(abs.raw[i] <= ConvertScalarTo<T>(LimitsMax<TI>()))) {
         ret.raw[i] = signbit ? LimitsMin<TI>() : LimitsMax<TI>();
         continue;
       }
       ret.raw[i] = static_cast<TI>(v.raw[i]);
       continue;
     }
-    const T bias = v.raw[i] <
-    const TI rounded =
+    const T bias = ConvertScalarTo<T>(v.raw[i] < k0 ? -0.5 : 0.5);
+    const TI rounded = ConvertScalarTo<TI>(v.raw[i] + bias);
     if (rounded == 0) {
       ret.raw[i] = 0;
       continue;
     }
-    const T rounded_f =
+    const T rounded_f = ConvertScalarTo<T>(rounded);
     // Round to even
-    if ((rounded & 1) &&
+    if ((rounded & 1) &&
+        ScalarAbs(rounded_f - v.raw[i]) == ConvertScalarTo<T>(0.5)) {
       ret.raw[i] = rounded - (signbit ? -1 : 1);
       continue;
     }
@@ -1056,8 +1148,7 @@ Vec128<Float, N> Ceil(Vec128<Float, N> v) {
   for (size_t i = 0; i < N; ++i) {
     const bool positive = v.raw[i] > Float(0.0);
 
-    Bits bits;
-    CopySameSize(&v.raw[i], &bits);
+    Bits bits = BitCastScalar<Bits>(v.raw[i]);
 
     const int exponent =
         static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
@@ -1077,7 +1168,7 @@ Vec128<Float, N> Ceil(Vec128<Float, N> v) {
     if (positive) bits += (kMantissaMask + 1) >> exponent;
     bits &= ~mantissa_mask;
 
-
+    v.raw[i] = BitCastScalar<Float>(bits);
   }
   return v;
 }
@@ -1094,8 +1185,7 @@ Vec128<Float, N> Floor(Vec128<Float, N> v) {
   for (size_t i = 0; i < N; ++i) {
     const bool negative = v.raw[i] < Float(0.0);
 
-    Bits bits;
-    CopySameSize(&v.raw[i], &bits);
+    Bits bits = BitCastScalar<Bits>(v.raw[i]);
 
     const int exponent =
         static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
@@ -1115,7 +1205,7 @@ Vec128<Float, N> Floor(Vec128<Float, N> v) {
     if (negative) bits += (kMantissaMask + 1) >> exponent;
     bits &= ~mantissa_mask;
 
-
+    v.raw[i] = BitCastScalar<Float>(bits);
   }
   return v;
 }
@@ -1127,44 +1217,11 @@ HWY_API Mask128<T, N> IsNaN(Vec128<T, N> v) {
   Mask128<T, N> ret;
   for (size_t i = 0; i < N; ++i) {
     // std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY.
-
-    CopySameSize(&v.raw[i], &bits);
-    bits += bits;
-    bits >>= 1;  // clear sign bit
-    // NaN if all exponent bits are set and the mantissa is not zero.
-    ret.bits[i] = Mask128<T, N>::FromBool(bits > ExponentMask<T>());
+    ret.bits[i] = Mask128<T, N>::FromBool(ScalarIsNaN(v.raw[i]));
   }
   return ret;
 }
 
-template <typename T, size_t N>
-HWY_API Mask128<T, N> IsInf(Vec128<T, N> v) {
-  static_assert(IsFloat<T>(), "Only for float");
-  const DFromV<decltype(v)> d;
-  const RebindToSigned<decltype(d)> di;
-  const VFromD<decltype(di)> vi = BitCast(di, v);
-  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
-  return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
-}
-
-// Returns whether normal/subnormal/zero.
-template <typename T, size_t N>
-HWY_API Mask128<T, N> IsFinite(Vec128<T, N> v) {
-  static_assert(IsFloat<T>(), "Only for float");
-  const DFromV<decltype(v)> d;
-  const RebindToUnsigned<decltype(d)> du;
-  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
-  using VI = VFromD<decltype(di)>;
-  using VU = VFromD<decltype(du)>;
-  const VU vu = BitCast(du, v);
-  // 'Shift left' to clear the sign bit, then right so we can compare with the
-  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
-  // negative and non-negative floats would be greater).
-  const VI exp =
-      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
-  return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
-}
-
 // ================================================== COMPARE
 
 template <typename T, size_t N>
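IsNaN now delegates the per-lane test to ScalarIsNaN, and the IsInf/IsFinite definitions are dropped from this header in 1.0.2. A minimal sketch of the common use, masking NaN lanes to zero, assuming the usual hn alias and the standard FixedTag/LoadU/StoreU helpers (not part of this diff):

#include <cmath>
#include <cstdio>

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

int main() {
  const hn::FixedTag<float, 4> d;
  const float in[4] = {1.0f, std::nanf(""), 3.0f, -0.0f};  // lane 1 is NaN
  float out[4];
  // Zero out NaN lanes; the other lanes pass through unchanged.
  const auto v = hn::LoadU(d, in);
  hn::StoreU(hn::IfThenZeroElse(hn::IsNaN(v), v), d, out);
  std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
  return 0;
}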
@@ -1510,67 +1567,59 @@ HWY_API void Stream(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
 namespace detail {
 
 template <class ToT, class FromT>
-HWY_INLINE ToT CastValueForF2IConv(
-                                   FromT val) {
-  // Prevent ubsan errors when converting float to narrower integer
-
-  // If LimitsMax<ToT>() can be exactly represented in FromT,
-  // kSmallestOutOfToTRangePosVal is equal to LimitsMax<ToT>().
-
-  // Otherwise, if LimitsMax<ToT>() cannot be exactly represented in FromT,
-  // kSmallestOutOfToTRangePosVal is equal to LimitsMax<ToT>() + 1, which can
-  // be exactly represented in FromT.
-  constexpr FromT kSmallestOutOfToTRangePosVal =
-      (sizeof(ToT) * 8 <= static_cast<size_t>(MantissaBits<FromT>()) + 1)
-          ? static_cast<FromT>(LimitsMax<ToT>())
-          : static_cast<FromT>(
-                static_cast<FromT>(ToT{1} << (sizeof(ToT) * 8 - 1)) * FromT(2));
-
-  if (std::signbit(val)) {
-    return ToT{0};
-  } else if (std::isinf(val) || val >= kSmallestOutOfToTRangePosVal) {
-    return LimitsMax<ToT>();
-  } else {
-    return static_cast<ToT>(val);
-  }
-}
-
-template <class ToT, class FromT>
-HWY_INLINE ToT CastValueForF2IConv(hwy::SignedTag /* to_type_tag */,
-                                   FromT val) {
+HWY_INLINE ToT CastValueForF2IConv(FromT val) {
   // Prevent ubsan errors when converting float to narrower integer
 
-
-
-
-
-
-
-  constexpr
-
-
-
-
-
-
-
-
-
+  using FromTU = MakeUnsigned<FromT>;
+  using ToTU = MakeUnsigned<ToT>;
+
+  constexpr unsigned kMaxExpField =
+      static_cast<unsigned>(MaxExponentField<FromT>());
+  constexpr unsigned kExpBias = kMaxExpField >> 1;
+  constexpr unsigned kMinOutOfRangeExpField = static_cast<unsigned>(HWY_MIN(
+      kExpBias + sizeof(ToT) * 8 - static_cast<unsigned>(IsSigned<ToT>()),
+      kMaxExpField));
+
+  // If ToT is signed, compare only the exponent bits of val against
+  // kMinOutOfRangeExpField.
+  //
+  // Otherwise, if ToT is unsigned, compare the sign bit plus exponent bits of
+  // val against kMinOutOfRangeExpField as a negative value is outside of the
+  // range of an unsigned integer type.
+  const FromT val_to_compare =
+      static_cast<FromT>(IsSigned<ToT>() ? ScalarAbs(val) : val);
+
+  // val is within the range of ToT if
+  // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is less
+  // than kMinOutOfRangeExpField
+  //
+  // Otherwise, val is either outside of the range of ToT or equal to
+  // LimitsMin<ToT>() if
+  // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is greater
+  // than or equal to kMinOutOfRangeExpField.
+
+  return (static_cast<unsigned>(BitCastScalar<FromTU>(val_to_compare) >>
+                                MantissaBits<FromT>()) < kMinOutOfRangeExpField)
+             ? static_cast<ToT>(val)
+             : static_cast<ToT>(static_cast<ToTU>(LimitsMax<ToT>()) +
+                                static_cast<ToTU>(ScalarSignBit(val)));
 }
 
 template <class ToT, class ToTypeTag, class FromT>
 HWY_INLINE ToT CastValueForPromoteTo(ToTypeTag /* to_type_tag */, FromT val) {
-  return
+  return ConvertScalarTo<ToT>(val);
 }
 
 template <class ToT>
-HWY_INLINE ToT CastValueForPromoteTo(hwy::SignedTag to_type_tag
-
+HWY_INLINE ToT CastValueForPromoteTo(hwy::SignedTag /*to_type_tag*/,
+                                     float val) {
+  return CastValueForF2IConv<ToT>(val);
 }
 
 template <class ToT>
-HWY_INLINE ToT CastValueForPromoteTo(hwy::UnsignedTag to_type_tag
-
+HWY_INLINE ToT CastValueForPromoteTo(hwy::UnsignedTag /*to_type_tag*/,
+                                     float val) {
+  return CastValueForF2IConv<ToT>(val);
 }
 
 }  // namespace detail
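The rewritten CastValueForF2IConv avoids floating-point comparisons: it compares the (sign plus) exponent field of the input against the threshold kExpBias + 8*sizeof(ToT) - IsSigned<ToT>. For float to int32_t that threshold is 127 + 32 - 1 = 158, i.e. any magnitude of at least 2^31 is out of range and saturates. A standalone sketch of that check in plain C++ (the helper name FitsInt32 is hypothetical, not the Highway implementation itself):

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

// True if |f| fits into int32_t, mirroring the exponent-field comparison used
// by CastValueForF2IConv for float -> int32_t (NaN and -2^31 both report false).
static bool FitsInt32(float f) {
  const float magnitude = std::fabs(f);
  uint32_t bits;
  std::memcpy(&bits, &magnitude, sizeof(bits));
  const unsigned exp_field = bits >> 23;         // 8-bit biased exponent
  const unsigned kMinOutOfRange = 127 + 32 - 1;  // 158, i.e. |f| >= 2^31
  return exp_field < kMinOutOfRange;
}

int main() {
  std::printf("%d\n", FitsInt32(2147483520.0f));  // 1: largest float below 2^31
  std::printf("%d\n", FitsInt32(2147483648.0f));  // 0: 2^31 would saturate
  std::printf("%d\n", FitsInt32(-3.5e9f));        // 0: below the int32 range
  return 0;
}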
@@ -1594,10 +1643,10 @@ HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<double, D>> from) {
   VFromD<D> ret;
   for (size_t i = 0; i < MaxLanes(d); ++i) {
     // Prevent ubsan errors when converting float to narrower integer/float
-    if (
-
-      ret.raw[i] =
-
+    if (ScalarIsInf(from.raw[i]) ||
+        ScalarAbs(from.raw[i]) > static_cast<double>(HighestValue<float>())) {
+      ret.raw[i] = ScalarSignBit(from.raw[i]) ? LowestValue<float>()
+                                              : HighestValue<float>();
       continue;
     }
     ret.raw[i] = static_cast<float>(from.raw[i]);
@@ -1609,8 +1658,7 @@ HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<double, D>> from) {
   VFromD<D> ret;
   for (size_t i = 0; i < MaxLanes(d); ++i) {
     // Prevent ubsan errors when converting double to narrower integer/int32_t
-    ret.raw[i] = detail::CastValueForF2IConv<TFromD<D>>(
-        hwy::TypeTag<TFromD<D>>(), from.raw[i]);
+    ret.raw[i] = detail::CastValueForF2IConv<TFromD<D>>(from.raw[i]);
   }
   return ret;
 }
@@ -1715,23 +1763,20 @@ HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
   return ReorderDemote2To(dn, a, b);
 }
 
-template <class DN,
+template <class DN, HWY_IF_SPECIAL_FLOAT_D(DN), class V,
+          HWY_IF_F32_D(DFromV<V>),
           HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
 HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
-  const
-
-  VFromD<
-
-  const auto a_bits = BitCast(du32, a);
-  const auto b_bits = BitCast(du32, b);
-
+  const size_t NW = Lanes(dn) / 2;
+  using TN = TFromD<DN>;
+  VFromD<DN> ret;
   for (size_t i = 0; i < NW; ++i) {
-    ret.raw[i] =
+    ret.raw[i] = ConvertScalarTo<TN>(a.raw[i]);
   }
   for (size_t i = 0; i < NW; ++i) {
-    ret.raw[NW + i] =
+    ret.raw[NW + i] = ConvertScalarTo<TN>(b.raw[i]);
   }
-  return
+  return ret;
 }
 
 namespace detail {
@@ -1780,7 +1825,7 @@ HWY_API VFromD<DTo> ConvertTo(hwy::FloatTag /*tag*/, DTo /*tag*/,
 
   for (size_t i = 0; i < N; ++i) {
     // float## -> int##: return closest representable value
-    ret.raw[i] = CastValueForF2IConv<ToT>(
+    ret.raw[i] = CastValueForF2IConv<ToT>(from.raw[i]);
   }
   return ret;
 }
@@ -1980,8 +2025,16 @@ HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
   return ret;
 }
 
+// 2023-11-23: workaround for incorrect codegen (reduction_test fails for
+// SumsOf2 because PromoteOddTo, which uses ConcatOdd, returns zero).
+#if HWY_ARCH_RVV && HWY_TARGET == HWY_EMU128 && HWY_COMPILER_CLANG
+#define HWY_EMU128_CONCAT_INLINE HWY_NOINLINE
+#else
+#define HWY_EMU128_CONCAT_INLINE HWY_API
+#endif
+
 template <class D>
-
+HWY_EMU128_CONCAT_INLINE VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
   const Half<decltype(d)> dh;
   VFromD<D> ret;
   for (size_t i = 0; i < MaxLanes(dh); ++i) {
@@ -2349,8 +2402,8 @@ HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
 }
 
 // Additional overload for the optional tag.
-template <class
-HWY_API
+template <class D>
+HWY_API VFromD<D> InterleaveLower(D /* tag */, VFromD<D> a, VFromD<D> b) {
   return InterleaveLower(a, b);
 }
 
@@ -2416,6 +2469,15 @@ HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
   return m;
 }
 
+template <class D>
+HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
+  MFromD<D> m;
+  for (size_t i = 0; i < MaxLanes(d); ++i) {
+    m.bits[i] = MFromD<D>::FromBool(((mask_bits >> i) & 1u) != 0);
+  }
+  return m;
+}
+
 // `p` points to at least 8 writable bytes.
 template <class D>
 HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
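Dup128MaskFromMaskBits turns the low bits of an integer into a per-lane mask: bit i controls lane i of a 128-bit block. A minimal usage sketch, assuming the usual hn alias and the standard FixedTag/Iota/StoreU/CountTrue helpers (not part of this diff):

#include <cstdint>
#include <cstdio>

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

int main() {
  const hn::FixedTag<int32_t, 4> d;
  // Bits 0b0101 select lanes 0 and 2.
  const auto m = hn::Dup128MaskFromMaskBits(d, 0x5u);
  const auto v = hn::Iota(d, 1);                 // {1, 2, 3, 4}
  int32_t out[4];
  hn::StoreU(hn::IfThenElseZero(m, v), d, out);  // {1, 0, 3, 0}
  std::printf("%d %d %d %d (CountTrue=%d)\n", out[0], out[1], out[2], out[3],
              static_cast<int>(hn::CountTrue(d, m)));
  return 0;
}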
@@ -2517,7 +2579,7 @@ HWY_API Vec128<T, N> Expand(Vec128<T, N> v, const Mask128<T, N> mask) {
     if (mask.bits[i]) {
       ret.raw[i] = v.raw[in_pos++];
     } else {
-      ret.raw[i] = T();
+      ret.raw[i] = ConvertScalarTo<T>(0);
     }
   }
   return ret;
@@ -2754,15 +2816,13 @@ HWY_API VW RearrangeToOddPlusEven(VW sum0, VW sum1) {
 
 // ================================================== REDUCTIONS
 
-
-
-
-
-
-
-
-}
-template <class D, typename T = TFromD<D>>
+#ifdef HWY_NATIVE_REDUCE_SCALAR
+#undef HWY_NATIVE_REDUCE_SCALAR
+#else
+#define HWY_NATIVE_REDUCE_SCALAR
+#endif
+
+template <class D, typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
 HWY_API T ReduceSum(D d, VFromD<D> v) {
   T sum = T{0};
   for (size_t i = 0; i < MaxLanes(d); ++i) {
@@ -2770,21 +2830,36 @@ HWY_API T ReduceSum(D d, VFromD<D> v) {
   }
   return sum;
 }
-template <class D, typename T = TFromD<D
-HWY_API
+template <class D, typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
+HWY_API T ReduceMin(D d, VFromD<D> v) {
   T min = HighestValue<T>();
   for (size_t i = 0; i < MaxLanes(d); ++i) {
     min = HWY_MIN(min, v.raw[i]);
   }
-  return
+  return min;
 }
-template <class D, typename T = TFromD<D
-HWY_API
+template <class D, typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
+HWY_API T ReduceMax(D d, VFromD<D> v) {
   T max = LowestValue<T>();
   for (size_t i = 0; i < MaxLanes(d); ++i) {
     max = HWY_MAX(max, v.raw[i]);
   }
-  return
+  return max;
+}
+
+// ------------------------------ SumOfLanes
+
+template <class D, HWY_IF_LANES_GT_D(D, 1)>
+HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
+  return Set(d, ReduceSum(d, v));
+}
+template <class D, HWY_IF_LANES_GT_D(D, 1)>
+HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
+  return Set(d, ReduceMin(d, v));
+}
+template <class D, HWY_IF_LANES_GT_D(D, 1)>
+HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
+  return Set(d, ReduceMax(d, v));
 }
 
 // ================================================== OPS WITH DEPENDENCIES
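ReduceSum/ReduceMin/ReduceMax return a scalar, while the SumOfLanes/MinOfLanes/MaxOfLanes wrappers added here broadcast that scalar back to every lane. A minimal usage sketch, assuming the usual hn alias and the standard ScalableTag/Lanes/Iota/GetLane helpers (not part of this diff):

#include <cstdint>
#include <cstdio>

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

int main() {
  const hn::ScalableTag<int32_t> d;
  const size_t n = hn::Lanes(d);
  const auto v = hn::Iota(d, 1);                // {1, 2, ..., n}
  const int32_t sum = hn::ReduceSum(d, v);      // n * (n + 1) / 2
  const auto broadcast = hn::SumOfLanes(d, v);  // the same sum in every lane
  std::printf("lanes=%zu sum=%d first=%d\n", n, sum, hn::GetLane(broadcast));
  return 0;
}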