@img/sharp-libvips-dev 1.0.1 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -2
- package/include/aom/aom_decoder.h +1 -1
- package/include/aom/aom_encoder.h +7 -1
- package/include/aom/aom_image.h +24 -12
- package/include/aom/aom_integer.h +3 -3
- package/include/aom/aomcx.h +15 -0
- package/include/aom/aomdx.h +5 -2
- package/include/archive.h +7 -5
- package/include/archive_entry.h +5 -3
- package/include/cgif.h +3 -0
- package/include/expat.h +21 -10
- package/include/expat_config.h +11 -5
- package/include/ffi.h +12 -25
- package/include/freetype2/freetype/config/ftoption.h +2 -2
- package/include/fribidi/fribidi-config.h +2 -2
- package/include/fribidi/fribidi-unicode-version.h +3 -3
- package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -2
- package/include/glib-2.0/gio/gappinfo.h +40 -25
- package/include/glib-2.0/gio/gapplication.h +6 -0
- package/include/glib-2.0/gio/gasyncresult.h +1 -1
- package/include/glib-2.0/gio/gconverter.h +5 -0
- package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
- package/include/glib-2.0/gio/gfile.h +16 -0
- package/include/glib-2.0/gio/gio-visibility.h +34 -0
- package/include/glib-2.0/gio/giotypes.h +0 -1
- package/include/glib-2.0/gio/gsettings.h +8 -0
- package/include/glib-2.0/gio/gvfs.h +2 -2
- package/include/glib-2.0/girepository/gi-visibility.h +34 -0
- package/include/glib-2.0/girepository/giarginfo.h +23 -6
- package/include/glib-2.0/girepository/gibaseinfo.h +44 -18
- package/include/glib-2.0/girepository/gicallableinfo.h +26 -16
- package/include/glib-2.0/girepository/gicallbackinfo.h +17 -2
- package/include/glib-2.0/girepository/giconstantinfo.h +19 -4
- package/include/glib-2.0/girepository/gienuminfo.h +20 -21
- package/include/glib-2.0/girepository/gifieldinfo.h +22 -7
- package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
- package/include/glib-2.0/girepository/gifunctioninfo.h +22 -7
- package/include/glib-2.0/girepository/giinterfaceinfo.h +33 -18
- package/include/glib-2.0/girepository/giobjectinfo.h +41 -26
- package/include/glib-2.0/girepository/gipropertyinfo.h +18 -3
- package/include/glib-2.0/girepository/giregisteredtypeinfo.h +22 -11
- package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
- package/include/glib-2.0/girepository/girepository.h +53 -62
- package/include/glib-2.0/girepository/girffi.h +8 -7
- package/include/glib-2.0/girepository/gisignalinfo.h +18 -3
- package/include/glib-2.0/girepository/gistructinfo.h +26 -11
- package/include/glib-2.0/girepository/gitypeinfo.h +29 -16
- package/include/glib-2.0/girepository/gitypelib.h +9 -13
- package/include/glib-2.0/girepository/gitypes.h +52 -104
- package/include/glib-2.0/girepository/giunioninfo.h +28 -12
- package/include/glib-2.0/girepository/giunresolvedinfo.h +17 -2
- package/include/glib-2.0/girepository/givalueinfo.h +65 -0
- package/include/glib-2.0/girepository/givfuncinfo.h +23 -8
- package/include/glib-2.0/glib/deprecated/gthread.h +9 -5
- package/include/glib-2.0/glib/gbitlock.h +31 -0
- package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
- package/include/glib-2.0/glib/giochannel.h +2 -2
- package/include/glib-2.0/glib/glib-visibility.h +34 -0
- package/include/glib-2.0/glib/gmacros.h +12 -5
- package/include/glib-2.0/glib/gmain.h +93 -7
- package/include/glib-2.0/glib/gmessages.h +8 -0
- package/include/glib-2.0/glib/gqsort.h +8 -1
- package/include/glib-2.0/glib/gslice.h +2 -0
- package/include/glib-2.0/glib/gstrfuncs.h +24 -30
- package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
- package/include/glib-2.0/glib/gthread.h +191 -3
- package/include/glib-2.0/glib/gunicode.h +1 -1
- package/include/glib-2.0/glib/gversionmacros.h +9 -0
- package/include/glib-2.0/glib-unix.h +7 -1
- package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
- package/include/glib-2.0/gobject/genums.h +6 -6
- package/include/glib-2.0/gobject/glib-types.h +11 -0
- package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
- package/include/glib-2.0/gobject/gsignal.h +16 -6
- package/include/glib-2.0/gobject/gtype.h +6 -6
- package/include/harfbuzz/hb-buffer.h +6 -0
- package/include/harfbuzz/hb-common.h +6 -9
- package/include/harfbuzz/hb-cplusplus.hh +8 -11
- package/include/harfbuzz/hb-subset.h +17 -4
- package/include/harfbuzz/hb-version.h +3 -3
- package/include/hwy/abort.h +28 -0
- package/include/hwy/aligned_allocator.h +218 -6
- package/include/hwy/base.h +1935 -512
- package/include/hwy/cache_control.h +24 -6
- package/include/hwy/detect_compiler_arch.h +105 -10
- package/include/hwy/detect_targets.h +146 -37
- package/include/hwy/foreach_target.h +36 -1
- package/include/hwy/highway.h +222 -50
- package/include/hwy/ops/arm_neon-inl.h +2055 -894
- package/include/hwy/ops/arm_sve-inl.h +1476 -348
- package/include/hwy/ops/emu128-inl.h +711 -623
- package/include/hwy/ops/generic_ops-inl.h +4431 -2157
- package/include/hwy/ops/inside-inl.h +691 -0
- package/include/hwy/ops/ppc_vsx-inl.h +2186 -673
- package/include/hwy/ops/rvv-inl.h +1556 -536
- package/include/hwy/ops/scalar-inl.h +353 -233
- package/include/hwy/ops/set_macros-inl.h +171 -23
- package/include/hwy/ops/shared-inl.h +198 -56
- package/include/hwy/ops/wasm_128-inl.h +283 -244
- package/include/hwy/ops/x86_128-inl.h +3673 -1357
- package/include/hwy/ops/x86_256-inl.h +1737 -663
- package/include/hwy/ops/x86_512-inl.h +1697 -500
- package/include/hwy/per_target.h +4 -0
- package/include/hwy/profiler.h +648 -0
- package/include/hwy/robust_statistics.h +2 -2
- package/include/hwy/targets.h +40 -32
- package/include/hwy/timer-inl.h +3 -3
- package/include/hwy/timer.h +16 -1
- package/include/libheif/heif.h +170 -15
- package/include/libheif/heif_items.h +237 -0
- package/include/libheif/heif_properties.h +38 -2
- package/include/libheif/heif_regions.h +1 -1
- package/include/libheif/heif_version.h +2 -2
- package/include/libpng16/png.h +32 -29
- package/include/libpng16/pngconf.h +2 -2
- package/include/libpng16/pnglibconf.h +8 -3
- package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
- package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
- package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
- package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
- package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
- package/include/libxml2/libxml/HTMLparser.h +12 -19
- package/include/libxml2/libxml/c14n.h +1 -12
- package/include/libxml2/libxml/debugXML.h +1 -1
- package/include/libxml2/libxml/encoding.h +9 -0
- package/include/libxml2/libxml/entities.h +12 -1
- package/include/libxml2/libxml/hash.h +19 -0
- package/include/libxml2/libxml/list.h +2 -2
- package/include/libxml2/libxml/nanohttp.h +17 -0
- package/include/libxml2/libxml/parser.h +73 -58
- package/include/libxml2/libxml/parserInternals.h +9 -1
- package/include/libxml2/libxml/pattern.h +6 -0
- package/include/libxml2/libxml/tree.h +32 -12
- package/include/libxml2/libxml/uri.h +11 -0
- package/include/libxml2/libxml/valid.h +29 -2
- package/include/libxml2/libxml/xinclude.h +7 -0
- package/include/libxml2/libxml/xmlIO.h +21 -5
- package/include/libxml2/libxml/xmlerror.h +14 -0
- package/include/libxml2/libxml/xmlexports.h +111 -15
- package/include/libxml2/libxml/xmlmemory.h +8 -45
- package/include/libxml2/libxml/xmlreader.h +2 -0
- package/include/libxml2/libxml/xmlsave.h +5 -0
- package/include/libxml2/libxml/xmlunicode.h +165 -1
- package/include/libxml2/libxml/xmlversion.h +15 -179
- package/include/libxml2/libxml/xmlwriter.h +1 -0
- package/include/libxml2/libxml/xpath.h +4 -0
- package/include/pango-1.0/pango/pango-features.h +2 -2
- package/include/pango-1.0/pango/pango-fontmap.h +7 -0
- package/include/pango-1.0/pango/pango-item.h +4 -2
- package/include/pango-1.0/pango/pango-version-macros.h +25 -0
- package/include/pango-1.0/pango/pangofc-font.h +2 -1
- package/include/pixman-1/pixman-version.h +2 -2
- package/include/png.h +32 -29
- package/include/pngconf.h +2 -2
- package/include/pnglibconf.h +8 -3
- package/include/vips/connection.h +9 -3
- package/include/vips/util.h +1 -11
- package/include/vips/version.h +4 -4
- package/include/webp/decode.h +58 -56
- package/include/webp/demux.h +25 -21
- package/include/webp/encode.h +44 -39
- package/include/webp/mux.h +76 -15
- package/include/webp/mux_types.h +2 -1
- package/include/webp/sharpyuv/sharpyuv.h +77 -8
- package/include/webp/types.h +29 -8
- package/include/zconf.h +1 -1
- package/include/zlib.h +12 -12
- package/package.json +1 -1
- package/versions.json +18 -19
|
@@ -16,7 +16,11 @@
|
|
|
16
16
|
// Single-element vectors and operations.
|
|
17
17
|
// External include guard in highway.h - see comment there.
|
|
18
18
|
|
|
19
|
-
#include
|
|
19
|
+
#include "hwy/base.h"
|
|
20
|
+
|
|
21
|
+
#ifndef HWY_NO_LIBCXX
|
|
22
|
+
#include <math.h> // sqrtf
|
|
23
|
+
#endif
|
|
20
24
|
|
|
21
25
|
#include "hwy/ops/shared-inl.h"
|
|
22
26
|
|
|
@@ -49,6 +53,9 @@ struct Vec128 {
|
|
|
49
53
|
HWY_INLINE Vec128& operator-=(const Vec128 other) {
|
|
50
54
|
return *this = (*this - other);
|
|
51
55
|
}
|
|
56
|
+
HWY_INLINE Vec128& operator%=(const Vec128 other) {
|
|
57
|
+
return *this = (*this % other);
|
|
58
|
+
}
|
|
52
59
|
HWY_INLINE Vec128& operator&=(const Vec128 other) {
|
|
53
60
|
return *this = (*this & other);
|
|
54
61
|
}
|
|
@@ -97,15 +104,12 @@ HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
|
|
|
97
104
|
template <class D>
|
|
98
105
|
using VFromD = decltype(Zero(D()));
|
|
99
106
|
|
|
100
|
-
// ------------------------------ Tuple (VFromD)
|
|
101
|
-
#include "hwy/ops/tuple-inl.h"
|
|
102
|
-
|
|
103
107
|
// ------------------------------ BitCast
|
|
104
108
|
|
|
105
109
|
template <class D, class VFrom>
|
|
106
110
|
HWY_API VFromD<D> BitCast(D /* tag */, VFrom v) {
|
|
107
111
|
VFromD<D> to;
|
|
108
|
-
CopySameSize(&v, &to);
|
|
112
|
+
CopySameSize(&v.raw, &to.raw);
|
|
109
113
|
return to;
|
|
110
114
|
}
|
|
111
115
|
|
|
@@ -122,7 +126,7 @@ HWY_API VFromD<D> ResizeBitCast(D d, VFrom v) {
|
|
|
122
126
|
constexpr size_t kCopyByteLen = HWY_MIN(kFromByteLen, kToByteLen);
|
|
123
127
|
|
|
124
128
|
VFromD<D> to = Zero(d);
|
|
125
|
-
CopyBytes<kCopyByteLen>(&v, &to);
|
|
129
|
+
CopyBytes<kCopyByteLen>(&v.raw, &to.raw);
|
|
126
130
|
return to;
|
|
127
131
|
}
|
|
128
132
|
|
|
@@ -145,7 +149,7 @@ template <class D, typename T2>
|
|
|
145
149
|
HWY_API VFromD<D> Set(D d, const T2 t) {
|
|
146
150
|
VFromD<D> v;
|
|
147
151
|
for (size_t i = 0; i < MaxLanes(d); ++i) {
|
|
148
|
-
v.raw[i] =
|
|
152
|
+
v.raw[i] = ConvertScalarTo<TFromD<D>>(t);
|
|
149
153
|
}
|
|
150
154
|
return v;
|
|
151
155
|
}
|
|
@@ -156,14 +160,79 @@ HWY_API VFromD<D> Undefined(D d) {
|
|
|
156
160
|
return Zero(d);
|
|
157
161
|
}
|
|
158
162
|
|
|
163
|
+
// ------------------------------ Dup128VecFromValues
|
|
164
|
+
|
|
165
|
+
template <class D, HWY_IF_T_SIZE_D(D, 1)>
|
|
166
|
+
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
|
|
167
|
+
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
|
|
168
|
+
TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
|
|
169
|
+
TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
|
|
170
|
+
TFromD<D> t11, TFromD<D> t12,
|
|
171
|
+
TFromD<D> t13, TFromD<D> t14,
|
|
172
|
+
TFromD<D> t15) {
|
|
173
|
+
VFromD<D> result;
|
|
174
|
+
result.raw[0] = t0;
|
|
175
|
+
result.raw[1] = t1;
|
|
176
|
+
result.raw[2] = t2;
|
|
177
|
+
result.raw[3] = t3;
|
|
178
|
+
result.raw[4] = t4;
|
|
179
|
+
result.raw[5] = t5;
|
|
180
|
+
result.raw[6] = t6;
|
|
181
|
+
result.raw[7] = t7;
|
|
182
|
+
result.raw[8] = t8;
|
|
183
|
+
result.raw[9] = t9;
|
|
184
|
+
result.raw[10] = t10;
|
|
185
|
+
result.raw[11] = t11;
|
|
186
|
+
result.raw[12] = t12;
|
|
187
|
+
result.raw[13] = t13;
|
|
188
|
+
result.raw[14] = t14;
|
|
189
|
+
result.raw[15] = t15;
|
|
190
|
+
return result;
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
template <class D, HWY_IF_T_SIZE_D(D, 2)>
|
|
194
|
+
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
|
|
195
|
+
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
|
|
196
|
+
TFromD<D> t5, TFromD<D> t6,
|
|
197
|
+
TFromD<D> t7) {
|
|
198
|
+
VFromD<D> result;
|
|
199
|
+
result.raw[0] = t0;
|
|
200
|
+
result.raw[1] = t1;
|
|
201
|
+
result.raw[2] = t2;
|
|
202
|
+
result.raw[3] = t3;
|
|
203
|
+
result.raw[4] = t4;
|
|
204
|
+
result.raw[5] = t5;
|
|
205
|
+
result.raw[6] = t6;
|
|
206
|
+
result.raw[7] = t7;
|
|
207
|
+
return result;
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
template <class D, HWY_IF_T_SIZE_D(D, 4)>
|
|
211
|
+
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
|
|
212
|
+
TFromD<D> t2, TFromD<D> t3) {
|
|
213
|
+
VFromD<D> result;
|
|
214
|
+
result.raw[0] = t0;
|
|
215
|
+
result.raw[1] = t1;
|
|
216
|
+
result.raw[2] = t2;
|
|
217
|
+
result.raw[3] = t3;
|
|
218
|
+
return result;
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
template <class D, HWY_IF_T_SIZE_D(D, 8)>
|
|
222
|
+
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
|
|
223
|
+
VFromD<D> result;
|
|
224
|
+
result.raw[0] = t0;
|
|
225
|
+
result.raw[1] = t1;
|
|
226
|
+
return result;
|
|
227
|
+
}
|
|
228
|
+
|
|
159
229
|
// ------------------------------ Iota
|
|
160
230
|
|
|
161
231
|
template <class D, typename T = TFromD<D>, typename T2>
|
|
162
232
|
HWY_API VFromD<D> Iota(D d, T2 first) {
|
|
163
233
|
VFromD<D> v;
|
|
164
234
|
for (size_t i = 0; i < MaxLanes(d); ++i) {
|
|
165
|
-
v.raw[i] =
|
|
166
|
-
AddWithWraparound(hwy::IsFloatTag<T>(), static_cast<T>(first), i);
|
|
235
|
+
v.raw[i] = AddWithWraparound(static_cast<T>(first), i);
|
|
167
236
|
}
|
|
168
237
|
return v;
|
|
169
238
|
}
|
|
@@ -284,9 +353,8 @@ HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) {
|
|
|
284
353
|
// ------------------------------ BroadcastSignBit
|
|
285
354
|
template <typename T, size_t N>
|
|
286
355
|
HWY_API Vec128<T, N> BroadcastSignBit(Vec128<T, N> v) {
|
|
287
|
-
// This is used inside ShiftRight, so we cannot implement in terms of it.
|
|
288
356
|
for (size_t i = 0; i < N; ++i) {
|
|
289
|
-
v.raw[i] = v.raw[i]
|
|
357
|
+
v.raw[i] = ScalarShr(v.raw[i], sizeof(T) * 8 - 1);
|
|
290
358
|
}
|
|
291
359
|
return v;
|
|
292
360
|
}
|
|
@@ -297,7 +365,7 @@ HWY_API Vec128<T, N> BroadcastSignBit(Vec128<T, N> v) {
|
|
|
297
365
|
template <typename T, size_t N>
|
|
298
366
|
HWY_API Mask128<T, N> MaskFromVec(Vec128<T, N> v) {
|
|
299
367
|
Mask128<T, N> mask;
|
|
300
|
-
CopySameSize(&v, &mask);
|
|
368
|
+
CopySameSize(&v.raw, &mask.bits);
|
|
301
369
|
return mask;
|
|
302
370
|
}
|
|
303
371
|
|
|
@@ -307,20 +375,15 @@ using MFromD = decltype(MaskFromVec(VFromD<D>()));
|
|
|
307
375
|
template <class DTo, class MFrom>
|
|
308
376
|
HWY_API MFromD<DTo> RebindMask(DTo /* tag */, MFrom mask) {
|
|
309
377
|
MFromD<DTo> to;
|
|
310
|
-
CopySameSize(&mask, &to);
|
|
378
|
+
CopySameSize(&mask.bits, &to.bits);
|
|
311
379
|
return to;
|
|
312
380
|
}
|
|
313
381
|
|
|
314
|
-
template <typename T, size_t N>
|
|
315
|
-
Vec128<T, N> VecFromMask(Mask128<T, N> mask) {
|
|
316
|
-
Vec128<T, N> v;
|
|
317
|
-
CopySameSize(&mask, &v);
|
|
318
|
-
return v;
|
|
319
|
-
}
|
|
320
|
-
|
|
321
382
|
template <class D>
|
|
322
383
|
VFromD<D> VecFromMask(D /* tag */, MFromD<D> mask) {
|
|
323
|
-
|
|
384
|
+
VFromD<D> v;
|
|
385
|
+
CopySameSize(&mask.bits, &v.raw);
|
|
386
|
+
return v;
|
|
324
387
|
}
|
|
325
388
|
|
|
326
389
|
template <class D>
|
|
@@ -336,19 +399,20 @@ HWY_API MFromD<D> FirstN(D d, size_t n) {
|
|
|
336
399
|
template <typename T, size_t N>
|
|
337
400
|
HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
|
|
338
401
|
Vec128<T, N> no) {
|
|
339
|
-
|
|
402
|
+
const DFromV<decltype(yes)> d;
|
|
403
|
+
return IfVecThenElse(VecFromMask(d, mask), yes, no);
|
|
340
404
|
}
|
|
341
405
|
|
|
342
406
|
template <typename T, size_t N>
|
|
343
407
|
HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
|
|
344
408
|
const DFromV<decltype(yes)> d;
|
|
345
|
-
return IfVecThenElse(VecFromMask(mask), yes, Zero(d));
|
|
409
|
+
return IfVecThenElse(VecFromMask(d, mask), yes, Zero(d));
|
|
346
410
|
}
|
|
347
411
|
|
|
348
412
|
template <typename T, size_t N>
|
|
349
413
|
HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
|
|
350
414
|
const DFromV<decltype(no)> d;
|
|
351
|
-
return IfVecThenElse(VecFromMask(mask), Zero(d), no);
|
|
415
|
+
return IfVecThenElse(VecFromMask(d, mask), Zero(d), no);
|
|
352
416
|
}
|
|
353
417
|
|
|
354
418
|
template <typename T, size_t N>
|
|
@@ -364,17 +428,12 @@ HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
|
|
|
364
428
|
return v;
|
|
365
429
|
}
|
|
366
430
|
|
|
367
|
-
template <typename T, size_t N>
|
|
368
|
-
HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
|
|
369
|
-
const DFromV<decltype(v)> d;
|
|
370
|
-
return IfNegativeThenElse(v, Zero(d), v);
|
|
371
|
-
}
|
|
372
|
-
|
|
373
431
|
// ------------------------------ Mask logical
|
|
374
432
|
|
|
375
433
|
template <typename T, size_t N>
|
|
376
434
|
HWY_API Mask128<T, N> Not(Mask128<T, N> m) {
|
|
377
|
-
|
|
435
|
+
const Simd<T, N, 0> d;
|
|
436
|
+
return MaskFromVec(Not(VecFromMask(d, m)));
|
|
378
437
|
}
|
|
379
438
|
|
|
380
439
|
template <typename T, size_t N>
|
|
@@ -426,41 +485,26 @@ HWY_API Vec128<T, N> ShiftLeft(Vec128<T, N> v) {
|
|
|
426
485
|
template <int kBits, typename T, size_t N>
|
|
427
486
|
HWY_API Vec128<T, N> ShiftRight(Vec128<T, N> v) {
|
|
428
487
|
static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
|
|
429
|
-
#if __cplusplus >= 202002L
|
|
430
488
|
// Signed right shift is now guaranteed to be arithmetic (rounding toward
|
|
431
489
|
// negative infinity, i.e. shifting in the sign bit).
|
|
432
490
|
for (size_t i = 0; i < N; ++i) {
|
|
433
|
-
v.raw[i] =
|
|
434
|
-
}
|
|
435
|
-
#else
|
|
436
|
-
if (IsSigned<T>()) {
|
|
437
|
-
// Emulate arithmetic shift using only logical (unsigned) shifts, because
|
|
438
|
-
// signed shifts are still implementation-defined.
|
|
439
|
-
using TU = hwy::MakeUnsigned<T>;
|
|
440
|
-
for (size_t i = 0; i < N; ++i) {
|
|
441
|
-
const TU shifted = static_cast<TU>(static_cast<TU>(v.raw[i]) >> kBits);
|
|
442
|
-
const TU sign = v.raw[i] < 0 ? static_cast<TU>(~TU{0}) : 0;
|
|
443
|
-
const size_t sign_shift =
|
|
444
|
-
static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - kBits);
|
|
445
|
-
const TU upper = static_cast<TU>(sign << sign_shift);
|
|
446
|
-
v.raw[i] = static_cast<T>(shifted | upper);
|
|
447
|
-
}
|
|
448
|
-
} else { // T is unsigned
|
|
449
|
-
for (size_t i = 0; i < N; ++i) {
|
|
450
|
-
v.raw[i] = static_cast<T>(v.raw[i] >> kBits);
|
|
451
|
-
}
|
|
491
|
+
v.raw[i] = ScalarShr(v.raw[i], kBits);
|
|
452
492
|
}
|
|
453
|
-
|
|
493
|
+
|
|
454
494
|
return v;
|
|
455
495
|
}
|
|
456
496
|
|
|
457
497
|
// ------------------------------ RotateRight (ShiftRight)
|
|
458
|
-
template <int kBits, typename T, size_t N>
|
|
498
|
+
template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
|
|
459
499
|
HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
|
|
500
|
+
const DFromV<decltype(v)> d;
|
|
501
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
502
|
+
|
|
460
503
|
constexpr size_t kSizeInBits = sizeof(T) * 8;
|
|
461
504
|
static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
|
|
462
505
|
if (kBits == 0) return v;
|
|
463
|
-
|
|
506
|
+
|
|
507
|
+
return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
|
|
464
508
|
ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
|
|
465
509
|
}
|
|
466
510
|
|
|
@@ -477,31 +521,10 @@ HWY_API Vec128<T, N> ShiftLeftSame(Vec128<T, N> v, int bits) {
|
|
|
477
521
|
|
|
478
522
|
template <typename T, size_t N>
|
|
479
523
|
HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, int bits) {
|
|
480
|
-
#if __cplusplus >= 202002L
|
|
481
|
-
// Signed right shift is now guaranteed to be arithmetic (rounding toward
|
|
482
|
-
// negative infinity, i.e. shifting in the sign bit).
|
|
483
524
|
for (size_t i = 0; i < N; ++i) {
|
|
484
|
-
v.raw[i] =
|
|
485
|
-
}
|
|
486
|
-
#else
|
|
487
|
-
if (IsSigned<T>()) {
|
|
488
|
-
// Emulate arithmetic shift using only logical (unsigned) shifts, because
|
|
489
|
-
// signed shifts are still implementation-defined.
|
|
490
|
-
using TU = hwy::MakeUnsigned<T>;
|
|
491
|
-
for (size_t i = 0; i < N; ++i) {
|
|
492
|
-
const TU shifted = static_cast<TU>(static_cast<TU>(v.raw[i]) >> bits);
|
|
493
|
-
const TU sign = v.raw[i] < 0 ? static_cast<TU>(~TU{0}) : 0;
|
|
494
|
-
const size_t sign_shift =
|
|
495
|
-
static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - bits);
|
|
496
|
-
const TU upper = static_cast<TU>(sign << sign_shift);
|
|
497
|
-
v.raw[i] = static_cast<T>(shifted | upper);
|
|
498
|
-
}
|
|
499
|
-
} else {
|
|
500
|
-
for (size_t i = 0; i < N; ++i) {
|
|
501
|
-
v.raw[i] = static_cast<T>(v.raw[i] >> bits); // unsigned, logical shift
|
|
502
|
-
}
|
|
525
|
+
v.raw[i] = ScalarShr(v.raw[i], bits);
|
|
503
526
|
}
|
|
504
|
-
|
|
527
|
+
|
|
505
528
|
return v;
|
|
506
529
|
}
|
|
507
530
|
|
|
@@ -519,32 +542,10 @@ HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, Vec128<T, N> bits) {
|
|
|
519
542
|
|
|
520
543
|
template <typename T, size_t N>
|
|
521
544
|
HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, Vec128<T, N> bits) {
|
|
522
|
-
#if __cplusplus >= 202002L
|
|
523
|
-
// Signed right shift is now guaranteed to be arithmetic (rounding toward
|
|
524
|
-
// negative infinity, i.e. shifting in the sign bit).
|
|
525
545
|
for (size_t i = 0; i < N; ++i) {
|
|
526
|
-
v.raw[i] =
|
|
527
|
-
}
|
|
528
|
-
#else
|
|
529
|
-
if (IsSigned<T>()) {
|
|
530
|
-
// Emulate arithmetic shift using only logical (unsigned) shifts, because
|
|
531
|
-
// signed shifts are still implementation-defined.
|
|
532
|
-
using TU = hwy::MakeUnsigned<T>;
|
|
533
|
-
for (size_t i = 0; i < N; ++i) {
|
|
534
|
-
const TU shifted =
|
|
535
|
-
static_cast<TU>(static_cast<TU>(v.raw[i]) >> bits.raw[i]);
|
|
536
|
-
const TU sign = v.raw[i] < 0 ? static_cast<TU>(~TU{0}) : 0;
|
|
537
|
-
const size_t sign_shift = static_cast<size_t>(
|
|
538
|
-
static_cast<int>(sizeof(TU)) * 8 - 1 - bits.raw[i]);
|
|
539
|
-
const TU upper = static_cast<TU>(sign << sign_shift);
|
|
540
|
-
v.raw[i] = static_cast<T>(shifted | upper);
|
|
541
|
-
}
|
|
542
|
-
} else { // T is unsigned
|
|
543
|
-
for (size_t i = 0; i < N; ++i) {
|
|
544
|
-
v.raw[i] = static_cast<T>(v.raw[i] >> bits.raw[i]);
|
|
545
|
-
}
|
|
546
|
+
v.raw[i] = ScalarShr(v.raw[i], static_cast<int>(bits.raw[i]));
|
|
546
547
|
}
|
|
547
|
-
|
|
548
|
+
|
|
548
549
|
return v;
|
|
549
550
|
}
|
|
550
551
|
|
|
@@ -614,6 +615,15 @@ HWY_API Vec128<uint64_t, (N + 7) / 8> SumsOf8(Vec128<uint8_t, N> v) {
|
|
|
614
615
|
return sums;
|
|
615
616
|
}
|
|
616
617
|
|
|
618
|
+
template <size_t N>
|
|
619
|
+
HWY_API Vec128<int64_t, (N + 7) / 8> SumsOf8(Vec128<int8_t, N> v) {
|
|
620
|
+
Vec128<int64_t, (N + 7) / 8> sums;
|
|
621
|
+
for (size_t i = 0; i < N; ++i) {
|
|
622
|
+
sums.raw[i / 8] += v.raw[i];
|
|
623
|
+
}
|
|
624
|
+
return sums;
|
|
625
|
+
}
|
|
626
|
+
|
|
617
627
|
// ------------------------------ SaturatedAdd
|
|
618
628
|
template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
|
|
619
629
|
HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
|
|
@@ -652,34 +662,14 @@ HWY_API Vec128<T, N> AverageRound(Vec128<T, N> a, Vec128<T, N> b) {
|
|
|
652
662
|
|
|
653
663
|
// ------------------------------ Abs
|
|
654
664
|
|
|
655
|
-
// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
|
|
656
|
-
namespace detail {
|
|
657
|
-
|
|
658
665
|
template <typename T, size_t N>
|
|
659
|
-
|
|
666
|
+
HWY_API Vec128<T, N> Abs(Vec128<T, N> a) {
|
|
660
667
|
for (size_t i = 0; i < N; ++i) {
|
|
661
|
-
|
|
662
|
-
const T min = hwy::LimitsMin<T>();
|
|
663
|
-
a.raw[i] = static_cast<T>((s >= 0 || s == min) ? a.raw[i] : -s);
|
|
668
|
+
a.raw[i] = ScalarAbs(a.raw[i]);
|
|
664
669
|
}
|
|
665
670
|
return a;
|
|
666
671
|
}
|
|
667
672
|
|
|
668
|
-
template <typename T, size_t N>
|
|
669
|
-
HWY_INLINE Vec128<T, N> Abs(hwy::FloatTag /*tag*/, Vec128<T, N> v) {
|
|
670
|
-
for (size_t i = 0; i < N; ++i) {
|
|
671
|
-
v.raw[i] = std::abs(v.raw[i]);
|
|
672
|
-
}
|
|
673
|
-
return v;
|
|
674
|
-
}
|
|
675
|
-
|
|
676
|
-
} // namespace detail
|
|
677
|
-
|
|
678
|
-
template <typename T, size_t N>
|
|
679
|
-
HWY_API Vec128<T, N> Abs(Vec128<T, N> a) {
|
|
680
|
-
return detail::Abs(hwy::TypeTag<T>(), a);
|
|
681
|
-
}
|
|
682
|
-
|
|
683
673
|
// ------------------------------ Min/Max
|
|
684
674
|
|
|
685
675
|
// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
|
|
@@ -706,9 +696,9 @@ template <typename T, size_t N>
|
|
|
706
696
|
HWY_INLINE Vec128<T, N> Min(hwy::FloatTag /*tag*/, Vec128<T, N> a,
|
|
707
697
|
Vec128<T, N> b) {
|
|
708
698
|
for (size_t i = 0; i < N; ++i) {
|
|
709
|
-
if (
|
|
699
|
+
if (ScalarIsNaN(a.raw[i])) {
|
|
710
700
|
a.raw[i] = b.raw[i];
|
|
711
|
-
} else if (
|
|
701
|
+
} else if (ScalarIsNaN(b.raw[i])) {
|
|
712
702
|
// no change
|
|
713
703
|
} else {
|
|
714
704
|
a.raw[i] = HWY_MIN(a.raw[i], b.raw[i]);
|
|
@@ -720,9 +710,9 @@ template <typename T, size_t N>
|
|
|
720
710
|
HWY_INLINE Vec128<T, N> Max(hwy::FloatTag /*tag*/, Vec128<T, N> a,
|
|
721
711
|
Vec128<T, N> b) {
|
|
722
712
|
for (size_t i = 0; i < N; ++i) {
|
|
723
|
-
if (
|
|
713
|
+
if (ScalarIsNaN(a.raw[i])) {
|
|
724
714
|
a.raw[i] = b.raw[i];
|
|
725
|
-
} else if (
|
|
715
|
+
} else if (ScalarIsNaN(b.raw[i])) {
|
|
726
716
|
// no change
|
|
727
717
|
} else {
|
|
728
718
|
a.raw[i] = HWY_MAX(a.raw[i], b.raw[i]);
|
|
@@ -825,7 +815,7 @@ HWY_API Vec128<T, N> operator*(Vec128<T, N> a, Vec128<T, N> b) {
|
|
|
825
815
|
return detail::Mul(hwy::TypeTag<T>(), a, b);
|
|
826
816
|
}
|
|
827
817
|
|
|
828
|
-
template <typename T, size_t N>
|
|
818
|
+
template <typename T, size_t N, HWY_IF_FLOAT(T)>
|
|
829
819
|
HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
|
|
830
820
|
for (size_t i = 0; i < N; ++i) {
|
|
831
821
|
a.raw[i] = (b.raw[i] == T{0}) ? 0 : a.raw[i] / b.raw[i];
|
|
@@ -833,26 +823,36 @@ HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
|
|
|
833
823
|
return a;
|
|
834
824
|
}
|
|
835
825
|
|
|
836
|
-
// Returns the upper
|
|
837
|
-
template <size_t N
|
|
838
|
-
|
|
826
|
+
// Returns the upper sizeof(T)*8 bits of a * b in each lane.
|
|
827
|
+
template <class T, size_t N,
|
|
828
|
+
HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
|
|
829
|
+
HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
|
|
830
|
+
HWY_API Vec128<T, N> MulHigh(Vec128<T, N> a, Vec128<T, N> b) {
|
|
831
|
+
using TW = MakeWide<T>;
|
|
839
832
|
for (size_t i = 0; i < N; ++i) {
|
|
840
|
-
a.raw[i] = static_cast<
|
|
833
|
+
a.raw[i] = static_cast<T>(
|
|
834
|
+
(static_cast<TW>(a.raw[i]) * static_cast<TW>(b.raw[i])) >>
|
|
835
|
+
(sizeof(T) * 8));
|
|
841
836
|
}
|
|
842
837
|
return a;
|
|
843
838
|
}
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
839
|
+
|
|
840
|
+
template <class T, HWY_IF_UI64(T)>
|
|
841
|
+
HWY_API Vec128<T, 1> MulHigh(Vec128<T, 1> a, Vec128<T, 1> b) {
|
|
842
|
+
T hi;
|
|
843
|
+
Mul128(GetLane(a), GetLane(b), &hi);
|
|
844
|
+
return Set(Full64<T>(), hi);
|
|
845
|
+
}
|
|
846
|
+
|
|
847
|
+
template <class T, HWY_IF_UI64(T)>
|
|
848
|
+
HWY_API Vec128<T> MulHigh(Vec128<T> a, Vec128<T> b) {
|
|
849
|
+
T hi_0;
|
|
850
|
+
T hi_1;
|
|
851
|
+
|
|
852
|
+
Mul128(GetLane(a), GetLane(b), &hi_0);
|
|
853
|
+
Mul128(ExtractLane(a, 1), ExtractLane(b, 1), &hi_1);
|
|
854
|
+
|
|
855
|
+
return Dup128VecFromValues(Full128<T>(), hi_0, hi_1);
|
|
856
856
|
}
|
|
857
857
|
|
|
858
858
|
template <size_t N>
|
|
@@ -900,7 +900,7 @@ HWY_API Vec128<float, N> ApproximateReciprocal(Vec128<float, N> v) {
|
|
|
900
900
|
// Zero inputs are allowed, but callers are responsible for replacing the
|
|
901
901
|
// return value with something else (typically using IfThenElse). This check
|
|
902
902
|
// avoids a ubsan error. The result is arbitrary.
|
|
903
|
-
v.raw[i] = (
|
|
903
|
+
v.raw[i] = (ScalarAbs(v.raw[i]) == 0.0f) ? 0.0f : 1.0f / v.raw[i];
|
|
904
904
|
}
|
|
905
905
|
return v;
|
|
906
906
|
}
|
|
@@ -913,25 +913,25 @@ HWY_API Vec128<T, N> AbsDiff(Vec128<T, N> a, Vec128<T, N> b) {
|
|
|
913
913
|
|
|
914
914
|
// ------------------------------ Floating-point multiply-add variants
|
|
915
915
|
|
|
916
|
-
template <typename T, size_t N>
|
|
916
|
+
template <typename T, size_t N, HWY_IF_FLOAT(T)>
|
|
917
917
|
HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x,
|
|
918
918
|
Vec128<T, N> add) {
|
|
919
919
|
return mul * x + add;
|
|
920
920
|
}
|
|
921
921
|
|
|
922
|
-
template <typename T, size_t N>
|
|
922
|
+
template <typename T, size_t N, HWY_IF_FLOAT(T)>
|
|
923
923
|
HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x,
|
|
924
924
|
Vec128<T, N> add) {
|
|
925
925
|
return add - mul * x;
|
|
926
926
|
}
|
|
927
927
|
|
|
928
|
-
template <typename T, size_t N>
|
|
928
|
+
template <typename T, size_t N, HWY_IF_FLOAT(T)>
|
|
929
929
|
HWY_API Vec128<T, N> MulSub(Vec128<T, N> mul, Vec128<T, N> x,
|
|
930
930
|
Vec128<T, N> sub) {
|
|
931
931
|
return mul * x - sub;
|
|
932
932
|
}
|
|
933
933
|
|
|
934
|
-
template <typename T, size_t N>
|
|
934
|
+
template <typename T, size_t N, HWY_IF_FLOAT(T)>
|
|
935
935
|
HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x,
|
|
936
936
|
Vec128<T, N> sub) {
|
|
937
937
|
return Neg(mul) * x - sub;
|
|
@@ -943,21 +943,52 @@ template <size_t N>
|
|
|
943
943
|
HWY_API Vec128<float, N> ApproximateReciprocalSqrt(Vec128<float, N> v) {
|
|
944
944
|
for (size_t i = 0; i < N; ++i) {
|
|
945
945
|
const float half = v.raw[i] * 0.5f;
|
|
946
|
-
uint32_t bits;
|
|
947
|
-
CopySameSize(&v.raw[i], &bits);
|
|
948
946
|
// Initial guess based on log2(f)
|
|
949
|
-
|
|
950
|
-
|
|
947
|
+
v.raw[i] = BitCastScalar<float>(static_cast<uint32_t>(
|
|
948
|
+
0x5F3759DF - (BitCastScalar<uint32_t>(v.raw[i]) >> 1)));
|
|
951
949
|
// One Newton-Raphson iteration
|
|
952
950
|
v.raw[i] = v.raw[i] * (1.5f - (half * v.raw[i] * v.raw[i]));
|
|
953
951
|
}
|
|
954
952
|
return v;
|
|
955
953
|
}
|
|
956
954
|
|
|
955
|
+
namespace detail {
|
|
956
|
+
|
|
957
|
+
static HWY_INLINE float ScalarSqrt(float v) {
|
|
958
|
+
#if defined(HWY_NO_LIBCXX)
|
|
959
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
960
|
+
return __builtin_sqrt(v);
|
|
961
|
+
#else
|
|
962
|
+
uint32_t bits = BitCastScalar<uint32_t>(v);
|
|
963
|
+
// Coarse approximation, letting the exponent LSB leak into the mantissa
|
|
964
|
+
bits = (1 << 29) + (bits >> 1) - (1 << 22);
|
|
965
|
+
return BitCastScalar<float>(bits);
|
|
966
|
+
#endif // !HWY_COMPILER_GCC_ACTUAL
|
|
967
|
+
#else
|
|
968
|
+
return sqrtf(v);
|
|
969
|
+
#endif // !HWY_NO_LIBCXX
|
|
970
|
+
}
|
|
971
|
+
static HWY_INLINE double ScalarSqrt(double v) {
|
|
972
|
+
#if defined(HWY_NO_LIBCXX)
|
|
973
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
974
|
+
return __builtin_sqrt(v);
|
|
975
|
+
#else
|
|
976
|
+
uint64_t bits = BitCastScalar<uint64_t>(v);
|
|
977
|
+
// Coarse approximation, letting the exponent LSB leak into the mantissa
|
|
978
|
+
bits = (1ULL << 61) + (bits >> 1) - (1ULL << 51);
|
|
979
|
+
return BitCastScalar<double>(bits);
|
|
980
|
+
#endif // !HWY_COMPILER_GCC_ACTUAL
|
|
981
|
+
#else
|
|
982
|
+
return sqrt(v);
|
|
983
|
+
#endif // HWY_NO_LIBCXX
|
|
984
|
+
}
|
|
985
|
+
|
|
986
|
+
} // namespace detail
|
|
987
|
+
|
|
957
988
|
template <typename T, size_t N>
|
|
958
989
|
HWY_API Vec128<T, N> Sqrt(Vec128<T, N> v) {
|
|
959
990
|
for (size_t i = 0; i < N; ++i) {
|
|
960
|
-
v.raw[i] =
|
|
991
|
+
v.raw[i] = detail::ScalarSqrt(v.raw[i]);
|
|
961
992
|
}
|
|
962
993
|
return v;
|
|
963
994
|
}
|
|
@@ -967,21 +998,23 @@ HWY_API Vec128<T, N> Sqrt(Vec128<T, N> v) {
|
|
|
967
998
|
template <typename T, size_t N>
|
|
968
999
|
HWY_API Vec128<T, N> Round(Vec128<T, N> v) {
|
|
969
1000
|
using TI = MakeSigned<T>;
|
|
1001
|
+
const T k0 = ConvertScalarTo<T>(0);
|
|
970
1002
|
const Vec128<T, N> a = Abs(v);
|
|
971
1003
|
for (size_t i = 0; i < N; ++i) {
|
|
972
1004
|
if (!(a.raw[i] < MantissaEnd<T>())) { // Huge or NaN
|
|
973
1005
|
continue;
|
|
974
1006
|
}
|
|
975
|
-
const T bias = v.raw[i] <
|
|
976
|
-
const TI rounded =
|
|
1007
|
+
const T bias = ConvertScalarTo<T>(v.raw[i] < k0 ? -0.5 : 0.5);
|
|
1008
|
+
const TI rounded = ConvertScalarTo<TI>(v.raw[i] + bias);
|
|
977
1009
|
if (rounded == 0) {
|
|
978
|
-
v.raw[i] = v.raw[i] < 0 ? T
|
|
1010
|
+
v.raw[i] = v.raw[i] < 0 ? ConvertScalarTo<T>(-0) : k0;
|
|
979
1011
|
continue;
|
|
980
1012
|
}
|
|
981
|
-
const T rounded_f =
|
|
1013
|
+
const T rounded_f = ConvertScalarTo<T>(rounded);
|
|
982
1014
|
// Round to even
|
|
983
|
-
if ((rounded & 1) &&
|
|
984
|
-
|
|
1015
|
+
if ((rounded & 1) &&
|
|
1016
|
+
ScalarAbs(rounded_f - v.raw[i]) == ConvertScalarTo<T>(0.5)) {
|
|
1017
|
+
v.raw[i] = ConvertScalarTo<T>(rounded - (v.raw[i] < k0 ? -1 : 1));
|
|
985
1018
|
continue;
|
|
986
1019
|
}
|
|
987
1020
|
v.raw[i] = rounded_f;
|
|
@@ -994,30 +1027,32 @@ template <size_t N>
|
|
|
994
1027
|
HWY_API Vec128<int32_t, N> NearestInt(Vec128<float, N> v) {
|
|
995
1028
|
using T = float;
|
|
996
1029
|
using TI = int32_t;
|
|
1030
|
+
const T k0 = ConvertScalarTo<T>(0);
|
|
997
1031
|
|
|
998
1032
|
const Vec128<float, N> abs = Abs(v);
|
|
999
1033
|
Vec128<int32_t, N> ret;
|
|
1000
1034
|
for (size_t i = 0; i < N; ++i) {
|
|
1001
|
-
const bool signbit =
|
|
1035
|
+
const bool signbit = ScalarSignBit(v.raw[i]);
|
|
1002
1036
|
|
|
1003
1037
|
if (!(abs.raw[i] < MantissaEnd<T>())) { // Huge or NaN
|
|
1004
1038
|
// Check if too large to cast or NaN
|
|
1005
|
-
if (!(abs.raw[i] <=
|
|
1039
|
+
if (!(abs.raw[i] <= ConvertScalarTo<T>(LimitsMax<TI>()))) {
|
|
1006
1040
|
ret.raw[i] = signbit ? LimitsMin<TI>() : LimitsMax<TI>();
|
|
1007
1041
|
continue;
|
|
1008
1042
|
}
|
|
1009
1043
|
ret.raw[i] = static_cast<TI>(v.raw[i]);
|
|
1010
1044
|
continue;
|
|
1011
1045
|
}
|
|
1012
|
-
const T bias = v.raw[i] <
|
|
1013
|
-
const TI rounded =
|
|
1046
|
+
const T bias = ConvertScalarTo<T>(v.raw[i] < k0 ? -0.5 : 0.5);
|
|
1047
|
+
const TI rounded = ConvertScalarTo<TI>(v.raw[i] + bias);
|
|
1014
1048
|
if (rounded == 0) {
|
|
1015
1049
|
ret.raw[i] = 0;
|
|
1016
1050
|
continue;
|
|
1017
1051
|
}
|
|
1018
|
-
const T rounded_f =
|
|
1052
|
+
const T rounded_f = ConvertScalarTo<T>(rounded);
|
|
1019
1053
|
// Round to even
|
|
1020
|
-
if ((rounded & 1) &&
|
|
1054
|
+
if ((rounded & 1) &&
|
|
1055
|
+
ScalarAbs(rounded_f - v.raw[i]) == ConvertScalarTo<T>(0.5)) {
|
|
1021
1056
|
ret.raw[i] = rounded - (signbit ? -1 : 1);
|
|
1022
1057
|
continue;
|
|
1023
1058
|
}
|
|
@@ -1056,8 +1091,7 @@ Vec128<Float, N> Ceil(Vec128<Float, N> v) {
|
|
|
1056
1091
|
for (size_t i = 0; i < N; ++i) {
|
|
1057
1092
|
const bool positive = v.raw[i] > Float(0.0);
|
|
1058
1093
|
|
|
1059
|
-
Bits bits;
|
|
1060
|
-
CopySameSize(&v.raw[i], &bits);
|
|
1094
|
+
Bits bits = BitCastScalar<Bits>(v.raw[i]);
|
|
1061
1095
|
|
|
1062
1096
|
const int exponent =
|
|
1063
1097
|
static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
|
|
@@ -1077,7 +1111,7 @@ Vec128<Float, N> Ceil(Vec128<Float, N> v) {
|
|
|
1077
1111
|
if (positive) bits += (kMantissaMask + 1) >> exponent;
|
|
1078
1112
|
bits &= ~mantissa_mask;
|
|
1079
1113
|
|
|
1080
|
-
|
|
1114
|
+
v.raw[i] = BitCastScalar<Float>(bits);
|
|
1081
1115
|
}
|
|
1082
1116
|
return v;
|
|
1083
1117
|
}
|
|
@@ -1094,8 +1128,7 @@ Vec128<Float, N> Floor(Vec128<Float, N> v) {
|
|
|
1094
1128
|
for (size_t i = 0; i < N; ++i) {
|
|
1095
1129
|
const bool negative = v.raw[i] < Float(0.0);
|
|
1096
1130
|
|
|
1097
|
-
Bits bits;
|
|
1098
|
-
CopySameSize(&v.raw[i], &bits);
|
|
1131
|
+
Bits bits = BitCastScalar<Bits>(v.raw[i]);
|
|
1099
1132
|
|
|
1100
1133
|
const int exponent =
|
|
1101
1134
|
static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
|
|
@@ -1115,7 +1148,7 @@ Vec128<Float, N> Floor(Vec128<Float, N> v) {
|
|
|
1115
1148
|
if (negative) bits += (kMantissaMask + 1) >> exponent;
|
|
1116
1149
|
bits &= ~mantissa_mask;
|
|
1117
1150
|
|
|
1118
|
-
|
|
1151
|
+
v.raw[i] = BitCastScalar<Float>(bits);
|
|
1119
1152
|
}
|
|
1120
1153
|
return v;
|
|
1121
1154
|
}
|
|
@@ -1127,44 +1160,11 @@ HWY_API Mask128<T, N> IsNaN(Vec128<T, N> v) {
|
|
|
1127
1160
|
Mask128<T, N> ret;
|
|
1128
1161
|
for (size_t i = 0; i < N; ++i) {
|
|
1129
1162
|
// std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY.
|
|
1130
|
-
|
|
1131
|
-
CopySameSize(&v.raw[i], &bits);
|
|
1132
|
-
bits += bits;
|
|
1133
|
-
bits >>= 1; // clear sign bit
|
|
1134
|
-
// NaN if all exponent bits are set and the mantissa is not zero.
|
|
1135
|
-
ret.bits[i] = Mask128<T, N>::FromBool(bits > ExponentMask<T>());
|
|
1163
|
+
ret.bits[i] = Mask128<T, N>::FromBool(ScalarIsNaN(v.raw[i]));
|
|
1136
1164
|
}
|
|
1137
1165
|
return ret;
|
|
1138
1166
|
}
|
|
1139
1167
|
|
|
1140
|
-
template <typename T, size_t N>
|
|
1141
|
-
HWY_API Mask128<T, N> IsInf(Vec128<T, N> v) {
|
|
1142
|
-
static_assert(IsFloat<T>(), "Only for float");
|
|
1143
|
-
const DFromV<decltype(v)> d;
|
|
1144
|
-
const RebindToSigned<decltype(d)> di;
|
|
1145
|
-
const VFromD<decltype(di)> vi = BitCast(di, v);
|
|
1146
|
-
// 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
|
|
1147
|
-
return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
|
|
1148
|
-
}
|
|
1149
|
-
|
|
1150
|
-
// Returns whether normal/subnormal/zero.
|
|
1151
|
-
template <typename T, size_t N>
|
|
1152
|
-
HWY_API Mask128<T, N> IsFinite(Vec128<T, N> v) {
|
|
1153
|
-
static_assert(IsFloat<T>(), "Only for float");
|
|
1154
|
-
const DFromV<decltype(v)> d;
|
|
1155
|
-
const RebindToUnsigned<decltype(d)> du;
|
|
1156
|
-
const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison
|
|
1157
|
-
using VI = VFromD<decltype(di)>;
|
|
1158
|
-
using VU = VFromD<decltype(du)>;
|
|
1159
|
-
const VU vu = BitCast(du, v);
|
|
1160
|
-
// 'Shift left' to clear the sign bit, then right so we can compare with the
|
|
1161
|
-
// max exponent (cannot compare with MaxExponentTimes2 directly because it is
|
|
1162
|
-
// negative and non-negative floats would be greater).
|
|
1163
|
-
const VI exp =
|
|
1164
|
-
BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
|
|
1165
|
-
return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
|
|
1166
|
-
}
|
|
1167
|
-
|
|
1168
1168
|
// ================================================== COMPARE
|
|
1169
1169
|
|
|
1170
1170
|
template <typename T, size_t N>
|
|
@@ -1400,177 +1400,387 @@ HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p,
|
|
|
1400
1400
|
CopyBytes(v.raw, p, num_of_lanes_to_store * sizeof(TFromD<D>));
|
|
1401
1401
|
}
|
|
1402
1402
|
|
|
1403
|
-
//
|
|
1403
|
+
// ================================================== COMBINE
|
|
1404
1404
|
|
|
1405
|
-
|
|
1406
|
-
|
|
1407
|
-
|
|
1408
|
-
|
|
1409
|
-
|
|
1410
|
-
|
|
1411
|
-
#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
|
|
1412
|
-
#endif
|
|
1405
|
+
template <typename T, size_t N>
|
|
1406
|
+
HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
|
|
1407
|
+
Vec128<T, N / 2> ret;
|
|
1408
|
+
CopyBytes<N / 2 * sizeof(T)>(v.raw, ret.raw);
|
|
1409
|
+
return ret;
|
|
1410
|
+
}
|
|
1413
1411
|
|
|
1414
|
-
template <class D
|
|
1415
|
-
HWY_API
|
|
1416
|
-
|
|
1417
|
-
alignas(16) T buf0[MaxLanes(d)];
|
|
1418
|
-
alignas(16) T buf1[MaxLanes(d)];
|
|
1419
|
-
for (size_t i = 0; i < MaxLanes(d); ++i) {
|
|
1420
|
-
buf0[i] = *unaligned++;
|
|
1421
|
-
buf1[i] = *unaligned++;
|
|
1422
|
-
}
|
|
1423
|
-
v0 = Load(d, buf0);
|
|
1424
|
-
v1 = Load(d, buf1);
|
|
1412
|
+
template <class D>
|
|
1413
|
+
HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) {
|
|
1414
|
+
return LowerHalf(v);
|
|
1425
1415
|
}
|
|
1426
1416
|
|
|
1427
|
-
template <class D
|
|
1428
|
-
HWY_API
|
|
1429
|
-
|
|
1430
|
-
|
|
1431
|
-
|
|
1432
|
-
alignas(16) T buf2[MaxLanes(d)];
|
|
1433
|
-
for (size_t i = 0; i < MaxLanes(d); ++i) {
|
|
1434
|
-
buf0[i] = *unaligned++;
|
|
1435
|
-
buf1[i] = *unaligned++;
|
|
1436
|
-
buf2[i] = *unaligned++;
|
|
1437
|
-
}
|
|
1438
|
-
v0 = Load(d, buf0);
|
|
1439
|
-
v1 = Load(d, buf1);
|
|
1440
|
-
v2 = Load(d, buf2);
|
|
1417
|
+
template <class D>
|
|
1418
|
+
HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
|
|
1419
|
+
VFromD<D> ret;
|
|
1420
|
+
CopyBytes<d.MaxBytes()>(&v.raw[MaxLanes(d)], ret.raw);
|
|
1421
|
+
return ret;
|
|
1441
1422
|
}
|
|
1442
1423
|
|
|
1443
|
-
template <class D
|
|
1444
|
-
HWY_API
|
|
1445
|
-
|
|
1446
|
-
|
|
1447
|
-
|
|
1448
|
-
|
|
1449
|
-
alignas(16) T buf2[MaxLanes(d)];
|
|
1450
|
-
alignas(16) T buf3[MaxLanes(d)];
|
|
1451
|
-
for (size_t i = 0; i < MaxLanes(d); ++i) {
|
|
1452
|
-
buf0[i] = *unaligned++;
|
|
1453
|
-
buf1[i] = *unaligned++;
|
|
1454
|
-
buf2[i] = *unaligned++;
|
|
1455
|
-
buf3[i] = *unaligned++;
|
|
1456
|
-
}
|
|
1457
|
-
v0 = Load(d, buf0);
|
|
1458
|
-
v1 = Load(d, buf1);
|
|
1459
|
-
v2 = Load(d, buf2);
|
|
1460
|
-
v3 = Load(d, buf3);
|
|
1424
|
+
template <class D>
|
|
1425
|
+
HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> v) {
|
|
1426
|
+
const Half<decltype(d)> dh;
|
|
1427
|
+
VFromD<D> ret; // zero-initialized
|
|
1428
|
+
CopyBytes<dh.MaxBytes()>(v.raw, ret.raw);
|
|
1429
|
+
return ret;
|
|
1461
1430
|
}
|
|
1462
1431
|
|
|
1463
|
-
|
|
1432
|
+
template <class D, class VH = VFromD<Half<D>>>
|
|
1433
|
+
HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) {
|
|
1434
|
+
const Half<decltype(d)> dh;
|
|
1435
|
+
VFromD<D> ret;
|
|
1436
|
+
CopyBytes<dh.MaxBytes()>(lo_half.raw, &ret.raw[0]);
|
|
1437
|
+
CopyBytes<dh.MaxBytes()>(hi_half.raw, &ret.raw[MaxLanes(dh)]);
|
|
1438
|
+
return ret;
|
|
1439
|
+
}
|
|
1464
1440
|
|
|
1465
1441
|
template <class D>
|
|
1466
|
-
HWY_API
|
|
1467
|
-
|
|
1468
|
-
|
|
1469
|
-
|
|
1470
|
-
|
|
1471
|
-
|
|
1442
|
+
HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
1443
|
+
const Half<decltype(d)> dh;
|
|
1444
|
+
VFromD<D> ret;
|
|
1445
|
+
CopyBytes<dh.MaxBytes()>(lo.raw, &ret.raw[0]);
|
|
1446
|
+
CopyBytes<dh.MaxBytes()>(hi.raw, &ret.raw[MaxLanes(dh)]);
|
|
1447
|
+
return ret;
|
|
1472
1448
|
}
|
|
1473
1449
|
|
|
1474
1450
|
template <class D>
|
|
1475
|
-
HWY_API
|
|
1476
|
-
|
|
1477
|
-
|
|
1478
|
-
|
|
1479
|
-
|
|
1480
|
-
|
|
1481
|
-
}
|
|
1451
|
+
HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
1452
|
+
const Half<decltype(d)> dh;
|
|
1453
|
+
VFromD<D> ret;
|
|
1454
|
+
CopyBytes<dh.MaxBytes()>(&lo.raw[MaxLanes(dh)], &ret.raw[0]);
|
|
1455
|
+
CopyBytes<dh.MaxBytes()>(&hi.raw[MaxLanes(dh)], &ret.raw[MaxLanes(dh)]);
|
|
1456
|
+
return ret;
|
|
1482
1457
|
}
|
|
1483
1458
|
|
|
1484
1459
|
template <class D>
|
|
1485
|
-
HWY_API
|
|
1486
|
-
|
|
1487
|
-
|
|
1488
|
-
|
|
1489
|
-
|
|
1490
|
-
|
|
1491
|
-
*unaligned++ = v2.raw[i];
|
|
1492
|
-
*unaligned++ = v3.raw[i];
|
|
1493
|
-
}
|
|
1460
|
+
HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
1461
|
+
const Half<decltype(d)> dh;
|
|
1462
|
+
VFromD<D> ret;
|
|
1463
|
+
CopyBytes<dh.MaxBytes()>(&lo.raw[MaxLanes(dh)], &ret.raw[0]);
|
|
1464
|
+
CopyBytes<dh.MaxBytes()>(hi.raw, &ret.raw[MaxLanes(dh)]);
|
|
1465
|
+
return ret;
|
|
1494
1466
|
}
|
|
1495
1467
|
|
|
1496
|
-
// ------------------------------ Stream
|
|
1497
1468
|
template <class D>
|
|
1498
|
-
HWY_API
|
|
1499
|
-
|
|
1469
|
+
HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
1470
|
+
const Half<decltype(d)> dh;
|
|
1471
|
+
VFromD<D> ret;
|
|
1472
|
+
CopyBytes<dh.MaxBytes()>(lo.raw, &ret.raw[0]);
|
|
1473
|
+
CopyBytes<dh.MaxBytes()>(&hi.raw[MaxLanes(dh)], &ret.raw[MaxLanes(dh)]);
|
|
1474
|
+
return ret;
|
|
1500
1475
|
}
|
|
1501
1476
|
|
|
1502
|
-
|
|
1503
|
-
|
|
1477
|
+
template <class D>
|
|
1478
|
+
HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
1479
|
+
const Half<decltype(d)> dh;
|
|
1480
|
+
VFromD<D> ret;
|
|
1481
|
+
for (size_t i = 0; i < MaxLanes(dh); ++i) {
|
|
1482
|
+
ret.raw[i] = lo.raw[2 * i];
|
|
1483
|
+
}
|
|
1484
|
+
for (size_t i = 0; i < MaxLanes(dh); ++i) {
|
|
1485
|
+
ret.raw[MaxLanes(dh) + i] = hi.raw[2 * i];
|
|
1486
|
+
}
|
|
1487
|
+
return ret;
|
|
1488
|
+
}
|
|
1504
1489
|
|
|
1505
|
-
//
|
|
1490
|
+
// 2023-11-23: workaround for incorrect codegen (reduction_test fails for
|
|
1491
|
+
// SumsOf2 because PromoteOddTo, which uses ConcatOdd, returns zero).
|
|
1492
|
+
#if HWY_ARCH_RISCV && HWY_TARGET == HWY_EMU128 && HWY_COMPILER_CLANG
|
|
1493
|
+
#define HWY_EMU128_CONCAT_INLINE HWY_NOINLINE
|
|
1494
|
+
#else
|
|
1495
|
+
#define HWY_EMU128_CONCAT_INLINE HWY_API
|
|
1496
|
+
#endif
|
|
1506
1497
|
|
|
1507
|
-
|
|
1508
|
-
|
|
1498
|
+
template <class D>
|
|
1499
|
+
HWY_EMU128_CONCAT_INLINE VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
1500
|
+
const Half<decltype(d)> dh;
|
|
1501
|
+
VFromD<D> ret;
|
|
1502
|
+
for (size_t i = 0; i < MaxLanes(dh); ++i) {
|
|
1503
|
+
ret.raw[i] = lo.raw[2 * i + 1];
|
|
1504
|
+
}
|
|
1505
|
+
for (size_t i = 0; i < MaxLanes(dh); ++i) {
|
|
1506
|
+
ret.raw[MaxLanes(dh) + i] = hi.raw[2 * i + 1];
|
|
1507
|
+
}
|
|
1508
|
+
return ret;
|
|
1509
|
+
}
|
|
1509
1510
|
|
|
1510
|
-
|
|
1511
|
+
// ------------------------------ CombineShiftRightBytes
|
|
1512
|
+
template <int kBytes, class D>
|
|
1513
|
+
HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
1514
|
+
VFromD<D> ret;
|
|
1515
|
+
const uint8_t* HWY_RESTRICT lo8 =
|
|
1516
|
+
reinterpret_cast<const uint8_t * HWY_RESTRICT>(lo.raw);
|
|
1517
|
+
uint8_t* HWY_RESTRICT ret8 =
|
|
1518
|
+
reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
|
|
1519
|
+
CopyBytes<d.MaxBytes() - kBytes>(lo8 + kBytes, ret8);
|
|
1520
|
+
CopyBytes<kBytes>(hi.raw, ret8 + d.MaxBytes() - kBytes);
|
|
1521
|
+
return ret;
|
|
1522
|
+
}
|
|
1511
1523
|
|
|
1512
|
-
|
|
1513
|
-
|
|
1514
|
-
|
|
1515
|
-
|
|
1524
|
+
// ------------------------------ ShiftLeftBytes
|
|
1525
|
+
|
|
1526
|
+
template <int kBytes, class D>
|
|
1527
|
+
HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) {
|
|
1528
|
+
static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
|
|
1529
|
+
VFromD<D> ret;
|
|
1530
|
+
uint8_t* HWY_RESTRICT ret8 =
|
|
1531
|
+
reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
|
|
1532
|
+
ZeroBytes<kBytes>(ret8);
|
|
1533
|
+
CopyBytes<d.MaxBytes() - kBytes>(v.raw, ret8 + kBytes);
|
|
1534
|
+
return ret;
|
|
1535
|
+
}
|
|
1536
|
+
|
|
1537
|
+
template <int kBytes, typename T, size_t N>
|
|
1538
|
+
HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
|
|
1539
|
+
return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
|
|
1540
|
+
}
|
|
1541
|
+
|
|
1542
|
+
// ------------------------------ ShiftLeftLanes
|
|
1516
1543
|
|
|
1517
|
-
|
|
1518
|
-
|
|
1544
|
+
template <int kLanes, class D, typename T = TFromD<D>>
|
|
1545
|
+
HWY_API VFromD<D> ShiftLeftLanes(D d, VFromD<D> v) {
|
|
1546
|
+
const Repartition<uint8_t, decltype(d)> d8;
|
|
1547
|
+
return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
|
|
1548
|
+
}
|
|
1549
|
+
|
|
1550
|
+
template <int kLanes, typename T, size_t N>
|
|
1551
|
+
HWY_API Vec128<T, N> ShiftLeftLanes(Vec128<T, N> v) {
|
|
1552
|
+
return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
|
|
1553
|
+
}
|
|
1554
|
+
|
|
1555
|
+
// ------------------------------ ShiftRightBytes
|
|
1556
|
+
template <int kBytes, class D>
|
|
1557
|
+
HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) {
|
|
1558
|
+
static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
|
|
1559
|
+
VFromD<D> ret;
|
|
1560
|
+
const uint8_t* HWY_RESTRICT v8 =
|
|
1561
|
+
reinterpret_cast<const uint8_t * HWY_RESTRICT>(v.raw);
|
|
1562
|
+
uint8_t* HWY_RESTRICT ret8 =
|
|
1563
|
+
reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
|
|
1564
|
+
CopyBytes<d.MaxBytes() - kBytes>(v8 + kBytes, ret8);
|
|
1565
|
+
ZeroBytes<kBytes>(ret8 + d.MaxBytes() - kBytes);
|
|
1566
|
+
return ret;
|
|
1567
|
+
}
|
|
1568
|
+
|
|
1569
|
+
// ------------------------------ ShiftRightLanes
|
|
1570
|
+
template <int kLanes, class D>
|
|
1571
|
+
HWY_API VFromD<D> ShiftRightLanes(D d, VFromD<D> v) {
|
|
1572
|
+
const Repartition<uint8_t, decltype(d)> d8;
|
|
1573
|
+
constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
|
|
1574
|
+
return BitCast(d, ShiftRightBytes<kBytes>(d8, BitCast(d8, v)));
|
|
1575
|
+
}
|
|
1576
|
+
|
|
1577
|
+
// ------------------------------ Tuples, PromoteEvenTo/PromoteOddTo
|
|
1578
|
+
#include "hwy/ops/inside-inl.h"
|
|
1579
|
+
|
|
1580
|
+
// ------------------------------ LoadInterleaved2/3/4
|
|
1581
|
+
|
|
1582
|
+
// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2.
|
|
1583
|
+
// We implement those here because scalar code is likely faster than emulation
|
|
1584
|
+
// via shuffles.
|
|
1585
|
+
#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
|
|
1586
|
+
#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
|
|
1587
|
+
#else
|
|
1588
|
+
#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
|
|
1589
|
+
#endif
|
|
1590
|
+
|
|
1591
|
+
template <class D, typename T = TFromD<D>>
|
|
1592
|
+
HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
|
|
1593
|
+
VFromD<D>& v0, VFromD<D>& v1) {
|
|
1594
|
+
alignas(16) T buf0[MaxLanes(d)];
|
|
1595
|
+
alignas(16) T buf1[MaxLanes(d)];
|
|
1596
|
+
for (size_t i = 0; i < MaxLanes(d); ++i) {
|
|
1597
|
+
buf0[i] = *unaligned++;
|
|
1598
|
+
buf1[i] = *unaligned++;
|
|
1599
|
+
}
|
|
1600
|
+
v0 = Load(d, buf0);
|
|
1601
|
+
v1 = Load(d, buf1);
|
|
1602
|
+
}
|
|
1519
1603
|
|
|
1520
|
-
|
|
1521
|
-
|
|
1522
|
-
|
|
1523
|
-
|
|
1524
|
-
|
|
1525
|
-
|
|
1526
|
-
|
|
1527
|
-
|
|
1604
|
+
template <class D, typename T = TFromD<D>>
|
|
1605
|
+
HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
|
|
1606
|
+
VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
|
|
1607
|
+
alignas(16) T buf0[MaxLanes(d)];
|
|
1608
|
+
alignas(16) T buf1[MaxLanes(d)];
|
|
1609
|
+
alignas(16) T buf2[MaxLanes(d)];
|
|
1610
|
+
for (size_t i = 0; i < MaxLanes(d); ++i) {
|
|
1611
|
+
buf0[i] = *unaligned++;
|
|
1612
|
+
buf1[i] = *unaligned++;
|
|
1613
|
+
buf2[i] = *unaligned++;
|
|
1614
|
+
}
|
|
1615
|
+
v0 = Load(d, buf0);
|
|
1616
|
+
v1 = Load(d, buf1);
|
|
1617
|
+
v2 = Load(d, buf2);
|
|
1618
|
+
}
|
|
1528
1619
|
|
|
1529
|
-
|
|
1530
|
-
|
|
1531
|
-
|
|
1532
|
-
|
|
1533
|
-
|
|
1534
|
-
|
|
1620
|
+
template <class D, typename T = TFromD<D>>
|
|
1621
|
+
HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
|
|
1622
|
+
VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
|
|
1623
|
+
VFromD<D>& v3) {
|
|
1624
|
+
alignas(16) T buf0[MaxLanes(d)];
|
|
1625
|
+
alignas(16) T buf1[MaxLanes(d)];
|
|
1626
|
+
alignas(16) T buf2[MaxLanes(d)];
|
|
1627
|
+
alignas(16) T buf3[MaxLanes(d)];
|
|
1628
|
+
for (size_t i = 0; i < MaxLanes(d); ++i) {
|
|
1629
|
+
buf0[i] = *unaligned++;
|
|
1630
|
+
buf1[i] = *unaligned++;
|
|
1631
|
+
buf2[i] = *unaligned++;
|
|
1632
|
+
buf3[i] = *unaligned++;
|
|
1535
1633
|
}
|
|
1634
|
+
v0 = Load(d, buf0);
|
|
1635
|
+
v1 = Load(d, buf1);
|
|
1636
|
+
v2 = Load(d, buf2);
|
|
1637
|
+
v3 = Load(d, buf3);
|
|
1536
1638
|
}
|
|
1537
1639
|
|
|
1538
|
-
|
|
1539
|
-
HWY_INLINE ToT CastValueForF2IConv(hwy::SignedTag /* to_type_tag */,
|
|
1540
|
-
FromT val) {
|
|
1541
|
-
// Prevent ubsan errors when converting float to narrower integer
|
|
1640
|
+
// ------------------------------ StoreInterleaved2/3/4
|
|
1542
1641
|
|
|
1543
|
-
|
|
1544
|
-
|
|
1642
|
+
template <class D>
|
|
1643
|
+
HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
|
|
1644
|
+
TFromD<D>* HWY_RESTRICT unaligned) {
|
|
1645
|
+
for (size_t i = 0; i < MaxLanes(d); ++i) {
|
|
1646
|
+
*unaligned++ = v0.raw[i];
|
|
1647
|
+
*unaligned++ = v1.raw[i];
|
|
1648
|
+
}
|
|
1649
|
+
}
|
|
1545
1650
|
|
|
1546
|
-
|
|
1547
|
-
|
|
1548
|
-
|
|
1549
|
-
|
|
1550
|
-
|
|
1551
|
-
|
|
1552
|
-
|
|
1651
|
+
template <class D>
|
|
1652
|
+
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
|
|
1653
|
+
TFromD<D>* HWY_RESTRICT unaligned) {
|
|
1654
|
+
for (size_t i = 0; i < MaxLanes(d); ++i) {
|
|
1655
|
+
*unaligned++ = v0.raw[i];
|
|
1656
|
+
*unaligned++ = v1.raw[i];
|
|
1657
|
+
*unaligned++ = v2.raw[i];
|
|
1658
|
+
}
|
|
1659
|
+
}
|
|
1553
1660
|
|
|
1554
|
-
|
|
1555
|
-
|
|
1556
|
-
|
|
1557
|
-
|
|
1661
|
+
template <class D>
|
|
1662
|
+
HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
|
|
1663
|
+
VFromD<D> v3, D d,
|
|
1664
|
+
TFromD<D>* HWY_RESTRICT unaligned) {
|
|
1665
|
+
for (size_t i = 0; i < MaxLanes(d); ++i) {
|
|
1666
|
+
*unaligned++ = v0.raw[i];
|
|
1667
|
+
*unaligned++ = v1.raw[i];
|
|
1668
|
+
*unaligned++ = v2.raw[i];
|
|
1669
|
+
*unaligned++ = v3.raw[i];
|
|
1558
1670
|
}
|
|
1559
1671
|
}
|
|
1560
1672
|
|
|
1673
|
+
// ------------------------------ Stream
|
|
1674
|
+
template <class D>
|
|
1675
|
+
HWY_API void Stream(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
|
|
1676
|
+
Store(v, d, aligned);
|
|
1677
|
+
}
|
|
1678
|
+
|
|
1679
|
+
// ------------------------------ Scatter in generic_ops-inl.h
|
|
1680
|
+
// ------------------------------ Gather in generic_ops-inl.h
|
|
1681
|
+
|
|
1682
|
+
// ================================================== CONVERT
|
|
1683
|
+
|
|
1684
|
+
// ConvertTo and DemoteTo with floating-point input and integer output truncate
|
|
1685
|
+
// (rounding toward zero).
|
|
1686
|
+
|
|
1687
|
+
namespace detail {
|
|
1688
|
+
|
|
1689
|
+
template <class ToT, class FromT>
|
|
1690
|
+
HWY_INLINE ToT CastValueForF2IConv(FromT val) {
|
|
1691
|
+
// Prevent ubsan errors when converting float to narrower integer
|
|
1692
|
+
|
|
1693
|
+
using FromTU = MakeUnsigned<FromT>;
|
|
1694
|
+
using ToTU = MakeUnsigned<ToT>;
|
|
1695
|
+
|
|
1696
|
+
constexpr unsigned kMaxExpField =
|
|
1697
|
+
static_cast<unsigned>(MaxExponentField<FromT>());
|
|
1698
|
+
constexpr unsigned kExpBias = kMaxExpField >> 1;
|
|
1699
|
+
constexpr unsigned kMinOutOfRangeExpField = static_cast<unsigned>(HWY_MIN(
|
|
1700
|
+
kExpBias + sizeof(ToT) * 8 - static_cast<unsigned>(IsSigned<ToT>()),
|
|
1701
|
+
kMaxExpField));
|
|
1702
|
+
|
|
1703
|
+
// If ToT is signed, compare only the exponent bits of val against
|
|
1704
|
+
// kMinOutOfRangeExpField.
|
|
1705
|
+
//
|
|
1706
|
+
// Otherwise, if ToT is unsigned, compare the sign bit plus exponent bits of
|
|
1707
|
+
// val against kMinOutOfRangeExpField as a negative value is outside of the
|
|
1708
|
+
// range of an unsigned integer type.
|
|
1709
|
+
const FromT val_to_compare =
|
|
1710
|
+
static_cast<FromT>(IsSigned<ToT>() ? ScalarAbs(val) : val);
|
|
1711
|
+
|
|
1712
|
+
// val is within the range of ToT if
|
|
1713
|
+
// (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is less
|
|
1714
|
+
// than kMinOutOfRangeExpField
|
|
1715
|
+
//
|
|
1716
|
+
// Otherwise, val is either outside of the range of ToT or equal to
|
|
1717
|
+
// LimitsMin<ToT>() if
|
|
1718
|
+
// (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is greater
|
|
1719
|
+
// than or equal to kMinOutOfRangeExpField.
|
|
1720
|
+
|
|
1721
|
+
return (static_cast<unsigned>(BitCastScalar<FromTU>(val_to_compare) >>
|
|
1722
|
+
MantissaBits<FromT>()) < kMinOutOfRangeExpField)
|
|
1723
|
+
? static_cast<ToT>(val)
|
|
1724
|
+
: static_cast<ToT>(static_cast<ToTU>(LimitsMax<ToT>()) +
|
|
1725
|
+
static_cast<ToTU>(ScalarSignBit(val)));
|
|
1726
|
+
}
|
|
1727
|
+
|
|
1561
1728
|
template <class ToT, class ToTypeTag, class FromT>
|
|
1562
1729
|
HWY_INLINE ToT CastValueForPromoteTo(ToTypeTag /* to_type_tag */, FromT val) {
|
|
1563
|
-
return
|
|
1730
|
+
return ConvertScalarTo<ToT>(val);
|
|
1564
1731
|
}
|
|
1565
1732
|
|
|
1566
1733
|
template <class ToT>
|
|
1567
|
-
HWY_INLINE ToT CastValueForPromoteTo(hwy::SignedTag to_type_tag
|
|
1568
|
-
|
|
1734
|
+
HWY_INLINE ToT CastValueForPromoteTo(hwy::SignedTag /*to_type_tag*/,
|
|
1735
|
+
float val) {
|
|
1736
|
+
return CastValueForF2IConv<ToT>(val);
|
|
1569
1737
|
}
|
|
1570
1738
|
|
|
1571
1739
|
template <class ToT>
|
|
1572
|
-
HWY_INLINE ToT CastValueForPromoteTo(hwy::UnsignedTag to_type_tag
|
|
1573
|
-
|
|
1740
|
+
HWY_INLINE ToT CastValueForPromoteTo(hwy::UnsignedTag /*to_type_tag*/,
|
|
1741
|
+
float val) {
|
|
1742
|
+
return CastValueForF2IConv<ToT>(val);
|
|
1743
|
+
}
|
|
1744
|
+
// If val is within the range of ToT, CastValueForInRangeF2IConv<ToT>(val)
|
|
1745
|
+
// returns static_cast<ToT>(val)
|
|
1746
|
+
//
|
|
1747
|
+
// Otherwise, CastValueForInRangeF2IConv<ToT>(val) returns an
|
|
1748
|
+
// implementation-defined result if val is not within the range of ToT.
|
|
1749
|
+
template <class ToT, class FromT>
|
|
1750
|
+
HWY_INLINE ToT CastValueForInRangeF2IConv(FromT val) {
|
|
1751
|
+
// Prevent ubsan errors when converting float to narrower integer
|
|
1752
|
+
|
|
1753
|
+
using FromTU = MakeUnsigned<FromT>;
|
|
1754
|
+
|
|
1755
|
+
constexpr unsigned kMaxExpField =
|
|
1756
|
+
static_cast<unsigned>(MaxExponentField<FromT>());
|
|
1757
|
+
constexpr unsigned kExpBias = kMaxExpField >> 1;
|
|
1758
|
+
constexpr unsigned kMinOutOfRangeExpField = static_cast<unsigned>(HWY_MIN(
|
|
1759
|
+
kExpBias + sizeof(ToT) * 8 - static_cast<unsigned>(IsSigned<ToT>()),
|
|
1760
|
+
kMaxExpField));
|
|
1761
|
+
|
|
1762
|
+
// If ToT is signed, compare only the exponent bits of val against
|
|
1763
|
+
// kMinOutOfRangeExpField.
|
|
1764
|
+
//
|
|
1765
|
+
// Otherwise, if ToT is unsigned, compare the sign bit plus exponent bits of
|
|
1766
|
+
// val against kMinOutOfRangeExpField as a negative value is outside of the
|
|
1767
|
+
// range of an unsigned integer type.
|
|
1768
|
+
const FromT val_to_compare =
|
|
1769
|
+
static_cast<FromT>(IsSigned<ToT>() ? ScalarAbs(val) : val);
|
|
1770
|
+
|
|
1771
|
+
// val is within the range of ToT if
|
|
1772
|
+
// (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is less
|
|
1773
|
+
// than kMinOutOfRangeExpField
|
|
1774
|
+
//
|
|
1775
|
+
// Otherwise, val is either outside of the range of ToT or equal to
|
|
1776
|
+
// LimitsMin<ToT>() if
|
|
1777
|
+
// (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is greater
|
|
1778
|
+
// than or equal to kMinOutOfRangeExpField.
|
|
1779
|
+
|
|
1780
|
+
return (static_cast<unsigned>(BitCastScalar<FromTU>(val_to_compare) >>
|
|
1781
|
+
MantissaBits<FromT>()) < kMinOutOfRangeExpField)
|
|
1782
|
+
? static_cast<ToT>(val)
|
|
1783
|
+
: static_cast<ToT>(LimitsMin<ToT>());
|
|
1574
1784
|
}
|
|
1575
1785
|
|
|
1576
1786
|
} // namespace detail
|
|
@@ -1587,6 +1797,21 @@ HWY_API VFromD<DTo> PromoteTo(DTo d, Vec128<TFrom, HWY_MAX_LANES_D(DTo)> from) {
|
|
|
1587
1797
|
return ret;
|
|
1588
1798
|
}
|
|
1589
1799
|
|
|
1800
|
+
#ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
|
|
1801
|
+
#undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
|
|
1802
|
+
#else
|
|
1803
|
+
#define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
|
|
1804
|
+
#endif
|
|
1805
|
+
|
|
1806
|
+
template <class D64, HWY_IF_UI64_D(D64)>
|
|
1807
|
+
HWY_API VFromD<D64> PromoteInRangeTo(D64 d64, VFromD<Rebind<float, D64>> v) {
|
|
1808
|
+
VFromD<D64> ret;
|
|
1809
|
+
for (size_t i = 0; i < MaxLanes(d64); ++i) {
|
|
1810
|
+
ret.raw[i] = detail::CastValueForInRangeF2IConv<TFromD<D64>>(v.raw[i]);
|
|
1811
|
+
}
|
|
1812
|
+
return ret;
|
|
1813
|
+
}
|
|
1814
|
+
|
|
1590
1815
|
// MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(TFrom) is here,
|
|
1591
1816
|
// so we overload for TFrom=double and ToT={float,int32_t}.
|
|
1592
1817
|
template <class D, HWY_IF_F32_D(D)>
|
|
@@ -1594,10 +1819,10 @@ HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<double, D>> from) {
|
|
|
1594
1819
|
VFromD<D> ret;
|
|
1595
1820
|
for (size_t i = 0; i < MaxLanes(d); ++i) {
|
|
1596
1821
|
// Prevent ubsan errors when converting float to narrower integer/float
|
|
1597
|
-
if (
|
|
1598
|
-
|
|
1599
|
-
ret.raw[i] =
|
|
1600
|
-
|
|
1822
|
+
if (ScalarIsInf(from.raw[i]) ||
|
|
1823
|
+
ScalarAbs(from.raw[i]) > static_cast<double>(HighestValue<float>())) {
|
|
1824
|
+
ret.raw[i] = ScalarSignBit(from.raw[i]) ? LowestValue<float>()
|
|
1825
|
+
: HighestValue<float>();
|
|
1601
1826
|
continue;
|
|
1602
1827
|
}
|
|
1603
1828
|
ret.raw[i] = static_cast<float>(from.raw[i]);
|
|
@@ -1609,8 +1834,7 @@ HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<double, D>> from) {
|
|
|
1609
1834
|
VFromD<D> ret;
|
|
1610
1835
|
for (size_t i = 0; i < MaxLanes(d); ++i) {
|
|
1611
1836
|
// Prevent ubsan errors when converting double to narrower integer/int32_t
|
|
1612
|
-
ret.raw[i] = detail::CastValueForF2IConv<TFromD<D>>(
|
|
1613
|
-
hwy::TypeTag<TFromD<D>>(), from.raw[i]);
|
|
1837
|
+
ret.raw[i] = detail::CastValueForF2IConv<TFromD<D>>(from.raw[i]);
|
|
1614
1838
|
}
|
|
1615
1839
|
return ret;
|
|
1616
1840
|
}
|
|
@@ -1631,17 +1855,32 @@ HWY_API VFromD<DTo> DemoteTo(DTo /* tag */, Vec128<TFrom, N> from) {
|
|
|
1631
1855
|
return ret;
|
|
1632
1856
|
}
|
|
1633
1857
|
|
|
1858
|
+
// Disable the default unsigned to signed DemoteTo/ReorderDemote2To
|
|
1859
|
+
// implementations in generic_ops-inl.h on EMU128 as the EMU128 target has
|
|
1860
|
+
// target-specific implementations of the unsigned to signed DemoteTo and
|
|
1861
|
+
// ReorderDemote2To ops
|
|
1862
|
+
|
|
1863
|
+
// NOTE: hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr is used instead of
|
|
1864
|
+
// hwy::EnableIf<false>* = nullptr to avoid compiler errors since
|
|
1865
|
+
// !hwy::IsSame<V, V>() is always false and as !hwy::IsSame<V, V>() will cause
|
|
1866
|
+
// SFINAE to occur instead of a hard error due to a dependency on the V template
|
|
1867
|
+
// argument
|
|
1868
|
+
#undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
|
|
1869
|
+
#define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) \
|
|
1870
|
+
hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr
|
|
1871
|
+
|
|
1634
1872
|
template <class DTo, typename TFrom, size_t N, HWY_IF_UNSIGNED(TFrom),
|
|
1635
|
-
|
|
1873
|
+
HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DTo)>
|
|
1636
1874
|
HWY_API VFromD<DTo> DemoteTo(DTo /* tag */, Vec128<TFrom, N> from) {
|
|
1637
1875
|
using TTo = TFromD<DTo>;
|
|
1638
1876
|
static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting");
|
|
1639
1877
|
|
|
1878
|
+
const auto max = static_cast<MakeUnsigned<TTo>>(LimitsMax<TTo>());
|
|
1879
|
+
|
|
1640
1880
|
VFromD<DTo> ret;
|
|
1641
1881
|
for (size_t i = 0; i < N; ++i) {
|
|
1642
1882
|
// Int to int: choose closest value in ToT to `from` (avoids UB)
|
|
1643
|
-
|
|
1644
|
-
ret.raw[i] = static_cast<TTo>(from.raw[i]);
|
|
1883
|
+
ret.raw[i] = static_cast<TTo>(HWY_MIN(from.raw[i], max));
|
|
1645
1884
|
}
|
|
1646
1885
|
return ret;
|
|
1647
1886
|
}
|
|
@@ -1689,14 +1928,15 @@ HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
|
|
|
1689
1928
|
return ret;
|
|
1690
1929
|
}
|
|
1691
1930
|
|
|
1692
|
-
template <class DN,
|
|
1693
|
-
HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
|
|
1931
|
+
template <class DN, HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DN), class V,
|
|
1932
|
+
HWY_IF_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
|
|
1694
1933
|
HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
|
|
1695
1934
|
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
|
|
1696
1935
|
const RepartitionToWide<decltype(dn)> dw;
|
|
1697
1936
|
const size_t NW = Lanes(dw);
|
|
1698
1937
|
using TN = TFromD<DN>;
|
|
1699
|
-
|
|
1938
|
+
using TN_U = MakeUnsigned<TN>;
|
|
1939
|
+
const TN_U max = static_cast<TN_U>(LimitsMax<TN>());
|
|
1700
1940
|
VFromD<DN> ret;
|
|
1701
1941
|
for (size_t i = 0; i < NW; ++i) {
|
|
1702
1942
|
ret.raw[i] = static_cast<TN>(HWY_MIN(a.raw[i], max));
|
|
@@ -1715,23 +1955,20 @@ HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
|
|
|
1715
1955
|
return ReorderDemote2To(dn, a, b);
|
|
1716
1956
|
}
|
|
1717
1957
|
|
|
1718
|
-
template <class DN,
|
|
1958
|
+
template <class DN, HWY_IF_SPECIAL_FLOAT_D(DN), class V,
|
|
1959
|
+
HWY_IF_F32_D(DFromV<V>),
|
|
1719
1960
|
HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
|
|
1720
1961
|
HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
|
|
1721
|
-
const
|
|
1722
|
-
|
|
1723
|
-
VFromD<
|
|
1724
|
-
|
|
1725
|
-
const auto a_bits = BitCast(du32, a);
|
|
1726
|
-
const auto b_bits = BitCast(du32, b);
|
|
1727
|
-
|
|
1962
|
+
const size_t NW = Lanes(dn) / 2;
|
|
1963
|
+
using TN = TFromD<DN>;
|
|
1964
|
+
VFromD<DN> ret;
|
|
1728
1965
|
for (size_t i = 0; i < NW; ++i) {
|
|
1729
|
-
ret.raw[i] =
|
|
1966
|
+
ret.raw[i] = ConvertScalarTo<TN>(a.raw[i]);
|
|
1730
1967
|
}
|
|
1731
1968
|
for (size_t i = 0; i < NW; ++i) {
|
|
1732
|
-
ret.raw[NW + i] =
|
|
1969
|
+
ret.raw[NW + i] = ConvertScalarTo<TN>(b.raw[i]);
|
|
1733
1970
|
}
|
|
1734
|
-
return
|
|
1971
|
+
return ret;
|
|
1735
1972
|
}
|
|
1736
1973
|
|
|
1737
1974
|
namespace detail {
|
|
@@ -1758,6 +1995,12 @@ HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<bfloat16_t, N> v) {
|
|
|
1758
1995
|
return ret;
|
|
1759
1996
|
}
|
|
1760
1997
|
|
|
1998
|
+
#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
|
|
1999
|
+
#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
|
|
2000
|
+
#else
|
|
2001
|
+
#define HWY_NATIVE_DEMOTE_F32_TO_BF16
|
|
2002
|
+
#endif
|
|
2003
|
+
|
|
1761
2004
|
template <class D, HWY_IF_BF16_D(D), size_t N>
|
|
1762
2005
|
HWY_API VFromD<D> DemoteTo(D /* tag */, Vec128<float, N> v) {
|
|
1763
2006
|
VFromD<D> ret;
|
|
@@ -1767,6 +2010,21 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec128<float, N> v) {
|
|
|
1767
2010
|
return ret;
|
|
1768
2011
|
}
|
|
1769
2012
|
|
|
2013
|
+
#ifdef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
|
|
2014
|
+
#undef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
|
|
2015
|
+
#else
|
|
2016
|
+
#define HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
|
|
2017
|
+
#endif
|
|
2018
|
+
|
|
2019
|
+
template <class D32, HWY_IF_UI32_D(D32)>
|
|
2020
|
+
HWY_API VFromD<D32> DemoteInRangeTo(D32 d32, VFromD<Rebind<double, D32>> v) {
|
|
2021
|
+
VFromD<D32> ret;
|
|
2022
|
+
for (size_t i = 0; i < MaxLanes(d32); ++i) {
|
|
2023
|
+
ret.raw[i] = detail::CastValueForInRangeF2IConv<TFromD<D32>>(v.raw[i]);
|
|
2024
|
+
}
|
|
2025
|
+
return ret;
|
|
2026
|
+
}
|
|
2027
|
+
|
|
1770
2028
|
// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
|
|
1771
2029
|
namespace detail {
|
|
1772
2030
|
|
|
@@ -1780,7 +2038,7 @@ HWY_API VFromD<DTo> ConvertTo(hwy::FloatTag /*tag*/, DTo /*tag*/,
|
|
|
1780
2038
|
|
|
1781
2039
|
for (size_t i = 0; i < N; ++i) {
|
|
1782
2040
|
// float## -> int##: return closest representable value
|
|
1783
|
-
ret.raw[i] = CastValueForF2IConv<ToT>(
|
|
2041
|
+
ret.raw[i] = CastValueForF2IConv<ToT>(from.raw[i]);
|
|
1784
2042
|
}
|
|
1785
2043
|
return ret;
|
|
1786
2044
|
}
|
|
@@ -1806,6 +2064,22 @@ HWY_API VFromD<DTo> ConvertTo(DTo d, Vec128<TFrom, HWY_MAX_LANES_D(DTo)> from) {
|
|
|
1806
2064
|
return detail::ConvertTo(hwy::IsFloatTag<TFrom>(), d, from);
|
|
1807
2065
|
}
|
|
1808
2066
|
|
|
2067
|
+
#ifdef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
|
|
2068
|
+
#undef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
|
|
2069
|
+
#else
|
|
2070
|
+
#define HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
|
|
2071
|
+
#endif
|
|
2072
|
+
|
|
2073
|
+
template <class DI, HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DI),
|
|
2074
|
+
HWY_IF_T_SIZE_ONE_OF_D(DI, (1 << 4) | (1 << 8))>
|
|
2075
|
+
HWY_API VFromD<DI> ConvertInRangeTo(DI di, VFromD<RebindToFloat<DI>> v) {
|
|
2076
|
+
VFromD<DI> ret;
|
|
2077
|
+
for (size_t i = 0; i < MaxLanes(di); i++) {
|
|
2078
|
+
ret.raw[i] = detail::CastValueForInRangeF2IConv<TFromD<DI>>(v.raw[i]);
|
|
2079
|
+
}
|
|
2080
|
+
return ret;
|
|
2081
|
+
}
|
|
2082
|
+
|
|
1809
2083
|
template <size_t N>
|
|
1810
2084
|
HWY_API Vec128<uint8_t, N> U8FromU32(Vec128<uint32_t, N> v) {
|
|
1811
2085
|
return DemoteTo(Simd<uint8_t, N, 0>(), v);
|
|
@@ -1893,172 +2167,6 @@ HWY_API VFromD<DN> OrderedTruncate2To(DN dn, V a, V b) {
|
|
|
1893
2167
|
return ret;
|
|
1894
2168
|
}
|
|
1895
2169
|
|
|
1896
|
-
// ================================================== COMBINE
|
|
1897
|
-
|
|
1898
|
-
template <typename T, size_t N>
|
|
1899
|
-
HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
|
|
1900
|
-
Vec128<T, N / 2> ret;
|
|
1901
|
-
CopyBytes<N / 2 * sizeof(T)>(v.raw, ret.raw);
|
|
1902
|
-
return ret;
|
|
1903
|
-
}
|
|
1904
|
-
|
|
1905
|
-
template <class D>
|
|
1906
|
-
HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) {
|
|
1907
|
-
return LowerHalf(v);
|
|
1908
|
-
}
|
|
1909
|
-
|
|
1910
|
-
template <class D>
|
|
1911
|
-
HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
|
|
1912
|
-
VFromD<D> ret;
|
|
1913
|
-
CopyBytes<d.MaxBytes()>(&v.raw[MaxLanes(d)], ret.raw);
|
|
1914
|
-
return ret;
|
|
1915
|
-
}
|
|
1916
|
-
|
|
1917
|
-
template <class D>
|
|
1918
|
-
HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> v) {
|
|
1919
|
-
const Half<decltype(d)> dh;
|
|
1920
|
-
VFromD<D> ret; // zero-initialized
|
|
1921
|
-
CopyBytes<dh.MaxBytes()>(v.raw, ret.raw);
|
|
1922
|
-
return ret;
|
|
1923
|
-
}
|
|
1924
|
-
|
|
1925
|
-
template <class D, class VH = VFromD<Half<D>>>
|
|
1926
|
-
HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) {
|
|
1927
|
-
const Half<decltype(d)> dh;
|
|
1928
|
-
VFromD<D> ret;
|
|
1929
|
-
CopyBytes<dh.MaxBytes()>(lo_half.raw, &ret.raw[0]);
|
|
1930
|
-
CopyBytes<dh.MaxBytes()>(hi_half.raw, &ret.raw[MaxLanes(dh)]);
|
|
1931
|
-
return ret;
|
|
1932
|
-
}
|
|
1933
|
-
|
|
1934
|
-
template <class D>
|
|
1935
|
-
HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
1936
|
-
const Half<decltype(d)> dh;
|
|
1937
|
-
VFromD<D> ret;
|
|
1938
|
-
CopyBytes<dh.MaxBytes()>(lo.raw, &ret.raw[0]);
|
|
1939
|
-
CopyBytes<dh.MaxBytes()>(hi.raw, &ret.raw[MaxLanes(dh)]);
|
|
1940
|
-
return ret;
|
|
1941
|
-
}
|
|
1942
|
-
|
|
1943
|
-
template <class D>
|
|
1944
|
-
HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
1945
|
-
const Half<decltype(d)> dh;
|
|
1946
|
-
VFromD<D> ret;
|
|
1947
|
-
CopyBytes<dh.MaxBytes()>(&lo.raw[MaxLanes(dh)], &ret.raw[0]);
|
|
1948
|
-
CopyBytes<dh.MaxBytes()>(&hi.raw[MaxLanes(dh)], &ret.raw[MaxLanes(dh)]);
|
|
1949
|
-
return ret;
|
|
1950
|
-
}
|
|
1951
|
-
|
|
1952
|
-
template <class D>
|
|
1953
|
-
HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
1954
|
-
const Half<decltype(d)> dh;
|
|
1955
|
-
VFromD<D> ret;
|
|
1956
|
-
CopyBytes<dh.MaxBytes()>(&lo.raw[MaxLanes(dh)], &ret.raw[0]);
|
|
1957
|
-
CopyBytes<dh.MaxBytes()>(hi.raw, &ret.raw[MaxLanes(dh)]);
|
|
1958
|
-
return ret;
|
|
1959
|
-
}
|
|
1960
|
-
|
|
1961
|
-
template <class D>
|
|
1962
|
-
HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
1963
|
-
const Half<decltype(d)> dh;
|
|
1964
|
-
VFromD<D> ret;
|
|
1965
|
-
CopyBytes<dh.MaxBytes()>(lo.raw, &ret.raw[0]);
|
|
1966
|
-
CopyBytes<dh.MaxBytes()>(&hi.raw[MaxLanes(dh)], &ret.raw[MaxLanes(dh)]);
|
|
1967
|
-
return ret;
|
|
1968
|
-
}
|
|
1969
|
-
|
|
1970
|
-
template <class D>
|
|
1971
|
-
HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
1972
|
-
const Half<decltype(d)> dh;
|
|
1973
|
-
VFromD<D> ret;
|
|
1974
|
-
for (size_t i = 0; i < MaxLanes(dh); ++i) {
|
|
1975
|
-
ret.raw[i] = lo.raw[2 * i];
|
|
1976
|
-
}
|
|
1977
|
-
for (size_t i = 0; i < MaxLanes(dh); ++i) {
|
|
1978
|
-
ret.raw[MaxLanes(dh) + i] = hi.raw[2 * i];
|
|
1979
|
-
}
|
|
1980
|
-
return ret;
|
|
1981
|
-
}
|
|
1982
|
-
|
|
1983
|
-
template <class D>
|
|
1984
|
-
HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
1985
|
-
const Half<decltype(d)> dh;
|
|
1986
|
-
VFromD<D> ret;
|
|
1987
|
-
for (size_t i = 0; i < MaxLanes(dh); ++i) {
|
|
1988
|
-
ret.raw[i] = lo.raw[2 * i + 1];
|
|
1989
|
-
}
|
|
1990
|
-
for (size_t i = 0; i < MaxLanes(dh); ++i) {
|
|
1991
|
-
ret.raw[MaxLanes(dh) + i] = hi.raw[2 * i + 1];
|
|
1992
|
-
}
|
|
1993
|
-
return ret;
|
|
1994
|
-
}
|
|
1995
|
-
|
|
1996
|
-
// ------------------------------ CombineShiftRightBytes
|
|
1997
|
-
template <int kBytes, class D>
|
|
1998
|
-
HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
1999
|
-
VFromD<D> ret;
|
|
2000
|
-
const uint8_t* HWY_RESTRICT lo8 =
|
|
2001
|
-
reinterpret_cast<const uint8_t * HWY_RESTRICT>(lo.raw);
|
|
2002
|
-
uint8_t* HWY_RESTRICT ret8 =
|
|
2003
|
-
reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
|
|
2004
|
-
CopyBytes<d.MaxBytes() - kBytes>(lo8 + kBytes, ret8);
|
|
2005
|
-
CopyBytes<kBytes>(hi.raw, ret8 + d.MaxBytes() - kBytes);
|
|
2006
|
-
return ret;
|
|
2007
|
-
}
|
|
2008
|
-
|
|
2009
|
-
// ------------------------------ ShiftLeftBytes
|
|
2010
|
-
|
|
2011
|
-
template <int kBytes, class D>
|
|
2012
|
-
HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) {
|
|
2013
|
-
static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
|
|
2014
|
-
VFromD<D> ret;
|
|
2015
|
-
uint8_t* HWY_RESTRICT ret8 =
|
|
2016
|
-
reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
|
|
2017
|
-
ZeroBytes<kBytes>(ret8);
|
|
2018
|
-
CopyBytes<d.MaxBytes() - kBytes>(v.raw, ret8 + kBytes);
|
|
2019
|
-
return ret;
|
|
2020
|
-
}
|
|
2021
|
-
|
|
2022
|
-
template <int kBytes, typename T, size_t N>
|
|
2023
|
-
HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
|
|
2024
|
-
return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
|
|
2025
|
-
}
|
|
2026
|
-
|
|
2027
|
-
// ------------------------------ ShiftLeftLanes
|
|
2028
|
-
|
|
2029
|
-
template <int kLanes, class D, typename T = TFromD<D>>
|
|
2030
|
-
HWY_API VFromD<D> ShiftLeftLanes(D d, VFromD<D> v) {
|
|
2031
|
-
const Repartition<uint8_t, decltype(d)> d8;
|
|
2032
|
-
return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
|
|
2033
|
-
}
|
|
2034
|
-
|
|
2035
|
-
template <int kLanes, typename T, size_t N>
|
|
2036
|
-
HWY_API Vec128<T, N> ShiftLeftLanes(Vec128<T, N> v) {
|
|
2037
|
-
return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
|
|
2038
|
-
}
|
|
2039
|
-
|
|
2040
|
-
// ------------------------------ ShiftRightBytes
|
|
2041
|
-
template <int kBytes, class D>
|
|
2042
|
-
HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) {
|
|
2043
|
-
static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
|
|
2044
|
-
VFromD<D> ret;
|
|
2045
|
-
const uint8_t* HWY_RESTRICT v8 =
|
|
2046
|
-
reinterpret_cast<const uint8_t * HWY_RESTRICT>(v.raw);
|
|
2047
|
-
uint8_t* HWY_RESTRICT ret8 =
|
|
2048
|
-
reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
|
|
2049
|
-
CopyBytes<d.MaxBytes() - kBytes>(v8 + kBytes, ret8);
|
|
2050
|
-
ZeroBytes<kBytes>(ret8 + d.MaxBytes() - kBytes);
|
|
2051
|
-
return ret;
|
|
2052
|
-
}
|
|
2053
|
-
|
|
2054
|
-
// ------------------------------ ShiftRightLanes
|
|
2055
|
-
template <int kLanes, class D>
|
|
2056
|
-
HWY_API VFromD<D> ShiftRightLanes(D d, VFromD<D> v) {
|
|
2057
|
-
const Repartition<uint8_t, decltype(d)> d8;
|
|
2058
|
-
constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
|
|
2059
|
-
return BitCast(d, ShiftRightBytes<kBytes>(d8, BitCast(d8, v)));
|
|
2060
|
-
}
|
|
2061
|
-
|
|
2062
2170
|
// ================================================== SWIZZLE
|
|
2063
2171
|
|
|
2064
2172
|
template <typename T, size_t N>
|
|
@@ -2101,6 +2209,24 @@ HWY_API Vec128<T, N> OddEven(Vec128<T, N> odd, Vec128<T, N> even) {
|
|
|
2101
2209
|
return odd;
|
|
2102
2210
|
}
|
|
2103
2211
|
|
|
2212
|
+
template <class D>
|
|
2213
|
+
HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
|
|
2214
|
+
constexpr size_t N = HWY_MAX_LANES_D(D);
|
|
2215
|
+
for (size_t i = 1; i < N; i += 2) {
|
|
2216
|
+
a.raw[i] = b.raw[i - 1];
|
|
2217
|
+
}
|
|
2218
|
+
return a;
|
|
2219
|
+
}
|
|
2220
|
+
|
|
2221
|
+
template <class D>
|
|
2222
|
+
HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
|
|
2223
|
+
constexpr size_t N = HWY_MAX_LANES_D(D);
|
|
2224
|
+
for (size_t i = 1; i < N; i += 2) {
|
|
2225
|
+
b.raw[i - 1] = a.raw[i];
|
|
2226
|
+
}
|
|
2227
|
+
return b;
|
|
2228
|
+
}
|
|
2229
|
+
|
|
2104
2230
|
template <typename T, size_t N>
|
|
2105
2231
|
HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
|
|
2106
2232
|
return even;
|
|
@@ -2349,8 +2475,8 @@ HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
|
|
|
2349
2475
|
}
|
|
2350
2476
|
|
|
2351
2477
|
// Additional overload for the optional tag.
|
|
2352
|
-
template <class
|
|
2353
|
-
HWY_API
|
|
2478
|
+
template <class D>
|
|
2479
|
+
HWY_API VFromD<D> InterleaveLower(D /* tag */, VFromD<D> a, VFromD<D> b) {
|
|
2354
2480
|
return InterleaveLower(a, b);
|
|
2355
2481
|
}
|
|
2356
2482
|
|
|
@@ -2416,6 +2542,15 @@ HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
|
|
|
2416
2542
|
return m;
|
|
2417
2543
|
}
|
|
2418
2544
|
|
|
2545
|
+
template <class D>
|
|
2546
|
+
HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
|
|
2547
|
+
MFromD<D> m;
|
|
2548
|
+
for (size_t i = 0; i < MaxLanes(d); ++i) {
|
|
2549
|
+
m.bits[i] = MFromD<D>::FromBool(((mask_bits >> i) & 1u) != 0);
|
|
2550
|
+
}
|
|
2551
|
+
return m;
|
|
2552
|
+
}
|
|
2553
|
+
|
|
2419
2554
|
// `p` points to at least 8 writable bytes.
|
|
2420
2555
|
template <class D>
|
|
2421
2556
|
HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
|
|
@@ -2517,7 +2652,7 @@ HWY_API Vec128<T, N> Expand(Vec128<T, N> v, const Mask128<T, N> mask) {
|
|
|
2517
2652
|
if (mask.bits[i]) {
|
|
2518
2653
|
ret.raw[i] = v.raw[in_pos++];
|
|
2519
2654
|
} else {
|
|
2520
|
-
ret.raw[i] = T();
|
|
2655
|
+
ret.raw[i] = ConvertScalarTo<T>(0);
|
|
2521
2656
|
}
|
|
2522
2657
|
}
|
|
2523
2658
|
return ret;
|
|
@@ -2662,88 +2797,26 @@ HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) {
|
|
|
2662
2797
|
|
|
2663
2798
|
// ------------------------------ WidenMulPairwiseAdd
|
|
2664
2799
|
|
|
2665
|
-
template <class
|
|
2666
|
-
HWY_API VFromD<
|
|
2667
|
-
|
|
2668
|
-
|
|
2669
|
-
const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32
|
|
2670
|
-
// Avoid ZipLower/Upper so this also works on big-endian systems.
|
|
2671
|
-
const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
|
|
2672
|
-
const VU32 ao = And(BitCast(du32, a), odd);
|
|
2673
|
-
const VU32 be = ShiftLeft<16>(BitCast(du32, b));
|
|
2674
|
-
const VU32 bo = And(BitCast(du32, b), odd);
|
|
2675
|
-
return Mul(BitCast(df32, ae), BitCast(df32, be)) +
|
|
2676
|
-
Mul(BitCast(df32, ao), BitCast(df32, bo));
|
|
2800
|
+
template <class DF, HWY_IF_F32_D(DF), class VBF>
|
|
2801
|
+
HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
|
|
2802
|
+
return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b),
|
|
2803
|
+
Mul(PromoteOddTo(df, a), PromoteOddTo(df, b)));
|
|
2677
2804
|
}
|
|
2678
2805
|
|
|
2679
|
-
template <class D,
|
|
2680
|
-
HWY_API VFromD<D> WidenMulPairwiseAdd(D d32,
|
|
2681
|
-
|
|
2682
|
-
|
|
2683
|
-
const VI32 ae = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, a)));
|
|
2684
|
-
const VI32 be = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, b)));
|
|
2685
|
-
const VI32 ao = ShiftRight<16>(BitCast(d32, a));
|
|
2686
|
-
const VI32 bo = ShiftRight<16>(BitCast(d32, b));
|
|
2687
|
-
return Add(Mul(ae, be), Mul(ao, bo));
|
|
2688
|
-
}
|
|
2689
|
-
|
|
2690
|
-
template <class D, HWY_IF_U32_D(D), class VU16>
|
|
2691
|
-
HWY_API VFromD<D> WidenMulPairwiseAdd(D du32, VU16 a, VU16 b) {
|
|
2692
|
-
const auto lo16_mask = Set(du32, 0x0000FFFFu);
|
|
2693
|
-
|
|
2694
|
-
const auto a0 = And(BitCast(du32, a), lo16_mask);
|
|
2695
|
-
const auto b0 = And(BitCast(du32, b), lo16_mask);
|
|
2696
|
-
|
|
2697
|
-
const auto a1 = ShiftRight<16>(BitCast(du32, a));
|
|
2698
|
-
const auto b1 = ShiftRight<16>(BitCast(du32, b));
|
|
2699
|
-
|
|
2700
|
-
return Add(Mul(a0, b0), Mul(a1, b1));
|
|
2806
|
+
template <class D, HWY_IF_UI32_D(D), class V16>
|
|
2807
|
+
HWY_API VFromD<D> WidenMulPairwiseAdd(D d32, V16 a, V16 b) {
|
|
2808
|
+
return MulAdd(PromoteEvenTo(d32, a), PromoteEvenTo(d32, b),
|
|
2809
|
+
Mul(PromoteOddTo(d32, a), PromoteOddTo(d32, b)));
|
|
2701
2810
|
}
|
|
2702
2811
|
|
|
2703
2812
|
// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
|
|
2704
2813
|
|
|
2705
|
-
template <class D,
|
|
2706
|
-
HWY_API VFromD<D> ReorderWidenMulAccumulate(D
|
|
2707
|
-
const
|
|
2708
|
-
|
|
2709
|
-
|
|
2710
|
-
|
|
2711
|
-
const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32
|
|
2712
|
-
// Avoid ZipLower/Upper so this also works on big-endian systems.
|
|
2713
|
-
const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
|
|
2714
|
-
const VU32 ao = And(BitCast(du32, a), odd);
|
|
2715
|
-
const VU32 be = ShiftLeft<16>(BitCast(du32, b));
|
|
2716
|
-
const VU32 bo = And(BitCast(du32, b), odd);
|
|
2717
|
-
sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
|
|
2718
|
-
return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
|
|
2719
|
-
}
|
|
2720
|
-
|
|
2721
|
-
template <class D, HWY_IF_I32_D(D), size_t N, class VI16>
|
|
2722
|
-
HWY_API VFromD<D> ReorderWidenMulAccumulate(D d32, VI16 a, VI16 b,
|
|
2723
|
-
const Vec128<int32_t, N> sum0,
|
|
2724
|
-
Vec128<int32_t, N>& sum1) {
|
|
2725
|
-
using VI32 = VFromD<decltype(d32)>;
|
|
2726
|
-
// Manual sign extension requires two shifts for even lanes.
|
|
2727
|
-
const VI32 ae = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, a)));
|
|
2728
|
-
const VI32 be = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, b)));
|
|
2729
|
-
const VI32 ao = ShiftRight<16>(BitCast(d32, a));
|
|
2730
|
-
const VI32 bo = ShiftRight<16>(BitCast(d32, b));
|
|
2731
|
-
sum1 = Add(Mul(ao, bo), sum1);
|
|
2732
|
-
return Add(Mul(ae, be), sum0);
|
|
2733
|
-
}
|
|
2734
|
-
|
|
2735
|
-
template <class D, HWY_IF_U32_D(D), size_t N, class VU16>
|
|
2736
|
-
HWY_API VFromD<D> ReorderWidenMulAccumulate(D du32, VU16 a, VU16 b,
|
|
2737
|
-
const Vec128<uint32_t, N> sum0,
|
|
2738
|
-
Vec128<uint32_t, N>& sum1) {
|
|
2739
|
-
using VU32 = VFromD<decltype(du32)>;
|
|
2740
|
-
const VU32 lo16_mask = Set(du32, uint32_t{0x0000FFFFu});
|
|
2741
|
-
const VU32 ae = And(BitCast(du32, a), lo16_mask);
|
|
2742
|
-
const VU32 be = And(BitCast(du32, b), lo16_mask);
|
|
2743
|
-
const VU32 ao = ShiftRight<16>(BitCast(du32, a));
|
|
2744
|
-
const VU32 bo = ShiftRight<16>(BitCast(du32, b));
|
|
2745
|
-
sum1 = Add(Mul(ao, bo), sum1);
|
|
2746
|
-
return Add(Mul(ae, be), sum0);
|
|
2814
|
+
template <class D, HWY_IF_UI32_D(D), class V16>
|
|
2815
|
+
HWY_API VFromD<D> ReorderWidenMulAccumulate(D d32, V16 a, V16 b,
|
|
2816
|
+
const VFromD<D> sum0,
|
|
2817
|
+
VFromD<D>& sum1) {
|
|
2818
|
+
sum1 = MulAdd(PromoteOddTo(d32, a), PromoteOddTo(d32, b), sum1);
|
|
2819
|
+
return MulAdd(PromoteEvenTo(d32, a), PromoteEvenTo(d32, b), sum0);
|
|
2747
2820
|
}
|
|
2748
2821
|
|
|
2749
2822
|
// ------------------------------ RearrangeToOddPlusEven
|
|
@@ -2754,15 +2827,13 @@ HWY_API VW RearrangeToOddPlusEven(VW sum0, VW sum1) {
|
|
|
2754
2827
|
|
|
2755
2828
|
// ================================================== REDUCTIONS
|
|
2756
2829
|
|
|
2757
|
-
|
|
2758
|
-
|
|
2759
|
-
|
|
2760
|
-
|
|
2761
|
-
|
|
2762
|
-
|
|
2763
|
-
|
|
2764
|
-
}
|
|
2765
|
-
template <class D, typename T = TFromD<D>>
|
|
2830
|
+
#ifdef HWY_NATIVE_REDUCE_SCALAR
|
|
2831
|
+
#undef HWY_NATIVE_REDUCE_SCALAR
|
|
2832
|
+
#else
|
|
2833
|
+
#define HWY_NATIVE_REDUCE_SCALAR
|
|
2834
|
+
#endif
|
|
2835
|
+
|
|
2836
|
+
template <class D, typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
|
|
2766
2837
|
HWY_API T ReduceSum(D d, VFromD<D> v) {
|
|
2767
2838
|
T sum = T{0};
|
|
2768
2839
|
for (size_t i = 0; i < MaxLanes(d); ++i) {
|
|
@@ -2770,39 +2841,56 @@ HWY_API T ReduceSum(D d, VFromD<D> v) {
|
|
|
2770
2841
|
}
|
|
2771
2842
|
return sum;
|
|
2772
2843
|
}
|
|
2773
|
-
template <class D, typename T = TFromD<D
|
|
2774
|
-
HWY_API
|
|
2844
|
+
template <class D, typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
|
|
2845
|
+
HWY_API T ReduceMin(D d, VFromD<D> v) {
|
|
2775
2846
|
T min = HighestValue<T>();
|
|
2776
2847
|
for (size_t i = 0; i < MaxLanes(d); ++i) {
|
|
2777
2848
|
min = HWY_MIN(min, v.raw[i]);
|
|
2778
2849
|
}
|
|
2779
|
-
return
|
|
2850
|
+
return min;
|
|
2780
2851
|
}
|
|
2781
|
-
template <class D, typename T = TFromD<D
|
|
2782
|
-
HWY_API
|
|
2852
|
+
template <class D, typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
|
|
2853
|
+
HWY_API T ReduceMax(D d, VFromD<D> v) {
|
|
2783
2854
|
T max = LowestValue<T>();
|
|
2784
2855
|
for (size_t i = 0; i < MaxLanes(d); ++i) {
|
|
2785
2856
|
max = HWY_MAX(max, v.raw[i]);
|
|
2786
2857
|
}
|
|
2787
|
-
return
|
|
2858
|
+
return max;
|
|
2859
|
+
}
|
|
2860
|
+
|
|
2861
|
+
// ------------------------------ SumOfLanes
|
|
2862
|
+
|
|
2863
|
+
template <class D, HWY_IF_LANES_GT_D(D, 1)>
|
|
2864
|
+
HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
|
|
2865
|
+
return Set(d, ReduceSum(d, v));
|
|
2866
|
+
}
|
|
2867
|
+
template <class D, HWY_IF_LANES_GT_D(D, 1)>
|
|
2868
|
+
HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
|
|
2869
|
+
return Set(d, ReduceMin(d, v));
|
|
2870
|
+
}
|
|
2871
|
+
template <class D, HWY_IF_LANES_GT_D(D, 1)>
|
|
2872
|
+
HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
|
|
2873
|
+
return Set(d, ReduceMax(d, v));
|
|
2788
2874
|
}
|
|
2789
2875
|
|
|
2790
2876
|
// ================================================== OPS WITH DEPENDENCIES
|
|
2791
2877
|
|
|
2792
2878
|
// ------------------------------ MulEven/Odd 64x64 (UpperHalf)
|
|
2793
2879
|
|
|
2794
|
-
|
|
2795
|
-
|
|
2880
|
+
template <class T, HWY_IF_UI64(T)>
|
|
2881
|
+
HWY_API Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) {
|
|
2882
|
+
alignas(16) T mul[2];
|
|
2796
2883
|
mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]);
|
|
2797
|
-
return Load(Full128<
|
|
2884
|
+
return Load(Full128<T>(), mul);
|
|
2798
2885
|
}
|
|
2799
2886
|
|
|
2800
|
-
|
|
2801
|
-
|
|
2802
|
-
|
|
2887
|
+
template <class T, HWY_IF_UI64(T)>
|
|
2888
|
+
HWY_API Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) {
|
|
2889
|
+
alignas(16) T mul[2];
|
|
2890
|
+
const Half<Full128<T>> d2;
|
|
2803
2891
|
mul[0] =
|
|
2804
2892
|
Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]);
|
|
2805
|
-
return Load(Full128<
|
|
2893
|
+
return Load(Full128<T>(), mul);
|
|
2806
2894
|
}
|
|
2807
2895
|
|
|
2808
2896
|
// NOLINTNEXTLINE(google-readability-namespace-comments)
|