@img/sharp-libvips-dev 1.0.1 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -2
- package/include/aom/aom_decoder.h +1 -1
- package/include/aom/aom_encoder.h +7 -1
- package/include/aom/aom_image.h +24 -12
- package/include/aom/aom_integer.h +3 -3
- package/include/aom/aomcx.h +15 -0
- package/include/aom/aomdx.h +5 -2
- package/include/archive.h +7 -5
- package/include/archive_entry.h +5 -3
- package/include/cgif.h +3 -0
- package/include/expat.h +21 -10
- package/include/expat_config.h +11 -5
- package/include/ffi.h +12 -25
- package/include/freetype2/freetype/config/ftoption.h +2 -2
- package/include/fribidi/fribidi-config.h +2 -2
- package/include/fribidi/fribidi-unicode-version.h +3 -3
- package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -2
- package/include/glib-2.0/gio/gappinfo.h +40 -25
- package/include/glib-2.0/gio/gapplication.h +6 -0
- package/include/glib-2.0/gio/gasyncresult.h +1 -1
- package/include/glib-2.0/gio/gconverter.h +5 -0
- package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
- package/include/glib-2.0/gio/gfile.h +16 -0
- package/include/glib-2.0/gio/gio-visibility.h +34 -0
- package/include/glib-2.0/gio/giotypes.h +0 -1
- package/include/glib-2.0/gio/gsettings.h +8 -0
- package/include/glib-2.0/gio/gvfs.h +2 -2
- package/include/glib-2.0/girepository/gi-visibility.h +34 -0
- package/include/glib-2.0/girepository/giarginfo.h +23 -6
- package/include/glib-2.0/girepository/gibaseinfo.h +44 -18
- package/include/glib-2.0/girepository/gicallableinfo.h +26 -16
- package/include/glib-2.0/girepository/gicallbackinfo.h +17 -2
- package/include/glib-2.0/girepository/giconstantinfo.h +19 -4
- package/include/glib-2.0/girepository/gienuminfo.h +20 -21
- package/include/glib-2.0/girepository/gifieldinfo.h +22 -7
- package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
- package/include/glib-2.0/girepository/gifunctioninfo.h +22 -7
- package/include/glib-2.0/girepository/giinterfaceinfo.h +33 -18
- package/include/glib-2.0/girepository/giobjectinfo.h +41 -26
- package/include/glib-2.0/girepository/gipropertyinfo.h +18 -3
- package/include/glib-2.0/girepository/giregisteredtypeinfo.h +22 -11
- package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
- package/include/glib-2.0/girepository/girepository.h +53 -62
- package/include/glib-2.0/girepository/girffi.h +8 -7
- package/include/glib-2.0/girepository/gisignalinfo.h +18 -3
- package/include/glib-2.0/girepository/gistructinfo.h +26 -11
- package/include/glib-2.0/girepository/gitypeinfo.h +29 -16
- package/include/glib-2.0/girepository/gitypelib.h +9 -13
- package/include/glib-2.0/girepository/gitypes.h +52 -104
- package/include/glib-2.0/girepository/giunioninfo.h +28 -12
- package/include/glib-2.0/girepository/giunresolvedinfo.h +17 -2
- package/include/glib-2.0/girepository/givalueinfo.h +65 -0
- package/include/glib-2.0/girepository/givfuncinfo.h +23 -8
- package/include/glib-2.0/glib/deprecated/gthread.h +9 -5
- package/include/glib-2.0/glib/gbitlock.h +31 -0
- package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
- package/include/glib-2.0/glib/giochannel.h +2 -2
- package/include/glib-2.0/glib/glib-visibility.h +34 -0
- package/include/glib-2.0/glib/gmacros.h +12 -5
- package/include/glib-2.0/glib/gmain.h +93 -7
- package/include/glib-2.0/glib/gmessages.h +8 -0
- package/include/glib-2.0/glib/gqsort.h +8 -1
- package/include/glib-2.0/glib/gslice.h +2 -0
- package/include/glib-2.0/glib/gstrfuncs.h +24 -30
- package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
- package/include/glib-2.0/glib/gthread.h +191 -3
- package/include/glib-2.0/glib/gunicode.h +1 -1
- package/include/glib-2.0/glib/gversionmacros.h +9 -0
- package/include/glib-2.0/glib-unix.h +7 -1
- package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
- package/include/glib-2.0/gobject/genums.h +6 -6
- package/include/glib-2.0/gobject/glib-types.h +11 -0
- package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
- package/include/glib-2.0/gobject/gsignal.h +16 -6
- package/include/glib-2.0/gobject/gtype.h +6 -6
- package/include/harfbuzz/hb-buffer.h +6 -0
- package/include/harfbuzz/hb-common.h +6 -9
- package/include/harfbuzz/hb-cplusplus.hh +8 -11
- package/include/harfbuzz/hb-subset.h +17 -4
- package/include/harfbuzz/hb-version.h +3 -3
- package/include/hwy/abort.h +28 -0
- package/include/hwy/aligned_allocator.h +218 -6
- package/include/hwy/base.h +1935 -512
- package/include/hwy/cache_control.h +24 -6
- package/include/hwy/detect_compiler_arch.h +105 -10
- package/include/hwy/detect_targets.h +146 -37
- package/include/hwy/foreach_target.h +36 -1
- package/include/hwy/highway.h +222 -50
- package/include/hwy/ops/arm_neon-inl.h +2055 -894
- package/include/hwy/ops/arm_sve-inl.h +1476 -348
- package/include/hwy/ops/emu128-inl.h +711 -623
- package/include/hwy/ops/generic_ops-inl.h +4431 -2157
- package/include/hwy/ops/inside-inl.h +691 -0
- package/include/hwy/ops/ppc_vsx-inl.h +2186 -673
- package/include/hwy/ops/rvv-inl.h +1556 -536
- package/include/hwy/ops/scalar-inl.h +353 -233
- package/include/hwy/ops/set_macros-inl.h +171 -23
- package/include/hwy/ops/shared-inl.h +198 -56
- package/include/hwy/ops/wasm_128-inl.h +283 -244
- package/include/hwy/ops/x86_128-inl.h +3673 -1357
- package/include/hwy/ops/x86_256-inl.h +1737 -663
- package/include/hwy/ops/x86_512-inl.h +1697 -500
- package/include/hwy/per_target.h +4 -0
- package/include/hwy/profiler.h +648 -0
- package/include/hwy/robust_statistics.h +2 -2
- package/include/hwy/targets.h +40 -32
- package/include/hwy/timer-inl.h +3 -3
- package/include/hwy/timer.h +16 -1
- package/include/libheif/heif.h +170 -15
- package/include/libheif/heif_items.h +237 -0
- package/include/libheif/heif_properties.h +38 -2
- package/include/libheif/heif_regions.h +1 -1
- package/include/libheif/heif_version.h +2 -2
- package/include/libpng16/png.h +32 -29
- package/include/libpng16/pngconf.h +2 -2
- package/include/libpng16/pnglibconf.h +8 -3
- package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
- package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
- package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
- package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
- package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
- package/include/libxml2/libxml/HTMLparser.h +12 -19
- package/include/libxml2/libxml/c14n.h +1 -12
- package/include/libxml2/libxml/debugXML.h +1 -1
- package/include/libxml2/libxml/encoding.h +9 -0
- package/include/libxml2/libxml/entities.h +12 -1
- package/include/libxml2/libxml/hash.h +19 -0
- package/include/libxml2/libxml/list.h +2 -2
- package/include/libxml2/libxml/nanohttp.h +17 -0
- package/include/libxml2/libxml/parser.h +73 -58
- package/include/libxml2/libxml/parserInternals.h +9 -1
- package/include/libxml2/libxml/pattern.h +6 -0
- package/include/libxml2/libxml/tree.h +32 -12
- package/include/libxml2/libxml/uri.h +11 -0
- package/include/libxml2/libxml/valid.h +29 -2
- package/include/libxml2/libxml/xinclude.h +7 -0
- package/include/libxml2/libxml/xmlIO.h +21 -5
- package/include/libxml2/libxml/xmlerror.h +14 -0
- package/include/libxml2/libxml/xmlexports.h +111 -15
- package/include/libxml2/libxml/xmlmemory.h +8 -45
- package/include/libxml2/libxml/xmlreader.h +2 -0
- package/include/libxml2/libxml/xmlsave.h +5 -0
- package/include/libxml2/libxml/xmlunicode.h +165 -1
- package/include/libxml2/libxml/xmlversion.h +15 -179
- package/include/libxml2/libxml/xmlwriter.h +1 -0
- package/include/libxml2/libxml/xpath.h +4 -0
- package/include/pango-1.0/pango/pango-features.h +2 -2
- package/include/pango-1.0/pango/pango-fontmap.h +7 -0
- package/include/pango-1.0/pango/pango-item.h +4 -2
- package/include/pango-1.0/pango/pango-version-macros.h +25 -0
- package/include/pango-1.0/pango/pangofc-font.h +2 -1
- package/include/pixman-1/pixman-version.h +2 -2
- package/include/png.h +32 -29
- package/include/pngconf.h +2 -2
- package/include/pnglibconf.h +8 -3
- package/include/vips/connection.h +9 -3
- package/include/vips/util.h +1 -11
- package/include/vips/version.h +4 -4
- package/include/webp/decode.h +58 -56
- package/include/webp/demux.h +25 -21
- package/include/webp/encode.h +44 -39
- package/include/webp/mux.h +76 -15
- package/include/webp/mux_types.h +2 -1
- package/include/webp/sharpyuv/sharpyuv.h +77 -8
- package/include/webp/types.h +29 -8
- package/include/zconf.h +1 -1
- package/include/zlib.h +12 -12
- package/package.json +1 -1
- package/versions.json +18 -19
|
@@ -13,9 +13,15 @@
|
|
|
13
13
|
// See the License for the specific language governing permissions and
|
|
14
14
|
// limitations under the License.
|
|
15
15
|
|
|
16
|
-
// 128-bit vectors for VSX
|
|
16
|
+
// 128-bit vectors for VSX/Z14
|
|
17
17
|
// External include guard in highway.h - see comment there.
|
|
18
18
|
|
|
19
|
+
#if HWY_TARGET == HWY_Z14 || HWY_TARGET == HWY_Z15
|
|
20
|
+
#define HWY_S390X_HAVE_Z14 1
|
|
21
|
+
#else
|
|
22
|
+
#define HWY_S390X_HAVE_Z14 0
|
|
23
|
+
#endif
|
|
24
|
+
|
|
19
25
|
#pragma push_macro("vector")
|
|
20
26
|
#pragma push_macro("pixel")
|
|
21
27
|
#pragma push_macro("bool")
|
|
@@ -24,7 +30,11 @@
|
|
|
24
30
|
#undef pixel
|
|
25
31
|
#undef bool
|
|
26
32
|
|
|
33
|
+
#if HWY_S390X_HAVE_Z14
|
|
34
|
+
#include <vecintrin.h>
|
|
35
|
+
#else
|
|
27
36
|
#include <altivec.h>
|
|
37
|
+
#endif
|
|
28
38
|
|
|
29
39
|
#pragma pop_macro("vector")
|
|
30
40
|
#pragma pop_macro("pixel")
|
|
@@ -37,20 +47,26 @@
|
|
|
37
47
|
// This means we can only use POWER10-specific intrinsics in static dispatch
|
|
38
48
|
// mode (where the -mpower10-vector compiler flag is passed). Same for PPC9.
|
|
39
49
|
// On other compilers, the usual target check is sufficient.
|
|
40
|
-
#if HWY_TARGET <= HWY_PPC9 && \
|
|
50
|
+
#if !HWY_S390X_HAVE_Z14 && HWY_TARGET <= HWY_PPC9 && \
|
|
41
51
|
(defined(_ARCH_PWR9) || defined(__POWER9_VECTOR__))
|
|
42
52
|
#define HWY_PPC_HAVE_9 1
|
|
43
53
|
#else
|
|
44
54
|
#define HWY_PPC_HAVE_9 0
|
|
45
55
|
#endif
|
|
46
56
|
|
|
47
|
-
#if HWY_TARGET <= HWY_PPC10 && \
|
|
57
|
+
#if !HWY_S390X_HAVE_Z14 && HWY_TARGET <= HWY_PPC10 && \
|
|
48
58
|
(defined(_ARCH_PWR10) || defined(__POWER10_VECTOR__))
|
|
49
59
|
#define HWY_PPC_HAVE_10 1
|
|
50
60
|
#else
|
|
51
61
|
#define HWY_PPC_HAVE_10 0
|
|
52
62
|
#endif
|
|
53
63
|
|
|
64
|
+
#if HWY_S390X_HAVE_Z14 && HWY_TARGET <= HWY_Z15 && __ARCH__ >= 13
|
|
65
|
+
#define HWY_S390X_HAVE_Z15 1
|
|
66
|
+
#else
|
|
67
|
+
#define HWY_S390X_HAVE_Z15 0
|
|
68
|
+
#endif
|
|
69
|
+
|
|
54
70
|
HWY_BEFORE_NAMESPACE();
|
|
55
71
|
namespace hwy {
|
|
56
72
|
namespace HWY_NAMESPACE {
|
|
@@ -125,6 +141,9 @@ class Vec128 {
|
|
|
125
141
|
HWY_INLINE Vec128& operator-=(const Vec128 other) {
|
|
126
142
|
return *this = (*this - other);
|
|
127
143
|
}
|
|
144
|
+
HWY_INLINE Vec128& operator%=(const Vec128 other) {
|
|
145
|
+
return *this = (*this % other);
|
|
146
|
+
}
|
|
128
147
|
HWY_INLINE Vec128& operator&=(const Vec128 other) {
|
|
129
148
|
return *this = (*this & other);
|
|
130
149
|
}
|
|
@@ -180,9 +199,6 @@ HWY_API Vec128<T, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
|
|
|
180
199
|
template <class D>
|
|
181
200
|
using VFromD = decltype(Zero(D()));
|
|
182
201
|
|
|
183
|
-
// ------------------------------ Tuple (VFromD)
|
|
184
|
-
#include "hwy/ops/tuple-inl.h"
|
|
185
|
-
|
|
186
202
|
// ------------------------------ BitCast
|
|
187
203
|
|
|
188
204
|
template <class D, typename FromT>
|
|
@@ -215,6 +231,12 @@ HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
|
|
|
215
231
|
return VFromD<D>{vec_splats(static_cast<RawLane>(t))};
|
|
216
232
|
}
|
|
217
233
|
|
|
234
|
+
template <class D, HWY_IF_SPECIAL_FLOAT(TFromD<D>)>
|
|
235
|
+
HWY_API VFromD<D> Set(D d, TFromD<D> t) {
|
|
236
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
237
|
+
return BitCast(d, Set(du, BitCastScalar<TFromD<decltype(du)>>(t)));
|
|
238
|
+
}
|
|
239
|
+
|
|
218
240
|
// Returns a vector with uninitialized elements.
|
|
219
241
|
template <class D>
|
|
220
242
|
HWY_API VFromD<D> Undefined(D d) {
|
|
@@ -222,6 +244,8 @@ HWY_API VFromD<D> Undefined(D d) {
|
|
|
222
244
|
// Suppressing maybe-uninitialized both here and at the caller does not work,
|
|
223
245
|
// so initialize.
|
|
224
246
|
return Zero(d);
|
|
247
|
+
#elif HWY_HAS_BUILTIN(__builtin_nondeterministic_value)
|
|
248
|
+
return VFromD<D>{__builtin_nondeterministic_value(Zero(d).raw)};
|
|
225
249
|
#else
|
|
226
250
|
HWY_DIAGNOSTICS(push)
|
|
227
251
|
HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
|
|
@@ -240,6 +264,58 @@ HWY_API T GetLane(Vec128<T, N> v) {
|
|
|
240
264
|
return static_cast<T>(v.raw[0]);
|
|
241
265
|
}
|
|
242
266
|
|
|
267
|
+
// ------------------------------ Dup128VecFromValues
|
|
268
|
+
|
|
269
|
+
template <class D, HWY_IF_T_SIZE_D(D, 1)>
|
|
270
|
+
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
|
|
271
|
+
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
|
|
272
|
+
TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
|
|
273
|
+
TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
|
|
274
|
+
TFromD<D> t11, TFromD<D> t12,
|
|
275
|
+
TFromD<D> t13, TFromD<D> t14,
|
|
276
|
+
TFromD<D> t15) {
|
|
277
|
+
const typename detail::Raw128<TFromD<D>>::type raw = {
|
|
278
|
+
t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15};
|
|
279
|
+
return VFromD<D>{raw};
|
|
280
|
+
}
|
|
281
|
+
|
|
282
|
+
template <class D, HWY_IF_UI16_D(D)>
|
|
283
|
+
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
|
|
284
|
+
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
|
|
285
|
+
TFromD<D> t5, TFromD<D> t6,
|
|
286
|
+
TFromD<D> t7) {
|
|
287
|
+
const typename detail::Raw128<TFromD<D>>::type raw = {t0, t1, t2, t3,
|
|
288
|
+
t4, t5, t6, t7};
|
|
289
|
+
return VFromD<D>{raw};
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
template <class D, HWY_IF_SPECIAL_FLOAT_D(D)>
|
|
293
|
+
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
|
|
294
|
+
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
|
|
295
|
+
TFromD<D> t5, TFromD<D> t6,
|
|
296
|
+
TFromD<D> t7) {
|
|
297
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
298
|
+
return BitCast(
|
|
299
|
+
d, Dup128VecFromValues(
|
|
300
|
+
du, BitCastScalar<uint16_t>(t0), BitCastScalar<uint16_t>(t1),
|
|
301
|
+
BitCastScalar<uint16_t>(t2), BitCastScalar<uint16_t>(t3),
|
|
302
|
+
BitCastScalar<uint16_t>(t4), BitCastScalar<uint16_t>(t5),
|
|
303
|
+
BitCastScalar<uint16_t>(t6), BitCastScalar<uint16_t>(t7)));
|
|
304
|
+
}
|
|
305
|
+
|
|
306
|
+
template <class D, HWY_IF_T_SIZE_D(D, 4)>
|
|
307
|
+
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
|
|
308
|
+
TFromD<D> t2, TFromD<D> t3) {
|
|
309
|
+
const typename detail::Raw128<TFromD<D>>::type raw = {t0, t1, t2, t3};
|
|
310
|
+
return VFromD<D>{raw};
|
|
311
|
+
}
|
|
312
|
+
|
|
313
|
+
template <class D, HWY_IF_T_SIZE_D(D, 8)>
|
|
314
|
+
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
|
|
315
|
+
const typename detail::Raw128<TFromD<D>>::type raw = {t0, t1};
|
|
316
|
+
return VFromD<D>{raw};
|
|
317
|
+
}
|
|
318
|
+
|
|
243
319
|
// ================================================== LOGICAL
|
|
244
320
|
|
|
245
321
|
// ------------------------------ And
|
|
@@ -249,7 +325,11 @@ HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
|
|
|
249
325
|
const DFromV<decltype(a)> d;
|
|
250
326
|
const RebindToUnsigned<decltype(d)> du;
|
|
251
327
|
using VU = VFromD<decltype(du)>;
|
|
328
|
+
#if HWY_S390X_HAVE_Z14
|
|
329
|
+
return BitCast(d, VU{BitCast(du, a).raw & BitCast(du, b).raw});
|
|
330
|
+
#else
|
|
252
331
|
return BitCast(d, VU{vec_and(BitCast(du, a).raw, BitCast(du, b).raw)});
|
|
332
|
+
#endif
|
|
253
333
|
}
|
|
254
334
|
|
|
255
335
|
// ------------------------------ AndNot
|
|
@@ -271,7 +351,11 @@ HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
|
|
|
271
351
|
const DFromV<decltype(a)> d;
|
|
272
352
|
const RebindToUnsigned<decltype(d)> du;
|
|
273
353
|
using VU = VFromD<decltype(du)>;
|
|
354
|
+
#if HWY_S390X_HAVE_Z14
|
|
355
|
+
return BitCast(d, VU{BitCast(du, a).raw | BitCast(du, b).raw});
|
|
356
|
+
#else
|
|
274
357
|
return BitCast(d, VU{vec_or(BitCast(du, a).raw, BitCast(du, b).raw)});
|
|
358
|
+
#endif
|
|
275
359
|
}
|
|
276
360
|
|
|
277
361
|
// ------------------------------ Xor
|
|
@@ -281,7 +365,11 @@ HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
|
|
|
281
365
|
const DFromV<decltype(a)> d;
|
|
282
366
|
const RebindToUnsigned<decltype(d)> du;
|
|
283
367
|
using VU = VFromD<decltype(du)>;
|
|
368
|
+
#if HWY_S390X_HAVE_Z14
|
|
369
|
+
return BitCast(d, VU{BitCast(du, a).raw ^ BitCast(du, b).raw});
|
|
370
|
+
#else
|
|
284
371
|
return BitCast(d, VU{vec_xor(BitCast(du, a).raw, BitCast(du, b).raw)});
|
|
372
|
+
#endif
|
|
285
373
|
}
|
|
286
374
|
|
|
287
375
|
// ------------------------------ Not
|
|
@@ -476,9 +564,21 @@ HWY_API Vec128<T, N> operator^(Vec128<T, N> a, Vec128<T, N> b) {
|
|
|
476
564
|
|
|
477
565
|
// ------------------------------ Neg
|
|
478
566
|
|
|
479
|
-
template <typename T, size_t N,
|
|
480
|
-
|
|
567
|
+
template <typename T, size_t N, HWY_IF_SIGNED(T)>
|
|
568
|
+
HWY_API Vec128<T, N> Neg(Vec128<T, N> v) {
|
|
569
|
+
// If T is an signed integer type, use Zero(d) - v instead of vec_neg to
|
|
570
|
+
// avoid undefined behavior in the case where v[i] == LimitsMin<T>()
|
|
571
|
+
const DFromV<decltype(v)> d;
|
|
572
|
+
return Zero(d) - v;
|
|
573
|
+
}
|
|
574
|
+
|
|
575
|
+
template <typename T, size_t N, HWY_IF_FLOAT3264(T)>
|
|
576
|
+
HWY_API Vec128<T, N> Neg(Vec128<T, N> v) {
|
|
577
|
+
#if HWY_S390X_HAVE_Z14
|
|
578
|
+
return Xor(v, SignBit(DFromV<decltype(v)>()));
|
|
579
|
+
#else
|
|
481
580
|
return Vec128<T, N>{vec_neg(v.raw)};
|
|
581
|
+
#endif
|
|
482
582
|
}
|
|
483
583
|
|
|
484
584
|
template <typename T, size_t N, HWY_IF_SPECIAL_FLOAT(T)>
|
|
@@ -489,13 +589,40 @@ HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
|
|
|
489
589
|
// ------------------------------ Abs
|
|
490
590
|
|
|
491
591
|
// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
|
|
492
|
-
template <class T, size_t N,
|
|
592
|
+
template <class T, size_t N, HWY_IF_SIGNED(T)>
|
|
593
|
+
HWY_API Vec128<T, N> Abs(Vec128<T, N> v) {
|
|
594
|
+
// If T is a signed integer type, use Max(v, Neg(v)) instead of vec_abs to
|
|
595
|
+
// avoid undefined behavior in the case where v[i] == LimitsMin<T>().
|
|
596
|
+
return Max(v, Neg(v));
|
|
597
|
+
}
|
|
598
|
+
|
|
599
|
+
template <class T, size_t N, HWY_IF_FLOAT3264(T)>
|
|
493
600
|
HWY_API Vec128<T, N> Abs(Vec128<T, N> v) {
|
|
494
601
|
return Vec128<T, N>{vec_abs(v.raw)};
|
|
495
602
|
}
|
|
496
603
|
|
|
497
604
|
// ------------------------------ CopySign
|
|
498
605
|
|
|
606
|
+
#if HWY_S390X_HAVE_Z14
|
|
607
|
+
template <class V>
|
|
608
|
+
HWY_API V CopySign(const V magn, const V sign) {
|
|
609
|
+
static_assert(IsFloat<TFromV<V>>(), "Only makes sense for floating-point");
|
|
610
|
+
|
|
611
|
+
const DFromV<decltype(magn)> d;
|
|
612
|
+
const auto msb = SignBit(d);
|
|
613
|
+
|
|
614
|
+
// Truth table for msb, magn, sign | bitwise msb ? sign : mag
|
|
615
|
+
// 0 0 0 | 0
|
|
616
|
+
// 0 0 1 | 0
|
|
617
|
+
// 0 1 0 | 1
|
|
618
|
+
// 0 1 1 | 1
|
|
619
|
+
// 1 0 0 | 0
|
|
620
|
+
// 1 0 1 | 1
|
|
621
|
+
// 1 1 0 | 0
|
|
622
|
+
// 1 1 1 | 1
|
|
623
|
+
return BitwiseIfThenElse(msb, sign, magn);
|
|
624
|
+
}
|
|
625
|
+
#else // VSX
|
|
499
626
|
template <size_t N>
|
|
500
627
|
HWY_API Vec128<float, N> CopySign(Vec128<float, N> magn,
|
|
501
628
|
Vec128<float, N> sign) {
|
|
@@ -525,6 +652,7 @@ HWY_API Vec128<double, N> CopySign(Vec128<double, N> magn,
|
|
|
525
652
|
return Vec128<double, N>{vec_cpsgn(sign.raw, magn.raw)};
|
|
526
653
|
#endif
|
|
527
654
|
}
|
|
655
|
+
#endif // HWY_S390X_HAVE_Z14
|
|
528
656
|
|
|
529
657
|
template <typename T, size_t N>
|
|
530
658
|
HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) {
|
|
@@ -542,10 +670,21 @@ HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) {
|
|
|
542
670
|
|
|
543
671
|
template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
|
|
544
672
|
HWY_API Vec128<T> Load(D /* tag */, const T* HWY_RESTRICT aligned) {
|
|
673
|
+
// Suppress the ignoring attributes warning that is generated by
|
|
674
|
+
// HWY_RCAST_ALIGNED(const LoadRaw*, aligned) with GCC
|
|
675
|
+
#if HWY_COMPILER_GCC
|
|
676
|
+
HWY_DIAGNOSTICS(push)
|
|
677
|
+
HWY_DIAGNOSTICS_OFF(disable : 4649, ignored "-Wignored-attributes")
|
|
678
|
+
#endif
|
|
679
|
+
|
|
545
680
|
using LoadRaw = typename detail::Raw128<T>::AlignedRawVec;
|
|
546
|
-
const LoadRaw* HWY_RESTRICT p =
|
|
681
|
+
const LoadRaw* HWY_RESTRICT p = HWY_RCAST_ALIGNED(const LoadRaw*, aligned);
|
|
547
682
|
using ResultRaw = typename detail::Raw128<T>::type;
|
|
548
683
|
return Vec128<T>{reinterpret_cast<ResultRaw>(*p)};
|
|
684
|
+
|
|
685
|
+
#if HWY_COMPILER_GCC
|
|
686
|
+
HWY_DIAGNOSTICS(pop)
|
|
687
|
+
#endif
|
|
549
688
|
}
|
|
550
689
|
|
|
551
690
|
// Any <= 64 bit
|
|
@@ -598,19 +737,13 @@ HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
|
|
|
598
737
|
// mask ? yes : 0
|
|
599
738
|
template <typename T, size_t N>
|
|
600
739
|
HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
|
|
601
|
-
|
|
602
|
-
const RebindToUnsigned<decltype(d)> du;
|
|
603
|
-
return BitCast(d,
|
|
604
|
-
VFromD<decltype(du)>{vec_and(BitCast(du, yes).raw, mask.raw)});
|
|
740
|
+
return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
|
|
605
741
|
}
|
|
606
742
|
|
|
607
743
|
// mask ? 0 : no
|
|
608
744
|
template <typename T, size_t N>
|
|
609
745
|
HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
|
|
610
|
-
|
|
611
|
-
const RebindToUnsigned<decltype(d)> du;
|
|
612
|
-
return BitCast(d,
|
|
613
|
-
VFromD<decltype(du)>{vec_andc(BitCast(du, no).raw, mask.raw)});
|
|
746
|
+
return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
|
|
614
747
|
}
|
|
615
748
|
|
|
616
749
|
// ------------------------------ Mask logical
|
|
@@ -622,7 +755,11 @@ HWY_API Mask128<T, N> Not(Mask128<T, N> m) {
|
|
|
622
755
|
|
|
623
756
|
template <typename T, size_t N>
|
|
624
757
|
HWY_API Mask128<T, N> And(Mask128<T, N> a, Mask128<T, N> b) {
|
|
758
|
+
#if HWY_S390X_HAVE_Z14
|
|
759
|
+
return Mask128<T, N>{a.raw & b.raw};
|
|
760
|
+
#else
|
|
625
761
|
return Mask128<T, N>{vec_and(a.raw, b.raw)};
|
|
762
|
+
#endif
|
|
626
763
|
}
|
|
627
764
|
|
|
628
765
|
template <typename T, size_t N>
|
|
@@ -632,12 +769,20 @@ HWY_API Mask128<T, N> AndNot(Mask128<T, N> a, Mask128<T, N> b) {
|
|
|
632
769
|
|
|
633
770
|
template <typename T, size_t N>
|
|
634
771
|
HWY_API Mask128<T, N> Or(Mask128<T, N> a, Mask128<T, N> b) {
|
|
772
|
+
#if HWY_S390X_HAVE_Z14
|
|
773
|
+
return Mask128<T, N>{a.raw | b.raw};
|
|
774
|
+
#else
|
|
635
775
|
return Mask128<T, N>{vec_or(a.raw, b.raw)};
|
|
776
|
+
#endif
|
|
636
777
|
}
|
|
637
778
|
|
|
638
779
|
template <typename T, size_t N>
|
|
639
780
|
HWY_API Mask128<T, N> Xor(Mask128<T, N> a, Mask128<T, N> b) {
|
|
781
|
+
#if HWY_S390X_HAVE_Z14
|
|
782
|
+
return Mask128<T, N>{a.raw ^ b.raw};
|
|
783
|
+
#else
|
|
640
784
|
return Mask128<T, N>{vec_xor(a.raw, b.raw)};
|
|
785
|
+
#endif
|
|
641
786
|
}
|
|
642
787
|
|
|
643
788
|
template <typename T, size_t N>
|
|
@@ -645,36 +790,24 @@ HWY_API Mask128<T, N> ExclusiveNeither(Mask128<T, N> a, Mask128<T, N> b) {
|
|
|
645
790
|
return Mask128<T, N>{vec_nor(a.raw, b.raw)};
|
|
646
791
|
}
|
|
647
792
|
|
|
648
|
-
// ------------------------------ BroadcastSignBit
|
|
649
|
-
|
|
650
|
-
template <size_t N>
|
|
651
|
-
HWY_API Vec128<int8_t, N> BroadcastSignBit(Vec128<int8_t, N> v) {
|
|
652
|
-
return Vec128<int8_t, N>{
|
|
653
|
-
vec_sra(v.raw, vec_splats(static_cast<unsigned char>(7)))};
|
|
654
|
-
}
|
|
655
|
-
|
|
656
|
-
template <size_t N>
|
|
657
|
-
HWY_API Vec128<int16_t, N> BroadcastSignBit(Vec128<int16_t, N> v) {
|
|
658
|
-
return Vec128<int16_t, N>{
|
|
659
|
-
vec_sra(v.raw, vec_splats(static_cast<unsigned short>(15)))};
|
|
660
|
-
}
|
|
661
|
-
|
|
662
|
-
template <size_t N>
|
|
663
|
-
HWY_API Vec128<int32_t, N> BroadcastSignBit(Vec128<int32_t, N> v) {
|
|
664
|
-
return Vec128<int32_t, N>{vec_sra(v.raw, vec_splats(31u))};
|
|
665
|
-
}
|
|
666
|
-
|
|
667
|
-
template <size_t N>
|
|
668
|
-
HWY_API Vec128<int64_t, N> BroadcastSignBit(Vec128<int64_t, N> v) {
|
|
669
|
-
return Vec128<int64_t, N>{vec_sra(v.raw, vec_splats(63ULL))};
|
|
670
|
-
}
|
|
671
|
-
|
|
672
793
|
// ------------------------------ ShiftLeftSame
|
|
673
794
|
|
|
674
795
|
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
|
|
675
796
|
HWY_API Vec128<T, N> ShiftLeftSame(Vec128<T, N> v, const int bits) {
|
|
676
|
-
|
|
677
|
-
|
|
797
|
+
const DFromV<decltype(v)> d;
|
|
798
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
799
|
+
using TU = TFromD<decltype(du)>;
|
|
800
|
+
|
|
801
|
+
#if HWY_S390X_HAVE_Z14
|
|
802
|
+
return BitCast(d,
|
|
803
|
+
VFromD<decltype(du)>{BitCast(du, v).raw
|
|
804
|
+
<< Set(du, static_cast<TU>(bits)).raw});
|
|
805
|
+
#else
|
|
806
|
+
// Do an unsigned vec_sl operation to avoid undefined behavior
|
|
807
|
+
return BitCast(
|
|
808
|
+
d, VFromD<decltype(du)>{
|
|
809
|
+
vec_sl(BitCast(du, v).raw, Set(du, static_cast<TU>(bits)).raw)});
|
|
810
|
+
#endif
|
|
678
811
|
}
|
|
679
812
|
|
|
680
813
|
// ------------------------------ ShiftRightSame
|
|
@@ -682,13 +815,22 @@ HWY_API Vec128<T, N> ShiftLeftSame(Vec128<T, N> v, const int bits) {
|
|
|
682
815
|
template <typename T, size_t N, HWY_IF_UNSIGNED(T)>
|
|
683
816
|
HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, const int bits) {
|
|
684
817
|
using TU = typename detail::Raw128<MakeUnsigned<T>>::RawT;
|
|
818
|
+
#if HWY_S390X_HAVE_Z14
|
|
819
|
+
return Vec128<T, N>{v.raw >> vec_splats(static_cast<TU>(bits))};
|
|
820
|
+
#else
|
|
685
821
|
return Vec128<T, N>{vec_sr(v.raw, vec_splats(static_cast<TU>(bits)))};
|
|
822
|
+
#endif
|
|
686
823
|
}
|
|
687
824
|
|
|
688
825
|
template <typename T, size_t N, HWY_IF_SIGNED(T)>
|
|
689
826
|
HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, const int bits) {
|
|
827
|
+
#if HWY_S390X_HAVE_Z14
|
|
828
|
+
using TI = typename detail::Raw128<T>::RawT;
|
|
829
|
+
return Vec128<T, N>{v.raw >> vec_splats(static_cast<TI>(bits))};
|
|
830
|
+
#else
|
|
690
831
|
using TU = typename detail::Raw128<MakeUnsigned<T>>::RawT;
|
|
691
832
|
return Vec128<T, N>{vec_sra(v.raw, vec_splats(static_cast<TU>(bits)))};
|
|
833
|
+
#endif
|
|
692
834
|
}
|
|
693
835
|
|
|
694
836
|
// ------------------------------ ShiftLeft
|
|
@@ -707,6 +849,13 @@ HWY_API Vec128<T, N> ShiftRight(Vec128<T, N> v) {
|
|
|
707
849
|
return ShiftRightSame(v, kBits);
|
|
708
850
|
}
|
|
709
851
|
|
|
852
|
+
// ------------------------------ BroadcastSignBit
|
|
853
|
+
|
|
854
|
+
template <typename T, size_t N, HWY_IF_SIGNED(T)>
|
|
855
|
+
HWY_API Vec128<T, N> BroadcastSignBit(Vec128<T, N> v) {
|
|
856
|
+
return ShiftRightSame(v, static_cast<int>(sizeof(T) * 8 - 1));
|
|
857
|
+
}
|
|
858
|
+
|
|
710
859
|
// ================================================== SWIZZLE (1)
|
|
711
860
|
|
|
712
861
|
// ------------------------------ TableLookupBytes
|
|
@@ -1003,7 +1152,7 @@ HWY_API VFromD<D> LoadDup128(D d, const T* HWY_RESTRICT p) {
|
|
|
1003
1152
|
return LoadU(d, p);
|
|
1004
1153
|
}
|
|
1005
1154
|
|
|
1006
|
-
#if HWY_PPC_HAVE_9
|
|
1155
|
+
#if (HWY_PPC_HAVE_9 && HWY_ARCH_PPC_64) || HWY_S390X_HAVE_Z14
|
|
1007
1156
|
#ifdef HWY_NATIVE_LOAD_N
|
|
1008
1157
|
#undef HWY_NATIVE_LOAD_N
|
|
1009
1158
|
#else
|
|
@@ -1027,11 +1176,20 @@ HWY_API VFromD<D> LoadN(D d, const T* HWY_RESTRICT p,
|
|
|
1027
1176
|
const size_t num_of_bytes_to_load =
|
|
1028
1177
|
HWY_MIN(max_lanes_to_load, HWY_MAX_LANES_D(D)) * sizeof(TFromD<D>);
|
|
1029
1178
|
const Repartition<uint8_t, decltype(d)> du8;
|
|
1179
|
+
#if HWY_S390X_HAVE_Z14
|
|
1180
|
+
return (num_of_bytes_to_load > 0)
|
|
1181
|
+
? BitCast(d, VFromD<decltype(du8)>{vec_load_len(
|
|
1182
|
+
const_cast<unsigned char*>(
|
|
1183
|
+
reinterpret_cast<const unsigned char*>(p)),
|
|
1184
|
+
static_cast<unsigned>(num_of_bytes_to_load - 1))})
|
|
1185
|
+
: Zero(d);
|
|
1186
|
+
#else
|
|
1030
1187
|
return BitCast(
|
|
1031
1188
|
d,
|
|
1032
1189
|
VFromD<decltype(du8)>{vec_xl_len(
|
|
1033
1190
|
const_cast<unsigned char*>(reinterpret_cast<const unsigned char*>(p)),
|
|
1034
1191
|
num_of_bytes_to_load)});
|
|
1192
|
+
#endif
|
|
1035
1193
|
}
|
|
1036
1194
|
|
|
1037
1195
|
template <class D, typename T = TFromD<D>>
|
|
@@ -1048,18 +1206,11 @@ HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const T* HWY_RESTRICT p,
|
|
|
1048
1206
|
}
|
|
1049
1207
|
#endif
|
|
1050
1208
|
|
|
1051
|
-
|
|
1052
|
-
|
|
1053
|
-
const Repartition<uint8_t, decltype(d)> du8;
|
|
1054
|
-
const VFromD<D> v = BitCast(
|
|
1055
|
-
d,
|
|
1056
|
-
VFromD<decltype(du8)>{vec_xl_len(
|
|
1057
|
-
const_cast<unsigned char*>(reinterpret_cast<const unsigned char*>(p)),
|
|
1058
|
-
num_of_bytes_to_load)});
|
|
1059
|
-
return IfThenElse(FirstN(d, max_lanes_to_load), v, no);
|
|
1209
|
+
return IfThenElse(FirstN(d, max_lanes_to_load),
|
|
1210
|
+
LoadN(d, p, max_lanes_to_load), no);
|
|
1060
1211
|
}
|
|
1061
1212
|
|
|
1062
|
-
#endif // HWY_PPC_HAVE_9
|
|
1213
|
+
#endif // HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14
|
|
1063
1214
|
|
|
1064
1215
|
// Returns a vector with lane i=[0, N) set to "first" + i.
|
|
1065
1216
|
namespace detail {
|
|
@@ -1134,8 +1285,19 @@ HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
|
|
|
1134
1285
|
|
|
1135
1286
|
template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
|
|
1136
1287
|
HWY_API void Store(Vec128<T> v, D /* tag */, T* HWY_RESTRICT aligned) {
|
|
1288
|
+
// Suppress the ignoring attributes warning that is generated by
|
|
1289
|
+
// HWY_RCAST_ALIGNED(StoreRaw*, aligned) with GCC
|
|
1290
|
+
#if HWY_COMPILER_GCC
|
|
1291
|
+
HWY_DIAGNOSTICS(push)
|
|
1292
|
+
HWY_DIAGNOSTICS_OFF(disable : 4649, ignored "-Wignored-attributes")
|
|
1293
|
+
#endif
|
|
1294
|
+
|
|
1137
1295
|
using StoreRaw = typename detail::Raw128<T>::AlignedRawVec;
|
|
1138
|
-
*
|
|
1296
|
+
*HWY_RCAST_ALIGNED(StoreRaw*, aligned) = reinterpret_cast<StoreRaw>(v.raw);
|
|
1297
|
+
|
|
1298
|
+
#if HWY_COMPILER_GCC
|
|
1299
|
+
HWY_DIAGNOSTICS(pop)
|
|
1300
|
+
#endif
|
|
1139
1301
|
}
|
|
1140
1302
|
|
|
1141
1303
|
template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
|
|
@@ -1159,7 +1321,7 @@ HWY_API void StoreU(VFromD<D> v, D d, T* HWY_RESTRICT p) {
|
|
|
1159
1321
|
Store(v, d, p);
|
|
1160
1322
|
}
|
|
1161
1323
|
|
|
1162
|
-
#if HWY_PPC_HAVE_9
|
|
1324
|
+
#if (HWY_PPC_HAVE_9 && HWY_ARCH_PPC_64) || HWY_S390X_HAVE_Z14
|
|
1163
1325
|
|
|
1164
1326
|
#ifdef HWY_NATIVE_STORE_N
|
|
1165
1327
|
#undef HWY_NATIVE_STORE_N
|
|
@@ -1185,8 +1347,15 @@ HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
|
|
|
1185
1347
|
const size_t num_of_bytes_to_store =
|
|
1186
1348
|
HWY_MIN(max_lanes_to_store, HWY_MAX_LANES_D(D)) * sizeof(TFromD<D>);
|
|
1187
1349
|
const Repartition<uint8_t, decltype(d)> du8;
|
|
1350
|
+
#if HWY_S390X_HAVE_Z14
|
|
1351
|
+
if (num_of_bytes_to_store > 0) {
|
|
1352
|
+
vec_store_len(BitCast(du8, v).raw, reinterpret_cast<unsigned char*>(p),
|
|
1353
|
+
static_cast<unsigned>(num_of_bytes_to_store - 1));
|
|
1354
|
+
}
|
|
1355
|
+
#else
|
|
1188
1356
|
vec_xst_len(BitCast(du8, v).raw, reinterpret_cast<unsigned char*>(p),
|
|
1189
1357
|
num_of_bytes_to_store);
|
|
1358
|
+
#endif
|
|
1190
1359
|
}
|
|
1191
1360
|
#endif
|
|
1192
1361
|
|
|
@@ -1195,180 +1364,104 @@ HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
|
|
|
1195
1364
|
template <class D>
|
|
1196
1365
|
HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
|
|
1197
1366
|
TFromD<D>* HWY_RESTRICT p) {
|
|
1198
|
-
const
|
|
1199
|
-
|
|
1200
|
-
alignas(16) TI buf[MaxLanes(d)];
|
|
1201
|
-
alignas(16) TI mask[MaxLanes(d)];
|
|
1202
|
-
Store(BitCast(di, v), di, buf);
|
|
1203
|
-
Store(BitCast(di, VecFromMask(d, m)), di, mask);
|
|
1204
|
-
for (size_t i = 0; i < MaxLanes(d); ++i) {
|
|
1205
|
-
if (mask[i]) {
|
|
1206
|
-
CopySameSize(buf + i, p + i);
|
|
1207
|
-
}
|
|
1208
|
-
}
|
|
1367
|
+
const VFromD<D> old = LoadU(d, p);
|
|
1368
|
+
StoreU(IfThenElse(RebindMask(d, m), v, old), d, p);
|
|
1209
1369
|
}
|
|
1210
1370
|
|
|
1211
1371
|
// ================================================== ARITHMETIC
|
|
1212
1372
|
|
|
1373
|
+
namespace detail {
|
|
1374
|
+
// If TFromD<D> is an integer type, detail::RebindToUnsignedIfNotFloat<D>
|
|
1375
|
+
// rebinds D to MakeUnsigned<TFromD<D>>.
|
|
1376
|
+
|
|
1377
|
+
// Otherwise, if TFromD<D> is a floating-point type (including F16 and BF16),
|
|
1378
|
+
// detail::RebindToUnsignedIfNotFloat<D> is the same as D.
|
|
1379
|
+
template <class D>
|
|
1380
|
+
using RebindToUnsignedIfNotFloat =
|
|
1381
|
+
hwy::If<(!hwy::IsFloat<TFromD<D>>() && !hwy::IsSpecialFloat<TFromD<D>>()),
|
|
1382
|
+
RebindToUnsigned<D>, D>;
|
|
1383
|
+
} // namespace detail
|
|
1384
|
+
|
|
1213
1385
|
// ------------------------------ Addition
|
|
1214
1386
|
|
|
1215
1387
|
template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
|
|
1216
1388
|
HWY_API Vec128<T, N> operator+(Vec128<T, N> a, Vec128<T, N> b) {
|
|
1217
|
-
|
|
1389
|
+
const DFromV<decltype(a)> d;
|
|
1390
|
+
const detail::RebindToUnsignedIfNotFloat<decltype(d)> d_arith;
|
|
1391
|
+
|
|
1392
|
+
// If T is an integer type, do an unsigned vec_add to avoid undefined behavior
|
|
1393
|
+
#if HWY_S390X_HAVE_Z14
|
|
1394
|
+
return BitCast(d, VFromD<decltype(d_arith)>{BitCast(d_arith, a).raw +
|
|
1395
|
+
BitCast(d_arith, b).raw});
|
|
1396
|
+
#else
|
|
1397
|
+
return BitCast(d, VFromD<decltype(d_arith)>{vec_add(
|
|
1398
|
+
BitCast(d_arith, a).raw, BitCast(d_arith, b).raw)});
|
|
1399
|
+
#endif
|
|
1218
1400
|
}
|
|
1219
1401
|
|
|
1220
1402
|
// ------------------------------ Subtraction
|
|
1221
1403
|
|
|
1222
1404
|
template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
|
|
1223
1405
|
HWY_API Vec128<T, N> operator-(Vec128<T, N> a, Vec128<T, N> b) {
|
|
1224
|
-
|
|
1225
|
-
|
|
1226
|
-
|
|
1227
|
-
// ------------------------------ SumsOf8
|
|
1228
|
-
namespace detail {
|
|
1229
|
-
|
|
1230
|
-
// Casts nominally int32_t result to D.
|
|
1231
|
-
template <class D>
|
|
1232
|
-
HWY_INLINE VFromD<D> AltivecVsum4sbs(D d, __vector signed char a,
|
|
1233
|
-
__vector signed int b) {
|
|
1234
|
-
const Repartition<int32_t, D> di32;
|
|
1235
|
-
#ifdef __OPTIMIZE__
|
|
1236
|
-
if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) {
|
|
1237
|
-
const int64_t sum0 =
|
|
1238
|
-
static_cast<int64_t>(a[0]) + static_cast<int64_t>(a[1]) +
|
|
1239
|
-
static_cast<int64_t>(a[2]) + static_cast<int64_t>(a[3]) +
|
|
1240
|
-
static_cast<int64_t>(b[0]);
|
|
1241
|
-
const int64_t sum1 =
|
|
1242
|
-
static_cast<int64_t>(a[4]) + static_cast<int64_t>(a[5]) +
|
|
1243
|
-
static_cast<int64_t>(a[6]) + static_cast<int64_t>(a[7]) +
|
|
1244
|
-
static_cast<int64_t>(b[1]);
|
|
1245
|
-
const int64_t sum2 =
|
|
1246
|
-
static_cast<int64_t>(a[8]) + static_cast<int64_t>(a[9]) +
|
|
1247
|
-
static_cast<int64_t>(a[10]) + static_cast<int64_t>(a[11]) +
|
|
1248
|
-
static_cast<int64_t>(b[2]);
|
|
1249
|
-
const int64_t sum3 =
|
|
1250
|
-
static_cast<int64_t>(a[12]) + static_cast<int64_t>(a[13]) +
|
|
1251
|
-
static_cast<int64_t>(a[14]) + static_cast<int64_t>(a[15]) +
|
|
1252
|
-
static_cast<int64_t>(b[3]);
|
|
1253
|
-
const int32_t sign0 = static_cast<int32_t>(sum0 >> 63);
|
|
1254
|
-
const int32_t sign1 = static_cast<int32_t>(sum1 >> 63);
|
|
1255
|
-
const int32_t sign2 = static_cast<int32_t>(sum2 >> 63);
|
|
1256
|
-
const int32_t sign3 = static_cast<int32_t>(sum3 >> 63);
|
|
1257
|
-
using Raw = typename detail::Raw128<int32_t>::type;
|
|
1258
|
-
return BitCast(
|
|
1259
|
-
d,
|
|
1260
|
-
VFromD<decltype(di32)>{Raw{
|
|
1261
|
-
(sign0 == (sum0 >> 31)) ? static_cast<int32_t>(sum0)
|
|
1262
|
-
: static_cast<int32_t>(sign0 ^ 0x7FFFFFFF),
|
|
1263
|
-
(sign1 == (sum1 >> 31)) ? static_cast<int32_t>(sum1)
|
|
1264
|
-
: static_cast<int32_t>(sign1 ^ 0x7FFFFFFF),
|
|
1265
|
-
(sign2 == (sum2 >> 31)) ? static_cast<int32_t>(sum2)
|
|
1266
|
-
: static_cast<int32_t>(sign2 ^ 0x7FFFFFFF),
|
|
1267
|
-
(sign3 == (sum3 >> 31))
|
|
1268
|
-
? static_cast<int32_t>(sum3)
|
|
1269
|
-
: static_cast<int32_t>(sign3 ^ 0x7FFFFFFF)}});
|
|
1270
|
-
} else // NOLINT
|
|
1271
|
-
#endif
|
|
1272
|
-
{
|
|
1273
|
-
return BitCast(d, VFromD<decltype(di32)>{vec_vsum4sbs(a, b)});
|
|
1274
|
-
}
|
|
1275
|
-
}
|
|
1406
|
+
const DFromV<decltype(a)> d;
|
|
1407
|
+
const detail::RebindToUnsignedIfNotFloat<decltype(d)> d_arith;
|
|
1276
1408
|
|
|
1277
|
-
//
|
|
1278
|
-
|
|
1279
|
-
|
|
1280
|
-
|
|
1281
|
-
|
|
1282
|
-
|
|
1283
|
-
|
|
1284
|
-
const uint64_t sum0 =
|
|
1285
|
-
static_cast<uint64_t>(a[0]) + static_cast<uint64_t>(a[1]) +
|
|
1286
|
-
static_cast<uint64_t>(a[2]) + static_cast<uint64_t>(a[3]) +
|
|
1287
|
-
static_cast<uint64_t>(b[0]);
|
|
1288
|
-
const uint64_t sum1 =
|
|
1289
|
-
static_cast<uint64_t>(a[4]) + static_cast<uint64_t>(a[5]) +
|
|
1290
|
-
static_cast<uint64_t>(a[6]) + static_cast<uint64_t>(a[7]) +
|
|
1291
|
-
static_cast<uint64_t>(b[1]);
|
|
1292
|
-
const uint64_t sum2 =
|
|
1293
|
-
static_cast<uint64_t>(a[8]) + static_cast<uint64_t>(a[9]) +
|
|
1294
|
-
static_cast<uint64_t>(a[10]) + static_cast<uint64_t>(a[11]) +
|
|
1295
|
-
static_cast<uint64_t>(b[2]);
|
|
1296
|
-
const uint64_t sum3 =
|
|
1297
|
-
static_cast<uint64_t>(a[12]) + static_cast<uint64_t>(a[13]) +
|
|
1298
|
-
static_cast<uint64_t>(a[14]) + static_cast<uint64_t>(a[15]) +
|
|
1299
|
-
static_cast<uint64_t>(b[3]);
|
|
1300
|
-
return BitCast(
|
|
1301
|
-
d,
|
|
1302
|
-
VFromD<decltype(du32)>{(__vector unsigned int){
|
|
1303
|
-
static_cast<unsigned int>(sum0 <= 0xFFFFFFFFu ? sum0 : 0xFFFFFFFFu),
|
|
1304
|
-
static_cast<unsigned int>(sum1 <= 0xFFFFFFFFu ? sum1 : 0xFFFFFFFFu),
|
|
1305
|
-
static_cast<unsigned int>(sum2 <= 0xFFFFFFFFu ? sum2 : 0xFFFFFFFFu),
|
|
1306
|
-
static_cast<unsigned int>(sum3 <= 0xFFFFFFFFu ? sum3
|
|
1307
|
-
: 0xFFFFFFFFu)}});
|
|
1308
|
-
} else // NOLINT
|
|
1409
|
+
// If T is an integer type, do an unsigned vec_sub to avoid undefined behavior
|
|
1410
|
+
#if HWY_S390X_HAVE_Z14
|
|
1411
|
+
return BitCast(d, VFromD<decltype(d_arith)>{BitCast(d_arith, a).raw -
|
|
1412
|
+
BitCast(d_arith, b).raw});
|
|
1413
|
+
#else
|
|
1414
|
+
return BitCast(d, VFromD<decltype(d_arith)>{vec_sub(
|
|
1415
|
+
BitCast(d_arith, a).raw, BitCast(d_arith, b).raw)});
|
|
1309
1416
|
#endif
|
|
1310
|
-
{
|
|
1311
|
-
return BitCast(d, VFromD<decltype(du32)>{vec_vsum4ubs(a, b)});
|
|
1312
|
-
}
|
|
1313
1417
|
}
|
|
1314
1418
|
|
|
1315
|
-
//
|
|
1316
|
-
template <class
|
|
1317
|
-
|
|
1318
|
-
|
|
1319
|
-
const Repartition<int32_t, D> di32;
|
|
1320
|
-
#ifdef __OPTIMIZE__
|
|
1321
|
-
const Repartition<uint64_t, D> du64;
|
|
1322
|
-
constexpr int kDestLaneOffset = HWY_IS_BIG_ENDIAN;
|
|
1323
|
-
if (IsConstantRawAltivecVect(a) && __builtin_constant_p(b[kDestLaneOffset]) &&
|
|
1324
|
-
__builtin_constant_p(b[kDestLaneOffset + 2])) {
|
|
1325
|
-
const int64_t sum0 = static_cast<int64_t>(a[0]) +
|
|
1326
|
-
static_cast<int64_t>(a[1]) +
|
|
1327
|
-
static_cast<int64_t>(b[kDestLaneOffset]);
|
|
1328
|
-
const int64_t sum1 = static_cast<int64_t>(a[2]) +
|
|
1329
|
-
static_cast<int64_t>(a[3]) +
|
|
1330
|
-
static_cast<int64_t>(b[kDestLaneOffset + 2]);
|
|
1331
|
-
const int32_t sign0 = static_cast<int32_t>(sum0 >> 63);
|
|
1332
|
-
const int32_t sign1 = static_cast<int32_t>(sum1 >> 63);
|
|
1333
|
-
return BitCast(d, VFromD<decltype(du64)>{(__vector unsigned long long){
|
|
1334
|
-
(sign0 == (sum0 >> 31))
|
|
1335
|
-
? static_cast<uint32_t>(sum0)
|
|
1336
|
-
: static_cast<uint32_t>(sign0 ^ 0x7FFFFFFF),
|
|
1337
|
-
(sign1 == (sum1 >> 31))
|
|
1338
|
-
? static_cast<uint32_t>(sum1)
|
|
1339
|
-
: static_cast<uint32_t>(sign1 ^ 0x7FFFFFFF)}});
|
|
1340
|
-
} else // NOLINT
|
|
1341
|
-
#endif
|
|
1342
|
-
{
|
|
1343
|
-
__vector signed int sum;
|
|
1344
|
-
|
|
1345
|
-
// Inline assembly is used for vsum2sws to avoid unnecessary shuffling
|
|
1346
|
-
// on little-endian PowerPC targets as the result of the vsum2sws
|
|
1347
|
-
// instruction will already be in the correct lanes on little-endian
|
|
1348
|
-
// PowerPC targets.
|
|
1349
|
-
__asm__("vsum2sws %0,%1,%2" : "=v"(sum) : "v"(a), "v"(b));
|
|
1350
|
-
|
|
1351
|
-
return BitCast(d, VFromD<decltype(di32)>{sum});
|
|
1352
|
-
}
|
|
1419
|
+
// ------------------------------ SumsOf8
|
|
1420
|
+
template <class V, HWY_IF_U8(TFromV<V>)>
|
|
1421
|
+
HWY_API VFromD<RepartitionToWideX3<DFromV<V>>> SumsOf8(V v) {
|
|
1422
|
+
return SumsOf2(SumsOf4(v));
|
|
1353
1423
|
}
|
|
1354
1424
|
|
|
1355
|
-
|
|
1356
|
-
|
|
1357
|
-
|
|
1358
|
-
|
|
1359
|
-
const
|
|
1360
|
-
const
|
|
1361
|
-
const RebindToUnsigned<decltype(di32)> du32;
|
|
1425
|
+
template <class V, HWY_IF_I8(TFromV<V>)>
|
|
1426
|
+
HWY_API VFromD<RepartitionToWideX3<DFromV<V>>> SumsOf8(V v) {
|
|
1427
|
+
#if HWY_S390X_HAVE_Z14
|
|
1428
|
+
const DFromV<decltype(v)> di8;
|
|
1429
|
+
const RebindToUnsigned<decltype(di8)> du8;
|
|
1430
|
+
const RepartitionToWideX3<decltype(di8)> di64;
|
|
1362
1431
|
|
|
1363
|
-
return
|
|
1364
|
-
|
|
1365
|
-
|
|
1432
|
+
return BitCast(di64, SumsOf8(BitCast(du8, Xor(v, SignBit(di8))))) +
|
|
1433
|
+
Set(di64, int64_t{-1024});
|
|
1434
|
+
#else
|
|
1435
|
+
return SumsOf2(SumsOf4(v));
|
|
1436
|
+
#endif
|
|
1366
1437
|
}
|
|
1367
1438
|
|
|
1368
1439
|
// ------------------------------ SaturatedAdd
|
|
1369
1440
|
|
|
1370
1441
|
// Returns a + b clamped to the destination range.
|
|
1371
1442
|
|
|
1443
|
+
#if HWY_S390X_HAVE_Z14
|
|
1444
|
+
// Z14/Z15/Z16 does not have I8/U8/I16/U16 SaturatedAdd instructions unlike most
|
|
1445
|
+
// other integer SIMD instruction sets
|
|
1446
|
+
|
|
1447
|
+
template <typename T, size_t N, HWY_IF_UNSIGNED(T),
|
|
1448
|
+
HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
|
|
1449
|
+
HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, Vec128<T, N> b) {
|
|
1450
|
+
return Add(a, Min(b, Not(a)));
|
|
1451
|
+
}
|
|
1452
|
+
|
|
1453
|
+
template <typename T, size_t N, HWY_IF_SIGNED(T),
|
|
1454
|
+
HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
|
|
1455
|
+
HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, Vec128<T, N> b) {
|
|
1456
|
+
const DFromV<decltype(a)> d;
|
|
1457
|
+
const auto sum = Add(a, b);
|
|
1458
|
+
const auto overflow_mask = AndNot(Xor(a, b), Xor(a, sum));
|
|
1459
|
+
const auto overflow_result = Xor(BroadcastSignBit(a), Set(d, LimitsMax<T>()));
|
|
1460
|
+
return IfNegativeThenElse(overflow_mask, overflow_result, sum);
|
|
1461
|
+
}
|
|
1462
|
+
|
|
1463
|
+
#else // VSX
|
|
1464
|
+
|
|
1372
1465
|
#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
|
|
1373
1466
|
#undef HWY_NATIVE_I32_SATURATED_ADDSUB
|
|
1374
1467
|
#else
|
|
@@ -1386,6 +1479,7 @@ template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
|
|
|
1386
1479
|
HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, Vec128<T, N> b) {
|
|
1387
1480
|
return Vec128<T, N>{vec_adds(a.raw, b.raw)};
|
|
1388
1481
|
}
|
|
1482
|
+
#endif // HWY_S390X_HAVE_Z14
|
|
1389
1483
|
|
|
1390
1484
|
#if HWY_PPC_HAVE_10
|
|
1391
1485
|
|
|
@@ -1412,11 +1506,34 @@ HWY_API V SaturatedAdd(V a, V b) {
|
|
|
1412
1506
|
|
|
1413
1507
|
// Returns a - b clamped to the destination range.
|
|
1414
1508
|
|
|
1509
|
+
#if HWY_S390X_HAVE_Z14
|
|
1510
|
+
// Z14/Z15/Z16 does not have I8/U8/I16/U16 SaturatedSub instructions unlike most
|
|
1511
|
+
// other integer SIMD instruction sets
|
|
1512
|
+
|
|
1513
|
+
template <typename T, size_t N, HWY_IF_UNSIGNED(T),
|
|
1514
|
+
HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
|
|
1515
|
+
HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, Vec128<T, N> b) {
|
|
1516
|
+
return Sub(a, Min(a, b));
|
|
1517
|
+
}
|
|
1518
|
+
|
|
1519
|
+
template <typename T, size_t N, HWY_IF_SIGNED(T),
|
|
1520
|
+
HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
|
|
1521
|
+
HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, Vec128<T, N> b) {
|
|
1522
|
+
const DFromV<decltype(a)> d;
|
|
1523
|
+
const auto diff = Sub(a, b);
|
|
1524
|
+
const auto overflow_mask = And(Xor(a, b), Xor(a, diff));
|
|
1525
|
+
const auto overflow_result = Xor(BroadcastSignBit(a), Set(d, LimitsMax<T>()));
|
|
1526
|
+
return IfNegativeThenElse(overflow_mask, overflow_result, diff);
|
|
1527
|
+
}
|
|
1528
|
+
|
|
1529
|
+
#else // VSX
|
|
1530
|
+
|
|
1415
1531
|
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
|
|
1416
1532
|
HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))>
|
|
1417
1533
|
HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, Vec128<T, N> b) {
|
|
1418
1534
|
return Vec128<T, N>{vec_subs(a.raw, b.raw)};
|
|
1419
1535
|
}
|
|
1536
|
+
#endif // HWY_S390X_HAVE_Z14
|
|
1420
1537
|
|
|
1421
1538
|
#if HWY_PPC_HAVE_10
|
|
1422
1539
|
|
|
@@ -1459,32 +1576,96 @@ HWY_API Vec128<T, N> AverageRound(Vec128<T, N> a, Vec128<T, N> b) {
|
|
|
1459
1576
|
|
|
1460
1577
|
template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
|
|
1461
1578
|
HWY_API Vec128<T, N> operator*(Vec128<T, N> a, Vec128<T, N> b) {
|
|
1462
|
-
|
|
1579
|
+
const DFromV<decltype(a)> d;
|
|
1580
|
+
const detail::RebindToUnsignedIfNotFloat<decltype(d)> d_arith;
|
|
1581
|
+
|
|
1582
|
+
// If T is an integer type, do an unsigned vec_mul to avoid undefined behavior
|
|
1583
|
+
#if HWY_S390X_HAVE_Z14
|
|
1584
|
+
return BitCast(d, VFromD<decltype(d_arith)>{BitCast(d_arith, a).raw *
|
|
1585
|
+
BitCast(d_arith, b).raw});
|
|
1586
|
+
#else
|
|
1587
|
+
return BitCast(d, VFromD<decltype(d_arith)>{vec_mul(
|
|
1588
|
+
BitCast(d_arith, a).raw, BitCast(d_arith, b).raw)});
|
|
1589
|
+
#endif
|
|
1590
|
+
}
|
|
1591
|
+
|
|
1592
|
+
// Returns the upper sizeof(T)*8 bits of a * b in each lane.
|
|
1593
|
+
|
|
1594
|
+
#if HWY_S390X_HAVE_Z14
|
|
1595
|
+
#define HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T) \
|
|
1596
|
+
HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))
|
|
1597
|
+
#define HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T) \
|
|
1598
|
+
hwy::EnableIf<!hwy::IsSame<T, T>()>* = nullptr
|
|
1599
|
+
#elif HWY_PPC_HAVE_10
|
|
1600
|
+
#define HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T) \
|
|
1601
|
+
HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))
|
|
1602
|
+
#define HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T) \
|
|
1603
|
+
HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))
|
|
1604
|
+
#else
|
|
1605
|
+
#define HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T) \
|
|
1606
|
+
hwy::EnableIf<!hwy::IsSame<T, T>()>* = nullptr
|
|
1607
|
+
#define HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T) \
|
|
1608
|
+
HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))
|
|
1609
|
+
#endif
|
|
1610
|
+
|
|
1611
|
+
#if HWY_S390X_HAVE_Z14 || HWY_PPC_HAVE_10
|
|
1612
|
+
template <typename T, size_t N, HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T),
|
|
1613
|
+
HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
|
|
1614
|
+
HWY_API Vec128<T, N> MulHigh(Vec128<T, N> a, Vec128<T, N> b) {
|
|
1615
|
+
return Vec128<T, N>{vec_mulh(a.raw, b.raw)};
|
|
1616
|
+
}
|
|
1617
|
+
#endif
|
|
1618
|
+
|
|
1619
|
+
template <typename T, HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T),
|
|
1620
|
+
HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
|
|
1621
|
+
HWY_API Vec128<T, 1> MulHigh(Vec128<T, 1> a, Vec128<T, 1> b) {
|
|
1622
|
+
const auto p_even = MulEven(a, b);
|
|
1623
|
+
|
|
1624
|
+
#if HWY_IS_LITTLE_ENDIAN
|
|
1625
|
+
const auto p_even_full = ResizeBitCast(Full128<T>(), p_even);
|
|
1626
|
+
return Vec128<T, 1>{
|
|
1627
|
+
vec_sld(p_even_full.raw, p_even_full.raw, 16 - sizeof(T))};
|
|
1628
|
+
#else
|
|
1629
|
+
const DFromV<decltype(a)> d;
|
|
1630
|
+
return ResizeBitCast(d, p_even);
|
|
1631
|
+
#endif
|
|
1463
1632
|
}
|
|
1464
1633
|
|
|
1465
|
-
|
|
1466
|
-
|
|
1634
|
+
template <typename T, size_t N,
|
|
1635
|
+
HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T),
|
|
1636
|
+
HWY_IF_NOT_FLOAT_NOR_SPECIAL(T), HWY_IF_LANES_GT(N, 1)>
|
|
1467
1637
|
HWY_API Vec128<T, N> MulHigh(Vec128<T, N> a, Vec128<T, N> b) {
|
|
1468
1638
|
const DFromV<decltype(a)> d;
|
|
1469
|
-
|
|
1470
|
-
const
|
|
1471
|
-
const
|
|
1639
|
+
|
|
1640
|
+
const auto p_even = BitCast(d, MulEven(a, b));
|
|
1641
|
+
const auto p_odd = BitCast(d, MulOdd(a, b));
|
|
1642
|
+
|
|
1472
1643
|
#if HWY_IS_LITTLE_ENDIAN
|
|
1473
|
-
|
|
1474
|
-
10, 11, 26, 27, 14, 15, 30, 31};
|
|
1644
|
+
return InterleaveOdd(d, p_even, p_odd);
|
|
1475
1645
|
#else
|
|
1476
|
-
|
|
1477
|
-
8, 9, 24, 25, 12, 13, 28, 29};
|
|
1646
|
+
return InterleaveEven(d, p_even, p_odd);
|
|
1478
1647
|
#endif
|
|
1479
|
-
return BitCast(d, VFromD<decltype(dw)>{vec_perm(p1.raw, p2.raw, kShuffle)});
|
|
1480
1648
|
}
|
|
1481
1649
|
|
|
1482
|
-
|
|
1483
|
-
|
|
1484
|
-
|
|
1485
|
-
|
|
1486
|
-
|
|
1650
|
+
#if !HWY_PPC_HAVE_10
|
|
1651
|
+
template <class T, HWY_IF_UI64(T)>
|
|
1652
|
+
HWY_API Vec64<T> MulHigh(Vec64<T> a, Vec64<T> b) {
|
|
1653
|
+
T p_hi;
|
|
1654
|
+
Mul128(GetLane(a), GetLane(b), &p_hi);
|
|
1655
|
+
return Set(Full64<T>(), p_hi);
|
|
1656
|
+
}
|
|
1657
|
+
|
|
1658
|
+
template <class T, HWY_IF_UI64(T)>
|
|
1659
|
+
HWY_API Vec128<T> MulHigh(Vec128<T> a, Vec128<T> b) {
|
|
1660
|
+
const DFromV<decltype(a)> d;
|
|
1661
|
+
const Half<decltype(d)> dh;
|
|
1662
|
+
return Combine(d, MulHigh(UpperHalf(dh, a), UpperHalf(dh, b)),
|
|
1663
|
+
MulHigh(LowerHalf(dh, a), LowerHalf(dh, b)));
|
|
1487
1664
|
}
|
|
1665
|
+
#endif // !HWY_PPC_HAVE_10
|
|
1666
|
+
|
|
1667
|
+
#undef HWY_PPC_IF_MULHIGH_USING_VEC_MULH
|
|
1668
|
+
#undef HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH
|
|
1488
1669
|
|
|
1489
1670
|
// Multiplies even lanes (0, 2, ..) and places the double-wide result into
|
|
1490
1671
|
// even and the upper half into its odd neighbor lane.
|
|
@@ -1506,24 +1687,83 @@ HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulOdd(Vec128<T, N> a,
|
|
|
1506
1687
|
return Vec128<MakeWide<T>, (N + 1) / 2>{vec_mulo(a.raw, b.raw)};
|
|
1507
1688
|
}
|
|
1508
1689
|
|
|
1690
|
+
// ------------------------------ Rol/Ror
|
|
1691
|
+
|
|
1692
|
+
#ifdef HWY_NATIVE_ROL_ROR_8
|
|
1693
|
+
#undef HWY_NATIVE_ROL_ROR_8
|
|
1694
|
+
#else
|
|
1695
|
+
#define HWY_NATIVE_ROL_ROR_8
|
|
1696
|
+
#endif
|
|
1697
|
+
|
|
1698
|
+
#ifdef HWY_NATIVE_ROL_ROR_16
|
|
1699
|
+
#undef HWY_NATIVE_ROL_ROR_16
|
|
1700
|
+
#else
|
|
1701
|
+
#define HWY_NATIVE_ROL_ROR_16
|
|
1702
|
+
#endif
|
|
1703
|
+
|
|
1704
|
+
#ifdef HWY_NATIVE_ROL_ROR_32_64
|
|
1705
|
+
#undef HWY_NATIVE_ROL_ROR_32_64
|
|
1706
|
+
#else
|
|
1707
|
+
#define HWY_NATIVE_ROL_ROR_32_64
|
|
1708
|
+
#endif
|
|
1709
|
+
|
|
1710
|
+
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
|
|
1711
|
+
HWY_API Vec128<T, N> Rol(Vec128<T, N> a, Vec128<T, N> b) {
|
|
1712
|
+
const DFromV<decltype(a)> d;
|
|
1713
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
1714
|
+
return BitCast(
|
|
1715
|
+
d, VFromD<decltype(du)>{vec_rl(BitCast(du, a).raw, BitCast(du, b).raw)});
|
|
1716
|
+
}
|
|
1717
|
+
|
|
1718
|
+
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
|
|
1719
|
+
HWY_API Vec128<T, N> Ror(Vec128<T, N> a, Vec128<T, N> b) {
|
|
1720
|
+
const DFromV<decltype(a)> d;
|
|
1721
|
+
const RebindToSigned<decltype(d)> di;
|
|
1722
|
+
return Rol(a, BitCast(d, Neg(BitCast(di, b))));
|
|
1723
|
+
}
|
|
1724
|
+
|
|
1509
1725
|
// ------------------------------ RotateRight
|
|
1510
|
-
template <int kBits, typename T, size_t N>
|
|
1726
|
+
template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
|
|
1511
1727
|
HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
|
|
1512
1728
|
const DFromV<decltype(v)> d;
|
|
1513
1729
|
constexpr size_t kSizeInBits = sizeof(T) * 8;
|
|
1514
1730
|
static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
|
|
1515
|
-
|
|
1516
|
-
return
|
|
1731
|
+
|
|
1732
|
+
return (kBits == 0)
|
|
1733
|
+
? v
|
|
1734
|
+
: Rol(v, Set(d, static_cast<T>(static_cast<int>(kSizeInBits) -
|
|
1735
|
+
kBits)));
|
|
1517
1736
|
}
|
|
1518
1737
|
|
|
1519
|
-
// ------------------------------
|
|
1520
|
-
|
|
1521
|
-
|
|
1522
|
-
|
|
1738
|
+
// ------------------------------ RotateLeftSame/RotateRightSame
|
|
1739
|
+
#ifdef HWY_NATIVE_ROL_ROR_SAME_8
|
|
1740
|
+
#undef HWY_NATIVE_ROL_ROR_SAME_8
|
|
1741
|
+
#else
|
|
1742
|
+
#define HWY_NATIVE_ROL_ROR_SAME_8
|
|
1743
|
+
#endif
|
|
1744
|
+
|
|
1745
|
+
#ifdef HWY_NATIVE_ROL_ROR_SAME_16
|
|
1746
|
+
#undef HWY_NATIVE_ROL_ROR_SAME_16
|
|
1747
|
+
#else
|
|
1748
|
+
#define HWY_NATIVE_ROL_ROR_SAME_16
|
|
1749
|
+
#endif
|
|
1750
|
+
|
|
1751
|
+
#ifdef HWY_NATIVE_ROL_ROR_SAME_32_64
|
|
1752
|
+
#undef HWY_NATIVE_ROL_ROR_SAME_32_64
|
|
1753
|
+
#else
|
|
1754
|
+
#define HWY_NATIVE_ROL_ROR_SAME_32_64
|
|
1755
|
+
#endif
|
|
1756
|
+
|
|
1757
|
+
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
|
|
1758
|
+
HWY_API Vec128<T, N> RotateLeftSame(Vec128<T, N> v, int bits) {
|
|
1523
1759
|
const DFromV<decltype(v)> d;
|
|
1524
|
-
|
|
1525
|
-
|
|
1526
|
-
|
|
1760
|
+
return Rol(v, Set(d, static_cast<T>(static_cast<unsigned>(bits))));
|
|
1761
|
+
}
|
|
1762
|
+
|
|
1763
|
+
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
|
|
1764
|
+
HWY_API Vec128<T, N> RotateRightSame(Vec128<T, N> v, int bits) {
|
|
1765
|
+
const DFromV<decltype(v)> d;
|
|
1766
|
+
return Rol(v, Set(d, static_cast<T>(0u - static_cast<unsigned>(bits))));
|
|
1527
1767
|
}
|
|
1528
1768
|
|
|
1529
1769
|
// ------------------------------ IfNegativeThenElse
|
|
@@ -1541,10 +1781,35 @@ HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
|
|
|
1541
1781
|
BitCast(du, no).raw, BitCast(du, yes).raw, BitCast(du, v).raw)});
|
|
1542
1782
|
#else
|
|
1543
1783
|
const RebindToSigned<decltype(d)> di;
|
|
1544
|
-
return
|
|
1545
|
-
|
|
1784
|
+
return IfVecThenElse(BitCast(d, BroadcastSignBit(BitCast(di, v))), yes, no);
|
|
1785
|
+
#endif
|
|
1786
|
+
}
|
|
1787
|
+
|
|
1788
|
+
#if HWY_PPC_HAVE_10
|
|
1789
|
+
#ifdef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
|
|
1790
|
+
#undef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
|
|
1791
|
+
#else
|
|
1792
|
+
#define HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
|
|
1546
1793
|
#endif
|
|
1794
|
+
|
|
1795
|
+
#ifdef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
|
|
1796
|
+
#undef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
|
|
1797
|
+
#else
|
|
1798
|
+
#define HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
|
|
1799
|
+
#endif
|
|
1800
|
+
|
|
1801
|
+
template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
|
|
1802
|
+
HWY_API V IfNegativeThenElseZero(V v, V yes) {
|
|
1803
|
+
const DFromV<decltype(v)> d;
|
|
1804
|
+
return IfNegativeThenElse(v, yes, Zero(d));
|
|
1805
|
+
}
|
|
1806
|
+
|
|
1807
|
+
template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
|
|
1808
|
+
HWY_API V IfNegativeThenZeroElse(V v, V no) {
|
|
1809
|
+
const DFromV<decltype(v)> d;
|
|
1810
|
+
return IfNegativeThenElse(v, Zero(d), no);
|
|
1547
1811
|
}
|
|
1812
|
+
#endif
|
|
1548
1813
|
|
|
1549
1814
|
// generic_ops takes care of integer T.
|
|
1550
1815
|
template <typename T, size_t N, HWY_IF_FLOAT(T)>
|
|
@@ -1598,17 +1863,42 @@ HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x,
|
|
|
1598
1863
|
#endif
|
|
1599
1864
|
|
|
1600
1865
|
template <typename T, size_t N, HWY_IF_FLOAT(T)>
|
|
1601
|
-
HWY_API Vec128<T, N>
|
|
1602
|
-
|
|
1866
|
+
HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
|
|
1867
|
+
#if HWY_S390X_HAVE_Z14
|
|
1868
|
+
return Vec128<T, N>{a.raw / b.raw};
|
|
1869
|
+
#else
|
|
1870
|
+
return Vec128<T, N>{vec_div(a.raw, b.raw)};
|
|
1871
|
+
#endif
|
|
1603
1872
|
}
|
|
1604
1873
|
|
|
1605
1874
|
template <typename T, size_t N, HWY_IF_FLOAT(T)>
|
|
1606
|
-
HWY_API Vec128<T, N>
|
|
1607
|
-
|
|
1875
|
+
HWY_API Vec128<T, N> ApproximateReciprocal(Vec128<T, N> v) {
|
|
1876
|
+
#if HWY_S390X_HAVE_Z14
|
|
1877
|
+
const DFromV<decltype(v)> d;
|
|
1878
|
+
return Set(d, T(1.0)) / v;
|
|
1879
|
+
#else
|
|
1880
|
+
return Vec128<T, N>{vec_re(v.raw)};
|
|
1881
|
+
#endif
|
|
1608
1882
|
}
|
|
1609
1883
|
|
|
1610
1884
|
// ------------------------------ Floating-point square root
|
|
1611
1885
|
|
|
1886
|
+
#if HWY_S390X_HAVE_Z14
|
|
1887
|
+
// Approximate reciprocal square root
|
|
1888
|
+
template <size_t N>
|
|
1889
|
+
HWY_API Vec128<float, N> ApproximateReciprocalSqrt(Vec128<float, N> v) {
|
|
1890
|
+
const DFromV<decltype(v)> d;
|
|
1891
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
1892
|
+
|
|
1893
|
+
const auto half = v * Set(d, 0.5f);
|
|
1894
|
+
// Initial guess based on log2(f)
|
|
1895
|
+
const auto guess = BitCast(
|
|
1896
|
+
d, Set(du, uint32_t{0x5F3759DFu}) - ShiftRight<1>(BitCast(du, v)));
|
|
1897
|
+
// One Newton-Raphson iteration
|
|
1898
|
+
return guess * NegMulAdd(half * guess, guess, Set(d, 1.5f));
|
|
1899
|
+
}
|
|
1900
|
+
#else // VSX
|
|
1901
|
+
|
|
1612
1902
|
#ifdef HWY_NATIVE_F64_APPROX_RSQRT
|
|
1613
1903
|
#undef HWY_NATIVE_F64_APPROX_RSQRT
|
|
1614
1904
|
#else
|
|
@@ -1620,6 +1910,7 @@ template <class T, size_t N, HWY_IF_FLOAT(T)>
|
|
|
1620
1910
|
HWY_API Vec128<T, N> ApproximateReciprocalSqrt(Vec128<T, N> v) {
|
|
1621
1911
|
return Vec128<T, N>{vec_rsqrte(v.raw)};
|
|
1622
1912
|
}
|
|
1913
|
+
#endif // HWY_S390X_HAVE_Z14
|
|
1623
1914
|
|
|
1624
1915
|
// Full precision square root
|
|
1625
1916
|
template <class T, size_t N, HWY_IF_FLOAT(T)>
|
|
@@ -1668,6 +1959,167 @@ HWY_API V AbsDiff(const V a, const V b) {
|
|
|
1668
1959
|
|
|
1669
1960
|
#endif // HWY_PPC_HAVE_9
|
|
1670
1961
|
|
|
1962
|
+
// ------------------------------ Integer Div for PPC10
|
|
1963
|
+
#if HWY_PPC_HAVE_10
|
|
1964
|
+
#ifdef HWY_NATIVE_INT_DIV
|
|
1965
|
+
#undef HWY_NATIVE_INT_DIV
|
|
1966
|
+
#else
|
|
1967
|
+
#define HWY_NATIVE_INT_DIV
|
|
1968
|
+
#endif
|
|
1969
|
+
|
|
1970
|
+
template <size_t N>
|
|
1971
|
+
HWY_API Vec128<int32_t, N> operator/(Vec128<int32_t, N> a,
|
|
1972
|
+
Vec128<int32_t, N> b) {
|
|
1973
|
+
// Inline assembly is used instead of vec_div for I32 Div on PPC10 to avoid
|
|
1974
|
+
// undefined behavior if b[i] == 0 or
|
|
1975
|
+
// (a[i] == LimitsMin<int32_t>() && b[i] == -1)
|
|
1976
|
+
|
|
1977
|
+
// Clang will also optimize out I32 vec_div on PPC10 if optimizations are
|
|
1978
|
+
// enabled and any of the lanes of b are known to be zero (even in the unused
|
|
1979
|
+
// lanes of a partial vector)
|
|
1980
|
+
__vector signed int raw_result;
|
|
1981
|
+
__asm__("vdivsw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
|
|
1982
|
+
return Vec128<int32_t, N>{raw_result};
|
|
1983
|
+
}
|
|
1984
|
+
|
|
1985
|
+
template <size_t N>
|
|
1986
|
+
HWY_API Vec128<uint32_t, N> operator/(Vec128<uint32_t, N> a,
|
|
1987
|
+
Vec128<uint32_t, N> b) {
|
|
1988
|
+
// Inline assembly is used instead of vec_div for U32 Div on PPC10 to avoid
|
|
1989
|
+
// undefined behavior if b[i] == 0
|
|
1990
|
+
|
|
1991
|
+
// Clang will also optimize out U32 vec_div on PPC10 if optimizations are
|
|
1992
|
+
// enabled and any of the lanes of b are known to be zero (even in the unused
|
|
1993
|
+
// lanes of a partial vector)
|
|
1994
|
+
__vector unsigned int raw_result;
|
|
1995
|
+
__asm__("vdivuw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
|
|
1996
|
+
return Vec128<uint32_t, N>{raw_result};
|
|
1997
|
+
}
|
|
1998
|
+
|
|
1999
|
+
template <size_t N>
|
|
2000
|
+
HWY_API Vec128<int64_t, N> operator/(Vec128<int64_t, N> a,
|
|
2001
|
+
Vec128<int64_t, N> b) {
|
|
2002
|
+
// Inline assembly is used instead of vec_div for I64 Div on PPC10 to avoid
|
|
2003
|
+
// undefined behavior if b[i] == 0 or
|
|
2004
|
+
// (a[i] == LimitsMin<int64_t>() && b[i] == -1)
|
|
2005
|
+
|
|
2006
|
+
// Clang will also optimize out I64 vec_div on PPC10 if optimizations are
|
|
2007
|
+
// enabled and any of the lanes of b are known to be zero (even in the unused
|
|
2008
|
+
// lanes of a partial vector)
|
|
2009
|
+
__vector signed long long raw_result;
|
|
2010
|
+
__asm__("vdivsd %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
|
|
2011
|
+
return Vec128<int64_t, N>{raw_result};
|
|
2012
|
+
}
|
|
2013
|
+
|
|
2014
|
+
template <size_t N>
|
|
2015
|
+
HWY_API Vec128<uint64_t, N> operator/(Vec128<uint64_t, N> a,
|
|
2016
|
+
Vec128<uint64_t, N> b) {
|
|
2017
|
+
// Inline assembly is used instead of vec_div for U64 Div on PPC10 to avoid
|
|
2018
|
+
// undefined behavior if b[i] == 0
|
|
2019
|
+
|
|
2020
|
+
// Clang will also optimize out U64 vec_div on PPC10 if optimizations are
|
|
2021
|
+
// enabled and any of the lanes of b are known to be zero (even in the unused
|
|
2022
|
+
// lanes of a partial vector)
|
|
2023
|
+
__vector unsigned long long raw_result;
|
|
2024
|
+
__asm__("vdivud %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
|
|
2025
|
+
return Vec128<uint64_t, N>{raw_result};
|
|
2026
|
+
}
|
|
2027
|
+
|
|
2028
|
+
template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
|
|
2029
|
+
HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
|
|
2030
|
+
HWY_API Vec128<T> operator/(Vec128<T> a, Vec128<T> b) {
|
|
2031
|
+
const DFromV<decltype(a)> d;
|
|
2032
|
+
const RepartitionToWide<decltype(d)> dw;
|
|
2033
|
+
return OrderedDemote2To(d, PromoteLowerTo(dw, a) / PromoteLowerTo(dw, b),
|
|
2034
|
+
PromoteUpperTo(dw, a) / PromoteUpperTo(dw, b));
|
|
2035
|
+
}
|
|
2036
|
+
|
|
2037
|
+
template <class T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
|
|
2038
|
+
HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
|
|
2039
|
+
HWY_IF_V_SIZE_LE(T, N, 8)>
|
|
2040
|
+
HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
|
|
2041
|
+
const DFromV<decltype(a)> d;
|
|
2042
|
+
const Rebind<MakeWide<T>, decltype(d)> dw;
|
|
2043
|
+
return DemoteTo(d, PromoteTo(dw, a) / PromoteTo(dw, b));
|
|
2044
|
+
}
|
|
2045
|
+
|
|
2046
|
+
template <size_t N>
|
|
2047
|
+
HWY_API Vec128<int32_t, N> operator%(Vec128<int32_t, N> a,
|
|
2048
|
+
Vec128<int32_t, N> b) {
|
|
2049
|
+
// Inline assembly is used instead of vec_mod for I32 Mod on PPC10 to avoid
|
|
2050
|
+
// undefined behavior if b[i] == 0 or
|
|
2051
|
+
// (a[i] == LimitsMin<int32_t>() && b[i] == -1)
|
|
2052
|
+
|
|
2053
|
+
// Clang will also optimize out I32 vec_mod on PPC10 if optimizations are
|
|
2054
|
+
// enabled and any of the lanes of b are known to be zero (even in the unused
|
|
2055
|
+
// lanes of a partial vector)
|
|
2056
|
+
__vector signed int raw_result;
|
|
2057
|
+
__asm__("vmodsw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
|
|
2058
|
+
return Vec128<int32_t, N>{raw_result};
|
|
2059
|
+
}
|
|
2060
|
+
|
|
2061
|
+
template <size_t N>
|
|
2062
|
+
HWY_API Vec128<uint32_t, N> operator%(Vec128<uint32_t, N> a,
|
|
2063
|
+
Vec128<uint32_t, N> b) {
|
|
2064
|
+
// Inline assembly is used instead of vec_mod for U32 Mod on PPC10 to avoid
|
|
2065
|
+
// undefined behavior if b[i] == 0
|
|
2066
|
+
|
|
2067
|
+
// Clang will also optimize out U32 vec_mod on PPC10 if optimizations are
|
|
2068
|
+
// enabled and any of the lanes of b are known to be zero (even in the unused
|
|
2069
|
+
// lanes of a partial vector)
|
|
2070
|
+
__vector unsigned int raw_result;
|
|
2071
|
+
__asm__("vmoduw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
|
|
2072
|
+
return Vec128<uint32_t, N>{raw_result};
|
|
2073
|
+
}
|
|
2074
|
+
|
|
2075
|
+
template <size_t N>
|
|
2076
|
+
HWY_API Vec128<int64_t, N> operator%(Vec128<int64_t, N> a,
|
|
2077
|
+
Vec128<int64_t, N> b) {
|
|
2078
|
+
// Inline assembly is used instead of vec_mod for I64 Mod on PPC10 to avoid
|
|
2079
|
+
// undefined behavior if b[i] == 0 or
|
|
2080
|
+
// (a[i] == LimitsMin<int64_t>() && b[i] == -1)
|
|
2081
|
+
|
|
2082
|
+
// Clang will also optimize out I64 vec_mod on PPC10 if optimizations are
|
|
2083
|
+
// enabled and any of the lanes of b are known to be zero (even in the unused
|
|
2084
|
+
// lanes of a partial vector)
|
|
2085
|
+
__vector signed long long raw_result;
|
|
2086
|
+
__asm__("vmodsd %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
|
|
2087
|
+
return Vec128<int64_t, N>{raw_result};
|
|
2088
|
+
}
|
|
2089
|
+
|
|
2090
|
+
template <size_t N>
|
|
2091
|
+
HWY_API Vec128<uint64_t, N> operator%(Vec128<uint64_t, N> a,
|
|
2092
|
+
Vec128<uint64_t, N> b) {
|
|
2093
|
+
// Inline assembly is used instead of vec_mod for U64 Mod on PPC10 to avoid
|
|
2094
|
+
// undefined behavior if b[i] == 0
|
|
2095
|
+
|
|
2096
|
+
// Clang will also optimize out U64 vec_mod on PPC10 if optimizations are
|
|
2097
|
+
// enabled and any of the lanes of b are known to be zero (even in the unused
|
|
2098
|
+
// lanes of a partial vector)
|
|
2099
|
+
__vector unsigned long long raw_result;
|
|
2100
|
+
__asm__("vmodud %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
|
|
2101
|
+
return Vec128<uint64_t, N>{raw_result};
|
|
2102
|
+
}
|
|
2103
|
+
|
|
2104
|
+
template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
|
|
2105
|
+
HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
|
|
2106
|
+
HWY_API Vec128<T> operator%(Vec128<T> a, Vec128<T> b) {
|
|
2107
|
+
const DFromV<decltype(a)> d;
|
|
2108
|
+
const RepartitionToWide<decltype(d)> dw;
|
|
2109
|
+
return OrderedDemote2To(d, PromoteLowerTo(dw, a) % PromoteLowerTo(dw, b),
|
|
2110
|
+
PromoteUpperTo(dw, a) % PromoteUpperTo(dw, b));
|
|
2111
|
+
}
|
|
2112
|
+
|
|
2113
|
+
template <class T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
|
|
2114
|
+
HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
|
|
2115
|
+
HWY_IF_V_SIZE_LE(T, N, 8)>
|
|
2116
|
+
HWY_API Vec128<T, N> operator%(Vec128<T, N> a, Vec128<T, N> b) {
|
|
2117
|
+
const DFromV<decltype(a)> d;
|
|
2118
|
+
const Rebind<MakeWide<T>, decltype(d)> dw;
|
|
2119
|
+
return DemoteTo(d, PromoteTo(dw, a) % PromoteTo(dw, b));
|
|
2120
|
+
}
|
|
2121
|
+
#endif
|
|
2122
|
+
|
|
1671
2123
|
// ================================================== MEMORY (3)
|
|
1672
2124
|
|
|
1673
2125
|
// ------------------------------ Non-temporal stores
|
|
@@ -1800,7 +2252,7 @@ template <typename T, size_t N>
|
|
|
1800
2252
|
HWY_API Vec128<T, N> InsertLane(Vec128<T, N> v, size_t i, T t) {
|
|
1801
2253
|
#if HWY_IS_LITTLE_ENDIAN
|
|
1802
2254
|
typename detail::Raw128<T>::type raw_result = v.raw;
|
|
1803
|
-
raw_result[i] = t;
|
|
2255
|
+
raw_result[i] = BitCastScalar<typename detail::Raw128<T>::RawT>(t);
|
|
1804
2256
|
return Vec128<T, N>{raw_result};
|
|
1805
2257
|
#else
|
|
1806
2258
|
// On ppc64be without this, mul_test fails, but swizzle_test passes.
|
|
@@ -2070,7 +2522,7 @@ HWY_API Vec32<T> Reverse(D d, Vec32<T> v) {
|
|
|
2070
2522
|
|
|
2071
2523
|
// ------------------------------- ReverseLaneBytes
|
|
2072
2524
|
|
|
2073
|
-
#if HWY_PPC_HAVE_9 && \
|
|
2525
|
+
#if (HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14) && \
|
|
2074
2526
|
(HWY_COMPILER_GCC_ACTUAL >= 710 || HWY_COMPILER_CLANG >= 400)
|
|
2075
2527
|
|
|
2076
2528
|
// Per-target flag to prevent generic_ops-inl.h defining 8-bit ReverseLaneBytes.
|
|
@@ -2111,7 +2563,7 @@ HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) {
|
|
|
2111
2563
|
return BitCast(d, ReverseLaneBytes(BitCast(du64, v)));
|
|
2112
2564
|
}
|
|
2113
2565
|
|
|
2114
|
-
#endif // HWY_PPC_HAVE_9
|
|
2566
|
+
#endif // HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14
|
|
2115
2567
|
|
|
2116
2568
|
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
|
|
2117
2569
|
HWY_API Vec16<T> Reverse(D d, Vec16<T> v) {
|
|
@@ -2268,11 +2720,15 @@ HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
|
|
|
2268
2720
|
Set(Full128<uint32_t>(),
|
|
2269
2721
|
static_cast<uint32_t>(amt * sizeof(TFromD<D>) * 8)));
|
|
2270
2722
|
|
|
2723
|
+
#if HWY_S390X_HAVE_Z14
|
|
2724
|
+
return BitCast(d, VU8{vec_srb(BitCast(du8, v).raw, v_shift_amt.raw)});
|
|
2725
|
+
#else // VSX
|
|
2271
2726
|
#if HWY_IS_LITTLE_ENDIAN
|
|
2272
2727
|
return BitCast(d, VU8{vec_slo(BitCast(du8, v).raw, v_shift_amt.raw)});
|
|
2273
2728
|
#else
|
|
2274
2729
|
return BitCast(d, VU8{vec_sro(BitCast(du8, v).raw, v_shift_amt.raw)});
|
|
2275
|
-
#endif
|
|
2730
|
+
#endif // HWY_IS_LITTLE_ENDIAN
|
|
2731
|
+
#endif // HWY_S390X_HAVE_Z14
|
|
2276
2732
|
}
|
|
2277
2733
|
|
|
2278
2734
|
// ------------------------------ SlideDownLanes
|
|
@@ -2300,11 +2756,15 @@ HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
|
|
|
2300
2756
|
Set(Full128<uint32_t>(),
|
|
2301
2757
|
static_cast<uint32_t>(amt * sizeof(TFromD<D>) * 8)));
|
|
2302
2758
|
|
|
2759
|
+
#if HWY_S390X_HAVE_Z14
|
|
2760
|
+
return BitCast(d, VU8{vec_slb(BitCast(du8, v).raw, v_shift_amt.raw)});
|
|
2761
|
+
#else // VSX
|
|
2303
2762
|
#if HWY_IS_LITTLE_ENDIAN
|
|
2304
2763
|
return BitCast(d, VU8{vec_sro(BitCast(du8, v).raw, v_shift_amt.raw)});
|
|
2305
2764
|
#else
|
|
2306
2765
|
return BitCast(d, VU8{vec_slo(BitCast(du8, v).raw, v_shift_amt.raw)});
|
|
2307
|
-
#endif
|
|
2766
|
+
#endif // HWY_IS_LITTLE_ENDIAN
|
|
2767
|
+
#endif // HWY_S390X_HAVE_Z14
|
|
2308
2768
|
}
|
|
2309
2769
|
|
|
2310
2770
|
// ================================================== COMBINE
|
|
@@ -2637,7 +3097,15 @@ HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
|
|
|
2637
3097
|
|
|
2638
3098
|
template <typename T, HWY_IF_T_SIZE(T, 4)>
|
|
2639
3099
|
HWY_API Vec128<T> DupEven(Vec128<T> v) {
|
|
3100
|
+
#if HWY_S390X_HAVE_Z14
|
|
3101
|
+
const DFromV<decltype(v)> d;
|
|
3102
|
+
const Repartition<uint8_t, decltype(d)> du8;
|
|
3103
|
+
return TableLookupBytes(
|
|
3104
|
+
v, BitCast(d, Dup128VecFromValues(du8, 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10,
|
|
3105
|
+
11, 8, 9, 10, 11)));
|
|
3106
|
+
#else
|
|
2640
3107
|
return Vec128<T>{vec_mergee(v.raw, v.raw)};
|
|
3108
|
+
#endif
|
|
2641
3109
|
}
|
|
2642
3110
|
|
|
2643
3111
|
// ------------------------------ DupOdd (InterleaveUpper)
|
|
@@ -2662,7 +3130,15 @@ HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
|
|
|
2662
3130
|
|
|
2663
3131
|
template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
|
|
2664
3132
|
HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
|
|
3133
|
+
#if HWY_S390X_HAVE_Z14
|
|
3134
|
+
const DFromV<decltype(v)> d;
|
|
3135
|
+
const Repartition<uint8_t, decltype(d)> du8;
|
|
3136
|
+
return TableLookupBytes(
|
|
3137
|
+
v, BitCast(d, Dup128VecFromValues(du8, 4, 5, 6, 7, 4, 5, 6, 7, 12, 13, 14,
|
|
3138
|
+
15, 12, 13, 14, 15)));
|
|
3139
|
+
#else
|
|
2665
3140
|
return Vec128<T, N>{vec_mergeo(v.raw, v.raw)};
|
|
3141
|
+
#endif
|
|
2666
3142
|
}
|
|
2667
3143
|
|
|
2668
3144
|
template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
|
|
@@ -2706,6 +3182,96 @@ HWY_INLINE Vec128<T, N> OddEven(Vec128<T, N> a, Vec128<T, N> b) {
|
|
|
2706
3182
|
return IfVecThenElse(BitCast(d, Vec128<uint8_t, N * 8>{mask}), b, a);
|
|
2707
3183
|
}
|
|
2708
3184
|
|
|
3185
|
+
// ------------------------------ InterleaveEven
|
|
3186
|
+
|
|
3187
|
+
template <class D, HWY_IF_T_SIZE_D(D, 1)>
|
|
3188
|
+
HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
|
|
3189
|
+
const Full128<TFromD<D>> d_full;
|
|
3190
|
+
const Indices128<TFromD<D>> idx{
|
|
3191
|
+
Dup128VecFromValues(Full128<uint8_t>(), 0, 16, 2, 18, 4, 20, 6, 22, 8, 24,
|
|
3192
|
+
10, 26, 12, 28, 14, 30)
|
|
3193
|
+
.raw};
|
|
3194
|
+
return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
|
|
3195
|
+
ResizeBitCast(d_full, b), idx));
|
|
3196
|
+
}
|
|
3197
|
+
|
|
3198
|
+
template <class D, HWY_IF_T_SIZE_D(D, 2)>
|
|
3199
|
+
HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
|
|
3200
|
+
const Full128<TFromD<D>> d_full;
|
|
3201
|
+
const Indices128<TFromD<D>> idx{Dup128VecFromValues(Full128<uint8_t>(), 0, 1,
|
|
3202
|
+
16, 17, 4, 5, 20, 21, 8,
|
|
3203
|
+
9, 24, 25, 12, 13, 28, 29)
|
|
3204
|
+
.raw};
|
|
3205
|
+
return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
|
|
3206
|
+
ResizeBitCast(d_full, b), idx));
|
|
3207
|
+
}
|
|
3208
|
+
|
|
3209
|
+
template <class D, HWY_IF_T_SIZE_D(D, 4)>
|
|
3210
|
+
HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
|
|
3211
|
+
#if HWY_S390X_HAVE_Z14
|
|
3212
|
+
const Full128<TFromD<D>> d_full;
|
|
3213
|
+
const Indices128<TFromD<D>> idx{Dup128VecFromValues(Full128<uint8_t>(), 0, 1,
|
|
3214
|
+
2, 3, 16, 17, 18, 19, 8,
|
|
3215
|
+
9, 10, 11, 24, 25, 26, 27)
|
|
3216
|
+
.raw};
|
|
3217
|
+
return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
|
|
3218
|
+
ResizeBitCast(d_full, b), idx));
|
|
3219
|
+
#else
|
|
3220
|
+
(void)d;
|
|
3221
|
+
return VFromD<D>{vec_mergee(a.raw, b.raw)};
|
|
3222
|
+
#endif
|
|
3223
|
+
}
|
|
3224
|
+
|
|
3225
|
+
template <class D, HWY_IF_T_SIZE_D(D, 8)>
|
|
3226
|
+
HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
|
|
3227
|
+
return InterleaveLower(a, b);
|
|
3228
|
+
}
|
|
3229
|
+
|
|
3230
|
+
// ------------------------------ InterleaveOdd
|
|
3231
|
+
|
|
3232
|
+
template <class D, HWY_IF_T_SIZE_D(D, 1)>
|
|
3233
|
+
HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
|
|
3234
|
+
const Full128<TFromD<D>> d_full;
|
|
3235
|
+
const Indices128<TFromD<D>> idx{
|
|
3236
|
+
Dup128VecFromValues(Full128<uint8_t>(), 1, 17, 3, 19, 5, 21, 7, 23, 9, 25,
|
|
3237
|
+
11, 27, 13, 29, 15, 31)
|
|
3238
|
+
.raw};
|
|
3239
|
+
return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
|
|
3240
|
+
ResizeBitCast(d_full, b), idx));
|
|
3241
|
+
}
|
|
3242
|
+
|
|
3243
|
+
template <class D, HWY_IF_T_SIZE_D(D, 2)>
|
|
3244
|
+
HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
|
|
3245
|
+
const Full128<TFromD<D>> d_full;
|
|
3246
|
+
const Indices128<TFromD<D>> idx{
|
|
3247
|
+
Dup128VecFromValues(Full128<uint8_t>(), 2, 3, 18, 19, 6, 7, 22, 23, 10,
|
|
3248
|
+
11, 26, 27, 14, 15, 30, 31)
|
|
3249
|
+
.raw};
|
|
3250
|
+
return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
|
|
3251
|
+
ResizeBitCast(d_full, b), idx));
|
|
3252
|
+
}
|
|
3253
|
+
|
|
3254
|
+
template <class D, HWY_IF_T_SIZE_D(D, 4)>
|
|
3255
|
+
HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
|
|
3256
|
+
#if HWY_S390X_HAVE_Z14
|
|
3257
|
+
const Full128<TFromD<D>> d_full;
|
|
3258
|
+
const Indices128<TFromD<D>> idx{
|
|
3259
|
+
Dup128VecFromValues(Full128<uint8_t>(), 4, 5, 6, 7, 20, 21, 22, 23, 12,
|
|
3260
|
+
13, 14, 15, 28, 29, 30, 31)
|
|
3261
|
+
.raw};
|
|
3262
|
+
return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
|
|
3263
|
+
ResizeBitCast(d_full, b), idx));
|
|
3264
|
+
#else
|
|
3265
|
+
(void)d;
|
|
3266
|
+
return VFromD<D>{vec_mergeo(a.raw, b.raw)};
|
|
3267
|
+
#endif
|
|
3268
|
+
}
|
|
3269
|
+
|
|
3270
|
+
template <class D, HWY_IF_T_SIZE_D(D, 8)>
|
|
3271
|
+
HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
|
|
3272
|
+
return InterleaveUpper(d, a, b);
|
|
3273
|
+
}
|
|
3274
|
+
|
|
2709
3275
|
// ------------------------------ OddEvenBlocks
|
|
2710
3276
|
template <typename T, size_t N>
|
|
2711
3277
|
HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
|
|
@@ -2719,26 +3285,64 @@ HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
|
|
|
2719
3285
|
return v;
|
|
2720
3286
|
}
|
|
2721
3287
|
|
|
2722
|
-
// ------------------------------
|
|
3288
|
+
// ------------------------------ MulFixedPoint15 (OddEven)
|
|
2723
3289
|
|
|
2724
|
-
|
|
2725
|
-
|
|
2726
|
-
|
|
2727
|
-
|
|
2728
|
-
return Vec128<T, N>{vec_sl(v.raw, bits.raw)};
|
|
2729
|
-
}
|
|
3290
|
+
#if HWY_S390X_HAVE_Z14
|
|
3291
|
+
HWY_API Vec16<int16_t> MulFixedPoint15(Vec16<int16_t> a, Vec16<int16_t> b) {
|
|
3292
|
+
const DFromV<decltype(a)> di16;
|
|
3293
|
+
const RepartitionToWide<decltype(di16)> di32;
|
|
2730
3294
|
|
|
2731
|
-
|
|
2732
|
-
|
|
2733
|
-
HWY_API Vec128<T, N> Shl(hwy::SignedTag /*tag*/, Vec128<T, N> v,
|
|
2734
|
-
Vec128<T, N> bits) {
|
|
2735
|
-
const DFromV<decltype(v)> di;
|
|
2736
|
-
const RebindToUnsigned<decltype(di)> du;
|
|
2737
|
-
return BitCast(di,
|
|
2738
|
-
Shl(hwy::UnsignedTag(), BitCast(du, v), BitCast(du, bits)));
|
|
2739
|
-
}
|
|
3295
|
+
const auto round_up_incr = Set(di32, 0x4000);
|
|
3296
|
+
const auto i32_product = MulEven(a, b) + round_up_incr;
|
|
2740
3297
|
|
|
2741
|
-
|
|
3298
|
+
return ResizeBitCast(di16, ShiftLeft<1>(i32_product));
|
|
3299
|
+
}
|
|
3300
|
+
template <size_t N, HWY_IF_LANES_GT(N, 1)>
|
|
3301
|
+
HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
|
|
3302
|
+
Vec128<int16_t, N> b) {
|
|
3303
|
+
const DFromV<decltype(a)> di16;
|
|
3304
|
+
const RepartitionToWide<decltype(di16)> di32;
|
|
3305
|
+
|
|
3306
|
+
const auto round_up_incr = Set(di32, 0x4000);
|
|
3307
|
+
const auto even_product = MulEven(a, b) + round_up_incr;
|
|
3308
|
+
const auto odd_product = MulOdd(a, b) + round_up_incr;
|
|
3309
|
+
|
|
3310
|
+
return OddEven(BitCast(di16, ShiftRight<15>(odd_product)),
|
|
3311
|
+
BitCast(di16, ShiftLeft<1>(even_product)));
|
|
3312
|
+
}
|
|
3313
|
+
#else
|
|
3314
|
+
template <size_t N>
|
|
3315
|
+
HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
|
|
3316
|
+
Vec128<int16_t, N> b) {
|
|
3317
|
+
const Vec128<int16_t> zero = Zero(Full128<int16_t>());
|
|
3318
|
+
return Vec128<int16_t, N>{vec_mradds(a.raw, b.raw, zero.raw)};
|
|
3319
|
+
}
|
|
3320
|
+
#endif
|
|
3321
|
+
|
|
3322
|
+
// ------------------------------ Shl
|
|
3323
|
+
|
|
3324
|
+
namespace detail {
|
|
3325
|
+
template <typename T, size_t N>
|
|
3326
|
+
HWY_API Vec128<T, N> Shl(hwy::UnsignedTag /*tag*/, Vec128<T, N> v,
|
|
3327
|
+
Vec128<T, N> bits) {
|
|
3328
|
+
#if HWY_S390X_HAVE_Z14
|
|
3329
|
+
return Vec128<T, N>{v.raw << bits.raw};
|
|
3330
|
+
#else
|
|
3331
|
+
return Vec128<T, N>{vec_sl(v.raw, bits.raw)};
|
|
3332
|
+
#endif
|
|
3333
|
+
}
|
|
3334
|
+
|
|
3335
|
+
// Signed left shift is the same as unsigned.
|
|
3336
|
+
template <typename T, size_t N>
|
|
3337
|
+
HWY_API Vec128<T, N> Shl(hwy::SignedTag /*tag*/, Vec128<T, N> v,
|
|
3338
|
+
Vec128<T, N> bits) {
|
|
3339
|
+
const DFromV<decltype(v)> di;
|
|
3340
|
+
const RebindToUnsigned<decltype(di)> du;
|
|
3341
|
+
return BitCast(di,
|
|
3342
|
+
Shl(hwy::UnsignedTag(), BitCast(du, v), BitCast(du, bits)));
|
|
3343
|
+
}
|
|
3344
|
+
|
|
3345
|
+
} // namespace detail
|
|
2742
3346
|
|
|
2743
3347
|
template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
|
|
2744
3348
|
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, Vec128<T, N> bits) {
|
|
@@ -2751,15 +3355,23 @@ namespace detail {
|
|
|
2751
3355
|
template <typename T, size_t N>
|
|
2752
3356
|
HWY_API Vec128<T, N> Shr(hwy::UnsignedTag /*tag*/, Vec128<T, N> v,
|
|
2753
3357
|
Vec128<T, N> bits) {
|
|
3358
|
+
#if HWY_S390X_HAVE_Z14
|
|
3359
|
+
return Vec128<T, N>{v.raw >> bits.raw};
|
|
3360
|
+
#else
|
|
2754
3361
|
return Vec128<T, N>{vec_sr(v.raw, bits.raw)};
|
|
3362
|
+
#endif
|
|
2755
3363
|
}
|
|
2756
3364
|
|
|
2757
3365
|
template <typename T, size_t N>
|
|
2758
3366
|
HWY_API Vec128<T, N> Shr(hwy::SignedTag /*tag*/, Vec128<T, N> v,
|
|
2759
3367
|
Vec128<T, N> bits) {
|
|
3368
|
+
#if HWY_S390X_HAVE_Z14
|
|
3369
|
+
return Vec128<T, N>{v.raw >> bits.raw};
|
|
3370
|
+
#else
|
|
2760
3371
|
const DFromV<decltype(v)> di;
|
|
2761
3372
|
const RebindToUnsigned<decltype(di)> du;
|
|
2762
3373
|
return Vec128<T, N>{vec_sra(v.raw, BitCast(du, bits).raw)};
|
|
3374
|
+
#endif
|
|
2763
3375
|
}
|
|
2764
3376
|
|
|
2765
3377
|
} // namespace detail
|
|
@@ -2771,100 +3383,85 @@ HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, Vec128<T, N> bits) {
|
|
|
2771
3383
|
|
|
2772
3384
|
// ------------------------------ MulEven/Odd 64x64 (UpperHalf)
|
|
2773
3385
|
|
|
2774
|
-
|
|
3386
|
+
template <class T, HWY_IF_UI64(T)>
|
|
3387
|
+
HWY_INLINE Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) {
|
|
2775
3388
|
#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
|
|
2776
|
-
using
|
|
2777
|
-
const
|
|
3389
|
+
using V64 = typename detail::Raw128<T>::type;
|
|
3390
|
+
const V64 mul128_result = reinterpret_cast<V64>(vec_mule(a.raw, b.raw));
|
|
2778
3391
|
#if HWY_IS_LITTLE_ENDIAN
|
|
2779
|
-
return Vec128<
|
|
3392
|
+
return Vec128<T>{mul128_result};
|
|
2780
3393
|
#else
|
|
2781
3394
|
// Need to swap the two halves of mul128_result on big-endian targets as
|
|
2782
3395
|
// the upper 64 bits of the product are in lane 0 of mul128_result and
|
|
2783
3396
|
// the lower 64 bits of the product are in lane 1 of mul128_result
|
|
2784
|
-
return Vec128<
|
|
3397
|
+
return Vec128<T>{vec_sld(mul128_result, mul128_result, 8)};
|
|
2785
3398
|
#endif
|
|
2786
3399
|
#else
|
|
2787
|
-
alignas(16)
|
|
3400
|
+
alignas(16) T mul[2];
|
|
2788
3401
|
mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]);
|
|
2789
|
-
return Load(Full128<
|
|
3402
|
+
return Load(Full128<T>(), mul);
|
|
2790
3403
|
#endif
|
|
2791
3404
|
}
|
|
2792
3405
|
|
|
2793
|
-
|
|
3406
|
+
template <class T, HWY_IF_UI64(T)>
|
|
3407
|
+
HWY_INLINE Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) {
|
|
2794
3408
|
#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
|
|
2795
|
-
using
|
|
2796
|
-
const
|
|
3409
|
+
using V64 = typename detail::Raw128<T>::type;
|
|
3410
|
+
const V64 mul128_result = reinterpret_cast<V64>(vec_mulo(a.raw, b.raw));
|
|
2797
3411
|
#if HWY_IS_LITTLE_ENDIAN
|
|
2798
|
-
return Vec128<
|
|
3412
|
+
return Vec128<T>{mul128_result};
|
|
2799
3413
|
#else
|
|
2800
3414
|
// Need to swap the two halves of mul128_result on big-endian targets as
|
|
2801
3415
|
// the upper 64 bits of the product are in lane 0 of mul128_result and
|
|
2802
3416
|
// the lower 64 bits of the product are in lane 1 of mul128_result
|
|
2803
|
-
return Vec128<
|
|
3417
|
+
return Vec128<T>{vec_sld(mul128_result, mul128_result, 8)};
|
|
2804
3418
|
#endif
|
|
2805
3419
|
#else
|
|
2806
|
-
alignas(16)
|
|
2807
|
-
const Full64<
|
|
3420
|
+
alignas(16) T mul[2];
|
|
3421
|
+
const Full64<T> d2;
|
|
2808
3422
|
mul[0] =
|
|
2809
3423
|
Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]);
|
|
2810
|
-
return Load(Full128<
|
|
3424
|
+
return Load(Full128<T>(), mul);
|
|
2811
3425
|
#endif
|
|
2812
3426
|
}
|
|
2813
3427
|
|
|
3428
|
+
// ------------------------------ PromoteEvenTo/PromoteOddTo
|
|
3429
|
+
#include "hwy/ops/inside-inl.h"
|
|
3430
|
+
|
|
2814
3431
|
// ------------------------------ WidenMulPairwiseAdd
|
|
2815
3432
|
|
|
2816
|
-
template <class
|
|
2817
|
-
class
|
|
2818
|
-
HWY_API VFromD<
|
|
2819
|
-
|
|
2820
|
-
|
|
2821
|
-
// longer-latency lane-crossing PromoteTo. Using shift/and instead of Zip
|
|
2822
|
-
// leads to the odd/even order that RearrangeToOddPlusEven prefers.
|
|
2823
|
-
using VU32 = VFromD<decltype(du32)>;
|
|
2824
|
-
const VU32 odd = Set(du32, 0xFFFF0000u);
|
|
2825
|
-
const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
|
|
2826
|
-
const VU32 ao = And(BitCast(du32, a), odd);
|
|
2827
|
-
const VU32 be = ShiftLeft<16>(BitCast(du32, b));
|
|
2828
|
-
const VU32 bo = And(BitCast(du32, b), odd);
|
|
2829
|
-
return MulAdd(BitCast(df32, ae), BitCast(df32, be),
|
|
2830
|
-
Mul(BitCast(df32, ao), BitCast(df32, bo)));
|
|
3433
|
+
template <class DF, HWY_IF_F32_D(DF),
|
|
3434
|
+
class VBF = VFromD<Repartition<bfloat16_t, DF>>>
|
|
3435
|
+
HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
|
|
3436
|
+
return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b),
|
|
3437
|
+
Mul(PromoteOddTo(df, a), PromoteOddTo(df, b)));
|
|
2831
3438
|
}
|
|
2832
3439
|
|
|
2833
3440
|
// Even if N=1, the input is always at least 2 lanes, hence vec_msum is safe.
|
|
2834
3441
|
template <class D32, HWY_IF_UI32_D(D32),
|
|
2835
3442
|
class V16 = VFromD<RepartitionToNarrow<D32>>>
|
|
2836
3443
|
HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 d32, V16 a, V16 b) {
|
|
3444
|
+
#if HWY_S390X_HAVE_Z14
|
|
3445
|
+
(void)d32;
|
|
3446
|
+
return MulEven(a, b) + MulOdd(a, b);
|
|
3447
|
+
#else
|
|
2837
3448
|
return VFromD<D32>{vec_msum(a.raw, b.raw, Zero(d32).raw)};
|
|
3449
|
+
#endif
|
|
2838
3450
|
}
|
|
2839
3451
|
|
|
2840
3452
|
// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
|
|
2841
3453
|
|
|
2842
|
-
template <class D32, HWY_IF_F32_D(D32),
|
|
2843
|
-
class V16 = VFromD<Repartition<bfloat16_t, D32>>>
|
|
2844
|
-
HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b,
|
|
2845
|
-
VFromD<D32> sum0,
|
|
2846
|
-
VFromD<D32>& sum1) {
|
|
2847
|
-
const RebindToUnsigned<decltype(df32)> du32;
|
|
2848
|
-
// Lane order within sum0/1 is undefined, hence we can avoid the
|
|
2849
|
-
// longer-latency lane-crossing PromoteTo. Using shift/and instead of Zip
|
|
2850
|
-
// leads to the odd/even order that RearrangeToOddPlusEven prefers.
|
|
2851
|
-
using VU32 = VFromD<decltype(du32)>;
|
|
2852
|
-
const VU32 odd = Set(du32, 0xFFFF0000u);
|
|
2853
|
-
const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
|
|
2854
|
-
const VU32 ao = And(BitCast(du32, a), odd);
|
|
2855
|
-
const VU32 be = ShiftLeft<16>(BitCast(du32, b));
|
|
2856
|
-
const VU32 bo = And(BitCast(du32, b), odd);
|
|
2857
|
-
sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
|
|
2858
|
-
return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
|
|
2859
|
-
}
|
|
2860
|
-
|
|
2861
3454
|
// Even if N=1, the input is always at least 2 lanes, hence vec_msum is safe.
|
|
2862
3455
|
template <class D32, HWY_IF_UI32_D(D32),
|
|
2863
3456
|
class V16 = VFromD<RepartitionToNarrow<D32>>>
|
|
2864
|
-
HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 /*
|
|
3457
|
+
HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 /*d32*/, V16 a, V16 b,
|
|
2865
3458
|
VFromD<D32> sum0,
|
|
2866
3459
|
VFromD<D32>& /*sum1*/) {
|
|
3460
|
+
#if HWY_S390X_HAVE_Z14
|
|
3461
|
+
return MulEven(a, b) + MulOdd(a, b) + sum0;
|
|
3462
|
+
#else
|
|
2867
3463
|
return VFromD<D32>{vec_msum(a.raw, b.raw, sum0.raw)};
|
|
3464
|
+
#endif
|
|
2868
3465
|
}
|
|
2869
3466
|
|
|
2870
3467
|
// ------------------------------ RearrangeToOddPlusEven
|
|
@@ -2885,7 +3482,27 @@ HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) {
|
|
|
2885
3482
|
return Add(sum0, sum1);
|
|
2886
3483
|
}
|
|
2887
3484
|
|
|
3485
|
+
// ------------------------------ SatWidenMulPairwiseAccumulate
|
|
3486
|
+
#if !HWY_S390X_HAVE_Z14
|
|
3487
|
+
|
|
3488
|
+
#ifdef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
|
|
3489
|
+
#undef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
|
|
3490
|
+
#else
|
|
3491
|
+
#define HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
|
|
3492
|
+
#endif
|
|
3493
|
+
|
|
3494
|
+
template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 16)>
|
|
3495
|
+
HWY_API VFromD<DI32> SatWidenMulPairwiseAccumulate(
|
|
3496
|
+
DI32 /* tag */, VFromD<Repartition<int16_t, DI32>> a,
|
|
3497
|
+
VFromD<Repartition<int16_t, DI32>> b, VFromD<DI32> sum) {
|
|
3498
|
+
return VFromD<DI32>{vec_msums(a.raw, b.raw, sum.raw)};
|
|
3499
|
+
}
|
|
3500
|
+
|
|
3501
|
+
#endif // !HWY_S390X_HAVE_Z14
|
|
3502
|
+
|
|
2888
3503
|
// ------------------------------ SumOfMulQuadAccumulate
|
|
3504
|
+
#if !HWY_S390X_HAVE_Z14
|
|
3505
|
+
|
|
2889
3506
|
#ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
|
|
2890
3507
|
#undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
|
|
2891
3508
|
#else
|
|
@@ -2925,11 +3542,12 @@ HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 di32,
|
|
|
2925
3542
|
|
|
2926
3543
|
const auto result_sum_0 =
|
|
2927
3544
|
SumOfMulQuadAccumulate(di32, BitCast(du8, a), b, sum);
|
|
2928
|
-
const auto result_sum_1 = ShiftLeft<8>(
|
|
2929
|
-
di32, And(b, BroadcastSignBit(a)).raw, Zero(di32).raw));
|
|
3545
|
+
const auto result_sum_1 = ShiftLeft<8>(SumsOf4(And(b, BroadcastSignBit(a))));
|
|
2930
3546
|
return result_sum_0 - result_sum_1;
|
|
2931
3547
|
}
|
|
2932
3548
|
|
|
3549
|
+
#endif // !HWY_S390X_HAVE_Z14
|
|
3550
|
+
|
|
2933
3551
|
// ================================================== CONVERT
|
|
2934
3552
|
|
|
2935
3553
|
// ------------------------------ Promotions (part w/ narrow lanes -> full)
|
|
@@ -3018,29 +3636,59 @@ HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
|
|
|
3018
3636
|
}
|
|
3019
3637
|
|
|
3020
3638
|
template <class D, HWY_IF_F64_D(D)>
|
|
3021
|
-
HWY_API VFromD<D> PromoteTo(D
|
|
3639
|
+
HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<int32_t, D>> v) {
|
|
3640
|
+
#if HWY_S390X_HAVE_Z14
|
|
3641
|
+
const RebindToSigned<decltype(df64)> di64;
|
|
3642
|
+
return ConvertTo(df64, PromoteTo(di64, v));
|
|
3643
|
+
#else // VSX
|
|
3644
|
+
(void)df64;
|
|
3022
3645
|
const __vector signed int raw_v = InterleaveLower(v, v).raw;
|
|
3023
3646
|
#if HWY_IS_LITTLE_ENDIAN
|
|
3024
3647
|
return VFromD<D>{vec_doubleo(raw_v)};
|
|
3025
3648
|
#else
|
|
3026
3649
|
return VFromD<D>{vec_doublee(raw_v)};
|
|
3027
3650
|
#endif
|
|
3651
|
+
#endif // HWY_S390X_HAVE_Z14
|
|
3028
3652
|
}
|
|
3029
3653
|
|
|
3030
3654
|
template <class D, HWY_IF_F64_D(D)>
|
|
3031
|
-
HWY_API VFromD<D> PromoteTo(D
|
|
3655
|
+
HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<uint32_t, D>> v) {
|
|
3656
|
+
#if HWY_S390X_HAVE_Z14
|
|
3657
|
+
const RebindToUnsigned<decltype(df64)> du64;
|
|
3658
|
+
return ConvertTo(df64, PromoteTo(du64, v));
|
|
3659
|
+
#else // VSX
|
|
3660
|
+
(void)df64;
|
|
3032
3661
|
const __vector unsigned int raw_v = InterleaveLower(v, v).raw;
|
|
3033
3662
|
#if HWY_IS_LITTLE_ENDIAN
|
|
3034
3663
|
return VFromD<D>{vec_doubleo(raw_v)};
|
|
3035
3664
|
#else
|
|
3036
3665
|
return VFromD<D>{vec_doublee(raw_v)};
|
|
3037
3666
|
#endif
|
|
3667
|
+
#endif // HWY_S390X_HAVE_Z14
|
|
3038
3668
|
}
|
|
3039
3669
|
|
|
3670
|
+
#if !HWY_S390X_HAVE_Z14
|
|
3671
|
+
namespace detail {
|
|
3672
|
+
|
|
3673
|
+
template <class V>
|
|
3674
|
+
static HWY_INLINE V VsxF2INormalizeSrcVals(V v) {
|
|
3675
|
+
#if !defined(HWY_DISABLE_PPC_VSX_QEMU_F2I_WORKAROUND)
|
|
3676
|
+
// Workaround for QEMU 7/8 VSX float to int conversion bug
|
|
3677
|
+
return IfThenElseZero(v == v, v);
|
|
3678
|
+
#else
|
|
3679
|
+
return v;
|
|
3680
|
+
#endif
|
|
3681
|
+
}
|
|
3682
|
+
|
|
3683
|
+
} // namespace detail
|
|
3684
|
+
#endif // !HWY_S390X_HAVE_Z14
|
|
3685
|
+
|
|
3040
3686
|
template <class D, HWY_IF_I64_D(D)>
|
|
3041
3687
|
HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
|
|
3042
|
-
#if
|
|
3043
|
-
|
|
3688
|
+
#if !HWY_S390X_HAVE_Z14 && \
|
|
3689
|
+
(HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds))
|
|
3690
|
+
const __vector float raw_v =
|
|
3691
|
+
detail::VsxF2INormalizeSrcVals(InterleaveLower(v, v)).raw;
|
|
3044
3692
|
return VFromD<decltype(di64)>{__builtin_vsx_xvcvspsxds(raw_v)};
|
|
3045
3693
|
#else
|
|
3046
3694
|
const RebindToFloat<decltype(di64)> df64;
|
|
@@ -3050,8 +3698,10 @@ HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
|
|
|
3050
3698
|
|
|
3051
3699
|
template <class D, HWY_IF_U64_D(D)>
|
|
3052
3700
|
HWY_API VFromD<D> PromoteTo(D du64, VFromD<Rebind<float, D>> v) {
|
|
3053
|
-
#if
|
|
3054
|
-
|
|
3701
|
+
#if !HWY_S390X_HAVE_Z14 && \
|
|
3702
|
+
(HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds))
|
|
3703
|
+
const __vector float raw_v =
|
|
3704
|
+
detail::VsxF2INormalizeSrcVals(InterleaveLower(v, v)).raw;
|
|
3055
3705
|
return VFromD<decltype(du64)>{reinterpret_cast<__vector unsigned long long>(
|
|
3056
3706
|
__builtin_vsx_xvcvspuxds(raw_v))};
|
|
3057
3707
|
#else
|
|
@@ -3123,7 +3773,12 @@ HWY_API VFromD<D> PromoteUpperTo(D /*tag*/, Vec128<float> v) {
|
|
|
3123
3773
|
}
|
|
3124
3774
|
|
|
3125
3775
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
|
|
3126
|
-
HWY_API VFromD<D> PromoteUpperTo(D
|
|
3776
|
+
HWY_API VFromD<D> PromoteUpperTo(D df64, Vec128<int32_t> v) {
|
|
3777
|
+
#if HWY_S390X_HAVE_Z14
|
|
3778
|
+
const RebindToSigned<decltype(df64)> di64;
|
|
3779
|
+
return ConvertTo(df64, PromoteUpperTo(di64, v));
|
|
3780
|
+
#else // VSX
|
|
3781
|
+
(void)df64;
|
|
3127
3782
|
const __vector signed int raw_v =
|
|
3128
3783
|
InterleaveUpper(Full128<int32_t>(), v, v).raw;
|
|
3129
3784
|
#if HWY_IS_LITTLE_ENDIAN
|
|
@@ -3131,10 +3786,16 @@ HWY_API VFromD<D> PromoteUpperTo(D /*tag*/, Vec128<int32_t> v) {
|
|
|
3131
3786
|
#else
|
|
3132
3787
|
return VFromD<D>{vec_doublee(raw_v)};
|
|
3133
3788
|
#endif
|
|
3789
|
+
#endif // HWY_S390X_HAVE_Z14
|
|
3134
3790
|
}
|
|
3135
3791
|
|
|
3136
3792
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
|
|
3137
|
-
HWY_API VFromD<D> PromoteUpperTo(D
|
|
3793
|
+
HWY_API VFromD<D> PromoteUpperTo(D df64, Vec128<uint32_t> v) {
|
|
3794
|
+
#if HWY_S390X_HAVE_Z14
|
|
3795
|
+
const RebindToUnsigned<decltype(df64)> du64;
|
|
3796
|
+
return ConvertTo(df64, PromoteUpperTo(du64, v));
|
|
3797
|
+
#else // VSX
|
|
3798
|
+
(void)df64;
|
|
3138
3799
|
const __vector unsigned int raw_v =
|
|
3139
3800
|
InterleaveUpper(Full128<uint32_t>(), v, v).raw;
|
|
3140
3801
|
#if HWY_IS_LITTLE_ENDIAN
|
|
@@ -3142,12 +3803,16 @@ HWY_API VFromD<D> PromoteUpperTo(D /*tag*/, Vec128<uint32_t> v) {
|
|
|
3142
3803
|
#else
|
|
3143
3804
|
return VFromD<D>{vec_doublee(raw_v)};
|
|
3144
3805
|
#endif
|
|
3806
|
+
#endif // HWY_S390X_HAVE_Z14
|
|
3145
3807
|
}
|
|
3146
3808
|
|
|
3147
3809
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)>
|
|
3148
3810
|
HWY_API VFromD<D> PromoteUpperTo(D di64, Vec128<float> v) {
|
|
3149
|
-
#if
|
|
3150
|
-
|
|
3811
|
+
#if !HWY_S390X_HAVE_Z14 && \
|
|
3812
|
+
(HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds))
|
|
3813
|
+
const __vector float raw_v =
|
|
3814
|
+
detail::VsxF2INormalizeSrcVals(InterleaveUpper(Full128<float>(), v, v))
|
|
3815
|
+
.raw;
|
|
3151
3816
|
return VFromD<decltype(di64)>{__builtin_vsx_xvcvspsxds(raw_v)};
|
|
3152
3817
|
#else
|
|
3153
3818
|
const RebindToFloat<decltype(di64)> df64;
|
|
@@ -3157,8 +3822,11 @@ HWY_API VFromD<D> PromoteUpperTo(D di64, Vec128<float> v) {
|
|
|
3157
3822
|
|
|
3158
3823
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U64_D(D)>
|
|
3159
3824
|
HWY_API VFromD<D> PromoteUpperTo(D du64, Vec128<float> v) {
|
|
3160
|
-
#if
|
|
3161
|
-
|
|
3825
|
+
#if !HWY_S390X_HAVE_Z14 && \
|
|
3826
|
+
(HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds))
|
|
3827
|
+
const __vector float raw_v =
|
|
3828
|
+
detail::VsxF2INormalizeSrcVals(InterleaveUpper(Full128<float>(), v, v))
|
|
3829
|
+
.raw;
|
|
3162
3830
|
return VFromD<decltype(du64)>{reinterpret_cast<__vector unsigned long long>(
|
|
3163
3831
|
__builtin_vsx_xvcvspuxds(raw_v))};
|
|
3164
3832
|
#else
|
|
@@ -3174,6 +3842,219 @@ HWY_API VFromD<D> PromoteUpperTo(D d, V v) {
|
|
|
3174
3842
|
return PromoteTo(d, UpperHalf(dh, v));
|
|
3175
3843
|
}
|
|
3176
3844
|
|
|
3845
|
+
// ------------------------------ PromoteEvenTo/PromoteOddTo
|
|
3846
|
+
|
|
3847
|
+
namespace detail {
|
|
3848
|
+
|
|
3849
|
+
// Signed to Signed PromoteEvenTo/PromoteOddTo for PPC9/PPC10
|
|
3850
|
+
#if HWY_PPC_HAVE_9 && \
|
|
3851
|
+
(HWY_COMPILER_GCC_ACTUAL >= 1200 || HWY_COMPILER_CLANG >= 1200)
|
|
3852
|
+
|
|
3853
|
+
#if HWY_IS_LITTLE_ENDIAN
|
|
3854
|
+
template <class D, class V>
|
|
3855
|
+
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
|
|
3856
|
+
hwy::SizeTag<4> /*to_lane_size_tag*/,
|
|
3857
|
+
hwy::SignedTag /*from_type_tag*/, D /*d_to*/,
|
|
3858
|
+
V v) {
|
|
3859
|
+
return VFromD<D>{vec_signexti(v.raw)};
|
|
3860
|
+
}
|
|
3861
|
+
template <class D, class V>
|
|
3862
|
+
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
|
|
3863
|
+
hwy::SizeTag<8> /*to_lane_size_tag*/,
|
|
3864
|
+
hwy::SignedTag /*from_type_tag*/, D /*d_to*/,
|
|
3865
|
+
V v) {
|
|
3866
|
+
return VFromD<D>{vec_signextll(v.raw)};
|
|
3867
|
+
}
|
|
3868
|
+
#else
|
|
3869
|
+
template <class D, class V>
|
|
3870
|
+
HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
|
|
3871
|
+
hwy::SizeTag<4> /*to_lane_size_tag*/,
|
|
3872
|
+
hwy::SignedTag /*from_type_tag*/, D /*d_to*/,
|
|
3873
|
+
V v) {
|
|
3874
|
+
return VFromD<D>{vec_signexti(v.raw)};
|
|
3875
|
+
}
|
|
3876
|
+
template <class D, class V>
|
|
3877
|
+
HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
|
|
3878
|
+
hwy::SizeTag<8> /*to_lane_size_tag*/,
|
|
3879
|
+
hwy::SignedTag /*from_type_tag*/, D /*d_to*/,
|
|
3880
|
+
V v) {
|
|
3881
|
+
return VFromD<D>{vec_signextll(v.raw)};
|
|
3882
|
+
}
|
|
3883
|
+
#endif // HWY_IS_LITTLE_ENDIAN
|
|
3884
|
+
|
|
3885
|
+
#endif // HWY_PPC_HAVE_9
|
|
3886
|
+
|
|
3887
|
+
// I32/U32/F32->F64 PromoteEvenTo
|
|
3888
|
+
#if HWY_S390X_HAVE_Z14
|
|
3889
|
+
template <class D, class V>
|
|
3890
|
+
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
|
|
3891
|
+
hwy::SizeTag<8> /*to_lane_size_tag*/,
|
|
3892
|
+
hwy::FloatTag /*from_type_tag*/, D /*d_to*/,
|
|
3893
|
+
V v) {
|
|
3894
|
+
return VFromD<D>{vec_doublee(v.raw)};
|
|
3895
|
+
}
|
|
3896
|
+
template <class D, class V, class FromTypeTag, HWY_IF_UI32(TFromV<V>)>
|
|
3897
|
+
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
|
|
3898
|
+
hwy::SizeTag<8> /*to_lane_size_tag*/,
|
|
3899
|
+
FromTypeTag /*from_type_tag*/, D d_to, V v) {
|
|
3900
|
+
const Rebind<MakeWide<TFromV<V>>, decltype(d_to)> dw;
|
|
3901
|
+
return ConvertTo(d_to, PromoteEvenTo(dw, v));
|
|
3902
|
+
}
|
|
3903
|
+
#else // VSX
|
|
3904
|
+
template <class D, class V, class FromTypeTag>
|
|
3905
|
+
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
|
|
3906
|
+
hwy::SizeTag<8> /*to_lane_size_tag*/,
|
|
3907
|
+
FromTypeTag /*from_type_tag*/, D /*d_to*/,
|
|
3908
|
+
V v) {
|
|
3909
|
+
return VFromD<D>{vec_doublee(v.raw)};
|
|
3910
|
+
}
|
|
3911
|
+
#endif // HWY_S390X_HAVE_Z14
|
|
3912
|
+
|
|
3913
|
+
// F32->I64 PromoteEvenTo
|
|
3914
|
+
template <class D, class V>
|
|
3915
|
+
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
|
|
3916
|
+
hwy::SizeTag<8> /*to_lane_size_tag*/,
|
|
3917
|
+
hwy::FloatTag /*from_type_tag*/, D d_to,
|
|
3918
|
+
V v) {
|
|
3919
|
+
#if !HWY_S390X_HAVE_Z14 && \
|
|
3920
|
+
(HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds))
|
|
3921
|
+
(void)d_to;
|
|
3922
|
+
const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
|
|
3923
|
+
#if HWY_IS_LITTLE_ENDIAN
|
|
3924
|
+
// __builtin_vsx_xvcvspsxds expects the source values to be in the odd lanes
|
|
3925
|
+
// on little-endian PPC, and the vec_sld operation below will shift the even
|
|
3926
|
+
// lanes of normalized_v into the odd lanes.
|
|
3927
|
+
return VFromD<D>{
|
|
3928
|
+
__builtin_vsx_xvcvspsxds(vec_sld(normalized_v.raw, normalized_v.raw, 4))};
|
|
3929
|
+
#else
|
|
3930
|
+
// __builtin_vsx_xvcvspsxds expects the source values to be in the even lanes
|
|
3931
|
+
// on big-endian PPC.
|
|
3932
|
+
return VFromD<D>{__builtin_vsx_xvcvspsxds(normalized_v.raw)};
|
|
3933
|
+
#endif
|
|
3934
|
+
#else
|
|
3935
|
+
const RebindToFloat<decltype(d_to)> df64;
|
|
3936
|
+
return ConvertTo(d_to, PromoteEvenTo(hwy::FloatTag(), hwy::SizeTag<8>(),
|
|
3937
|
+
hwy::FloatTag(), df64, v));
|
|
3938
|
+
#endif
|
|
3939
|
+
}
|
|
3940
|
+
|
|
3941
|
+
// F32->U64 PromoteEvenTo
|
|
3942
|
+
template <class D, class V>
|
|
3943
|
+
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::UnsignedTag /*to_type_tag*/,
|
|
3944
|
+
hwy::SizeTag<8> /*to_lane_size_tag*/,
|
|
3945
|
+
hwy::FloatTag /*from_type_tag*/, D d_to,
|
|
3946
|
+
V v) {
|
|
3947
|
+
#if !HWY_S390X_HAVE_Z14 && \
|
|
3948
|
+
(HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds))
|
|
3949
|
+
(void)d_to;
|
|
3950
|
+
const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
|
|
3951
|
+
#if HWY_IS_LITTLE_ENDIAN
|
|
3952
|
+
// __builtin_vsx_xvcvspuxds expects the source values to be in the odd lanes
|
|
3953
|
+
// on little-endian PPC, and the vec_sld operation below will shift the even
|
|
3954
|
+
// lanes of normalized_v into the odd lanes.
|
|
3955
|
+
return VFromD<D>{
|
|
3956
|
+
reinterpret_cast<__vector unsigned long long>(__builtin_vsx_xvcvspuxds(
|
|
3957
|
+
vec_sld(normalized_v.raw, normalized_v.raw, 4)))};
|
|
3958
|
+
#else
|
|
3959
|
+
// __builtin_vsx_xvcvspuxds expects the source values to be in the even lanes
|
|
3960
|
+
// on big-endian PPC.
|
|
3961
|
+
return VFromD<D>{reinterpret_cast<__vector unsigned long long>(
|
|
3962
|
+
__builtin_vsx_xvcvspuxds(normalized_v.raw))};
|
|
3963
|
+
#endif
|
|
3964
|
+
#else
|
|
3965
|
+
const RebindToFloat<decltype(d_to)> df64;
|
|
3966
|
+
return ConvertTo(d_to, PromoteEvenTo(hwy::FloatTag(), hwy::SizeTag<8>(),
|
|
3967
|
+
hwy::FloatTag(), df64, v));
|
|
3968
|
+
#endif
|
|
3969
|
+
}
|
|
3970
|
+
|
|
3971
|
+
// I32/U32/F32->F64 PromoteOddTo
|
|
3972
|
+
#if HWY_S390X_HAVE_Z14
|
|
3973
|
+
template <class D, class V>
|
|
3974
|
+
HWY_INLINE VFromD<D> PromoteOddTo(hwy::FloatTag /*to_type_tag*/,
|
|
3975
|
+
hwy::SizeTag<8> /*to_lane_size_tag*/,
|
|
3976
|
+
hwy::FloatTag /*from_type_tag*/, D d_to,
|
|
3977
|
+
V v) {
|
|
3978
|
+
return PromoteEvenTo(hwy::FloatTag(), hwy::SizeTag<8>(), hwy::FloatTag(),
|
|
3979
|
+
d_to, V{vec_sld(v.raw, v.raw, 4)});
|
|
3980
|
+
}
|
|
3981
|
+
template <class D, class V, class FromTypeTag, HWY_IF_UI32(TFromV<V>)>
|
|
3982
|
+
HWY_INLINE VFromD<D> PromoteOddTo(hwy::FloatTag /*to_type_tag*/,
|
|
3983
|
+
hwy::SizeTag<8> /*to_lane_size_tag*/,
|
|
3984
|
+
FromTypeTag /*from_type_tag*/, D d_to, V v) {
|
|
3985
|
+
const Rebind<MakeWide<TFromV<V>>, decltype(d_to)> dw;
|
|
3986
|
+
return ConvertTo(d_to, PromoteOddTo(dw, v));
|
|
3987
|
+
}
|
|
3988
|
+
#else
|
|
3989
|
+
template <class D, class V, class FromTypeTag>
|
|
3990
|
+
HWY_INLINE VFromD<D> PromoteOddTo(hwy::FloatTag /*to_type_tag*/,
|
|
3991
|
+
hwy::SizeTag<8> /*to_lane_size_tag*/,
|
|
3992
|
+
FromTypeTag /*from_type_tag*/, D /*d_to*/,
|
|
3993
|
+
V v) {
|
|
3994
|
+
return VFromD<D>{vec_doubleo(v.raw)};
|
|
3995
|
+
}
|
|
3996
|
+
#endif
|
|
3997
|
+
|
|
3998
|
+
// F32->I64 PromoteOddTo
|
|
3999
|
+
template <class D, class V>
|
|
4000
|
+
HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
|
|
4001
|
+
hwy::SizeTag<8> /*to_lane_size_tag*/,
|
|
4002
|
+
hwy::FloatTag /*from_type_tag*/, D d_to,
|
|
4003
|
+
V v) {
|
|
4004
|
+
#if !HWY_S390X_HAVE_Z14 && \
|
|
4005
|
+
(HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds))
|
|
4006
|
+
(void)d_to;
|
|
4007
|
+
const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
|
|
4008
|
+
#if HWY_IS_LITTLE_ENDIAN
|
|
4009
|
+
// __builtin_vsx_xvcvspsxds expects the source values to be in the odd lanes
|
|
4010
|
+
// on little-endian PPC
|
|
4011
|
+
return VFromD<D>{__builtin_vsx_xvcvspsxds(normalized_v.raw)};
|
|
4012
|
+
#else
|
|
4013
|
+
// __builtin_vsx_xvcvspsxds expects the source values to be in the even lanes
|
|
4014
|
+
// on big-endian PPC, and the vec_sld operation below will shift the odd lanes
|
|
4015
|
+
// of normalized_v into the even lanes.
|
|
4016
|
+
return VFromD<D>{
|
|
4017
|
+
__builtin_vsx_xvcvspsxds(vec_sld(normalized_v.raw, normalized_v.raw, 4))};
|
|
4018
|
+
#endif
|
|
4019
|
+
#else
|
|
4020
|
+
const RebindToFloat<decltype(d_to)> df64;
|
|
4021
|
+
return ConvertTo(d_to, PromoteOddTo(hwy::FloatTag(), hwy::SizeTag<8>(),
|
|
4022
|
+
hwy::FloatTag(), df64, v));
|
|
4023
|
+
#endif
|
|
4024
|
+
}
|
|
4025
|
+
|
|
4026
|
+
// F32->U64 PromoteOddTo
|
|
4027
|
+
template <class D, class V>
|
|
4028
|
+
HWY_INLINE VFromD<D> PromoteOddTo(hwy::UnsignedTag /*to_type_tag*/,
|
|
4029
|
+
hwy::SizeTag<8> /*to_lane_size_tag*/,
|
|
4030
|
+
hwy::FloatTag /*from_type_tag*/, D d_to,
|
|
4031
|
+
V v) {
|
|
4032
|
+
#if !HWY_S390X_HAVE_Z14 && \
|
|
4033
|
+
(HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds))
|
|
4034
|
+
(void)d_to;
|
|
4035
|
+
const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
|
|
4036
|
+
#if HWY_IS_LITTLE_ENDIAN
|
|
4037
|
+
// __builtin_vsx_xvcvspuxds expects the source values to be in the odd lanes
|
|
4038
|
+
// on little-endian PPC
|
|
4039
|
+
return VFromD<D>{reinterpret_cast<__vector unsigned long long>(
|
|
4040
|
+
__builtin_vsx_xvcvspuxds(normalized_v.raw))};
|
|
4041
|
+
#else
|
|
4042
|
+
// __builtin_vsx_xvcvspuxds expects the source values to be in the even lanes
|
|
4043
|
+
// on big-endian PPC, and the vec_sld operation below will shift the odd lanes
|
|
4044
|
+
// of normalized_v into the even lanes.
|
|
4045
|
+
return VFromD<D>{
|
|
4046
|
+
reinterpret_cast<__vector unsigned long long>(__builtin_vsx_xvcvspuxds(
|
|
4047
|
+
vec_sld(normalized_v.raw, normalized_v.raw, 4)))};
|
|
4048
|
+
#endif
|
|
4049
|
+
#else
|
|
4050
|
+
const RebindToFloat<decltype(d_to)> df64;
|
|
4051
|
+
return ConvertTo(d_to, PromoteOddTo(hwy::FloatTag(), hwy::SizeTag<8>(),
|
|
4052
|
+
hwy::FloatTag(), df64, v));
|
|
4053
|
+
#endif
|
|
4054
|
+
}
|
|
4055
|
+
|
|
4056
|
+
} // namespace detail
|
|
4057
|
+
|
|
3177
4058
|
// ------------------------------ Demotions (full -> part w/ narrow lanes)
|
|
3178
4059
|
|
|
3179
4060
|
template <class D, typename FromT, HWY_IF_UNSIGNED_D(D),
|
|
@@ -3254,29 +4135,138 @@ HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<float, D>> v) {
|
|
|
3254
4135
|
|
|
3255
4136
|
#endif // HWY_PPC_HAVE_9
|
|
3256
4137
|
|
|
3257
|
-
|
|
3258
|
-
|
|
3259
|
-
|
|
3260
|
-
|
|
3261
|
-
|
|
3262
|
-
|
|
4138
|
+
#if HWY_PPC_HAVE_9
|
|
4139
|
+
|
|
4140
|
+
#ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
|
|
4141
|
+
#undef HWY_NATIVE_DEMOTE_F64_TO_F16
|
|
4142
|
+
#else
|
|
4143
|
+
#define HWY_NATIVE_DEMOTE_F64_TO_F16
|
|
4144
|
+
#endif
|
|
4145
|
+
|
|
4146
|
+
namespace detail {
|
|
4147
|
+
|
|
4148
|
+
// On big-endian PPC9, VsxXscvdphp converts vf64[0] to a F16, returned as an U64
|
|
4149
|
+
// vector with the resulting F16 bits in the lower 16 bits of U64 lane 0
|
|
4150
|
+
|
|
4151
|
+
// On little-endian PPC9, VsxXscvdphp converts vf64[1] to a F16, returned as
|
|
4152
|
+
// an U64 vector with the resulting F16 bits in the lower 16 bits of U64 lane 1
|
|
4153
|
+
static HWY_INLINE Vec128<uint64_t> VsxXscvdphp(Vec128<double> vf64) {
|
|
4154
|
+
// Inline assembly is needed for the PPC9 xscvdphp instruction as there is
|
|
4155
|
+
// currently no intrinsic available for the PPC9 xscvdphp instruction
|
|
4156
|
+
__vector unsigned long long raw_result;
|
|
4157
|
+
__asm__("xscvdphp %x0, %x1" : "=wa"(raw_result) : "wa"(vf64.raw));
|
|
4158
|
+
return Vec128<uint64_t>{raw_result};
|
|
3263
4159
|
}
|
|
3264
4160
|
|
|
3265
|
-
|
|
3266
|
-
|
|
3267
|
-
|
|
3268
|
-
|
|
4161
|
+
} // namespace detail
|
|
4162
|
+
|
|
4163
|
+
template <class D, HWY_IF_F16_D(D), HWY_IF_LANES_D(D, 1)>
|
|
4164
|
+
HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
|
|
4165
|
+
const RebindToUnsigned<decltype(df16)> du16;
|
|
4166
|
+
const Rebind<uint64_t, decltype(df16)> du64;
|
|
4167
|
+
|
|
4168
|
+
const Full128<double> df64_full;
|
|
3269
4169
|
#if HWY_IS_LITTLE_ENDIAN
|
|
3270
|
-
const auto
|
|
3271
|
-
|
|
4170
|
+
const auto bits16_as_u64 =
|
|
4171
|
+
UpperHalf(du64, detail::VsxXscvdphp(Combine(df64_full, v, v)));
|
|
3272
4172
|
#else
|
|
3273
|
-
const auto
|
|
3274
|
-
|
|
4173
|
+
const auto bits16_as_u64 =
|
|
4174
|
+
LowerHalf(du64, detail::VsxXscvdphp(ResizeBitCast(df64_full, v)));
|
|
3275
4175
|
#endif
|
|
3276
|
-
|
|
3277
|
-
|
|
4176
|
+
|
|
4177
|
+
return BitCast(df16, TruncateTo(du16, bits16_as_u64));
|
|
4178
|
+
}
|
|
4179
|
+
|
|
4180
|
+
template <class D, HWY_IF_F16_D(D), HWY_IF_LANES_D(D, 2)>
|
|
4181
|
+
HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
|
|
4182
|
+
const RebindToUnsigned<decltype(df16)> du16;
|
|
4183
|
+
const Rebind<uint64_t, decltype(df16)> du64;
|
|
4184
|
+
const Rebind<double, decltype(df16)> df64;
|
|
4185
|
+
|
|
4186
|
+
#if HWY_IS_LITTLE_ENDIAN
|
|
4187
|
+
const auto bits64_as_u64_0 = detail::VsxXscvdphp(InterleaveLower(df64, v, v));
|
|
4188
|
+
const auto bits64_as_u64_1 = detail::VsxXscvdphp(v);
|
|
4189
|
+
const auto bits64_as_u64 =
|
|
4190
|
+
InterleaveUpper(du64, bits64_as_u64_0, bits64_as_u64_1);
|
|
4191
|
+
#else
|
|
4192
|
+
const auto bits64_as_u64_0 = detail::VsxXscvdphp(v);
|
|
4193
|
+
const auto bits64_as_u64_1 = detail::VsxXscvdphp(InterleaveUpper(df64, v, v));
|
|
4194
|
+
const auto bits64_as_u64 =
|
|
4195
|
+
InterleaveLower(du64, bits64_as_u64_0, bits64_as_u64_1);
|
|
4196
|
+
#endif
|
|
4197
|
+
|
|
4198
|
+
return BitCast(df16, TruncateTo(du16, bits64_as_u64));
|
|
4199
|
+
}
|
|
4200
|
+
|
|
4201
|
+
#elif HWY_S390X_HAVE_Z14
|
|
4202
|
+
|
|
4203
|
+
#ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
|
|
4204
|
+
#undef HWY_NATIVE_DEMOTE_F64_TO_F16
|
|
4205
|
+
#else
|
|
4206
|
+
#define HWY_NATIVE_DEMOTE_F64_TO_F16
|
|
4207
|
+
#endif
|
|
4208
|
+
|
|
4209
|
+
namespace detail {
|
|
4210
|
+
|
|
4211
|
+
template <class DF32, HWY_IF_F32_D(DF32)>
|
|
4212
|
+
static HWY_INLINE VFromD<DF32> DemoteToF32WithRoundToOdd(
|
|
4213
|
+
DF32 df32, VFromD<Rebind<double, DF32>> v) {
|
|
4214
|
+
const Twice<DF32> dt_f32;
|
|
4215
|
+
|
|
4216
|
+
__vector float raw_f32_in_even;
|
|
4217
|
+
__asm__("vledb %0,%1,0,3" : "=v"(raw_f32_in_even) : "v"(v.raw));
|
|
4218
|
+
|
|
4219
|
+
const VFromD<decltype(dt_f32)> f32_in_even{raw_f32_in_even};
|
|
4220
|
+
return LowerHalf(df32, ConcatEven(dt_f32, f32_in_even, f32_in_even));
|
|
4221
|
+
}
|
|
4222
|
+
|
|
4223
|
+
} // namespace detail
|
|
4224
|
+
|
|
4225
|
+
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_F16_D(D)>
|
|
4226
|
+
HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
|
|
4227
|
+
const Rebind<float, decltype(df16)> df32;
|
|
4228
|
+
return DemoteTo(df16, detail::DemoteToF32WithRoundToOdd(df32, v));
|
|
3278
4229
|
}
|
|
3279
4230
|
|
|
4231
|
+
#endif // HWY_PPC_HAVE_9
|
|
4232
|
+
|
|
4233
|
+
#if HWY_PPC_HAVE_10 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvspbf16)
|
|
4234
|
+
|
|
4235
|
+
#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
|
|
4236
|
+
#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
|
|
4237
|
+
#else
|
|
4238
|
+
#define HWY_NATIVE_DEMOTE_F32_TO_BF16
|
|
4239
|
+
#endif
|
|
4240
|
+
|
|
4241
|
+
namespace detail {
|
|
4242
|
+
|
|
4243
|
+
// VsxXvcvspbf16 converts a F32 vector to a BF16 vector, bitcasted to an U32
|
|
4244
|
+
// vector with the resulting BF16 bits in the lower 16 bits of each U32 lane
|
|
4245
|
+
template <class D, HWY_IF_BF16_D(D)>
|
|
4246
|
+
static HWY_INLINE VFromD<Rebind<uint32_t, D>> VsxXvcvspbf16(
|
|
4247
|
+
D dbf16, VFromD<Rebind<float, D>> v) {
|
|
4248
|
+
const Rebind<uint32_t, decltype(dbf16)> du32;
|
|
4249
|
+
const Repartition<uint8_t, decltype(du32)> du32_as_du8;
|
|
4250
|
+
|
|
4251
|
+
using VU32 = __vector unsigned int;
|
|
4252
|
+
|
|
4253
|
+
// Even though the __builtin_vsx_xvcvspbf16 builtin performs a F32 to BF16
|
|
4254
|
+
// conversion, the __builtin_vsx_xvcvspbf16 intrinsic expects a
|
|
4255
|
+
// __vector unsigned char argument (at least as of GCC 13 and Clang 17)
|
|
4256
|
+
return VFromD<Rebind<uint32_t, D>>{reinterpret_cast<VU32>(
|
|
4257
|
+
__builtin_vsx_xvcvspbf16(BitCast(du32_as_du8, v).raw))};
|
|
4258
|
+
}
|
|
4259
|
+
|
|
4260
|
+
} // namespace detail
|
|
4261
|
+
|
|
4262
|
+
template <class D, HWY_IF_BF16_D(D)>
|
|
4263
|
+
HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) {
|
|
4264
|
+
const RebindToUnsigned<decltype(dbf16)> du16;
|
|
4265
|
+
return BitCast(dbf16, TruncateTo(du16, detail::VsxXvcvspbf16(dbf16, v)));
|
|
4266
|
+
}
|
|
4267
|
+
|
|
4268
|
+
#endif // HWY_PPC_HAVE_10 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvspbf16)
|
|
4269
|
+
|
|
3280
4270
|
// Specializations for partial vectors because vec_packs sets lanes above 2*N.
|
|
3281
4271
|
template <class DN, typename V, HWY_IF_V_SIZE_LE_D(DN, 4), HWY_IF_SIGNED_D(DN),
|
|
3282
4272
|
HWY_IF_SIGNED_V(V),
|
|
@@ -3368,6 +4358,18 @@ HWY_API VFromD<DN> ReorderDemote2To(DN /*dn*/, V a, V b) {
|
|
|
3368
4358
|
return VFromD<DN>{vec_packs(a.raw, b.raw)};
|
|
3369
4359
|
}
|
|
3370
4360
|
|
|
4361
|
+
#if HWY_PPC_HAVE_10 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvspbf16)
|
|
4362
|
+
template <class D, class V, HWY_IF_BF16_D(D), HWY_IF_F32(TFromV<V>),
|
|
4363
|
+
HWY_IF_LANES_D(D, HWY_MAX_LANES_V(V) * 2)>
|
|
4364
|
+
HWY_API VFromD<D> ReorderDemote2To(D dbf16, V a, V b) {
|
|
4365
|
+
const RebindToUnsigned<decltype(dbf16)> du16;
|
|
4366
|
+
const Half<decltype(dbf16)> dh_bf16;
|
|
4367
|
+
return BitCast(dbf16,
|
|
4368
|
+
OrderedTruncate2To(du16, detail::VsxXvcvspbf16(dh_bf16, a),
|
|
4369
|
+
detail::VsxXvcvspbf16(dh_bf16, b)));
|
|
4370
|
+
}
|
|
4371
|
+
#endif
|
|
4372
|
+
|
|
3371
4373
|
template <class D, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>), class V,
|
|
3372
4374
|
HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
|
|
3373
4375
|
HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
|
|
@@ -3376,15 +4378,13 @@ HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
|
|
|
3376
4378
|
return ReorderDemote2To(d, a, b);
|
|
3377
4379
|
}
|
|
3378
4380
|
|
|
3379
|
-
|
|
3380
|
-
|
|
3381
|
-
|
|
3382
|
-
|
|
3383
|
-
return
|
|
3384
|
-
#else
|
|
3385
|
-
return BitCast(dbf16, ConcatEven(du16, BitCast(du16, b), BitCast(du16, a)));
|
|
3386
|
-
#endif
|
|
4381
|
+
#if HWY_PPC_HAVE_10 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvspbf16)
|
|
4382
|
+
template <class D, HWY_IF_BF16_D(D), class V, HWY_IF_F32(TFromV<V>),
|
|
4383
|
+
HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)>
|
|
4384
|
+
HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
|
|
4385
|
+
return ReorderDemote2To(d, a, b);
|
|
3387
4386
|
}
|
|
4387
|
+
#endif
|
|
3388
4388
|
|
|
3389
4389
|
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
|
|
3390
4390
|
HWY_API Vec32<float> DemoteTo(D /* tag */, Vec64<double> v) {
|
|
@@ -3393,90 +4393,164 @@ HWY_API Vec32<float> DemoteTo(D /* tag */, Vec64<double> v) {
|
|
|
3393
4393
|
|
|
3394
4394
|
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
|
|
3395
4395
|
HWY_API Vec64<float> DemoteTo(D d, Vec128<double> v) {
|
|
3396
|
-
#if HWY_IS_LITTLE_ENDIAN
|
|
4396
|
+
#if HWY_S390X_HAVE_Z14 || HWY_IS_LITTLE_ENDIAN
|
|
3397
4397
|
const Vec128<float> f64_to_f32{vec_floate(v.raw)};
|
|
3398
4398
|
#else
|
|
3399
4399
|
const Vec128<float> f64_to_f32{vec_floato(v.raw)};
|
|
3400
4400
|
#endif
|
|
3401
4401
|
|
|
4402
|
+
#if HWY_S390X_HAVE_Z14
|
|
4403
|
+
const Twice<decltype(d)> dt;
|
|
4404
|
+
return LowerHalf(d, ConcatEven(dt, f64_to_f32, f64_to_f32));
|
|
4405
|
+
#else
|
|
3402
4406
|
const RebindToUnsigned<D> du;
|
|
3403
4407
|
const Rebind<uint64_t, D> du64;
|
|
3404
4408
|
return Vec64<float>{
|
|
3405
4409
|
BitCast(d, TruncateTo(du, BitCast(du64, f64_to_f32))).raw};
|
|
4410
|
+
#endif
|
|
3406
4411
|
}
|
|
3407
4412
|
|
|
3408
4413
|
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I32_D(D)>
|
|
3409
|
-
HWY_API Vec32<int32_t> DemoteTo(D
|
|
3410
|
-
|
|
4414
|
+
HWY_API Vec32<int32_t> DemoteTo(D di32, Vec64<double> v) {
|
|
4415
|
+
#if HWY_S390X_HAVE_Z14
|
|
4416
|
+
const Rebind<int64_t, decltype(di32)> di64;
|
|
4417
|
+
return DemoteTo(di32, ConvertTo(di64, v));
|
|
4418
|
+
#else
|
|
4419
|
+
(void)di32;
|
|
4420
|
+
return Vec32<int32_t>{vec_signede(detail::VsxF2INormalizeSrcVals(v).raw)};
|
|
4421
|
+
#endif
|
|
3411
4422
|
}
|
|
3412
4423
|
|
|
3413
4424
|
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I32_D(D)>
|
|
3414
|
-
HWY_API Vec64<int32_t> DemoteTo(D
|
|
4425
|
+
HWY_API Vec64<int32_t> DemoteTo(D di32, Vec128<double> v) {
|
|
4426
|
+
#if HWY_S390X_HAVE_Z14
|
|
4427
|
+
const Rebind<int64_t, decltype(di32)> di64;
|
|
4428
|
+
return DemoteTo(di32, ConvertTo(di64, v));
|
|
4429
|
+
#else
|
|
4430
|
+
(void)di32;
|
|
4431
|
+
|
|
3415
4432
|
#if HWY_IS_LITTLE_ENDIAN
|
|
3416
|
-
const Vec128<int32_t> f64_to_i32{
|
|
4433
|
+
const Vec128<int32_t> f64_to_i32{
|
|
4434
|
+
vec_signede(detail::VsxF2INormalizeSrcVals(v).raw)};
|
|
3417
4435
|
#else
|
|
3418
|
-
const Vec128<int32_t> f64_to_i32{
|
|
4436
|
+
const Vec128<int32_t> f64_to_i32{
|
|
4437
|
+
vec_signedo(detail::VsxF2INormalizeSrcVals(v).raw)};
|
|
3419
4438
|
#endif
|
|
3420
4439
|
|
|
3421
4440
|
const Rebind<int64_t, D> di64;
|
|
3422
4441
|
const Vec128<int64_t> vi64 = BitCast(di64, f64_to_i32);
|
|
3423
4442
|
return Vec64<int32_t>{vec_pack(vi64.raw, vi64.raw)};
|
|
4443
|
+
#endif
|
|
3424
4444
|
}
|
|
3425
4445
|
|
|
3426
4446
|
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U32_D(D)>
|
|
3427
|
-
HWY_API Vec32<uint32_t> DemoteTo(D
|
|
3428
|
-
|
|
4447
|
+
HWY_API Vec32<uint32_t> DemoteTo(D du32, Vec64<double> v) {
|
|
4448
|
+
#if HWY_S390X_HAVE_Z14
|
|
4449
|
+
const Rebind<uint64_t, decltype(du32)> du64;
|
|
4450
|
+
return DemoteTo(du32, ConvertTo(du64, v));
|
|
4451
|
+
#else
|
|
4452
|
+
(void)du32;
|
|
4453
|
+
return Vec32<uint32_t>{vec_unsignede(detail::VsxF2INormalizeSrcVals(v).raw)};
|
|
4454
|
+
#endif
|
|
3429
4455
|
}
|
|
3430
4456
|
|
|
3431
4457
|
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U32_D(D)>
|
|
3432
|
-
HWY_API Vec64<uint32_t> DemoteTo(D
|
|
4458
|
+
HWY_API Vec64<uint32_t> DemoteTo(D du32, Vec128<double> v) {
|
|
4459
|
+
#if HWY_S390X_HAVE_Z14
|
|
4460
|
+
const Rebind<uint64_t, decltype(du32)> du64;
|
|
4461
|
+
return DemoteTo(du32, ConvertTo(du64, v));
|
|
4462
|
+
#else
|
|
4463
|
+
(void)du32;
|
|
3433
4464
|
#if HWY_IS_LITTLE_ENDIAN
|
|
3434
|
-
const Vec128<uint32_t> f64_to_u32{
|
|
4465
|
+
const Vec128<uint32_t> f64_to_u32{
|
|
4466
|
+
vec_unsignede(detail::VsxF2INormalizeSrcVals(v).raw)};
|
|
3435
4467
|
#else
|
|
3436
|
-
const Vec128<uint32_t> f64_to_u32{
|
|
4468
|
+
const Vec128<uint32_t> f64_to_u32{
|
|
4469
|
+
vec_unsignedo(detail::VsxF2INormalizeSrcVals(v).raw)};
|
|
3437
4470
|
#endif
|
|
3438
4471
|
|
|
3439
4472
|
const Rebind<uint64_t, D> du64;
|
|
3440
4473
|
const Vec128<uint64_t> vu64 = BitCast(du64, f64_to_u32);
|
|
3441
4474
|
return Vec64<uint32_t>{vec_pack(vu64.raw, vu64.raw)};
|
|
4475
|
+
#endif
|
|
4476
|
+
}
|
|
4477
|
+
|
|
4478
|
+
#if HWY_S390X_HAVE_Z14
|
|
4479
|
+
namespace detail {
|
|
4480
|
+
|
|
4481
|
+
template <class V, HWY_IF_I64(TFromV<V>)>
|
|
4482
|
+
HWY_INLINE VFromD<RebindToFloat<DFromV<V>>> ConvToF64WithRoundToOdd(V v) {
|
|
4483
|
+
__vector double raw_result;
|
|
4484
|
+
// Use inline assembly to do a round-to-odd I64->F64 conversion on Z14
|
|
4485
|
+
__asm__("vcdgb %0,%1,0,3" : "=v"(raw_result) : "v"(v.raw));
|
|
4486
|
+
return VFromD<RebindToFloat<DFromV<V>>>{raw_result};
|
|
4487
|
+
}
|
|
4488
|
+
|
|
4489
|
+
template <class V, HWY_IF_U64(TFromV<V>)>
|
|
4490
|
+
HWY_INLINE VFromD<RebindToFloat<DFromV<V>>> ConvToF64WithRoundToOdd(V v) {
|
|
4491
|
+
__vector double raw_result;
|
|
4492
|
+
// Use inline assembly to do a round-to-odd U64->F64 conversion on Z14
|
|
4493
|
+
__asm__("vcdlgb %0,%1,0,3" : "=v"(raw_result) : "v"(v.raw));
|
|
4494
|
+
return VFromD<RebindToFloat<DFromV<V>>>{raw_result};
|
|
3442
4495
|
}
|
|
3443
4496
|
|
|
4497
|
+
} // namespace detail
|
|
4498
|
+
#endif // HWY_S390X_HAVE_Z14
|
|
4499
|
+
|
|
3444
4500
|
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
|
|
3445
|
-
HWY_API Vec32<float> DemoteTo(D
|
|
4501
|
+
HWY_API Vec32<float> DemoteTo(D df32, Vec64<int64_t> v) {
|
|
4502
|
+
#if HWY_S390X_HAVE_Z14
|
|
4503
|
+
return DemoteTo(df32, detail::ConvToF64WithRoundToOdd(v));
|
|
4504
|
+
#else // VSX
|
|
4505
|
+
(void)df32;
|
|
3446
4506
|
return Vec32<float>{vec_floate(v.raw)};
|
|
4507
|
+
#endif
|
|
3447
4508
|
}
|
|
3448
4509
|
|
|
3449
4510
|
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
|
|
3450
|
-
HWY_API Vec64<float> DemoteTo(D
|
|
4511
|
+
HWY_API Vec64<float> DemoteTo(D df32, Vec128<int64_t> v) {
|
|
4512
|
+
#if HWY_S390X_HAVE_Z14
|
|
4513
|
+
return DemoteTo(df32, detail::ConvToF64WithRoundToOdd(v));
|
|
4514
|
+
#else // VSX
|
|
3451
4515
|
#if HWY_IS_LITTLE_ENDIAN
|
|
3452
4516
|
const Vec128<float> i64_to_f32{vec_floate(v.raw)};
|
|
3453
4517
|
#else
|
|
3454
4518
|
const Vec128<float> i64_to_f32{vec_floato(v.raw)};
|
|
3455
4519
|
#endif
|
|
3456
4520
|
|
|
3457
|
-
const RebindToUnsigned<
|
|
3458
|
-
const Rebind<uint64_t,
|
|
4521
|
+
const RebindToUnsigned<decltype(df32)> du32;
|
|
4522
|
+
const Rebind<uint64_t, decltype(df32)> du64;
|
|
3459
4523
|
return Vec64<float>{
|
|
3460
|
-
BitCast(
|
|
4524
|
+
BitCast(df32, TruncateTo(du32, BitCast(du64, i64_to_f32))).raw};
|
|
4525
|
+
#endif
|
|
3461
4526
|
}
|
|
3462
4527
|
|
|
3463
4528
|
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
|
|
3464
|
-
HWY_API Vec32<float> DemoteTo(D
|
|
4529
|
+
HWY_API Vec32<float> DemoteTo(D df32, Vec64<uint64_t> v) {
|
|
4530
|
+
#if HWY_S390X_HAVE_Z14
|
|
4531
|
+
return DemoteTo(df32, detail::ConvToF64WithRoundToOdd(v));
|
|
4532
|
+
#else // VSX
|
|
4533
|
+
(void)df32;
|
|
3465
4534
|
return Vec32<float>{vec_floate(v.raw)};
|
|
4535
|
+
#endif
|
|
3466
4536
|
}
|
|
3467
4537
|
|
|
3468
4538
|
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
|
|
3469
|
-
HWY_API Vec64<float> DemoteTo(D
|
|
4539
|
+
HWY_API Vec64<float> DemoteTo(D df32, Vec128<uint64_t> v) {
|
|
4540
|
+
#if HWY_S390X_HAVE_Z14
|
|
4541
|
+
return DemoteTo(df32, detail::ConvToF64WithRoundToOdd(v));
|
|
4542
|
+
#else // VSX
|
|
3470
4543
|
#if HWY_IS_LITTLE_ENDIAN
|
|
3471
4544
|
const Vec128<float> u64_to_f32{vec_floate(v.raw)};
|
|
3472
4545
|
#else
|
|
3473
4546
|
const Vec128<float> u64_to_f32{vec_floato(v.raw)};
|
|
3474
4547
|
#endif
|
|
3475
4548
|
|
|
3476
|
-
const RebindToUnsigned<
|
|
3477
|
-
const Rebind<uint64_t,
|
|
4549
|
+
const RebindToUnsigned<decltype(df32)> du;
|
|
4550
|
+
const Rebind<uint64_t, decltype(df32)> du64;
|
|
3478
4551
|
return Vec64<float>{
|
|
3479
|
-
BitCast(
|
|
4552
|
+
BitCast(df32, TruncateTo(du, BitCast(du64, u64_to_f32))).raw};
|
|
4553
|
+
#endif
|
|
3480
4554
|
}
|
|
3481
4555
|
|
|
3482
4556
|
// For already range-limited input [0, 255].
|
|
@@ -3491,17 +4565,39 @@ HWY_API Vec128<uint8_t, N> U8FromU32(Vec128<uint32_t, N> v) {
|
|
|
3491
4565
|
// Note: altivec.h vec_ct* currently contain C casts which triggers
|
|
3492
4566
|
// -Wdeprecate-lax-vec-conv-all warnings, so disable them.
|
|
3493
4567
|
|
|
3494
|
-
|
|
3495
|
-
|
|
4568
|
+
#if HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
|
|
4569
|
+
template <class D, typename FromT, HWY_IF_F32_D(D), HWY_IF_UI32(FromT),
|
|
4570
|
+
HWY_IF_V_SIZE_LE_D(D, 8)>
|
|
4571
|
+
HWY_API VFromD<D> ConvertTo(D df32,
|
|
4572
|
+
Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
|
|
4573
|
+
const Rebind<double, decltype(df32)> df64;
|
|
4574
|
+
return DemoteTo(df32, PromoteTo(df64, v));
|
|
4575
|
+
}
|
|
4576
|
+
template <class D, typename FromT, HWY_IF_F32_D(D), HWY_IF_UI32(FromT),
|
|
4577
|
+
HWY_IF_V_SIZE_D(D, 16)>
|
|
4578
|
+
HWY_API VFromD<D> ConvertTo(D df32, Vec128<FromT> v) {
|
|
4579
|
+
const RepartitionToWide<decltype(df32)> df64;
|
|
4580
|
+
|
|
4581
|
+
const VFromD<D> vf32_lo{vec_floate(PromoteLowerTo(df64, v).raw)};
|
|
4582
|
+
const VFromD<D> vf32_hi{vec_floate(PromoteUpperTo(df64, v).raw)};
|
|
4583
|
+
return ConcatEven(df32, vf32_hi, vf32_lo);
|
|
4584
|
+
}
|
|
4585
|
+
#else // Z15 or PPC
|
|
4586
|
+
template <class D, typename FromT, HWY_IF_F32_D(D), HWY_IF_UI32(FromT)>
|
|
3496
4587
|
HWY_API VFromD<D> ConvertTo(D /* tag */,
|
|
3497
4588
|
Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
|
|
3498
4589
|
HWY_DIAGNOSTICS(push)
|
|
3499
4590
|
#if HWY_COMPILER_CLANG
|
|
3500
4591
|
HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
|
|
3501
4592
|
#endif
|
|
4593
|
+
#if HWY_S390X_HAVE_Z15
|
|
4594
|
+
return VFromD<D>{vec_float(v.raw)};
|
|
4595
|
+
#else
|
|
3502
4596
|
return VFromD<D>{vec_ctf(v.raw, 0)};
|
|
4597
|
+
#endif
|
|
3503
4598
|
HWY_DIAGNOSTICS(pop)
|
|
3504
4599
|
}
|
|
4600
|
+
#endif // HWY_TARGET == HWY_Z14
|
|
3505
4601
|
|
|
3506
4602
|
template <class D, typename FromT, HWY_IF_F64_D(D), HWY_IF_NOT_FLOAT(FromT),
|
|
3507
4603
|
HWY_IF_T_SIZE_D(D, sizeof(FromT))>
|
|
@@ -3511,38 +4607,195 @@ HWY_API VFromD<D> ConvertTo(D /* tag */,
|
|
|
3511
4607
|
}
|
|
3512
4608
|
|
|
3513
4609
|
// Truncates (rounds toward zero).
|
|
3514
|
-
|
|
3515
|
-
|
|
4610
|
+
#if HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
|
|
4611
|
+
template <class D, HWY_IF_I32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
|
|
4612
|
+
HWY_API VFromD<D> ConvertTo(D di32,
|
|
4613
|
+
Vec128<float, Rebind<float, D>().MaxLanes()> v) {
|
|
4614
|
+
const Rebind<int64_t, decltype(di32)> di64;
|
|
4615
|
+
return DemoteTo(di32, PromoteTo(di64, v));
|
|
4616
|
+
}
|
|
4617
|
+
template <class D, HWY_IF_I32_D(D), HWY_IF_V_SIZE_D(D, 16)>
|
|
4618
|
+
HWY_API VFromD<D> ConvertTo(D di32,
|
|
4619
|
+
Vec128<float, Rebind<float, D>().MaxLanes()> v) {
|
|
4620
|
+
const RepartitionToWide<decltype(di32)> di64;
|
|
4621
|
+
return OrderedDemote2To(di32, PromoteLowerTo(di64, v),
|
|
4622
|
+
PromoteUpperTo(di64, v));
|
|
4623
|
+
}
|
|
4624
|
+
#else // Z15 or PPC
|
|
4625
|
+
template <class D, HWY_IF_I32_D(D)>
|
|
3516
4626
|
HWY_API VFromD<D> ConvertTo(D /* tag */,
|
|
3517
|
-
Vec128<
|
|
4627
|
+
Vec128<float, Rebind<float, D>().MaxLanes()> v) {
|
|
4628
|
+
#if defined(__OPTIMIZE__)
|
|
4629
|
+
if (detail::IsConstantRawAltivecVect(v.raw)) {
|
|
4630
|
+
constexpr int32_t kMinI32 = LimitsMin<int32_t>();
|
|
4631
|
+
constexpr int32_t kMaxI32 = LimitsMax<int32_t>();
|
|
4632
|
+
return Dup128VecFromValues(
|
|
4633
|
+
D(),
|
|
4634
|
+
(v.raw[0] >= -2147483648.0f)
|
|
4635
|
+
? ((v.raw[0] < 2147483648.0f) ? static_cast<int32_t>(v.raw[0])
|
|
4636
|
+
: kMaxI32)
|
|
4637
|
+
: ((v.raw[0] < 0) ? kMinI32 : 0),
|
|
4638
|
+
(v.raw[1] >= -2147483648.0f)
|
|
4639
|
+
? ((v.raw[1] < 2147483648.0f) ? static_cast<int32_t>(v.raw[1])
|
|
4640
|
+
: kMaxI32)
|
|
4641
|
+
: ((v.raw[1] < 0) ? kMinI32 : 0),
|
|
4642
|
+
(v.raw[2] >= -2147483648.0f)
|
|
4643
|
+
? ((v.raw[2] < 2147483648.0f) ? static_cast<int32_t>(v.raw[2])
|
|
4644
|
+
: kMaxI32)
|
|
4645
|
+
: ((v.raw[2] < 0) ? kMinI32 : 0),
|
|
4646
|
+
(v.raw[3] >= -2147483648.0f)
|
|
4647
|
+
? ((v.raw[3] < 2147483648.0f) ? static_cast<int32_t>(v.raw[3])
|
|
4648
|
+
: kMaxI32)
|
|
4649
|
+
: ((v.raw[3] < 0) ? kMinI32 : 0));
|
|
4650
|
+
}
|
|
4651
|
+
#endif
|
|
4652
|
+
|
|
4653
|
+
#if HWY_S390X_HAVE_Z15
|
|
4654
|
+
// Use inline assembly on Z15 to avoid undefined behavior if v[i] is not in
|
|
4655
|
+
// the range of an int32_t
|
|
4656
|
+
__vector signed int raw_result;
|
|
4657
|
+
__asm__("vcfeb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw));
|
|
4658
|
+
return VFromD<D>{raw_result};
|
|
4659
|
+
#else
|
|
3518
4660
|
HWY_DIAGNOSTICS(push)
|
|
3519
4661
|
#if HWY_COMPILER_CLANG
|
|
3520
4662
|
HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
|
|
3521
4663
|
#endif
|
|
3522
4664
|
return VFromD<D>{vec_cts(v.raw, 0)};
|
|
3523
4665
|
HWY_DIAGNOSTICS(pop)
|
|
4666
|
+
#endif // HWY_S390X_HAVE_Z15
|
|
3524
4667
|
}
|
|
4668
|
+
#endif // HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
|
|
3525
4669
|
|
|
3526
|
-
template <class D,
|
|
3527
|
-
HWY_IF_T_SIZE_D(D, sizeof(FromT))>
|
|
4670
|
+
template <class D, HWY_IF_I64_D(D)>
|
|
3528
4671
|
HWY_API VFromD<D> ConvertTo(D /* tag */,
|
|
3529
|
-
Vec128<
|
|
4672
|
+
Vec128<double, Rebind<double, D>().MaxLanes()> v) {
|
|
4673
|
+
#if defined(__OPTIMIZE__)
|
|
4674
|
+
if (detail::IsConstantRawAltivecVect(v.raw)) {
|
|
4675
|
+
constexpr int64_t kMinI64 = LimitsMin<int64_t>();
|
|
4676
|
+
constexpr int64_t kMaxI64 = LimitsMax<int64_t>();
|
|
4677
|
+
return Dup128VecFromValues(D(),
|
|
4678
|
+
(v.raw[0] >= -9223372036854775808.0)
|
|
4679
|
+
? ((v.raw[0] < 9223372036854775808.0)
|
|
4680
|
+
? static_cast<int64_t>(v.raw[0])
|
|
4681
|
+
: kMaxI64)
|
|
4682
|
+
: ((v.raw[0] < 0) ? kMinI64 : 0LL),
|
|
4683
|
+
(v.raw[1] >= -9223372036854775808.0)
|
|
4684
|
+
? ((v.raw[1] < 9223372036854775808.0)
|
|
4685
|
+
? static_cast<int64_t>(v.raw[1])
|
|
4686
|
+
: kMaxI64)
|
|
4687
|
+
: ((v.raw[1] < 0) ? kMinI64 : 0LL));
|
|
4688
|
+
}
|
|
4689
|
+
#endif
|
|
4690
|
+
|
|
4691
|
+
// Use inline assembly to avoid undefined behavior if v[i] is not within the
|
|
4692
|
+
// range of an int64_t
|
|
4693
|
+
__vector signed long long raw_result;
|
|
4694
|
+
#if HWY_S390X_HAVE_Z14
|
|
4695
|
+
__asm__("vcgdb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw));
|
|
4696
|
+
#else
|
|
4697
|
+
__asm__("xvcvdpsxds %x0,%x1"
|
|
4698
|
+
: "=wa"(raw_result)
|
|
4699
|
+
: "wa"(detail::VsxF2INormalizeSrcVals(v).raw));
|
|
4700
|
+
#endif
|
|
4701
|
+
return VFromD<D>{raw_result};
|
|
4702
|
+
}
|
|
4703
|
+
|
|
4704
|
+
#if HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
|
|
4705
|
+
template <class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
|
|
4706
|
+
HWY_API VFromD<D> ConvertTo(D du32,
|
|
4707
|
+
Vec128<float, Rebind<float, D>().MaxLanes()> v) {
|
|
4708
|
+
const Rebind<uint64_t, decltype(du32)> du64;
|
|
4709
|
+
return DemoteTo(du32, PromoteTo(du64, v));
|
|
4710
|
+
}
|
|
4711
|
+
template <class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_D(D, 16)>
|
|
4712
|
+
HWY_API VFromD<D> ConvertTo(D du32,
|
|
4713
|
+
Vec128<float, Rebind<float, D>().MaxLanes()> v) {
|
|
4714
|
+
const RepartitionToWide<decltype(du32)> du64;
|
|
4715
|
+
return OrderedDemote2To(du32, PromoteLowerTo(du64, v),
|
|
4716
|
+
PromoteUpperTo(du64, v));
|
|
4717
|
+
}
|
|
4718
|
+
#else // Z15 or VSX
|
|
4719
|
+
template <class D, HWY_IF_U32_D(D)>
|
|
4720
|
+
HWY_API VFromD<D> ConvertTo(D /* tag */,
|
|
4721
|
+
Vec128<float, Rebind<float, D>().MaxLanes()> v) {
|
|
4722
|
+
#if defined(__OPTIMIZE__)
|
|
4723
|
+
if (detail::IsConstantRawAltivecVect(v.raw)) {
|
|
4724
|
+
constexpr uint32_t kMaxU32 = LimitsMax<uint32_t>();
|
|
4725
|
+
return Dup128VecFromValues(
|
|
4726
|
+
D(),
|
|
4727
|
+
(v.raw[0] >= 0.0f)
|
|
4728
|
+
? ((v.raw[0] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[0])
|
|
4729
|
+
: kMaxU32)
|
|
4730
|
+
: 0,
|
|
4731
|
+
(v.raw[1] >= 0.0f)
|
|
4732
|
+
? ((v.raw[1] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[1])
|
|
4733
|
+
: kMaxU32)
|
|
4734
|
+
: 0,
|
|
4735
|
+
(v.raw[2] >= 0.0f)
|
|
4736
|
+
? ((v.raw[2] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[2])
|
|
4737
|
+
: kMaxU32)
|
|
4738
|
+
: 0,
|
|
4739
|
+
(v.raw[3] >= 0.0f)
|
|
4740
|
+
? ((v.raw[3] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[3])
|
|
4741
|
+
: kMaxU32)
|
|
4742
|
+
: 0);
|
|
4743
|
+
}
|
|
4744
|
+
#endif
|
|
4745
|
+
|
|
4746
|
+
#if HWY_S390X_HAVE_Z15
|
|
4747
|
+
// Use inline assembly on Z15 to avoid undefined behavior if v[i] is not in
|
|
4748
|
+
// the range of an uint32_t
|
|
4749
|
+
__vector unsigned int raw_result;
|
|
4750
|
+
__asm__("vclfeb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw));
|
|
4751
|
+
return VFromD<D>{raw_result};
|
|
4752
|
+
#else // VSX
|
|
3530
4753
|
HWY_DIAGNOSTICS(push)
|
|
3531
4754
|
#if HWY_COMPILER_CLANG
|
|
3532
4755
|
HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
|
|
3533
4756
|
#endif
|
|
3534
|
-
|
|
4757
|
+
VFromD<D> result{vec_ctu(v.raw, 0)};
|
|
3535
4758
|
HWY_DIAGNOSTICS(pop)
|
|
4759
|
+
return result;
|
|
4760
|
+
#endif // HWY_S390X_HAVE_Z15
|
|
3536
4761
|
}
|
|
4762
|
+
#endif // HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
|
|
3537
4763
|
|
|
3538
|
-
template <
|
|
3539
|
-
HWY_API
|
|
4764
|
+
template <class D, HWY_IF_U64_D(D)>
|
|
4765
|
+
HWY_API VFromD<D> ConvertTo(D /* tag */,
|
|
4766
|
+
Vec128<double, Rebind<double, D>().MaxLanes()> v) {
|
|
3540
4767
|
HWY_DIAGNOSTICS(push)
|
|
3541
4768
|
#if HWY_COMPILER_CLANG
|
|
3542
4769
|
HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
|
|
3543
4770
|
#endif
|
|
3544
|
-
|
|
3545
|
-
|
|
4771
|
+
|
|
4772
|
+
#if defined(__OPTIMIZE__)
|
|
4773
|
+
if (detail::IsConstantRawAltivecVect(v.raw)) {
|
|
4774
|
+
constexpr uint64_t kMaxU64 = LimitsMax<uint64_t>();
|
|
4775
|
+
return Dup128VecFromValues(
|
|
4776
|
+
D(),
|
|
4777
|
+
(v.raw[0] >= 0.0) ? ((v.raw[0] < 18446744073709551616.0)
|
|
4778
|
+
? static_cast<uint64_t>(v.raw[0])
|
|
4779
|
+
: kMaxU64)
|
|
4780
|
+
: 0,
|
|
4781
|
+
(v.raw[1] >= 0.0) ? ((v.raw[1] < 18446744073709551616.0)
|
|
4782
|
+
? static_cast<uint64_t>(v.raw[1])
|
|
4783
|
+
: kMaxU64)
|
|
4784
|
+
: 0);
|
|
4785
|
+
}
|
|
4786
|
+
#endif
|
|
4787
|
+
|
|
4788
|
+
// Use inline assembly to avoid undefined behavior if v[i] is not within the
|
|
4789
|
+
// range of an uint64_t
|
|
4790
|
+
__vector unsigned long long raw_result;
|
|
4791
|
+
#if HWY_S390X_HAVE_Z14
|
|
4792
|
+
__asm__("vclgdb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw));
|
|
4793
|
+
#else // VSX
|
|
4794
|
+
__asm__("xvcvdpuxds %x0,%x1"
|
|
4795
|
+
: "=wa"(raw_result)
|
|
4796
|
+
: "wa"(detail::VsxF2INormalizeSrcVals(v).raw));
|
|
4797
|
+
#endif
|
|
4798
|
+
return VFromD<D>{raw_result};
|
|
3546
4799
|
}
|
|
3547
4800
|
|
|
3548
4801
|
// ------------------------------ Floating-point rounding (ConvertTo)
|
|
@@ -3555,7 +4808,18 @@ HWY_API Vec128<float, N> Round(Vec128<float, N> v) {
|
|
|
3555
4808
|
|
|
3556
4809
|
template <size_t N>
|
|
3557
4810
|
HWY_API Vec128<double, N> Round(Vec128<double, N> v) {
|
|
4811
|
+
#if HWY_S390X_HAVE_Z14
|
|
4812
|
+
return Vec128<double, N>{vec_round(v.raw)};
|
|
4813
|
+
#else
|
|
3558
4814
|
return Vec128<double, N>{vec_rint(v.raw)};
|
|
4815
|
+
#endif
|
|
4816
|
+
}
|
|
4817
|
+
|
|
4818
|
+
template <size_t N>
|
|
4819
|
+
HWY_API Vec128<int32_t, N> NearestInt(Vec128<float, N> v) {
|
|
4820
|
+
const DFromV<decltype(v)> d;
|
|
4821
|
+
const RebindToSigned<decltype(d)> di;
|
|
4822
|
+
return ConvertTo(di, Round(v));
|
|
3559
4823
|
}
|
|
3560
4824
|
|
|
3561
4825
|
// Toward zero, aka truncate
|
|
@@ -3613,7 +4877,7 @@ HWY_API Mask128<T, N> IsFinite(Vec128<T, N> v) {
|
|
|
3613
4877
|
|
|
3614
4878
|
// ================================================== CRYPTO
|
|
3615
4879
|
|
|
3616
|
-
#if !defined(HWY_DISABLE_PPC8_CRYPTO)
|
|
4880
|
+
#if !HWY_S390X_HAVE_Z14 && !defined(HWY_DISABLE_PPC8_CRYPTO)
|
|
3617
4881
|
|
|
3618
4882
|
// Per-target flag to prevent generic_ops-inl.h from defining AESRound.
|
|
3619
4883
|
#ifdef HWY_NATIVE_AES
|
|
@@ -3918,6 +5182,15 @@ struct CompressIsPartition {
|
|
|
3918
5182
|
enum { value = (sizeof(T) != 1) };
|
|
3919
5183
|
};
|
|
3920
5184
|
|
|
5185
|
+
// ------------------------------ Dup128MaskFromMaskBits
|
|
5186
|
+
|
|
5187
|
+
template <class D>
|
|
5188
|
+
HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
|
|
5189
|
+
constexpr size_t kN = MaxLanes(d);
|
|
5190
|
+
if (kN < 8) mask_bits &= (1u << kN) - 1;
|
|
5191
|
+
return detail::LoadMaskBits128(d, mask_bits);
|
|
5192
|
+
}
|
|
5193
|
+
|
|
3921
5194
|
// ------------------------------ StoreMaskBits
|
|
3922
5195
|
|
|
3923
5196
|
namespace detail {
|
|
@@ -3930,37 +5203,45 @@ HWY_INLINE uint64_t ExtractSignBits(Vec128<uint8_t, N> sign_bits,
|
|
|
3930
5203
|
// clang POWER8 and 9 targets appear to differ in their return type of
|
|
3931
5204
|
// vec_vbpermq: unsigned or signed, so cast to avoid a warning.
|
|
3932
5205
|
using VU64 = detail::Raw128<uint64_t>::type;
|
|
5206
|
+
#if HWY_S390X_HAVE_Z14
|
|
5207
|
+
const Vec128<uint64_t> extracted{
|
|
5208
|
+
reinterpret_cast<VU64>(vec_bperm_u128(sign_bits.raw, bit_shuffle))};
|
|
5209
|
+
#else
|
|
3933
5210
|
const Vec128<uint64_t> extracted{
|
|
3934
5211
|
reinterpret_cast<VU64>(vec_vbpermq(sign_bits.raw, bit_shuffle))};
|
|
5212
|
+
#endif
|
|
3935
5213
|
return extracted.raw[HWY_IS_LITTLE_ENDIAN];
|
|
3936
5214
|
}
|
|
3937
5215
|
|
|
3938
|
-
#endif // !HWY_PPC_HAVE_10
|
|
5216
|
+
#endif // !HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN
|
|
3939
5217
|
|
|
3940
5218
|
template <typename T, size_t N>
|
|
3941
5219
|
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128<T, N> mask) {
|
|
3942
5220
|
const DFromM<decltype(mask)> d;
|
|
3943
5221
|
const Repartition<uint8_t, decltype(d)> du8;
|
|
3944
5222
|
const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
|
|
5223
|
+
|
|
3945
5224
|
#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
|
|
3946
5225
|
return static_cast<uint64_t>(vec_extractm(sign_bits.raw));
|
|
3947
|
-
#else
|
|
5226
|
+
#else // Z14, Z15, PPC8, PPC9, or big-endian PPC10
|
|
3948
5227
|
const __vector unsigned char kBitShuffle = {120, 112, 104, 96, 88, 80, 72, 64,
|
|
3949
5228
|
56, 48, 40, 32, 24, 16, 8, 0};
|
|
3950
5229
|
return ExtractSignBits(sign_bits, kBitShuffle);
|
|
3951
|
-
#endif // HWY_PPC_HAVE_10
|
|
5230
|
+
#endif // HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
|
|
3952
5231
|
}
|
|
3953
5232
|
|
|
3954
5233
|
template <typename T, size_t N>
|
|
3955
5234
|
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128<T, N> mask) {
|
|
3956
5235
|
const DFromM<decltype(mask)> d;
|
|
5236
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
5237
|
+
|
|
3957
5238
|
const Repartition<uint8_t, decltype(d)> du8;
|
|
3958
5239
|
const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
|
|
3959
5240
|
|
|
3960
5241
|
#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
|
|
3961
|
-
const RebindToUnsigned<decltype(d)> du;
|
|
3962
5242
|
return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw));
|
|
3963
|
-
#else
|
|
5243
|
+
#else // Z14, Z15, PPC8, PPC9, or big-endian PPC10
|
|
5244
|
+
(void)du;
|
|
3964
5245
|
#if HWY_IS_LITTLE_ENDIAN
|
|
3965
5246
|
const __vector unsigned char kBitShuffle = {
|
|
3966
5247
|
112, 96, 80, 64, 48, 32, 16, 0, 128, 128, 128, 128, 128, 128, 128, 128};
|
|
@@ -3975,12 +5256,15 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128<T, N> mask) {
|
|
|
3975
5256
|
template <typename T, size_t N>
|
|
3976
5257
|
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128<T, N> mask) {
|
|
3977
5258
|
const DFromM<decltype(mask)> d;
|
|
5259
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
5260
|
+
|
|
3978
5261
|
const Repartition<uint8_t, decltype(d)> du8;
|
|
3979
5262
|
const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
|
|
5263
|
+
|
|
3980
5264
|
#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
|
|
3981
|
-
const RebindToUnsigned<decltype(d)> du;
|
|
3982
5265
|
return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw));
|
|
3983
|
-
#else
|
|
5266
|
+
#else // Z14, Z15, PPC8, PPC9, or big-endian PPC10
|
|
5267
|
+
(void)du;
|
|
3984
5268
|
#if HWY_IS_LITTLE_ENDIAN
|
|
3985
5269
|
const __vector unsigned char kBitShuffle = {96, 64, 32, 0, 128, 128,
|
|
3986
5270
|
128, 128, 128, 128, 128, 128,
|
|
@@ -3997,12 +5281,15 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128<T, N> mask) {
|
|
|
3997
5281
|
template <typename T, size_t N>
|
|
3998
5282
|
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128<T, N> mask) {
|
|
3999
5283
|
const DFromM<decltype(mask)> d;
|
|
5284
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
5285
|
+
|
|
4000
5286
|
const Repartition<uint8_t, decltype(d)> du8;
|
|
4001
5287
|
const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
|
|
5288
|
+
|
|
4002
5289
|
#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
|
|
4003
|
-
const RebindToUnsigned<decltype(d)> du;
|
|
4004
5290
|
return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw));
|
|
4005
|
-
#else
|
|
5291
|
+
#else // Z14, Z15, PPC8, PPC9, or big-endian PPC10
|
|
5292
|
+
(void)du;
|
|
4006
5293
|
#if HWY_IS_LITTLE_ENDIAN
|
|
4007
5294
|
const __vector unsigned char kBitShuffle = {64, 0, 128, 128, 128, 128,
|
|
4008
5295
|
128, 128, 128, 128, 128, 128,
|
|
@@ -4076,31 +5363,32 @@ HWY_API size_t StoreMaskBits(D /*d*/, MFromD<D> mask, uint8_t* bits) {
|
|
|
4076
5363
|
template <class D, HWY_IF_V_SIZE_D(D, 16)>
|
|
4077
5364
|
HWY_API bool AllFalse(D d, MFromD<D> mask) {
|
|
4078
5365
|
const RebindToUnsigned<decltype(d)> du;
|
|
4079
|
-
return static_cast<bool>(
|
|
5366
|
+
return static_cast<bool>(
|
|
5367
|
+
vec_all_eq(VecFromMask(du, RebindMask(du, mask)).raw, Zero(du).raw));
|
|
4080
5368
|
}
|
|
4081
5369
|
|
|
4082
5370
|
template <class D, HWY_IF_V_SIZE_D(D, 16)>
|
|
4083
5371
|
HWY_API bool AllTrue(D d, MFromD<D> mask) {
|
|
4084
5372
|
const RebindToUnsigned<decltype(d)> du;
|
|
4085
5373
|
using TU = TFromD<decltype(du)>;
|
|
4086
|
-
return static_cast<bool>(
|
|
4087
|
-
|
|
5374
|
+
return static_cast<bool>(vec_all_eq(VecFromMask(du, RebindMask(du, mask)).raw,
|
|
5375
|
+
Set(du, hwy::LimitsMax<TU>()).raw));
|
|
4088
5376
|
}
|
|
4089
5377
|
|
|
4090
5378
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
|
|
4091
5379
|
HWY_API bool AllFalse(D d, MFromD<D> mask) {
|
|
4092
5380
|
const Full128<TFromD<D>> d_full;
|
|
4093
5381
|
constexpr size_t kN = MaxLanes(d);
|
|
4094
|
-
return AllFalse(d_full,
|
|
4095
|
-
|
|
5382
|
+
return AllFalse(d_full,
|
|
5383
|
+
And(MFromD<decltype(d_full)>{mask.raw}, FirstN(d_full, kN)));
|
|
4096
5384
|
}
|
|
4097
5385
|
|
|
4098
5386
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
|
|
4099
5387
|
HWY_API bool AllTrue(D d, MFromD<D> mask) {
|
|
4100
5388
|
const Full128<TFromD<D>> d_full;
|
|
4101
5389
|
constexpr size_t kN = MaxLanes(d);
|
|
4102
|
-
return AllTrue(
|
|
4103
|
-
|
|
5390
|
+
return AllTrue(
|
|
5391
|
+
d_full, Or(MFromD<decltype(d_full)>{mask.raw}, Not(FirstN(d_full, kN))));
|
|
4104
5392
|
}
|
|
4105
5393
|
|
|
4106
5394
|
template <class D>
|
|
@@ -4222,7 +5510,7 @@ HWY_INLINE VFromD<D> CompressOrExpandIndicesFromMask(D d, MFromD<D> mask) {
|
|
|
4222
5510
|
__asm__("xxgenpcvbm %x0, %1, %2"
|
|
4223
5511
|
: "=wa"(idx)
|
|
4224
5512
|
: "v"(mask.raw), "i"(kGenPcvmMode));
|
|
4225
|
-
return VFromD<
|
|
5513
|
+
return VFromD<decltype(d)>{idx};
|
|
4226
5514
|
}
|
|
4227
5515
|
template <bool kIsCompress, class D, HWY_IF_T_SIZE_D(D, 2)>
|
|
4228
5516
|
HWY_INLINE VFromD<D> CompressOrExpandIndicesFromMask(D d, MFromD<D> mask) {
|
|
@@ -4235,7 +5523,7 @@ HWY_INLINE VFromD<D> CompressOrExpandIndicesFromMask(D d, MFromD<D> mask) {
|
|
|
4235
5523
|
__asm__("xxgenpcvhm %x0, %1, %2"
|
|
4236
5524
|
: "=wa"(idx)
|
|
4237
5525
|
: "v"(mask.raw), "i"(kGenPcvmMode));
|
|
4238
|
-
return VFromD<
|
|
5526
|
+
return VFromD<decltype(d)>{idx};
|
|
4239
5527
|
}
|
|
4240
5528
|
template <bool kIsCompress, class D, HWY_IF_T_SIZE_D(D, 4)>
|
|
4241
5529
|
HWY_INLINE VFromD<D> CompressOrExpandIndicesFromMask(D d, MFromD<D> mask) {
|
|
@@ -4248,7 +5536,7 @@ HWY_INLINE VFromD<D> CompressOrExpandIndicesFromMask(D d, MFromD<D> mask) {
|
|
|
4248
5536
|
__asm__("xxgenpcvwm %x0, %1, %2"
|
|
4249
5537
|
: "=wa"(idx)
|
|
4250
5538
|
: "v"(mask.raw), "i"(kGenPcvmMode));
|
|
4251
|
-
return VFromD<
|
|
5539
|
+
return VFromD<decltype(d)>{idx};
|
|
4252
5540
|
}
|
|
4253
5541
|
#endif
|
|
4254
5542
|
|
|
@@ -4821,7 +6109,7 @@ HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
|
|
|
4821
6109
|
|
|
4822
6110
|
const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
|
|
4823
6111
|
const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
|
|
4824
|
-
#if HWY_PPC_HAVE_9
|
|
6112
|
+
#if (HWY_PPC_HAVE_9 && HWY_ARCH_PPC_64) || HWY_S390X_HAVE_Z14
|
|
4825
6113
|
StoreN(compressed, d, unaligned, count);
|
|
4826
6114
|
#else
|
|
4827
6115
|
BlendedStore(compressed, FirstN(d, count), d, unaligned);
|
|
@@ -4939,7 +6227,11 @@ HWY_INLINE V Per128BitBlkRevLanesOnBe(V v) {
|
|
|
4939
6227
|
|
|
4940
6228
|
template <class V>
|
|
4941
6229
|
HWY_INLINE V I128Subtract(V a, V b) {
|
|
4942
|
-
#if
|
|
6230
|
+
#if HWY_S390X_HAVE_Z14
|
|
6231
|
+
const V diff_i128{reinterpret_cast<typename detail::Raw128<TFromV<V>>::type>(
|
|
6232
|
+
vec_sub_u128(reinterpret_cast<__vector unsigned char>(a.raw),
|
|
6233
|
+
reinterpret_cast<__vector unsigned char>(b.raw)))};
|
|
6234
|
+
#elif defined(__SIZEOF_INT128__)
|
|
4943
6235
|
using VU128 = __vector unsigned __int128;
|
|
4944
6236
|
const V diff_i128{reinterpret_cast<typename detail::Raw128<TFromV<V>>::type>(
|
|
4945
6237
|
vec_sub(reinterpret_cast<VU128>(a.raw), reinterpret_cast<VU128>(b.raw)))};
|
|
@@ -5067,84 +6359,133 @@ HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) {
|
|
|
5067
6359
|
return SetBeforeFirst(MaskFromVec(ShiftLeftLanes<1>(VecFromMask(d, mask))));
|
|
5068
6360
|
}
|
|
5069
6361
|
|
|
5070
|
-
// ------------------------------
|
|
5071
|
-
|
|
6362
|
+
// ------------------------------ SumsOf2 and SumsOf4
|
|
5072
6363
|
namespace detail {
|
|
5073
6364
|
|
|
5074
|
-
|
|
5075
|
-
|
|
5076
|
-
|
|
5077
|
-
|
|
5078
|
-
|
|
5079
|
-
|
|
5080
|
-
|
|
5081
|
-
|
|
5082
|
-
|
|
5083
|
-
|
|
5084
|
-
|
|
5085
|
-
|
|
6365
|
+
#if !HWY_S390X_HAVE_Z14
|
|
6366
|
+
// Casts nominally int32_t result to D.
|
|
6367
|
+
template <class D>
|
|
6368
|
+
HWY_INLINE VFromD<D> AltivecVsum4sbs(D d, __vector signed char a,
|
|
6369
|
+
__vector signed int b) {
|
|
6370
|
+
const Repartition<int32_t, D> di32;
|
|
6371
|
+
#ifdef __OPTIMIZE__
|
|
6372
|
+
if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) {
|
|
6373
|
+
const int64_t sum0 =
|
|
6374
|
+
static_cast<int64_t>(a[0]) + static_cast<int64_t>(a[1]) +
|
|
6375
|
+
static_cast<int64_t>(a[2]) + static_cast<int64_t>(a[3]) +
|
|
6376
|
+
static_cast<int64_t>(b[0]);
|
|
6377
|
+
const int64_t sum1 =
|
|
6378
|
+
static_cast<int64_t>(a[4]) + static_cast<int64_t>(a[5]) +
|
|
6379
|
+
static_cast<int64_t>(a[6]) + static_cast<int64_t>(a[7]) +
|
|
6380
|
+
static_cast<int64_t>(b[1]);
|
|
6381
|
+
const int64_t sum2 =
|
|
6382
|
+
static_cast<int64_t>(a[8]) + static_cast<int64_t>(a[9]) +
|
|
6383
|
+
static_cast<int64_t>(a[10]) + static_cast<int64_t>(a[11]) +
|
|
6384
|
+
static_cast<int64_t>(b[2]);
|
|
6385
|
+
const int64_t sum3 =
|
|
6386
|
+
static_cast<int64_t>(a[12]) + static_cast<int64_t>(a[13]) +
|
|
6387
|
+
static_cast<int64_t>(a[14]) + static_cast<int64_t>(a[15]) +
|
|
6388
|
+
static_cast<int64_t>(b[3]);
|
|
6389
|
+
const int32_t sign0 = static_cast<int32_t>(sum0 >> 63);
|
|
6390
|
+
const int32_t sign1 = static_cast<int32_t>(sum1 >> 63);
|
|
6391
|
+
const int32_t sign2 = static_cast<int32_t>(sum2 >> 63);
|
|
6392
|
+
const int32_t sign3 = static_cast<int32_t>(sum3 >> 63);
|
|
6393
|
+
using Raw = typename detail::Raw128<int32_t>::type;
|
|
6394
|
+
return BitCast(
|
|
6395
|
+
d,
|
|
6396
|
+
VFromD<decltype(di32)>{Raw{
|
|
6397
|
+
(sign0 == (sum0 >> 31)) ? static_cast<int32_t>(sum0)
|
|
6398
|
+
: static_cast<int32_t>(sign0 ^ 0x7FFFFFFF),
|
|
6399
|
+
(sign1 == (sum1 >> 31)) ? static_cast<int32_t>(sum1)
|
|
6400
|
+
: static_cast<int32_t>(sign1 ^ 0x7FFFFFFF),
|
|
6401
|
+
(sign2 == (sum2 >> 31)) ? static_cast<int32_t>(sum2)
|
|
6402
|
+
: static_cast<int32_t>(sign2 ^ 0x7FFFFFFF),
|
|
6403
|
+
(sign3 == (sum3 >> 31))
|
|
6404
|
+
? static_cast<int32_t>(sum3)
|
|
6405
|
+
: static_cast<int32_t>(sign3 ^ 0x7FFFFFFF)}});
|
|
6406
|
+
} else // NOLINT
|
|
6407
|
+
#endif
|
|
6408
|
+
{
|
|
6409
|
+
return BitCast(d, VFromD<decltype(di32)>{vec_vsum4sbs(a, b)});
|
|
6410
|
+
}
|
|
5086
6411
|
}
|
|
5087
6412
|
|
|
5088
|
-
//
|
|
5089
|
-
|
|
5090
|
-
|
|
5091
|
-
|
|
5092
|
-
|
|
5093
|
-
|
|
5094
|
-
|
|
5095
|
-
|
|
5096
|
-
|
|
5097
|
-
|
|
5098
|
-
|
|
5099
|
-
|
|
5100
|
-
|
|
5101
|
-
|
|
5102
|
-
|
|
5103
|
-
|
|
6413
|
+
// Casts nominally uint32_t result to D.
|
|
6414
|
+
template <class D>
|
|
6415
|
+
HWY_INLINE VFromD<D> AltivecVsum4ubs(D d, __vector unsigned char a,
|
|
6416
|
+
__vector unsigned int b) {
|
|
6417
|
+
const Repartition<uint32_t, D> du32;
|
|
6418
|
+
#ifdef __OPTIMIZE__
|
|
6419
|
+
if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) {
|
|
6420
|
+
const uint64_t sum0 =
|
|
6421
|
+
static_cast<uint64_t>(a[0]) + static_cast<uint64_t>(a[1]) +
|
|
6422
|
+
static_cast<uint64_t>(a[2]) + static_cast<uint64_t>(a[3]) +
|
|
6423
|
+
static_cast<uint64_t>(b[0]);
|
|
6424
|
+
const uint64_t sum1 =
|
|
6425
|
+
static_cast<uint64_t>(a[4]) + static_cast<uint64_t>(a[5]) +
|
|
6426
|
+
static_cast<uint64_t>(a[6]) + static_cast<uint64_t>(a[7]) +
|
|
6427
|
+
static_cast<uint64_t>(b[1]);
|
|
6428
|
+
const uint64_t sum2 =
|
|
6429
|
+
static_cast<uint64_t>(a[8]) + static_cast<uint64_t>(a[9]) +
|
|
6430
|
+
static_cast<uint64_t>(a[10]) + static_cast<uint64_t>(a[11]) +
|
|
6431
|
+
static_cast<uint64_t>(b[2]);
|
|
6432
|
+
const uint64_t sum3 =
|
|
6433
|
+
static_cast<uint64_t>(a[12]) + static_cast<uint64_t>(a[13]) +
|
|
6434
|
+
static_cast<uint64_t>(a[14]) + static_cast<uint64_t>(a[15]) +
|
|
6435
|
+
static_cast<uint64_t>(b[3]);
|
|
6436
|
+
return BitCast(
|
|
6437
|
+
d,
|
|
6438
|
+
VFromD<decltype(du32)>{(__vector unsigned int){
|
|
6439
|
+
static_cast<unsigned int>(sum0 <= 0xFFFFFFFFu ? sum0 : 0xFFFFFFFFu),
|
|
6440
|
+
static_cast<unsigned int>(sum1 <= 0xFFFFFFFFu ? sum1 : 0xFFFFFFFFu),
|
|
6441
|
+
static_cast<unsigned int>(sum2 <= 0xFFFFFFFFu ? sum2 : 0xFFFFFFFFu),
|
|
6442
|
+
static_cast<unsigned int>(sum3 <= 0xFFFFFFFFu ? sum3
|
|
6443
|
+
: 0xFFFFFFFFu)}});
|
|
6444
|
+
} else // NOLINT
|
|
6445
|
+
#endif
|
|
6446
|
+
{
|
|
6447
|
+
return BitCast(d, VFromD<decltype(du32)>{vec_vsum4ubs(a, b)});
|
|
6448
|
+
}
|
|
5104
6449
|
}
|
|
5105
6450
|
|
|
5106
|
-
//
|
|
5107
|
-
template <
|
|
5108
|
-
HWY_INLINE
|
|
5109
|
-
|
|
5110
|
-
|
|
5111
|
-
|
|
5112
|
-
const
|
|
5113
|
-
|
|
5114
|
-
|
|
5115
|
-
|
|
5116
|
-
|
|
5117
|
-
|
|
5118
|
-
|
|
5119
|
-
|
|
5120
|
-
|
|
5121
|
-
|
|
5122
|
-
|
|
5123
|
-
|
|
5124
|
-
|
|
5125
|
-
|
|
5126
|
-
|
|
5127
|
-
|
|
5128
|
-
|
|
5129
|
-
|
|
6451
|
+
// Casts nominally int32_t result to D.
|
|
6452
|
+
template <class D>
|
|
6453
|
+
HWY_INLINE VFromD<D> AltivecVsum2sws(D d, __vector signed int a,
|
|
6454
|
+
__vector signed int b) {
|
|
6455
|
+
const Repartition<int32_t, D> di32;
|
|
6456
|
+
#ifdef __OPTIMIZE__
|
|
6457
|
+
const Repartition<uint64_t, D> du64;
|
|
6458
|
+
constexpr int kDestLaneOffset = HWY_IS_BIG_ENDIAN;
|
|
6459
|
+
if (IsConstantRawAltivecVect(a) && __builtin_constant_p(b[kDestLaneOffset]) &&
|
|
6460
|
+
__builtin_constant_p(b[kDestLaneOffset + 2])) {
|
|
6461
|
+
const int64_t sum0 = static_cast<int64_t>(a[0]) +
|
|
6462
|
+
static_cast<int64_t>(a[1]) +
|
|
6463
|
+
static_cast<int64_t>(b[kDestLaneOffset]);
|
|
6464
|
+
const int64_t sum1 = static_cast<int64_t>(a[2]) +
|
|
6465
|
+
static_cast<int64_t>(a[3]) +
|
|
6466
|
+
static_cast<int64_t>(b[kDestLaneOffset + 2]);
|
|
6467
|
+
const int32_t sign0 = static_cast<int32_t>(sum0 >> 63);
|
|
6468
|
+
const int32_t sign1 = static_cast<int32_t>(sum1 >> 63);
|
|
6469
|
+
return BitCast(d, VFromD<decltype(du64)>{(__vector unsigned long long){
|
|
6470
|
+
(sign0 == (sum0 >> 31))
|
|
6471
|
+
? static_cast<uint32_t>(sum0)
|
|
6472
|
+
: static_cast<uint32_t>(sign0 ^ 0x7FFFFFFF),
|
|
6473
|
+
(sign1 == (sum1 >> 31))
|
|
6474
|
+
? static_cast<uint32_t>(sum1)
|
|
6475
|
+
: static_cast<uint32_t>(sign1 ^ 0x7FFFFFFF)}});
|
|
6476
|
+
} else // NOLINT
|
|
6477
|
+
#endif
|
|
6478
|
+
{
|
|
6479
|
+
__vector signed int sum;
|
|
5130
6480
|
|
|
5131
|
-
//
|
|
6481
|
+
// Inline assembly is used for vsum2sws to avoid unnecessary shuffling
|
|
6482
|
+
// on little-endian PowerPC targets as the result of the vsum2sws
|
|
6483
|
+
// instruction will already be in the correct lanes on little-endian
|
|
6484
|
+
// PowerPC targets.
|
|
6485
|
+
__asm__("vsum2sws %0,%1,%2" : "=v"(sum) : "v"(a), "v"(b));
|
|
5132
6486
|
|
|
5133
|
-
|
|
5134
|
-
|
|
5135
|
-
HWY_INLINE Vec128<T> SumOfLanes(Vec128<T> v10) {
|
|
5136
|
-
const Vec128<T> v01 = Shuffle01(v10);
|
|
5137
|
-
return v10 + v01;
|
|
5138
|
-
}
|
|
5139
|
-
template <typename T, HWY_IF_T_SIZE(T, 8)>
|
|
5140
|
-
HWY_INLINE Vec128<T> MinOfLanes(Vec128<T> v10) {
|
|
5141
|
-
const Vec128<T> v01 = Shuffle01(v10);
|
|
5142
|
-
return Min(v10, v01);
|
|
5143
|
-
}
|
|
5144
|
-
template <typename T, HWY_IF_T_SIZE(T, 8)>
|
|
5145
|
-
HWY_INLINE Vec128<T> MaxOfLanes(Vec128<T> v10) {
|
|
5146
|
-
const Vec128<T> v01 = Shuffle01(v10);
|
|
5147
|
-
return Max(v10, v01);
|
|
6487
|
+
return BitCast(d, VFromD<decltype(di32)>{sum});
|
|
6488
|
+
}
|
|
5148
6489
|
}
|
|
5149
6490
|
|
|
5150
6491
|
// Casts nominally int32_t result to D.
|
|
@@ -5238,275 +6579,419 @@ HWY_INLINE Vec128<int32_t, N / 2> AltivecU16SumsOf2(Vec128<uint16_t, N> v) {
|
|
|
5238
6579
|
return AltivecVsum4shs(di32, Xor(BitCast(di16, v), Set(di16, -32768)).raw,
|
|
5239
6580
|
Set(di32, 65536).raw);
|
|
5240
6581
|
}
|
|
6582
|
+
#endif // !HWY_S390X_HAVE_Z14
|
|
6583
|
+
|
|
6584
|
+
// U16->U32 SumsOf2
|
|
6585
|
+
template <class V>
|
|
6586
|
+
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
|
|
6587
|
+
hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
|
|
6588
|
+
const DFromV<V> d;
|
|
6589
|
+
const RepartitionToWide<decltype(d)> dw;
|
|
6590
|
+
|
|
6591
|
+
#if HWY_S390X_HAVE_Z14
|
|
6592
|
+
return VFromD<decltype(dw)>{vec_sum4(v.raw, Zero(d).raw)};
|
|
6593
|
+
#else
|
|
6594
|
+
return BitCast(dw, AltivecU16SumsOf2(v));
|
|
6595
|
+
#endif
|
|
6596
|
+
}
|
|
6597
|
+
|
|
6598
|
+
// I16->I32 SumsOf2
|
|
6599
|
+
template <class V>
|
|
6600
|
+
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
|
|
6601
|
+
hwy::SignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
|
|
6602
|
+
const DFromV<V> d;
|
|
6603
|
+
const RepartitionToWide<decltype(d)> dw;
|
|
6604
|
+
|
|
6605
|
+
#if HWY_S390X_HAVE_Z14
|
|
6606
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
6607
|
+
return BitCast(dw, SumsOf2(hwy::UnsignedTag(), hwy::SizeTag<2>(),
|
|
6608
|
+
BitCast(du, Xor(v, SignBit(d))))) +
|
|
6609
|
+
Set(dw, int32_t{-65536});
|
|
6610
|
+
#else
|
|
6611
|
+
return AltivecVsum4shs(dw, v.raw, Zero(dw).raw);
|
|
6612
|
+
#endif
|
|
6613
|
+
}
|
|
6614
|
+
|
|
6615
|
+
#if HWY_S390X_HAVE_Z14
|
|
6616
|
+
// U32->U64 SumsOf2
|
|
6617
|
+
template <class V>
|
|
6618
|
+
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
|
|
6619
|
+
hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
|
|
6620
|
+
const DFromV<V> d;
|
|
6621
|
+
const RepartitionToWide<decltype(d)> dw;
|
|
6622
|
+
return VFromD<decltype(dw)>{vec_sum2(v.raw, Zero(d).raw)};
|
|
6623
|
+
}
|
|
5241
6624
|
|
|
5242
|
-
|
|
6625
|
+
// I32->I64 SumsOf2
|
|
6626
|
+
template <class V>
|
|
6627
|
+
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
|
|
6628
|
+
hwy::SignedTag /*type_tag*/, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
|
|
6629
|
+
const DFromV<V> d;
|
|
6630
|
+
const RepartitionToWide<decltype(d)> dw;
|
|
6631
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
6632
|
+
|
|
6633
|
+
return BitCast(dw, SumsOf2(hwy::UnsignedTag(), hwy::SizeTag<4>(),
|
|
6634
|
+
BitCast(du, Xor(v, SignBit(d))))) +
|
|
6635
|
+
Set(dw, int64_t{-4294967296LL});
|
|
6636
|
+
}
|
|
6637
|
+
#endif
|
|
6638
|
+
|
|
6639
|
+
// U8->U32 SumsOf4
|
|
6640
|
+
template <class V>
|
|
6641
|
+
HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
|
|
6642
|
+
hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
|
|
6643
|
+
const DFromV<V> d;
|
|
6644
|
+
const RepartitionToWideX2<decltype(d)> dw2;
|
|
6645
|
+
|
|
6646
|
+
#if HWY_S390X_HAVE_Z14
|
|
6647
|
+
return VFromD<decltype(dw2)>{vec_sum4(v.raw, Zero(d).raw)};
|
|
6648
|
+
#else
|
|
6649
|
+
return AltivecVsum4ubs(dw2, v.raw, Zero(dw2).raw);
|
|
6650
|
+
#endif
|
|
6651
|
+
}
|
|
6652
|
+
|
|
6653
|
+
// I8->I32 SumsOf4
|
|
6654
|
+
template <class V>
|
|
6655
|
+
HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
|
|
6656
|
+
hwy::SignedTag /*type_tag*/, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
|
|
6657
|
+
const DFromV<V> d;
|
|
6658
|
+
const RepartitionToWideX2<decltype(d)> dw2;
|
|
6659
|
+
|
|
6660
|
+
#if HWY_S390X_HAVE_Z14
|
|
6661
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
6662
|
+
return BitCast(dw2, SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<1>(),
|
|
6663
|
+
BitCast(du, Xor(v, SignBit(d))))) +
|
|
6664
|
+
Set(dw2, int32_t{-512});
|
|
6665
|
+
#else
|
|
6666
|
+
return AltivecVsum4sbs(dw2, v.raw, Zero(dw2).raw);
|
|
6667
|
+
#endif
|
|
6668
|
+
}
|
|
6669
|
+
|
|
6670
|
+
// U16->U64 SumsOf4
|
|
6671
|
+
template <class V>
|
|
6672
|
+
HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
|
|
6673
|
+
hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
|
|
6674
|
+
const DFromV<V> d;
|
|
6675
|
+
const RepartitionToWide<decltype(d)> dw;
|
|
6676
|
+
const RepartitionToWide<decltype(dw)> dw2;
|
|
6677
|
+
|
|
6678
|
+
#if HWY_S390X_HAVE_Z14
|
|
6679
|
+
return VFromD<decltype(dw2)>{vec_sum2(v.raw, Zero(d).raw)};
|
|
6680
|
+
#else
|
|
6681
|
+
const RebindToSigned<decltype(dw)> dw_i;
|
|
6682
|
+
return AltivecVsum2sws(dw2, BitCast(dw_i, SumsOf2(v)).raw, Zero(dw_i).raw);
|
|
6683
|
+
#endif
|
|
6684
|
+
}
|
|
6685
|
+
|
|
6686
|
+
// I16->I64 SumsOf4
|
|
6687
|
+
template <class V>
|
|
6688
|
+
HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
|
|
6689
|
+
hwy::SignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
|
|
6690
|
+
const DFromV<V> d;
|
|
6691
|
+
const RepartitionToWide<decltype(d)> dw;
|
|
6692
|
+
const RepartitionToWide<decltype(dw)> dw2;
|
|
6693
|
+
|
|
6694
|
+
#if HWY_S390X_HAVE_Z14
|
|
6695
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
6696
|
+
return BitCast(dw2, SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<2>(),
|
|
6697
|
+
BitCast(du, Xor(v, SignBit(d))))) +
|
|
6698
|
+
Set(dw2, int64_t{-131072});
|
|
6699
|
+
#else // VSX
|
|
6700
|
+
const auto sums_of_4_in_lo32 =
|
|
6701
|
+
AltivecVsum2sws(dw, SumsOf2(v).raw, Zero(dw).raw);
|
|
6702
|
+
|
|
6703
|
+
#if HWY_IS_LITTLE_ENDIAN
|
|
6704
|
+
return PromoteEvenTo(dw2, sums_of_4_in_lo32);
|
|
6705
|
+
#else
|
|
6706
|
+
return PromoteOddTo(dw2, sums_of_4_in_lo32);
|
|
6707
|
+
#endif // HWY_IS_LITTLE_ENDIAN
|
|
6708
|
+
#endif // HWY_S390X_HAVE_Z14
|
|
6709
|
+
}
|
|
6710
|
+
|
|
6711
|
+
} // namespace detail
|
|
6712
|
+
|
|
6713
|
+
// ------------------------------ SumOfLanes
|
|
6714
|
+
|
|
6715
|
+
// We define SumOfLanes for 8/16-bit types (and I32/U32/I64/U64 on Z14/Z15/Z16);
|
|
6716
|
+
// enable generic for the rest.
|
|
6717
|
+
#undef HWY_IF_SUM_OF_LANES_D
|
|
6718
|
+
#if HWY_S390X_HAVE_Z14
|
|
6719
|
+
#define HWY_IF_SUM_OF_LANES_D(D) HWY_IF_LANES_GT_D(D, 1), HWY_IF_FLOAT3264_D(D)
|
|
6720
|
+
#else
|
|
6721
|
+
#define HWY_IF_SUM_OF_LANES_D(D) \
|
|
6722
|
+
HWY_IF_LANES_GT_D(D, 1), HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))
|
|
6723
|
+
#endif
|
|
6724
|
+
|
|
6725
|
+
#if HWY_S390X_HAVE_Z14
|
|
6726
|
+
namespace detail {
|
|
6727
|
+
|
|
6728
|
+
template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
|
|
6729
|
+
HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))>
|
|
6730
|
+
HWY_INLINE Vec128<T> SumOfU32OrU64LanesAsU128(Vec128<T> v) {
|
|
6731
|
+
const DFromV<decltype(v)> d;
|
|
6732
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
6733
|
+
return BitCast(
|
|
6734
|
+
d, Vec128<uint8_t>{vec_sum_u128(BitCast(du, v).raw, Zero(du).raw)});
|
|
6735
|
+
}
|
|
6736
|
+
|
|
6737
|
+
} // namespace detail
|
|
6738
|
+
|
|
6739
|
+
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI64_D(D)>
|
|
6740
|
+
HWY_API VFromD<D> SumOfLanes(D /*d64*/, VFromD<D> v) {
|
|
6741
|
+
return Broadcast<1>(detail::SumOfU32OrU64LanesAsU128(v));
|
|
6742
|
+
}
|
|
6743
|
+
#endif
|
|
6744
|
+
|
|
6745
|
+
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U16_D(D)>
|
|
6746
|
+
HWY_API Vec32<uint16_t> SumOfLanes(D du16, Vec32<uint16_t> v) {
|
|
5243
6747
|
constexpr int kSumLaneIdx = HWY_IS_BIG_ENDIAN;
|
|
5244
|
-
|
|
5245
|
-
|
|
6748
|
+
return Broadcast<kSumLaneIdx>(
|
|
6749
|
+
BitCast(du16, detail::SumsOf2(hwy::UnsignedTag(), hwy::SizeTag<2>(), v)));
|
|
5246
6750
|
}
|
|
5247
6751
|
|
|
5248
|
-
|
|
6752
|
+
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)>
|
|
6753
|
+
HWY_API Vec64<uint16_t> SumOfLanes(D du16, Vec64<uint16_t> v) {
|
|
5249
6754
|
constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
|
|
5250
|
-
const Full64<uint16_t> du16;
|
|
5251
|
-
const auto zero = Zero(Full128<int32_t>());
|
|
5252
6755
|
return Broadcast<kSumLaneIdx>(
|
|
5253
|
-
|
|
6756
|
+
BitCast(du16, detail::SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<2>(), v)));
|
|
5254
6757
|
}
|
|
5255
6758
|
|
|
5256
|
-
|
|
6759
|
+
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
|
|
6760
|
+
HWY_API Vec128<uint16_t> SumOfLanes(D du16, Vec128<uint16_t> v) {
|
|
5257
6761
|
constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
|
|
5258
|
-
|
|
6762
|
+
#if HWY_S390X_HAVE_Z14
|
|
6763
|
+
return Broadcast<kSumLaneIdx>(
|
|
6764
|
+
BitCast(du16, detail::SumOfU32OrU64LanesAsU128(detail::SumsOf4(
|
|
6765
|
+
hwy::UnsignedTag(), hwy::SizeTag<2>(), v))));
|
|
6766
|
+
#else // VSX
|
|
5259
6767
|
const auto zero = Zero(Full128<int32_t>());
|
|
5260
6768
|
return Broadcast<kSumLaneIdx>(
|
|
5261
|
-
AltivecVsumsws(du16, AltivecU16SumsOf2(v).raw, zero.raw));
|
|
6769
|
+
detail::AltivecVsumsws(du16, detail::AltivecU16SumsOf2(v).raw, zero.raw));
|
|
6770
|
+
#endif
|
|
5262
6771
|
}
|
|
5263
6772
|
|
|
5264
|
-
|
|
6773
|
+
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I16_D(D)>
|
|
6774
|
+
HWY_API Vec32<int16_t> SumOfLanes(D di16, Vec32<int16_t> v) {
|
|
6775
|
+
#if HWY_S390X_HAVE_Z14
|
|
6776
|
+
const RebindToUnsigned<decltype(di16)> du16;
|
|
6777
|
+
return BitCast(di16, SumOfLanes(du16, BitCast(du16, v)));
|
|
6778
|
+
#else
|
|
5265
6779
|
constexpr int kSumLaneIdx = HWY_IS_BIG_ENDIAN;
|
|
5266
|
-
|
|
5267
|
-
|
|
5268
|
-
|
|
6780
|
+
return Broadcast<kSumLaneIdx>(
|
|
6781
|
+
BitCast(di16, detail::SumsOf2(hwy::SignedTag(), hwy::SizeTag<2>(), v)));
|
|
6782
|
+
#endif
|
|
5269
6783
|
}
|
|
5270
6784
|
|
|
5271
|
-
|
|
6785
|
+
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I16_D(D)>
|
|
6786
|
+
HWY_API Vec64<int16_t> SumOfLanes(D di16, Vec64<int16_t> v) {
|
|
6787
|
+
#if HWY_S390X_HAVE_Z14
|
|
6788
|
+
const RebindToUnsigned<decltype(di16)> du16;
|
|
6789
|
+
return BitCast(di16, SumOfLanes(du16, BitCast(du16, v)));
|
|
6790
|
+
#else
|
|
5272
6791
|
constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
|
|
5273
|
-
|
|
5274
|
-
|
|
5275
|
-
|
|
5276
|
-
return Broadcast<kSumLaneIdx>(AltivecVsum2sws(
|
|
5277
|
-
di16, AltivecVsum4shs(di32, v.raw, zero.raw).raw, zero.raw));
|
|
6792
|
+
return Broadcast<kSumLaneIdx>(
|
|
6793
|
+
BitCast(di16, detail::SumsOf4(hwy::SignedTag(), hwy::SizeTag<2>(), v)));
|
|
6794
|
+
#endif
|
|
5278
6795
|
}
|
|
5279
6796
|
|
|
5280
|
-
|
|
6797
|
+
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
|
|
6798
|
+
HWY_API Vec128<int16_t> SumOfLanes(D di16, Vec128<int16_t> v) {
|
|
6799
|
+
#if HWY_S390X_HAVE_Z14
|
|
6800
|
+
const RebindToUnsigned<decltype(di16)> du16;
|
|
6801
|
+
return BitCast(di16, SumOfLanes(du16, BitCast(du16, v)));
|
|
6802
|
+
#else
|
|
5281
6803
|
constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
|
|
5282
|
-
const Full128<int16_t> di16;
|
|
5283
6804
|
const Full128<int32_t> di32;
|
|
5284
6805
|
const auto zero = Zero(di32);
|
|
5285
|
-
return Broadcast<kSumLaneIdx>(AltivecVsumsws(
|
|
5286
|
-
di16, AltivecVsum4shs(di32, v.raw, zero.raw).raw, zero.raw));
|
|
6806
|
+
return Broadcast<kSumLaneIdx>(detail::AltivecVsumsws(
|
|
6807
|
+
di16, detail::AltivecVsum4shs(di32, v.raw, zero.raw).raw, zero.raw));
|
|
6808
|
+
#endif
|
|
5287
6809
|
}
|
|
5288
6810
|
|
|
5289
|
-
|
|
5290
|
-
HWY_API
|
|
6811
|
+
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U8_D(D)>
|
|
6812
|
+
HWY_API Vec32<uint8_t> SumOfLanes(D du8, Vec32<uint8_t> v) {
|
|
5291
6813
|
constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
|
|
5292
|
-
|
|
5293
|
-
|
|
5294
|
-
const Twice<decltype(du8)> dt_u8;
|
|
5295
|
-
const Twice<decltype(du16)> dt_u16;
|
|
5296
|
-
const Full128<uint32_t> du32;
|
|
5297
|
-
return LowerHalf(Broadcast<kSumLaneIdx>(AltivecVsum4ubs(
|
|
5298
|
-
dt_u8, BitCast(dt_u8, Combine(dt_u16, Zero(du16), BitCast(du16, v))).raw,
|
|
5299
|
-
Zero(du32).raw)));
|
|
6814
|
+
return Broadcast<kSumLaneIdx>(
|
|
6815
|
+
BitCast(du8, detail::SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<1>(), v)));
|
|
5300
6816
|
}
|
|
5301
6817
|
|
|
5302
|
-
|
|
5303
|
-
|
|
5304
|
-
const
|
|
5305
|
-
|
|
5306
|
-
return Broadcast<kSumLaneIdx>(AltivecVsum4ubs(du8, v.raw, Zero(du32).raw));
|
|
6818
|
+
template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_U8_D(D)>
|
|
6819
|
+
HWY_API Vec16<uint8_t> SumOfLanes(D du8, Vec16<uint8_t> v) {
|
|
6820
|
+
const Twice<decltype(du8)> dt_u8;
|
|
6821
|
+
return LowerHalf(du8, SumOfLanes(dt_u8, Combine(dt_u8, Zero(du8), v)));
|
|
5307
6822
|
}
|
|
5308
6823
|
|
|
5309
|
-
|
|
6824
|
+
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
|
|
6825
|
+
HWY_API Vec64<uint8_t> SumOfLanes(D du8, Vec64<uint8_t> v) {
|
|
5310
6826
|
constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
|
|
5311
|
-
const Full64<uint8_t> du8;
|
|
5312
6827
|
return Broadcast<kSumLaneIdx>(BitCast(du8, SumsOf8(v)));
|
|
5313
6828
|
}
|
|
5314
6829
|
|
|
5315
|
-
|
|
6830
|
+
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
|
|
6831
|
+
HWY_API Vec128<uint8_t> SumOfLanes(D du8, Vec128<uint8_t> v) {
|
|
5316
6832
|
constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 15;
|
|
5317
6833
|
|
|
6834
|
+
#if HWY_S390X_HAVE_Z14
|
|
6835
|
+
return Broadcast<kSumLaneIdx>(
|
|
6836
|
+
BitCast(du8, detail::SumOfU32OrU64LanesAsU128(detail::SumsOf4(
|
|
6837
|
+
hwy::UnsignedTag(), hwy::SizeTag<1>(), v))));
|
|
6838
|
+
#else
|
|
5318
6839
|
const Full128<uint32_t> du32;
|
|
5319
6840
|
const RebindToSigned<decltype(du32)> di32;
|
|
5320
|
-
const Full128<uint8_t> du8;
|
|
5321
6841
|
const Vec128<uint32_t> zero = Zero(du32);
|
|
5322
|
-
return Broadcast<kSumLaneIdx>(
|
|
5323
|
-
|
|
5324
|
-
|
|
6842
|
+
return Broadcast<kSumLaneIdx>(detail::AltivecVsumsws(
|
|
6843
|
+
du8, detail::AltivecVsum4ubs(di32, v.raw, zero.raw).raw,
|
|
6844
|
+
BitCast(di32, zero).raw));
|
|
6845
|
+
#endif
|
|
5325
6846
|
}
|
|
5326
6847
|
|
|
5327
|
-
|
|
6848
|
+
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I8_D(D)>
|
|
6849
|
+
HWY_API Vec32<int8_t> SumOfLanes(D di8, Vec32<int8_t> v) {
|
|
6850
|
+
#if HWY_S390X_HAVE_Z14
|
|
6851
|
+
const RebindToUnsigned<decltype(di8)> du8;
|
|
6852
|
+
return BitCast(di8, SumOfLanes(du8, BitCast(du8, v)));
|
|
6853
|
+
#else
|
|
5328
6854
|
constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
|
|
5329
|
-
|
|
5330
|
-
|
|
5331
|
-
|
|
5332
|
-
const Repartition<int8_t, decltype(du16)> di8;
|
|
5333
|
-
const Vec128<int8_t> zzvv = BitCast(
|
|
5334
|
-
di8, InterleaveLower(BitCast(du16, Vec128<int8_t>{v.raw}), Zero(du16)));
|
|
5335
|
-
return Vec16<int8_t>{
|
|
5336
|
-
Broadcast<kSumLaneIdx>(AltivecVsum4sbs(di8, zzvv.raw, Zero(di32).raw))
|
|
5337
|
-
.raw};
|
|
6855
|
+
return Broadcast<kSumLaneIdx>(
|
|
6856
|
+
BitCast(di8, detail::SumsOf4(hwy::SignedTag(), hwy::SizeTag<1>(), v)));
|
|
6857
|
+
#endif
|
|
5338
6858
|
}
|
|
5339
6859
|
|
|
5340
|
-
|
|
5341
|
-
|
|
5342
|
-
const
|
|
5343
|
-
|
|
5344
|
-
return Broadcast<kSumLaneIdx>(AltivecVsum4sbs(di8, v.raw, zero.raw));
|
|
6860
|
+
template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_I8_D(D)>
|
|
6861
|
+
HWY_API Vec16<int8_t> SumOfLanes(D di8, Vec16<int8_t> v) {
|
|
6862
|
+
const Twice<decltype(di8)> dt_i8;
|
|
6863
|
+
return LowerHalf(di8, SumOfLanes(dt_i8, Combine(dt_i8, Zero(di8), v)));
|
|
5345
6864
|
}
|
|
5346
6865
|
|
|
5347
|
-
|
|
6866
|
+
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I8_D(D)>
|
|
6867
|
+
HWY_API Vec64<int8_t> SumOfLanes(D di8, Vec64<int8_t> v) {
|
|
6868
|
+
#if HWY_S390X_HAVE_Z14
|
|
6869
|
+
const RebindToUnsigned<decltype(di8)> du8;
|
|
6870
|
+
return BitCast(di8, SumOfLanes(du8, BitCast(du8, v)));
|
|
6871
|
+
#else
|
|
5348
6872
|
constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
|
|
5349
|
-
|
|
5350
|
-
|
|
5351
|
-
const Full64<int8_t> di8;
|
|
5352
|
-
return Broadcast<kSumLaneIdx>(AltivecVsum2sws(
|
|
5353
|
-
di8, AltivecVsum4sbs(di32, v.raw, zero.raw).raw, zero.raw));
|
|
6873
|
+
return Broadcast<kSumLaneIdx>(BitCast(di8, SumsOf8(v)));
|
|
6874
|
+
#endif
|
|
5354
6875
|
}
|
|
5355
6876
|
|
|
5356
|
-
|
|
6877
|
+
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I8_D(D)>
|
|
6878
|
+
HWY_API Vec128<int8_t> SumOfLanes(D di8, Vec128<int8_t> v) {
|
|
6879
|
+
#if HWY_S390X_HAVE_Z14
|
|
6880
|
+
const RebindToUnsigned<decltype(di8)> du8;
|
|
6881
|
+
return BitCast(di8, SumOfLanes(du8, BitCast(du8, v)));
|
|
6882
|
+
#else
|
|
5357
6883
|
constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 15;
|
|
5358
|
-
const Full128<int8_t> di8;
|
|
5359
6884
|
const Full128<int32_t> di32;
|
|
5360
6885
|
const Vec128<int32_t> zero = Zero(di32);
|
|
5361
|
-
return Broadcast<kSumLaneIdx>(AltivecVsumsws(
|
|
5362
|
-
di8, AltivecVsum4sbs(di32, v.raw, zero.raw).raw, zero.raw));
|
|
6886
|
+
return Broadcast<kSumLaneIdx>(detail::AltivecVsumsws(
|
|
6887
|
+
di8, detail::AltivecVsum4sbs(di32, v.raw, zero.raw).raw, zero.raw));
|
|
6888
|
+
#endif
|
|
5363
6889
|
}
|
|
5364
6890
|
|
|
5365
|
-
|
|
5366
|
-
|
|
5367
|
-
|
|
5368
|
-
const
|
|
5369
|
-
|
|
5370
|
-
|
|
5371
|
-
|
|
5372
|
-
vm = Max(vm, BitCast(d, Reverse2(d32, BitCast(d32, vm))));
|
|
5373
|
-
if (N > 8) {
|
|
5374
|
-
const RepartitionToWide<decltype(d32)> d64;
|
|
5375
|
-
vm = Max(vm, BitCast(d, Reverse2(d64, BitCast(d64, vm))));
|
|
5376
|
-
}
|
|
5377
|
-
return vm;
|
|
6891
|
+
#if HWY_S390X_HAVE_Z14
|
|
6892
|
+
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_UI32_D(D)>
|
|
6893
|
+
HWY_API VFromD<D> SumOfLanes(D d32, VFromD<D> v) {
|
|
6894
|
+
const RebindToUnsigned<decltype(d32)> du32;
|
|
6895
|
+
return Broadcast<1>(
|
|
6896
|
+
BitCast(d32, detail::SumsOf2(hwy::UnsignedTag(), hwy::SizeTag<4>(),
|
|
6897
|
+
BitCast(du32, v))));
|
|
5378
6898
|
}
|
|
5379
6899
|
|
|
5380
|
-
template <
|
|
5381
|
-
HWY_API
|
|
5382
|
-
|
|
5383
|
-
const RepartitionToWide<decltype(d)> d16;
|
|
5384
|
-
const RepartitionToWide<decltype(d16)> d32;
|
|
5385
|
-
Vec128<uint8_t, N> vm = Min(v, Reverse2(d, v));
|
|
5386
|
-
vm = Min(vm, BitCast(d, Reverse2(d16, BitCast(d16, vm))));
|
|
5387
|
-
vm = Min(vm, BitCast(d, Reverse2(d32, BitCast(d32, vm))));
|
|
5388
|
-
if (N > 8) {
|
|
5389
|
-
const RepartitionToWide<decltype(d32)> d64;
|
|
5390
|
-
vm = Min(vm, BitCast(d, Reverse2(d64, BitCast(d64, vm))));
|
|
5391
|
-
}
|
|
5392
|
-
return vm;
|
|
6900
|
+
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)>
|
|
6901
|
+
HWY_API VFromD<D> SumOfLanes(D /*d32*/, VFromD<D> v) {
|
|
6902
|
+
return Broadcast<3>(detail::SumOfU32OrU64LanesAsU128(v));
|
|
5393
6903
|
}
|
|
6904
|
+
#endif
|
|
5394
6905
|
|
|
5395
|
-
|
|
5396
|
-
HWY_API Vec128<int8_t, N> MaxOfLanes(Vec128<int8_t, N> v) {
|
|
5397
|
-
const DFromV<decltype(v)> d;
|
|
5398
|
-
const RepartitionToWide<decltype(d)> d16;
|
|
5399
|
-
const RepartitionToWide<decltype(d16)> d32;
|
|
5400
|
-
Vec128<int8_t, N> vm = Max(v, Reverse2(d, v));
|
|
5401
|
-
vm = Max(vm, BitCast(d, Reverse2(d16, BitCast(d16, vm))));
|
|
5402
|
-
vm = Max(vm, BitCast(d, Reverse2(d32, BitCast(d32, vm))));
|
|
5403
|
-
if (N > 8) {
|
|
5404
|
-
const RepartitionToWide<decltype(d32)> d64;
|
|
5405
|
-
vm = Max(vm, BitCast(d, Reverse2(d64, BitCast(d64, vm))));
|
|
5406
|
-
}
|
|
5407
|
-
return vm;
|
|
5408
|
-
}
|
|
6906
|
+
// generic_ops defines MinOfLanes and MaxOfLanes.
|
|
5409
6907
|
|
|
5410
|
-
|
|
5411
|
-
HWY_API Vec128<int8_t, N> MinOfLanes(Vec128<int8_t, N> v) {
|
|
5412
|
-
const DFromV<decltype(v)> d;
|
|
5413
|
-
const RepartitionToWide<decltype(d)> d16;
|
|
5414
|
-
const RepartitionToWide<decltype(d16)> d32;
|
|
5415
|
-
Vec128<int8_t, N> vm = Min(v, Reverse2(d, v));
|
|
5416
|
-
vm = Min(vm, BitCast(d, Reverse2(d16, BitCast(d16, vm))));
|
|
5417
|
-
vm = Min(vm, BitCast(d, Reverse2(d32, BitCast(d32, vm))));
|
|
5418
|
-
if (N > 8) {
|
|
5419
|
-
const RepartitionToWide<decltype(d32)> d64;
|
|
5420
|
-
vm = Min(vm, BitCast(d, Reverse2(d64, BitCast(d64, vm))));
|
|
5421
|
-
}
|
|
5422
|
-
return vm;
|
|
5423
|
-
}
|
|
6908
|
+
// ------------------------------ ReduceSum for N=4 I8/U8
|
|
5424
6909
|
|
|
5425
|
-
|
|
5426
|
-
|
|
5427
|
-
|
|
5428
|
-
|
|
5429
|
-
#if HWY_IS_LITTLE_ENDIAN
|
|
5430
|
-
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
|
|
5431
|
-
const auto odd = ShiftRight<16>(BitCast(d32, v));
|
|
5432
|
-
#else
|
|
5433
|
-
const auto even = ShiftRight<16>(BitCast(d32, v));
|
|
5434
|
-
const auto odd = And(BitCast(d32, v), Set(d32, 0xFFFF));
|
|
5435
|
-
#endif
|
|
5436
|
-
const auto min = MinOfLanes(Min(even, odd));
|
|
5437
|
-
// Also broadcast into odd lanes on little-endian and into even lanes
|
|
5438
|
-
// on big-endian
|
|
5439
|
-
return Vec128<uint16_t, N>{vec_pack(min.raw, min.raw)};
|
|
5440
|
-
}
|
|
5441
|
-
template <size_t N, HWY_IF_V_SIZE_GT(int16_t, N, 2)>
|
|
5442
|
-
HWY_API Vec128<int16_t, N> MinOfLanes(Vec128<int16_t, N> v) {
|
|
5443
|
-
const Simd<int16_t, N, 0> d;
|
|
5444
|
-
const RepartitionToWide<decltype(d)> d32;
|
|
5445
|
-
// Sign-extend
|
|
5446
|
-
#if HWY_IS_LITTLE_ENDIAN
|
|
5447
|
-
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
|
|
5448
|
-
const auto odd = ShiftRight<16>(BitCast(d32, v));
|
|
6910
|
+
// GetLane(SumsOf4(v)) is more efficient on PPC/Z14 than the default N=4
|
|
6911
|
+
// I8/U8 ReduceSum implementation in generic_ops-inl.h
|
|
6912
|
+
#ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
|
|
6913
|
+
#undef HWY_NATIVE_REDUCE_SUM_4_UI8
|
|
5449
6914
|
#else
|
|
5450
|
-
|
|
5451
|
-
const auto odd = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
|
|
6915
|
+
#define HWY_NATIVE_REDUCE_SUM_4_UI8
|
|
5452
6916
|
#endif
|
|
5453
|
-
|
|
5454
|
-
|
|
5455
|
-
|
|
5456
|
-
return
|
|
6917
|
+
|
|
6918
|
+
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
|
|
6919
|
+
HWY_API TFromD<D> ReduceSum(D /*d*/, VFromD<D> v) {
|
|
6920
|
+
return static_cast<TFromD<D>>(GetLane(SumsOf4(v)));
|
|
5457
6921
|
}
|
|
5458
6922
|
|
|
5459
|
-
|
|
5460
|
-
|
|
5461
|
-
|
|
5462
|
-
|
|
5463
|
-
#if HWY_IS_LITTLE_ENDIAN
|
|
5464
|
-
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
|
|
5465
|
-
const auto odd = ShiftRight<16>(BitCast(d32, v));
|
|
6923
|
+
// ------------------------------ BitShuffle
|
|
6924
|
+
|
|
6925
|
+
#ifdef HWY_NATIVE_BITSHUFFLE
|
|
6926
|
+
#undef HWY_NATIVE_BITSHUFFLE
|
|
5466
6927
|
#else
|
|
5467
|
-
|
|
5468
|
-
const auto odd = And(BitCast(d32, v), Set(d32, 0xFFFF));
|
|
6928
|
+
#define HWY_NATIVE_BITSHUFFLE
|
|
5469
6929
|
#endif
|
|
5470
|
-
|
|
5471
|
-
|
|
5472
|
-
|
|
5473
|
-
|
|
5474
|
-
|
|
5475
|
-
|
|
5476
|
-
const
|
|
5477
|
-
|
|
5478
|
-
|
|
6930
|
+
|
|
6931
|
+
template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>),
|
|
6932
|
+
HWY_IF_V_SIZE_V(VI, HWY_MAX_LANES_V(V) * 8)>
|
|
6933
|
+
HWY_API V BitShuffle(V v, VI idx) {
|
|
6934
|
+
const DFromV<decltype(v)> d64;
|
|
6935
|
+
const RebindToUnsigned<decltype(d64)> du64;
|
|
6936
|
+
const Repartition<uint8_t, decltype(d64)> du8;
|
|
6937
|
+
|
|
6938
|
+
const Full128<TFromD<decltype(du64)>> d_full_u64;
|
|
6939
|
+
const Full128<TFromD<decltype(du8)>> d_full_u8;
|
|
6940
|
+
|
|
6941
|
+
using RawVU64 = __vector unsigned long long;
|
|
6942
|
+
|
|
6943
|
+
#if HWY_PPC_HAVE_9
|
|
6944
|
+
|
|
5479
6945
|
#if HWY_IS_LITTLE_ENDIAN
|
|
5480
|
-
|
|
5481
|
-
|
|
6946
|
+
(void)d_full_u64;
|
|
6947
|
+
auto bit_idx = ResizeBitCast(d_full_u8, idx);
|
|
5482
6948
|
#else
|
|
5483
|
-
|
|
5484
|
-
|
|
6949
|
+
auto bit_idx =
|
|
6950
|
+
BitCast(d_full_u8, ReverseLaneBytes(ResizeBitCast(d_full_u64, idx)));
|
|
5485
6951
|
#endif
|
|
5486
|
-
const auto max = MaxOfLanes(Max(even, odd));
|
|
5487
|
-
// Also broadcast into odd lanes on little-endian and into even lanes
|
|
5488
|
-
// on big-endian
|
|
5489
|
-
return Vec128<int16_t, N>{vec_pack(max.raw, max.raw)};
|
|
5490
|
-
}
|
|
5491
6952
|
|
|
5492
|
-
|
|
6953
|
+
bit_idx = Xor(bit_idx, Set(d_full_u8, uint8_t{0x3F}));
|
|
5493
6954
|
|
|
5494
|
-
|
|
5495
|
-
|
|
5496
|
-
|
|
5497
|
-
|
|
5498
|
-
|
|
5499
|
-
|
|
5500
|
-
|
|
5501
|
-
|
|
5502
|
-
|
|
5503
|
-
|
|
5504
|
-
|
|
5505
|
-
|
|
5506
|
-
}
|
|
5507
|
-
|
|
5508
|
-
|
|
5509
|
-
|
|
6955
|
+
return BitCast(d64, VFromD<decltype(du64)>{reinterpret_cast<RawVU64>(
|
|
6956
|
+
vec_bperm(BitCast(du64, v).raw, bit_idx.raw))});
|
|
6957
|
+
#else // !HWY_PPC_HAVE_9
|
|
6958
|
+
|
|
6959
|
+
#if HWY_IS_LITTLE_ENDIAN
|
|
6960
|
+
const auto bit_idx_xor_mask = BitCast(
|
|
6961
|
+
d_full_u8, Dup128VecFromValues(d_full_u64, uint64_t{0x7F7F7F7F7F7F7F7Fu},
|
|
6962
|
+
uint64_t{0x3F3F3F3F3F3F3F3Fu}));
|
|
6963
|
+
const auto bit_idx = Xor(ResizeBitCast(d_full_u8, idx), bit_idx_xor_mask);
|
|
6964
|
+
constexpr int kBitShufResultByteShrAmt = 8;
|
|
6965
|
+
#else
|
|
6966
|
+
const auto bit_idx_xor_mask = BitCast(
|
|
6967
|
+
d_full_u8, Dup128VecFromValues(d_full_u64, uint64_t{0x3F3F3F3F3F3F3F3Fu},
|
|
6968
|
+
uint64_t{0x7F7F7F7F7F7F7F7Fu}));
|
|
6969
|
+
const auto bit_idx =
|
|
6970
|
+
Xor(BitCast(d_full_u8, ReverseLaneBytes(ResizeBitCast(d_full_u64, idx))),
|
|
6971
|
+
bit_idx_xor_mask);
|
|
6972
|
+
constexpr int kBitShufResultByteShrAmt = 6;
|
|
6973
|
+
#endif
|
|
6974
|
+
|
|
6975
|
+
#if HWY_S390X_HAVE_Z14
|
|
6976
|
+
const VFromD<decltype(d_full_u64)> bit_shuf_result{reinterpret_cast<RawVU64>(
|
|
6977
|
+
vec_bperm_u128(BitCast(du8, v).raw, bit_idx.raw))};
|
|
6978
|
+
#elif defined(__SIZEOF_INT128__)
|
|
6979
|
+
using RawVU128 = __vector unsigned __int128;
|
|
6980
|
+
const VFromD<decltype(d_full_u64)> bit_shuf_result{reinterpret_cast<RawVU64>(
|
|
6981
|
+
vec_vbpermq(reinterpret_cast<RawVU128>(v.raw), bit_idx.raw))};
|
|
6982
|
+
#else
|
|
6983
|
+
using RawVU128 = __vector unsigned char;
|
|
6984
|
+
const VFromD<decltype(d_full_u64)> bit_shuf_result{reinterpret_cast<RawVU64>(
|
|
6985
|
+
vec_vbpermq(reinterpret_cast<RawVU128>(v.raw), bit_idx.raw))};
|
|
6986
|
+
#endif
|
|
6987
|
+
|
|
6988
|
+
return ResizeBitCast(
|
|
6989
|
+
d64, PromoteTo(d_full_u64,
|
|
6990
|
+
ResizeBitCast(
|
|
6991
|
+
Rebind<uint8_t, decltype(d_full_u64)>(),
|
|
6992
|
+
CombineShiftRightBytes<kBitShufResultByteShrAmt>(
|
|
6993
|
+
d_full_u64, bit_shuf_result, bit_shuf_result))));
|
|
6994
|
+
#endif // HWY_PPC_HAVE_9
|
|
5510
6995
|
}
|
|
5511
6996
|
|
|
5512
6997
|
// ------------------------------ Lt128
|
|
@@ -5672,7 +7157,20 @@ HWY_API V Max128Upper(D d, const V a, const V b) {
|
|
|
5672
7157
|
|
|
5673
7158
|
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
|
|
5674
7159
|
HWY_API V LeadingZeroCount(V v) {
|
|
7160
|
+
#if HWY_S390X_HAVE_Z14
|
|
7161
|
+
const DFromV<decltype(v)> d;
|
|
7162
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
7163
|
+
|
|
7164
|
+
#if HWY_COMPILER_GCC_ACTUAL && defined(__OPTIMIZE__)
|
|
7165
|
+
// Work around for GCC compiler bug in vec_cnttz on Z14/Z15 if v[i] is a
|
|
7166
|
+
// constant
|
|
7167
|
+
__asm__("" : "+v"(v.raw));
|
|
7168
|
+
#endif
|
|
7169
|
+
|
|
7170
|
+
return BitCast(d, VFromD<decltype(du)>{vec_cntlz(BitCast(du, v).raw)});
|
|
7171
|
+
#else
|
|
5675
7172
|
return V{vec_cntlz(v.raw)};
|
|
7173
|
+
#endif
|
|
5676
7174
|
}
|
|
5677
7175
|
|
|
5678
7176
|
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
|
|
@@ -5682,14 +7180,27 @@ HWY_API V HighestSetBitIndex(V v) {
|
|
|
5682
7180
|
return BitCast(d, Set(d, T{sizeof(T) * 8 - 1}) - LeadingZeroCount(v));
|
|
5683
7181
|
}
|
|
5684
7182
|
|
|
5685
|
-
#if HWY_PPC_HAVE_9
|
|
7183
|
+
#if HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14
|
|
5686
7184
|
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
|
|
5687
7185
|
HWY_API V TrailingZeroCount(V v) {
|
|
5688
7186
|
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
|
|
5689
7187
|
return V{vec_vctz(v.raw)};
|
|
5690
7188
|
#else
|
|
5691
|
-
|
|
7189
|
+
#if HWY_S390X_HAVE_Z14
|
|
7190
|
+
const DFromV<decltype(v)> d;
|
|
7191
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
7192
|
+
|
|
7193
|
+
#if HWY_COMPILER_GCC_ACTUAL && defined(__OPTIMIZE__)
|
|
7194
|
+
// Work around for GCC compiler bug in vec_cnttz on Z14/Z15 if v[i] is a
|
|
7195
|
+
// constant
|
|
7196
|
+
__asm__("" : "+v"(v.raw));
|
|
5692
7197
|
#endif
|
|
7198
|
+
|
|
7199
|
+
return BitCast(d, VFromD<decltype(du)>{vec_cnttz(BitCast(du, v).raw)});
|
|
7200
|
+
#else
|
|
7201
|
+
return V{vec_cnttz(v.raw)};
|
|
7202
|
+
#endif // HWY_S390X_HAVE_Z14
|
|
7203
|
+
#endif // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
|
|
5693
7204
|
}
|
|
5694
7205
|
#else
|
|
5695
7206
|
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
|
|
@@ -5709,6 +7220,8 @@ HWY_API V TrailingZeroCount(V v) {
|
|
|
5709
7220
|
|
|
5710
7221
|
#undef HWY_PPC_HAVE_9
|
|
5711
7222
|
#undef HWY_PPC_HAVE_10
|
|
7223
|
+
#undef HWY_S390X_HAVE_Z14
|
|
7224
|
+
#undef HWY_S390X_HAVE_Z15
|
|
5712
7225
|
|
|
5713
7226
|
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
|
5714
7227
|
} // namespace HWY_NAMESPACE
|