@img/sharp-libvips-dev 1.0.0 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/include/aom/aom_encoder.h +3 -3
- package/include/aom/aomcx.h +17 -8
- package/include/expat.h +21 -10
- package/include/expat_config.h +11 -5
- package/include/ffi.h +12 -25
- package/include/fontconfig/fontconfig.h +5 -3
- package/include/freetype2/freetype/config/ftoption.h +1 -1
- package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -7
- package/include/gio-unix-2.0/gio/gunixinputstream.h +0 -5
- package/include/gio-unix-2.0/gio/gunixoutputstream.h +0 -5
- package/include/glib-2.0/gio/gappinfo.h +0 -7
- package/include/glib-2.0/gio/gapplication.h +6 -0
- package/include/glib-2.0/gio/gapplicationcommandline.h +12 -1
- package/include/glib-2.0/gio/gasyncinitable.h +0 -7
- package/include/glib-2.0/gio/gasyncresult.h +0 -6
- package/include/glib-2.0/gio/gbufferedinputstream.h +0 -5
- package/include/glib-2.0/gio/gbufferedoutputstream.h +0 -5
- package/include/glib-2.0/gio/gbytesicon.h +0 -5
- package/include/glib-2.0/gio/gcancellable.h +0 -5
- package/include/glib-2.0/gio/gconverter.h +0 -7
- package/include/glib-2.0/gio/gconverterinputstream.h +0 -6
- package/include/glib-2.0/gio/gconverteroutputstream.h +0 -6
- package/include/glib-2.0/gio/gdatagrambased.h +0 -7
- package/include/glib-2.0/gio/gdatainputstream.h +0 -6
- package/include/glib-2.0/gio/gdataoutputstream.h +0 -6
- package/include/glib-2.0/gio/gdbusinterface.h +0 -8
- package/include/glib-2.0/gio/gdbusinterfaceskeleton.h +0 -8
- package/include/glib-2.0/gio/gdbusmessage.h +2 -1
- package/include/glib-2.0/gio/gdbusobjectmanagerclient.h +0 -8
- package/include/glib-2.0/gio/gdbusobjectmanagerserver.h +0 -8
- package/include/glib-2.0/gio/gdbusobjectproxy.h +0 -8
- package/include/glib-2.0/gio/gdbusobjectskeleton.h +0 -8
- package/include/glib-2.0/gio/gdbusproxy.h +0 -8
- package/include/glib-2.0/gio/gdebugcontroller.h +0 -8
- package/include/glib-2.0/gio/gdebugcontrollerdbus.h +0 -7
- package/include/glib-2.0/gio/gdtlsserverconnection.h +0 -8
- package/include/glib-2.0/gio/gemblem.h +0 -5
- package/include/glib-2.0/gio/gemblemedicon.h +0 -5
- package/include/glib-2.0/gio/gfile.h +0 -10
- package/include/glib-2.0/gio/gfileenumerator.h +0 -5
- package/include/glib-2.0/gio/gfileicon.h +0 -5
- package/include/glib-2.0/gio/gfileinfo.h +0 -5
- package/include/glib-2.0/gio/gfileinputstream.h +0 -8
- package/include/glib-2.0/gio/gfileiostream.h +0 -8
- package/include/glib-2.0/gio/gfilemonitor.h +0 -5
- package/include/glib-2.0/gio/gfilenamecompleter.h +0 -5
- package/include/glib-2.0/gio/gfileoutputstream.h +0 -8
- package/include/glib-2.0/gio/gfilterinputstream.h +0 -5
- package/include/glib-2.0/gio/gfilteroutputstream.h +0 -5
- package/include/glib-2.0/gio/gicon.h +0 -5
- package/include/glib-2.0/gio/ginitable.h +0 -7
- package/include/glib-2.0/gio/ginputstream.h +0 -5
- package/include/glib-2.0/gio/gio-autocleanups.h +4 -0
- package/include/glib-2.0/gio/gio-visibility.h +34 -0
- package/include/glib-2.0/gio/gioenums.h +6 -1
- package/include/glib-2.0/gio/giomodule.h +0 -5
- package/include/glib-2.0/gio/giostream.h +0 -5
- package/include/glib-2.0/gio/giotypes.h +5 -108
- package/include/glib-2.0/gio/gloadableicon.h +0 -6
- package/include/glib-2.0/gio/gmemoryinputstream.h +0 -5
- package/include/glib-2.0/gio/gmemoryoutputstream.h +0 -5
- package/include/glib-2.0/gio/gmountoperation.h +0 -6
- package/include/glib-2.0/gio/gnetworking.h +4 -0
- package/include/glib-2.0/gio/goutputstream.h +0 -9
- package/include/glib-2.0/gio/gpollableinputstream.h +0 -7
- package/include/glib-2.0/gio/gpollableoutputstream.h +0 -7
- package/include/glib-2.0/gio/gproxy.h +0 -7
- package/include/glib-2.0/gio/gproxyaddressenumerator.h +0 -8
- package/include/glib-2.0/gio/gseekable.h +0 -5
- package/include/glib-2.0/gio/gsettingsbackend.h +0 -5
- package/include/glib-2.0/gio/gsimpleactiongroup.h +0 -7
- package/include/glib-2.0/gio/gsimpleasyncresult.h +0 -5
- package/include/glib-2.0/gio/gsimpleproxyresolver.h +0 -5
- package/include/glib-2.0/gio/gsocket.h +13 -0
- package/include/glib-2.0/gio/gsocketaddressenumerator.h +0 -6
- package/include/glib-2.0/gio/gsocketconnectable.h +0 -5
- package/include/glib-2.0/gio/gtask.h +12 -0
- package/include/glib-2.0/gio/gthemedicon.h +0 -5
- package/include/glib-2.0/gio/gtlsserverconnection.h +0 -8
- package/include/glib-2.0/gio/gunixcredentialsmessage.h +0 -8
- package/include/glib-2.0/gio/gvfs.h +0 -5
- package/include/glib-2.0/gio/gvolume.h +2 -2
- package/include/glib-2.0/gio/gvolumemonitor.h +0 -5
- package/include/glib-2.0/girepository/gi-visibility.h +986 -0
- package/include/glib-2.0/girepository/giarginfo.h +100 -0
- package/include/glib-2.0/girepository/gibaseinfo.h +129 -0
- package/include/glib-2.0/girepository/gicallableinfo.h +119 -0
- package/include/glib-2.0/girepository/gicallbackinfo.h +60 -0
- package/include/glib-2.0/girepository/giconstantinfo.h +72 -0
- package/include/glib-2.0/girepository/gienuminfo.h +82 -0
- package/include/glib-2.0/girepository/gifieldinfo.h +84 -0
- package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
- package/include/glib-2.0/girepository/gifunctioninfo.h +117 -0
- package/include/glib-2.0/girepository/giinterfaceinfo.h +120 -0
- package/include/glib-2.0/girepository/giobjectinfo.h +230 -0
- package/include/glib-2.0/girepository/gipropertyinfo.h +77 -0
- package/include/glib-2.0/girepository/giregisteredtypeinfo.h +75 -0
- package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
- package/include/glib-2.0/girepository/girepository.h +247 -0
- package/include/glib-2.0/girepository/girffi.h +129 -0
- package/include/glib-2.0/girepository/gisignalinfo.h +72 -0
- package/include/glib-2.0/girepository/gistructinfo.h +102 -0
- package/include/glib-2.0/girepository/gitypeinfo.h +144 -0
- package/include/glib-2.0/girepository/gitypelib.h +61 -0
- package/include/glib-2.0/girepository/gitypes.h +421 -0
- package/include/glib-2.0/girepository/giunioninfo.h +105 -0
- package/include/glib-2.0/girepository/giunresolvedinfo.h +60 -0
- package/include/glib-2.0/girepository/givalueinfo.h +65 -0
- package/include/glib-2.0/girepository/givfuncinfo.h +88 -0
- package/include/glib-2.0/glib/deprecated/gcompletion.h +1 -1
- package/include/glib-2.0/glib/deprecated/grel.h +0 -23
- package/include/glib-2.0/glib/deprecated/gthread.h +10 -6
- package/include/glib-2.0/glib/gatomic.h +20 -20
- package/include/glib-2.0/glib/gbitlock.h +31 -0
- package/include/glib-2.0/glib/gbookmarkfile.h +39 -1
- package/include/glib-2.0/glib/gchecksum.h +0 -10
- package/include/glib-2.0/glib/gdate.h +0 -9
- package/include/glib-2.0/glib/gdatetime.h +33 -1
- package/include/glib-2.0/glib/gdir.h +5 -0
- package/include/glib-2.0/glib/ghmac.h +0 -9
- package/include/glib-2.0/glib/glib-autocleanups.h +4 -0
- package/include/glib-2.0/glib/glib-visibility.h +34 -0
- package/include/glib-2.0/glib/gmacros.h +1 -0
- package/include/glib-2.0/glib/gmessages.h +11 -0
- package/include/glib-2.0/glib/gpathbuf.h +0 -7
- package/include/glib-2.0/glib/gslice.h +2 -0
- package/include/glib-2.0/glib/gstdio.h +1 -1
- package/include/glib-2.0/glib/gstrfuncs.h +24 -18
- package/include/glib-2.0/glib/gstrvbuilder.h +4 -8
- package/include/glib-2.0/glib/gtestutils.h +5 -0
- package/include/glib-2.0/glib/gthread.h +216 -3
- package/include/glib-2.0/glib/gunicode.h +12 -2
- package/include/glib-2.0/glib/gvarianttype.h +1 -10
- package/include/glib-2.0/glib/gversionmacros.h +9 -0
- package/include/glib-2.0/glib/gwin32.h +4 -4
- package/include/glib-2.0/glib-unix.h +214 -0
- package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
- package/include/glib-2.0/gobject/gbinding.h +0 -8
- package/include/glib-2.0/gobject/gbindinggroup.h +0 -8
- package/include/glib-2.0/gobject/gclosure.h +1 -9
- package/include/glib-2.0/gobject/genums.h +6 -6
- package/include/glib-2.0/gobject/glib-types.h +44 -0
- package/include/glib-2.0/gobject/gobject-autocleanups.h +4 -0
- package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
- package/include/glib-2.0/gobject/gobject.h +1 -16
- package/include/glib-2.0/gobject/gparam.h +3 -12
- package/include/glib-2.0/gobject/gsignal.h +16 -6
- package/include/glib-2.0/gobject/gsignalgroup.h +0 -8
- package/include/glib-2.0/gobject/gtype.h +53 -20
- package/include/glib-2.0/gobject/gtypemodule.h +0 -7
- package/include/glib-2.0/gobject/gtypeplugin.h +0 -6
- package/include/glib-2.0/gobject/gvaluearray.h +0 -7
- package/include/glib-2.0/gobject/gvaluecollector.h +1 -11
- package/include/glib-2.0/gobject/gvaluetypes.h +2 -0
- package/include/hwy/aligned_allocator.h +171 -6
- package/include/hwy/base.h +1765 -543
- package/include/hwy/cache_control.h +24 -6
- package/include/hwy/detect_compiler_arch.h +23 -2
- package/include/hwy/detect_targets.h +56 -13
- package/include/hwy/foreach_target.h +24 -0
- package/include/hwy/highway.h +20 -3
- package/include/hwy/ops/arm_neon-inl.h +1086 -667
- package/include/hwy/ops/arm_sve-inl.h +1091 -235
- package/include/hwy/ops/emu128-inl.h +271 -196
- package/include/hwy/ops/generic_ops-inl.h +2270 -399
- package/include/hwy/ops/ppc_vsx-inl.h +1786 -563
- package/include/hwy/ops/rvv-inl.h +1043 -311
- package/include/hwy/ops/scalar-inl.h +189 -159
- package/include/hwy/ops/set_macros-inl.h +66 -6
- package/include/hwy/ops/shared-inl.h +175 -56
- package/include/hwy/ops/wasm_128-inl.h +153 -136
- package/include/hwy/ops/x86_128-inl.h +1647 -646
- package/include/hwy/ops/x86_256-inl.h +1003 -370
- package/include/hwy/ops/x86_512-inl.h +948 -353
- package/include/hwy/per_target.h +4 -0
- package/include/hwy/profiler.h +648 -0
- package/include/hwy/robust_statistics.h +2 -2
- package/include/hwy/targets.h +18 -11
- package/include/hwy/timer.h +11 -0
- package/include/lcms2.h +46 -7
- package/include/lcms2_plugin.h +4 -4
- package/include/libheif/heif_version.h +2 -2
- package/include/libpng16/png.h +32 -29
- package/include/libpng16/pngconf.h +2 -2
- package/include/libpng16/pnglibconf.h +7 -2
- package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
- package/include/libxml2/libxml/HTMLparser.h +23 -0
- package/include/libxml2/libxml/SAX.h +0 -2
- package/include/libxml2/libxml/SAX2.h +0 -2
- package/include/libxml2/libxml/c14n.h +0 -2
- package/include/libxml2/libxml/dict.h +1 -0
- package/include/libxml2/libxml/encoding.h +16 -14
- package/include/libxml2/libxml/entities.h +4 -0
- package/include/libxml2/libxml/globals.h +15 -503
- package/include/libxml2/libxml/hash.h +57 -61
- package/include/libxml2/libxml/nanoftp.h +2 -2
- package/include/libxml2/libxml/parser.h +137 -18
- package/include/libxml2/libxml/parserInternals.h +1 -0
- package/include/libxml2/libxml/relaxng.h +2 -1
- package/include/libxml2/libxml/schemasInternals.h +1 -0
- package/include/libxml2/libxml/schematron.h +1 -0
- package/include/libxml2/libxml/threads.h +4 -11
- package/include/libxml2/libxml/tree.h +68 -20
- package/include/libxml2/libxml/uri.h +2 -1
- package/include/libxml2/libxml/valid.h +2 -0
- package/include/libxml2/libxml/xmlIO.h +65 -13
- package/include/libxml2/libxml/xmlerror.h +37 -8
- package/include/libxml2/libxml/xmlmemory.h +37 -40
- package/include/libxml2/libxml/xmlreader.h +6 -0
- package/include/libxml2/libxml/xmlregexp.h +2 -9
- package/include/libxml2/libxml/xmlsave.h +9 -0
- package/include/libxml2/libxml/xmlschemas.h +3 -0
- package/include/libxml2/libxml/xmlversion.h +28 -43
- package/include/libxml2/libxml/xpath.h +1 -1
- package/include/libxml2/libxml/xpathInternals.h +2 -1
- package/include/libxml2/libxml/xpointer.h +5 -4
- package/include/pango-1.0/pango/pango-features.h +3 -3
- package/include/pango-1.0/pango/pango-fontmap.h +7 -0
- package/include/pixman-1/pixman-version.h +3 -3
- package/include/pixman-1/pixman.h +9 -2
- package/include/png.h +32 -29
- package/include/pngconf.h +2 -2
- package/include/pnglibconf.h +7 -2
- package/include/vips/connection.h +9 -3
- package/include/vips/util.h +0 -9
- package/include/vips/version.h +4 -4
- package/include/zconf.h +3 -0
- package/include/zlib.h +3 -3
- package/package.json +1 -1
- package/versions.json +15 -15
|
@@ -13,9 +13,15 @@
|
|
|
13
13
|
// See the License for the specific language governing permissions and
|
|
14
14
|
// limitations under the License.
|
|
15
15
|
|
|
16
|
-
// 128-bit vectors for VSX
|
|
16
|
+
// 128-bit vectors for VSX/Z14
|
|
17
17
|
// External include guard in highway.h - see comment there.
|
|
18
18
|
|
|
19
|
+
#if HWY_TARGET == HWY_Z14 || HWY_TARGET == HWY_Z15
|
|
20
|
+
#define HWY_S390X_HAVE_Z14 1
|
|
21
|
+
#else
|
|
22
|
+
#define HWY_S390X_HAVE_Z14 0
|
|
23
|
+
#endif
|
|
24
|
+
|
|
19
25
|
#pragma push_macro("vector")
|
|
20
26
|
#pragma push_macro("pixel")
|
|
21
27
|
#pragma push_macro("bool")
|
|
@@ -24,7 +30,11 @@
|
|
|
24
30
|
#undef pixel
|
|
25
31
|
#undef bool
|
|
26
32
|
|
|
33
|
+
#if HWY_S390X_HAVE_Z14
|
|
34
|
+
#include <vecintrin.h>
|
|
35
|
+
#else
|
|
27
36
|
#include <altivec.h>
|
|
37
|
+
#endif
|
|
28
38
|
|
|
29
39
|
#pragma pop_macro("vector")
|
|
30
40
|
#pragma pop_macro("pixel")
|
|
@@ -37,20 +47,26 @@
|
|
|
37
47
|
// This means we can only use POWER10-specific intrinsics in static dispatch
|
|
38
48
|
// mode (where the -mpower10-vector compiler flag is passed). Same for PPC9.
|
|
39
49
|
// On other compilers, the usual target check is sufficient.
|
|
40
|
-
#if HWY_TARGET <= HWY_PPC9 && \
|
|
50
|
+
#if !HWY_S390X_HAVE_Z14 && HWY_TARGET <= HWY_PPC9 && \
|
|
41
51
|
(defined(_ARCH_PWR9) || defined(__POWER9_VECTOR__))
|
|
42
52
|
#define HWY_PPC_HAVE_9 1
|
|
43
53
|
#else
|
|
44
54
|
#define HWY_PPC_HAVE_9 0
|
|
45
55
|
#endif
|
|
46
56
|
|
|
47
|
-
#if HWY_TARGET <= HWY_PPC10 && \
|
|
57
|
+
#if !HWY_S390X_HAVE_Z14 && HWY_TARGET <= HWY_PPC10 && \
|
|
48
58
|
(defined(_ARCH_PWR10) || defined(__POWER10_VECTOR__))
|
|
49
59
|
#define HWY_PPC_HAVE_10 1
|
|
50
60
|
#else
|
|
51
61
|
#define HWY_PPC_HAVE_10 0
|
|
52
62
|
#endif
|
|
53
63
|
|
|
64
|
+
#if HWY_S390X_HAVE_Z14 && HWY_TARGET <= HWY_Z15 && __ARCH__ >= 13
|
|
65
|
+
#define HWY_S390X_HAVE_Z15 1
|
|
66
|
+
#else
|
|
67
|
+
#define HWY_S390X_HAVE_Z15 0
|
|
68
|
+
#endif
|
|
69
|
+
|
|
54
70
|
HWY_BEFORE_NAMESPACE();
|
|
55
71
|
namespace hwy {
|
|
56
72
|
namespace HWY_NAMESPACE {
|
|
@@ -125,6 +141,9 @@ class Vec128 {
|
|
|
125
141
|
HWY_INLINE Vec128& operator-=(const Vec128 other) {
|
|
126
142
|
return *this = (*this - other);
|
|
127
143
|
}
|
|
144
|
+
HWY_INLINE Vec128& operator%=(const Vec128 other) {
|
|
145
|
+
return *this = (*this % other);
|
|
146
|
+
}
|
|
128
147
|
HWY_INLINE Vec128& operator&=(const Vec128 other) {
|
|
129
148
|
return *this = (*this & other);
|
|
130
149
|
}
|
|
@@ -215,6 +234,12 @@ HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
|
|
|
215
234
|
return VFromD<D>{vec_splats(static_cast<RawLane>(t))};
|
|
216
235
|
}
|
|
217
236
|
|
|
237
|
+
template <class D, HWY_IF_SPECIAL_FLOAT(TFromD<D>)>
|
|
238
|
+
HWY_API VFromD<D> Set(D d, TFromD<D> t) {
|
|
239
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
240
|
+
return BitCast(d, Set(du, BitCastScalar<TFromD<decltype(du)>>(t)));
|
|
241
|
+
}
|
|
242
|
+
|
|
218
243
|
// Returns a vector with uninitialized elements.
|
|
219
244
|
template <class D>
|
|
220
245
|
HWY_API VFromD<D> Undefined(D d) {
|
|
@@ -240,6 +265,58 @@ HWY_API T GetLane(Vec128<T, N> v) {
|
|
|
240
265
|
return static_cast<T>(v.raw[0]);
|
|
241
266
|
}
|
|
242
267
|
|
|
268
|
+
// ------------------------------ Dup128VecFromValues
|
|
269
|
+
|
|
270
|
+
template <class D, HWY_IF_T_SIZE_D(D, 1)>
|
|
271
|
+
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
|
|
272
|
+
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
|
|
273
|
+
TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
|
|
274
|
+
TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
|
|
275
|
+
TFromD<D> t11, TFromD<D> t12,
|
|
276
|
+
TFromD<D> t13, TFromD<D> t14,
|
|
277
|
+
TFromD<D> t15) {
|
|
278
|
+
const typename detail::Raw128<TFromD<D>>::type raw = {
|
|
279
|
+
t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15};
|
|
280
|
+
return VFromD<D>{raw};
|
|
281
|
+
}
|
|
282
|
+
|
|
283
|
+
template <class D, HWY_IF_UI16_D(D)>
|
|
284
|
+
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
|
|
285
|
+
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
|
|
286
|
+
TFromD<D> t5, TFromD<D> t6,
|
|
287
|
+
TFromD<D> t7) {
|
|
288
|
+
const typename detail::Raw128<TFromD<D>>::type raw = {t0, t1, t2, t3,
|
|
289
|
+
t4, t5, t6, t7};
|
|
290
|
+
return VFromD<D>{raw};
|
|
291
|
+
}
|
|
292
|
+
|
|
293
|
+
template <class D, HWY_IF_SPECIAL_FLOAT_D(D)>
|
|
294
|
+
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
|
|
295
|
+
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
|
|
296
|
+
TFromD<D> t5, TFromD<D> t6,
|
|
297
|
+
TFromD<D> t7) {
|
|
298
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
299
|
+
return BitCast(
|
|
300
|
+
d, Dup128VecFromValues(
|
|
301
|
+
du, BitCastScalar<uint16_t>(t0), BitCastScalar<uint16_t>(t1),
|
|
302
|
+
BitCastScalar<uint16_t>(t2), BitCastScalar<uint16_t>(t3),
|
|
303
|
+
BitCastScalar<uint16_t>(t4), BitCastScalar<uint16_t>(t5),
|
|
304
|
+
BitCastScalar<uint16_t>(t6), BitCastScalar<uint16_t>(t7)));
|
|
305
|
+
}
|
|
306
|
+
|
|
307
|
+
template <class D, HWY_IF_T_SIZE_D(D, 4)>
|
|
308
|
+
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
|
|
309
|
+
TFromD<D> t2, TFromD<D> t3) {
|
|
310
|
+
const typename detail::Raw128<TFromD<D>>::type raw = {t0, t1, t2, t3};
|
|
311
|
+
return VFromD<D>{raw};
|
|
312
|
+
}
|
|
313
|
+
|
|
314
|
+
template <class D, HWY_IF_T_SIZE_D(D, 8)>
|
|
315
|
+
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
|
|
316
|
+
const typename detail::Raw128<TFromD<D>>::type raw = {t0, t1};
|
|
317
|
+
return VFromD<D>{raw};
|
|
318
|
+
}
|
|
319
|
+
|
|
243
320
|
// ================================================== LOGICAL
|
|
244
321
|
|
|
245
322
|
// ------------------------------ And
|
|
@@ -249,7 +326,11 @@ HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
|
|
|
249
326
|
const DFromV<decltype(a)> d;
|
|
250
327
|
const RebindToUnsigned<decltype(d)> du;
|
|
251
328
|
using VU = VFromD<decltype(du)>;
|
|
329
|
+
#if HWY_S390X_HAVE_Z14
|
|
330
|
+
return BitCast(d, VU{BitCast(du, a).raw & BitCast(du, b).raw});
|
|
331
|
+
#else
|
|
252
332
|
return BitCast(d, VU{vec_and(BitCast(du, a).raw, BitCast(du, b).raw)});
|
|
333
|
+
#endif
|
|
253
334
|
}
|
|
254
335
|
|
|
255
336
|
// ------------------------------ AndNot
|
|
@@ -271,7 +352,11 @@ HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
|
|
|
271
352
|
const DFromV<decltype(a)> d;
|
|
272
353
|
const RebindToUnsigned<decltype(d)> du;
|
|
273
354
|
using VU = VFromD<decltype(du)>;
|
|
355
|
+
#if HWY_S390X_HAVE_Z14
|
|
356
|
+
return BitCast(d, VU{BitCast(du, a).raw | BitCast(du, b).raw});
|
|
357
|
+
#else
|
|
274
358
|
return BitCast(d, VU{vec_or(BitCast(du, a).raw, BitCast(du, b).raw)});
|
|
359
|
+
#endif
|
|
275
360
|
}
|
|
276
361
|
|
|
277
362
|
// ------------------------------ Xor
|
|
@@ -281,7 +366,11 @@ HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
|
|
|
281
366
|
const DFromV<decltype(a)> d;
|
|
282
367
|
const RebindToUnsigned<decltype(d)> du;
|
|
283
368
|
using VU = VFromD<decltype(du)>;
|
|
369
|
+
#if HWY_S390X_HAVE_Z14
|
|
370
|
+
return BitCast(d, VU{BitCast(du, a).raw ^ BitCast(du, b).raw});
|
|
371
|
+
#else
|
|
284
372
|
return BitCast(d, VU{vec_xor(BitCast(du, a).raw, BitCast(du, b).raw)});
|
|
373
|
+
#endif
|
|
285
374
|
}
|
|
286
375
|
|
|
287
376
|
// ------------------------------ Not
|
|
@@ -476,9 +565,21 @@ HWY_API Vec128<T, N> operator^(Vec128<T, N> a, Vec128<T, N> b) {
|
|
|
476
565
|
|
|
477
566
|
// ------------------------------ Neg
|
|
478
567
|
|
|
479
|
-
template <typename T, size_t N,
|
|
480
|
-
|
|
568
|
+
template <typename T, size_t N, HWY_IF_SIGNED(T)>
|
|
569
|
+
HWY_API Vec128<T, N> Neg(Vec128<T, N> v) {
|
|
570
|
+
// If T is an signed integer type, use Zero(d) - v instead of vec_neg to
|
|
571
|
+
// avoid undefined behavior in the case where v[i] == LimitsMin<T>()
|
|
572
|
+
const DFromV<decltype(v)> d;
|
|
573
|
+
return Zero(d) - v;
|
|
574
|
+
}
|
|
575
|
+
|
|
576
|
+
template <typename T, size_t N, HWY_IF_FLOAT3264(T)>
|
|
577
|
+
HWY_API Vec128<T, N> Neg(Vec128<T, N> v) {
|
|
578
|
+
#if HWY_S390X_HAVE_Z14
|
|
579
|
+
return Xor(v, SignBit(DFromV<decltype(v)>()));
|
|
580
|
+
#else
|
|
481
581
|
return Vec128<T, N>{vec_neg(v.raw)};
|
|
582
|
+
#endif
|
|
482
583
|
}
|
|
483
584
|
|
|
484
585
|
template <typename T, size_t N, HWY_IF_SPECIAL_FLOAT(T)>
|
|
@@ -489,13 +590,40 @@ HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
|
|
|
489
590
|
// ------------------------------ Abs
|
|
490
591
|
|
|
491
592
|
// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
|
|
492
|
-
template <class T, size_t N,
|
|
593
|
+
template <class T, size_t N, HWY_IF_SIGNED(T)>
|
|
594
|
+
HWY_API Vec128<T, N> Abs(Vec128<T, N> v) {
|
|
595
|
+
// If T is a signed integer type, use Max(v, Neg(v)) instead of vec_abs to
|
|
596
|
+
// avoid undefined behavior in the case where v[i] == LimitsMin<T>().
|
|
597
|
+
return Max(v, Neg(v));
|
|
598
|
+
}
|
|
599
|
+
|
|
600
|
+
template <class T, size_t N, HWY_IF_FLOAT3264(T)>
|
|
493
601
|
HWY_API Vec128<T, N> Abs(Vec128<T, N> v) {
|
|
494
602
|
return Vec128<T, N>{vec_abs(v.raw)};
|
|
495
603
|
}
|
|
496
604
|
|
|
497
605
|
// ------------------------------ CopySign
|
|
498
606
|
|
|
607
|
+
#if HWY_S390X_HAVE_Z14
|
|
608
|
+
template <class V>
|
|
609
|
+
HWY_API V CopySign(const V magn, const V sign) {
|
|
610
|
+
static_assert(IsFloat<TFromV<V>>(), "Only makes sense for floating-point");
|
|
611
|
+
|
|
612
|
+
const DFromV<decltype(magn)> d;
|
|
613
|
+
const auto msb = SignBit(d);
|
|
614
|
+
|
|
615
|
+
// Truth table for msb, magn, sign | bitwise msb ? sign : mag
|
|
616
|
+
// 0 0 0 | 0
|
|
617
|
+
// 0 0 1 | 0
|
|
618
|
+
// 0 1 0 | 1
|
|
619
|
+
// 0 1 1 | 1
|
|
620
|
+
// 1 0 0 | 0
|
|
621
|
+
// 1 0 1 | 1
|
|
622
|
+
// 1 1 0 | 0
|
|
623
|
+
// 1 1 1 | 1
|
|
624
|
+
return BitwiseIfThenElse(msb, sign, magn);
|
|
625
|
+
}
|
|
626
|
+
#else // VSX
|
|
499
627
|
template <size_t N>
|
|
500
628
|
HWY_API Vec128<float, N> CopySign(Vec128<float, N> magn,
|
|
501
629
|
Vec128<float, N> sign) {
|
|
@@ -525,6 +653,7 @@ HWY_API Vec128<double, N> CopySign(Vec128<double, N> magn,
|
|
|
525
653
|
return Vec128<double, N>{vec_cpsgn(sign.raw, magn.raw)};
|
|
526
654
|
#endif
|
|
527
655
|
}
|
|
656
|
+
#endif // HWY_S390X_HAVE_Z14
|
|
528
657
|
|
|
529
658
|
template <typename T, size_t N>
|
|
530
659
|
HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) {
|
|
@@ -543,7 +672,7 @@ HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) {
|
|
|
543
672
|
template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
|
|
544
673
|
HWY_API Vec128<T> Load(D /* tag */, const T* HWY_RESTRICT aligned) {
|
|
545
674
|
using LoadRaw = typename detail::Raw128<T>::AlignedRawVec;
|
|
546
|
-
const LoadRaw* HWY_RESTRICT p =
|
|
675
|
+
const LoadRaw* HWY_RESTRICT p = HWY_RCAST_ALIGNED(const LoadRaw*, aligned);
|
|
547
676
|
using ResultRaw = typename detail::Raw128<T>::type;
|
|
548
677
|
return Vec128<T>{reinterpret_cast<ResultRaw>(*p)};
|
|
549
678
|
}
|
|
@@ -598,19 +727,13 @@ HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
|
|
|
598
727
|
// mask ? yes : 0
|
|
599
728
|
template <typename T, size_t N>
|
|
600
729
|
HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
|
|
601
|
-
|
|
602
|
-
const RebindToUnsigned<decltype(d)> du;
|
|
603
|
-
return BitCast(d,
|
|
604
|
-
VFromD<decltype(du)>{vec_and(BitCast(du, yes).raw, mask.raw)});
|
|
730
|
+
return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
|
|
605
731
|
}
|
|
606
732
|
|
|
607
733
|
// mask ? 0 : no
|
|
608
734
|
template <typename T, size_t N>
|
|
609
735
|
HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
|
|
610
|
-
|
|
611
|
-
const RebindToUnsigned<decltype(d)> du;
|
|
612
|
-
return BitCast(d,
|
|
613
|
-
VFromD<decltype(du)>{vec_andc(BitCast(du, no).raw, mask.raw)});
|
|
736
|
+
return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
|
|
614
737
|
}
|
|
615
738
|
|
|
616
739
|
// ------------------------------ Mask logical
|
|
@@ -622,7 +745,11 @@ HWY_API Mask128<T, N> Not(Mask128<T, N> m) {
|
|
|
622
745
|
|
|
623
746
|
template <typename T, size_t N>
|
|
624
747
|
HWY_API Mask128<T, N> And(Mask128<T, N> a, Mask128<T, N> b) {
|
|
748
|
+
#if HWY_S390X_HAVE_Z14
|
|
749
|
+
return Mask128<T, N>{a.raw & b.raw};
|
|
750
|
+
#else
|
|
625
751
|
return Mask128<T, N>{vec_and(a.raw, b.raw)};
|
|
752
|
+
#endif
|
|
626
753
|
}
|
|
627
754
|
|
|
628
755
|
template <typename T, size_t N>
|
|
@@ -632,12 +759,20 @@ HWY_API Mask128<T, N> AndNot(Mask128<T, N> a, Mask128<T, N> b) {
|
|
|
632
759
|
|
|
633
760
|
template <typename T, size_t N>
|
|
634
761
|
HWY_API Mask128<T, N> Or(Mask128<T, N> a, Mask128<T, N> b) {
|
|
762
|
+
#if HWY_S390X_HAVE_Z14
|
|
763
|
+
return Mask128<T, N>{a.raw | b.raw};
|
|
764
|
+
#else
|
|
635
765
|
return Mask128<T, N>{vec_or(a.raw, b.raw)};
|
|
766
|
+
#endif
|
|
636
767
|
}
|
|
637
768
|
|
|
638
769
|
template <typename T, size_t N>
|
|
639
770
|
HWY_API Mask128<T, N> Xor(Mask128<T, N> a, Mask128<T, N> b) {
|
|
771
|
+
#if HWY_S390X_HAVE_Z14
|
|
772
|
+
return Mask128<T, N>{a.raw ^ b.raw};
|
|
773
|
+
#else
|
|
640
774
|
return Mask128<T, N>{vec_xor(a.raw, b.raw)};
|
|
775
|
+
#endif
|
|
641
776
|
}
|
|
642
777
|
|
|
643
778
|
template <typename T, size_t N>
|
|
@@ -645,36 +780,24 @@ HWY_API Mask128<T, N> ExclusiveNeither(Mask128<T, N> a, Mask128<T, N> b) {
|
|
|
645
780
|
return Mask128<T, N>{vec_nor(a.raw, b.raw)};
|
|
646
781
|
}
|
|
647
782
|
|
|
648
|
-
// ------------------------------ BroadcastSignBit
|
|
649
|
-
|
|
650
|
-
template <size_t N>
|
|
651
|
-
HWY_API Vec128<int8_t, N> BroadcastSignBit(Vec128<int8_t, N> v) {
|
|
652
|
-
return Vec128<int8_t, N>{
|
|
653
|
-
vec_sra(v.raw, vec_splats(static_cast<unsigned char>(7)))};
|
|
654
|
-
}
|
|
655
|
-
|
|
656
|
-
template <size_t N>
|
|
657
|
-
HWY_API Vec128<int16_t, N> BroadcastSignBit(Vec128<int16_t, N> v) {
|
|
658
|
-
return Vec128<int16_t, N>{
|
|
659
|
-
vec_sra(v.raw, vec_splats(static_cast<unsigned short>(15)))};
|
|
660
|
-
}
|
|
661
|
-
|
|
662
|
-
template <size_t N>
|
|
663
|
-
HWY_API Vec128<int32_t, N> BroadcastSignBit(Vec128<int32_t, N> v) {
|
|
664
|
-
return Vec128<int32_t, N>{vec_sra(v.raw, vec_splats(31u))};
|
|
665
|
-
}
|
|
666
|
-
|
|
667
|
-
template <size_t N>
|
|
668
|
-
HWY_API Vec128<int64_t, N> BroadcastSignBit(Vec128<int64_t, N> v) {
|
|
669
|
-
return Vec128<int64_t, N>{vec_sra(v.raw, vec_splats(63ULL))};
|
|
670
|
-
}
|
|
671
|
-
|
|
672
783
|
// ------------------------------ ShiftLeftSame
|
|
673
784
|
|
|
674
785
|
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
|
|
675
786
|
HWY_API Vec128<T, N> ShiftLeftSame(Vec128<T, N> v, const int bits) {
|
|
676
|
-
|
|
677
|
-
|
|
787
|
+
const DFromV<decltype(v)> d;
|
|
788
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
789
|
+
using TU = TFromD<decltype(du)>;
|
|
790
|
+
|
|
791
|
+
#if HWY_S390X_HAVE_Z14
|
|
792
|
+
return BitCast(d,
|
|
793
|
+
VFromD<decltype(du)>{BitCast(du, v).raw
|
|
794
|
+
<< Set(du, static_cast<TU>(bits)).raw});
|
|
795
|
+
#else
|
|
796
|
+
// Do an unsigned vec_sl operation to avoid undefined behavior
|
|
797
|
+
return BitCast(
|
|
798
|
+
d, VFromD<decltype(du)>{
|
|
799
|
+
vec_sl(BitCast(du, v).raw, Set(du, static_cast<TU>(bits)).raw)});
|
|
800
|
+
#endif
|
|
678
801
|
}
|
|
679
802
|
|
|
680
803
|
// ------------------------------ ShiftRightSame
|
|
@@ -682,13 +805,22 @@ HWY_API Vec128<T, N> ShiftLeftSame(Vec128<T, N> v, const int bits) {
|
|
|
682
805
|
template <typename T, size_t N, HWY_IF_UNSIGNED(T)>
|
|
683
806
|
HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, const int bits) {
|
|
684
807
|
using TU = typename detail::Raw128<MakeUnsigned<T>>::RawT;
|
|
808
|
+
#if HWY_S390X_HAVE_Z14
|
|
809
|
+
return Vec128<T, N>{v.raw >> vec_splats(static_cast<TU>(bits))};
|
|
810
|
+
#else
|
|
685
811
|
return Vec128<T, N>{vec_sr(v.raw, vec_splats(static_cast<TU>(bits)))};
|
|
812
|
+
#endif
|
|
686
813
|
}
|
|
687
814
|
|
|
688
815
|
template <typename T, size_t N, HWY_IF_SIGNED(T)>
|
|
689
816
|
HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, const int bits) {
|
|
817
|
+
#if HWY_S390X_HAVE_Z14
|
|
818
|
+
using TI = typename detail::Raw128<T>::RawT;
|
|
819
|
+
return Vec128<T, N>{v.raw >> vec_splats(static_cast<TI>(bits))};
|
|
820
|
+
#else
|
|
690
821
|
using TU = typename detail::Raw128<MakeUnsigned<T>>::RawT;
|
|
691
822
|
return Vec128<T, N>{vec_sra(v.raw, vec_splats(static_cast<TU>(bits)))};
|
|
823
|
+
#endif
|
|
692
824
|
}
|
|
693
825
|
|
|
694
826
|
// ------------------------------ ShiftLeft
|
|
@@ -707,6 +839,13 @@ HWY_API Vec128<T, N> ShiftRight(Vec128<T, N> v) {
|
|
|
707
839
|
return ShiftRightSame(v, kBits);
|
|
708
840
|
}
|
|
709
841
|
|
|
842
|
+
// ------------------------------ BroadcastSignBit
|
|
843
|
+
|
|
844
|
+
template <typename T, size_t N, HWY_IF_SIGNED(T)>
|
|
845
|
+
HWY_API Vec128<T, N> BroadcastSignBit(Vec128<T, N> v) {
|
|
846
|
+
return ShiftRightSame(v, static_cast<int>(sizeof(T) * 8 - 1));
|
|
847
|
+
}
|
|
848
|
+
|
|
710
849
|
// ================================================== SWIZZLE (1)
|
|
711
850
|
|
|
712
851
|
// ------------------------------ TableLookupBytes
|
|
@@ -1003,7 +1142,7 @@ HWY_API VFromD<D> LoadDup128(D d, const T* HWY_RESTRICT p) {
|
|
|
1003
1142
|
return LoadU(d, p);
|
|
1004
1143
|
}
|
|
1005
1144
|
|
|
1006
|
-
#if HWY_PPC_HAVE_9
|
|
1145
|
+
#if (HWY_PPC_HAVE_9 && HWY_ARCH_PPC_64) || HWY_S390X_HAVE_Z14
|
|
1007
1146
|
#ifdef HWY_NATIVE_LOAD_N
|
|
1008
1147
|
#undef HWY_NATIVE_LOAD_N
|
|
1009
1148
|
#else
|
|
@@ -1027,11 +1166,20 @@ HWY_API VFromD<D> LoadN(D d, const T* HWY_RESTRICT p,
|
|
|
1027
1166
|
const size_t num_of_bytes_to_load =
|
|
1028
1167
|
HWY_MIN(max_lanes_to_load, HWY_MAX_LANES_D(D)) * sizeof(TFromD<D>);
|
|
1029
1168
|
const Repartition<uint8_t, decltype(d)> du8;
|
|
1169
|
+
#if HWY_S390X_HAVE_Z14
|
|
1170
|
+
return (num_of_bytes_to_load > 0)
|
|
1171
|
+
? BitCast(d, VFromD<decltype(du8)>{vec_load_len(
|
|
1172
|
+
const_cast<unsigned char*>(
|
|
1173
|
+
reinterpret_cast<const unsigned char*>(p)),
|
|
1174
|
+
static_cast<unsigned>(num_of_bytes_to_load - 1))})
|
|
1175
|
+
: Zero(d);
|
|
1176
|
+
#else
|
|
1030
1177
|
return BitCast(
|
|
1031
1178
|
d,
|
|
1032
1179
|
VFromD<decltype(du8)>{vec_xl_len(
|
|
1033
1180
|
const_cast<unsigned char*>(reinterpret_cast<const unsigned char*>(p)),
|
|
1034
1181
|
num_of_bytes_to_load)});
|
|
1182
|
+
#endif
|
|
1035
1183
|
}
|
|
1036
1184
|
|
|
1037
1185
|
template <class D, typename T = TFromD<D>>
|
|
@@ -1048,18 +1196,11 @@ HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const T* HWY_RESTRICT p,
|
|
|
1048
1196
|
}
|
|
1049
1197
|
#endif
|
|
1050
1198
|
|
|
1051
|
-
|
|
1052
|
-
|
|
1053
|
-
const Repartition<uint8_t, decltype(d)> du8;
|
|
1054
|
-
const VFromD<D> v = BitCast(
|
|
1055
|
-
d,
|
|
1056
|
-
VFromD<decltype(du8)>{vec_xl_len(
|
|
1057
|
-
const_cast<unsigned char*>(reinterpret_cast<const unsigned char*>(p)),
|
|
1058
|
-
num_of_bytes_to_load)});
|
|
1059
|
-
return IfThenElse(FirstN(d, max_lanes_to_load), v, no);
|
|
1199
|
+
return IfThenElse(FirstN(d, max_lanes_to_load),
|
|
1200
|
+
LoadN(d, p, max_lanes_to_load), no);
|
|
1060
1201
|
}
|
|
1061
1202
|
|
|
1062
|
-
#endif // HWY_PPC_HAVE_9
|
|
1203
|
+
#endif // HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14
|
|
1063
1204
|
|
|
1064
1205
|
// Returns a vector with lane i=[0, N) set to "first" + i.
|
|
1065
1206
|
namespace detail {
|
|
@@ -1135,7 +1276,7 @@ HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
|
|
|
1135
1276
|
template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
|
|
1136
1277
|
HWY_API void Store(Vec128<T> v, D /* tag */, T* HWY_RESTRICT aligned) {
|
|
1137
1278
|
using StoreRaw = typename detail::Raw128<T>::AlignedRawVec;
|
|
1138
|
-
*
|
|
1279
|
+
*HWY_RCAST_ALIGNED(StoreRaw*, aligned) = reinterpret_cast<StoreRaw>(v.raw);
|
|
1139
1280
|
}
|
|
1140
1281
|
|
|
1141
1282
|
template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
|
|
@@ -1159,7 +1300,7 @@ HWY_API void StoreU(VFromD<D> v, D d, T* HWY_RESTRICT p) {
|
|
|
1159
1300
|
Store(v, d, p);
|
|
1160
1301
|
}
|
|
1161
1302
|
|
|
1162
|
-
#if HWY_PPC_HAVE_9
|
|
1303
|
+
#if (HWY_PPC_HAVE_9 && HWY_ARCH_PPC_64) || HWY_S390X_HAVE_Z14
|
|
1163
1304
|
|
|
1164
1305
|
#ifdef HWY_NATIVE_STORE_N
|
|
1165
1306
|
#undef HWY_NATIVE_STORE_N
|
|
@@ -1185,8 +1326,15 @@ HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
|
|
|
1185
1326
|
const size_t num_of_bytes_to_store =
|
|
1186
1327
|
HWY_MIN(max_lanes_to_store, HWY_MAX_LANES_D(D)) * sizeof(TFromD<D>);
|
|
1187
1328
|
const Repartition<uint8_t, decltype(d)> du8;
|
|
1329
|
+
#if HWY_S390X_HAVE_Z14
|
|
1330
|
+
if (num_of_bytes_to_store > 0) {
|
|
1331
|
+
vec_store_len(BitCast(du8, v).raw, reinterpret_cast<unsigned char*>(p),
|
|
1332
|
+
static_cast<unsigned>(num_of_bytes_to_store - 1));
|
|
1333
|
+
}
|
|
1334
|
+
#else
|
|
1188
1335
|
vec_xst_len(BitCast(du8, v).raw, reinterpret_cast<unsigned char*>(p),
|
|
1189
1336
|
num_of_bytes_to_store);
|
|
1337
|
+
#endif
|
|
1190
1338
|
}
|
|
1191
1339
|
#endif
|
|
1192
1340
|
|
|
@@ -1210,164 +1358,97 @@ HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
|
|
|
1210
1358
|
|
|
1211
1359
|
// ================================================== ARITHMETIC
|
|
1212
1360
|
|
|
1361
|
+
namespace detail {
|
|
1362
|
+
// If TFromD<D> is an integer type, detail::RebindToUnsignedIfNotFloat<D>
|
|
1363
|
+
// rebinds D to MakeUnsigned<TFromD<D>>.
|
|
1364
|
+
|
|
1365
|
+
// Otherwise, if TFromD<D> is a floating-point type (including F16 and BF16),
|
|
1366
|
+
// detail::RebindToUnsignedIfNotFloat<D> is the same as D.
|
|
1367
|
+
template <class D>
|
|
1368
|
+
using RebindToUnsignedIfNotFloat =
|
|
1369
|
+
hwy::If<(!hwy::IsFloat<TFromD<D>>() && !hwy::IsSpecialFloat<TFromD<D>>()),
|
|
1370
|
+
RebindToUnsigned<D>, D>;
|
|
1371
|
+
} // namespace detail
|
|
1372
|
+
|
|
1213
1373
|
// ------------------------------ Addition
|
|
1214
1374
|
|
|
1215
1375
|
template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
|
|
1216
1376
|
HWY_API Vec128<T, N> operator+(Vec128<T, N> a, Vec128<T, N> b) {
|
|
1217
|
-
|
|
1377
|
+
const DFromV<decltype(a)> d;
|
|
1378
|
+
const detail::RebindToUnsignedIfNotFloat<decltype(d)> d_arith;
|
|
1379
|
+
|
|
1380
|
+
// If T is an integer type, do an unsigned vec_add to avoid undefined behavior
|
|
1381
|
+
#if HWY_S390X_HAVE_Z14
|
|
1382
|
+
return BitCast(d, VFromD<decltype(d_arith)>{BitCast(d_arith, a).raw +
|
|
1383
|
+
BitCast(d_arith, b).raw});
|
|
1384
|
+
#else
|
|
1385
|
+
return BitCast(d, VFromD<decltype(d_arith)>{vec_add(
|
|
1386
|
+
BitCast(d_arith, a).raw, BitCast(d_arith, b).raw)});
|
|
1387
|
+
#endif
|
|
1218
1388
|
}
|
|
1219
1389
|
|
|
1220
1390
|
// ------------------------------ Subtraction
|
|
1221
1391
|
|
|
1222
1392
|
template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
|
|
1223
1393
|
HWY_API Vec128<T, N> operator-(Vec128<T, N> a, Vec128<T, N> b) {
|
|
1224
|
-
|
|
1225
|
-
|
|
1226
|
-
|
|
1227
|
-
// ------------------------------ SumsOf8
|
|
1228
|
-
namespace detail {
|
|
1394
|
+
const DFromV<decltype(a)> d;
|
|
1395
|
+
const detail::RebindToUnsignedIfNotFloat<decltype(d)> d_arith;
|
|
1229
1396
|
|
|
1230
|
-
//
|
|
1231
|
-
|
|
1232
|
-
|
|
1233
|
-
|
|
1234
|
-
|
|
1235
|
-
|
|
1236
|
-
|
|
1237
|
-
const int64_t sum0 =
|
|
1238
|
-
static_cast<int64_t>(a[0]) + static_cast<int64_t>(a[1]) +
|
|
1239
|
-
static_cast<int64_t>(a[2]) + static_cast<int64_t>(a[3]) +
|
|
1240
|
-
static_cast<int64_t>(b[0]);
|
|
1241
|
-
const int64_t sum1 =
|
|
1242
|
-
static_cast<int64_t>(a[4]) + static_cast<int64_t>(a[5]) +
|
|
1243
|
-
static_cast<int64_t>(a[6]) + static_cast<int64_t>(a[7]) +
|
|
1244
|
-
static_cast<int64_t>(b[1]);
|
|
1245
|
-
const int64_t sum2 =
|
|
1246
|
-
static_cast<int64_t>(a[8]) + static_cast<int64_t>(a[9]) +
|
|
1247
|
-
static_cast<int64_t>(a[10]) + static_cast<int64_t>(a[11]) +
|
|
1248
|
-
static_cast<int64_t>(b[2]);
|
|
1249
|
-
const int64_t sum3 =
|
|
1250
|
-
static_cast<int64_t>(a[12]) + static_cast<int64_t>(a[13]) +
|
|
1251
|
-
static_cast<int64_t>(a[14]) + static_cast<int64_t>(a[15]) +
|
|
1252
|
-
static_cast<int64_t>(b[3]);
|
|
1253
|
-
const int32_t sign0 = static_cast<int32_t>(sum0 >> 63);
|
|
1254
|
-
const int32_t sign1 = static_cast<int32_t>(sum1 >> 63);
|
|
1255
|
-
const int32_t sign2 = static_cast<int32_t>(sum2 >> 63);
|
|
1256
|
-
const int32_t sign3 = static_cast<int32_t>(sum3 >> 63);
|
|
1257
|
-
using Raw = typename detail::Raw128<int32_t>::type;
|
|
1258
|
-
return BitCast(
|
|
1259
|
-
d,
|
|
1260
|
-
VFromD<decltype(di32)>{Raw{
|
|
1261
|
-
(sign0 == (sum0 >> 31)) ? static_cast<int32_t>(sum0)
|
|
1262
|
-
: static_cast<int32_t>(sign0 ^ 0x7FFFFFFF),
|
|
1263
|
-
(sign1 == (sum1 >> 31)) ? static_cast<int32_t>(sum1)
|
|
1264
|
-
: static_cast<int32_t>(sign1 ^ 0x7FFFFFFF),
|
|
1265
|
-
(sign2 == (sum2 >> 31)) ? static_cast<int32_t>(sum2)
|
|
1266
|
-
: static_cast<int32_t>(sign2 ^ 0x7FFFFFFF),
|
|
1267
|
-
(sign3 == (sum3 >> 31))
|
|
1268
|
-
? static_cast<int32_t>(sum3)
|
|
1269
|
-
: static_cast<int32_t>(sign3 ^ 0x7FFFFFFF)}});
|
|
1270
|
-
} else // NOLINT
|
|
1397
|
+
// If T is an integer type, do an unsigned vec_sub to avoid undefined behavior
|
|
1398
|
+
#if HWY_S390X_HAVE_Z14
|
|
1399
|
+
return BitCast(d, VFromD<decltype(d_arith)>{BitCast(d_arith, a).raw -
|
|
1400
|
+
BitCast(d_arith, b).raw});
|
|
1401
|
+
#else
|
|
1402
|
+
return BitCast(d, VFromD<decltype(d_arith)>{vec_sub(
|
|
1403
|
+
BitCast(d_arith, a).raw, BitCast(d_arith, b).raw)});
|
|
1271
1404
|
#endif
|
|
1272
|
-
{
|
|
1273
|
-
return BitCast(d, VFromD<decltype(di32)>{vec_vsum4sbs(a, b)});
|
|
1274
|
-
}
|
|
1275
1405
|
}
|
|
1276
1406
|
|
|
1277
|
-
//
|
|
1278
|
-
template <class
|
|
1279
|
-
|
|
1280
|
-
|
|
1281
|
-
const Repartition<uint32_t, D> du32;
|
|
1282
|
-
#ifdef __OPTIMIZE__
|
|
1283
|
-
if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) {
|
|
1284
|
-
const uint64_t sum0 =
|
|
1285
|
-
static_cast<uint64_t>(a[0]) + static_cast<uint64_t>(a[1]) +
|
|
1286
|
-
static_cast<uint64_t>(a[2]) + static_cast<uint64_t>(a[3]) +
|
|
1287
|
-
static_cast<uint64_t>(b[0]);
|
|
1288
|
-
const uint64_t sum1 =
|
|
1289
|
-
static_cast<uint64_t>(a[4]) + static_cast<uint64_t>(a[5]) +
|
|
1290
|
-
static_cast<uint64_t>(a[6]) + static_cast<uint64_t>(a[7]) +
|
|
1291
|
-
static_cast<uint64_t>(b[1]);
|
|
1292
|
-
const uint64_t sum2 =
|
|
1293
|
-
static_cast<uint64_t>(a[8]) + static_cast<uint64_t>(a[9]) +
|
|
1294
|
-
static_cast<uint64_t>(a[10]) + static_cast<uint64_t>(a[11]) +
|
|
1295
|
-
static_cast<uint64_t>(b[2]);
|
|
1296
|
-
const uint64_t sum3 =
|
|
1297
|
-
static_cast<uint64_t>(a[12]) + static_cast<uint64_t>(a[13]) +
|
|
1298
|
-
static_cast<uint64_t>(a[14]) + static_cast<uint64_t>(a[15]) +
|
|
1299
|
-
static_cast<uint64_t>(b[3]);
|
|
1300
|
-
return BitCast(
|
|
1301
|
-
d,
|
|
1302
|
-
VFromD<decltype(du32)>{(__vector unsigned int){
|
|
1303
|
-
static_cast<unsigned int>(sum0 <= 0xFFFFFFFFu ? sum0 : 0xFFFFFFFFu),
|
|
1304
|
-
static_cast<unsigned int>(sum1 <= 0xFFFFFFFFu ? sum1 : 0xFFFFFFFFu),
|
|
1305
|
-
static_cast<unsigned int>(sum2 <= 0xFFFFFFFFu ? sum2 : 0xFFFFFFFFu),
|
|
1306
|
-
static_cast<unsigned int>(sum3 <= 0xFFFFFFFFu ? sum3
|
|
1307
|
-
: 0xFFFFFFFFu)}});
|
|
1308
|
-
} else // NOLINT
|
|
1309
|
-
#endif
|
|
1310
|
-
{
|
|
1311
|
-
return BitCast(d, VFromD<decltype(du32)>{vec_vsum4ubs(a, b)});
|
|
1312
|
-
}
|
|
1407
|
+
// ------------------------------ SumsOf8
|
|
1408
|
+
template <class V, HWY_IF_U8(TFromV<V>)>
|
|
1409
|
+
HWY_API VFromD<RepartitionToWideX3<DFromV<V>>> SumsOf8(V v) {
|
|
1410
|
+
return SumsOf2(SumsOf4(v));
|
|
1313
1411
|
}
|
|
1314
1412
|
|
|
1315
|
-
|
|
1316
|
-
|
|
1317
|
-
|
|
1318
|
-
|
|
1319
|
-
const
|
|
1320
|
-
|
|
1321
|
-
const Repartition<uint64_t, D> du64;
|
|
1322
|
-
constexpr int kDestLaneOffset = HWY_IS_BIG_ENDIAN;
|
|
1323
|
-
if (IsConstantRawAltivecVect(a) && __builtin_constant_p(b[kDestLaneOffset]) &&
|
|
1324
|
-
__builtin_constant_p(b[kDestLaneOffset + 2])) {
|
|
1325
|
-
const int64_t sum0 = static_cast<int64_t>(a[0]) +
|
|
1326
|
-
static_cast<int64_t>(a[1]) +
|
|
1327
|
-
static_cast<int64_t>(b[kDestLaneOffset]);
|
|
1328
|
-
const int64_t sum1 = static_cast<int64_t>(a[2]) +
|
|
1329
|
-
static_cast<int64_t>(a[3]) +
|
|
1330
|
-
static_cast<int64_t>(b[kDestLaneOffset + 2]);
|
|
1331
|
-
const int32_t sign0 = static_cast<int32_t>(sum0 >> 63);
|
|
1332
|
-
const int32_t sign1 = static_cast<int32_t>(sum1 >> 63);
|
|
1333
|
-
return BitCast(d, VFromD<decltype(du64)>{(__vector unsigned long long){
|
|
1334
|
-
(sign0 == (sum0 >> 31))
|
|
1335
|
-
? static_cast<uint32_t>(sum0)
|
|
1336
|
-
: static_cast<uint32_t>(sign0 ^ 0x7FFFFFFF),
|
|
1337
|
-
(sign1 == (sum1 >> 31))
|
|
1338
|
-
? static_cast<uint32_t>(sum1)
|
|
1339
|
-
: static_cast<uint32_t>(sign1 ^ 0x7FFFFFFF)}});
|
|
1340
|
-
} else // NOLINT
|
|
1341
|
-
#endif
|
|
1342
|
-
{
|
|
1343
|
-
__vector signed int sum;
|
|
1344
|
-
|
|
1345
|
-
// Inline assembly is used for vsum2sws to avoid unnecessary shuffling
|
|
1346
|
-
// on little-endian PowerPC targets as the result of the vsum2sws
|
|
1347
|
-
// instruction will already be in the correct lanes on little-endian
|
|
1348
|
-
// PowerPC targets.
|
|
1349
|
-
__asm__("vsum2sws %0,%1,%2" : "=v"(sum) : "v"(a), "v"(b));
|
|
1413
|
+
template <class V, HWY_IF_I8(TFromV<V>)>
|
|
1414
|
+
HWY_API VFromD<RepartitionToWideX3<DFromV<V>>> SumsOf8(V v) {
|
|
1415
|
+
#if HWY_S390X_HAVE_Z14
|
|
1416
|
+
const DFromV<decltype(v)> di8;
|
|
1417
|
+
const RebindToUnsigned<decltype(di8)> du8;
|
|
1418
|
+
const RepartitionToWideX3<decltype(di8)> di64;
|
|
1350
1419
|
|
|
1351
|
-
|
|
1352
|
-
|
|
1420
|
+
return BitCast(di64, SumsOf8(BitCast(du8, Xor(v, SignBit(di8))))) +
|
|
1421
|
+
Set(di64, int64_t{-1024});
|
|
1422
|
+
#else
|
|
1423
|
+
return SumsOf2(SumsOf4(v));
|
|
1424
|
+
#endif
|
|
1353
1425
|
}
|
|
1354
1426
|
|
|
1355
|
-
|
|
1427
|
+
// ------------------------------ SaturatedAdd
|
|
1356
1428
|
|
|
1357
|
-
|
|
1358
|
-
|
|
1359
|
-
|
|
1360
|
-
|
|
1361
|
-
|
|
1429
|
+
// Returns a + b clamped to the destination range.
|
|
1430
|
+
|
|
1431
|
+
#if HWY_S390X_HAVE_Z14
|
|
1432
|
+
// Z14/Z15/Z16 does not have I8/U8/I16/U16 SaturatedAdd instructions unlike most
|
|
1433
|
+
// other integer SIMD instruction sets
|
|
1362
1434
|
|
|
1363
|
-
|
|
1364
|
-
|
|
1365
|
-
|
|
1435
|
+
template <typename T, size_t N, HWY_IF_UNSIGNED(T),
|
|
1436
|
+
HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
|
|
1437
|
+
HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, Vec128<T, N> b) {
|
|
1438
|
+
return Add(a, Min(b, Not(a)));
|
|
1366
1439
|
}
|
|
1367
1440
|
|
|
1368
|
-
|
|
1441
|
+
template <typename T, size_t N, HWY_IF_SIGNED(T),
|
|
1442
|
+
HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
|
|
1443
|
+
HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, Vec128<T, N> b) {
|
|
1444
|
+
const DFromV<decltype(a)> d;
|
|
1445
|
+
const auto sum = Add(a, b);
|
|
1446
|
+
const auto overflow_mask = AndNot(Xor(a, b), Xor(a, sum));
|
|
1447
|
+
const auto overflow_result = Xor(BroadcastSignBit(a), Set(d, LimitsMax<T>()));
|
|
1448
|
+
return IfNegativeThenElse(overflow_mask, overflow_result, sum);
|
|
1449
|
+
}
|
|
1369
1450
|
|
|
1370
|
-
//
|
|
1451
|
+
#else // VSX
|
|
1371
1452
|
|
|
1372
1453
|
#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
|
|
1373
1454
|
#undef HWY_NATIVE_I32_SATURATED_ADDSUB
|
|
@@ -1386,6 +1467,7 @@ template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
|
|
|
1386
1467
|
HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, Vec128<T, N> b) {
|
|
1387
1468
|
return Vec128<T, N>{vec_adds(a.raw, b.raw)};
|
|
1388
1469
|
}
|
|
1470
|
+
#endif // HWY_S390X_HAVE_Z14
|
|
1389
1471
|
|
|
1390
1472
|
#if HWY_PPC_HAVE_10
|
|
1391
1473
|
|
|
@@ -1412,14 +1494,37 @@ HWY_API V SaturatedAdd(V a, V b) {
|
|
|
1412
1494
|
|
|
1413
1495
|
// Returns a - b clamped to the destination range.
|
|
1414
1496
|
|
|
1415
|
-
|
|
1416
|
-
|
|
1497
|
+
#if HWY_S390X_HAVE_Z14
|
|
1498
|
+
// Z14/Z15/Z16 does not have I8/U8/I16/U16 SaturatedSub instructions unlike most
|
|
1499
|
+
// other integer SIMD instruction sets
|
|
1500
|
+
|
|
1501
|
+
template <typename T, size_t N, HWY_IF_UNSIGNED(T),
|
|
1502
|
+
HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
|
|
1417
1503
|
HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, Vec128<T, N> b) {
|
|
1418
|
-
return
|
|
1504
|
+
return Sub(a, Min(a, b));
|
|
1419
1505
|
}
|
|
1420
1506
|
|
|
1421
|
-
|
|
1422
|
-
|
|
1507
|
+
template <typename T, size_t N, HWY_IF_SIGNED(T),
|
|
1508
|
+
HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
|
|
1509
|
+
HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, Vec128<T, N> b) {
|
|
1510
|
+
const DFromV<decltype(a)> d;
|
|
1511
|
+
const auto diff = Sub(a, b);
|
|
1512
|
+
const auto overflow_mask = And(Xor(a, b), Xor(a, diff));
|
|
1513
|
+
const auto overflow_result = Xor(BroadcastSignBit(a), Set(d, LimitsMax<T>()));
|
|
1514
|
+
return IfNegativeThenElse(overflow_mask, overflow_result, diff);
|
|
1515
|
+
}
|
|
1516
|
+
|
|
1517
|
+
#else // VSX
|
|
1518
|
+
|
|
1519
|
+
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
|
|
1520
|
+
HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))>
|
|
1521
|
+
HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, Vec128<T, N> b) {
|
|
1522
|
+
return Vec128<T, N>{vec_subs(a.raw, b.raw)};
|
|
1523
|
+
}
|
|
1524
|
+
#endif // HWY_S390X_HAVE_Z14
|
|
1525
|
+
|
|
1526
|
+
#if HWY_PPC_HAVE_10
|
|
1527
|
+
|
|
1423
1528
|
template <class V, HWY_IF_I64_D(DFromV<V>)>
|
|
1424
1529
|
HWY_API V SaturatedSub(V a, V b) {
|
|
1425
1530
|
const DFromV<decltype(a)> d;
|
|
@@ -1459,12 +1564,25 @@ HWY_API Vec128<T, N> AverageRound(Vec128<T, N> a, Vec128<T, N> b) {
|
|
|
1459
1564
|
|
|
1460
1565
|
template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
|
|
1461
1566
|
HWY_API Vec128<T, N> operator*(Vec128<T, N> a, Vec128<T, N> b) {
|
|
1462
|
-
|
|
1567
|
+
const DFromV<decltype(a)> d;
|
|
1568
|
+
const detail::RebindToUnsignedIfNotFloat<decltype(d)> d_arith;
|
|
1569
|
+
|
|
1570
|
+
// If T is an integer type, do an unsigned vec_mul to avoid undefined behavior
|
|
1571
|
+
#if HWY_S390X_HAVE_Z14
|
|
1572
|
+
return BitCast(d, VFromD<decltype(d_arith)>{BitCast(d_arith, a).raw *
|
|
1573
|
+
BitCast(d_arith, b).raw});
|
|
1574
|
+
#else
|
|
1575
|
+
return BitCast(d, VFromD<decltype(d_arith)>{vec_mul(
|
|
1576
|
+
BitCast(d_arith, a).raw, BitCast(d_arith, b).raw)});
|
|
1577
|
+
#endif
|
|
1463
1578
|
}
|
|
1464
1579
|
|
|
1465
1580
|
// Returns the upper 16 bits of a * b in each lane.
|
|
1466
1581
|
template <typename T, size_t N, HWY_IF_T_SIZE(T, 2), HWY_IF_NOT_FLOAT(T)>
|
|
1467
1582
|
HWY_API Vec128<T, N> MulHigh(Vec128<T, N> a, Vec128<T, N> b) {
|
|
1583
|
+
#if HWY_S390X_HAVE_Z14
|
|
1584
|
+
return Vec128<T, N>{vec_mulh(a.raw, b.raw)};
|
|
1585
|
+
#else
|
|
1468
1586
|
const DFromV<decltype(a)> d;
|
|
1469
1587
|
const RepartitionToWide<decltype(d)> dw;
|
|
1470
1588
|
const VFromD<decltype(dw)> p1{vec_mule(a.raw, b.raw)};
|
|
@@ -1477,13 +1595,7 @@ HWY_API Vec128<T, N> MulHigh(Vec128<T, N> a, Vec128<T, N> b) {
|
|
|
1477
1595
|
8, 9, 24, 25, 12, 13, 28, 29};
|
|
1478
1596
|
#endif
|
|
1479
1597
|
return BitCast(d, VFromD<decltype(dw)>{vec_perm(p1.raw, p2.raw, kShuffle)});
|
|
1480
|
-
|
|
1481
|
-
|
|
1482
|
-
template <size_t N>
|
|
1483
|
-
HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
|
|
1484
|
-
Vec128<int16_t, N> b) {
|
|
1485
|
-
const Vec128<int16_t> zero = Zero(Full128<int16_t>());
|
|
1486
|
-
return Vec128<int16_t, N>{vec_mradds(a.raw, b.raw, zero.raw)};
|
|
1598
|
+
#endif
|
|
1487
1599
|
}
|
|
1488
1600
|
|
|
1489
1601
|
// Multiplies even lanes (0, 2, ..) and places the double-wide result into
|
|
@@ -1510,10 +1622,15 @@ HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulOdd(Vec128<T, N> a,
|
|
|
1510
1622
|
template <int kBits, typename T, size_t N>
|
|
1511
1623
|
HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
|
|
1512
1624
|
const DFromV<decltype(v)> d;
|
|
1625
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
1513
1626
|
constexpr size_t kSizeInBits = sizeof(T) * 8;
|
|
1514
1627
|
static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
|
|
1628
|
+
|
|
1515
1629
|
if (kBits == 0) return v;
|
|
1516
|
-
|
|
1630
|
+
|
|
1631
|
+
// Do an unsigned vec_rl operation to avoid undefined behavior
|
|
1632
|
+
return BitCast(d, VFromD<decltype(du)>{vec_rl(
|
|
1633
|
+
BitCast(du, v).raw, Set(du, kSizeInBits - kBits).raw)});
|
|
1517
1634
|
}
|
|
1518
1635
|
|
|
1519
1636
|
// ------------------------------ ZeroIfNegative (BroadcastSignBit)
|
|
@@ -1541,8 +1658,7 @@ HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
|
|
|
1541
1658
|
BitCast(du, no).raw, BitCast(du, yes).raw, BitCast(du, v).raw)});
|
|
1542
1659
|
#else
|
|
1543
1660
|
const RebindToSigned<decltype(d)> di;
|
|
1544
|
-
return
|
|
1545
|
-
yes, no);
|
|
1661
|
+
return IfVecThenElse(BitCast(d, BroadcastSignBit(BitCast(di, v))), yes, no);
|
|
1546
1662
|
#endif
|
|
1547
1663
|
}
|
|
1548
1664
|
|
|
@@ -1598,17 +1714,42 @@ HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x,
|
|
|
1598
1714
|
#endif
|
|
1599
1715
|
|
|
1600
1716
|
template <typename T, size_t N, HWY_IF_FLOAT(T)>
|
|
1601
|
-
HWY_API Vec128<T, N>
|
|
1602
|
-
|
|
1717
|
+
HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
|
|
1718
|
+
#if HWY_S390X_HAVE_Z14
|
|
1719
|
+
return Vec128<T, N>{a.raw / b.raw};
|
|
1720
|
+
#else
|
|
1721
|
+
return Vec128<T, N>{vec_div(a.raw, b.raw)};
|
|
1722
|
+
#endif
|
|
1603
1723
|
}
|
|
1604
1724
|
|
|
1605
1725
|
template <typename T, size_t N, HWY_IF_FLOAT(T)>
|
|
1606
|
-
HWY_API Vec128<T, N>
|
|
1607
|
-
|
|
1726
|
+
HWY_API Vec128<T, N> ApproximateReciprocal(Vec128<T, N> v) {
|
|
1727
|
+
#if HWY_S390X_HAVE_Z14
|
|
1728
|
+
const DFromV<decltype(v)> d;
|
|
1729
|
+
return Set(d, T(1.0)) / v;
|
|
1730
|
+
#else
|
|
1731
|
+
return Vec128<T, N>{vec_re(v.raw)};
|
|
1732
|
+
#endif
|
|
1608
1733
|
}
|
|
1609
1734
|
|
|
1610
1735
|
// ------------------------------ Floating-point square root
|
|
1611
1736
|
|
|
1737
|
+
#if HWY_S390X_HAVE_Z14
|
|
1738
|
+
// Approximate reciprocal square root
|
|
1739
|
+
template <size_t N>
|
|
1740
|
+
HWY_API Vec128<float, N> ApproximateReciprocalSqrt(Vec128<float, N> v) {
|
|
1741
|
+
const DFromV<decltype(v)> d;
|
|
1742
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
1743
|
+
|
|
1744
|
+
const auto half = v * Set(d, 0.5f);
|
|
1745
|
+
// Initial guess based on log2(f)
|
|
1746
|
+
const auto guess = BitCast(
|
|
1747
|
+
d, Set(du, uint32_t{0x5F3759DFu}) - ShiftRight<1>(BitCast(du, v)));
|
|
1748
|
+
// One Newton-Raphson iteration
|
|
1749
|
+
return guess * NegMulAdd(half * guess, guess, Set(d, 1.5f));
|
|
1750
|
+
}
|
|
1751
|
+
#else // VSX
|
|
1752
|
+
|
|
1612
1753
|
#ifdef HWY_NATIVE_F64_APPROX_RSQRT
|
|
1613
1754
|
#undef HWY_NATIVE_F64_APPROX_RSQRT
|
|
1614
1755
|
#else
|
|
@@ -1620,6 +1761,7 @@ template <class T, size_t N, HWY_IF_FLOAT(T)>
|
|
|
1620
1761
|
HWY_API Vec128<T, N> ApproximateReciprocalSqrt(Vec128<T, N> v) {
|
|
1621
1762
|
return Vec128<T, N>{vec_rsqrte(v.raw)};
|
|
1622
1763
|
}
|
|
1764
|
+
#endif // HWY_S390X_HAVE_Z14
|
|
1623
1765
|
|
|
1624
1766
|
// Full precision square root
|
|
1625
1767
|
template <class T, size_t N, HWY_IF_FLOAT(T)>
|
|
@@ -1668,6 +1810,167 @@ HWY_API V AbsDiff(const V a, const V b) {
|
|
|
1668
1810
|
|
|
1669
1811
|
#endif // HWY_PPC_HAVE_9
|
|
1670
1812
|
|
|
1813
|
+
// ------------------------------ Integer Div for PPC10
|
|
1814
|
+
#if HWY_PPC_HAVE_10
|
|
1815
|
+
#ifdef HWY_NATIVE_INT_DIV
|
|
1816
|
+
#undef HWY_NATIVE_INT_DIV
|
|
1817
|
+
#else
|
|
1818
|
+
#define HWY_NATIVE_INT_DIV
|
|
1819
|
+
#endif
|
|
1820
|
+
|
|
1821
|
+
template <size_t N>
|
|
1822
|
+
HWY_API Vec128<int32_t, N> operator/(Vec128<int32_t, N> a,
|
|
1823
|
+
Vec128<int32_t, N> b) {
|
|
1824
|
+
// Inline assembly is used instead of vec_div for I32 Div on PPC10 to avoid
|
|
1825
|
+
// undefined behavior if b[i] == 0 or
|
|
1826
|
+
// (a[i] == LimitsMin<int32_t>() && b[i] == -1)
|
|
1827
|
+
|
|
1828
|
+
// Clang will also optimize out I32 vec_div on PPC10 if optimizations are
|
|
1829
|
+
// enabled and any of the lanes of b are known to be zero (even in the unused
|
|
1830
|
+
// lanes of a partial vector)
|
|
1831
|
+
__vector signed int raw_result;
|
|
1832
|
+
__asm__("vdivsw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
|
|
1833
|
+
return Vec128<int32_t, N>{raw_result};
|
|
1834
|
+
}
|
|
1835
|
+
|
|
1836
|
+
template <size_t N>
|
|
1837
|
+
+HWY_API Vec128<uint32_t, N> operator/(Vec128<uint32_t, N> a,
+                                      Vec128<uint32_t, N> b) {
+  // Inline assembly is used instead of vec_div for U32 Div on PPC10 to avoid
+  // undefined behavior if b[i] == 0
+
+  // Clang will also optimize out U32 vec_div on PPC10 if optimizations are
+  // enabled and any of the lanes of b are known to be zero (even in the unused
+  // lanes of a partial vector)
+  __vector unsigned int raw_result;
+  __asm__("vdivuw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
+  return Vec128<uint32_t, N>{raw_result};
+}
+
+template <size_t N>
+HWY_API Vec128<int64_t, N> operator/(Vec128<int64_t, N> a,
+                                     Vec128<int64_t, N> b) {
+  // Inline assembly is used instead of vec_div for I64 Div on PPC10 to avoid
+  // undefined behavior if b[i] == 0 or
+  // (a[i] == LimitsMin<int64_t>() && b[i] == -1)
+
+  // Clang will also optimize out I64 vec_div on PPC10 if optimizations are
+  // enabled and any of the lanes of b are known to be zero (even in the unused
+  // lanes of a partial vector)
+  __vector signed long long raw_result;
+  __asm__("vdivsd %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
+  return Vec128<int64_t, N>{raw_result};
+}
+
+template <size_t N>
+HWY_API Vec128<uint64_t, N> operator/(Vec128<uint64_t, N> a,
+                                      Vec128<uint64_t, N> b) {
+  // Inline assembly is used instead of vec_div for U64 Div on PPC10 to avoid
+  // undefined behavior if b[i] == 0
+
+  // Clang will also optimize out U64 vec_div on PPC10 if optimizations are
+  // enabled and any of the lanes of b are known to be zero (even in the unused
+  // lanes of a partial vector)
+  __vector unsigned long long raw_result;
+  __asm__("vdivud %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
+  return Vec128<uint64_t, N>{raw_result};
+}
+
+template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
+          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
+HWY_API Vec128<T> operator/(Vec128<T> a, Vec128<T> b) {
+  const DFromV<decltype(a)> d;
+  const RepartitionToWide<decltype(d)> dw;
+  return OrderedDemote2To(d, PromoteLowerTo(dw, a) / PromoteLowerTo(dw, b),
+                          PromoteUpperTo(dw, a) / PromoteUpperTo(dw, b));
+}
+
+template <class T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
+          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
+          HWY_IF_V_SIZE_LE(T, N, 8)>
+HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
+  const DFromV<decltype(a)> d;
+  const Rebind<MakeWide<T>, decltype(d)> dw;
+  return DemoteTo(d, PromoteTo(dw, a) / PromoteTo(dw, b));
+}
+
+template <size_t N>
+HWY_API Vec128<int32_t, N> operator%(Vec128<int32_t, N> a,
+                                     Vec128<int32_t, N> b) {
+  // Inline assembly is used instead of vec_mod for I32 Mod on PPC10 to avoid
+  // undefined behavior if b[i] == 0 or
+  // (a[i] == LimitsMin<int32_t>() && b[i] == -1)
+
+  // Clang will also optimize out I32 vec_mod on PPC10 if optimizations are
+  // enabled and any of the lanes of b are known to be zero (even in the unused
+  // lanes of a partial vector)
+  __vector signed int raw_result;
+  __asm__("vmodsw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
+  return Vec128<int32_t, N>{raw_result};
+}
+
+template <size_t N>
+HWY_API Vec128<uint32_t, N> operator%(Vec128<uint32_t, N> a,
+                                      Vec128<uint32_t, N> b) {
+  // Inline assembly is used instead of vec_mod for U32 Mod on PPC10 to avoid
+  // undefined behavior if b[i] == 0
+
+  // Clang will also optimize out U32 vec_mod on PPC10 if optimizations are
+  // enabled and any of the lanes of b are known to be zero (even in the unused
+  // lanes of a partial vector)
+  __vector unsigned int raw_result;
+  __asm__("vmoduw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
+  return Vec128<uint32_t, N>{raw_result};
+}
+
+template <size_t N>
+HWY_API Vec128<int64_t, N> operator%(Vec128<int64_t, N> a,
+                                     Vec128<int64_t, N> b) {
+  // Inline assembly is used instead of vec_mod for I64 Mod on PPC10 to avoid
+  // undefined behavior if b[i] == 0 or
+  // (a[i] == LimitsMin<int64_t>() && b[i] == -1)
+
+  // Clang will also optimize out I64 vec_mod on PPC10 if optimizations are
+  // enabled and any of the lanes of b are known to be zero (even in the unused
+  // lanes of a partial vector)
+  __vector signed long long raw_result;
+  __asm__("vmodsd %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
+  return Vec128<int64_t, N>{raw_result};
+}
+
+template <size_t N>
+HWY_API Vec128<uint64_t, N> operator%(Vec128<uint64_t, N> a,
+                                      Vec128<uint64_t, N> b) {
+  // Inline assembly is used instead of vec_mod for U64 Mod on PPC10 to avoid
+  // undefined behavior if b[i] == 0
+
+  // Clang will also optimize out U64 vec_mod on PPC10 if optimizations are
+  // enabled and any of the lanes of b are known to be zero (even in the unused
+  // lanes of a partial vector)
+  __vector unsigned long long raw_result;
+  __asm__("vmodud %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
+  return Vec128<uint64_t, N>{raw_result};
+}
+
+template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
+          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
+HWY_API Vec128<T> operator%(Vec128<T> a, Vec128<T> b) {
+  const DFromV<decltype(a)> d;
+  const RepartitionToWide<decltype(d)> dw;
+  return OrderedDemote2To(d, PromoteLowerTo(dw, a) % PromoteLowerTo(dw, b),
+                          PromoteUpperTo(dw, a) % PromoteUpperTo(dw, b));
+}
+
+template <class T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
+          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
+          HWY_IF_V_SIZE_LE(T, N, 8)>
+HWY_API Vec128<T, N> operator%(Vec128<T, N> a, Vec128<T, N> b) {
+  const DFromV<decltype(a)> d;
+  const Rebind<MakeWide<T>, decltype(d)> dw;
+  return DemoteTo(d, PromoteTo(dw, a) % PromoteTo(dw, b));
+}
+#endif
+
 // ================================================== MEMORY (3)
 
 // ------------------------------ Non-temporal stores
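The 8- and 16-bit operator/ and operator% overloads added above have no per-lane divide instruction to lean on, so they widen each half of the vector, divide in the wider type, and narrow the results back (DemoteTo saturates on narrowing). A standalone scalar sketch of that widen-divide-narrow strategy for int16_t lanes, assuming b[i] != 0 (my own illustration, not part of the package):

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>

    // Promote to int32_t, divide there, then saturate back to int16_t.
    // Saturation only matters for -32768 / -1, which does not fit in int16_t.
    void DivideI16(const int16_t* a, const int16_t* b, int16_t* out, size_t n) {
      for (size_t i = 0; i < n; ++i) {
        const int32_t wide =
            static_cast<int32_t>(a[i]) / static_cast<int32_t>(b[i]);
        out[i] = static_cast<int16_t>(
            std::min<int32_t>(std::max<int32_t>(wide, -32768), 32767));
      }
    }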
@@ -1800,7 +2103,7 @@ template <typename T, size_t N>
 HWY_API Vec128<T, N> InsertLane(Vec128<T, N> v, size_t i, T t) {
 #if HWY_IS_LITTLE_ENDIAN
   typename detail::Raw128<T>::type raw_result = v.raw;
-  raw_result[i] = t;
+  raw_result[i] = BitCastScalar<typename detail::Raw128<T>::RawT>(t);
   return Vec128<T, N>{raw_result};
 #else
   // On ppc64be without this, mul_test fails, but swizzle_test passes.
@@ -2070,7 +2373,7 @@ HWY_API Vec32<T> Reverse(D d, Vec32<T> v) {
 
 // ------------------------------- ReverseLaneBytes
 
-#if HWY_PPC_HAVE_9 && \
+#if (HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14) && \
     (HWY_COMPILER_GCC_ACTUAL >= 710 || HWY_COMPILER_CLANG >= 400)
 
 // Per-target flag to prevent generic_ops-inl.h defining 8-bit ReverseLaneBytes.
@@ -2111,7 +2414,7 @@ HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) {
   return BitCast(d, ReverseLaneBytes(BitCast(du64, v)));
 }
 
-#endif  // HWY_PPC_HAVE_9
+#endif  // HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14
 
 template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
 HWY_API Vec16<T> Reverse(D d, Vec16<T> v) {
@@ -2268,11 +2571,15 @@ HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
               Set(Full128<uint32_t>(),
                   static_cast<uint32_t>(amt * sizeof(TFromD<D>) * 8)));
 
+#if HWY_S390X_HAVE_Z14
+  return BitCast(d, VU8{vec_srb(BitCast(du8, v).raw, v_shift_amt.raw)});
+#else  // VSX
 #if HWY_IS_LITTLE_ENDIAN
   return BitCast(d, VU8{vec_slo(BitCast(du8, v).raw, v_shift_amt.raw)});
 #else
   return BitCast(d, VU8{vec_sro(BitCast(du8, v).raw, v_shift_amt.raw)});
-#endif
+#endif  // HWY_IS_LITTLE_ENDIAN
+#endif  // HWY_S390X_HAVE_Z14
 }
 
 // ------------------------------ SlideDownLanes
@@ -2300,11 +2607,15 @@ HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
               Set(Full128<uint32_t>(),
                   static_cast<uint32_t>(amt * sizeof(TFromD<D>) * 8)));
 
+#if HWY_S390X_HAVE_Z14
+  return BitCast(d, VU8{vec_slb(BitCast(du8, v).raw, v_shift_amt.raw)});
+#else  // VSX
 #if HWY_IS_LITTLE_ENDIAN
   return BitCast(d, VU8{vec_sro(BitCast(du8, v).raw, v_shift_amt.raw)});
 #else
   return BitCast(d, VU8{vec_slo(BitCast(du8, v).raw, v_shift_amt.raw)});
-#endif
+#endif  // HWY_IS_LITTLE_ENDIAN
+#endif  // HWY_S390X_HAVE_Z14
 }
 
 // ================================================== COMBINE
@@ -2637,7 +2948,15 @@ HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
 
 template <typename T, HWY_IF_T_SIZE(T, 4)>
 HWY_API Vec128<T> DupEven(Vec128<T> v) {
+#if HWY_S390X_HAVE_Z14
+  const DFromV<decltype(v)> d;
+  const Repartition<uint8_t, decltype(d)> du8;
+  return TableLookupBytes(
+      v, BitCast(d, Dup128VecFromValues(du8, 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10,
+                                        11, 8, 9, 10, 11)));
+#else
   return Vec128<T>{vec_mergee(v.raw, v.raw)};
+#endif
 }
 
 // ------------------------------ DupOdd (InterleaveUpper)
@@ -2662,7 +2981,15 @@ HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
 
 template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
 HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
+#if HWY_S390X_HAVE_Z14
+  const DFromV<decltype(v)> d;
+  const Repartition<uint8_t, decltype(d)> du8;
+  return TableLookupBytes(
+      v, BitCast(d, Dup128VecFromValues(du8, 4, 5, 6, 7, 4, 5, 6, 7, 12, 13, 14,
+                                        15, 12, 13, 14, 15)));
+#else
   return Vec128<T, N>{vec_mergeo(v.raw, v.raw)};
+#endif
 }
 
 template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
@@ -2719,13 +3046,51 @@ HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
   return v;
 }
 
+// ------------------------------ MulFixedPoint15 (OddEven)
+
+#if HWY_S390X_HAVE_Z14
+HWY_API Vec16<int16_t> MulFixedPoint15(Vec16<int16_t> a, Vec16<int16_t> b) {
+  const DFromV<decltype(a)> di16;
+  const RepartitionToWide<decltype(di16)> di32;
+
+  const auto round_up_incr = Set(di32, 0x4000);
+  const auto i32_product = MulEven(a, b) + round_up_incr;
+
+  return ResizeBitCast(di16, ShiftLeft<1>(i32_product));
+}
+template <size_t N, HWY_IF_LANES_GT(N, 1)>
+HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
+                                           Vec128<int16_t, N> b) {
+  const DFromV<decltype(a)> di16;
+  const RepartitionToWide<decltype(di16)> di32;
+
+  const auto round_up_incr = Set(di32, 0x4000);
+  const auto even_product = MulEven(a, b) + round_up_incr;
+  const auto odd_product = MulOdd(a, b) + round_up_incr;
+
+  return OddEven(BitCast(di16, ShiftRight<15>(odd_product)),
+                 BitCast(di16, ShiftLeft<1>(even_product)));
+}
+#else
+template <size_t N>
+HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
+                                           Vec128<int16_t, N> b) {
+  const Vec128<int16_t> zero = Zero(Full128<int16_t>());
+  return Vec128<int16_t, N>{vec_mradds(a.raw, b.raw, zero.raw)};
+}
+#endif
+
 // ------------------------------ Shl
 
 namespace detail {
 template <typename T, size_t N>
 HWY_API Vec128<T, N> Shl(hwy::UnsignedTag /*tag*/, Vec128<T, N> v,
                          Vec128<T, N> bits) {
+#if HWY_S390X_HAVE_Z14
+  return Vec128<T, N>{v.raw << bits.raw};
+#else
   return Vec128<T, N>{vec_sl(v.raw, bits.raw)};
+#endif
 }
 
 // Signed left shift is the same as unsigned.
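MulFixedPoint15 treats int16_t lanes as Q15 fixed-point values; both branches above (the MulEven/MulOdd path and vec_mradds) compute a rounded Q15 product. A scalar reference of that operation, illustrative only and assuming arithmetic right shift of negative values:

    #include <cstdint>

    // Rounded Q15 multiply: (a * b + 2^14) >> 15, with the single overflow
    // case -32768 * -32768 saturated to 32767 (as vec_mradds does).
    int16_t MulFixedPoint15Scalar(int16_t a, int16_t b) {
      const int32_t prod = static_cast<int32_t>(a) * static_cast<int32_t>(b);
      const int32_t rounded = (prod + (1 << 14)) >> 15;
      return static_cast<int16_t>(rounded > 32767 ? 32767 : rounded);
    }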
@@ -2751,15 +3116,23 @@ namespace detail {
 template <typename T, size_t N>
 HWY_API Vec128<T, N> Shr(hwy::UnsignedTag /*tag*/, Vec128<T, N> v,
                          Vec128<T, N> bits) {
+#if HWY_S390X_HAVE_Z14
+  return Vec128<T, N>{v.raw >> bits.raw};
+#else
   return Vec128<T, N>{vec_sr(v.raw, bits.raw)};
+#endif
 }
 
 template <typename T, size_t N>
 HWY_API Vec128<T, N> Shr(hwy::SignedTag /*tag*/, Vec128<T, N> v,
                          Vec128<T, N> bits) {
+#if HWY_S390X_HAVE_Z14
+  return Vec128<T, N>{v.raw >> bits.raw};
+#else
   const DFromV<decltype(v)> di;
   const RebindToUnsigned<decltype(di)> du;
   return Vec128<T, N>{vec_sra(v.raw, BitCast(du, bits).raw)};
+#endif
 }
 
 }  // namespace detail
@@ -2834,7 +3207,12 @@ HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 df32, V16 a, V16 b) {
 template <class D32, HWY_IF_UI32_D(D32),
           class V16 = VFromD<RepartitionToNarrow<D32>>>
 HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 d32, V16 a, V16 b) {
+#if HWY_S390X_HAVE_Z14
+  (void)d32;
+  return MulEven(a, b) + MulOdd(a, b);
+#else
   return VFromD<D32>{vec_msum(a.raw, b.raw, Zero(d32).raw)};
+#endif
 }
 
 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
@@ -2861,10 +3239,14 @@ HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b,
 // Even if N=1, the input is always at least 2 lanes, hence vec_msum is safe.
 template <class D32, HWY_IF_UI32_D(D32),
           class V16 = VFromD<RepartitionToNarrow<D32>>>
-HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 /*
+HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 /*d32*/, V16 a, V16 b,
                                               VFromD<D32> sum0,
                                               VFromD<D32>& /*sum1*/) {
+#if HWY_S390X_HAVE_Z14
+  return MulEven(a, b) + MulOdd(a, b) + sum0;
+#else
   return VFromD<D32>{vec_msum(a.raw, b.raw, sum0.raw)};
+#endif
 }
 
 // ------------------------------ RearrangeToOddPlusEven
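The Z14 branches of WidenMulPairwiseAdd and ReorderWidenMulAccumulate replace vec_msum with MulEven + MulOdd; either way each 32-bit output lane is the sum of two adjacent widened 16-bit products. A scalar sketch of that pairwise widening multiply-add (my own illustration, not part of the package):

    #include <cstddef>
    #include <cstdint>

    // out[i] = a[2i]*b[2i] + a[2i+1]*b[2i+1], computed in 32 bits.
    void WidenMulPairwiseAddI16(const int16_t* a, const int16_t* b,
                                int32_t* out, size_t n_out) {
      for (size_t i = 0; i < n_out; ++i) {
        out[i] = static_cast<int32_t>(a[2 * i]) * b[2 * i] +
                 static_cast<int32_t>(a[2 * i + 1]) * b[2 * i + 1];
      }
    }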
@@ -2886,6 +3268,8 @@ HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) {
 }
 
 // ------------------------------ SumOfMulQuadAccumulate
+#if !HWY_S390X_HAVE_Z14
+
 #ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
 #undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
 #else
@@ -2925,11 +3309,12 @@ HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 di32,
 
   const auto result_sum_0 =
       SumOfMulQuadAccumulate(di32, BitCast(du8, a), b, sum);
-  const auto result_sum_1 = ShiftLeft<8>(
-      di32, And(b, BroadcastSignBit(a)).raw, Zero(di32).raw));
+  const auto result_sum_1 = ShiftLeft<8>(SumsOf4(And(b, BroadcastSignBit(a))));
   return result_sum_0 - result_sum_1;
 }
 
+#endif  // !HWY_S390X_HAVE_Z14
+
 // ================================================== CONVERT
 
 // ------------------------------ Promotions (part w/ narrow lanes -> full)
@@ -3018,29 +3403,59 @@ HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
 }
 
 template <class D, HWY_IF_F64_D(D)>
-HWY_API VFromD<D> PromoteTo(D
+HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<int32_t, D>> v) {
+#if HWY_S390X_HAVE_Z14
+  const RebindToSigned<decltype(df64)> di64;
+  return ConvertTo(df64, PromoteTo(di64, v));
+#else  // VSX
+  (void)df64;
   const __vector signed int raw_v = InterleaveLower(v, v).raw;
 #if HWY_IS_LITTLE_ENDIAN
   return VFromD<D>{vec_doubleo(raw_v)};
 #else
   return VFromD<D>{vec_doublee(raw_v)};
 #endif
+#endif  // HWY_S390X_HAVE_Z14
 }
 
 template <class D, HWY_IF_F64_D(D)>
-HWY_API VFromD<D> PromoteTo(D
+HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<uint32_t, D>> v) {
+#if HWY_S390X_HAVE_Z14
+  const RebindToUnsigned<decltype(df64)> du64;
+  return ConvertTo(df64, PromoteTo(du64, v));
+#else  // VSX
+  (void)df64;
   const __vector unsigned int raw_v = InterleaveLower(v, v).raw;
 #if HWY_IS_LITTLE_ENDIAN
   return VFromD<D>{vec_doubleo(raw_v)};
 #else
   return VFromD<D>{vec_doublee(raw_v)};
 #endif
+#endif  // HWY_S390X_HAVE_Z14
+}
+
+#if !HWY_S390X_HAVE_Z14
+namespace detail {
+
+template <class V>
+static HWY_INLINE V VsxF2INormalizeSrcVals(V v) {
+#if !defined(HWY_DISABLE_PPC_VSX_QEMU_F2I_WORKAROUND)
+  // Workaround for QEMU 7/8 VSX float to int conversion bug
+  return IfThenElseZero(v == v, v);
+#else
+  return v;
+#endif
 }
 
+}  // namespace detail
+#endif  // !HWY_S390X_HAVE_Z14
+
 template <class D, HWY_IF_I64_D(D)>
 HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
-#if
-
+#if !HWY_S390X_HAVE_Z14 && \
+    (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds))
+  const __vector float raw_v =
+      detail::VsxF2INormalizeSrcVals(InterleaveLower(v, v)).raw;
   return VFromD<decltype(di64)>{__builtin_vsx_xvcvspsxds(raw_v)};
 #else
   const RebindToFloat<decltype(di64)> df64;
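VsxF2INormalizeSrcVals only rewrites lanes that compare unequal to themselves, i.e. NaNs, before the VSX float-to-integer builtins are used (the QEMU workaround mentioned in the comment). In scalar terms the normalization is simply:

    #include <cmath>

    // Scalar equivalent of IfThenElseZero(v == v, v): NaN lanes become 0.0f,
    // every other value passes through unchanged.
    float NormalizeForF2I(float x) { return std::isnan(x) ? 0.0f : x; }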
@@ -3050,8 +3465,10 @@ HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
 
 template <class D, HWY_IF_U64_D(D)>
 HWY_API VFromD<D> PromoteTo(D du64, VFromD<Rebind<float, D>> v) {
-#if
-
+#if !HWY_S390X_HAVE_Z14 && \
+    (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds))
+  const __vector float raw_v =
+      detail::VsxF2INormalizeSrcVals(InterleaveLower(v, v)).raw;
   return VFromD<decltype(du64)>{reinterpret_cast<__vector unsigned long long>(
       __builtin_vsx_xvcvspuxds(raw_v))};
 #else
@@ -3123,7 +3540,12 @@ HWY_API VFromD<D> PromoteUpperTo(D /*tag*/, Vec128<float> v) {
 }
 
 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
-HWY_API VFromD<D> PromoteUpperTo(D
+HWY_API VFromD<D> PromoteUpperTo(D df64, Vec128<int32_t> v) {
+#if HWY_S390X_HAVE_Z14
+  const RebindToSigned<decltype(df64)> di64;
+  return ConvertTo(df64, PromoteUpperTo(di64, v));
+#else  // VSX
+  (void)df64;
   const __vector signed int raw_v =
       InterleaveUpper(Full128<int32_t>(), v, v).raw;
 #if HWY_IS_LITTLE_ENDIAN
@@ -3131,10 +3553,16 @@ HWY_API VFromD<D> PromoteUpperTo(D /*tag*/, Vec128<int32_t> v) {
 #else
   return VFromD<D>{vec_doublee(raw_v)};
 #endif
+#endif  // HWY_S390X_HAVE_Z14
 }
 
 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
-HWY_API VFromD<D> PromoteUpperTo(D
+HWY_API VFromD<D> PromoteUpperTo(D df64, Vec128<uint32_t> v) {
+#if HWY_S390X_HAVE_Z14
+  const RebindToUnsigned<decltype(df64)> du64;
+  return ConvertTo(df64, PromoteUpperTo(du64, v));
+#else  // VSX
+  (void)df64;
   const __vector unsigned int raw_v =
       InterleaveUpper(Full128<uint32_t>(), v, v).raw;
 #if HWY_IS_LITTLE_ENDIAN
@@ -3142,12 +3570,16 @@ HWY_API VFromD<D> PromoteUpperTo(D /*tag*/, Vec128<uint32_t> v) {
 #else
   return VFromD<D>{vec_doublee(raw_v)};
 #endif
+#endif  // HWY_S390X_HAVE_Z14
 }
 
 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)>
 HWY_API VFromD<D> PromoteUpperTo(D di64, Vec128<float> v) {
-#if
-
+#if !HWY_S390X_HAVE_Z14 && \
+    (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds))
+  const __vector float raw_v =
+      detail::VsxF2INormalizeSrcVals(InterleaveUpper(Full128<float>(), v, v))
+          .raw;
   return VFromD<decltype(di64)>{__builtin_vsx_xvcvspsxds(raw_v)};
 #else
   const RebindToFloat<decltype(di64)> df64;
@@ -3157,8 +3589,11 @@ HWY_API VFromD<D> PromoteUpperTo(D di64, Vec128<float> v) {
 
 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U64_D(D)>
 HWY_API VFromD<D> PromoteUpperTo(D du64, Vec128<float> v) {
-#if
-
+#if !HWY_S390X_HAVE_Z14 && \
+    (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds))
+  const __vector float raw_v =
+      detail::VsxF2INormalizeSrcVals(InterleaveUpper(Full128<float>(), v, v))
+          .raw;
   return VFromD<decltype(du64)>{reinterpret_cast<__vector unsigned long long>(
       __builtin_vsx_xvcvspuxds(raw_v))};
 #else
@@ -3174,6 +3609,219 @@ HWY_API VFromD<D> PromoteUpperTo(D d, V v) {
   return PromoteTo(d, UpperHalf(dh, v));
 }
 
+// ------------------------------ PromoteEvenTo/PromoteOddTo
+
+namespace detail {
+
+// Signed to Signed PromoteEvenTo/PromoteOddTo for PPC9/PPC10
+#if HWY_PPC_HAVE_9 && \
+    (HWY_COMPILER_GCC_ACTUAL >= 1200 || HWY_COMPILER_CLANG >= 1200)
+
+#if HWY_IS_LITTLE_ENDIAN
+template <class D, class V>
+HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
+                                   hwy::SizeTag<4> /*to_lane_size_tag*/,
+                                   hwy::SignedTag /*from_type_tag*/, D /*d_to*/,
+                                   V v) {
+  return VFromD<D>{vec_signexti(v.raw)};
+}
+template <class D, class V>
+HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
+                                   hwy::SizeTag<8> /*to_lane_size_tag*/,
+                                   hwy::SignedTag /*from_type_tag*/, D /*d_to*/,
+                                   V v) {
+  return VFromD<D>{vec_signextll(v.raw)};
+}
+#else
+template <class D, class V>
+HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
+                                  hwy::SizeTag<4> /*to_lane_size_tag*/,
+                                  hwy::SignedTag /*from_type_tag*/, D /*d_to*/,
+                                  V v) {
+  return VFromD<D>{vec_signexti(v.raw)};
+}
+template <class D, class V>
+HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
+                                  hwy::SizeTag<8> /*to_lane_size_tag*/,
+                                  hwy::SignedTag /*from_type_tag*/, D /*d_to*/,
+                                  V v) {
+  return VFromD<D>{vec_signextll(v.raw)};
+}
+#endif  // HWY_IS_LITTLE_ENDIAN
+
+#endif  // HWY_PPC_HAVE_9
+
+// I32/U32/F32->F64 PromoteEvenTo
+#if HWY_S390X_HAVE_Z14
+template <class D, class V>
+HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
+                                   hwy::SizeTag<8> /*to_lane_size_tag*/,
+                                   hwy::FloatTag /*from_type_tag*/, D /*d_to*/,
+                                   V v) {
+  return VFromD<D>{vec_doublee(v.raw)};
+}
+template <class D, class V, class FromTypeTag, HWY_IF_UI32(TFromV<V>)>
+HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
+                                   hwy::SizeTag<8> /*to_lane_size_tag*/,
+                                   FromTypeTag /*from_type_tag*/, D d_to, V v) {
+  const Rebind<MakeWide<TFromV<V>>, decltype(d_to)> dw;
+  return ConvertTo(d_to, PromoteEvenTo(dw, v));
+}
+#else  // VSX
+template <class D, class V, class FromTypeTag>
+HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
+                                   hwy::SizeTag<8> /*to_lane_size_tag*/,
+                                   FromTypeTag /*from_type_tag*/, D /*d_to*/,
+                                   V v) {
+  return VFromD<D>{vec_doublee(v.raw)};
+}
+#endif  // HWY_S390X_HAVE_Z14
+
+// F32->I64 PromoteEvenTo
+template <class D, class V>
+HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
+                                   hwy::SizeTag<8> /*to_lane_size_tag*/,
+                                   hwy::FloatTag /*from_type_tag*/, D d_to,
+                                   V v) {
+#if !HWY_S390X_HAVE_Z14 && \
+    (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds))
+  (void)d_to;
+  const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
+#if HWY_IS_LITTLE_ENDIAN
+  // __builtin_vsx_xvcvspsxds expects the source values to be in the odd lanes
+  // on little-endian PPC, and the vec_sld operation below will shift the even
+  // lanes of normalized_v into the odd lanes.
+  return VFromD<D>{
+      __builtin_vsx_xvcvspsxds(vec_sld(normalized_v.raw, normalized_v.raw, 4))};
+#else
+  // __builtin_vsx_xvcvspsxds expects the source values to be in the even lanes
+  // on big-endian PPC.
+  return VFromD<D>{__builtin_vsx_xvcvspsxds(normalized_v.raw)};
+#endif
+#else
+  const RebindToFloat<decltype(d_to)> df64;
+  return ConvertTo(d_to, PromoteEvenTo(hwy::FloatTag(), hwy::SizeTag<8>(),
+                                       hwy::FloatTag(), df64, v));
+#endif
+}
+
+// F32->U64 PromoteEvenTo
+template <class D, class V>
+HWY_INLINE VFromD<D> PromoteEvenTo(hwy::UnsignedTag /*to_type_tag*/,
+                                   hwy::SizeTag<8> /*to_lane_size_tag*/,
+                                   hwy::FloatTag /*from_type_tag*/, D d_to,
+                                   V v) {
+#if !HWY_S390X_HAVE_Z14 && \
+    (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds))
+  (void)d_to;
+  const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
+#if HWY_IS_LITTLE_ENDIAN
+  // __builtin_vsx_xvcvspuxds expects the source values to be in the odd lanes
+  // on little-endian PPC, and the vec_sld operation below will shift the even
+  // lanes of normalized_v into the odd lanes.
+  return VFromD<D>{
+      reinterpret_cast<__vector unsigned long long>(__builtin_vsx_xvcvspuxds(
+          vec_sld(normalized_v.raw, normalized_v.raw, 4)))};
+#else
+  // __builtin_vsx_xvcvspuxds expects the source values to be in the even lanes
+  // on big-endian PPC.
+  return VFromD<D>{reinterpret_cast<__vector unsigned long long>(
+      __builtin_vsx_xvcvspuxds(normalized_v.raw))};
+#endif
+#else
+  const RebindToFloat<decltype(d_to)> df64;
+  return ConvertTo(d_to, PromoteEvenTo(hwy::FloatTag(), hwy::SizeTag<8>(),
+                                       hwy::FloatTag(), df64, v));
+#endif
+}
+
+// I32/U32/F32->F64 PromoteOddTo
+#if HWY_S390X_HAVE_Z14
+template <class D, class V>
+HWY_INLINE VFromD<D> PromoteOddTo(hwy::FloatTag /*to_type_tag*/,
+                                  hwy::SizeTag<8> /*to_lane_size_tag*/,
+                                  hwy::FloatTag /*from_type_tag*/, D d_to,
+                                  V v) {
+  return PromoteEvenTo(hwy::FloatTag(), hwy::SizeTag<8>(), hwy::FloatTag(),
+                       d_to, V{vec_sld(v.raw, v.raw, 4)});
+}
+template <class D, class V, class FromTypeTag, HWY_IF_UI32(TFromV<V>)>
+HWY_INLINE VFromD<D> PromoteOddTo(hwy::FloatTag /*to_type_tag*/,
+                                  hwy::SizeTag<8> /*to_lane_size_tag*/,
+                                  FromTypeTag /*from_type_tag*/, D d_to, V v) {
+  const Rebind<MakeWide<TFromV<V>>, decltype(d_to)> dw;
+  return ConvertTo(d_to, PromoteOddTo(dw, v));
+}
+#else
+template <class D, class V, class FromTypeTag>
+HWY_INLINE VFromD<D> PromoteOddTo(hwy::FloatTag /*to_type_tag*/,
+                                  hwy::SizeTag<8> /*to_lane_size_tag*/,
+                                  FromTypeTag /*from_type_tag*/, D /*d_to*/,
+                                  V v) {
+  return VFromD<D>{vec_doubleo(v.raw)};
+}
+#endif
+
+// F32->I64 PromoteOddTo
+template <class D, class V>
+HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
+                                  hwy::SizeTag<8> /*to_lane_size_tag*/,
+                                  hwy::FloatTag /*from_type_tag*/, D d_to,
+                                  V v) {
+#if !HWY_S390X_HAVE_Z14 && \
+    (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds))
+  (void)d_to;
+  const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
+#if HWY_IS_LITTLE_ENDIAN
+  // __builtin_vsx_xvcvspsxds expects the source values to be in the odd lanes
+  // on little-endian PPC
+  return VFromD<D>{__builtin_vsx_xvcvspsxds(normalized_v.raw)};
+#else
+  // __builtin_vsx_xvcvspsxds expects the source values to be in the even lanes
+  // on big-endian PPC, and the vec_sld operation below will shift the odd lanes
+  // of normalized_v into the even lanes.
+  return VFromD<D>{
+      __builtin_vsx_xvcvspsxds(vec_sld(normalized_v.raw, normalized_v.raw, 4))};
+#endif
+#else
+  const RebindToFloat<decltype(d_to)> df64;
+  return ConvertTo(d_to, PromoteOddTo(hwy::FloatTag(), hwy::SizeTag<8>(),
+                                      hwy::FloatTag(), df64, v));
+#endif
+}
+
+// F32->U64 PromoteOddTo
+template <class D, class V>
+HWY_INLINE VFromD<D> PromoteOddTo(hwy::UnsignedTag /*to_type_tag*/,
+                                  hwy::SizeTag<8> /*to_lane_size_tag*/,
+                                  hwy::FloatTag /*from_type_tag*/, D d_to,
+                                  V v) {
+#if !HWY_S390X_HAVE_Z14 && \
+    (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds))
+  (void)d_to;
+  const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
+#if HWY_IS_LITTLE_ENDIAN
+  // __builtin_vsx_xvcvspuxds expects the source values to be in the odd lanes
+  // on little-endian PPC
+  return VFromD<D>{reinterpret_cast<__vector unsigned long long>(
+      __builtin_vsx_xvcvspuxds(normalized_v.raw))};
+#else
+  // __builtin_vsx_xvcvspuxds expects the source values to be in the even lanes
+  // on big-endian PPC, and the vec_sld operation below will shift the odd lanes
+  // of normalized_v into the even lanes.
+  return VFromD<D>{
+      reinterpret_cast<__vector unsigned long long>(__builtin_vsx_xvcvspuxds(
+          vec_sld(normalized_v.raw, normalized_v.raw, 4)))};
+#endif
+#else
+  const RebindToFloat<decltype(d_to)> df64;
+  return ConvertTo(d_to, PromoteOddTo(hwy::FloatTag(), hwy::SizeTag<8>(),
+                                      hwy::FloatTag(), df64, v));
+#endif
+}
+
+}  // namespace detail
+
 // ------------------------------ Demotions (full -> part w/ narrow lanes)
 
 template <class D, typename FromT, HWY_IF_UNSIGNED_D(D),
@@ -3254,6 +3902,101 @@ HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<float, D>> v) {
 
 #endif  // HWY_PPC_HAVE_9
 
+#if HWY_PPC_HAVE_9
+
+#ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
+#undef HWY_NATIVE_DEMOTE_F64_TO_F16
+#else
+#define HWY_NATIVE_DEMOTE_F64_TO_F16
+#endif
+
+namespace detail {
+
+// On big-endian PPC9, VsxXscvdphp converts vf64[0] to a F16, returned as an U64
+// vector with the resulting F16 bits in the lower 16 bits of U64 lane 0
+
+// On little-endian PPC9, VsxXscvdphp converts vf64[1] to a F16, returned as
+// an U64 vector with the resulting F16 bits in the lower 16 bits of U64 lane 1
+static HWY_INLINE Vec128<uint64_t> VsxXscvdphp(Vec128<double> vf64) {
+  // Inline assembly is needed for the PPC9 xscvdphp instruction as there is
+  // currently no intrinsic available for the PPC9 xscvdphp instruction
+  __vector unsigned long long raw_result;
+  __asm__("xscvdphp %x0, %x1" : "=wa"(raw_result) : "wa"(vf64.raw));
+  return Vec128<uint64_t>{raw_result};
+}
+
+}  // namespace detail
+
+template <class D, HWY_IF_F16_D(D), HWY_IF_LANES_D(D, 1)>
+HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
+  const RebindToUnsigned<decltype(df16)> du16;
+  const Rebind<uint64_t, decltype(df16)> du64;
+
+  const Full128<double> df64_full;
+#if HWY_IS_LITTLE_ENDIAN
+  const auto bits16_as_u64 =
+      UpperHalf(du64, detail::VsxXscvdphp(Combine(df64_full, v, v)));
+#else
+  const auto bits16_as_u64 =
+      LowerHalf(du64, detail::VsxXscvdphp(ResizeBitCast(df64_full, v)));
+#endif
+
+  return BitCast(df16, TruncateTo(du16, bits16_as_u64));
+}
+
+template <class D, HWY_IF_F16_D(D), HWY_IF_LANES_D(D, 2)>
+HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
+  const RebindToUnsigned<decltype(df16)> du16;
+  const Rebind<uint64_t, decltype(df16)> du64;
+  const Rebind<double, decltype(df16)> df64;
+
+#if HWY_IS_LITTLE_ENDIAN
+  const auto bits64_as_u64_0 = detail::VsxXscvdphp(InterleaveLower(df64, v, v));
+  const auto bits64_as_u64_1 = detail::VsxXscvdphp(v);
+  const auto bits64_as_u64 =
+      InterleaveUpper(du64, bits64_as_u64_0, bits64_as_u64_1);
+#else
+  const auto bits64_as_u64_0 = detail::VsxXscvdphp(v);
+  const auto bits64_as_u64_1 = detail::VsxXscvdphp(InterleaveUpper(df64, v, v));
+  const auto bits64_as_u64 =
+      InterleaveLower(du64, bits64_as_u64_0, bits64_as_u64_1);
+#endif
+
+  return BitCast(df16, TruncateTo(du16, bits64_as_u64));
+}
+
+#elif HWY_S390X_HAVE_Z14
+
+#ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
+#undef HWY_NATIVE_DEMOTE_F64_TO_F16
+#else
+#define HWY_NATIVE_DEMOTE_F64_TO_F16
+#endif
+
+namespace detail {
+
+template <class DF32, HWY_IF_F32_D(DF32)>
+static HWY_INLINE VFromD<DF32> DemoteToF32WithRoundToOdd(
+    DF32 df32, VFromD<Rebind<double, DF32>> v) {
+  const Twice<DF32> dt_f32;
+
+  __vector float raw_f32_in_even;
+  __asm__("vledb %0,%1,0,3" : "=v"(raw_f32_in_even) : "v"(v.raw));
+
+  const VFromD<decltype(dt_f32)> f32_in_even{raw_f32_in_even};
+  return LowerHalf(df32, ConcatEven(dt_f32, f32_in_even, f32_in_even));
+}
+
+}  // namespace detail
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_F16_D(D)>
+HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
+  const Rebind<float, decltype(df16)> df32;
+  return DemoteTo(df16, detail::DemoteToF32WithRoundToOdd(df32, v));
+}
+
+#endif  // HWY_PPC_HAVE_9
+
 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_BF16_D(D)>
 HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) {
   const Rebind<uint32_t, decltype(dbf16)> du32;  // for logical shift right
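The Z14 F64->F16 path above narrows to F32 with rounding mode 3 (round to odd) before the final F32->F16 demotion; rounding the intermediate result to odd is what prevents double-rounding errors from two consecutive round-to-nearest steps. A scalar sketch of round-to-odd narrowing, illustrative only and assuming the default round-to-nearest environment with overflow handling omitted:

    #include <cmath>
    #include <cstdint>
    #include <cstring>

    // If the conversion is inexact, return whichever of the two neighbouring
    // floats bracketing x has an odd mantissa LSB.
    float F64ToF32RoundToOdd(double x) {
      float f = static_cast<float>(x);  // round-to-nearest-even
      if (!std::isfinite(f) || static_cast<double>(f) == x) return f;
      uint32_t bits;
      std::memcpy(&bits, &f, sizeof bits);
      if ((bits & 1u) == 0) {
        // Nearest neighbour is even; step towards x to reach the odd one.
        f = std::nextafterf(f, static_cast<double>(f) < x ? HUGE_VALF : -HUGE_VALF);
      }
      return f;
    }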
@@ -3393,90 +4136,164 @@ HWY_API Vec32<float> DemoteTo(D /* tag */, Vec64<double> v) {
 
 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
 HWY_API Vec64<float> DemoteTo(D d, Vec128<double> v) {
-#if HWY_IS_LITTLE_ENDIAN
+#if HWY_S390X_HAVE_Z14 || HWY_IS_LITTLE_ENDIAN
   const Vec128<float> f64_to_f32{vec_floate(v.raw)};
 #else
   const Vec128<float> f64_to_f32{vec_floato(v.raw)};
 #endif
 
+#if HWY_S390X_HAVE_Z14
+  const Twice<decltype(d)> dt;
+  return LowerHalf(d, ConcatEven(dt, f64_to_f32, f64_to_f32));
+#else
   const RebindToUnsigned<D> du;
   const Rebind<uint64_t, D> du64;
   return Vec64<float>{
       BitCast(d, TruncateTo(du, BitCast(du64, f64_to_f32))).raw};
+#endif
 }
 
 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I32_D(D)>
-HWY_API Vec32<int32_t> DemoteTo(D
-
+HWY_API Vec32<int32_t> DemoteTo(D di32, Vec64<double> v) {
+#if HWY_S390X_HAVE_Z14
+  const Rebind<int64_t, decltype(di32)> di64;
+  return DemoteTo(di32, ConvertTo(di64, v));
+#else
+  (void)di32;
+  return Vec32<int32_t>{vec_signede(detail::VsxF2INormalizeSrcVals(v).raw)};
+#endif
 }
 
 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I32_D(D)>
-HWY_API Vec64<int32_t> DemoteTo(D
+HWY_API Vec64<int32_t> DemoteTo(D di32, Vec128<double> v) {
+#if HWY_S390X_HAVE_Z14
+  const Rebind<int64_t, decltype(di32)> di64;
+  return DemoteTo(di32, ConvertTo(di64, v));
+#else
+  (void)di32;
+
 #if HWY_IS_LITTLE_ENDIAN
-  const Vec128<int32_t> f64_to_i32{
+  const Vec128<int32_t> f64_to_i32{
+      vec_signede(detail::VsxF2INormalizeSrcVals(v).raw)};
 #else
-  const Vec128<int32_t> f64_to_i32{
+  const Vec128<int32_t> f64_to_i32{
+      vec_signedo(detail::VsxF2INormalizeSrcVals(v).raw)};
 #endif
 
   const Rebind<int64_t, D> di64;
   const Vec128<int64_t> vi64 = BitCast(di64, f64_to_i32);
   return Vec64<int32_t>{vec_pack(vi64.raw, vi64.raw)};
+#endif
 }
 
 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U32_D(D)>
-HWY_API Vec32<uint32_t> DemoteTo(D
-
+HWY_API Vec32<uint32_t> DemoteTo(D du32, Vec64<double> v) {
+#if HWY_S390X_HAVE_Z14
+  const Rebind<uint64_t, decltype(du32)> du64;
+  return DemoteTo(du32, ConvertTo(du64, v));
+#else
+  (void)du32;
+  return Vec32<uint32_t>{vec_unsignede(detail::VsxF2INormalizeSrcVals(v).raw)};
+#endif
 }
 
 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U32_D(D)>
-HWY_API Vec64<uint32_t> DemoteTo(D
+HWY_API Vec64<uint32_t> DemoteTo(D du32, Vec128<double> v) {
+#if HWY_S390X_HAVE_Z14
+  const Rebind<uint64_t, decltype(du32)> du64;
+  return DemoteTo(du32, ConvertTo(du64, v));
+#else
+  (void)du32;
 #if HWY_IS_LITTLE_ENDIAN
-  const Vec128<uint32_t> f64_to_u32{
+  const Vec128<uint32_t> f64_to_u32{
+      vec_unsignede(detail::VsxF2INormalizeSrcVals(v).raw)};
 #else
-  const Vec128<uint32_t> f64_to_u32{
+  const Vec128<uint32_t> f64_to_u32{
+      vec_unsignedo(detail::VsxF2INormalizeSrcVals(v).raw)};
 #endif
 
   const Rebind<uint64_t, D> du64;
   const Vec128<uint64_t> vu64 = BitCast(du64, f64_to_u32);
   return Vec64<uint32_t>{vec_pack(vu64.raw, vu64.raw)};
+#endif
+}
+
+#if HWY_S390X_HAVE_Z14
+namespace detail {
+
+template <class V, HWY_IF_I64(TFromV<V>)>
+HWY_INLINE VFromD<RebindToFloat<DFromV<V>>> ConvToF64WithRoundToOdd(V v) {
+  __vector double raw_result;
+  // Use inline assembly to do a round-to-odd I64->F64 conversion on Z14
+  __asm__("vcdgb %0,%1,0,3" : "=v"(raw_result) : "v"(v.raw));
+  return VFromD<RebindToFloat<DFromV<V>>>{raw_result};
+}
+
+template <class V, HWY_IF_U64(TFromV<V>)>
+HWY_INLINE VFromD<RebindToFloat<DFromV<V>>> ConvToF64WithRoundToOdd(V v) {
+  __vector double raw_result;
+  // Use inline assembly to do a round-to-odd U64->F64 conversion on Z14
+  __asm__("vcdlgb %0,%1,0,3" : "=v"(raw_result) : "v"(v.raw));
+  return VFromD<RebindToFloat<DFromV<V>>>{raw_result};
 }
 
+}  // namespace detail
+#endif  // HWY_S390X_HAVE_Z14
+
 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
-HWY_API Vec32<float> DemoteTo(D
+HWY_API Vec32<float> DemoteTo(D df32, Vec64<int64_t> v) {
+#if HWY_S390X_HAVE_Z14
+  return DemoteTo(df32, detail::ConvToF64WithRoundToOdd(v));
+#else  // VSX
+  (void)df32;
   return Vec32<float>{vec_floate(v.raw)};
+#endif
 }
 
 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
-HWY_API Vec64<float> DemoteTo(D
+HWY_API Vec64<float> DemoteTo(D df32, Vec128<int64_t> v) {
+#if HWY_S390X_HAVE_Z14
+  return DemoteTo(df32, detail::ConvToF64WithRoundToOdd(v));
+#else  // VSX
 #if HWY_IS_LITTLE_ENDIAN
   const Vec128<float> i64_to_f32{vec_floate(v.raw)};
 #else
   const Vec128<float> i64_to_f32{vec_floato(v.raw)};
 #endif
 
-  const RebindToUnsigned<
-  const Rebind<uint64_t,
+  const RebindToUnsigned<decltype(df32)> du32;
+  const Rebind<uint64_t, decltype(df32)> du64;
   return Vec64<float>{
-      BitCast(
+      BitCast(df32, TruncateTo(du32, BitCast(du64, i64_to_f32))).raw};
+#endif
 }
 
 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
-HWY_API Vec32<float> DemoteTo(D
+HWY_API Vec32<float> DemoteTo(D df32, Vec64<uint64_t> v) {
+#if HWY_S390X_HAVE_Z14
+  return DemoteTo(df32, detail::ConvToF64WithRoundToOdd(v));
+#else  // VSX
+  (void)df32;
   return Vec32<float>{vec_floate(v.raw)};
+#endif
 }
 
 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
-HWY_API Vec64<float> DemoteTo(D
+HWY_API Vec64<float> DemoteTo(D df32, Vec128<uint64_t> v) {
+#if HWY_S390X_HAVE_Z14
+  return DemoteTo(df32, detail::ConvToF64WithRoundToOdd(v));
+#else  // VSX
 #if HWY_IS_LITTLE_ENDIAN
   const Vec128<float> u64_to_f32{vec_floate(v.raw)};
 #else
   const Vec128<float> u64_to_f32{vec_floato(v.raw)};
 #endif
 
-  const RebindToUnsigned<
-  const Rebind<uint64_t,
+  const RebindToUnsigned<decltype(df32)> du;
+  const Rebind<uint64_t, decltype(df32)> du64;
   return Vec64<float>{
-      BitCast(
+      BitCast(df32, TruncateTo(du, BitCast(du64, u64_to_f32))).raw};
+#endif
 }
 
 // For already range-limited input [0, 255].
@@ -3491,17 +4308,39 @@ HWY_API Vec128<uint8_t, N> U8FromU32(Vec128<uint32_t, N> v) {
 // Note: altivec.h vec_ct* currently contain C casts which triggers
 // -Wdeprecate-lax-vec-conv-all warnings, so disable them.
 
-
-
+#if HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
+template <class D, typename FromT, HWY_IF_F32_D(D), HWY_IF_UI32(FromT),
+          HWY_IF_V_SIZE_LE_D(D, 8)>
+HWY_API VFromD<D> ConvertTo(D df32,
+                            Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
+  const Rebind<double, decltype(df32)> df64;
+  return DemoteTo(df32, PromoteTo(df64, v));
+}
+template <class D, typename FromT, HWY_IF_F32_D(D), HWY_IF_UI32(FromT),
+          HWY_IF_V_SIZE_D(D, 16)>
+HWY_API VFromD<D> ConvertTo(D df32, Vec128<FromT> v) {
+  const RepartitionToWide<decltype(df32)> df64;
+
+  const VFromD<D> vf32_lo{vec_floate(PromoteLowerTo(df64, v).raw)};
+  const VFromD<D> vf32_hi{vec_floate(PromoteUpperTo(df64, v).raw)};
+  return ConcatEven(df32, vf32_hi, vf32_lo);
+}
+#else  // Z15 or PPC
+template <class D, typename FromT, HWY_IF_F32_D(D), HWY_IF_UI32(FromT)>
 HWY_API VFromD<D> ConvertTo(D /* tag */,
                             Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
   HWY_DIAGNOSTICS(push)
 #if HWY_COMPILER_CLANG
   HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
 #endif
+#if HWY_S390X_HAVE_Z15
+  return VFromD<D>{vec_float(v.raw)};
+#else
   return VFromD<D>{vec_ctf(v.raw, 0)};
+#endif
   HWY_DIAGNOSTICS(pop)
 }
+#endif  // HWY_TARGET == HWY_Z14
 
 template <class D, typename FromT, HWY_IF_F64_D(D), HWY_IF_NOT_FLOAT(FromT),
           HWY_IF_T_SIZE_D(D, sizeof(FromT))>
@@ -3511,38 +4350,195 @@ HWY_API VFromD<D> ConvertTo(D /* tag */,
 }
 
 // Truncates (rounds toward zero).
-
-
+#if HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
+template <class D, HWY_IF_I32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
+HWY_API VFromD<D> ConvertTo(D di32,
+                            Vec128<float, Rebind<float, D>().MaxLanes()> v) {
+  const Rebind<int64_t, decltype(di32)> di64;
+  return DemoteTo(di32, PromoteTo(di64, v));
+}
+template <class D, HWY_IF_I32_D(D), HWY_IF_V_SIZE_D(D, 16)>
+HWY_API VFromD<D> ConvertTo(D di32,
+                            Vec128<float, Rebind<float, D>().MaxLanes()> v) {
+  const RepartitionToWide<decltype(di32)> di64;
+  return OrderedDemote2To(di32, PromoteLowerTo(di64, v),
+                          PromoteUpperTo(di64, v));
+}
+#else  // Z15 or PPC
+template <class D, HWY_IF_I32_D(D)>
 HWY_API VFromD<D> ConvertTo(D /* tag */,
-                            Vec128<
+                            Vec128<float, Rebind<float, D>().MaxLanes()> v) {
+#if defined(__OPTIMIZE__)
+  if (detail::IsConstantRawAltivecVect(v.raw)) {
+    constexpr int32_t kMinI32 = LimitsMin<int32_t>();
+    constexpr int32_t kMaxI32 = LimitsMax<int32_t>();
+    return Dup128VecFromValues(
+        D(),
+        (v.raw[0] >= -2147483648.0f)
+            ? ((v.raw[0] < 2147483648.0f) ? static_cast<int32_t>(v.raw[0])
+                                          : kMaxI32)
+            : ((v.raw[0] < 0) ? kMinI32 : 0),
+        (v.raw[1] >= -2147483648.0f)
+            ? ((v.raw[1] < 2147483648.0f) ? static_cast<int32_t>(v.raw[1])
+                                          : kMaxI32)
+            : ((v.raw[1] < 0) ? kMinI32 : 0),
+        (v.raw[2] >= -2147483648.0f)
+            ? ((v.raw[2] < 2147483648.0f) ? static_cast<int32_t>(v.raw[2])
+                                          : kMaxI32)
+            : ((v.raw[2] < 0) ? kMinI32 : 0),
+        (v.raw[3] >= -2147483648.0f)
+            ? ((v.raw[3] < 2147483648.0f) ? static_cast<int32_t>(v.raw[3])
+                                          : kMaxI32)
+            : ((v.raw[3] < 0) ? kMinI32 : 0));
+  }
+#endif
+
+#if HWY_S390X_HAVE_Z15
+  // Use inline assembly on Z15 to avoid undefined behavior if v[i] is not in
+  // the range of an int32_t
+  __vector signed int raw_result;
+  __asm__("vcfeb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw));
+  return VFromD<D>{raw_result};
+#else
   HWY_DIAGNOSTICS(push)
 #if HWY_COMPILER_CLANG
   HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
 #endif
   return VFromD<D>{vec_cts(v.raw, 0)};
   HWY_DIAGNOSTICS(pop)
+#endif  // HWY_S390X_HAVE_Z15
 }
+#endif  // HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
 
-template <class D,
-          HWY_IF_T_SIZE_D(D, sizeof(FromT))>
+template <class D, HWY_IF_I64_D(D)>
 HWY_API VFromD<D> ConvertTo(D /* tag */,
-                            Vec128<
+                            Vec128<double, Rebind<double, D>().MaxLanes()> v) {
+#if defined(__OPTIMIZE__)
+  if (detail::IsConstantRawAltivecVect(v.raw)) {
+    constexpr int64_t kMinI64 = LimitsMin<int64_t>();
+    constexpr int64_t kMaxI64 = LimitsMax<int64_t>();
+    return Dup128VecFromValues(D(),
+                               (v.raw[0] >= -9223372036854775808.0)
+                                   ? ((v.raw[0] < 9223372036854775808.0)
+                                          ? static_cast<int64_t>(v.raw[0])
+                                          : kMaxI64)
+                                   : ((v.raw[0] < 0) ? kMinI64 : 0LL),
+                               (v.raw[1] >= -9223372036854775808.0)
+                                   ? ((v.raw[1] < 9223372036854775808.0)
+                                          ? static_cast<int64_t>(v.raw[1])
+                                          : kMaxI64)
+                                   : ((v.raw[1] < 0) ? kMinI64 : 0LL));
+  }
+#endif
+
+  // Use inline assembly to avoid undefined behavior if v[i] is not within the
+  // range of an int64_t
+  __vector signed long long raw_result;
+#if HWY_S390X_HAVE_Z14
+  __asm__("vcgdb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw));
+#else
+  __asm__("xvcvdpsxds %x0,%x1"
+          : "=wa"(raw_result)
+          : "wa"(detail::VsxF2INormalizeSrcVals(v).raw));
+#endif
+  return VFromD<D>{raw_result};
+}
+
+#if HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
+template <class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
+HWY_API VFromD<D> ConvertTo(D du32,
+                            Vec128<float, Rebind<float, D>().MaxLanes()> v) {
+  const Rebind<uint64_t, decltype(du32)> du64;
+  return DemoteTo(du32, PromoteTo(du64, v));
+}
+template <class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_D(D, 16)>
+HWY_API VFromD<D> ConvertTo(D du32,
+                            Vec128<float, Rebind<float, D>().MaxLanes()> v) {
+  const RepartitionToWide<decltype(du32)> du64;
+  return OrderedDemote2To(du32, PromoteLowerTo(du64, v),
+                          PromoteUpperTo(du64, v));
+}
+#else  // Z15 or VSX
+template <class D, HWY_IF_U32_D(D)>
+HWY_API VFromD<D> ConvertTo(D /* tag */,
+                            Vec128<float, Rebind<float, D>().MaxLanes()> v) {
+#if defined(__OPTIMIZE__)
+  if (detail::IsConstantRawAltivecVect(v.raw)) {
+    constexpr uint32_t kMaxU32 = LimitsMax<uint32_t>();
+    return Dup128VecFromValues(
+        D(),
+        (v.raw[0] >= 0.0f)
+            ? ((v.raw[0] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[0])
+                                          : kMaxU32)
+            : 0,
+        (v.raw[1] >= 0.0f)
+            ? ((v.raw[1] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[1])
+                                          : kMaxU32)
+            : 0,
+        (v.raw[2] >= 0.0f)
+            ? ((v.raw[2] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[2])
+                                          : kMaxU32)
+            : 0,
+        (v.raw[3] >= 0.0f)
+            ? ((v.raw[3] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[3])
+                                          : kMaxU32)
+            : 0);
+  }
+#endif
+
+#if HWY_S390X_HAVE_Z15
+  // Use inline assembly on Z15 to avoid undefined behavior if v[i] is not in
+  // the range of an uint32_t
+  __vector unsigned int raw_result;
+  __asm__("vclfeb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw));
+  return VFromD<D>{raw_result};
+#else  // VSX
   HWY_DIAGNOSTICS(push)
 #if HWY_COMPILER_CLANG
   HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
 #endif
-
+  VFromD<D> result{vec_ctu(v.raw, 0)};
   HWY_DIAGNOSTICS(pop)
+  return result;
+#endif  // HWY_S390X_HAVE_Z15
 }
+#endif  // HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
 
-template <
-HWY_API
+template <class D, HWY_IF_U64_D(D)>
+HWY_API VFromD<D> ConvertTo(D /* tag */,
+                            Vec128<double, Rebind<double, D>().MaxLanes()> v) {
   HWY_DIAGNOSTICS(push)
 #if HWY_COMPILER_CLANG
   HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
 #endif
-
-
+
+#if defined(__OPTIMIZE__)
+  if (detail::IsConstantRawAltivecVect(v.raw)) {
+    constexpr uint64_t kMaxU64 = LimitsMax<uint64_t>();
+    return Dup128VecFromValues(
+        D(),
+        (v.raw[0] >= 0.0) ? ((v.raw[0] < 18446744073709551616.0)
+                                 ? static_cast<uint64_t>(v.raw[0])
+                                 : kMaxU64)
+                          : 0,
+        (v.raw[1] >= 0.0) ? ((v.raw[1] < 18446744073709551616.0)
+                                 ? static_cast<uint64_t>(v.raw[1])
+                                 : kMaxU64)
+                          : 0);
+  }
+#endif
+
+  // Use inline assembly to avoid undefined behavior if v[i] is not within the
+  // range of an uint64_t
+  __vector unsigned long long raw_result;
+#if HWY_S390X_HAVE_Z14
+  __asm__("vclgdb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw));
+#else  // VSX
+  __asm__("xvcvdpuxds %x0,%x1"
+          : "=wa"(raw_result)
+          : "wa"(detail::VsxF2INormalizeSrcVals(v).raw));
+#endif
+  return VFromD<D>{raw_result};
 }
 
 // ------------------------------ Floating-point rounding (ConvertTo)
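The constant-folded branches above spell out the semantics the inline-assembly conversions provide at run time: values are clamped to the destination range, truncated toward zero, and NaN maps to zero. The same rule for a single float to int32_t lane, as a scalar reference:

    #include <cstdint>

    // Clamp to [INT32_MIN, INT32_MAX], truncate toward zero, NaN -> 0.
    int32_t SaturatingF32ToI32(float x) {
      if (x >= -2147483648.0f) {
        return (x < 2147483648.0f) ? static_cast<int32_t>(x) : INT32_MAX;
      }
      return (x < 0.0f) ? INT32_MIN : 0;  // the final 0 also catches NaN
    }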
@@ -3555,7 +4551,18 @@ HWY_API Vec128<float, N> Round(Vec128<float, N> v) {
 
 template <size_t N>
 HWY_API Vec128<double, N> Round(Vec128<double, N> v) {
+#if HWY_S390X_HAVE_Z14
+  return Vec128<double, N>{vec_round(v.raw)};
+#else
   return Vec128<double, N>{vec_rint(v.raw)};
+#endif
+}
+
+template <size_t N>
+HWY_API Vec128<int32_t, N> NearestInt(Vec128<float, N> v) {
+  const DFromV<decltype(v)> d;
+  const RebindToSigned<decltype(d)> di;
+  return ConvertTo(di, Round(v));
 }
 
 // Toward zero, aka truncate
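NearestInt is defined above as ConvertTo(di, Round(v)): round to nearest (ties to even) first, then apply the saturating conversion. A self-contained scalar analogue under the default rounding mode, illustrative only:

    #include <cmath>
    #include <cstdint>

    // Round half-to-even, then clamp with NaN -> 0, as in the vector path.
    int32_t NearestIntScalar(float x) {
      const float r = std::nearbyintf(x);
      if (r >= -2147483648.0f) {
        return (r < 2147483648.0f) ? static_cast<int32_t>(r) : INT32_MAX;
      }
      return (r < 0.0f) ? INT32_MIN : 0;
    }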
@@ -3613,7 +4620,7 @@ HWY_API Mask128<T, N> IsFinite(Vec128<T, N> v) {
 
 // ================================================== CRYPTO
 
-#if !defined(HWY_DISABLE_PPC8_CRYPTO)
+#if !HWY_S390X_HAVE_Z14 && !defined(HWY_DISABLE_PPC8_CRYPTO)
 
 // Per-target flag to prevent generic_ops-inl.h from defining AESRound.
 #ifdef HWY_NATIVE_AES
@@ -3918,11 +4925,20 @@ struct CompressIsPartition {
   enum { value = (sizeof(T) != 1) };
 };
 
+// ------------------------------ Dup128MaskFromMaskBits
+
+template <class D>
+HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
+  constexpr size_t kN = MaxLanes(d);
+  if (kN < 8) mask_bits &= (1u << kN) - 1;
+  return detail::LoadMaskBits128(d, mask_bits);
+}
+
 // ------------------------------ StoreMaskBits
 
 namespace detail {
 
-#if !HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN
+#if !HWY_S390X_HAVE_Z14 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
 // fallback for missing vec_extractm
 template <size_t N>
 HWY_INLINE uint64_t ExtractSignBits(Vec128<uint8_t, N> sign_bits,
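Dup128MaskFromMaskBits maps bit i of mask_bits to lane i within each 128-bit block, after clearing bits above the lane count. A scalar sketch of that mapping, with a hypothetical helper name:

    #include <cstddef>

    // True if lane `lane` is selected; the same mask_bits pattern repeats for
    // every 128-bit block, so only lane % lanes_per_block matters.
    bool LaneIsSelected(unsigned mask_bits, size_t lane, size_t lanes_per_block) {
      return ((mask_bits >> (lane % lanes_per_block)) & 1u) != 0;
    }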
@@ -3935,32 +4951,70 @@ HWY_INLINE uint64_t ExtractSignBits(Vec128<uint8_t, N> sign_bits,
   return extracted.raw[HWY_IS_LITTLE_ENDIAN];
 }
 
-#endif  // !HWY_PPC_HAVE_10
+#endif  // !HWY_S390X_HAVE_Z14 && !HWY_PPC_HAVE_10
+
+#if HWY_S390X_HAVE_Z14
+template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
+HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128<T, N> mask) {
+  const DFromM<decltype(mask)> d;
+  const Repartition<uint8_t, decltype(d)> du8;
+  const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
+
+  return ReduceSum(
+      du8, And(sign_bits, Dup128VecFromValues(du8, 1, 2, 4, 8, 16, 32, 64, 128,
+                                              1, 2, 4, 8, 16, 32, 64, 128)));
+}
+
+template <typename T>
+HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128<T> mask) {
+  const DFromM<decltype(mask)> d;
+  const Repartition<uint8_t, decltype(d)> du8;
+  const Repartition<uint64_t, decltype(d)> du64;
+  const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
+
+  const auto mask_bytes = SumsOf8(
+      And(sign_bits, Dup128VecFromValues(du8, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2,
+                                         4, 8, 16, 32, 64, 128)));
 
+  const Rebind<uint8_t, decltype(du64)> du8_2;
+  const Repartition<uint16_t, decltype(du8_2)> du16_1;
+  return GetLane(
+      BitCast(du16_1, TruncateTo(du8_2, Reverse2(du64, mask_bytes))));
+}
+#else
 template <typename T, size_t N>
 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128<T, N> mask) {
   const DFromM<decltype(mask)> d;
   const Repartition<uint8_t, decltype(d)> du8;
   const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
+
 #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
   return static_cast<uint64_t>(vec_extractm(sign_bits.raw));
-#else
+#else  // PPC8, PPC9, or big-endian PPC10
   const __vector unsigned char kBitShuffle = {120, 112, 104, 96, 88, 80, 72, 64,
                                               56, 48, 40, 32, 24, 16, 8, 0};
   return ExtractSignBits(sign_bits, kBitShuffle);
-#endif  // HWY_PPC_HAVE_10
+#endif  // HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
 }
+#endif  // HWY_S390X_HAVE_Z14
 
 template <typename T, size_t N>
 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128<T, N> mask) {
   const DFromM<decltype(mask)> d;
+  const RebindToUnsigned<decltype(d)> du;
+
+#if HWY_S390X_HAVE_Z14
+  const VFromD<decltype(du)> sign_bits = BitCast(du, VecFromMask(d, mask));
+  return ReduceSum(
+      du, And(sign_bits, Dup128VecFromValues(du, 1, 2, 4, 8, 16, 32, 64, 128)));
+#else  // VSX
   const Repartition<uint8_t, decltype(d)> du8;
   const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
 
 #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
-  const RebindToUnsigned<decltype(d)> du;
   return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw));
-#else
+#else  // PPC8, PPC9, or big-endian PPC10
+  (void)du;
 #if HWY_IS_LITTLE_ENDIAN
   const __vector unsigned char kBitShuffle = {
       112, 96, 80, 64, 48, 32, 16, 0, 128, 128, 128, 128, 128, 128, 128, 128};
@@ -3970,17 +5024,25 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128<T, N> mask) {
 #endif
   return ExtractSignBits(sign_bits, kBitShuffle);
 #endif  // HWY_PPC_HAVE_10
+#endif  // HWY_S390X_HAVE_Z14
 }
 
 template <typename T, size_t N>
 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128<T, N> mask) {
   const DFromM<decltype(mask)> d;
+  const RebindToUnsigned<decltype(d)> du;
+
+#if HWY_S390X_HAVE_Z14
+  const VFromD<decltype(du)> sign_bits = BitCast(du, VecFromMask(d, mask));
+  return ReduceSum(du, And(sign_bits, Dup128VecFromValues(du, 1, 2, 4, 8)));
+#else  // VSX
   const Repartition<uint8_t, decltype(d)> du8;
   const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
+
 #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
-  const RebindToUnsigned<decltype(d)> du;
   return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw));
-#else
+#else  // PPC8, PPC9, or big-endian PPC10
+  (void)du;
 #if HWY_IS_LITTLE_ENDIAN
   const __vector unsigned char kBitShuffle = {96, 64, 32, 0, 128, 128,
                                               128, 128, 128, 128, 128, 128,
@@ -3992,17 +5054,25 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128<T, N> mask) {
 #endif
   return ExtractSignBits(sign_bits, kBitShuffle);
 #endif  // HWY_PPC_HAVE_10
+#endif  // HWY_S390X_HAVE_Z14
 }
 
 template <typename T, size_t N>
 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128<T, N> mask) {
   const DFromM<decltype(mask)> d;
+  const RebindToUnsigned<decltype(d)> du;
+
+#if HWY_S390X_HAVE_Z14
+  const VFromD<decltype(du)> sign_bits = BitCast(du, VecFromMask(d, mask));
+  return ReduceSum(du, And(sign_bits, Dup128VecFromValues(du, 1, 2)));
+#else  // VSX
   const Repartition<uint8_t, decltype(d)> du8;
   const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
+
 #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
-  const RebindToUnsigned<decltype(d)> du;
   return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw));
 #else
+  (void)du;
 #if HWY_IS_LITTLE_ENDIAN
   const __vector unsigned char kBitShuffle = {64, 0, 128, 128, 128, 128,
                                               128, 128, 128, 128, 128, 128,
@@ -4014,6 +5084,7 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128<T, N> mask) {
 #endif
   return ExtractSignBits(sign_bits, kBitShuffle);
 #endif  // HWY_PPC_HAVE_10
+#endif  // HWY_S390X_HAVE_Z14
 }
 
 // Returns the lowest N of the mask bits.
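The Z14 BitsFromMask paths added in the hunks above rely on mask lanes being all-ones or all-zeros: AND-ing with per-lane weights 1, 2, 4, ... and then horizontally summing yields the packed bit pattern. A scalar sketch of the idea for up to 8 lanes (hypothetical helper, simplified types):

    #include <cstddef>
    #include <cstdint>

    // lanes[i] is 0xFF for a true lane and 0x00 for a false lane (n <= 8).
    uint64_t BitsFromMaskScalar(const uint8_t* lanes, size_t n) {
      uint64_t bits = 0;
      for (size_t i = 0; i < n; ++i) {
        bits += lanes[i] & static_cast<uint8_t>(1u << i);  // weight = 1 << i
      }
      return bits;
    }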
@@ -4076,31 +5147,32 @@ HWY_API size_t StoreMaskBits(D /*d*/, MFromD<D> mask, uint8_t* bits) {
 template <class D, HWY_IF_V_SIZE_D(D, 16)>
 HWY_API bool AllFalse(D d, MFromD<D> mask) {
   const RebindToUnsigned<decltype(d)> du;
-  return static_cast<bool>(
+  return static_cast<bool>(
+      vec_all_eq(VecFromMask(du, RebindMask(du, mask)).raw, Zero(du).raw));
 }
 
 template <class D, HWY_IF_V_SIZE_D(D, 16)>
 HWY_API bool AllTrue(D d, MFromD<D> mask) {
   const RebindToUnsigned<decltype(d)> du;
   using TU = TFromD<decltype(du)>;
-  return static_cast<bool>(
-
+  return static_cast<bool>(vec_all_eq(VecFromMask(du, RebindMask(du, mask)).raw,
+                                      Set(du, hwy::LimitsMax<TU>()).raw));
 }
 
 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
 HWY_API bool AllFalse(D d, MFromD<D> mask) {
   const Full128<TFromD<D>> d_full;
   constexpr size_t kN = MaxLanes(d);
-  return AllFalse(d_full,
-
+  return AllFalse(d_full,
+                  And(MFromD<decltype(d_full)>{mask.raw}, FirstN(d_full, kN)));
 }
 
 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
 HWY_API bool AllTrue(D d, MFromD<D> mask) {
   const Full128<TFromD<D>> d_full;
   constexpr size_t kN = MaxLanes(d);
-  return AllTrue(
-
+  return AllTrue(
+      d_full, Or(MFromD<decltype(d_full)>{mask.raw}, Not(FirstN(d_full, kN))));
 }
 
 template <class D>
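For partial vectors, the rewrapped AllFalse/AllTrue mask off the lanes beyond kN with FirstN before testing, so undefined upper lanes cannot affect the result. The same idea expressed on packed mask bits (sketch only, hypothetical helper):

    #include <cstddef>
    #include <cstdint>

    // Only the low kN bits are meaningful; ignore the rest.
    bool AllTrueBits(uint64_t mask_bits, size_t kN) {
      const uint64_t valid = (kN < 64) ? ((uint64_t{1} << kN) - 1) : ~uint64_t{0};
      return (mask_bits & valid) == valid;
    }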
@@ -4222,7 +5294,7 @@ HWY_INLINE VFromD<D> CompressOrExpandIndicesFromMask(D d, MFromD<D> mask) {
   __asm__("xxgenpcvbm %x0, %1, %2"
           : "=wa"(idx)
           : "v"(mask.raw), "i"(kGenPcvmMode));
-  return VFromD<
+  return VFromD<decltype(d)>{idx};
 }
 template <bool kIsCompress, class D, HWY_IF_T_SIZE_D(D, 2)>
 HWY_INLINE VFromD<D> CompressOrExpandIndicesFromMask(D d, MFromD<D> mask) {
@@ -4235,7 +5307,7 @@ HWY_INLINE VFromD<D> CompressOrExpandIndicesFromMask(D d, MFromD<D> mask) {
   __asm__("xxgenpcvhm %x0, %1, %2"
           : "=wa"(idx)
           : "v"(mask.raw), "i"(kGenPcvmMode));
-  return VFromD<
+  return VFromD<decltype(d)>{idx};
 }
 template <bool kIsCompress, class D, HWY_IF_T_SIZE_D(D, 4)>
 HWY_INLINE VFromD<D> CompressOrExpandIndicesFromMask(D d, MFromD<D> mask) {
@@ -4248,7 +5320,7 @@ HWY_INLINE VFromD<D> CompressOrExpandIndicesFromMask(D d, MFromD<D> mask) {
   __asm__("xxgenpcvwm %x0, %1, %2"
           : "=wa"(idx)
          : "v"(mask.raw), "i"(kGenPcvmMode));
-  return VFromD<
+  return VFromD<decltype(d)>{idx};
 }
 #endif
 
@@ -4821,7 +5893,7 @@ HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
 
   const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
   const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
-#if HWY_PPC_HAVE_9
+#if (HWY_PPC_HAVE_9 && HWY_ARCH_PPC_64) || HWY_S390X_HAVE_Z14
   StoreN(compressed, d, unaligned, count);
 #else
   BlendedStore(compressed, FirstN(d, count), d, unaligned);
@@ -4939,7 +6011,11 @@ HWY_INLINE V Per128BitBlkRevLanesOnBe(V v) {
 
 template <class V>
 HWY_INLINE V I128Subtract(V a, V b) {
-#if
+#if HWY_S390X_HAVE_Z14
+  const V diff_i128{reinterpret_cast<typename detail::Raw128<TFromV<V>>::type>(
+      vec_sub_u128(reinterpret_cast<__vector unsigned char>(a.raw),
+                   reinterpret_cast<__vector unsigned char>(b.raw)))};
+#elif defined(__SIZEOF_INT128__)
   using VU128 = __vector unsigned __int128;
   const V diff_i128{reinterpret_cast<typename detail::Raw128<TFromV<V>>::type>(
       vec_sub(reinterpret_cast<VU128>(a.raw), reinterpret_cast<VU128>(b.raw)))};
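The new Z14 branch of I128Subtract uses vec_sub_u128; both branches compute a full 128-bit difference. A portable two-limb model of that subtraction with borrow (sketch only, hypothetical helper):

    #include <cstdint>

    // 128-bit subtract expressed on two 64-bit limbs (lo, hi).
    void Sub128(uint64_t a_lo, uint64_t a_hi, uint64_t b_lo, uint64_t b_hi,
                uint64_t* out_lo, uint64_t* out_hi) {
      *out_lo = a_lo - b_lo;
      const uint64_t borrow = (a_lo < b_lo) ? 1u : 0u;
      *out_hi = a_hi - b_hi - borrow;
    }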
@@ -5067,84 +6143,133 @@ HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) {
   return SetBeforeFirst(MaskFromVec(ShiftLeftLanes<1>(VecFromMask(d, mask))));
 }
 
-// ------------------------------
-
+// ------------------------------ SumsOf2 and SumsOf4
 namespace detail {
 
-
-
-
-
-
-
-
-
-
-
-
-
+#if !HWY_S390X_HAVE_Z14
+// Casts nominally int32_t result to D.
+template <class D>
+HWY_INLINE VFromD<D> AltivecVsum4sbs(D d, __vector signed char a,
+                                     __vector signed int b) {
+  const Repartition<int32_t, D> di32;
+#ifdef __OPTIMIZE__
+  if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) {
+    const int64_t sum0 =
+        static_cast<int64_t>(a[0]) + static_cast<int64_t>(a[1]) +
+        static_cast<int64_t>(a[2]) + static_cast<int64_t>(a[3]) +
+        static_cast<int64_t>(b[0]);
+    const int64_t sum1 =
+        static_cast<int64_t>(a[4]) + static_cast<int64_t>(a[5]) +
+        static_cast<int64_t>(a[6]) + static_cast<int64_t>(a[7]) +
+        static_cast<int64_t>(b[1]);
+    const int64_t sum2 =
+        static_cast<int64_t>(a[8]) + static_cast<int64_t>(a[9]) +
+        static_cast<int64_t>(a[10]) + static_cast<int64_t>(a[11]) +
+        static_cast<int64_t>(b[2]);
+    const int64_t sum3 =
+        static_cast<int64_t>(a[12]) + static_cast<int64_t>(a[13]) +
+        static_cast<int64_t>(a[14]) + static_cast<int64_t>(a[15]) +
+        static_cast<int64_t>(b[3]);
+    const int32_t sign0 = static_cast<int32_t>(sum0 >> 63);
+    const int32_t sign1 = static_cast<int32_t>(sum1 >> 63);
+    const int32_t sign2 = static_cast<int32_t>(sum2 >> 63);
+    const int32_t sign3 = static_cast<int32_t>(sum3 >> 63);
+    using Raw = typename detail::Raw128<int32_t>::type;
+    return BitCast(
+        d,
+        VFromD<decltype(di32)>{Raw{
+            (sign0 == (sum0 >> 31)) ? static_cast<int32_t>(sum0)
+                                    : static_cast<int32_t>(sign0 ^ 0x7FFFFFFF),
+            (sign1 == (sum1 >> 31)) ? static_cast<int32_t>(sum1)
+                                    : static_cast<int32_t>(sign1 ^ 0x7FFFFFFF),
+            (sign2 == (sum2 >> 31)) ? static_cast<int32_t>(sum2)
+                                    : static_cast<int32_t>(sign2 ^ 0x7FFFFFFF),
+            (sign3 == (sum3 >> 31))
+                ? static_cast<int32_t>(sum3)
+                : static_cast<int32_t>(sign3 ^ 0x7FFFFFFF)}});
+  } else  // NOLINT
+#endif
+  {
+    return BitCast(d, VFromD<decltype(di32)>{vec_vsum4sbs(a, b)});
+  }
 }
 
-//
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+// Casts nominally uint32_t result to D.
+template <class D>
+HWY_INLINE VFromD<D> AltivecVsum4ubs(D d, __vector unsigned char a,
+                                     __vector unsigned int b) {
+  const Repartition<uint32_t, D> du32;
+#ifdef __OPTIMIZE__
+  if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) {
+    const uint64_t sum0 =
+        static_cast<uint64_t>(a[0]) + static_cast<uint64_t>(a[1]) +
+        static_cast<uint64_t>(a[2]) + static_cast<uint64_t>(a[3]) +
+        static_cast<uint64_t>(b[0]);
+    const uint64_t sum1 =
+        static_cast<uint64_t>(a[4]) + static_cast<uint64_t>(a[5]) +
+        static_cast<uint64_t>(a[6]) + static_cast<uint64_t>(a[7]) +
+        static_cast<uint64_t>(b[1]);
+    const uint64_t sum2 =
+        static_cast<uint64_t>(a[8]) + static_cast<uint64_t>(a[9]) +
+        static_cast<uint64_t>(a[10]) + static_cast<uint64_t>(a[11]) +
+        static_cast<uint64_t>(b[2]);
+    const uint64_t sum3 =
+        static_cast<uint64_t>(a[12]) + static_cast<uint64_t>(a[13]) +
+        static_cast<uint64_t>(a[14]) + static_cast<uint64_t>(a[15]) +
+        static_cast<uint64_t>(b[3]);
+    return BitCast(
+        d,
+        VFromD<decltype(du32)>{(__vector unsigned int){
+            static_cast<unsigned int>(sum0 <= 0xFFFFFFFFu ? sum0 : 0xFFFFFFFFu),
+            static_cast<unsigned int>(sum1 <= 0xFFFFFFFFu ? sum1 : 0xFFFFFFFFu),
+            static_cast<unsigned int>(sum2 <= 0xFFFFFFFFu ? sum2 : 0xFFFFFFFFu),
+            static_cast<unsigned int>(sum3 <= 0xFFFFFFFFu ? sum3
+                                                          : 0xFFFFFFFFu)}});
+  } else  // NOLINT
+#endif
+  {
+    return BitCast(d, VFromD<decltype(du32)>{vec_vsum4ubs(a, b)});
+  }
 }
 
-//
-template <
-HWY_INLINE
-
-
-
-const
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+// Casts nominally int32_t result to D.
+template <class D>
+HWY_INLINE VFromD<D> AltivecVsum2sws(D d, __vector signed int a,
+                                     __vector signed int b) {
+  const Repartition<int32_t, D> di32;
+#ifdef __OPTIMIZE__
+  const Repartition<uint64_t, D> du64;
+  constexpr int kDestLaneOffset = HWY_IS_BIG_ENDIAN;
+  if (IsConstantRawAltivecVect(a) && __builtin_constant_p(b[kDestLaneOffset]) &&
+      __builtin_constant_p(b[kDestLaneOffset + 2])) {
+    const int64_t sum0 = static_cast<int64_t>(a[0]) +
+                         static_cast<int64_t>(a[1]) +
+                         static_cast<int64_t>(b[kDestLaneOffset]);
+    const int64_t sum1 = static_cast<int64_t>(a[2]) +
+                         static_cast<int64_t>(a[3]) +
+                         static_cast<int64_t>(b[kDestLaneOffset + 2]);
+    const int32_t sign0 = static_cast<int32_t>(sum0 >> 63);
+    const int32_t sign1 = static_cast<int32_t>(sum1 >> 63);
+    return BitCast(d, VFromD<decltype(du64)>{(__vector unsigned long long){
+                          (sign0 == (sum0 >> 31))
+                              ? static_cast<uint32_t>(sum0)
+                              : static_cast<uint32_t>(sign0 ^ 0x7FFFFFFF),
+                          (sign1 == (sum1 >> 31))
+                              ? static_cast<uint32_t>(sum1)
+                              : static_cast<uint32_t>(sign1 ^ 0x7FFFFFFF)}});
+  } else  // NOLINT
+#endif
+  {
+    __vector signed int sum;
 
-//
+    // Inline assembly is used for vsum2sws to avoid unnecessary shuffling
+    // on little-endian PowerPC targets as the result of the vsum2sws
+    // instruction will already be in the correct lanes on little-endian
+    // PowerPC targets.
+    __asm__("vsum2sws %0,%1,%2" : "=v"(sum) : "v"(a), "v"(b));
 
-
-
-HWY_INLINE Vec128<T> SumOfLanes(Vec128<T> v10) {
-  const Vec128<T> v01 = Shuffle01(v10);
-  return v10 + v01;
-}
-template <typename T, HWY_IF_T_SIZE(T, 8)>
-HWY_INLINE Vec128<T> MinOfLanes(Vec128<T> v10) {
-  const Vec128<T> v01 = Shuffle01(v10);
-  return Min(v10, v01);
-}
-template <typename T, HWY_IF_T_SIZE(T, 8)>
-HWY_INLINE Vec128<T> MaxOfLanes(Vec128<T> v10) {
-  const Vec128<T> v01 = Shuffle01(v10);
-  return Max(v10, v01);
+    return BitCast(d, VFromD<decltype(di32)>{sum});
+  }
 }
 
 // Casts nominally int32_t result to D.
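The constant-evaluation branches added for AltivecVsum4sbs/AltivecVsum4ubs mirror the saturating behavior of the vsum4sbs/vsum4ubs instructions: each group of four lanes plus the accumulator lane is summed in a wider type and clamped to the destination range. A scalar sketch of the signed case (hypothetical helper, not part of the package):

    #include <cstdint>

    // Sum four int8 lanes plus an int32 accumulator, saturating to int32.
    int32_t SatSum4sbs(const int8_t a[4], int32_t b) {
      const int64_t sum = int64_t{a[0]} + a[1] + a[2] + a[3] + b;
      if (sum > INT32_MAX) return INT32_MAX;
      if (sum < INT32_MIN) return INT32_MIN;
      return static_cast<int32_t>(sum);
    }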
@@ -5238,275 +6363,345 @@ HWY_INLINE Vec128<int32_t, N / 2> AltivecU16SumsOf2(Vec128<uint16_t, N> v) {
|
|
|
5238
6363
|
return AltivecVsum4shs(di32, Xor(BitCast(di16, v), Set(di16, -32768)).raw,
|
|
5239
6364
|
Set(di32, 65536).raw);
|
|
5240
6365
|
}
|
|
6366
|
+
#endif // !HWY_S390X_HAVE_Z14
|
|
6367
|
+
|
|
6368
|
+
// U16->U32 SumsOf2
|
|
6369
|
+
template <class V>
|
|
6370
|
+
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
|
|
6371
|
+
hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
|
|
6372
|
+
const DFromV<V> d;
|
|
6373
|
+
const RepartitionToWide<decltype(d)> dw;
|
|
6374
|
+
|
|
6375
|
+
#if HWY_S390X_HAVE_Z14
|
|
6376
|
+
return VFromD<decltype(dw)>{vec_sum4(v.raw, Zero(d).raw)};
|
|
6377
|
+
#else
|
|
6378
|
+
return BitCast(dw, AltivecU16SumsOf2(v));
|
|
6379
|
+
#endif
|
|
6380
|
+
}
|
|
6381
|
+
|
|
6382
|
+
// I16->I32 SumsOf2
|
|
6383
|
+
template <class V>
|
|
6384
|
+
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
|
|
6385
|
+
hwy::SignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
|
|
6386
|
+
const DFromV<V> d;
|
|
6387
|
+
const RepartitionToWide<decltype(d)> dw;
|
|
6388
|
+
|
|
6389
|
+
#if HWY_S390X_HAVE_Z14
|
|
6390
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
6391
|
+
return BitCast(dw, SumsOf2(hwy::UnsignedTag(), hwy::SizeTag<2>(),
|
|
6392
|
+
BitCast(du, Xor(v, SignBit(d))))) +
|
|
6393
|
+
Set(dw, int32_t{-65536});
|
|
6394
|
+
#else
|
|
6395
|
+
return AltivecVsum4shs(dw, v.raw, Zero(dw).raw);
|
|
6396
|
+
#endif
|
|
6397
|
+
}
|
|
6398
|
+
|
|
6399
|
+
#if HWY_S390X_HAVE_Z14
|
|
6400
|
+
// U32->U64 SumsOf2
|
|
6401
|
+
template <class V>
|
|
6402
|
+
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
|
|
6403
|
+
hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
|
|
6404
|
+
const DFromV<V> d;
|
|
6405
|
+
const RepartitionToWide<decltype(d)> dw;
|
|
6406
|
+
return VFromD<decltype(dw)>{vec_sum2(v.raw, Zero(d).raw)};
|
|
6407
|
+
}
|
|
6408
|
+
|
|
6409
|
+
// I32->I64 SumsOf2
|
|
6410
|
+
template <class V>
|
|
6411
|
+
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
|
|
6412
|
+
hwy::SignedTag /*type_tag*/, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
|
|
6413
|
+
const DFromV<V> d;
|
|
6414
|
+
const RepartitionToWide<decltype(d)> dw;
|
|
6415
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
6416
|
+
|
|
6417
|
+
return BitCast(dw, SumsOf2(hwy::UnsignedTag(), hwy::SizeTag<4>(),
|
|
6418
|
+
BitCast(du, Xor(v, SignBit(d))))) +
|
|
6419
|
+
Set(dw, int64_t{-4294967296LL});
|
|
6420
|
+
}
|
|
6421
|
+
#endif
|
|
6422
|
+
|
|
6423
|
+
// U8->U32 SumsOf4
|
|
6424
|
+
template <class V>
|
|
6425
|
+
HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
|
|
6426
|
+
hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
|
|
6427
|
+
const DFromV<V> d;
|
|
6428
|
+
const RepartitionToWideX2<decltype(d)> dw2;
|
|
6429
|
+
|
|
6430
|
+
#if HWY_S390X_HAVE_Z14
|
|
6431
|
+
return VFromD<decltype(dw2)>{vec_sum4(v.raw, Zero(d).raw)};
|
|
6432
|
+
#else
|
|
6433
|
+
return AltivecVsum4ubs(dw2, v.raw, Zero(dw2).raw);
|
|
6434
|
+
#endif
|
|
6435
|
+
}
|
|
6436
|
+
|
|
6437
|
+
// I8->I32 SumsOf4
|
|
6438
|
+
template <class V>
|
|
6439
|
+
HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
|
|
6440
|
+
hwy::SignedTag /*type_tag*/, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
|
|
6441
|
+
const DFromV<V> d;
|
|
6442
|
+
const RepartitionToWideX2<decltype(d)> dw2;
|
|
6443
|
+
|
|
6444
|
+
#if HWY_S390X_HAVE_Z14
|
|
6445
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
6446
|
+
return BitCast(dw2, SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<1>(),
|
|
6447
|
+
BitCast(du, Xor(v, SignBit(d))))) +
|
|
6448
|
+
Set(dw2, int32_t{-512});
|
|
6449
|
+
#else
|
|
6450
|
+
return AltivecVsum4sbs(dw2, v.raw, Zero(dw2).raw);
|
|
6451
|
+
#endif
|
|
6452
|
+
}
|
|
6453
|
+
|
|
6454
|
+
// U16->U64 SumsOf4
|
|
6455
|
+
template <class V>
|
|
6456
|
+
HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
|
|
6457
|
+
hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
|
|
6458
|
+
const DFromV<V> d;
|
|
6459
|
+
const RepartitionToWide<decltype(d)> dw;
|
|
6460
|
+
const RepartitionToWide<decltype(dw)> dw2;
|
|
6461
|
+
|
|
6462
|
+
#if HWY_S390X_HAVE_Z14
|
|
6463
|
+
return VFromD<decltype(dw2)>{vec_sum2(v.raw, Zero(d).raw)};
|
|
6464
|
+
#else
|
|
6465
|
+
const RebindToSigned<decltype(dw)> dw_i;
|
|
6466
|
+
return AltivecVsum2sws(dw2, BitCast(dw_i, SumsOf2(v)).raw, Zero(dw_i).raw);
|
|
6467
|
+
#endif
|
|
6468
|
+
}
|
|
6469
|
+
|
|
6470
|
+
// I16->I64 SumsOf4
|
|
6471
|
+
template <class V>
|
|
6472
|
+
HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
|
|
6473
|
+
hwy::SignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
|
|
6474
|
+
const DFromV<V> d;
|
|
6475
|
+
const RepartitionToWide<decltype(d)> dw;
|
|
6476
|
+
const RepartitionToWide<decltype(dw)> dw2;
|
|
6477
|
+
|
|
6478
|
+
#if HWY_S390X_HAVE_Z14
|
|
6479
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
6480
|
+
return BitCast(dw2, SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<2>(),
|
|
6481
|
+
BitCast(du, Xor(v, SignBit(d))))) +
|
|
6482
|
+
Set(dw2, int64_t{-131072});
|
|
6483
|
+
#else // VSX
|
|
6484
|
+
const auto sums_of_4_in_lo32 =
|
|
6485
|
+
AltivecVsum2sws(dw, SumsOf2(v).raw, Zero(dw).raw);
|
|
6486
|
+
|
|
6487
|
+
#if HWY_IS_LITTLE_ENDIAN
|
|
6488
|
+
return PromoteEvenTo(dw2, sums_of_4_in_lo32);
|
|
6489
|
+
#else
|
|
6490
|
+
return PromoteOddTo(dw2, sums_of_4_in_lo32);
|
|
6491
|
+
#endif // HWY_IS_LITTLE_ENDIAN
|
|
6492
|
+
#endif // HWY_S390X_HAVE_Z14
|
|
6493
|
+
}
|
|
6494
|
+
|
|
6495
|
+
} // namespace detail
|
|
6496
|
+
|
|
6497
|
+
// ------------------------------ SumOfLanes
|
|
6498
|
+
|
|
6499
|
+
// We define SumOfLanes for 8/16-bit types (and I32/U32/I64/U64 on Z14/Z15/Z16);
|
|
6500
|
+
// enable generic for the rest.
|
|
6501
|
+
#undef HWY_IF_SUM_OF_LANES_D
|
|
6502
|
+
#if HWY_S390X_HAVE_Z14
|
|
6503
|
+
#define HWY_IF_SUM_OF_LANES_D(D) HWY_IF_LANES_GT_D(D, 1), HWY_IF_FLOAT3264_D(D)
|
|
6504
|
+
#else
|
|
6505
|
+
#define HWY_IF_SUM_OF_LANES_D(D) \
|
|
6506
|
+
HWY_IF_LANES_GT_D(D, 1), HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))
|
|
6507
|
+
#endif
|
|
6508
|
+
|
|
6509
|
+
#if HWY_S390X_HAVE_Z14
|
|
6510
|
+
namespace detail {
|
|
6511
|
+
|
|
6512
|
+
template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
|
|
6513
|
+
HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))>
|
|
6514
|
+
HWY_INLINE Vec128<T> SumOfU32OrU64LanesAsU128(Vec128<T> v) {
|
|
6515
|
+
const DFromV<decltype(v)> d;
|
|
6516
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
6517
|
+
return BitCast(
|
|
6518
|
+
d, Vec128<uint8_t>{vec_sum_u128(BitCast(du, v).raw, Zero(du).raw)});
|
|
6519
|
+
}
|
|
5241
6520
|
|
|
5242
|
-
|
|
6521
|
+
} // namespace detail
|
|
6522
|
+
|
|
6523
|
+
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI64_D(D)>
|
|
6524
|
+
HWY_API VFromD<D> SumOfLanes(D /*d64*/, VFromD<D> v) {
|
|
6525
|
+
return Broadcast<1>(detail::SumOfU32OrU64LanesAsU128(v));
|
|
6526
|
+
}
|
|
6527
|
+
#endif
|
|
6528
|
+
|
|
6529
|
+
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U16_D(D)>
|
|
6530
|
+
HWY_API Vec32<uint16_t> SumOfLanes(D du16, Vec32<uint16_t> v) {
|
|
5243
6531
|
constexpr int kSumLaneIdx = HWY_IS_BIG_ENDIAN;
|
|
5244
|
-
|
|
5245
|
-
|
|
6532
|
+
return Broadcast<kSumLaneIdx>(
|
|
6533
|
+
BitCast(du16, detail::SumsOf2(hwy::UnsignedTag(), hwy::SizeTag<2>(), v)));
|
|
5246
6534
|
}
|
|
5247
6535
|
|
|
5248
|
-
|
|
6536
|
+
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)>
|
|
6537
|
+
HWY_API Vec64<uint16_t> SumOfLanes(D du16, Vec64<uint16_t> v) {
|
|
5249
6538
|
constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
|
|
5250
|
-
const Full64<uint16_t> du16;
|
|
5251
|
-
const auto zero = Zero(Full128<int32_t>());
|
|
5252
6539
|
return Broadcast<kSumLaneIdx>(
|
|
5253
|
-
|
|
6540
|
+
BitCast(du16, detail::SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<2>(), v)));
|
|
5254
6541
|
}
|
|
5255
6542
|
|
|
5256
|
-
|
|
6543
|
+
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
|
|
6544
|
+
HWY_API Vec128<uint16_t> SumOfLanes(D du16, Vec128<uint16_t> v) {
|
|
5257
6545
|
constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
|
|
5258
|
-
|
|
6546
|
+
#if HWY_S390X_HAVE_Z14
|
|
6547
|
+
return Broadcast<kSumLaneIdx>(
|
|
6548
|
+
BitCast(du16, detail::SumOfU32OrU64LanesAsU128(detail::SumsOf4(
|
|
6549
|
+
hwy::UnsignedTag(), hwy::SizeTag<2>(), v))));
|
|
6550
|
+
#else // VSX
|
|
5259
6551
|
const auto zero = Zero(Full128<int32_t>());
|
|
5260
6552
|
return Broadcast<kSumLaneIdx>(
|
|
5261
|
-
AltivecVsumsws(du16, AltivecU16SumsOf2(v).raw, zero.raw));
|
|
6553
|
+
detail::AltivecVsumsws(du16, detail::AltivecU16SumsOf2(v).raw, zero.raw));
|
|
6554
|
+
#endif
|
|
5262
6555
|
}
|
|
5263
6556
|
|
|
5264
|
-
|
|
6557
|
+
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I16_D(D)>
|
|
6558
|
+
HWY_API Vec32<int16_t> SumOfLanes(D di16, Vec32<int16_t> v) {
|
|
6559
|
+
#if HWY_S390X_HAVE_Z14
|
|
6560
|
+
const RebindToUnsigned<decltype(di16)> du16;
|
|
6561
|
+
return BitCast(di16, SumOfLanes(du16, BitCast(du16, v)));
|
|
6562
|
+
#else
|
|
5265
6563
|
constexpr int kSumLaneIdx = HWY_IS_BIG_ENDIAN;
|
|
5266
|
-
|
|
5267
|
-
|
|
5268
|
-
|
|
6564
|
+
return Broadcast<kSumLaneIdx>(
|
|
6565
|
+
BitCast(di16, detail::SumsOf2(hwy::SignedTag(), hwy::SizeTag<2>(), v)));
|
|
6566
|
+
#endif
|
|
5269
6567
|
}
|
|
5270
6568
|
|
|
5271
|
-
|
|
6569
|
+
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I16_D(D)>
|
|
6570
|
+
HWY_API Vec64<int16_t> SumOfLanes(D di16, Vec64<int16_t> v) {
|
|
6571
|
+
#if HWY_S390X_HAVE_Z14
|
|
6572
|
+
const RebindToUnsigned<decltype(di16)> du16;
|
|
6573
|
+
return BitCast(di16, SumOfLanes(du16, BitCast(du16, v)));
|
|
6574
|
+
#else
|
|
5272
6575
|
constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
|
|
5273
|
-
|
|
5274
|
-
|
|
5275
|
-
|
|
5276
|
-
return Broadcast<kSumLaneIdx>(AltivecVsum2sws(
|
|
5277
|
-
di16, AltivecVsum4shs(di32, v.raw, zero.raw).raw, zero.raw));
|
|
6576
|
+
return Broadcast<kSumLaneIdx>(
|
|
6577
|
+
BitCast(di16, detail::SumsOf4(hwy::SignedTag(), hwy::SizeTag<2>(), v)));
|
|
6578
|
+
#endif
|
|
5278
6579
|
}
|
|
5279
6580
|
|
|
5280
|
-
|
|
6581
|
+
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
|
|
6582
|
+
HWY_API Vec128<int16_t> SumOfLanes(D di16, Vec128<int16_t> v) {
|
|
6583
|
+
#if HWY_S390X_HAVE_Z14
|
|
6584
|
+
const RebindToUnsigned<decltype(di16)> du16;
|
|
6585
|
+
return BitCast(di16, SumOfLanes(du16, BitCast(du16, v)));
|
|
6586
|
+
#else
|
|
5281
6587
|
constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
|
|
5282
|
-
const Full128<int16_t> di16;
|
|
5283
6588
|
const Full128<int32_t> di32;
|
|
5284
6589
|
const auto zero = Zero(di32);
|
|
5285
|
-
return Broadcast<kSumLaneIdx>(AltivecVsumsws(
|
|
5286
|
-
di16, AltivecVsum4shs(di32, v.raw, zero.raw).raw, zero.raw));
|
|
6590
|
+
return Broadcast<kSumLaneIdx>(detail::AltivecVsumsws(
|
|
6591
|
+
di16, detail::AltivecVsum4shs(di32, v.raw, zero.raw).raw, zero.raw));
|
|
6592
|
+
#endif
|
|
5287
6593
|
}
|
|
5288
6594
|
|
|
5289
|
-
|
|
5290
|
-
HWY_API
|
|
6595
|
+
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U8_D(D)>
|
|
6596
|
+
HWY_API Vec32<uint8_t> SumOfLanes(D du8, Vec32<uint8_t> v) {
|
|
5291
6597
|
constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
|
|
5292
|
-
|
|
5293
|
-
|
|
5294
|
-
const Twice<decltype(du8)> dt_u8;
|
|
5295
|
-
const Twice<decltype(du16)> dt_u16;
|
|
5296
|
-
const Full128<uint32_t> du32;
|
|
5297
|
-
return LowerHalf(Broadcast<kSumLaneIdx>(AltivecVsum4ubs(
|
|
5298
|
-
dt_u8, BitCast(dt_u8, Combine(dt_u16, Zero(du16), BitCast(du16, v))).raw,
|
|
5299
|
-
Zero(du32).raw)));
|
|
6598
|
+
return Broadcast<kSumLaneIdx>(
|
|
6599
|
+
BitCast(du8, detail::SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<1>(), v)));
|
|
5300
6600
|
}
|
|
5301
6601
|
|
|
5302
|
-
|
|
5303
|
-
|
|
5304
|
-
const
|
|
5305
|
-
|
|
5306
|
-
return Broadcast<kSumLaneIdx>(AltivecVsum4ubs(du8, v.raw, Zero(du32).raw));
|
|
6602
|
+
template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_U8_D(D)>
|
|
6603
|
+
HWY_API Vec16<uint8_t> SumOfLanes(D du8, Vec16<uint8_t> v) {
|
|
6604
|
+
const Twice<decltype(du8)> dt_u8;
|
|
6605
|
+
return LowerHalf(du8, SumOfLanes(dt_u8, Combine(dt_u8, Zero(du8), v)));
|
|
5307
6606
|
}
|
|
5308
6607
|
|
|
5309
|
-
|
|
6608
|
+
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
|
|
6609
|
+
HWY_API Vec64<uint8_t> SumOfLanes(D du8, Vec64<uint8_t> v) {
|
|
5310
6610
|
constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
|
|
5311
|
-
const Full64<uint8_t> du8;
|
|
5312
6611
|
return Broadcast<kSumLaneIdx>(BitCast(du8, SumsOf8(v)));
|
|
5313
6612
|
}
|
|
5314
6613
|
|
|
5315
|
-
|
|
6614
|
+
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
|
|
6615
|
+
HWY_API Vec128<uint8_t> SumOfLanes(D du8, Vec128<uint8_t> v) {
|
|
5316
6616
|
constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 15;
|
|
5317
6617
|
|
|
6618
|
+
#if HWY_S390X_HAVE_Z14
|
|
6619
|
+
return Broadcast<kSumLaneIdx>(
|
|
6620
|
+
BitCast(du8, detail::SumOfU32OrU64LanesAsU128(detail::SumsOf4(
|
|
6621
|
+
hwy::UnsignedTag(), hwy::SizeTag<1>(), v))));
|
|
6622
|
+
#else
|
|
5318
6623
|
const Full128<uint32_t> du32;
|
|
5319
6624
|
const RebindToSigned<decltype(du32)> di32;
|
|
5320
|
-
const Full128<uint8_t> du8;
|
|
5321
6625
|
const Vec128<uint32_t> zero = Zero(du32);
|
|
5322
|
-
return Broadcast<kSumLaneIdx>(
|
|
5323
|
-
|
|
5324
|
-
|
|
6626
|
+
return Broadcast<kSumLaneIdx>(detail::AltivecVsumsws(
|
|
6627
|
+
du8, detail::AltivecVsum4ubs(di32, v.raw, zero.raw).raw,
|
|
6628
|
+
BitCast(di32, zero).raw));
|
|
6629
|
+
#endif
|
|
5325
6630
|
}
|
|
5326
6631
|
|
|
5327
|
-
|
|
6632
|
+
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I8_D(D)>
|
|
6633
|
+
HWY_API Vec32<int8_t> SumOfLanes(D di8, Vec32<int8_t> v) {
|
|
6634
|
+
#if HWY_S390X_HAVE_Z14
|
|
6635
|
+
const RebindToUnsigned<decltype(di8)> du8;
|
|
6636
|
+
return BitCast(di8, SumOfLanes(du8, BitCast(du8, v)));
|
|
6637
|
+
#else
|
|
5328
6638
|
constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
|
|
5329
|
-
|
|
5330
|
-
|
|
5331
|
-
|
|
5332
|
-
const Repartition<int8_t, decltype(du16)> di8;
|
|
5333
|
-
const Vec128<int8_t> zzvv = BitCast(
|
|
5334
|
-
di8, InterleaveLower(BitCast(du16, Vec128<int8_t>{v.raw}), Zero(du16)));
|
|
5335
|
-
return Vec16<int8_t>{
|
|
5336
|
-
Broadcast<kSumLaneIdx>(AltivecVsum4sbs(di8, zzvv.raw, Zero(di32).raw))
|
|
5337
|
-
.raw};
|
|
6639
|
+
return Broadcast<kSumLaneIdx>(
|
|
6640
|
+
BitCast(di8, detail::SumsOf4(hwy::SignedTag(), hwy::SizeTag<1>(), v)));
|
|
6641
|
+
#endif
|
|
5338
6642
|
}
|
|
5339
6643
|
|
|
5340
|
-
|
|
5341
|
-
|
|
5342
|
-
const
|
|
5343
|
-
|
|
5344
|
-
return Broadcast<kSumLaneIdx>(AltivecVsum4sbs(di8, v.raw, zero.raw));
|
|
6644
|
+
template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_I8_D(D)>
|
|
6645
|
+
HWY_API Vec16<int8_t> SumOfLanes(D di8, Vec16<int8_t> v) {
|
|
6646
|
+
const Twice<decltype(di8)> dt_i8;
|
|
6647
|
+
return LowerHalf(di8, SumOfLanes(dt_i8, Combine(dt_i8, Zero(di8), v)));
|
|
5345
6648
|
}
|
|
5346
6649
|
|
|
5347
|
-
|
|
6650
|
+
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I8_D(D)>
|
|
6651
|
+
HWY_API Vec64<int8_t> SumOfLanes(D di8, Vec64<int8_t> v) {
|
|
6652
|
+
#if HWY_S390X_HAVE_Z14
|
|
6653
|
+
const RebindToUnsigned<decltype(di8)> du8;
|
|
6654
|
+
return BitCast(di8, SumOfLanes(du8, BitCast(du8, v)));
|
|
6655
|
+
#else
|
|
5348
6656
|
constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
|
|
5349
|
-
|
|
5350
|
-
|
|
5351
|
-
const Full64<int8_t> di8;
|
|
5352
|
-
return Broadcast<kSumLaneIdx>(AltivecVsum2sws(
|
|
5353
|
-
di8, AltivecVsum4sbs(di32, v.raw, zero.raw).raw, zero.raw));
|
|
6657
|
+
return Broadcast<kSumLaneIdx>(BitCast(di8, SumsOf8(v)));
|
|
6658
|
+
#endif
|
|
5354
6659
|
}
|
|
5355
6660
|
|
|
5356
|
-
|
|
6661
|
+
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I8_D(D)>
|
|
6662
|
+
HWY_API Vec128<int8_t> SumOfLanes(D di8, Vec128<int8_t> v) {
|
|
6663
|
+
#if HWY_S390X_HAVE_Z14
|
|
6664
|
+
const RebindToUnsigned<decltype(di8)> du8;
|
|
6665
|
+
return BitCast(di8, SumOfLanes(du8, BitCast(du8, v)));
|
|
6666
|
+
#else
|
|
5357
6667
|
constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 15;
|
|
5358
|
-
const Full128<int8_t> di8;
|
|
5359
6668
|
const Full128<int32_t> di32;
|
|
5360
6669
|
const Vec128<int32_t> zero = Zero(di32);
|
|
5361
|
-
return Broadcast<kSumLaneIdx>(AltivecVsumsws(
|
|
5362
|
-
di8, AltivecVsum4sbs(di32, v.raw, zero.raw).raw, zero.raw));
|
|
5363
|
-
|
|
5364
|
-
|
|
5365
|
-
template <size_t N, HWY_IF_V_SIZE_GT(uint8_t, N, 4)>
|
|
5366
|
-
HWY_API Vec128<uint8_t, N> MaxOfLanes(Vec128<uint8_t, N> v) {
|
|
5367
|
-
const DFromV<decltype(v)> d;
|
|
5368
|
-
const RepartitionToWide<decltype(d)> d16;
|
|
5369
|
-
const RepartitionToWide<decltype(d16)> d32;
|
|
5370
|
-
Vec128<uint8_t, N> vm = Max(v, Reverse2(d, v));
|
|
5371
|
-
vm = Max(vm, BitCast(d, Reverse2(d16, BitCast(d16, vm))));
|
|
5372
|
-
vm = Max(vm, BitCast(d, Reverse2(d32, BitCast(d32, vm))));
|
|
5373
|
-
if (N > 8) {
|
|
5374
|
-
const RepartitionToWide<decltype(d32)> d64;
|
|
5375
|
-
vm = Max(vm, BitCast(d, Reverse2(d64, BitCast(d64, vm))));
|
|
5376
|
-
}
|
|
5377
|
-
return vm;
|
|
6670
|
+
return Broadcast<kSumLaneIdx>(detail::AltivecVsumsws(
|
|
6671
|
+
di8, detail::AltivecVsum4sbs(di32, v.raw, zero.raw).raw, zero.raw));
|
|
6672
|
+
#endif
|
|
5378
6673
|
}
|
|
5379
6674
|
|
|
5380
|
-
|
|
5381
|
-
|
|
5382
|
-
|
|
5383
|
-
const
|
|
5384
|
-
|
|
5385
|
-
|
|
5386
|
-
|
|
5387
|
-
vm = Min(vm, BitCast(d, Reverse2(d32, BitCast(d32, vm))));
|
|
5388
|
-
if (N > 8) {
|
|
5389
|
-
const RepartitionToWide<decltype(d32)> d64;
|
|
5390
|
-
vm = Min(vm, BitCast(d, Reverse2(d64, BitCast(d64, vm))));
|
|
5391
|
-
}
|
|
5392
|
-
return vm;
|
|
6675
|
+
#if HWY_S390X_HAVE_Z14
|
|
6676
|
+
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_UI32_D(D)>
|
|
6677
|
+
HWY_API VFromD<D> SumOfLanes(D d32, VFromD<D> v) {
|
|
6678
|
+
const RebindToUnsigned<decltype(d32)> du32;
|
|
6679
|
+
return Broadcast<1>(
|
|
6680
|
+
BitCast(d32, detail::SumsOf2(hwy::UnsignedTag(), hwy::SizeTag<4>(),
|
|
6681
|
+
BitCast(du32, v))));
|
|
5393
6682
|
}
|
|
5394
6683
|
|
|
5395
|
-
template <
|
|
5396
|
-
HWY_API
|
|
5397
|
-
|
|
5398
|
-
const RepartitionToWide<decltype(d)> d16;
|
|
5399
|
-
const RepartitionToWide<decltype(d16)> d32;
|
|
5400
|
-
Vec128<int8_t, N> vm = Max(v, Reverse2(d, v));
|
|
5401
|
-
vm = Max(vm, BitCast(d, Reverse2(d16, BitCast(d16, vm))));
|
|
5402
|
-
vm = Max(vm, BitCast(d, Reverse2(d32, BitCast(d32, vm))));
|
|
5403
|
-
if (N > 8) {
|
|
5404
|
-
const RepartitionToWide<decltype(d32)> d64;
|
|
5405
|
-
vm = Max(vm, BitCast(d, Reverse2(d64, BitCast(d64, vm))));
|
|
5406
|
-
}
|
|
5407
|
-
return vm;
|
|
6684
|
+
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)>
|
|
6685
|
+
HWY_API VFromD<D> SumOfLanes(D /*d32*/, VFromD<D> v) {
|
|
6686
|
+
return Broadcast<3>(detail::SumOfU32OrU64LanesAsU128(v));
|
|
5408
6687
|
}
|
|
6688
|
+
#endif
|
|
5409
6689
|
|
|
5410
|
-
|
|
5411
|
-
HWY_API Vec128<int8_t, N> MinOfLanes(Vec128<int8_t, N> v) {
|
|
5412
|
-
const DFromV<decltype(v)> d;
|
|
5413
|
-
const RepartitionToWide<decltype(d)> d16;
|
|
5414
|
-
const RepartitionToWide<decltype(d16)> d32;
|
|
5415
|
-
Vec128<int8_t, N> vm = Min(v, Reverse2(d, v));
|
|
5416
|
-
vm = Min(vm, BitCast(d, Reverse2(d16, BitCast(d16, vm))));
|
|
5417
|
-
vm = Min(vm, BitCast(d, Reverse2(d32, BitCast(d32, vm))));
|
|
5418
|
-
if (N > 8) {
|
|
5419
|
-
const RepartitionToWide<decltype(d32)> d64;
|
|
5420
|
-
vm = Min(vm, BitCast(d, Reverse2(d64, BitCast(d64, vm))));
|
|
5421
|
-
}
|
|
5422
|
-
return vm;
|
|
5423
|
-
}
|
|
6690
|
+
// generic_ops defines MinOfLanes and MaxOfLanes.
|
|
5424
6691
|
|
|
5425
|
-
|
|
5426
|
-
HWY_API Vec128<uint16_t, N> MinOfLanes(Vec128<uint16_t, N> v) {
|
|
5427
|
-
const Simd<uint16_t, N, 0> d;
|
|
5428
|
-
const RepartitionToWide<decltype(d)> d32;
|
|
5429
|
-
#if HWY_IS_LITTLE_ENDIAN
|
|
5430
|
-
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
|
|
5431
|
-
const auto odd = ShiftRight<16>(BitCast(d32, v));
|
|
5432
|
-
#else
|
|
5433
|
-
const auto even = ShiftRight<16>(BitCast(d32, v));
|
|
5434
|
-
const auto odd = And(BitCast(d32, v), Set(d32, 0xFFFF));
|
|
5435
|
-
#endif
|
|
5436
|
-
const auto min = MinOfLanes(Min(even, odd));
|
|
5437
|
-
// Also broadcast into odd lanes on little-endian and into even lanes
|
|
5438
|
-
// on big-endian
|
|
5439
|
-
return Vec128<uint16_t, N>{vec_pack(min.raw, min.raw)};
|
|
5440
|
-
}
|
|
5441
|
-
template <size_t N, HWY_IF_V_SIZE_GT(int16_t, N, 2)>
|
|
5442
|
-
HWY_API Vec128<int16_t, N> MinOfLanes(Vec128<int16_t, N> v) {
|
|
5443
|
-
const Simd<int16_t, N, 0> d;
|
|
5444
|
-
const RepartitionToWide<decltype(d)> d32;
|
|
5445
|
-
// Sign-extend
|
|
5446
|
-
#if HWY_IS_LITTLE_ENDIAN
|
|
5447
|
-
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
|
|
5448
|
-
const auto odd = ShiftRight<16>(BitCast(d32, v));
|
|
5449
|
-
#else
|
|
5450
|
-
const auto even = ShiftRight<16>(BitCast(d32, v));
|
|
5451
|
-
const auto odd = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
|
|
5452
|
-
#endif
|
|
5453
|
-
const auto min = MinOfLanes(Min(even, odd));
|
|
5454
|
-
// Also broadcast into odd lanes on little-endian and into even lanes
|
|
5455
|
-
// on big-endian
|
|
5456
|
-
return Vec128<int16_t, N>{vec_pack(min.raw, min.raw)};
|
|
5457
|
-
}
|
|
6692
|
+
// ------------------------------ ReduceSum for N=4 I8/U8
|
|
5458
6693
|
|
|
5459
|
-
|
|
5460
|
-
|
|
5461
|
-
|
|
5462
|
-
|
|
5463
|
-
#if HWY_IS_LITTLE_ENDIAN
|
|
5464
|
-
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
|
|
5465
|
-
const auto odd = ShiftRight<16>(BitCast(d32, v));
|
|
5466
|
-
#else
|
|
5467
|
-
const auto even = ShiftRight<16>(BitCast(d32, v));
|
|
5468
|
-
const auto odd = And(BitCast(d32, v), Set(d32, 0xFFFF));
|
|
5469
|
-
#endif
|
|
5470
|
-
const auto max = MaxOfLanes(Max(even, odd));
|
|
5471
|
-
// Also broadcast into odd lanes.
|
|
5472
|
-
return Vec128<uint16_t, N>{vec_pack(max.raw, max.raw)};
|
|
5473
|
-
}
|
|
5474
|
-
template <size_t N, HWY_IF_V_SIZE_GT(int16_t, N, 2)>
|
|
5475
|
-
HWY_API Vec128<int16_t, N> MaxOfLanes(Vec128<int16_t, N> v) {
|
|
5476
|
-
const Simd<int16_t, N, 0> d;
|
|
5477
|
-
const RepartitionToWide<decltype(d)> d32;
|
|
5478
|
-
// Sign-extend
|
|
5479
|
-
#if HWY_IS_LITTLE_ENDIAN
|
|
5480
|
-
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
|
|
5481
|
-
const auto odd = ShiftRight<16>(BitCast(d32, v));
|
|
6694
|
+
// GetLane(SumsOf4(v)) is more efficient on PPC/Z14 than the default N=4
|
|
6695
|
+
// I8/U8 ReduceSum implementation in generic_ops-inl.h
|
|
6696
|
+
#ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
|
|
6697
|
+
#undef HWY_NATIVE_REDUCE_SUM_4_UI8
|
|
5482
6698
|
#else
|
|
5483
|
-
|
|
5484
|
-
const auto odd = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
|
|
6699
|
+
#define HWY_NATIVE_REDUCE_SUM_4_UI8
|
|
5485
6700
|
#endif
|
|
5486
|
-
const auto max = MaxOfLanes(Max(even, odd));
|
|
5487
|
-
// Also broadcast into odd lanes on little-endian and into even lanes
|
|
5488
|
-
// on big-endian
|
|
5489
|
-
return Vec128<int16_t, N>{vec_pack(max.raw, max.raw)};
|
|
5490
|
-
}
|
|
5491
|
-
|
|
5492
|
-
} // namespace detail
|
|
5493
6701
|
|
|
5494
|
-
|
|
5495
|
-
|
|
5496
|
-
|
|
5497
|
-
return detail::SumOfLanes(v);
|
|
5498
|
-
}
|
|
5499
|
-
template <class D>
|
|
5500
|
-
HWY_API TFromD<D> ReduceSum(D /* tag */, VFromD<D> v) {
|
|
5501
|
-
return GetLane(detail::SumOfLanes(v));
|
|
5502
|
-
}
|
|
5503
|
-
template <class D>
|
|
5504
|
-
HWY_API VFromD<D> MinOfLanes(D /* tag */, VFromD<D> v) {
|
|
5505
|
-
return detail::MinOfLanes(v);
|
|
5506
|
-
}
|
|
5507
|
-
template <class D>
|
|
5508
|
-
HWY_API VFromD<D> MaxOfLanes(D /* tag */, VFromD<D> v) {
|
|
5509
|
-
return detail::MaxOfLanes(v);
|
|
6702
|
+
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
|
|
6703
|
+
HWY_API TFromD<D> ReduceSum(D /*d*/, VFromD<D> v) {
|
|
6704
|
+
return static_cast<TFromD<D>>(GetLane(SumsOf4(v)));
|
|
5510
6705
|
}
|
|
5511
6706
|
|
|
5512
6707
|
// ------------------------------ Lt128
|
|
@@ -5672,7 +6867,20 @@ HWY_API V Max128Upper(D d, const V a, const V b) {
 
 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
 HWY_API V LeadingZeroCount(V v) {
+#if HWY_S390X_HAVE_Z14
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
+
+#if HWY_COMPILER_GCC_ACTUAL && defined(__OPTIMIZE__)
+  // Work around for GCC compiler bug in vec_cnttz on Z14/Z15 if v[i] is a
+  // constant
+  __asm__("" : "+v"(v.raw));
+#endif
+
+  return BitCast(d, VFromD<decltype(du)>{vec_cntlz(BitCast(du, v).raw)});
+#else
   return V{vec_cntlz(v.raw)};
+#endif
 }
 
 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
@@ -5682,14 +6890,27 @@ HWY_API V HighestSetBitIndex(V v) {
   return BitCast(d, Set(d, T{sizeof(T) * 8 - 1}) - LeadingZeroCount(v));
 }
 
-#if HWY_PPC_HAVE_9
+#if HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14
 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
 HWY_API V TrailingZeroCount(V v) {
 #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
   return V{vec_vctz(v.raw)};
 #else
-
+#if HWY_S390X_HAVE_Z14
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
+
+#if HWY_COMPILER_GCC_ACTUAL && defined(__OPTIMIZE__)
+  // Work around for GCC compiler bug in vec_cnttz on Z14/Z15 if v[i] is a
+  // constant
+  __asm__("" : "+v"(v.raw));
 #endif
+
+  return BitCast(d, VFromD<decltype(du)>{vec_cnttz(BitCast(du, v).raw)});
+#else
+  return V{vec_cnttz(v.raw)};
+#endif  // HWY_S390X_HAVE_Z14
+#endif  // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
 }
 #else
 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
@@ -5709,6 +6930,8 @@ HWY_API V TrailingZeroCount(V v) {
 
 #undef HWY_PPC_HAVE_9
 #undef HWY_PPC_HAVE_10
+#undef HWY_S390X_HAVE_Z14
+#undef HWY_S390X_HAVE_Z15
 
 // NOLINTNEXTLINE(google-readability-namespace-comments)
 } // namespace HWY_NAMESPACE