@img/sharp-libvips-dev 1.0.0 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/include/aom/aom_encoder.h +3 -3
- package/include/aom/aomcx.h +17 -8
- package/include/expat.h +21 -10
- package/include/expat_config.h +11 -5
- package/include/ffi.h +12 -25
- package/include/fontconfig/fontconfig.h +5 -3
- package/include/freetype2/freetype/config/ftoption.h +1 -1
- package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -7
- package/include/gio-unix-2.0/gio/gunixinputstream.h +0 -5
- package/include/gio-unix-2.0/gio/gunixoutputstream.h +0 -5
- package/include/glib-2.0/gio/gappinfo.h +0 -7
- package/include/glib-2.0/gio/gapplication.h +6 -0
- package/include/glib-2.0/gio/gapplicationcommandline.h +12 -1
- package/include/glib-2.0/gio/gasyncinitable.h +0 -7
- package/include/glib-2.0/gio/gasyncresult.h +0 -6
- package/include/glib-2.0/gio/gbufferedinputstream.h +0 -5
- package/include/glib-2.0/gio/gbufferedoutputstream.h +0 -5
- package/include/glib-2.0/gio/gbytesicon.h +0 -5
- package/include/glib-2.0/gio/gcancellable.h +0 -5
- package/include/glib-2.0/gio/gconverter.h +0 -7
- package/include/glib-2.0/gio/gconverterinputstream.h +0 -6
- package/include/glib-2.0/gio/gconverteroutputstream.h +0 -6
- package/include/glib-2.0/gio/gdatagrambased.h +0 -7
- package/include/glib-2.0/gio/gdatainputstream.h +0 -6
- package/include/glib-2.0/gio/gdataoutputstream.h +0 -6
- package/include/glib-2.0/gio/gdbusinterface.h +0 -8
- package/include/glib-2.0/gio/gdbusinterfaceskeleton.h +0 -8
- package/include/glib-2.0/gio/gdbusmessage.h +2 -1
- package/include/glib-2.0/gio/gdbusobjectmanagerclient.h +0 -8
- package/include/glib-2.0/gio/gdbusobjectmanagerserver.h +0 -8
- package/include/glib-2.0/gio/gdbusobjectproxy.h +0 -8
- package/include/glib-2.0/gio/gdbusobjectskeleton.h +0 -8
- package/include/glib-2.0/gio/gdbusproxy.h +0 -8
- package/include/glib-2.0/gio/gdebugcontroller.h +0 -8
- package/include/glib-2.0/gio/gdebugcontrollerdbus.h +0 -7
- package/include/glib-2.0/gio/gdtlsserverconnection.h +0 -8
- package/include/glib-2.0/gio/gemblem.h +0 -5
- package/include/glib-2.0/gio/gemblemedicon.h +0 -5
- package/include/glib-2.0/gio/gfile.h +0 -10
- package/include/glib-2.0/gio/gfileenumerator.h +0 -5
- package/include/glib-2.0/gio/gfileicon.h +0 -5
- package/include/glib-2.0/gio/gfileinfo.h +0 -5
- package/include/glib-2.0/gio/gfileinputstream.h +0 -8
- package/include/glib-2.0/gio/gfileiostream.h +0 -8
- package/include/glib-2.0/gio/gfilemonitor.h +0 -5
- package/include/glib-2.0/gio/gfilenamecompleter.h +0 -5
- package/include/glib-2.0/gio/gfileoutputstream.h +0 -8
- package/include/glib-2.0/gio/gfilterinputstream.h +0 -5
- package/include/glib-2.0/gio/gfilteroutputstream.h +0 -5
- package/include/glib-2.0/gio/gicon.h +0 -5
- package/include/glib-2.0/gio/ginitable.h +0 -7
- package/include/glib-2.0/gio/ginputstream.h +0 -5
- package/include/glib-2.0/gio/gio-autocleanups.h +4 -0
- package/include/glib-2.0/gio/gio-visibility.h +34 -0
- package/include/glib-2.0/gio/gioenums.h +6 -1
- package/include/glib-2.0/gio/giomodule.h +0 -5
- package/include/glib-2.0/gio/giostream.h +0 -5
- package/include/glib-2.0/gio/giotypes.h +5 -108
- package/include/glib-2.0/gio/gloadableicon.h +0 -6
- package/include/glib-2.0/gio/gmemoryinputstream.h +0 -5
- package/include/glib-2.0/gio/gmemoryoutputstream.h +0 -5
- package/include/glib-2.0/gio/gmountoperation.h +0 -6
- package/include/glib-2.0/gio/gnetworking.h +4 -0
- package/include/glib-2.0/gio/goutputstream.h +0 -9
- package/include/glib-2.0/gio/gpollableinputstream.h +0 -7
- package/include/glib-2.0/gio/gpollableoutputstream.h +0 -7
- package/include/glib-2.0/gio/gproxy.h +0 -7
- package/include/glib-2.0/gio/gproxyaddressenumerator.h +0 -8
- package/include/glib-2.0/gio/gseekable.h +0 -5
- package/include/glib-2.0/gio/gsettingsbackend.h +0 -5
- package/include/glib-2.0/gio/gsimpleactiongroup.h +0 -7
- package/include/glib-2.0/gio/gsimpleasyncresult.h +0 -5
- package/include/glib-2.0/gio/gsimpleproxyresolver.h +0 -5
- package/include/glib-2.0/gio/gsocket.h +13 -0
- package/include/glib-2.0/gio/gsocketaddressenumerator.h +0 -6
- package/include/glib-2.0/gio/gsocketconnectable.h +0 -5
- package/include/glib-2.0/gio/gtask.h +12 -0
- package/include/glib-2.0/gio/gthemedicon.h +0 -5
- package/include/glib-2.0/gio/gtlsserverconnection.h +0 -8
- package/include/glib-2.0/gio/gunixcredentialsmessage.h +0 -8
- package/include/glib-2.0/gio/gvfs.h +0 -5
- package/include/glib-2.0/gio/gvolume.h +2 -2
- package/include/glib-2.0/gio/gvolumemonitor.h +0 -5
- package/include/glib-2.0/girepository/gi-visibility.h +986 -0
- package/include/glib-2.0/girepository/giarginfo.h +100 -0
- package/include/glib-2.0/girepository/gibaseinfo.h +129 -0
- package/include/glib-2.0/girepository/gicallableinfo.h +119 -0
- package/include/glib-2.0/girepository/gicallbackinfo.h +60 -0
- package/include/glib-2.0/girepository/giconstantinfo.h +72 -0
- package/include/glib-2.0/girepository/gienuminfo.h +82 -0
- package/include/glib-2.0/girepository/gifieldinfo.h +84 -0
- package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
- package/include/glib-2.0/girepository/gifunctioninfo.h +117 -0
- package/include/glib-2.0/girepository/giinterfaceinfo.h +120 -0
- package/include/glib-2.0/girepository/giobjectinfo.h +230 -0
- package/include/glib-2.0/girepository/gipropertyinfo.h +77 -0
- package/include/glib-2.0/girepository/giregisteredtypeinfo.h +75 -0
- package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
- package/include/glib-2.0/girepository/girepository.h +247 -0
- package/include/glib-2.0/girepository/girffi.h +129 -0
- package/include/glib-2.0/girepository/gisignalinfo.h +72 -0
- package/include/glib-2.0/girepository/gistructinfo.h +102 -0
- package/include/glib-2.0/girepository/gitypeinfo.h +144 -0
- package/include/glib-2.0/girepository/gitypelib.h +61 -0
- package/include/glib-2.0/girepository/gitypes.h +421 -0
- package/include/glib-2.0/girepository/giunioninfo.h +105 -0
- package/include/glib-2.0/girepository/giunresolvedinfo.h +60 -0
- package/include/glib-2.0/girepository/givalueinfo.h +65 -0
- package/include/glib-2.0/girepository/givfuncinfo.h +88 -0
- package/include/glib-2.0/glib/deprecated/gcompletion.h +1 -1
- package/include/glib-2.0/glib/deprecated/grel.h +0 -23
- package/include/glib-2.0/glib/deprecated/gthread.h +10 -6
- package/include/glib-2.0/glib/gatomic.h +20 -20
- package/include/glib-2.0/glib/gbitlock.h +31 -0
- package/include/glib-2.0/glib/gbookmarkfile.h +39 -1
- package/include/glib-2.0/glib/gchecksum.h +0 -10
- package/include/glib-2.0/glib/gdate.h +0 -9
- package/include/glib-2.0/glib/gdatetime.h +33 -1
- package/include/glib-2.0/glib/gdir.h +5 -0
- package/include/glib-2.0/glib/ghmac.h +0 -9
- package/include/glib-2.0/glib/glib-autocleanups.h +4 -0
- package/include/glib-2.0/glib/glib-visibility.h +34 -0
- package/include/glib-2.0/glib/gmacros.h +1 -0
- package/include/glib-2.0/glib/gmessages.h +11 -0
- package/include/glib-2.0/glib/gpathbuf.h +0 -7
- package/include/glib-2.0/glib/gslice.h +2 -0
- package/include/glib-2.0/glib/gstdio.h +1 -1
- package/include/glib-2.0/glib/gstrfuncs.h +24 -18
- package/include/glib-2.0/glib/gstrvbuilder.h +4 -8
- package/include/glib-2.0/glib/gtestutils.h +5 -0
- package/include/glib-2.0/glib/gthread.h +216 -3
- package/include/glib-2.0/glib/gunicode.h +12 -2
- package/include/glib-2.0/glib/gvarianttype.h +1 -10
- package/include/glib-2.0/glib/gversionmacros.h +9 -0
- package/include/glib-2.0/glib/gwin32.h +4 -4
- package/include/glib-2.0/glib-unix.h +214 -0
- package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
- package/include/glib-2.0/gobject/gbinding.h +0 -8
- package/include/glib-2.0/gobject/gbindinggroup.h +0 -8
- package/include/glib-2.0/gobject/gclosure.h +1 -9
- package/include/glib-2.0/gobject/genums.h +6 -6
- package/include/glib-2.0/gobject/glib-types.h +44 -0
- package/include/glib-2.0/gobject/gobject-autocleanups.h +4 -0
- package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
- package/include/glib-2.0/gobject/gobject.h +1 -16
- package/include/glib-2.0/gobject/gparam.h +3 -12
- package/include/glib-2.0/gobject/gsignal.h +16 -6
- package/include/glib-2.0/gobject/gsignalgroup.h +0 -8
- package/include/glib-2.0/gobject/gtype.h +53 -20
- package/include/glib-2.0/gobject/gtypemodule.h +0 -7
- package/include/glib-2.0/gobject/gtypeplugin.h +0 -6
- package/include/glib-2.0/gobject/gvaluearray.h +0 -7
- package/include/glib-2.0/gobject/gvaluecollector.h +1 -11
- package/include/glib-2.0/gobject/gvaluetypes.h +2 -0
- package/include/hwy/aligned_allocator.h +171 -6
- package/include/hwy/base.h +1765 -543
- package/include/hwy/cache_control.h +24 -6
- package/include/hwy/detect_compiler_arch.h +23 -2
- package/include/hwy/detect_targets.h +56 -13
- package/include/hwy/foreach_target.h +24 -0
- package/include/hwy/highway.h +20 -3
- package/include/hwy/ops/arm_neon-inl.h +1086 -667
- package/include/hwy/ops/arm_sve-inl.h +1091 -235
- package/include/hwy/ops/emu128-inl.h +271 -196
- package/include/hwy/ops/generic_ops-inl.h +2270 -399
- package/include/hwy/ops/ppc_vsx-inl.h +1786 -563
- package/include/hwy/ops/rvv-inl.h +1043 -311
- package/include/hwy/ops/scalar-inl.h +189 -159
- package/include/hwy/ops/set_macros-inl.h +66 -6
- package/include/hwy/ops/shared-inl.h +175 -56
- package/include/hwy/ops/wasm_128-inl.h +153 -136
- package/include/hwy/ops/x86_128-inl.h +1647 -646
- package/include/hwy/ops/x86_256-inl.h +1003 -370
- package/include/hwy/ops/x86_512-inl.h +948 -353
- package/include/hwy/per_target.h +4 -0
- package/include/hwy/profiler.h +648 -0
- package/include/hwy/robust_statistics.h +2 -2
- package/include/hwy/targets.h +18 -11
- package/include/hwy/timer.h +11 -0
- package/include/lcms2.h +46 -7
- package/include/lcms2_plugin.h +4 -4
- package/include/libheif/heif_version.h +2 -2
- package/include/libpng16/png.h +32 -29
- package/include/libpng16/pngconf.h +2 -2
- package/include/libpng16/pnglibconf.h +7 -2
- package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
- package/include/libxml2/libxml/HTMLparser.h +23 -0
- package/include/libxml2/libxml/SAX.h +0 -2
- package/include/libxml2/libxml/SAX2.h +0 -2
- package/include/libxml2/libxml/c14n.h +0 -2
- package/include/libxml2/libxml/dict.h +1 -0
- package/include/libxml2/libxml/encoding.h +16 -14
- package/include/libxml2/libxml/entities.h +4 -0
- package/include/libxml2/libxml/globals.h +15 -503
- package/include/libxml2/libxml/hash.h +57 -61
- package/include/libxml2/libxml/nanoftp.h +2 -2
- package/include/libxml2/libxml/parser.h +137 -18
- package/include/libxml2/libxml/parserInternals.h +1 -0
- package/include/libxml2/libxml/relaxng.h +2 -1
- package/include/libxml2/libxml/schemasInternals.h +1 -0
- package/include/libxml2/libxml/schematron.h +1 -0
- package/include/libxml2/libxml/threads.h +4 -11
- package/include/libxml2/libxml/tree.h +68 -20
- package/include/libxml2/libxml/uri.h +2 -1
- package/include/libxml2/libxml/valid.h +2 -0
- package/include/libxml2/libxml/xmlIO.h +65 -13
- package/include/libxml2/libxml/xmlerror.h +37 -8
- package/include/libxml2/libxml/xmlmemory.h +37 -40
- package/include/libxml2/libxml/xmlreader.h +6 -0
- package/include/libxml2/libxml/xmlregexp.h +2 -9
- package/include/libxml2/libxml/xmlsave.h +9 -0
- package/include/libxml2/libxml/xmlschemas.h +3 -0
- package/include/libxml2/libxml/xmlversion.h +28 -43
- package/include/libxml2/libxml/xpath.h +1 -1
- package/include/libxml2/libxml/xpathInternals.h +2 -1
- package/include/libxml2/libxml/xpointer.h +5 -4
- package/include/pango-1.0/pango/pango-features.h +3 -3
- package/include/pango-1.0/pango/pango-fontmap.h +7 -0
- package/include/pixman-1/pixman-version.h +3 -3
- package/include/pixman-1/pixman.h +9 -2
- package/include/png.h +32 -29
- package/include/pngconf.h +2 -2
- package/include/pnglibconf.h +7 -2
- package/include/vips/connection.h +9 -3
- package/include/vips/util.h +0 -9
- package/include/vips/version.h +4 -4
- package/include/zconf.h +3 -0
- package/include/zlib.h +3 -3
- package/package.json +1 -1
- package/versions.json +15 -15
|
@@ -33,6 +33,22 @@
|
|
|
33
33
|
#define HWY_SVE_HAVE_2 0
|
|
34
34
|
#endif
|
|
35
35
|
|
|
36
|
+
// If 1, both __bf16 and a limited set of *_bf16 SVE intrinsics are available:
|
|
37
|
+
// create/get/set/dup, ld/st, sel, rev, trn, uzp, zip.
|
|
38
|
+
#if HWY_ARM_HAVE_SCALAR_BF16_TYPE && defined(__ARM_FEATURE_SVE_BF16)
|
|
39
|
+
#define HWY_SVE_HAVE_BF16_FEATURE 1
|
|
40
|
+
#else
|
|
41
|
+
#define HWY_SVE_HAVE_BF16_FEATURE 0
|
|
42
|
+
#endif
|
|
43
|
+
|
|
44
|
+
// HWY_SVE_HAVE_BF16_VEC is defined to 1 if the SVE svbfloat16_t vector type
|
|
45
|
+
// is supported, even if HWY_SVE_HAVE_BF16_FEATURE (= intrinsics) is 0.
|
|
46
|
+
#if HWY_SVE_HAVE_BF16_FEATURE || HWY_COMPILER_GCC_ACTUAL >= 1000
|
|
47
|
+
#define HWY_SVE_HAVE_BF16_VEC 1
|
|
48
|
+
#else
|
|
49
|
+
#define HWY_SVE_HAVE_BF16_VEC 0
|
|
50
|
+
#endif
|
|
51
|
+
|
|
36
52
|
HWY_BEFORE_NAMESPACE();
|
|
37
53
|
namespace hwy {
|
|
38
54
|
namespace HWY_NAMESPACE {
|
|
@@ -76,12 +92,20 @@ namespace detail { // for code folding
|
|
|
76
92
|
#define HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP) \
|
|
77
93
|
X_MACRO(float, f, 64, 32, NAME, OP)
|
|
78
94
|
|
|
79
|
-
#
|
|
80
|
-
#define HWY_SVE_FOREACH_BF16(X_MACRO, NAME, OP) \
|
|
95
|
+
#define HWY_SVE_FOREACH_BF16_UNCONDITIONAL(X_MACRO, NAME, OP) \
|
|
81
96
|
X_MACRO(bfloat, bf, 16, 16, NAME, OP)
|
|
97
|
+
|
|
98
|
+
#if HWY_SVE_HAVE_BF16_FEATURE
|
|
99
|
+
#define HWY_SVE_FOREACH_BF16(X_MACRO, NAME, OP) \
|
|
100
|
+
HWY_SVE_FOREACH_BF16_UNCONDITIONAL(X_MACRO, NAME, OP)
|
|
101
|
+
// We have both f16 and bf16, so nothing is emulated.
|
|
102
|
+
#define HWY_SVE_IF_EMULATED_D(D) hwy::EnableIf<false>* = nullptr
|
|
103
|
+
#define HWY_SVE_IF_NOT_EMULATED_D(D) hwy::EnableIf<true>* = nullptr
|
|
82
104
|
#else
|
|
83
105
|
#define HWY_SVE_FOREACH_BF16(X_MACRO, NAME, OP)
|
|
84
|
-
#
|
|
106
|
+
#define HWY_SVE_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
|
|
107
|
+
#define HWY_SVE_IF_NOT_EMULATED_D(D) HWY_IF_NOT_BF16_D(D)
|
|
108
|
+
#endif // HWY_SVE_HAVE_BF16_FEATURE
|
|
85
109
|
|
|
86
110
|
// For all element sizes:
|
|
87
111
|
#define HWY_SVE_FOREACH_U(X_MACRO, NAME, OP) \
|
|
@@ -96,12 +120,16 @@ namespace detail { // for code folding
|
|
|
96
120
|
HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP) \
|
|
97
121
|
HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP)
|
|
98
122
|
|
|
123
|
+
#define HWY_SVE_FOREACH_F3264(X_MACRO, NAME, OP) \
|
|
124
|
+
HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP) \
|
|
125
|
+
HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP)
|
|
126
|
+
|
|
99
127
|
// HWY_SVE_FOREACH_F does not include HWY_SVE_FOREACH_BF16 because SVE lacks
|
|
100
128
|
// bf16 overloads for some intrinsics (especially less-common arithmetic).
|
|
129
|
+
// However, this does include f16 because SVE supports it unconditionally.
|
|
101
130
|
#define HWY_SVE_FOREACH_F(X_MACRO, NAME, OP) \
|
|
102
131
|
HWY_SVE_FOREACH_F16(X_MACRO, NAME, OP) \
|
|
103
|
-
|
|
104
|
-
HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP)
|
|
132
|
+
HWY_SVE_FOREACH_F3264(X_MACRO, NAME, OP)
|
|
105
133
|
|
|
106
134
|
// Commonly used type categories for a given element size:
|
|
107
135
|
#define HWY_SVE_FOREACH_UI08(X_MACRO, NAME, OP) \
|
|
@@ -123,8 +151,7 @@ namespace detail { // for code folding
|
|
|
123
151
|
#define HWY_SVE_FOREACH_UIF3264(X_MACRO, NAME, OP) \
|
|
124
152
|
HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP) \
|
|
125
153
|
HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP) \
|
|
126
|
-
|
|
127
|
-
HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP)
|
|
154
|
+
HWY_SVE_FOREACH_F3264(X_MACRO, NAME, OP)
|
|
128
155
|
|
|
129
156
|
// Commonly used type categories:
|
|
130
157
|
#define HWY_SVE_FOREACH_UI(X_MACRO, NAME, OP) \
|
|
@@ -155,7 +182,9 @@ namespace detail { // for code folding
|
|
|
155
182
|
};
|
|
156
183
|
|
|
157
184
|
HWY_SVE_FOREACH(HWY_SPECIALIZE, _, _)
|
|
158
|
-
|
|
185
|
+
#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
|
|
186
|
+
HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SPECIALIZE, _, _)
|
|
187
|
+
#endif
|
|
159
188
|
#undef HWY_SPECIALIZE
|
|
160
189
|
|
|
161
190
|
// Note: _x (don't-care value for inactive lanes) avoids additional MOVPRFX
|
|
@@ -184,15 +213,24 @@ HWY_SVE_FOREACH_BF16(HWY_SPECIALIZE, _, _)
|
|
|
184
213
|
}
|
|
185
214
|
|
|
186
215
|
// vector = f(vector, vector), e.g. Add
|
|
216
|
+
#define HWY_SVE_RETV_ARGVV(BASE, CHAR, BITS, HALF, NAME, OP) \
|
|
217
|
+
HWY_API HWY_SVE_V(BASE, BITS) \
|
|
218
|
+
NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
|
|
219
|
+
return sv##OP##_##CHAR##BITS(a, b); \
|
|
220
|
+
}
|
|
221
|
+
// All-true mask
|
|
187
222
|
#define HWY_SVE_RETV_ARGPVV(BASE, CHAR, BITS, HALF, NAME, OP) \
|
|
188
223
|
HWY_API HWY_SVE_V(BASE, BITS) \
|
|
189
224
|
NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
|
|
190
225
|
return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, b); \
|
|
191
226
|
}
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
227
|
+
// User-specified mask. Mask=false value is undefined and must be set by caller
|
|
228
|
+
// because SVE instructions take it from one of the two inputs, whereas
|
|
229
|
+
// AVX-512, RVV and Highway allow a third argument.
|
|
230
|
+
#define HWY_SVE_RETV_ARGMVV(BASE, CHAR, BITS, HALF, NAME, OP) \
|
|
231
|
+
HWY_API HWY_SVE_V(BASE, BITS) \
|
|
232
|
+
NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
|
|
233
|
+
return sv##OP##_##CHAR##BITS##_x(m, a, b); \
|
|
196
234
|
}
|
|
197
235
|
|
|
198
236
|
#define HWY_SVE_RETV_ARGVVV(BASE, CHAR, BITS, HALF, NAME, OP) \
|
|
@@ -266,24 +304,15 @@ HWY_API size_t Lanes(Simd<T, N, kPow2> d) {
|
|
|
266
304
|
HWY_SVE_FOREACH(HWY_SVE_FIRSTN, FirstN, whilelt)
|
|
267
305
|
HWY_SVE_FOREACH_BF16(HWY_SVE_FIRSTN, FirstN, whilelt)
|
|
268
306
|
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
template <class D>
|
|
272
|
-
using MFromD = decltype(FirstN(D(), 0));
|
|
273
|
-
|
|
274
|
-
#if !HWY_HAVE_FLOAT16
|
|
275
|
-
template <class D, HWY_IF_F16_D(D)>
|
|
276
|
-
MFromD<RebindToUnsigned<D>> FirstN(D /* tag */, size_t count) {
|
|
307
|
+
template <class D, HWY_SVE_IF_EMULATED_D(D)>
|
|
308
|
+
svbool_t FirstN(D /* tag */, size_t count) {
|
|
277
309
|
return FirstN(RebindToUnsigned<D>(), count);
|
|
278
310
|
}
|
|
279
|
-
#endif // !HWY_HAVE_FLOAT16
|
|
280
311
|
|
|
281
|
-
#
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
}
|
|
286
|
-
#endif // !HWY_SVE_HAVE_BFLOAT16
|
|
312
|
+
#undef HWY_SVE_FIRSTN
|
|
313
|
+
|
|
314
|
+
template <class D>
|
|
315
|
+
using MFromD = svbool_t;
|
|
287
316
|
|
|
288
317
|
namespace detail {
|
|
289
318
|
|
|
@@ -314,6 +343,17 @@ svbool_t MakeMask(D d) {
|
|
|
314
343
|
|
|
315
344
|
} // namespace detail
|
|
316
345
|
|
|
346
|
+
#ifdef HWY_NATIVE_MASK_FALSE
|
|
347
|
+
#undef HWY_NATIVE_MASK_FALSE
|
|
348
|
+
#else
|
|
349
|
+
#define HWY_NATIVE_MASK_FALSE
|
|
350
|
+
#endif
|
|
351
|
+
|
|
352
|
+
template <class D>
|
|
353
|
+
HWY_API svbool_t MaskFalse(const D /*d*/) {
|
|
354
|
+
return detail::PFalse();
|
|
355
|
+
}
|
|
356
|
+
|
|
317
357
|
// ================================================== INIT
|
|
318
358
|
|
|
319
359
|
// ------------------------------ Set
|
|
@@ -326,14 +366,23 @@ svbool_t MakeMask(D d) {
|
|
|
326
366
|
}
|
|
327
367
|
|
|
328
368
|
HWY_SVE_FOREACH(HWY_SVE_SET, Set, dup_n)
|
|
369
|
+
#if HWY_SVE_HAVE_BF16_FEATURE // for if-elif chain
|
|
329
370
|
HWY_SVE_FOREACH_BF16(HWY_SVE_SET, Set, dup_n)
|
|
330
|
-
#
|
|
371
|
+
#elif HWY_SVE_HAVE_BF16_VEC
|
|
331
372
|
// Required for Zero and VFromD
|
|
332
|
-
template <
|
|
333
|
-
|
|
334
|
-
return
|
|
373
|
+
template <class D, HWY_IF_BF16_D(D)>
|
|
374
|
+
HWY_API svbfloat16_t Set(D d, bfloat16_t arg) {
|
|
375
|
+
return svreinterpret_bf16_u16(
|
|
376
|
+
Set(RebindToUnsigned<decltype(d)>(), BitCastScalar<uint16_t>(arg)));
|
|
377
|
+
}
|
|
378
|
+
#else // neither bf16 feature nor vector: emulate with u16
|
|
379
|
+
// Required for Zero and VFromD
|
|
380
|
+
template <class D, HWY_IF_BF16_D(D)>
|
|
381
|
+
HWY_API svuint16_t Set(D d, bfloat16_t arg) {
|
|
382
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
383
|
+
return Set(du, BitCastScalar<uint16_t>(arg));
|
|
335
384
|
}
|
|
336
|
-
#endif //
|
|
385
|
+
#endif // HWY_SVE_HAVE_BF16_FEATURE
|
|
337
386
|
#undef HWY_SVE_SET
|
|
338
387
|
|
|
339
388
|
template <class D>
|
|
@@ -350,17 +399,6 @@ VFromD<D> Zero(D d) {
|
|
|
350
399
|
return BitCast(d, Set(du, 0));
|
|
351
400
|
}
|
|
352
401
|
|
|
353
|
-
// ------------------------------ Undefined
|
|
354
|
-
|
|
355
|
-
#define HWY_SVE_UNDEFINED(BASE, CHAR, BITS, HALF, NAME, OP) \
|
|
356
|
-
template <size_t N, int kPow2> \
|
|
357
|
-
HWY_API HWY_SVE_V(BASE, BITS) \
|
|
358
|
-
NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) { \
|
|
359
|
-
return sv##OP##_##CHAR##BITS(); \
|
|
360
|
-
}
|
|
361
|
-
|
|
362
|
-
HWY_SVE_FOREACH(HWY_SVE_UNDEFINED, Undefined, undef)
|
|
363
|
-
|
|
364
402
|
// ------------------------------ BitCast
|
|
365
403
|
|
|
366
404
|
namespace detail {
|
|
@@ -387,24 +425,37 @@ namespace detail {
|
|
|
387
425
|
return sv##OP##_##CHAR##BITS##_u8(v); \
|
|
388
426
|
}
|
|
389
427
|
|
|
428
|
+
// U08 is special-cased, hence do not use FOREACH.
|
|
390
429
|
HWY_SVE_FOREACH_U08(HWY_SVE_CAST_NOP, _, _)
|
|
391
430
|
HWY_SVE_FOREACH_I08(HWY_SVE_CAST, _, reinterpret)
|
|
392
431
|
HWY_SVE_FOREACH_UI16(HWY_SVE_CAST, _, reinterpret)
|
|
393
432
|
HWY_SVE_FOREACH_UI32(HWY_SVE_CAST, _, reinterpret)
|
|
394
433
|
HWY_SVE_FOREACH_UI64(HWY_SVE_CAST, _, reinterpret)
|
|
395
434
|
HWY_SVE_FOREACH_F(HWY_SVE_CAST, _, reinterpret)
|
|
396
|
-
HWY_SVE_FOREACH_BF16(HWY_SVE_CAST, _, reinterpret)
|
|
397
435
|
|
|
398
436
|
#undef HWY_SVE_CAST_NOP
|
|
399
437
|
#undef HWY_SVE_CAST
|
|
400
438
|
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
439
|
+
template <class V, HWY_SVE_IF_EMULATED_D(DFromV<V>)>
|
|
440
|
+
HWY_INLINE svuint8_t BitCastToByte(V v) {
|
|
441
|
+
#if HWY_SVE_HAVE_BF16_VEC
|
|
442
|
+
return svreinterpret_u8_bf16(v);
|
|
443
|
+
#else
|
|
444
|
+
const RebindToUnsigned<DFromV<V>> du;
|
|
445
|
+
return BitCastToByte(BitCast(du, v));
|
|
446
|
+
#endif
|
|
447
|
+
}
|
|
448
|
+
|
|
449
|
+
template <class D, HWY_SVE_IF_EMULATED_D(D)>
|
|
450
|
+
HWY_INLINE VFromD<D> BitCastFromByte(D d, svuint8_t v) {
|
|
451
|
+
#if HWY_SVE_HAVE_BF16_VEC
|
|
452
|
+
(void)d;
|
|
453
|
+
return svreinterpret_bf16_u8(v);
|
|
454
|
+
#else
|
|
455
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
456
|
+
return BitCastFromByte(du, v);
|
|
457
|
+
#endif
|
|
406
458
|
}
|
|
407
|
-
#endif // !HWY_SVE_HAVE_BFLOAT16
|
|
408
459
|
|
|
409
460
|
} // namespace detail
|
|
410
461
|
|
|
@@ -413,6 +464,23 @@ HWY_API VFromD<D> BitCast(D d, FromV v) {
|
|
|
413
464
|
return detail::BitCastFromByte(d, detail::BitCastToByte(v));
|
|
414
465
|
}
|
|
415
466
|
|
|
467
|
+
// ------------------------------ Undefined
|
|
468
|
+
|
|
469
|
+
#define HWY_SVE_UNDEFINED(BASE, CHAR, BITS, HALF, NAME, OP) \
|
|
470
|
+
template <size_t N, int kPow2> \
|
|
471
|
+
HWY_API HWY_SVE_V(BASE, BITS) \
|
|
472
|
+
NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) { \
|
|
473
|
+
return sv##OP##_##CHAR##BITS(); \
|
|
474
|
+
}
|
|
475
|
+
|
|
476
|
+
HWY_SVE_FOREACH(HWY_SVE_UNDEFINED, Undefined, undef)
|
|
477
|
+
|
|
478
|
+
template <class D, HWY_SVE_IF_EMULATED_D(D)>
|
|
479
|
+
VFromD<D> Undefined(D d) {
|
|
480
|
+
const RebindToUnsigned<D> du;
|
|
481
|
+
return BitCast(d, Undefined(du));
|
|
482
|
+
}
|
|
483
|
+
|
|
416
484
|
// ------------------------------ Tuple
|
|
417
485
|
|
|
418
486
|
// tuples = f(d, v..), e.g. Create2
|
|
@@ -495,6 +563,102 @@ HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
|
|
|
495
563
|
return BitCast(d, v);
|
|
496
564
|
}
|
|
497
565
|
|
|
566
|
+
// ------------------------------ Dup128VecFromValues
|
|
567
|
+
|
|
568
|
+
template <class D, HWY_IF_I8_D(D)>
|
|
569
|
+
HWY_API svint8_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
|
|
570
|
+
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
|
|
571
|
+
TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
|
|
572
|
+
TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
|
|
573
|
+
TFromD<D> t11, TFromD<D> t12,
|
|
574
|
+
TFromD<D> t13, TFromD<D> t14,
|
|
575
|
+
TFromD<D> t15) {
|
|
576
|
+
return svdupq_n_s8(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13,
|
|
577
|
+
t14, t15);
|
|
578
|
+
}
|
|
579
|
+
|
|
580
|
+
template <class D, HWY_IF_U8_D(D)>
|
|
581
|
+
HWY_API svuint8_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
|
|
582
|
+
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
|
|
583
|
+
TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
|
|
584
|
+
TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
|
|
585
|
+
TFromD<D> t11, TFromD<D> t12,
|
|
586
|
+
TFromD<D> t13, TFromD<D> t14,
|
|
587
|
+
TFromD<D> t15) {
|
|
588
|
+
return svdupq_n_u8(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13,
|
|
589
|
+
t14, t15);
|
|
590
|
+
}
|
|
591
|
+
|
|
592
|
+
template <class D, HWY_IF_I16_D(D)>
|
|
593
|
+
HWY_API svint16_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
|
|
594
|
+
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
|
|
595
|
+
TFromD<D> t5, TFromD<D> t6,
|
|
596
|
+
TFromD<D> t7) {
|
|
597
|
+
return svdupq_n_s16(t0, t1, t2, t3, t4, t5, t6, t7);
|
|
598
|
+
}
|
|
599
|
+
|
|
600
|
+
template <class D, HWY_IF_U16_D(D)>
|
|
601
|
+
HWY_API svuint16_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
|
|
602
|
+
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
|
|
603
|
+
TFromD<D> t5, TFromD<D> t6,
|
|
604
|
+
TFromD<D> t7) {
|
|
605
|
+
return svdupq_n_u16(t0, t1, t2, t3, t4, t5, t6, t7);
|
|
606
|
+
}
|
|
607
|
+
|
|
608
|
+
template <class D, HWY_IF_F16_D(D)>
|
|
609
|
+
HWY_API svfloat16_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
|
|
610
|
+
TFromD<D> t2, TFromD<D> t3,
|
|
611
|
+
TFromD<D> t4, TFromD<D> t5,
|
|
612
|
+
TFromD<D> t6, TFromD<D> t7) {
|
|
613
|
+
return svdupq_n_f16(t0, t1, t2, t3, t4, t5, t6, t7);
|
|
614
|
+
}
|
|
615
|
+
|
|
616
|
+
template <class D, HWY_SVE_IF_EMULATED_D(D)>
|
|
617
|
+
HWY_API VBF16 Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1, TFromD<D> t2,
|
|
618
|
+
TFromD<D> t3, TFromD<D> t4, TFromD<D> t5,
|
|
619
|
+
TFromD<D> t6, TFromD<D> t7) {
|
|
620
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
621
|
+
return BitCast(
|
|
622
|
+
d, Dup128VecFromValues(
|
|
623
|
+
du, BitCastScalar<uint16_t>(t0), BitCastScalar<uint16_t>(t1),
|
|
624
|
+
BitCastScalar<uint16_t>(t2), BitCastScalar<uint16_t>(t3),
|
|
625
|
+
BitCastScalar<uint16_t>(t4), BitCastScalar<uint16_t>(t5),
|
|
626
|
+
BitCastScalar<uint16_t>(t6), BitCastScalar<uint16_t>(t7)));
|
|
627
|
+
}
|
|
628
|
+
|
|
629
|
+
template <class D, HWY_IF_I32_D(D)>
|
|
630
|
+
HWY_API svint32_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
|
|
631
|
+
TFromD<D> t2, TFromD<D> t3) {
|
|
632
|
+
return svdupq_n_s32(t0, t1, t2, t3);
|
|
633
|
+
}
|
|
634
|
+
|
|
635
|
+
template <class D, HWY_IF_U32_D(D)>
|
|
636
|
+
HWY_API svuint32_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
|
|
637
|
+
TFromD<D> t2, TFromD<D> t3) {
|
|
638
|
+
return svdupq_n_u32(t0, t1, t2, t3);
|
|
639
|
+
}
|
|
640
|
+
|
|
641
|
+
template <class D, HWY_IF_F32_D(D)>
|
|
642
|
+
HWY_API svfloat32_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
|
|
643
|
+
TFromD<D> t2, TFromD<D> t3) {
|
|
644
|
+
return svdupq_n_f32(t0, t1, t2, t3);
|
|
645
|
+
}
|
|
646
|
+
|
|
647
|
+
template <class D, HWY_IF_I64_D(D)>
|
|
648
|
+
HWY_API svint64_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
|
|
649
|
+
return svdupq_n_s64(t0, t1);
|
|
650
|
+
}
|
|
651
|
+
|
|
652
|
+
template <class D, HWY_IF_U64_D(D)>
|
|
653
|
+
HWY_API svuint64_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
|
|
654
|
+
return svdupq_n_u64(t0, t1);
|
|
655
|
+
}
|
|
656
|
+
|
|
657
|
+
template <class D, HWY_IF_F64_D(D)>
|
|
658
|
+
HWY_API svfloat64_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
|
|
659
|
+
return svdupq_n_f64(t0, t1);
|
|
660
|
+
}
|
|
661
|
+
|
|
498
662
|
// ================================================== LOGICAL
|
|
499
663
|
|
|
500
664
|
// detail::*N() functions accept a scalar argument to avoid extra Set().
|
|
@@ -632,9 +796,37 @@ HWY_API VBF16 Neg(VBF16 v) {
|
|
|
632
796
|
return BitCast(d, Xor(BitCast(du, v), Set(du, SignMask<TU>())));
|
|
633
797
|
}
|
|
634
798
|
|
|
799
|
+
// ------------------------------ SaturatedNeg
|
|
800
|
+
#if HWY_SVE_HAVE_2
|
|
801
|
+
#ifdef HWY_NATIVE_SATURATED_NEG_8_16_32
|
|
802
|
+
#undef HWY_NATIVE_SATURATED_NEG_8_16_32
|
|
803
|
+
#else
|
|
804
|
+
#define HWY_NATIVE_SATURATED_NEG_8_16_32
|
|
805
|
+
#endif
|
|
806
|
+
|
|
807
|
+
#ifdef HWY_NATIVE_SATURATED_NEG_64
|
|
808
|
+
#undef HWY_NATIVE_SATURATED_NEG_64
|
|
809
|
+
#else
|
|
810
|
+
#define HWY_NATIVE_SATURATED_NEG_64
|
|
811
|
+
#endif
|
|
812
|
+
|
|
813
|
+
HWY_SVE_FOREACH_I(HWY_SVE_RETV_ARGPV, SaturatedNeg, qneg)
|
|
814
|
+
#endif // HWY_SVE_HAVE_2
|
|
815
|
+
|
|
635
816
|
// ------------------------------ Abs
|
|
636
817
|
HWY_SVE_FOREACH_IF(HWY_SVE_RETV_ARGPV, Abs, abs)
|
|
637
818
|
|
|
819
|
+
// ------------------------------ SaturatedAbs
|
|
820
|
+
#if HWY_SVE_HAVE_2
|
|
821
|
+
#ifdef HWY_NATIVE_SATURATED_ABS
|
|
822
|
+
#undef HWY_NATIVE_SATURATED_ABS
|
|
823
|
+
#else
|
|
824
|
+
#define HWY_NATIVE_SATURATED_ABS
|
|
825
|
+
#endif
|
|
826
|
+
|
|
827
|
+
HWY_SVE_FOREACH_I(HWY_SVE_RETV_ARGPV, SaturatedAbs, qabs)
|
|
828
|
+
#endif // HWY_SVE_HAVE_2
|
|
829
|
+
|
|
638
830
|
// ================================================== ARITHMETIC
|
|
639
831
|
|
|
640
832
|
// Per-target flags to prevent generic_ops-inl.h defining Add etc.
|
|
@@ -676,13 +868,107 @@ HWY_API svuint64_t SumsOf8(const svuint8_t v) {
|
|
|
676
868
|
|
|
677
869
|
const svuint32_t sums_of_4 = svdot_n_u32(Zero(du32), v, 1);
|
|
678
870
|
// Compute pairwise sum of u32 and extend to u64.
|
|
679
|
-
|
|
871
|
+
|
|
872
|
+
#if HWY_SVE_HAVE_2
|
|
873
|
+
return svadalp_u64_x(pg, Zero(du64), sums_of_4);
|
|
874
|
+
#else
|
|
680
875
|
const svuint64_t hi = svlsr_n_u64_x(pg, BitCast(du64, sums_of_4), 32);
|
|
681
876
|
// Isolate the lower 32 bits (to be added to the upper 32 and zero-extended)
|
|
682
877
|
const svuint64_t lo = svextw_u64_x(pg, BitCast(du64, sums_of_4));
|
|
683
878
|
return Add(hi, lo);
|
|
879
|
+
#endif
|
|
880
|
+
}
|
|
881
|
+
|
|
882
|
+
HWY_API svint64_t SumsOf8(const svint8_t v) {
|
|
883
|
+
const ScalableTag<int32_t> di32;
|
|
884
|
+
const ScalableTag<int64_t> di64;
|
|
885
|
+
const svbool_t pg = detail::PTrue(di64);
|
|
886
|
+
|
|
887
|
+
const svint32_t sums_of_4 = svdot_n_s32(Zero(di32), v, 1);
|
|
888
|
+
#if HWY_SVE_HAVE_2
|
|
889
|
+
return svadalp_s64_x(pg, Zero(di64), sums_of_4);
|
|
890
|
+
#else
|
|
891
|
+
const svint64_t hi = svasr_n_s64_x(pg, BitCast(di64, sums_of_4), 32);
|
|
892
|
+
// Isolate the lower 32 bits (to be added to the upper 32 and sign-extended)
|
|
893
|
+
const svint64_t lo = svextw_s64_x(pg, BitCast(di64, sums_of_4));
|
|
894
|
+
return Add(hi, lo);
|
|
895
|
+
#endif
|
|
896
|
+
}
|
|
897
|
+
|
|
898
|
+
// ------------------------------ SumsOf2
|
|
899
|
+
#if HWY_SVE_HAVE_2
|
|
900
|
+
namespace detail {
|
|
901
|
+
|
|
902
|
+
HWY_INLINE svint16_t SumsOf2(hwy::SignedTag /*type_tag*/,
|
|
903
|
+
hwy::SizeTag<1> /*lane_size_tag*/, svint8_t v) {
|
|
904
|
+
const ScalableTag<int16_t> di16;
|
|
905
|
+
const svbool_t pg = detail::PTrue(di16);
|
|
906
|
+
return svadalp_s16_x(pg, Zero(di16), v);
|
|
907
|
+
}
|
|
908
|
+
|
|
909
|
+
HWY_INLINE svuint16_t SumsOf2(hwy::UnsignedTag /*type_tag*/,
|
|
910
|
+
hwy::SizeTag<1> /*lane_size_tag*/, svuint8_t v) {
|
|
911
|
+
const ScalableTag<uint16_t> du16;
|
|
912
|
+
const svbool_t pg = detail::PTrue(du16);
|
|
913
|
+
return svadalp_u16_x(pg, Zero(du16), v);
|
|
914
|
+
}
|
|
915
|
+
|
|
916
|
+
HWY_INLINE svint32_t SumsOf2(hwy::SignedTag /*type_tag*/,
|
|
917
|
+
hwy::SizeTag<2> /*lane_size_tag*/, svint16_t v) {
|
|
918
|
+
const ScalableTag<int32_t> di32;
|
|
919
|
+
const svbool_t pg = detail::PTrue(di32);
|
|
920
|
+
return svadalp_s32_x(pg, Zero(di32), v);
|
|
921
|
+
}
|
|
922
|
+
|
|
923
|
+
HWY_INLINE svuint32_t SumsOf2(hwy::UnsignedTag /*type_tag*/,
|
|
924
|
+
hwy::SizeTag<2> /*lane_size_tag*/, svuint16_t v) {
|
|
925
|
+
const ScalableTag<uint32_t> du32;
|
|
926
|
+
const svbool_t pg = detail::PTrue(du32);
|
|
927
|
+
return svadalp_u32_x(pg, Zero(du32), v);
|
|
928
|
+
}
|
|
929
|
+
|
|
930
|
+
HWY_INLINE svint64_t SumsOf2(hwy::SignedTag /*type_tag*/,
|
|
931
|
+
hwy::SizeTag<4> /*lane_size_tag*/, svint32_t v) {
|
|
932
|
+
const ScalableTag<int64_t> di64;
|
|
933
|
+
const svbool_t pg = detail::PTrue(di64);
|
|
934
|
+
return svadalp_s64_x(pg, Zero(di64), v);
|
|
935
|
+
}
|
|
936
|
+
|
|
937
|
+
HWY_INLINE svuint64_t SumsOf2(hwy::UnsignedTag /*type_tag*/,
|
|
938
|
+
hwy::SizeTag<4> /*lane_size_tag*/, svuint32_t v) {
|
|
939
|
+
const ScalableTag<uint64_t> du64;
|
|
940
|
+
const svbool_t pg = detail::PTrue(du64);
|
|
941
|
+
return svadalp_u64_x(pg, Zero(du64), v);
|
|
684
942
|
}
|
|
685
943
|
|
|
944
|
+
} // namespace detail
|
|
945
|
+
#endif // HWY_SVE_HAVE_2
|
|
946
|
+
|
|
947
|
+
// ------------------------------ SumsOf4
|
|
948
|
+
namespace detail {
|
|
949
|
+
|
|
950
|
+
HWY_INLINE svint32_t SumsOf4(hwy::SignedTag /*type_tag*/,
|
|
951
|
+
hwy::SizeTag<1> /*lane_size_tag*/, svint8_t v) {
|
|
952
|
+
return svdot_n_s32(Zero(ScalableTag<int32_t>()), v, 1);
|
|
953
|
+
}
|
|
954
|
+
|
|
955
|
+
HWY_INLINE svuint32_t SumsOf4(hwy::UnsignedTag /*type_tag*/,
|
|
956
|
+
hwy::SizeTag<1> /*lane_size_tag*/, svuint8_t v) {
|
|
957
|
+
return svdot_n_u32(Zero(ScalableTag<uint32_t>()), v, 1);
|
|
958
|
+
}
|
|
959
|
+
|
|
960
|
+
HWY_INLINE svint64_t SumsOf4(hwy::SignedTag /*type_tag*/,
|
|
961
|
+
hwy::SizeTag<2> /*lane_size_tag*/, svint16_t v) {
|
|
962
|
+
return svdot_n_s64(Zero(ScalableTag<int64_t>()), v, 1);
|
|
963
|
+
}
|
|
964
|
+
|
|
965
|
+
HWY_INLINE svuint64_t SumsOf4(hwy::UnsignedTag /*type_tag*/,
|
|
966
|
+
hwy::SizeTag<2> /*lane_size_tag*/, svuint16_t v) {
|
|
967
|
+
return svdot_n_u64(Zero(ScalableTag<uint64_t>()), v, 1);
|
|
968
|
+
}
|
|
969
|
+
|
|
970
|
+
} // namespace detail
|
|
971
|
+
|
|
686
972
|
// ------------------------------ SaturatedAdd
|
|
687
973
|
|
|
688
974
|
#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
|
|
@@ -830,6 +1116,14 @@ HWY_API svint16_t MulFixedPoint15(svint16_t a, svint16_t b) {
|
|
|
830
1116
|
}
|
|
831
1117
|
|
|
832
1118
|
// ------------------------------ Div
|
|
1119
|
+
#ifdef HWY_NATIVE_INT_DIV
|
|
1120
|
+
#undef HWY_NATIVE_INT_DIV
|
|
1121
|
+
#else
|
|
1122
|
+
#define HWY_NATIVE_INT_DIV
|
|
1123
|
+
#endif
|
|
1124
|
+
|
|
1125
|
+
HWY_SVE_FOREACH_UI32(HWY_SVE_RETV_ARGPVV, Div, div)
|
|
1126
|
+
HWY_SVE_FOREACH_UI64(HWY_SVE_RETV_ARGPVV, Div, div)
|
|
833
1127
|
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Div, div)
|
|
834
1128
|
|
|
835
1129
|
// ------------------------------ ApproximateReciprocal
|
|
@@ -983,16 +1277,37 @@ HWY_API size_t FindKnownFirstTrue(D d, svbool_t m) {
|
|
|
983
1277
|
HWY_SVE_FOREACH(HWY_SVE_IF_THEN_ELSE, IfThenElse, sel)
|
|
984
1278
|
#undef HWY_SVE_IF_THEN_ELSE
|
|
985
1279
|
|
|
1280
|
+
template <class V, class D = DFromV<V>, HWY_SVE_IF_EMULATED_D(D)>
|
|
1281
|
+
HWY_API V IfThenElse(const svbool_t mask, V yes, V no) {
|
|
1282
|
+
const RebindToUnsigned<D> du;
|
|
1283
|
+
return BitCast(
|
|
1284
|
+
D(), IfThenElse(RebindMask(du, mask), BitCast(du, yes), BitCast(du, no)));
|
|
1285
|
+
}
|
|
1286
|
+
|
|
986
1287
|
// ------------------------------ IfThenElseZero
|
|
987
|
-
|
|
1288
|
+
|
|
1289
|
+
template <class V, class D = DFromV<V>, HWY_SVE_IF_NOT_EMULATED_D(D)>
|
|
988
1290
|
HWY_API V IfThenElseZero(const svbool_t mask, const V yes) {
|
|
989
|
-
return IfThenElse(mask, yes, Zero(
|
|
1291
|
+
return IfThenElse(mask, yes, Zero(D()));
|
|
1292
|
+
}
|
|
1293
|
+
|
|
1294
|
+
template <class V, class D = DFromV<V>, HWY_SVE_IF_EMULATED_D(D)>
|
|
1295
|
+
HWY_API V IfThenElseZero(const svbool_t mask, V yes) {
|
|
1296
|
+
const RebindToUnsigned<D> du;
|
|
1297
|
+
return BitCast(D(), IfThenElseZero(RebindMask(du, mask), BitCast(du, yes)));
|
|
990
1298
|
}
|
|
991
1299
|
|
|
992
1300
|
// ------------------------------ IfThenZeroElse
|
|
993
|
-
|
|
1301
|
+
|
|
1302
|
+
template <class V, class D = DFromV<V>, HWY_SVE_IF_NOT_EMULATED_D(D)>
|
|
994
1303
|
HWY_API V IfThenZeroElse(const svbool_t mask, const V no) {
|
|
995
|
-
return IfThenElse(mask, Zero(
|
|
1304
|
+
return IfThenElse(mask, Zero(D()), no);
|
|
1305
|
+
}
|
|
1306
|
+
|
|
1307
|
+
template <class V, class D = DFromV<V>, HWY_SVE_IF_EMULATED_D(D)>
|
|
1308
|
+
HWY_API V IfThenZeroElse(const svbool_t mask, V no) {
|
|
1309
|
+
const RebindToUnsigned<D> du;
|
|
1310
|
+
return BitCast(D(), IfThenZeroElse(RebindMask(du, mask), BitCast(du, no)));
|
|
996
1311
|
}
|
|
997
1312
|
|
|
998
1313
|
// ------------------------------ Additional mask logical operations
|
|
@@ -1016,6 +1331,162 @@ HWY_API svbool_t SetAtOrAfterFirst(svbool_t m) {
|
|
|
1016
1331
|
return Not(SetBeforeFirst(m));
|
|
1017
1332
|
}
|
|
1018
1333
|
|
|
1334
|
+
// ------------------------------ PromoteMaskTo
|
|
1335
|
+
|
|
1336
|
+
#ifdef HWY_NATIVE_PROMOTE_MASK_TO
|
|
1337
|
+
#undef HWY_NATIVE_PROMOTE_MASK_TO
|
|
1338
|
+
#else
|
|
1339
|
+
#define HWY_NATIVE_PROMOTE_MASK_TO
|
|
1340
|
+
#endif
|
|
1341
|
+
|
|
1342
|
+
template <class DTo, class DFrom,
|
|
1343
|
+
HWY_IF_T_SIZE_D(DTo, sizeof(TFromD<DFrom>) * 2)>
|
|
1344
|
+
HWY_API svbool_t PromoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/, svbool_t m) {
|
|
1345
|
+
return svunpklo_b(m);
|
|
1346
|
+
}
|
|
1347
|
+
|
|
1348
|
+
template <class DTo, class DFrom,
|
|
1349
|
+
HWY_IF_T_SIZE_GT_D(DTo, sizeof(TFromD<DFrom>) * 2)>
|
|
1350
|
+
HWY_API svbool_t PromoteMaskTo(DTo d_to, DFrom d_from, svbool_t m) {
|
|
1351
|
+
using TFrom = TFromD<DFrom>;
|
|
1352
|
+
using TWFrom = MakeWide<MakeUnsigned<TFrom>>;
|
|
1353
|
+
static_assert(sizeof(TWFrom) > sizeof(TFrom),
|
|
1354
|
+
"sizeof(TWFrom) > sizeof(TFrom) must be true");
|
|
1355
|
+
|
|
1356
|
+
const Rebind<TWFrom, decltype(d_from)> dw_from;
|
|
1357
|
+
return PromoteMaskTo(d_to, dw_from, PromoteMaskTo(dw_from, d_from, m));
|
|
1358
|
+
}
|
|
1359
|
+
|
|
1360
|
+
// ------------------------------ DemoteMaskTo
|
|
1361
|
+
|
|
1362
|
+
#ifdef HWY_NATIVE_DEMOTE_MASK_TO
|
|
1363
|
+
#undef HWY_NATIVE_DEMOTE_MASK_TO
|
|
1364
|
+
#else
|
|
1365
|
+
#define HWY_NATIVE_DEMOTE_MASK_TO
|
|
1366
|
+
#endif
|
|
1367
|
+
|
|
1368
|
+
template <class DTo, class DFrom, HWY_IF_T_SIZE_D(DTo, 1),
|
|
1369
|
+
HWY_IF_T_SIZE_D(DFrom, 2)>
|
|
1370
|
+
HWY_API svbool_t DemoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/, svbool_t m) {
|
|
1371
|
+
return svuzp1_b8(m, m);
|
|
1372
|
+
}
|
|
1373
|
+
|
|
1374
|
+
template <class DTo, class DFrom, HWY_IF_T_SIZE_D(DTo, 2),
|
|
1375
|
+
HWY_IF_T_SIZE_D(DFrom, 4)>
|
|
1376
|
+
HWY_API svbool_t DemoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/, svbool_t m) {
|
|
1377
|
+
return svuzp1_b16(m, m);
|
|
1378
|
+
}
|
|
1379
|
+
|
|
1380
|
+
template <class DTo, class DFrom, HWY_IF_T_SIZE_D(DTo, 4),
|
|
1381
|
+
HWY_IF_T_SIZE_D(DFrom, 8)>
|
|
1382
|
+
HWY_API svbool_t DemoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/, svbool_t m) {
|
|
1383
|
+
return svuzp1_b32(m, m);
|
|
1384
|
+
}
|
|
1385
|
+
|
|
1386
|
+
template <class DTo, class DFrom,
|
|
1387
|
+
HWY_IF_T_SIZE_LE_D(DTo, sizeof(TFromD<DFrom>) / 4)>
|
|
1388
|
+
HWY_API svbool_t DemoteMaskTo(DTo d_to, DFrom d_from, svbool_t m) {
|
|
1389
|
+
using TFrom = TFromD<DFrom>;
|
|
1390
|
+
using TNFrom = MakeNarrow<MakeUnsigned<TFrom>>;
|
|
1391
|
+
static_assert(sizeof(TNFrom) < sizeof(TFrom),
|
|
1392
|
+
"sizeof(TNFrom) < sizeof(TFrom) must be true");
|
|
1393
|
+
|
|
1394
|
+
const Rebind<TNFrom, decltype(d_from)> dn_from;
|
|
1395
|
+
return DemoteMaskTo(d_to, dn_from, DemoteMaskTo(dn_from, d_from, m));
|
|
1396
|
+
}
|
|
1397
|
+
|
|
1398
|
+
// ------------------------------ LowerHalfOfMask
|
|
1399
|
+
#ifdef HWY_NATIVE_LOWER_HALF_OF_MASK
|
|
1400
|
+
#undef HWY_NATIVE_LOWER_HALF_OF_MASK
|
|
1401
|
+
#else
|
|
1402
|
+
#define HWY_NATIVE_LOWER_HALF_OF_MASK
|
|
1403
|
+
#endif
|
|
1404
|
+
|
|
1405
|
+
template <class D>
|
|
1406
|
+
HWY_API svbool_t LowerHalfOfMask(D /*d*/, svbool_t m) {
|
|
1407
|
+
return m;
|
|
1408
|
+
}
|
|
1409
|
+
|
|
1410
|
+
// ------------------------------ MaskedAddOr etc. (IfThenElse)
|
|
1411
|
+
|
|
1412
|
+
#ifdef HWY_NATIVE_MASKED_ARITH
|
|
1413
|
+
#undef HWY_NATIVE_MASKED_ARITH
|
|
1414
|
+
#else
|
|
1415
|
+
#define HWY_NATIVE_MASKED_ARITH
|
|
1416
|
+
#endif
|
|
1417
|
+
|
|
1418
|
+
namespace detail {
|
|
1419
|
+
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV, MaskedMin, min)
|
|
1420
|
+
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV, MaskedMax, max)
|
|
1421
|
+
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV, MaskedAdd, add)
|
|
1422
|
+
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV, MaskedSub, sub)
|
|
1423
|
+
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV, MaskedMul, mul)
|
|
1424
|
+
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGMVV, MaskedDiv, div)
|
|
1425
|
+
HWY_SVE_FOREACH_UI32(HWY_SVE_RETV_ARGMVV, MaskedDiv, div)
|
|
1426
|
+
HWY_SVE_FOREACH_UI64(HWY_SVE_RETV_ARGMVV, MaskedDiv, div)
|
|
1427
|
+
#if HWY_SVE_HAVE_2
|
|
1428
|
+
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGMVV, MaskedSatAdd, qadd)
|
|
1429
|
+
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGMVV, MaskedSatSub, qsub)
|
|
1430
|
+
#endif
|
|
1431
|
+
} // namespace detail
|
|
1432
|
+
|
|
1433
|
+
template <class V, class M>
|
|
1434
|
+
HWY_API V MaskedMinOr(V no, M m, V a, V b) {
|
|
1435
|
+
return IfThenElse(m, detail::MaskedMin(m, a, b), no);
|
|
1436
|
+
}
|
|
1437
|
+
|
|
1438
|
+
template <class V, class M>
|
|
1439
|
+
HWY_API V MaskedMaxOr(V no, M m, V a, V b) {
|
|
1440
|
+
return IfThenElse(m, detail::MaskedMax(m, a, b), no);
|
|
1441
|
+
}
|
|
1442
|
+
|
|
1443
|
+
template <class V, class M>
|
|
1444
|
+
HWY_API V MaskedAddOr(V no, M m, V a, V b) {
|
|
1445
|
+
return IfThenElse(m, detail::MaskedAdd(m, a, b), no);
|
|
1446
|
+
}
|
|
1447
|
+
|
|
1448
|
+
template <class V, class M>
|
|
1449
|
+
HWY_API V MaskedSubOr(V no, M m, V a, V b) {
|
|
1450
|
+
return IfThenElse(m, detail::MaskedSub(m, a, b), no);
|
|
1451
|
+
}
|
|
1452
|
+
|
|
1453
|
+
template <class V, class M>
|
|
1454
|
+
HWY_API V MaskedMulOr(V no, M m, V a, V b) {
|
|
1455
|
+
return IfThenElse(m, detail::MaskedMul(m, a, b), no);
|
|
1456
|
+
}
|
|
1457
|
+
|
|
1458
|
+
template <class V, class M,
|
|
1459
|
+
HWY_IF_T_SIZE_ONE_OF_V(
|
|
1460
|
+
V, (hwy::IsSame<TFromV<V>, hwy::float16_t>() ? (1 << 2) : 0) |
|
|
1461
|
+
(1 << 4) | (1 << 8))>
|
|
1462
|
+
HWY_API V MaskedDivOr(V no, M m, V a, V b) {
|
|
1463
|
+
return IfThenElse(m, detail::MaskedDiv(m, a, b), no);
|
|
1464
|
+
}
|
|
1465
|
+
|
|
1466
|
+
// I8/U8/I16/U16 MaskedDivOr is implemented after I8/U8/I16/U16 Div
|
|
1467
|
+
|
|
1468
|
+
#if HWY_SVE_HAVE_2
|
|
1469
|
+
template <class V, class M>
|
|
1470
|
+
HWY_API V MaskedSatAddOr(V no, M m, V a, V b) {
|
|
1471
|
+
return IfThenElse(m, detail::MaskedSatAdd(m, a, b), no);
|
|
1472
|
+
}
|
|
1473
|
+
|
|
1474
|
+
template <class V, class M>
|
|
1475
|
+
HWY_API V MaskedSatSubOr(V no, M m, V a, V b) {
|
|
1476
|
+
return IfThenElse(m, detail::MaskedSatSub(m, a, b), no);
|
|
1477
|
+
}
|
|
1478
|
+
#else
|
|
1479
|
+
template <class V, class M>
|
|
1480
|
+
HWY_API V MaskedSatAddOr(V no, M m, V a, V b) {
|
|
1481
|
+
return IfThenElse(m, SaturatedAdd(a, b), no);
|
|
1482
|
+
}
|
|
1483
|
+
|
|
1484
|
+
template <class V, class M>
|
|
1485
|
+
HWY_API V MaskedSatSubOr(V no, M m, V a, V b) {
|
|
1486
|
+
return IfThenElse(m, SaturatedSub(a, b), no);
|
|
1487
|
+
}
|
|
1488
|
+
#endif
|
|
1489
|
+
|
|
1019
1490
|
// ================================================== COMPARE
|
|
1020
1491
|
|
|
1021
1492
|
// mask = f(vector, vector)
|
|
@@ -1078,7 +1549,8 @@ HWY_API svbool_t TestBit(const V a, const V bit) {
|
|
|
1078
1549
|
// ------------------------------ MaskFromVec (Ne)
|
|
1079
1550
|
template <class V>
|
|
1080
1551
|
HWY_API svbool_t MaskFromVec(const V v) {
|
|
1081
|
-
|
|
1552
|
+
using T = TFromV<V>;
|
|
1553
|
+
return detail::NeN(v, ConvertScalarTo<T>(0));
|
|
1082
1554
|
}
|
|
1083
1555
|
|
|
1084
1556
|
// ------------------------------ VecFromMask
|
|
@@ -1159,14 +1631,27 @@ HWY_API svbool_t IsNaN(const V v) {
|
|
|
1159
1631
|
return Ne(v, v); // could also use cmpuo
|
|
1160
1632
|
}
|
|
1161
1633
|
|
|
1634
|
+
// Per-target flag to prevent generic_ops-inl.h from defining IsInf / IsFinite.
|
|
1635
|
+
// We use a fused Set/comparison for IsFinite.
|
|
1636
|
+
#ifdef HWY_NATIVE_ISINF
|
|
1637
|
+
#undef HWY_NATIVE_ISINF
|
|
1638
|
+
#else
|
|
1639
|
+
#define HWY_NATIVE_ISINF
|
|
1640
|
+
#endif
|
|
1641
|
+
|
|
1162
1642
|
template <class V>
|
|
1163
1643
|
HWY_API svbool_t IsInf(const V v) {
|
|
1164
1644
|
using T = TFromV<V>;
|
|
1165
1645
|
const DFromV<decltype(v)> d;
|
|
1646
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
1166
1647
|
const RebindToSigned<decltype(d)> di;
|
|
1167
|
-
|
|
1168
|
-
// 'Shift left' to clear the sign bit
|
|
1169
|
-
|
|
1648
|
+
|
|
1649
|
+
// 'Shift left' to clear the sign bit
|
|
1650
|
+
const VFromD<decltype(du)> vu = BitCast(du, v);
|
|
1651
|
+
const VFromD<decltype(du)> v2 = Add(vu, vu);
|
|
1652
|
+
// Check for exponent=max and mantissa=0.
|
|
1653
|
+
const VFromD<decltype(di)> max2 = Set(di, hwy::MaxExponentTimes2<T>());
|
|
1654
|
+
return RebindMask(d, Eq(v2, BitCast(du, max2)));
|
|
1170
1655
|
}
|
|
1171
1656
|
|
|
1172
1657
|
// Returns whether normal/subnormal/zero.
|
|
@@ -1187,147 +1672,135 @@ HWY_API svbool_t IsFinite(const V v) {
|
|
|
1187
1672
|
|
|
1188
1673
|
// ================================================== MEMORY
|
|
1189
1674
|
|
|
1190
|
-
// ------------------------------
|
|
1675
|
+
// ------------------------------ LoadU/MaskedLoad/LoadDup128/StoreU/Stream
|
|
1191
1676
|
|
|
1192
|
-
#define
|
|
1193
|
-
template <size_t N, int kPow2>
|
|
1194
|
-
HWY_API HWY_SVE_V(BASE, BITS)
|
|
1195
|
-
|
|
1196
|
-
|
|
1197
|
-
|
|
1198
|
-
|
|
1199
|
-
|
|
1677
|
+
#define HWY_SVE_MEM(BASE, CHAR, BITS, HALF, NAME, OP) \
|
|
1678
|
+
template <size_t N, int kPow2> \
|
|
1679
|
+
HWY_API HWY_SVE_V(BASE, BITS) \
|
|
1680
|
+
LoadU(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
|
|
1681
|
+
const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
|
|
1682
|
+
return svld1_##CHAR##BITS(detail::MakeMask(d), \
|
|
1683
|
+
detail::NativeLanePointer(p)); \
|
|
1684
|
+
} \
|
|
1685
|
+
template <size_t N, int kPow2> \
|
|
1686
|
+
HWY_API HWY_SVE_V(BASE, BITS) \
|
|
1687
|
+
MaskedLoad(svbool_t m, HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
|
|
1688
|
+
const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
|
|
1689
|
+
return svld1_##CHAR##BITS(m, detail::NativeLanePointer(p)); \
|
|
1690
|
+
} \
|
|
1691
|
+
template <size_t N, int kPow2> \
|
|
1692
|
+
HWY_API void StoreU(HWY_SVE_V(BASE, BITS) v, \
|
|
1693
|
+
HWY_SVE_D(BASE, BITS, N, kPow2) d, \
|
|
1694
|
+
HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
|
|
1695
|
+
svst1_##CHAR##BITS(detail::MakeMask(d), detail::NativeLanePointer(p), v); \
|
|
1696
|
+
} \
|
|
1697
|
+
template <size_t N, int kPow2> \
|
|
1698
|
+
HWY_API void Stream(HWY_SVE_V(BASE, BITS) v, \
|
|
1699
|
+
HWY_SVE_D(BASE, BITS, N, kPow2) d, \
|
|
1700
|
+
HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
|
|
1701
|
+
svstnt1_##CHAR##BITS(detail::MakeMask(d), detail::NativeLanePointer(p), \
|
|
1702
|
+
v); \
|
|
1703
|
+
} \
|
|
1704
|
+
template <size_t N, int kPow2> \
|
|
1705
|
+
HWY_API void BlendedStore(HWY_SVE_V(BASE, BITS) v, svbool_t m, \
|
|
1706
|
+
HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
|
|
1707
|
+
HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
|
|
1708
|
+
svst1_##CHAR##BITS(m, detail::NativeLanePointer(p), v); \
|
|
1200
1709
|
}
|
|
1201
1710
|
|
|
1202
|
-
|
|
1203
|
-
|
|
1204
|
-
HWY_API HWY_SVE_V(BASE, BITS) \
|
|
1205
|
-
NAME(svbool_t m, HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
|
|
1206
|
-
const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
|
|
1207
|
-
using T = detail::NativeLaneType<HWY_SVE_T(BASE, BITS)>; \
|
|
1208
|
-
return sv##OP##_##CHAR##BITS(m, reinterpret_cast<const T*>(p)); \
|
|
1209
|
-
}
|
|
1711
|
+
HWY_SVE_FOREACH(HWY_SVE_MEM, _, _)
|
|
1712
|
+
HWY_SVE_FOREACH_BF16(HWY_SVE_MEM, _, _)
|
|
1210
1713
|
|
|
1211
|
-
|
|
1212
|
-
|
|
1213
|
-
|
|
1214
|
-
|
|
1215
|
-
|
|
1216
|
-
using T = detail::NativeLaneType<HWY_SVE_T(BASE, BITS)>; \
|
|
1217
|
-
/* All-true predicate to load all 128 bits. */ \
|
|
1218
|
-
return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(8), \
|
|
1219
|
-
reinterpret_cast<const T*>(p)); \
|
|
1220
|
-
}
|
|
1714
|
+
template <class D, HWY_SVE_IF_EMULATED_D(D)>
|
|
1715
|
+
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
|
|
1716
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
1717
|
+
return BitCast(d, LoadU(du, detail::U16LanePointer(p)));
|
|
1718
|
+
}
|
|
1221
1719
|
|
|
1222
|
-
|
|
1223
|
-
|
|
1224
|
-
|
|
1225
|
-
|
|
1226
|
-
|
|
1227
|
-
using T = detail::NativeLaneType<HWY_SVE_T(BASE, BITS)>; \
|
|
1228
|
-
sv##OP##_##CHAR##BITS(detail::MakeMask(d), reinterpret_cast<T*>(p), v); \
|
|
1229
|
-
}
|
|
1720
|
+
template <class D, HWY_SVE_IF_EMULATED_D(D)>
|
|
1721
|
+
HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
|
|
1722
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
1723
|
+
StoreU(BitCast(du, v), du, detail::U16LanePointer(p));
|
|
1724
|
+
}
|
|
1230
1725
|
|
|
1231
|
-
|
|
1232
|
-
|
|
1233
|
-
|
|
1234
|
-
|
|
1235
|
-
|
|
1236
|
-
|
|
1237
|
-
|
|
1238
|
-
}
|
|
1726
|
+
template <class D, HWY_SVE_IF_EMULATED_D(D)>
|
|
1727
|
+
HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
|
|
1728
|
+
const TFromD<D>* HWY_RESTRICT p) {
|
|
1729
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
1730
|
+
return BitCast(d,
|
|
1731
|
+
MaskedLoad(RebindMask(du, m), du, detail::U16LanePointer(p)));
|
|
1732
|
+
}
|
|
1239
1733
|
|
|
1240
|
-
|
|
1241
|
-
HWY_SVE_FOREACH(HWY_SVE_MASKED_LOAD, MaskedLoad, ld1)
|
|
1242
|
-
HWY_SVE_FOREACH(HWY_SVE_STORE, Store, st1)
|
|
1243
|
-
HWY_SVE_FOREACH(HWY_SVE_STORE, Stream, stnt1)
|
|
1244
|
-
HWY_SVE_FOREACH(HWY_SVE_BLENDED_STORE, BlendedStore, st1)
|
|
1734
|
+
// MaskedLoadOr is generic and does not require emulation.
|
|
1245
1735
|
|
|
1246
|
-
|
|
1247
|
-
|
|
1248
|
-
|
|
1249
|
-
|
|
1250
|
-
|
|
1736
|
+
template <class D, HWY_SVE_IF_EMULATED_D(D)>
|
|
1737
|
+
HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
|
|
1738
|
+
TFromD<D>* HWY_RESTRICT p) {
|
|
1739
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
1740
|
+
BlendedStore(BitCast(du, v), RebindMask(du, m), du,
|
|
1741
|
+
detail::U16LanePointer(p));
|
|
1742
|
+
}
|
|
1743
|
+
|
|
1744
|
+
#undef HWY_SVE_MEM
|
|
1251
1745
|
|
|
1252
1746
|
#if HWY_TARGET != HWY_SVE2_128
|
|
1253
1747
|
namespace detail {
|
|
1254
|
-
|
|
1255
|
-
|
|
1256
|
-
|
|
1257
|
-
|
|
1258
|
-
|
|
1259
|
-
|
|
1260
|
-
|
|
1261
|
-
|
|
1262
|
-
|
|
1748
|
+
#define HWY_SVE_LOAD_DUP128(BASE, CHAR, BITS, HALF, NAME, OP) \
|
|
1749
|
+
template <size_t N, int kPow2> \
|
|
1750
|
+
HWY_API HWY_SVE_V(BASE, BITS) \
|
|
1751
|
+
NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
|
|
1752
|
+
const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
|
|
1753
|
+
/* All-true predicate to load all 128 bits. */ \
|
|
1754
|
+
return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(8), \
|
|
1755
|
+
detail::NativeLanePointer(p)); \
|
|
1756
|
+
}
|
|
1263
1757
|
|
|
1264
|
-
|
|
1758
|
+
HWY_SVE_FOREACH(HWY_SVE_LOAD_DUP128, LoadDupFull128, ld1rq)
|
|
1759
|
+
HWY_SVE_FOREACH_BF16(HWY_SVE_LOAD_DUP128, LoadDupFull128, ld1rq)
|
|
1265
1760
|
|
|
1266
|
-
template <
|
|
1267
|
-
HWY_API
|
|
1268
|
-
|
|
1269
|
-
return
|
|
1270
|
-
reinterpret_cast<const uint16_t * HWY_RESTRICT>(p));
|
|
1761
|
+
template <class D, HWY_SVE_IF_EMULATED_D(D)>
|
|
1762
|
+
HWY_API VFromD<D> LoadDupFull128(D d, const TFromD<D>* HWY_RESTRICT p) {
|
|
1763
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
1764
|
+
return BitCast(d, LoadDupFull128(du, detail::U16LanePointer(p)));
|
|
1271
1765
|
}
|
|
1272
1766
|
|
|
1273
|
-
|
|
1767
|
+
} // namespace detail
|
|
1768
|
+
#endif // HWY_TARGET != HWY_SVE2_128
|
|
1274
1769
|
|
|
1275
1770
|
#if HWY_TARGET == HWY_SVE2_128
|
|
1276
|
-
// On the HWY_SVE2_128 target, LoadDup128 is the same as
|
|
1771
|
+
// On the HWY_SVE2_128 target, LoadDup128 is the same as LoadU since vectors
|
|
1277
1772
|
// cannot exceed 16 bytes on the HWY_SVE2_128 target.
|
|
1278
1773
|
template <class D>
|
|
1279
1774
|
HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
|
|
1280
|
-
return
|
|
1775
|
+
return LoadU(d, p);
|
|
1281
1776
|
}
|
|
1282
1777
|
#else // HWY_TARGET != HWY_SVE2_128
|
|
1283
|
-
// If D().MaxBytes() <= 16 is true, simply do a
|
|
1778
|
+
// If D().MaxBytes() <= 16 is true, simply do a LoadU operation.
|
|
1284
1779
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
1285
1780
|
HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
|
|
1286
|
-
return
|
|
1781
|
+
return LoadU(d, p);
|
|
1287
1782
|
}
|
|
1288
1783
|
|
|
1289
1784
|
// If D().MaxBytes() > 16 is true, need to load the vector using ld1rq
|
|
1290
|
-
template <class D, HWY_IF_V_SIZE_GT_D(D, 16)
|
|
1291
|
-
hwy::EnableIf<!IsSame<TFromD<D>, bfloat16_t>()>* = nullptr>
|
|
1785
|
+
template <class D, HWY_IF_V_SIZE_GT_D(D, 16)>
|
|
1292
1786
|
HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
|
|
1293
1787
|
return detail::LoadDupFull128(d, p);
|
|
1294
1788
|
}
|
|
1295
1789
|
|
|
1296
|
-
#if !HWY_SVE_HAVE_BFLOAT16
|
|
1297
|
-
|
|
1298
|
-
template <class D, HWY_IF_V_SIZE_GT_D(D, 16), HWY_IF_BF16_D(D)>
|
|
1299
|
-
HWY_API VBF16 LoadDup128(D d, const bfloat16_t* HWY_RESTRICT p) {
|
|
1300
|
-
return detail::LoadDupFull128(
|
|
1301
|
-
RebindToUnsigned<decltype(d)>(),
|
|
1302
|
-
reinterpret_cast<const uint16_t * HWY_RESTRICT>(p));
|
|
1303
|
-
}
|
|
1304
|
-
#endif // !HWY_SVE_HAVE_BFLOAT16
|
|
1305
|
-
|
|
1306
1790
|
#endif // HWY_TARGET != HWY_SVE2_128
|
|
1307
1791
|
|
|
1308
|
-
|
|
1309
|
-
|
|
1310
|
-
template <size_t N, int kPow2>
|
|
1311
|
-
HWY_API void Store(VBF16 v, Simd<bfloat16_t, N, kPow2> d,
|
|
1312
|
-
bfloat16_t* HWY_RESTRICT p) {
|
|
1313
|
-
Store(v, RebindToUnsigned<decltype(d)>(),
|
|
1314
|
-
reinterpret_cast<uint16_t * HWY_RESTRICT>(p));
|
|
1315
|
-
}
|
|
1316
|
-
|
|
1317
|
-
#endif
|
|
1318
|
-
|
|
1319
|
-
// ------------------------------ Load/StoreU
|
|
1792
|
+
// ------------------------------ Load/Store
|
|
1320
1793
|
|
|
1321
1794
|
// SVE only requires lane alignment, not natural alignment of the entire
|
|
1322
|
-
// vector.
|
|
1795
|
+
// vector, so Load/Store are the same as LoadU/StoreU.
|
|
1323
1796
|
template <class D>
|
|
1324
|
-
HWY_API VFromD<D>
|
|
1325
|
-
return
|
|
1797
|
+
HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
|
|
1798
|
+
return LoadU(d, p);
|
|
1326
1799
|
}
|
|
1327
1800
|
|
|
1328
1801
|
template <class V, class D>
|
|
1329
|
-
HWY_API void
|
|
1330
|
-
|
|
1802
|
+
HWY_API void Store(const V v, D d, TFromD<D>* HWY_RESTRICT p) {
|
|
1803
|
+
StoreU(v, d, p);
|
|
1331
1804
|
}
|
|
1332
1805
|
|
|
1333
1806
|
// ------------------------------ MaskedLoadOr
|
|
@@ -1362,8 +1835,8 @@ HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
|
|
|
1362
1835
|
HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, svbool_t m, \
|
|
1363
1836
|
HWY_SVE_D(BASE, BITS, N, kPow2) /*d*/, \
|
|
1364
1837
|
HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \
|
|
1365
|
-
HWY_SVE_V(int, BITS)
|
|
1366
|
-
sv##OP##_s##BITS##index_##CHAR##BITS(m, base,
|
|
1838
|
+
HWY_SVE_V(int, BITS) indices) { \
|
|
1839
|
+
sv##OP##_s##BITS##index_##CHAR##BITS(m, base, indices, v); \
|
|
1367
1840
|
}
|
|
1368
1841
|
|
|
1369
1842
|
HWY_SVE_FOREACH_UIF3264(HWY_SVE_SCATTER_OFFSET, ScatterOffset, st1_scatter)
|
|
@@ -1398,10 +1871,13 @@ HWY_API void ScatterIndex(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p,
|
|
|
1398
1871
|
#define HWY_SVE_MASKED_GATHER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP) \
|
|
1399
1872
|
template <size_t N, int kPow2> \
|
|
1400
1873
|
HWY_API HWY_SVE_V(BASE, BITS) \
|
|
1401
|
-
NAME(svbool_t m, HWY_SVE_D(BASE, BITS, N, kPow2)
|
|
1874
|
+
NAME(svbool_t m, HWY_SVE_D(BASE, BITS, N, kPow2) d, \
|
|
1402
1875
|
const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \
|
|
1403
|
-
HWY_SVE_V(int, BITS)
|
|
1404
|
-
|
|
1876
|
+
HWY_SVE_V(int, BITS) indices) { \
|
|
1877
|
+
const RebindToSigned<decltype(d)> di; \
|
|
1878
|
+
(void)di; /* for HWY_DASSERT */ \
|
|
1879
|
+
HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di)))); \
|
|
1880
|
+
return sv##OP##_s##BITS##index_##CHAR##BITS(m, base, indices); \
|
|
1405
1881
|
}
|
|
1406
1882
|
|
|
1407
1883
|
HWY_SVE_FOREACH_UIF3264(HWY_SVE_GATHER_OFFSET, GatherOffset, ld1_gather)
|
|
@@ -1410,6 +1886,13 @@ HWY_SVE_FOREACH_UIF3264(HWY_SVE_MASKED_GATHER_INDEX, MaskedGatherIndex,
|
|
|
1410
1886
|
#undef HWY_SVE_GATHER_OFFSET
|
|
1411
1887
|
#undef HWY_SVE_MASKED_GATHER_INDEX
|
|
1412
1888
|
|
|
1889
|
+
template <class D>
|
|
1890
|
+
HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, svbool_t m, D d,
|
|
1891
|
+
const TFromD<D>* HWY_RESTRICT p,
|
|
1892
|
+
VFromD<RebindToSigned<D>> indices) {
|
|
1893
|
+
return IfThenElse(m, MaskedGatherIndex(m, d, p, indices), no);
|
|
1894
|
+
}
|
|
1895
|
+
|
|
1413
1896
|
template <class D>
|
|
1414
1897
|
HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT p,
|
|
1415
1898
|
VFromD<RebindToSigned<D>> indices) {
|
|
@@ -1430,8 +1913,8 @@ HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT p,
|
|
|
1430
1913
|
HWY_API void NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
|
|
1431
1914
|
const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned, \
|
|
1432
1915
|
HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1) { \
|
|
1433
|
-
const HWY_SVE_TUPLE(BASE, BITS, 2) tuple =
|
|
1434
|
-
|
|
1916
|
+
const HWY_SVE_TUPLE(BASE, BITS, 2) tuple = sv##OP##_##CHAR##BITS( \
|
|
1917
|
+
detail::MakeMask(d), detail::NativeLanePointer(unaligned)); \
|
|
1435
1918
|
v0 = svget2(tuple, 0); \
|
|
1436
1919
|
v1 = svget2(tuple, 1); \
|
|
1437
1920
|
}
|
|
@@ -1447,8 +1930,8 @@ HWY_SVE_FOREACH(HWY_SVE_LOAD2, LoadInterleaved2, ld2)
|
|
|
1447
1930
|
const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned, \
|
|
1448
1931
|
HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1, \
|
|
1449
1932
|
HWY_SVE_V(BASE, BITS) & v2) { \
|
|
1450
|
-
const HWY_SVE_TUPLE(BASE, BITS, 3) tuple =
|
|
1451
|
-
|
|
1933
|
+
const HWY_SVE_TUPLE(BASE, BITS, 3) tuple = sv##OP##_##CHAR##BITS( \
|
|
1934
|
+
detail::MakeMask(d), detail::NativeLanePointer(unaligned)); \
|
|
1452
1935
|
v0 = svget3(tuple, 0); \
|
|
1453
1936
|
v1 = svget3(tuple, 1); \
|
|
1454
1937
|
v2 = svget3(tuple, 2); \
|
|
@@ -1465,8 +1948,8 @@ HWY_SVE_FOREACH(HWY_SVE_LOAD3, LoadInterleaved3, ld3)
|
|
|
1465
1948
|
const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned, \
|
|
1466
1949
|
HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1, \
|
|
1467
1950
|
HWY_SVE_V(BASE, BITS) & v2, HWY_SVE_V(BASE, BITS) & v3) { \
|
|
1468
|
-
const HWY_SVE_TUPLE(BASE, BITS, 4) tuple =
|
|
1469
|
-
|
|
1951
|
+
const HWY_SVE_TUPLE(BASE, BITS, 4) tuple = sv##OP##_##CHAR##BITS( \
|
|
1952
|
+
detail::MakeMask(d), detail::NativeLanePointer(unaligned)); \
|
|
1470
1953
|
v0 = svget4(tuple, 0); \
|
|
1471
1954
|
v1 = svget4(tuple, 1); \
|
|
1472
1955
|
v2 = svget4(tuple, 2); \
|
|
@@ -1478,12 +1961,14 @@ HWY_SVE_FOREACH(HWY_SVE_LOAD4, LoadInterleaved4, ld4)
|
|
|
1478
1961
|
|
|
1479
1962
|
// ------------------------------ StoreInterleaved2
|
|
1480
1963
|
|
|
1481
|
-
#define HWY_SVE_STORE2(BASE, CHAR, BITS, HALF, NAME, OP)
|
|
1482
|
-
template <size_t N, int kPow2>
|
|
1483
|
-
HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1,
|
|
1484
|
-
HWY_SVE_D(BASE, BITS, N, kPow2) d,
|
|
1485
|
-
HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) {
|
|
1486
|
-
sv##OP##_##CHAR##BITS(detail::MakeMask(d),
|
|
1964
|
+
#define HWY_SVE_STORE2(BASE, CHAR, BITS, HALF, NAME, OP) \
|
|
1965
|
+
template <size_t N, int kPow2> \
|
|
1966
|
+
HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \
|
|
1967
|
+
HWY_SVE_D(BASE, BITS, N, kPow2) d, \
|
|
1968
|
+
HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \
|
|
1969
|
+
sv##OP##_##CHAR##BITS(detail::MakeMask(d), \
|
|
1970
|
+
detail::NativeLanePointer(unaligned), \
|
|
1971
|
+
Create2(d, v0, v1)); \
|
|
1487
1972
|
}
|
|
1488
1973
|
HWY_SVE_FOREACH(HWY_SVE_STORE2, StoreInterleaved2, st2)
|
|
1489
1974
|
|
|
@@ -1497,7 +1982,8 @@ HWY_SVE_FOREACH(HWY_SVE_STORE2, StoreInterleaved2, st2)
|
|
|
1497
1982
|
HWY_SVE_V(BASE, BITS) v2, \
|
|
1498
1983
|
HWY_SVE_D(BASE, BITS, N, kPow2) d, \
|
|
1499
1984
|
HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \
|
|
1500
|
-
sv##OP##_##CHAR##BITS(detail::MakeMask(d),
|
|
1985
|
+
sv##OP##_##CHAR##BITS(detail::MakeMask(d), \
|
|
1986
|
+
detail::NativeLanePointer(unaligned), \
|
|
1501
1987
|
Create3(d, v0, v1, v2)); \
|
|
1502
1988
|
}
|
|
1503
1989
|
HWY_SVE_FOREACH(HWY_SVE_STORE3, StoreInterleaved3, st3)
|
|
@@ -1512,7 +1998,8 @@ HWY_SVE_FOREACH(HWY_SVE_STORE3, StoreInterleaved3, st3)
|
|
|
1512
1998
|
HWY_SVE_V(BASE, BITS) v2, HWY_SVE_V(BASE, BITS) v3, \
|
|
1513
1999
|
HWY_SVE_D(BASE, BITS, N, kPow2) d, \
|
|
1514
2000
|
HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \
|
|
1515
|
-
sv##OP##_##CHAR##BITS(detail::MakeMask(d),
|
|
2001
|
+
sv##OP##_##CHAR##BITS(detail::MakeMask(d), \
|
|
2002
|
+
detail::NativeLanePointer(unaligned), \
|
|
1516
2003
|
Create4(d, v0, v1, v2, v3)); \
|
|
1517
2004
|
}
|
|
1518
2005
|
HWY_SVE_FOREACH(HWY_SVE_STORE4, StoreInterleaved4, st4)
|
|
@@ -1602,6 +2089,22 @@ HWY_API svfloat32_t PromoteTo(Simd<float32_t, N, kPow2> /* d */,
|
|
|
1602
2089
|
return svcvt_f32_f16_x(detail::PTrue(Simd<float16_t, N, kPow2>()), vv);
|
|
1603
2090
|
}
|
|
1604
2091
|
|
|
2092
|
+
#ifdef HWY_NATIVE_PROMOTE_F16_TO_F64
|
|
2093
|
+
#undef HWY_NATIVE_PROMOTE_F16_TO_F64
|
|
2094
|
+
#else
|
|
2095
|
+
#define HWY_NATIVE_PROMOTE_F16_TO_F64
|
|
2096
|
+
#endif
|
|
2097
|
+
|
|
2098
|
+
template <size_t N, int kPow2>
|
|
2099
|
+
HWY_API svfloat64_t PromoteTo(Simd<float64_t, N, kPow2> /* d */,
|
|
2100
|
+
const svfloat16_t v) {
|
|
2101
|
+
// svcvt* expects inputs in even lanes, whereas Highway wants lower lanes, so
|
|
2102
|
+
// first replicate each lane once.
|
|
2103
|
+
const svfloat16_t vv = detail::ZipLowerSame(v, v);
|
|
2104
|
+
return svcvt_f64_f16_x(detail::PTrue(Simd<float16_t, N, kPow2>()),
|
|
2105
|
+
detail::ZipLowerSame(vv, vv));
|
|
2106
|
+
}
|
|
2107
|
+
|
|
1605
2108
|
template <size_t N, int kPow2>
|
|
1606
2109
|
HWY_API svfloat64_t PromoteTo(Simd<float64_t, N, kPow2> /* d */,
|
|
1607
2110
|
const svfloat32_t v) {
|
|
@@ -1637,19 +2140,43 @@ HWY_API svuint64_t PromoteTo(Simd<uint64_t, N, kPow2> /* d */,
|
|
|
1637
2140
|
return svcvt_u64_f32_x(detail::PTrue(Simd<float, N, kPow2>()), vv);
|
|
1638
2141
|
}
|
|
1639
2142
|
|
|
1640
|
-
//
|
|
2143
|
+
// ------------------------------ PromoteUpperTo
|
|
2144
|
+
|
|
1641
2145
|
namespace detail {
|
|
2146
|
+
HWY_SVE_FOREACH_UI16(HWY_SVE_PROMOTE_TO, PromoteUpperTo, unpkhi)
|
|
1642
2147
|
HWY_SVE_FOREACH_UI32(HWY_SVE_PROMOTE_TO, PromoteUpperTo, unpkhi)
|
|
2148
|
+
HWY_SVE_FOREACH_UI64(HWY_SVE_PROMOTE_TO, PromoteUpperTo, unpkhi)
|
|
1643
2149
|
#undef HWY_SVE_PROMOTE_TO
|
|
2150
|
+
} // namespace detail
|
|
1644
2151
|
|
|
1645
|
-
|
|
1646
|
-
|
|
1647
|
-
|
|
1648
|
-
|
|
1649
|
-
|
|
2152
|
+
#ifdef HWY_NATIVE_PROMOTE_UPPER_TO
|
|
2153
|
+
#undef HWY_NATIVE_PROMOTE_UPPER_TO
|
|
2154
|
+
#else
|
|
2155
|
+
#define HWY_NATIVE_PROMOTE_UPPER_TO
|
|
2156
|
+
#endif
|
|
2157
|
+
|
|
2158
|
+
// Unsigned->Unsigned or Signed->Signed
|
|
2159
|
+
template <class D, class V, typename TD = TFromD<D>, typename TV = TFromV<V>,
|
|
2160
|
+
hwy::EnableIf<IsInteger<TD>() && IsInteger<TV>() &&
|
|
2161
|
+
(IsSigned<TD>() == IsSigned<TV>())>* = nullptr>
|
|
2162
|
+
HWY_API VFromD<D> PromoteUpperTo(D d, V v) {
|
|
2163
|
+
if (detail::IsFull(d)) {
|
|
2164
|
+
return detail::PromoteUpperTo(d, v);
|
|
2165
|
+
}
|
|
2166
|
+
const Rebind<TFromV<V>, decltype(d)> dh;
|
|
2167
|
+
return PromoteTo(d, UpperHalf(dh, v));
|
|
1650
2168
|
}
|
|
1651
2169
|
|
|
1652
|
-
|
|
2170
|
+
// Differing signs or either is float
|
|
2171
|
+
template <class D, class V, typename TD = TFromD<D>, typename TV = TFromV<V>,
|
|
2172
|
+
hwy::EnableIf<!IsInteger<TD>() || !IsInteger<TV>() ||
|
|
2173
|
+
(IsSigned<TD>() != IsSigned<TV>())>* = nullptr>
|
|
2174
|
+
HWY_API VFromD<D> PromoteUpperTo(D d, V v) {
|
|
2175
|
+
// Lanes(d) may differ from Lanes(DFromV<V>()). Use the lane type from V
|
|
2176
|
+
// because it cannot be deduced from D (could be either bf16 or f16).
|
|
2177
|
+
const Rebind<TFromV<V>, decltype(d)> dh;
|
|
2178
|
+
return PromoteTo(d, UpperHalf(dh, v));
|
|
2179
|
+
}
|
|
1653
2180
|
|
|
1654
2181
|
// ------------------------------ DemoteTo U
|
|
1655
2182
|
|
|
@@ -1972,9 +2499,13 @@ namespace detail {
|
|
|
1972
2499
|
}
|
|
1973
2500
|
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenFull, uzp1)
|
|
1974
2501
|
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddFull, uzp2)
|
|
2502
|
+
HWY_SVE_FOREACH_BF16(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenFull, uzp1)
|
|
2503
|
+
HWY_SVE_FOREACH_BF16(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddFull, uzp2)
|
|
1975
2504
|
#if defined(__ARM_FEATURE_SVE_MATMUL_FP64)
|
|
1976
2505
|
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenBlocks, uzp1q)
|
|
1977
2506
|
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks, uzp2q)
|
|
2507
|
+
HWY_SVE_FOREACH_BF16(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenBlocks, uzp1q)
|
|
2508
|
+
HWY_SVE_FOREACH_BF16(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks, uzp2q)
|
|
1978
2509
|
#endif
|
|
1979
2510
|
#undef HWY_SVE_CONCAT_EVERY_SECOND
|
|
1980
2511
|
|
|
@@ -1986,6 +2517,16 @@ HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks, uzp2q)
|
|
|
1986
2517
|
return sv##OP##_##CHAR##BITS(mask, lo, hi); \
|
|
1987
2518
|
}
|
|
1988
2519
|
HWY_SVE_FOREACH(HWY_SVE_SPLICE, Splice, splice)
|
|
2520
|
+
#if HWY_SVE_HAVE_BF16_FEATURE
|
|
2521
|
+
HWY_SVE_FOREACH_BF16(HWY_SVE_SPLICE, Splice, splice)
|
|
2522
|
+
#else
|
|
2523
|
+
template <class V, HWY_IF_BF16_D(DFromV<V>)>
|
|
2524
|
+
HWY_INLINE V Splice(V hi, V lo, svbool_t mask) {
|
|
2525
|
+
const DFromV<V> d;
|
|
2526
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
2527
|
+
return BitCast(d, Splice(BitCast(du, hi), BitCast(du, lo), mask));
|
|
2528
|
+
}
|
|
2529
|
+
#endif // HWY_SVE_HAVE_BF16_FEATURE
|
|
1989
2530
|
#undef HWY_SVE_SPLICE
|
|
1990
2531
|
|
|
1991
2532
|
} // namespace detail
|
|
@@ -2021,6 +2562,20 @@ HWY_API svfloat16_t DemoteTo(Simd<float16_t, N, kPow2> d, const svfloat32_t v) {
|
|
|
2021
2562
|
in_even); // lower half
|
|
2022
2563
|
}
|
|
2023
2564
|
|
|
2565
|
+
#ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
|
|
2566
|
+
#undef HWY_NATIVE_DEMOTE_F64_TO_F16
|
|
2567
|
+
#else
|
|
2568
|
+
#define HWY_NATIVE_DEMOTE_F64_TO_F16
|
|
2569
|
+
#endif
|
|
2570
|
+
|
|
2571
|
+
template <size_t N, int kPow2>
|
|
2572
|
+
HWY_API svfloat16_t DemoteTo(Simd<float16_t, N, kPow2> d, const svfloat64_t v) {
|
|
2573
|
+
const svfloat16_t in_lo16 = svcvt_f16_f64_x(detail::PTrue(d), v);
|
|
2574
|
+
const svfloat16_t in_even = detail::ConcatEvenFull(in_lo16, in_lo16);
|
|
2575
|
+
return detail::ConcatEvenFull(in_even,
|
|
2576
|
+
in_even); // lower half
|
|
2577
|
+
}
|
|
2578
|
+
|
|
2024
2579
|
template <size_t N, int kPow2>
|
|
2025
2580
|
HWY_API VBF16 DemoteTo(Simd<bfloat16_t, N, kPow2> dbf16, svfloat32_t v) {
|
|
2026
2581
|
const svuint16_t in_even = BitCast(ScalableTag<uint16_t>(), v);
|
|
@@ -2103,20 +2658,22 @@ HWY_API VFromD<DI> NearestInt(VF v) {
|
|
|
2103
2658
|
|
|
2104
2659
|
// ------------------------------ Iota (Add, ConvertTo)
|
|
2105
2660
|
|
|
2106
|
-
#define HWY_SVE_IOTA(BASE, CHAR, BITS, HALF, NAME, OP)
|
|
2107
|
-
template <size_t N, int kPow2>
|
|
2108
|
-
HWY_API HWY_SVE_V(BASE, BITS)
|
|
2109
|
-
|
|
2110
|
-
return sv##OP##_##CHAR##BITS(
|
|
2661
|
+
#define HWY_SVE_IOTA(BASE, CHAR, BITS, HALF, NAME, OP) \
|
|
2662
|
+
template <size_t N, int kPow2, typename T2> \
|
|
2663
|
+
HWY_API HWY_SVE_V(BASE, BITS) \
|
|
2664
|
+
NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, T2 first) { \
|
|
2665
|
+
return sv##OP##_##CHAR##BITS( \
|
|
2666
|
+
ConvertScalarTo<HWY_SVE_T(BASE, BITS)>(first), 1); \
|
|
2111
2667
|
}
|
|
2112
2668
|
|
|
2113
2669
|
HWY_SVE_FOREACH_UI(HWY_SVE_IOTA, Iota, index)
|
|
2114
2670
|
#undef HWY_SVE_IOTA
|
|
2115
2671
|
|
|
2116
|
-
template <class D, HWY_IF_FLOAT_D(D)>
|
|
2117
|
-
HWY_API VFromD<D> Iota(const D d,
|
|
2672
|
+
template <class D, typename T2, HWY_IF_FLOAT_D(D)>
|
|
2673
|
+
HWY_API VFromD<D> Iota(const D d, T2 first) {
|
|
2118
2674
|
const RebindToSigned<D> di;
|
|
2119
|
-
return detail::AddN(ConvertTo(d, Iota(di, 0)),
|
|
2675
|
+
return detail::AddN(ConvertTo(d, Iota(di, 0)),
|
|
2676
|
+
ConvertScalarTo<TFromD<D>>(first));
|
|
2120
2677
|
}
|
|
2121
2678
|
|
|
2122
2679
|
// ------------------------------ InterleaveLower
|
|
@@ -2147,12 +2704,10 @@ HWY_API V InterleaveLower(const V a, const V b) {
|
|
|
2147
2704
|
|
|
2148
2705
|
// Only use zip2 if vectors are a power of two, otherwise getting the actual
|
|
2149
2706
|
// "upper half" requires MaskUpperHalf.
|
|
2150
|
-
#if HWY_TARGET == HWY_SVE2_128
|
|
2151
2707
|
namespace detail {
|
|
2152
2708
|
// Unlike Highway's ZipUpper, this returns the same type.
|
|
2153
2709
|
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, ZipUpperSame, zip2)
|
|
2154
2710
|
} // namespace detail
|
|
2155
|
-
#endif
|
|
2156
2711
|
|
|
2157
2712
|
// Full vector: guaranteed to have at least one block
|
|
2158
2713
|
template <class D, class V = VFromD<D>,
|
|
@@ -2184,6 +2739,30 @@ HWY_API V InterleaveUpper(D d, const V a, const V b) {
|
|
|
2184
2739
|
return InterleaveUpper(DFromV<V>(), a, b);
|
|
2185
2740
|
}
|
|
2186
2741
|
|
|
2742
|
+
// ------------------------------ InterleaveWholeLower
|
|
2743
|
+
#ifdef HWY_NATIVE_INTERLEAVE_WHOLE
|
|
2744
|
+
#undef HWY_NATIVE_INTERLEAVE_WHOLE
|
|
2745
|
+
#else
|
|
2746
|
+
#define HWY_NATIVE_INTERLEAVE_WHOLE
|
|
2747
|
+
#endif
|
|
2748
|
+
|
|
2749
|
+
template <class D>
|
|
2750
|
+
HWY_API VFromD<D> InterleaveWholeLower(D /*d*/, VFromD<D> a, VFromD<D> b) {
|
|
2751
|
+
return detail::ZipLowerSame(a, b);
|
|
2752
|
+
}
|
|
2753
|
+
|
|
2754
|
+
// ------------------------------ InterleaveWholeUpper
|
|
2755
|
+
|
|
2756
|
+
template <class D>
|
|
2757
|
+
HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
|
|
2758
|
+
if (HWY_SVE_IS_POW2 && detail::IsFull(d)) {
|
|
2759
|
+
return detail::ZipUpperSame(a, b);
|
|
2760
|
+
}
|
|
2761
|
+
|
|
2762
|
+
const Half<decltype(d)> d2;
|
|
2763
|
+
return InterleaveWholeLower(d, UpperHalf(d2, a), UpperHalf(d2, b));
|
|
2764
|
+
}
|
|
2765
|
+
|
|
2187
2766
|
// ------------------------------ Per4LaneBlockShuffle
|
|
2188
2767
|
|
|
2189
2768
|
namespace detail {
|
|
@@ -2432,7 +3011,13 @@ HWY_API V UpperHalf(const DH dh, const V v) {
|
|
|
2432
3011
|
|
|
2433
3012
|
// ================================================== REDUCE
|
|
2434
3013
|
|
|
2435
|
-
|
|
3014
|
+
#ifdef HWY_NATIVE_REDUCE_SCALAR
|
|
3015
|
+
#undef HWY_NATIVE_REDUCE_SCALAR
|
|
3016
|
+
#else
|
|
3017
|
+
#define HWY_NATIVE_REDUCE_SCALAR
|
|
3018
|
+
#endif
|
|
3019
|
+
|
|
3020
|
+
// These return T, suitable for ReduceSum.
|
|
2436
3021
|
namespace detail {
|
|
2437
3022
|
#define HWY_SVE_REDUCE_ADD(BASE, CHAR, BITS, HALF, NAME, OP) \
|
|
2438
3023
|
HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) { \
|
|
@@ -2462,24 +3047,53 @@ HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MaxOfLanesM, maxnmv)
|
|
|
2462
3047
|
#undef HWY_SVE_REDUCE_ADD
|
|
2463
3048
|
} // namespace detail
|
|
2464
3049
|
|
|
2465
|
-
|
|
2466
|
-
|
|
2467
|
-
|
|
2468
|
-
|
|
3050
|
+
// detail::SumOfLanesM, detail::MinOfLanesM, and detail::MaxOfLanesM are more
|
|
3051
|
+
// efficient for N=4 I8/U8 reductions on SVE than the default implementations
|
|
3052
|
+
// of the N=4 I8/U8 ReduceSum/ReduceMin/ReduceMax operations in
|
|
3053
|
+
// generic_ops-inl.h
|
|
3054
|
+
#undef HWY_IF_REDUCE_D
|
|
3055
|
+
#define HWY_IF_REDUCE_D(D) hwy::EnableIf<HWY_MAX_LANES_D(D) != 1>* = nullptr
|
|
2469
3056
|
|
|
2470
|
-
|
|
2471
|
-
|
|
3057
|
+
#ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
|
|
3058
|
+
#undef HWY_NATIVE_REDUCE_SUM_4_UI8
|
|
3059
|
+
#else
|
|
3060
|
+
#define HWY_NATIVE_REDUCE_SUM_4_UI8
|
|
3061
|
+
#endif
|
|
3062
|
+
|
|
3063
|
+
#ifdef HWY_NATIVE_REDUCE_MINMAX_4_UI8
|
|
3064
|
+
#undef HWY_NATIVE_REDUCE_MINMAX_4_UI8
|
|
3065
|
+
#else
|
|
3066
|
+
#define HWY_NATIVE_REDUCE_MINMAX_4_UI8
|
|
3067
|
+
#endif
|
|
3068
|
+
|
|
3069
|
+
template <class D, HWY_IF_REDUCE_D(D)>
|
|
3070
|
+
HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
|
|
2472
3071
|
return detail::SumOfLanesM(detail::MakeMask(d), v);
|
|
2473
3072
|
}
|
|
2474
3073
|
|
|
2475
|
-
template <class D,
|
|
2476
|
-
|
|
2477
|
-
return
|
|
3074
|
+
template <class D, HWY_IF_REDUCE_D(D)>
|
|
3075
|
+
HWY_API TFromD<D> ReduceMin(D d, VFromD<D> v) {
|
|
3076
|
+
return detail::MinOfLanesM(detail::MakeMask(d), v);
|
|
2478
3077
|
}
|
|
2479
3078
|
|
|
2480
|
-
template <class D,
|
|
2481
|
-
|
|
2482
|
-
return
|
|
3079
|
+
template <class D, HWY_IF_REDUCE_D(D)>
|
|
3080
|
+
HWY_API TFromD<D> ReduceMax(D d, VFromD<D> v) {
|
|
3081
|
+
return detail::MaxOfLanesM(detail::MakeMask(d), v);
|
|
3082
|
+
}
|
|
3083
|
+
|
|
3084
|
+
// ------------------------------ SumOfLanes/MinOfLanes/MaxOfLanes
|
|
3085
|
+
|
|
3086
|
+
template <class D, HWY_IF_LANES_GT_D(D, 1)>
|
|
3087
|
+
HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
|
|
3088
|
+
return Set(d, ReduceSum(d, v));
|
|
3089
|
+
}
|
|
3090
|
+
template <class D, HWY_IF_LANES_GT_D(D, 1)>
|
|
3091
|
+
HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
|
|
3092
|
+
return Set(d, ReduceMin(d, v));
|
|
3093
|
+
}
|
|
3094
|
+
template <class D, HWY_IF_LANES_GT_D(D, 1)>
|
|
3095
|
+
HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
|
|
3096
|
+
return Set(d, ReduceMax(d, v));
|
|
2483
3097
|
}
|
|
2484
3098
|
|
|
2485
3099
|
// ================================================== SWIZZLE
|
|
@@ -2513,7 +3127,9 @@ HWY_API TFromV<V> ExtractLane(V v, size_t i) {
|
|
|
2513
3127
|
template <class V>
|
|
2514
3128
|
HWY_API V InsertLane(const V v, size_t i, TFromV<V> t) {
|
|
2515
3129
|
const DFromV<V> d;
|
|
2516
|
-
const
|
|
3130
|
+
const RebindToSigned<decltype(d)> di;
|
|
3131
|
+
using TI = TFromD<decltype(di)>;
|
|
3132
|
+
const svbool_t is_i = detail::EqN(Iota(di, 0), static_cast<TI>(i));
|
|
2517
3133
|
return IfThenElse(RebindMask(d, is_i), Set(d, t), v);
|
|
2518
3134
|
}
|
|
2519
3135
|
|
|
@@ -2623,6 +3239,7 @@ HWY_API VFromD<RebindToUnsigned<D>> SetTableIndices(D d, const TI* idx) {
|
|
|
2623
3239
|
}
|
|
2624
3240
|
|
|
2625
3241
|
HWY_SVE_FOREACH(HWY_SVE_TABLE, TableLookupLanes, tbl)
|
|
3242
|
+
HWY_SVE_FOREACH_BF16(HWY_SVE_TABLE, TableLookupLanes, tbl)
|
|
2626
3243
|
#undef HWY_SVE_TABLE
|
|
2627
3244
|
|
|
2628
3245
|
#if HWY_SVE_HAVE_2
|
|
@@ -2634,6 +3251,7 @@ namespace detail {
|
|
|
2634
3251
|
}
|
|
2635
3252
|
|
|
2636
3253
|
HWY_SVE_FOREACH(HWY_SVE_TABLE2, NativeTwoTableLookupLanes, tbl2)
|
|
3254
|
+
HWY_SVE_FOREACH_BF16(HWY_SVE_TABLE2, NativeTwoTableLookupLanes, tbl2)
|
|
2637
3255
|
#undef HWY_SVE_TABLE2  // done instantiating NativeTwoTableLookupLanes
|
|
2638
3256
|
} // namespace detail
|
|
2639
3257
|
#endif // HWY_SVE_HAVE_2
|
|
@@ -2705,6 +3323,7 @@ namespace detail {
|
|
|
2705
3323
|
}
|
|
2706
3324
|
|
|
2707
3325
|
HWY_SVE_FOREACH(HWY_SVE_REVERSE, ReverseFull, rev)
|
|
3326
|
+
HWY_SVE_FOREACH_BF16(HWY_SVE_REVERSE, ReverseFull, rev)
|
|
2708
3327
|
#undef HWY_SVE_REVERSE
|
|
2709
3328
|
|
|
2710
3329
|
} // namespace detail
|
|
@@ -2775,14 +3394,14 @@ HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) { // 3210
|
|
|
2775
3394
|
template <class D, HWY_IF_T_SIZE_D(D, 1)>
|
|
2776
3395
|
HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) {
|
|
2777
3396
|
const RebindToUnsigned<decltype(d)> du;
|
|
2778
|
-
const
|
|
3397
|
+
const RepartitionToWideX2<decltype(du)> du32;
|
|
2779
3398
|
return BitCast(d, svrevb_u32_x(detail::PTrue(d), BitCast(du32, v)));
|
|
2780
3399
|
}
|
|
2781
3400
|
|
|
2782
3401
|
template <class D, HWY_IF_T_SIZE_D(D, 2)>
|
|
2783
3402
|
HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) {
|
|
2784
3403
|
const RebindToUnsigned<decltype(d)> du;
|
|
2785
|
-
const
|
|
3404
|
+
const RepartitionToWideX2<decltype(du)> du64;
|
|
2786
3405
|
return BitCast(d, svrevh_u64_x(detail::PTrue(d), BitCast(du64, v)));
|
|
2787
3406
|
}
|
|
2788
3407
|
|
|
@@ -2943,20 +3562,23 @@ HWY_API V BroadcastBlock(V v) {
|
|
|
2943
3562
|
static_assert(0 <= kBlockIdx && kBlockIdx < d.MaxBlocks(),
|
|
2944
3563
|
"Invalid block index");
|
|
2945
3564
|
|
|
3565
|
+
const RebindToUnsigned<decltype(d)> du; // for bfloat16_t
|
|
3566
|
+
using VU = VFromD<decltype(du)>;
|
|
3567
|
+
const VU vu = BitCast(du, v);
|
|
3568
|
+
|
|
2946
3569
|
#if HWY_TARGET == HWY_SVE_256
|
|
2947
|
-
return (kBlockIdx == 0) ? ConcatLowerLower(
|
|
2948
|
-
|
|
3570
|
+
return BitCast(d, (kBlockIdx == 0) ? ConcatLowerLower(du, vu, vu)
|
|
3571
|
+
: ConcatUpperUpper(du, vu, vu));
|
|
2949
3572
|
#else
|
|
2950
|
-
const RebindToUnsigned<decltype(d)> du;
|
|
2951
3573
|
using TU = TFromD<decltype(du)>;
|
|
2952
3574
|
constexpr size_t kLanesPerBlock = detail::LanesPerBlock(d);
|
|
2953
3575
|
constexpr size_t kBlockOffset =
|
|
2954
3576
|
static_cast<size_t>(kBlockIdx) * kLanesPerBlock;
|
|
2955
3577
|
|
|
2956
|
-
const
|
|
3578
|
+
const VU idx = detail::AddN(
|
|
2957
3579
|
detail::AndN(Iota(du, TU{0}), static_cast<TU>(kLanesPerBlock - 1)),
|
|
2958
3580
|
static_cast<TU>(kBlockOffset));
|
|
2959
|
-
return TableLookupLanes(
|
|
3581
|
+
return BitCast(d, TableLookupLanes(vu, idx));
|
|
2960
3582
|
#endif
|
|
2961
3583
|
}
|
|
2962
3584
|
|
|
@@ -3462,6 +4084,126 @@ HWY_API svfloat32_t PromoteTo(Simd<float32_t, N, kPow2> df32, VBF16 v) {
|
|
|
3462
4084
|
return BitCast(df32, detail::ZipLowerSame(svdup_n_u16(0), BitCast(du16, v)));
|
|
3463
4085
|
}
|
|
3464
4086
|
|
|
4087
|
+
// ------------------------------ PromoteEvenTo/PromoteOddTo (DupOdd)
|
|
4088
|
+
|
|
4089
|
+
namespace detail {
|
|
4090
|
+
|
|
4091
|
+
// Signed to signed PromoteEvenTo
|
|
4092
|
+
template <class D>
|
|
4093
|
+
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
|
|
4094
|
+
hwy::SizeTag<2> /*to_lane_size_tag*/,
|
|
4095
|
+
hwy::SignedTag /*from_type_tag*/, D d_to,
|
|
4096
|
+
svint8_t v) {
|
|
4097
|
+
return svextb_s16_x(detail::PTrue(d_to), BitCast(d_to, v));
|
|
4098
|
+
}
|
|
4099
|
+
|
|
4100
|
+
template <class D>
|
|
4101
|
+
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
|
|
4102
|
+
hwy::SizeTag<4> /*to_lane_size_tag*/,
|
|
4103
|
+
hwy::SignedTag /*from_type_tag*/, D d_to,
|
|
4104
|
+
svint16_t v) {
|
|
4105
|
+
return svexth_s32_x(detail::PTrue(d_to), BitCast(d_to, v));
|
|
4106
|
+
}
|
|
4107
|
+
|
|
4108
|
+
template <class D>
|
|
4109
|
+
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
|
|
4110
|
+
hwy::SizeTag<8> /*to_lane_size_tag*/,
|
|
4111
|
+
hwy::SignedTag /*from_type_tag*/, D d_to,
|
|
4112
|
+
svint32_t v) {
|
|
4113
|
+
return svextw_s64_x(detail::PTrue(d_to), BitCast(d_to, v));
|
|
4114
|
+
}
|
|
4115
|
+
|
|
4116
|
+
// F16->F32 PromoteEvenTo
|
|
4117
|
+
template <class D>
|
|
4118
|
+
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
|
|
4119
|
+
hwy::SizeTag<4> /*to_lane_size_tag*/,
|
|
4120
|
+
hwy::FloatTag /*from_type_tag*/, D d_to,
|
|
4121
|
+
svfloat16_t v) {
|
|
4122
|
+
const Repartition<float, decltype(d_to)> d_from;
|
|
4123
|
+
return svcvt_f32_f16_x(detail::PTrue(d_from), v);
|
|
4124
|
+
}
|
|
4125
|
+
|
|
4126
|
+
// F32->F64 PromoteEvenTo
|
|
4127
|
+
template <class D>
|
|
4128
|
+
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
|
|
4129
|
+
hwy::SizeTag<8> /*to_lane_size_tag*/,
|
|
4130
|
+
hwy::FloatTag /*from_type_tag*/, D d_to,
|
|
4131
|
+
svfloat32_t v) {
|
|
4132
|
+
const Repartition<float, decltype(d_to)> d_from;
|
|
4133
|
+
return svcvt_f64_f32_x(detail::PTrue(d_from), v);
|
|
4134
|
+
}
|
|
4135
|
+
|
|
4136
|
+
// I32->F64 PromoteEvenTo
|
|
4137
|
+
template <class D>
|
|
4138
|
+
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
|
|
4139
|
+
hwy::SizeTag<8> /*to_lane_size_tag*/,
|
|
4140
|
+
hwy::SignedTag /*from_type_tag*/, D d_to,
|
|
4141
|
+
svint32_t v) {
|
|
4142
|
+
const Repartition<float, decltype(d_to)> d_from;
|
|
4143
|
+
return svcvt_f64_s32_x(detail::PTrue(d_from), v);
|
|
4144
|
+
}
|
|
4145
|
+
|
|
4146
|
+
// U32->F64 PromoteEvenTo
|
|
4147
|
+
template <class D>
|
|
4148
|
+
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
|
|
4149
|
+
hwy::SizeTag<8> /*to_lane_size_tag*/,
|
|
4150
|
+
hwy::UnsignedTag /*from_type_tag*/, D d_to,
|
|
4151
|
+
svuint32_t v) {
|
|
4152
|
+
const Repartition<float, decltype(d_to)> d_from;
|
|
4153
|
+
return svcvt_f64_u32_x(detail::PTrue(d_from), v);
|
|
4154
|
+
}
|
|
4155
|
+
|
|
4156
|
+
// F32->I64 PromoteEvenTo
|
|
4157
|
+
template <class D>
|
|
4158
|
+
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
|
|
4159
|
+
hwy::SizeTag<8> /*to_lane_size_tag*/,
|
|
4160
|
+
hwy::FloatTag /*from_type_tag*/, D d_to,
|
|
4161
|
+
svfloat32_t v) {
|
|
4162
|
+
const Repartition<float, decltype(d_to)> d_from;
|
|
4163
|
+
return svcvt_s64_f32_x(detail::PTrue(d_from), v);
|
|
4164
|
+
}
|
|
4165
|
+
|
|
4166
|
+
// F32->U64 PromoteEvenTo
|
|
4167
|
+
template <class D>
|
|
4168
|
+
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::UnsignedTag /*to_type_tag*/,
|
|
4169
|
+
hwy::SizeTag<8> /*to_lane_size_tag*/,
|
|
4170
|
+
hwy::FloatTag /*from_type_tag*/, D d_to,
|
|
4171
|
+
svfloat32_t v) {
|
|
4172
|
+
const Repartition<float, decltype(d_to)> d_from;
|
|
4173
|
+
return svcvt_u64_f32_x(detail::PTrue(d_from), v);
|
|
4174
|
+
}
|
|
4175
|
+
|
|
4176
|
+
// F16->F32 PromoteOddTo
|
|
4177
|
+
template <class D>
|
|
4178
|
+
HWY_INLINE VFromD<D> PromoteOddTo(hwy::FloatTag to_type_tag,
|
|
4179
|
+
hwy::SizeTag<4> to_lane_size_tag,
|
|
4180
|
+
hwy::FloatTag from_type_tag, D d_to,
|
|
4181
|
+
svfloat16_t v) {
|
|
4182
|
+
return PromoteEvenTo(to_type_tag, to_lane_size_tag, from_type_tag, d_to,
|
|
4183
|
+
DupOdd(v));
|
|
4184
|
+
}
|
|
4185
|
+
|
|
4186
|
+
// I32/U32/F32->F64 PromoteOddTo
|
|
4187
|
+
template <class FromTypeTag, class D, class V>
|
|
4188
|
+
HWY_INLINE VFromD<D> PromoteOddTo(hwy::FloatTag to_type_tag,
|
|
4189
|
+
hwy::SizeTag<8> to_lane_size_tag,
|
|
4190
|
+
FromTypeTag from_type_tag, D d_to, V v) {
|
|
4191
|
+
return PromoteEvenTo(to_type_tag, to_lane_size_tag, from_type_tag, d_to,
|
|
4192
|
+
DupOdd(v));
|
|
4193
|
+
}
|
|
4194
|
+
|
|
4195
|
+
// F32->I64/U64 PromoteOddTo
|
|
4196
|
+
template <class ToTypeTag, class D, HWY_IF_UI64_D(D)>
|
|
4197
|
+
HWY_INLINE VFromD<D> PromoteOddTo(ToTypeTag to_type_tag,
|
|
4198
|
+
hwy::SizeTag<8> to_lane_size_tag,
|
|
4199
|
+
hwy::FloatTag from_type_tag, D d_to,
|
|
4200
|
+
svfloat32_t v) {
|
|
4201
|
+
return PromoteEvenTo(to_type_tag, to_lane_size_tag, from_type_tag, d_to,
|
|
4202
|
+
DupOdd(v));
|
|
4203
|
+
}
|
|
4204
|
+
|
|
4205
|
+
} // namespace detail
|
|
4206
|
+
|
|
3465
4207
|
// ------------------------------ ReorderDemote2To (OddEven)
|
|
3466
4208
|
|
|
3467
4209
|
template <size_t N, int kPow2>
|
|
@@ -3618,15 +4360,45 @@ HWY_API VFromD<D> OrderedDemote2To(D dn, V a, V b) {
|
|
|
3618
4360
|
return Combine(dn, demoted_b, demoted_a);
|
|
3619
4361
|
}
|
|
3620
4362
|
|
|
3621
|
-
template <class D,
|
|
3622
|
-
HWY_API
|
|
4363
|
+
template <class D, HWY_IF_SPECIAL_FLOAT_D(D)>
|
|
4364
|
+
HWY_API VFromD<D> OrderedDemote2To(D dn, svfloat32_t a, svfloat32_t b) {
|
|
3623
4365
|
const Half<decltype(dn)> dnh;
|
|
3624
|
-
|
|
3625
|
-
|
|
3626
|
-
|
|
3627
|
-
|
|
3628
|
-
|
|
3629
|
-
|
|
4366
|
+
return Combine(dn, DemoteTo(dnh, b), DemoteTo(dnh, a));
|
|
4367
|
+
}
|
|
4368
|
+
|
|
4369
|
+
// ------------------------------ I8/U8/I16/U16 Div
|
|
4370
|
+
|
|
4371
|
+
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
|
|
4372
|
+
HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
|
|
4373
|
+
HWY_API V Div(V a, V b) {
|
|
4374
|
+
const DFromV<decltype(a)> d;
|
|
4375
|
+
const Half<decltype(d)> dh;
|
|
4376
|
+
const RepartitionToWide<decltype(d)> dw;
|
|
4377
|
+
|
|
4378
|
+
const auto q_lo =
|
|
4379
|
+
Div(PromoteTo(dw, LowerHalf(dh, a)), PromoteTo(dw, LowerHalf(dh, b)));
|
|
4380
|
+
const auto q_hi = Div(PromoteUpperTo(dw, a), PromoteUpperTo(dw, b));
|
|
4381
|
+
|
|
4382
|
+
return OrderedDemote2To(d, q_lo, q_hi);
|
|
4383
|
+
}
|
|
4384
|
+
|
|
4385
|
+
// ------------------------------ I8/U8/I16/U16 MaskedDivOr
|
|
4386
|
+
template <class V, class M, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
|
|
4387
|
+
HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
|
|
4388
|
+
HWY_API V MaskedDivOr(V no, M m, V a, V b) {
|
|
4389
|
+
return IfThenElse(m, Div(a, b), no);
|
|
4390
|
+
}
|
|
4391
|
+
|
|
4392
|
+
// ------------------------------ Mod (Div, NegMulAdd)
|
|
4393
|
+
template <class V>
|
|
4394
|
+
HWY_API V Mod(V a, V b) {
|
|
4395
|
+
return NegMulAdd(Div(a, b), b, a);
|
|
4396
|
+
}
|
|
4397
|
+
|
|
4398
|
+
// ------------------------------ MaskedModOr (Mod)
|
|
4399
|
+
template <class V, class M>
|
|
4400
|
+
HWY_API V MaskedModOr(V no, M m, V a, V b) {
|
|
4401
|
+
return IfThenElse(m, Mod(a, b), no);
|
|
3630
4402
|
}
|
|
3631
4403
|
|
|
3632
4404
|
// ------------------------------ ZeroIfNegative (Lt, IfThenElse)
|
|
@@ -3735,6 +4507,84 @@ HWY_INLINE svbool_t LoadMaskBits(D /* tag */,
|
|
|
3735
4507
|
return TestBit(vbits, bit);
|
|
3736
4508
|
}
|
|
3737
4509
|
|
|
4510
|
+
// ------------------------------ Dup128MaskFromMaskBits
|
|
4511
|
+
|
|
4512
|
+
template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_LE_D(D, 8)>
|
|
4513
|
+
HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
|
|
4514
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
4515
|
+
|
|
4516
|
+
constexpr size_t kN = MaxLanes(d);
|
|
4517
|
+
if (kN < 8) mask_bits &= (1u << kN) - 1;
|
|
4518
|
+
|
|
4519
|
+
// Replicate the lower 8 bits of mask_bits to each u8 lane
|
|
4520
|
+
const svuint8_t bytes = BitCast(du, Set(du, static_cast<uint8_t>(mask_bits)));
|
|
4521
|
+
|
|
4522
|
+
const svuint8_t bit =
|
|
4523
|
+
svdupq_n_u8(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
|
|
4524
|
+
return TestBit(bytes, bit);
|
|
4525
|
+
}
|
|
4526
|
+
|
|
4527
|
+
template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_GT_D(D, 8)>
|
|
4528
|
+
HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
|
|
4529
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
4530
|
+
const Repartition<uint16_t, decltype(du)> du16;
|
|
4531
|
+
|
|
4532
|
+
// Replicate the lower 16 bits of mask_bits to each u16 lane of a u16 vector,
|
|
4533
|
+
// and then bitcast the replicated mask_bits to a u8 vector
|
|
4534
|
+
const svuint8_t bytes =
|
|
4535
|
+
BitCast(du, Set(du16, static_cast<uint16_t>(mask_bits)));
|
|
4536
|
+
// Replicate bytes 8x such that each byte contains the bit that governs it.
|
|
4537
|
+
const svuint8_t rep8 = svtbl_u8(bytes, ShiftRight<3>(Iota(du, 0)));
|
|
4538
|
+
|
|
4539
|
+
const svuint8_t bit =
|
|
4540
|
+
svdupq_n_u8(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
|
|
4541
|
+
return TestBit(rep8, bit);
|
|
4542
|
+
}
|
|
4543
|
+
|
|
4544
|
+
template <class D, HWY_IF_T_SIZE_D(D, 2)>
|
|
4545
|
+
HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
|
|
4546
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
4547
|
+
const Repartition<uint8_t, decltype(d)> du8;
|
|
4548
|
+
|
|
4549
|
+
constexpr size_t kN = MaxLanes(d);
|
|
4550
|
+
if (kN < 8) mask_bits &= (1u << kN) - 1;
|
|
4551
|
+
|
|
4552
|
+
// Set all of the u8 lanes of bytes to the lower 8 bits of mask_bits
|
|
4553
|
+
const svuint8_t bytes = Set(du8, static_cast<uint8_t>(mask_bits));
|
|
4554
|
+
|
|
4555
|
+
const svuint16_t bit = svdupq_n_u16(1, 2, 4, 8, 16, 32, 64, 128);
|
|
4556
|
+
return TestBit(BitCast(du, bytes), bit);
|
|
4557
|
+
}
|
|
4558
|
+
|
|
4559
|
+
template <class D, HWY_IF_T_SIZE_D(D, 4)>
|
|
4560
|
+
HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
|
|
4561
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
4562
|
+
const Repartition<uint8_t, decltype(d)> du8;
|
|
4563
|
+
|
|
4564
|
+
constexpr size_t kN = MaxLanes(d);
|
|
4565
|
+
if (kN < 4) mask_bits &= (1u << kN) - 1;
|
|
4566
|
+
|
|
4567
|
+
// Set all of the u8 lanes of bytes to the lower 8 bits of mask_bits
|
|
4568
|
+
const svuint8_t bytes = Set(du8, static_cast<uint8_t>(mask_bits));
|
|
4569
|
+
|
|
4570
|
+
const svuint32_t bit = svdupq_n_u32(1, 2, 4, 8);
|
|
4571
|
+
return TestBit(BitCast(du, bytes), bit);
|
|
4572
|
+
}
|
|
4573
|
+
|
|
4574
|
+
template <class D, HWY_IF_T_SIZE_D(D, 8)>
|
|
4575
|
+
HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
|
|
4576
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
4577
|
+
const Repartition<uint8_t, decltype(d)> du8;
|
|
4578
|
+
|
|
4579
|
+
if (MaxLanes(d) < 2) mask_bits &= 1u;
|
|
4580
|
+
|
|
4581
|
+
// Set all of the u8 lanes of bytes to the lower 8 bits of mask_bits
|
|
4582
|
+
const svuint8_t bytes = Set(du8, static_cast<uint8_t>(mask_bits));
|
|
4583
|
+
|
|
4584
|
+
const svuint64_t bit = svdupq_n_u64(1, 2);
|
|
4585
|
+
return TestBit(BitCast(du, bytes), bit);
|
|
4586
|
+
}
|
|
4587
|
+
|
|
3738
4588
|
// ------------------------------ StoreMaskBits
|
|
3739
4589
|
|
|
3740
4590
|
namespace detail {
|
|
@@ -4100,12 +4950,13 @@ HWY_INLINE VFromD<DU> LaneIndicesFromByteIndices(D, svuint8_t idx) {
|
|
|
4100
4950
|
template <class V>
|
|
4101
4951
|
HWY_INLINE V ExpandLoop(V v, svbool_t mask) {
|
|
4102
4952
|
const DFromV<V> d;
|
|
4953
|
+
using T = TFromV<V>;
|
|
4103
4954
|
uint8_t mask_bytes[256 / 8];
|
|
4104
4955
|
StoreMaskBits(d, mask, mask_bytes);
|
|
4105
4956
|
|
|
4106
4957
|
// ShiftLeftLanes is expensive, so we're probably better off storing to memory
|
|
4107
4958
|
// and loading the final result.
|
|
4108
|
-
alignas(16)
|
|
4959
|
+
alignas(16) T out[2 * MaxLanes(d)];
|
|
4109
4960
|
|
|
4110
4961
|
svbool_t next = svpfalse_b();
|
|
4111
4962
|
size_t input_consumed = 0;
|
|
@@ -4117,7 +4968,7 @@ HWY_INLINE V ExpandLoop(V v, svbool_t mask) {
|
|
|
4117
4968
|
// instruction for variable-shift-reg, but we can splice.
|
|
4118
4969
|
const V vH = detail::Splice(v, v, next);
|
|
4119
4970
|
input_consumed += PopCount(mask_bits);
|
|
4120
|
-
next = detail::GeN(iota,
|
|
4971
|
+
next = detail::GeN(iota, ConvertScalarTo<T>(input_consumed));
|
|
4121
4972
|
|
|
4122
4973
|
const auto idx = detail::LaneIndicesFromByteIndices(
|
|
4123
4974
|
d, detail::IndicesForExpandFromBits(mask_bits));
|
|
@@ -4611,7 +5462,7 @@ HWY_API svuint64_t MulOdd(const svuint64_t a, const svuint64_t b) {
|
|
|
4611
5462
|
template <size_t N, int kPow2>
|
|
4612
5463
|
HWY_API svfloat32_t WidenMulPairwiseAdd(Simd<float, N, kPow2> df32, VBF16 a,
|
|
4613
5464
|
VBF16 b) {
|
|
4614
|
-
#if
|
|
5465
|
+
#if HWY_SVE_HAVE_BF16_FEATURE
|
|
4615
5466
|
const svfloat32_t even = svbfmlalb_f32(Zero(df32), a, b);
|
|
4616
5467
|
return svbfmlalt_f32(even, a, b);
|
|
4617
5468
|
#else
|
|
@@ -4626,7 +5477,7 @@ HWY_API svfloat32_t WidenMulPairwiseAdd(Simd<float, N, kPow2> df32, VBF16 a,
|
|
|
4626
5477
|
const VU32 bo = And(BitCast(du32, b), odd);
|
|
4627
5478
|
return MulAdd(BitCast(df32, ae), BitCast(df32, be),
|
|
4628
5479
|
Mul(BitCast(df32, ao), BitCast(df32, bo)));
|
|
4629
|
-
#endif //
|
|
5480
|
+
#endif // HWY_SVE_HAVE_BF16_FEATURE
|
|
4630
5481
|
}
|
|
4631
5482
|
|
|
4632
5483
|
template <size_t N, int kPow2>
|
|
@@ -4672,7 +5523,7 @@ HWY_API svfloat32_t ReorderWidenMulAccumulate(Simd<float, N, kPow2> df32,
|
|
|
4672
5523
|
VBF16 a, VBF16 b,
|
|
4673
5524
|
const svfloat32_t sum0,
|
|
4674
5525
|
svfloat32_t& sum1) {
|
|
4675
|
-
#if
|
|
5526
|
+
#if HWY_SVE_HAVE_BF16_FEATURE
|
|
4676
5527
|
(void)df32;
|
|
4677
5528
|
sum1 = svbfmlalt_f32(sum1, a, b);
|
|
4678
5529
|
return svbfmlalb_f32(sum0, a, b);
|
|
@@ -4688,7 +5539,7 @@ HWY_API svfloat32_t ReorderWidenMulAccumulate(Simd<float, N, kPow2> df32,
|
|
|
4688
5539
|
const VU32 bo = And(BitCast(du32, b), odd);
|
|
4689
5540
|
sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
|
|
4690
5541
|
return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
|
|
4691
|
-
#endif //
|
|
5542
|
+
#endif // HWY_SVE_HAVE_BF16_FEATURE
|
|
4692
5543
|
}
|
|
4693
5544
|
|
|
4694
5545
|
template <size_t N, int kPow2>
|
|
@@ -4817,8 +5668,10 @@ HWY_API VFromD<DU64> SumOfMulQuadAccumulate(DU64 /*du64*/, svuint16_t a,
|
|
|
4817
5668
|
|
|
4818
5669
|
// ------------------------------ AESRound / CLMul
|
|
4819
5670
|
|
|
5671
|
+
// Static dispatch with -march=armv8-a+sve2+aes, or dynamic dispatch WITHOUT a
|
|
5672
|
+
// baseline, in which case we check for AES support at runtime.
|
|
4820
5673
|
#if defined(__ARM_FEATURE_SVE2_AES) || \
|
|
4821
|
-
(HWY_SVE_HAVE_2 && HWY_HAVE_RUNTIME_DISPATCH)
|
|
5674
|
+
(HWY_SVE_HAVE_2 && HWY_HAVE_RUNTIME_DISPATCH && HWY_BASELINE_SVE2 == 0)
|
|
4822
5675
|
|
|
4823
5676
|
// Per-target flag to prevent generic_ops-inl.h from defining AESRound.
|
|
4824
5677
|
#ifdef HWY_NATIVE_AES
|
|
@@ -5059,14 +5912,15 @@ HWY_API V HighestSetBitIndex(V v) {
|
|
|
5059
5912
|
}
|
|
5060
5913
|
|
|
5061
5914
|
// ================================================== END MACROS
|
|
5062
|
-
namespace detail { // for code folding
|
|
5063
5915
|
#undef HWY_SVE_ALL_PTRUE
|
|
5064
5916
|
#undef HWY_SVE_D
|
|
5065
5917
|
#undef HWY_SVE_FOREACH
|
|
5066
5918
|
#undef HWY_SVE_FOREACH_BF16
|
|
5919
|
+
#undef HWY_SVE_FOREACH_BF16_UNCONDITIONAL
|
|
5067
5920
|
#undef HWY_SVE_FOREACH_F
|
|
5068
5921
|
#undef HWY_SVE_FOREACH_F16
|
|
5069
5922
|
#undef HWY_SVE_FOREACH_F32
|
|
5923
|
+
#undef HWY_SVE_FOREACH_F3264
|
|
5070
5924
|
#undef HWY_SVE_FOREACH_F64
|
|
5071
5925
|
#undef HWY_SVE_FOREACH_I
|
|
5072
5926
|
#undef HWY_SVE_FOREACH_I08
|
|
@@ -5086,7 +5940,10 @@ namespace detail { // for code folding
|
|
|
5086
5940
|
#undef HWY_SVE_FOREACH_UI64
|
|
5087
5941
|
#undef HWY_SVE_FOREACH_UIF3264
|
|
5088
5942
|
#undef HWY_SVE_HAVE_2
|
|
5943
|
+
#undef HWY_SVE_IF_EMULATED_D
|
|
5944
|
+
#undef HWY_SVE_IF_NOT_EMULATED_D
|
|
5089
5945
|
#undef HWY_SVE_PTRUE
|
|
5946
|
+
#undef HWY_SVE_RETV_ARGMVV
|
|
5090
5947
|
#undef HWY_SVE_RETV_ARGPV
|
|
5091
5948
|
#undef HWY_SVE_RETV_ARGPVN
|
|
5092
5949
|
#undef HWY_SVE_RETV_ARGPVV
|
|
@@ -5098,7 +5955,6 @@ namespace detail { // for code folding
|
|
|
5098
5955
|
#undef HWY_SVE_UNDEFINED
|
|
5099
5956
|
#undef HWY_SVE_V
|
|
5100
5957
|
|
|
5101
|
-
} // namespace detail
|
|
5102
5958
|
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
|
5103
5959
|
} // namespace HWY_NAMESPACE
|
|
5104
5960
|
} // namespace hwy
|