@img/sharp-libvips-dev 1.0.1 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -2
- package/include/aom/aom_decoder.h +1 -1
- package/include/aom/aom_encoder.h +7 -1
- package/include/aom/aom_image.h +24 -12
- package/include/aom/aom_integer.h +3 -3
- package/include/aom/aomcx.h +15 -0
- package/include/aom/aomdx.h +5 -2
- package/include/archive.h +7 -5
- package/include/archive_entry.h +5 -3
- package/include/cgif.h +3 -0
- package/include/expat.h +21 -10
- package/include/expat_config.h +11 -5
- package/include/ffi.h +12 -25
- package/include/freetype2/freetype/config/ftoption.h +2 -2
- package/include/fribidi/fribidi-config.h +2 -2
- package/include/fribidi/fribidi-unicode-version.h +3 -3
- package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -2
- package/include/glib-2.0/gio/gappinfo.h +40 -25
- package/include/glib-2.0/gio/gapplication.h +6 -0
- package/include/glib-2.0/gio/gasyncresult.h +1 -1
- package/include/glib-2.0/gio/gconverter.h +5 -0
- package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
- package/include/glib-2.0/gio/gfile.h +16 -0
- package/include/glib-2.0/gio/gio-visibility.h +34 -0
- package/include/glib-2.0/gio/giotypes.h +0 -1
- package/include/glib-2.0/gio/gsettings.h +8 -0
- package/include/glib-2.0/gio/gvfs.h +2 -2
- package/include/glib-2.0/girepository/gi-visibility.h +34 -0
- package/include/glib-2.0/girepository/giarginfo.h +23 -6
- package/include/glib-2.0/girepository/gibaseinfo.h +44 -18
- package/include/glib-2.0/girepository/gicallableinfo.h +26 -16
- package/include/glib-2.0/girepository/gicallbackinfo.h +17 -2
- package/include/glib-2.0/girepository/giconstantinfo.h +19 -4
- package/include/glib-2.0/girepository/gienuminfo.h +20 -21
- package/include/glib-2.0/girepository/gifieldinfo.h +22 -7
- package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
- package/include/glib-2.0/girepository/gifunctioninfo.h +22 -7
- package/include/glib-2.0/girepository/giinterfaceinfo.h +33 -18
- package/include/glib-2.0/girepository/giobjectinfo.h +41 -26
- package/include/glib-2.0/girepository/gipropertyinfo.h +18 -3
- package/include/glib-2.0/girepository/giregisteredtypeinfo.h +22 -11
- package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
- package/include/glib-2.0/girepository/girepository.h +53 -62
- package/include/glib-2.0/girepository/girffi.h +8 -7
- package/include/glib-2.0/girepository/gisignalinfo.h +18 -3
- package/include/glib-2.0/girepository/gistructinfo.h +26 -11
- package/include/glib-2.0/girepository/gitypeinfo.h +29 -16
- package/include/glib-2.0/girepository/gitypelib.h +9 -13
- package/include/glib-2.0/girepository/gitypes.h +52 -104
- package/include/glib-2.0/girepository/giunioninfo.h +28 -12
- package/include/glib-2.0/girepository/giunresolvedinfo.h +17 -2
- package/include/glib-2.0/girepository/givalueinfo.h +65 -0
- package/include/glib-2.0/girepository/givfuncinfo.h +23 -8
- package/include/glib-2.0/glib/deprecated/gthread.h +9 -5
- package/include/glib-2.0/glib/gbitlock.h +31 -0
- package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
- package/include/glib-2.0/glib/giochannel.h +2 -2
- package/include/glib-2.0/glib/glib-visibility.h +34 -0
- package/include/glib-2.0/glib/gmacros.h +12 -5
- package/include/glib-2.0/glib/gmain.h +93 -7
- package/include/glib-2.0/glib/gmessages.h +8 -0
- package/include/glib-2.0/glib/gqsort.h +8 -1
- package/include/glib-2.0/glib/gslice.h +2 -0
- package/include/glib-2.0/glib/gstrfuncs.h +24 -30
- package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
- package/include/glib-2.0/glib/gthread.h +191 -3
- package/include/glib-2.0/glib/gunicode.h +1 -1
- package/include/glib-2.0/glib/gversionmacros.h +9 -0
- package/include/glib-2.0/glib-unix.h +7 -1
- package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
- package/include/glib-2.0/gobject/genums.h +6 -6
- package/include/glib-2.0/gobject/glib-types.h +11 -0
- package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
- package/include/glib-2.0/gobject/gsignal.h +16 -6
- package/include/glib-2.0/gobject/gtype.h +6 -6
- package/include/harfbuzz/hb-buffer.h +6 -0
- package/include/harfbuzz/hb-common.h +6 -9
- package/include/harfbuzz/hb-cplusplus.hh +8 -11
- package/include/harfbuzz/hb-subset.h +17 -4
- package/include/harfbuzz/hb-version.h +3 -3
- package/include/hwy/abort.h +28 -0
- package/include/hwy/aligned_allocator.h +218 -6
- package/include/hwy/base.h +1935 -512
- package/include/hwy/cache_control.h +24 -6
- package/include/hwy/detect_compiler_arch.h +105 -10
- package/include/hwy/detect_targets.h +146 -37
- package/include/hwy/foreach_target.h +36 -1
- package/include/hwy/highway.h +222 -50
- package/include/hwy/ops/arm_neon-inl.h +2055 -894
- package/include/hwy/ops/arm_sve-inl.h +1476 -348
- package/include/hwy/ops/emu128-inl.h +711 -623
- package/include/hwy/ops/generic_ops-inl.h +4431 -2157
- package/include/hwy/ops/inside-inl.h +691 -0
- package/include/hwy/ops/ppc_vsx-inl.h +2186 -673
- package/include/hwy/ops/rvv-inl.h +1556 -536
- package/include/hwy/ops/scalar-inl.h +353 -233
- package/include/hwy/ops/set_macros-inl.h +171 -23
- package/include/hwy/ops/shared-inl.h +198 -56
- package/include/hwy/ops/wasm_128-inl.h +283 -244
- package/include/hwy/ops/x86_128-inl.h +3673 -1357
- package/include/hwy/ops/x86_256-inl.h +1737 -663
- package/include/hwy/ops/x86_512-inl.h +1697 -500
- package/include/hwy/per_target.h +4 -0
- package/include/hwy/profiler.h +648 -0
- package/include/hwy/robust_statistics.h +2 -2
- package/include/hwy/targets.h +40 -32
- package/include/hwy/timer-inl.h +3 -3
- package/include/hwy/timer.h +16 -1
- package/include/libheif/heif.h +170 -15
- package/include/libheif/heif_items.h +237 -0
- package/include/libheif/heif_properties.h +38 -2
- package/include/libheif/heif_regions.h +1 -1
- package/include/libheif/heif_version.h +2 -2
- package/include/libpng16/png.h +32 -29
- package/include/libpng16/pngconf.h +2 -2
- package/include/libpng16/pnglibconf.h +8 -3
- package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
- package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
- package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
- package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
- package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
- package/include/libxml2/libxml/HTMLparser.h +12 -19
- package/include/libxml2/libxml/c14n.h +1 -12
- package/include/libxml2/libxml/debugXML.h +1 -1
- package/include/libxml2/libxml/encoding.h +9 -0
- package/include/libxml2/libxml/entities.h +12 -1
- package/include/libxml2/libxml/hash.h +19 -0
- package/include/libxml2/libxml/list.h +2 -2
- package/include/libxml2/libxml/nanohttp.h +17 -0
- package/include/libxml2/libxml/parser.h +73 -58
- package/include/libxml2/libxml/parserInternals.h +9 -1
- package/include/libxml2/libxml/pattern.h +6 -0
- package/include/libxml2/libxml/tree.h +32 -12
- package/include/libxml2/libxml/uri.h +11 -0
- package/include/libxml2/libxml/valid.h +29 -2
- package/include/libxml2/libxml/xinclude.h +7 -0
- package/include/libxml2/libxml/xmlIO.h +21 -5
- package/include/libxml2/libxml/xmlerror.h +14 -0
- package/include/libxml2/libxml/xmlexports.h +111 -15
- package/include/libxml2/libxml/xmlmemory.h +8 -45
- package/include/libxml2/libxml/xmlreader.h +2 -0
- package/include/libxml2/libxml/xmlsave.h +5 -0
- package/include/libxml2/libxml/xmlunicode.h +165 -1
- package/include/libxml2/libxml/xmlversion.h +15 -179
- package/include/libxml2/libxml/xmlwriter.h +1 -0
- package/include/libxml2/libxml/xpath.h +4 -0
- package/include/pango-1.0/pango/pango-features.h +2 -2
- package/include/pango-1.0/pango/pango-fontmap.h +7 -0
- package/include/pango-1.0/pango/pango-item.h +4 -2
- package/include/pango-1.0/pango/pango-version-macros.h +25 -0
- package/include/pango-1.0/pango/pangofc-font.h +2 -1
- package/include/pixman-1/pixman-version.h +2 -2
- package/include/png.h +32 -29
- package/include/pngconf.h +2 -2
- package/include/pnglibconf.h +8 -3
- package/include/vips/connection.h +9 -3
- package/include/vips/util.h +1 -11
- package/include/vips/version.h +4 -4
- package/include/webp/decode.h +58 -56
- package/include/webp/demux.h +25 -21
- package/include/webp/encode.h +44 -39
- package/include/webp/mux.h +76 -15
- package/include/webp/mux_types.h +2 -1
- package/include/webp/sharpyuv/sharpyuv.h +77 -8
- package/include/webp/types.h +29 -8
- package/include/zconf.h +1 -1
- package/include/zlib.h +12 -12
- package/package.json +1 -1
- package/versions.json +18 -19
Diff for package/include/hwy/ops/arm_sve-inl.h:

```diff
@@ -33,6 +33,33 @@
 #define HWY_SVE_HAVE_2 0
 #endif
 
+// If 1, both __bf16 and a limited set of *_bf16 SVE intrinsics are available:
+// create/get/set/dup, ld/st, sel, rev, trn, uzp, zip.
+#if HWY_ARM_HAVE_SCALAR_BF16_TYPE && defined(__ARM_FEATURE_SVE_BF16)
+#define HWY_SVE_HAVE_BF16_FEATURE 1
+#else
+#define HWY_SVE_HAVE_BF16_FEATURE 0
+#endif
+
+// HWY_SVE_HAVE_BF16_VEC is defined to 1 if the SVE svbfloat16_t vector type
+// is supported, even if HWY_SVE_HAVE_BF16_FEATURE (= intrinsics) is 0.
+#if HWY_SVE_HAVE_BF16_FEATURE ||                                       \
+    (HWY_COMPILER_CLANG >= 1200 && defined(__ARM_FEATURE_SVE_BF16)) || \
+    HWY_COMPILER_GCC_ACTUAL >= 1000
+#define HWY_SVE_HAVE_BF16_VEC 1
+#else
+#define HWY_SVE_HAVE_BF16_VEC 0
+#endif
+
+// HWY_SVE_HAVE_F32_TO_BF16C is defined to 1 if the SVE svcvt_bf16_f32_x
+// and svcvtnt_bf16_f32_x intrinsics are available, even if the __bf16 type
+// is disabled
+#if HWY_SVE_HAVE_BF16_VEC && defined(__ARM_FEATURE_SVE_BF16)
+#define HWY_SVE_HAVE_F32_TO_BF16C 1
+#else
+#define HWY_SVE_HAVE_F32_TO_BF16C 0
+#endif
+
 HWY_BEFORE_NAMESPACE();
 namespace hwy {
 namespace HWY_NAMESPACE {
```
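The hunk above builds a three-tier capability ladder: full bf16 intrinsics, the `svbfloat16_t` vector type only, or neither. A minimal standalone sketch of the same tiered-detection pattern, with illustrative macro names that are not Highway's:

```cpp
// Sketch of the three-tier capability ladder (DEMO_BF16_TIER is a
// hypothetical name; the real macros are HWY_SVE_HAVE_BF16_FEATURE/_VEC).
#include <cstdio>

#if defined(__ARM_FEATURE_SVE_BF16)
#define DEMO_BF16_TIER 2  // intrinsics available: use *_bf16 ops directly
#elif defined(__ARM_FEATURE_SVE)
#define DEMO_BF16_TIER 1  // type exists: reinterpret to/from u16 per op
#else
#define DEMO_BF16_TIER 0  // neither: store bf16 lanes in u16 vectors
#endif

int main() { std::printf("bf16 support tier: %d\n", DEMO_BF16_TIER); }
```

Each tier widens the set of operations that must be emulated; the rest of this diff dispatches on exactly that distinction.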
```diff
@@ -76,12 +103,26 @@ namespace detail {  // for code folding
 #define HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP) \
   X_MACRO(float, f, 64, 32, NAME, OP)
 
-#
-#define HWY_SVE_FOREACH_BF16(X_MACRO, NAME, OP) \
+#define HWY_SVE_FOREACH_BF16_UNCONDITIONAL(X_MACRO, NAME, OP) \
   X_MACRO(bfloat, bf, 16, 16, NAME, OP)
+
+#if HWY_SVE_HAVE_BF16_FEATURE
+#define HWY_SVE_FOREACH_BF16(X_MACRO, NAME, OP) \
+  HWY_SVE_FOREACH_BF16_UNCONDITIONAL(X_MACRO, NAME, OP)
+// We have both f16 and bf16, so nothing is emulated.
+
+// NOTE: hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr is used instead of
+// hwy::EnableIf<false>* = nullptr to avoid compiler errors since
+// !hwy::IsSame<D, D>() is always false and as !hwy::IsSame<D, D>() will cause
+// SFINAE to occur instead of a hard error due to a dependency on the D template
+// argument
+#define HWY_SVE_IF_EMULATED_D(D) hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr
+#define HWY_SVE_IF_NOT_EMULATED_D(D) hwy::EnableIf<true>* = nullptr
 #else
 #define HWY_SVE_FOREACH_BF16(X_MACRO, NAME, OP)
-#
+#define HWY_SVE_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
+#define HWY_SVE_IF_NOT_EMULATED_D(D) HWY_IF_NOT_BF16_D(D)
+#endif  // HWY_SVE_HAVE_BF16_FEATURE
 
 // For all element sizes:
 #define HWY_SVE_FOREACH_U(X_MACRO, NAME, OP) \
```
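The NOTE in this hunk is about a standard SFINAE subtlety. A self-contained illustration using `std::enable_if` (names here are illustrative, not Highway's): `enable_if_t<false>` is ill-formed the moment the template is defined because it does not depend on any template parameter, whereas `!is_same_v<D, D>` is equally always-false but *does* depend on `D`, so the failure happens during substitution and merely removes the overload.

```cpp
// Sketch of the EnableIf<!IsSame<D, D>()> trick from the NOTE above.
#include <type_traits>

// Never viable, but legal: the condition depends on D, so SFINAE applies.
template <class D>
const char* Op(D, std::enable_if_t<!std::is_same_v<D, D>>* = nullptr) {
  return "disabled";
}

// Always viable: mirrors HWY_SVE_IF_NOT_EMULATED_D when nothing is emulated.
template <class D>
const char* Op(D, std::enable_if_t<std::is_same_v<D, D>>* = nullptr) {
  return "enabled";
}

int main() { return Op(42) == nullptr; }  // calls the second overload
```

Writing `std::enable_if_t<false>` in the first overload instead would be a hard compile error on every compiler, which is exactly what the comment warns about.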
```diff
@@ -96,12 +137,16 @@ namespace detail {  // for code folding
   HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP) \
   HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP)
 
+#define HWY_SVE_FOREACH_F3264(X_MACRO, NAME, OP) \
+  HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP)         \
+  HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP)
+
 // HWY_SVE_FOREACH_F does not include HWY_SVE_FOREACH_BF16 because SVE lacks
 // bf16 overloads for some intrinsics (especially less-common arithmetic).
+// However, this does include f16 because SVE supports it unconditionally.
 #define HWY_SVE_FOREACH_F(X_MACRO, NAME, OP) \
   HWY_SVE_FOREACH_F16(X_MACRO, NAME, OP)     \
-
-  HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP)
+  HWY_SVE_FOREACH_F3264(X_MACRO, NAME, OP)
 
 // Commonly used type categories for a given element size:
 #define HWY_SVE_FOREACH_UI08(X_MACRO, NAME, OP) \
@@ -123,8 +168,7 @@ namespace detail {  // for code folding
 #define HWY_SVE_FOREACH_UIF3264(X_MACRO, NAME, OP) \
   HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP)          \
   HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP)          \
-
-  HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP)
+  HWY_SVE_FOREACH_F3264(X_MACRO, NAME, OP)
 
 // Commonly used type categories:
 #define HWY_SVE_FOREACH_UI(X_MACRO, NAME, OP) \
@@ -155,7 +199,9 @@ namespace detail {  // for code folding
 };
 
 HWY_SVE_FOREACH(HWY_SPECIALIZE, _, _)
-
+#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
+HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SPECIALIZE, _, _)
+#endif
 #undef HWY_SPECIALIZE
 
 // Note: _x (don't-care value for inactive lanes) avoids additional MOVPRFX
@@ -184,15 +230,24 @@ HWY_SVE_FOREACH_BF16(HWY_SPECIALIZE, _, _)
 }
 
 // vector = f(vector, vector), e.g. Add
+#define HWY_SVE_RETV_ARGVV(BASE, CHAR, BITS, HALF, NAME, OP)   \
+  HWY_API HWY_SVE_V(BASE, BITS)                                \
+      NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
+    return sv##OP##_##CHAR##BITS(a, b);                        \
+  }
+// All-true mask
 #define HWY_SVE_RETV_ARGPVV(BASE, CHAR, BITS, HALF, NAME, OP)    \
   HWY_API HWY_SVE_V(BASE, BITS)                                  \
       NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) {  \
     return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, b); \
   }
-
-
-
-
+// User-specified mask. Mask=false value is undefined and must be set by caller
+// because SVE instructions take it from one of the two inputs, whereas
+// AVX-512, RVV and Highway allow a third argument.
+#define HWY_SVE_RETV_ARGMVV(BASE, CHAR, BITS, HALF, NAME, OP)               \
+  HWY_API HWY_SVE_V(BASE, BITS)                                             \
+      NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
+    return sv##OP##_##CHAR##BITS##_x(m, a, b);                              \
   }
 
 #define HWY_SVE_RETV_ARGVVV(BASE, CHAR, BITS, HALF, NAME, OP) \
```
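The new ARGMVV comment explains `_x` predication: lanes where the mask is false hold an unspecified value, so a caller wanting AVX-512/RVV-style "else" behavior must compose the op with a select (the diff does this later via `IfThenElse`). A scalar model of those semantics, a sketch only and not Highway code:

```cpp
// Scalar model of _x-predicated ops: false lanes are "don't care", so
// MaskedAddOr composes the op with a select that supplies the else value.
#include <array>
#include <cstdio>

template <size_t N>
std::array<int, N> MaskedAddX(const std::array<bool, N>& m,
                              std::array<int, N> a,
                              const std::array<int, N>& b) {
  for (size_t i = 0; i < N; ++i) {
    if (m[i]) a[i] += b[i];  // false lanes keep a[i]: one possible outcome
  }
  return a;
}

template <size_t N>
std::array<int, N> MaskedAddOr(const std::array<int, N>& no,
                               const std::array<bool, N>& m,
                               const std::array<int, N>& a,
                               const std::array<int, N>& b) {
  std::array<int, N> r = MaskedAddX(m, a, b);
  for (size_t i = 0; i < N; ++i) {
    if (!m[i]) r[i] = no[i];  // the IfThenElse step defines the else lanes
  }
  return r;
}

int main() {
  const std::array<bool, 4> m{true, false, true, false};
  const std::array<int, 4> no{9, 9, 9, 9}, a{1, 2, 3, 4}, b{10, 20, 30, 40};
  const std::array<int, 4> r = MaskedAddOr(no, m, a, b);
  std::printf("%d %d %d %d\n", r[0], r[1], r[2], r[3]);  // 11 9 33 9
}
```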
```diff
@@ -264,26 +319,19 @@ HWY_API size_t Lanes(Simd<T, N, kPow2> d) {
   return sv##OP##_b##BITS##_u32(uint32_t{0}, static_cast<uint32_t>(limit)); \
 }
 HWY_SVE_FOREACH(HWY_SVE_FIRSTN, FirstN, whilelt)
-
-
-#
-
-template <class D>
-using MFromD = decltype(FirstN(D(), 0));
+#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
+HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_FIRSTN, FirstN, whilelt)
+#endif
 
-
-
-MFromD<RebindToUnsigned<D>> FirstN(D /* tag */, size_t count) {
+template <class D, HWY_SVE_IF_EMULATED_D(D)>
+svbool_t FirstN(D /* tag */, size_t count) {
   return FirstN(RebindToUnsigned<D>(), count);
 }
-#endif  // !HWY_HAVE_FLOAT16
 
-#
-
-
-
-}
-#endif  // !HWY_SVE_HAVE_BFLOAT16
+#undef HWY_SVE_FIRSTN
+
+template <class D>
+using MFromD = svbool_t;
 
 namespace detail {
 
```
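`FirstN` wraps SVE's `whilelt`, and the hunk also collapses `MFromD` to plain `svbool_t`, since SVE predicates are not typed per lane width. A scalar sketch of the semantics (illustration only):

```cpp
// Scalar sketch of FirstN (whilelt): lane i is active iff i < count.
#include <cstdio>
#include <vector>

std::vector<bool> FirstN(size_t lanes, size_t count) {
  std::vector<bool> m(lanes);
  for (size_t i = 0; i < lanes; ++i) m[i] = i < count;
  return m;
}

int main() {
  for (bool b : FirstN(8, 3)) std::printf("%d", b ? 1 : 0);  // 11100000
  std::printf("\n");
}
```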
```diff
@@ -298,7 +346,7 @@ namespace detail {
 }
 
 HWY_SVE_FOREACH(HWY_SVE_WRAP_PTRUE, PTrue, ptrue)  // return all-true
-
+HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_WRAP_PTRUE, PTrue, ptrue)
 #undef HWY_SVE_WRAP_PTRUE
 
 HWY_API svbool_t PFalse() { return svpfalse_b(); }
@@ -314,6 +362,17 @@ svbool_t MakeMask(D d) {
 
 }  // namespace detail
 
+#ifdef HWY_NATIVE_MASK_FALSE
+#undef HWY_NATIVE_MASK_FALSE
+#else
+#define HWY_NATIVE_MASK_FALSE
+#endif
+
+template <class D>
+HWY_API svbool_t MaskFalse(const D /*d*/) {
+  return detail::PFalse();
+}
+
 // ================================================== INIT
 
 // ------------------------------ Set
```
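The `#ifdef/#undef/#else/#define` pattern around `HWY_NATIVE_MASK_FALSE` recurs throughout this diff. In Highway's foreach_target mechanism, headers are re-included once per target, and both a per-pass marker and each capability flag flip on every pass, so "flag state equals marker state" signals that the current pass provided a native implementation. A simplified standalone sketch of that protocol, with illustrative names:

```cpp
// Sketch of the toggle protocol (PASS_TOGGLE / NATIVE_FOO are hypothetical
// names, simplified from HWY_TARGET_TOGGLE / HWY_NATIVE_* in Highway).
#include <cstdio>

// Flipped once per inclusion pass (like HWY_TARGET_TOGGLE):
#ifdef PASS_TOGGLE
#undef PASS_TOGGLE
#else
#define PASS_TOGGLE
#endif

// Flipped by a target header that has a native op (like the hunk above):
#ifdef NATIVE_FOO
#undef NATIVE_FOO
#else
#define NATIVE_FOO
#endif

// The generic layer compares the two toggles for this pass:
#if defined(NATIVE_FOO) == defined(PASS_TOGGLE)
static const char* kFoo = "native Foo this pass";
#else
static const char* kFoo = "generic fallback Foo";
#endif

int main() { std::puts(kFoo); }
```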
```diff
@@ -326,14 +385,23 @@ svbool_t MakeMask(D d) {
 }
 
 HWY_SVE_FOREACH(HWY_SVE_SET, Set, dup_n)
+#if HWY_SVE_HAVE_BF16_FEATURE  // for if-elif chain
 HWY_SVE_FOREACH_BF16(HWY_SVE_SET, Set, dup_n)
-#
+#elif HWY_SVE_HAVE_BF16_VEC
 // Required for Zero and VFromD
-template <
-
-return
+template <class D, HWY_IF_BF16_D(D)>
+HWY_API svbfloat16_t Set(D d, bfloat16_t arg) {
+  return svreinterpret_bf16_u16(
+      Set(RebindToUnsigned<decltype(d)>(), BitCastScalar<uint16_t>(arg)));
 }
-#
+#else  // neither bf16 feature nor vector: emulate with u16
+// Required for Zero and VFromD
+template <class D, HWY_IF_BF16_D(D)>
+HWY_API svuint16_t Set(D d, bfloat16_t arg) {
+  const RebindToUnsigned<decltype(d)> du;
+  return Set(du, BitCastScalar<uint16_t>(arg));
+}
+#endif  // HWY_SVE_HAVE_BF16_FEATURE
 #undef HWY_SVE_SET
 
 template <class D>
@@ -350,17 +418,6 @@ VFromD<D> Zero(D d) {
   return BitCast(d, Set(du, 0));
 }
 
-// ------------------------------ Undefined
-
-#define HWY_SVE_UNDEFINED(BASE, CHAR, BITS, HALF, NAME, OP) \
-  template <size_t N, int kPow2>                            \
-  HWY_API HWY_SVE_V(BASE, BITS)                             \
-      NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) {       \
-    return sv##OP##_##CHAR##BITS();                         \
-  }
-
-HWY_SVE_FOREACH(HWY_SVE_UNDEFINED, Undefined, undef)
-
 // ------------------------------ BitCast
 
 namespace detail {
@@ -387,24 +444,32 @@
     return sv##OP##_##CHAR##BITS##_u8(v);                \
   }
 
+// U08 is special-cased, hence do not use FOREACH.
 HWY_SVE_FOREACH_U08(HWY_SVE_CAST_NOP, _, _)
 HWY_SVE_FOREACH_I08(HWY_SVE_CAST, _, reinterpret)
 HWY_SVE_FOREACH_UI16(HWY_SVE_CAST, _, reinterpret)
 HWY_SVE_FOREACH_UI32(HWY_SVE_CAST, _, reinterpret)
 HWY_SVE_FOREACH_UI64(HWY_SVE_CAST, _, reinterpret)
 HWY_SVE_FOREACH_F(HWY_SVE_CAST, _, reinterpret)
-HWY_SVE_FOREACH_BF16(HWY_SVE_CAST, _, reinterpret)
 
-#
-
+#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
+HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CAST, _, reinterpret)
+#else  // !(HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC)
+template <class V, HWY_SVE_IF_EMULATED_D(DFromV<V>)>
+HWY_INLINE svuint8_t BitCastToByte(V v) {
+  const RebindToUnsigned<DFromV<V>> du;
+  return BitCastToByte(BitCast(du, v));
+}
 
-
-
-
-
-  return BitCastFromByte(Simd<uint16_t, N, kPow2>(), v);
+template <class D, HWY_SVE_IF_EMULATED_D(D)>
+HWY_INLINE VFromD<D> BitCastFromByte(D d, svuint8_t v) {
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCastFromByte(du, v);
 }
-#endif  //
+#endif  // HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
+
+#undef HWY_SVE_CAST_NOP
+#undef HWY_SVE_CAST
 
 }  // namespace detail
 
@@ -413,6 +478,26 @@ HWY_API VFromD<D> BitCast(D d, FromV v) {
   return detail::BitCastFromByte(d, detail::BitCastToByte(v));
 }
 
+// ------------------------------ Undefined
+
+#define HWY_SVE_UNDEFINED(BASE, CHAR, BITS, HALF, NAME, OP) \
+  template <size_t N, int kPow2>                            \
+  HWY_API HWY_SVE_V(BASE, BITS)                             \
+      NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) {       \
+    return sv##OP##_##CHAR##BITS();                         \
+  }
+
+HWY_SVE_FOREACH(HWY_SVE_UNDEFINED, Undefined, undef)
+#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
+HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_UNDEFINED, Undefined, undef)
+#endif
+
+template <class D, HWY_SVE_IF_EMULATED_D(D)>
+VFromD<D> Undefined(D d) {
+  const RebindToUnsigned<D> du;
+  return BitCast(d, Undefined(du));
+}
+
 // ------------------------------ Tuple
 
 // tuples = f(d, v..), e.g. Create2
@@ -438,7 +523,9 @@ HWY_API VFromD<D> BitCast(D d, FromV v) {
 }
 
 HWY_SVE_FOREACH(HWY_SVE_CREATE, Create, create)
-
+#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
+HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CREATE, Create, create)
+#endif
 #undef HWY_SVE_CREATE
 
 template <class D>
@@ -463,7 +550,9 @@ using Vec4 = decltype(Create4(D(), Zero(D()), Zero(D()), Zero(D()), Zero(D())));
 }
 
 HWY_SVE_FOREACH(HWY_SVE_GET, Get, get)
-
+#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
+HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_GET, Get, get)
+#endif
 #undef HWY_SVE_GET
 
 #define HWY_SVE_SET(BASE, CHAR, BITS, HALF, NAME, OP) \
@@ -484,7 +573,9 @@ HWY_SVE_FOREACH_BF16(HWY_SVE_GET, Get, get)
 }
 
 HWY_SVE_FOREACH(HWY_SVE_SET, Set, set)
-
+#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
+HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_SET, Set, set)
+#endif
 #undef HWY_SVE_SET
 
 // ------------------------------ ResizeBitCast
@@ -495,6 +586,107 @@ HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
   return BitCast(d, v);
 }
 
+// ------------------------------ Dup128VecFromValues
+
+template <class D, HWY_IF_I8_D(D)>
+HWY_API svint8_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                     TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                     TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
+                                     TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
+                                     TFromD<D> t11, TFromD<D> t12,
+                                     TFromD<D> t13, TFromD<D> t14,
+                                     TFromD<D> t15) {
+  return svdupq_n_s8(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13,
+                     t14, t15);
+}
+
+template <class D, HWY_IF_U8_D(D)>
+HWY_API svuint8_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
+                                      TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
+                                      TFromD<D> t11, TFromD<D> t12,
+                                      TFromD<D> t13, TFromD<D> t14,
+                                      TFromD<D> t15) {
+  return svdupq_n_u8(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13,
+                     t14, t15);
+}
+
+template <class D, HWY_IF_I16_D(D)>
+HWY_API svint16_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6,
+                                      TFromD<D> t7) {
+  return svdupq_n_s16(t0, t1, t2, t3, t4, t5, t6, t7);
+}
+
+template <class D, HWY_IF_U16_D(D)>
+HWY_API svuint16_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                       TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                       TFromD<D> t5, TFromD<D> t6,
+                                       TFromD<D> t7) {
+  return svdupq_n_u16(t0, t1, t2, t3, t4, t5, t6, t7);
+}
+
+template <class D, HWY_IF_F16_D(D)>
+HWY_API svfloat16_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                        TFromD<D> t2, TFromD<D> t3,
+                                        TFromD<D> t4, TFromD<D> t5,
+                                        TFromD<D> t6, TFromD<D> t7) {
+  return svdupq_n_f16(t0, t1, t2, t3, t4, t5, t6, t7);
+}
+
+template <class D, HWY_IF_BF16_D(D)>
+HWY_API VBF16 Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1, TFromD<D> t2,
+                                  TFromD<D> t3, TFromD<D> t4, TFromD<D> t5,
+                                  TFromD<D> t6, TFromD<D> t7) {
+#if HWY_SVE_HAVE_BF16_FEATURE
+  (void)d;
+  return svdupq_n_bf16(t0, t1, t2, t3, t4, t5, t6, t7);
+#else
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(
+      d, Dup128VecFromValues(
+             du, BitCastScalar<uint16_t>(t0), BitCastScalar<uint16_t>(t1),
+             BitCastScalar<uint16_t>(t2), BitCastScalar<uint16_t>(t3),
+             BitCastScalar<uint16_t>(t4), BitCastScalar<uint16_t>(t5),
+             BitCastScalar<uint16_t>(t6), BitCastScalar<uint16_t>(t7)));
+#endif
+}
+
+template <class D, HWY_IF_I32_D(D)>
+HWY_API svint32_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3) {
+  return svdupq_n_s32(t0, t1, t2, t3);
+}
+
+template <class D, HWY_IF_U32_D(D)>
+HWY_API svuint32_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                       TFromD<D> t2, TFromD<D> t3) {
+  return svdupq_n_u32(t0, t1, t2, t3);
+}
+
+template <class D, HWY_IF_F32_D(D)>
+HWY_API svfloat32_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                        TFromD<D> t2, TFromD<D> t3) {
+  return svdupq_n_f32(t0, t1, t2, t3);
+}
+
+template <class D, HWY_IF_I64_D(D)>
+HWY_API svint64_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
+  return svdupq_n_s64(t0, t1);
+}
+
+template <class D, HWY_IF_U64_D(D)>
+HWY_API svuint64_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
+  return svdupq_n_u64(t0, t1);
+}
+
+template <class D, HWY_IF_F64_D(D)>
+HWY_API svfloat64_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
+  return svdupq_n_f64(t0, t1);
+}
+
 // ================================================== LOGICAL
 
 // detail::*N() functions accept a scalar argument to avoid extra Set().
```
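`Dup128VecFromValues` fills every 128-bit block of a (possibly longer) scalable vector with the given values, which is what the `svdupq_n_*` intrinsics do. A scalar sketch of the semantics for 32-bit lanes, for illustration only:

```cpp
// Scalar model of Dup128VecFromValues for i32: the four values repeat across
// each 128-bit block (four i32 lanes per block).
#include <cstdint>
#include <cstdio>
#include <vector>

std::vector<int32_t> Dup128(size_t lanes, int32_t t0, int32_t t1, int32_t t2,
                            int32_t t3) {
  const int32_t block[4] = {t0, t1, t2, t3};
  std::vector<int32_t> v(lanes);
  for (size_t i = 0; i < lanes; ++i) v[i] = block[i % 4];
  return v;
}

int main() {
  for (int32_t x : Dup128(8, 1, 2, 3, 4)) std::printf("%d ", x);
  std::printf("\n");  // 1 2 3 4 1 2 3 4
}
```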
```diff
@@ -519,6 +711,10 @@ HWY_API V And(const V a, const V b) {
 
 // ------------------------------ Or
 
+namespace detail {
+HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, OrN, orr_n)
+}  // namespace detail
+
 HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Or, orr)
 
 template <class V, HWY_IF_FLOAT_V(V)>
@@ -632,9 +828,37 @@ HWY_API VBF16 Neg(VBF16 v) {
   return BitCast(d, Xor(BitCast(du, v), Set(du, SignMask<TU>())));
 }
 
+// ------------------------------ SaturatedNeg
+#if HWY_SVE_HAVE_2
+#ifdef HWY_NATIVE_SATURATED_NEG_8_16_32
+#undef HWY_NATIVE_SATURATED_NEG_8_16_32
+#else
+#define HWY_NATIVE_SATURATED_NEG_8_16_32
+#endif
+
+#ifdef HWY_NATIVE_SATURATED_NEG_64
+#undef HWY_NATIVE_SATURATED_NEG_64
+#else
+#define HWY_NATIVE_SATURATED_NEG_64
+#endif
+
+HWY_SVE_FOREACH_I(HWY_SVE_RETV_ARGPV, SaturatedNeg, qneg)
+#endif  // HWY_SVE_HAVE_2
+
 // ------------------------------ Abs
 HWY_SVE_FOREACH_IF(HWY_SVE_RETV_ARGPV, Abs, abs)
 
+// ------------------------------ SaturatedAbs
+#if HWY_SVE_HAVE_2
+#ifdef HWY_NATIVE_SATURATED_ABS
+#undef HWY_NATIVE_SATURATED_ABS
+#else
+#define HWY_NATIVE_SATURATED_ABS
+#endif
+
+HWY_SVE_FOREACH_I(HWY_SVE_RETV_ARGPV, SaturatedAbs, qabs)
+#endif  // HWY_SVE_HAVE_2
+
 // ================================================== ARITHMETIC
 
 // Per-target flags to prevent generic_ops-inl.h defining Add etc.
@@ -676,13 +900,107 @@ HWY_API svuint64_t SumsOf8(const svuint8_t v) {
 
   const svuint32_t sums_of_4 = svdot_n_u32(Zero(du32), v, 1);
   // Compute pairwise sum of u32 and extend to u64.
-
+
+#if HWY_SVE_HAVE_2
+  return svadalp_u64_x(pg, Zero(du64), sums_of_4);
+#else
   const svuint64_t hi = svlsr_n_u64_x(pg, BitCast(du64, sums_of_4), 32);
   // Isolate the lower 32 bits (to be added to the upper 32 and zero-extended)
   const svuint64_t lo = svextw_u64_x(pg, BitCast(du64, sums_of_4));
   return Add(hi, lo);
+#endif
+}
+
+HWY_API svint64_t SumsOf8(const svint8_t v) {
+  const ScalableTag<int32_t> di32;
+  const ScalableTag<int64_t> di64;
+  const svbool_t pg = detail::PTrue(di64);
+
+  const svint32_t sums_of_4 = svdot_n_s32(Zero(di32), v, 1);
+#if HWY_SVE_HAVE_2
+  return svadalp_s64_x(pg, Zero(di64), sums_of_4);
+#else
+  const svint64_t hi = svasr_n_s64_x(pg, BitCast(di64, sums_of_4), 32);
+  // Isolate the lower 32 bits (to be added to the upper 32 and sign-extended)
+  const svint64_t lo = svextw_s64_x(pg, BitCast(di64, sums_of_4));
+  return Add(hi, lo);
+#endif
+}
+
+// ------------------------------ SumsOf2
+#if HWY_SVE_HAVE_2
+namespace detail {
+
+HWY_INLINE svint16_t SumsOf2(hwy::SignedTag /*type_tag*/,
+                             hwy::SizeTag<1> /*lane_size_tag*/, svint8_t v) {
+  const ScalableTag<int16_t> di16;
+  const svbool_t pg = detail::PTrue(di16);
+  return svadalp_s16_x(pg, Zero(di16), v);
+}
+
+HWY_INLINE svuint16_t SumsOf2(hwy::UnsignedTag /*type_tag*/,
+                              hwy::SizeTag<1> /*lane_size_tag*/, svuint8_t v) {
+  const ScalableTag<uint16_t> du16;
+  const svbool_t pg = detail::PTrue(du16);
+  return svadalp_u16_x(pg, Zero(du16), v);
+}
+
+HWY_INLINE svint32_t SumsOf2(hwy::SignedTag /*type_tag*/,
+                             hwy::SizeTag<2> /*lane_size_tag*/, svint16_t v) {
+  const ScalableTag<int32_t> di32;
+  const svbool_t pg = detail::PTrue(di32);
+  return svadalp_s32_x(pg, Zero(di32), v);
+}
+
+HWY_INLINE svuint32_t SumsOf2(hwy::UnsignedTag /*type_tag*/,
+                              hwy::SizeTag<2> /*lane_size_tag*/, svuint16_t v) {
+  const ScalableTag<uint32_t> du32;
+  const svbool_t pg = detail::PTrue(du32);
+  return svadalp_u32_x(pg, Zero(du32), v);
+}
+
+HWY_INLINE svint64_t SumsOf2(hwy::SignedTag /*type_tag*/,
+                             hwy::SizeTag<4> /*lane_size_tag*/, svint32_t v) {
+  const ScalableTag<int64_t> di64;
+  const svbool_t pg = detail::PTrue(di64);
+  return svadalp_s64_x(pg, Zero(di64), v);
+}
+
+HWY_INLINE svuint64_t SumsOf2(hwy::UnsignedTag /*type_tag*/,
+                              hwy::SizeTag<4> /*lane_size_tag*/, svuint32_t v) {
+  const ScalableTag<uint64_t> du64;
+  const svbool_t pg = detail::PTrue(du64);
+  return svadalp_u64_x(pg, Zero(du64), v);
+}
+
+}  // namespace detail
+#endif  // HWY_SVE_HAVE_2
+
+// ------------------------------ SumsOf4
+namespace detail {
+
+HWY_INLINE svint32_t SumsOf4(hwy::SignedTag /*type_tag*/,
+                             hwy::SizeTag<1> /*lane_size_tag*/, svint8_t v) {
+  return svdot_n_s32(Zero(ScalableTag<int32_t>()), v, 1);
+}
+
+HWY_INLINE svuint32_t SumsOf4(hwy::UnsignedTag /*type_tag*/,
+                              hwy::SizeTag<1> /*lane_size_tag*/, svuint8_t v) {
+  return svdot_n_u32(Zero(ScalableTag<uint32_t>()), v, 1);
+}
+
+HWY_INLINE svint64_t SumsOf4(hwy::SignedTag /*type_tag*/,
+                             hwy::SizeTag<2> /*lane_size_tag*/, svint16_t v) {
+  return svdot_n_s64(Zero(ScalableTag<int64_t>()), v, 1);
+}
+
+HWY_INLINE svuint64_t SumsOf4(hwy::UnsignedTag /*type_tag*/,
+                              hwy::SizeTag<2> /*lane_size_tag*/, svuint16_t v) {
+  return svdot_n_u64(Zero(ScalableTag<uint64_t>()), v, 1);
 }
 
+}  // namespace detail
+
 // ------------------------------ SaturatedAdd
 
 #ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
```
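The non-SVE2 `SumsOf8` fallback is a neat widening trick: after the byte dot-product yields one u32 sum per 4 bytes, each pair of adjacent u32 sums is viewed as one u64; a right shift by 32 isolates the upper sum and a zero-extension of the low 32 bits isolates the lower, and adding them gives the sum of 8 bytes. A scalar sketch, assuming nothing beyond standard C++:

```cpp
// Scalar model of the SumsOf8 fallback above (u32 pair -> u64 sum).
#include <cstdint>
#include <cstdio>

uint64_t PairwiseWiden(uint32_t lo_sum, uint32_t hi_sum) {
  const uint64_t both = (uint64_t(hi_sum) << 32) | lo_sum;  // two u32 in a u64
  const uint64_t hi = both >> 32;          // like svlsr_n_u64_x(..., 32)
  const uint64_t lo = both & 0xFFFFFFFFu;  // like svextw_u64_x (zero-extend)
  return hi + lo;
}

int main() {
  std::printf("%llu\n", (unsigned long long)PairwiseWiden(10, 32));  // 42
}
```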
```diff
@@ -726,14 +1044,15 @@
 
 // ------------------------------ ShiftLeft[Same]
 
-#define HWY_SVE_SHIFT_N(BASE, CHAR, BITS, HALF, NAME, OP)
-  template <int kBits>
-  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) {
-    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, kBits);
-  }
-  HWY_API HWY_SVE_V(BASE, BITS)
-      NAME##Same(HWY_SVE_V(BASE, BITS) v,
-    return sv##OP##_##CHAR##BITS##_x(
+#define HWY_SVE_SHIFT_N(BASE, CHAR, BITS, HALF, NAME, OP)                  \
+  template <int kBits>                                                     \
+  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) {            \
+    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, kBits);       \
+  }                                                                        \
+  HWY_API HWY_SVE_V(BASE, BITS)                                            \
+      NAME##Same(HWY_SVE_V(BASE, BITS) v, int bits) {                      \
+    return sv##OP##_##CHAR##BITS##_x(                                      \
+        HWY_SVE_PTRUE(BITS), v, static_cast<HWY_SVE_T(uint, BITS)>(bits)); \
   }
 
 HWY_SVE_FOREACH_UI(HWY_SVE_SHIFT_N, ShiftLeft, lsl_n)
@@ -747,15 +1066,35 @@ HWY_SVE_FOREACH_I(HWY_SVE_SHIFT_N, ShiftRight, asr_n)
 
 // ------------------------------ RotateRight
 
-
-
+#if HWY_SVE_HAVE_2
+
+#define HWY_SVE_ROTATE_RIGHT_N(BASE, CHAR, BITS, HALF, NAME, OP) \
+  template <int kBits>                                           \
+  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) {  \
+    if (kBits == 0) return v;                                    \
+    return sv##OP##_##CHAR##BITS(v, Zero(DFromV<decltype(v)>()), \
+                                 HWY_MAX(kBits, 1));             \
+  }
+
+HWY_SVE_FOREACH_U(HWY_SVE_ROTATE_RIGHT_N, RotateRight, xar_n)
+HWY_SVE_FOREACH_I(HWY_SVE_ROTATE_RIGHT_N, RotateRight, xar_n)
+
+#undef HWY_SVE_ROTATE_RIGHT_N
+
+#else  // !HWY_SVE_HAVE_2
+template <int kBits, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
 HWY_API V RotateRight(const V v) {
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
+
   constexpr size_t kSizeInBits = sizeof(TFromV<V>) * 8;
   static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
   if (kBits == 0) return v;
-
+
+  return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
             ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
 }
+#endif
 
 // ------------------------------ Shl/r
```
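Without SVE2's XAR instruction, the fallback rotates by OR-ing a logical right shift with a left shift of the complementary count; the `kBits == 0` early-out and the `HWY_MIN` clamp exist because shifting by the full lane width is undefined. A portable scalar sketch of the same fallback:

```cpp
// Portable sketch of the RotateRight fallback for 32-bit lanes.
#include <cstdint>
#include <cstdio>

template <int kBits>
uint32_t RotateRight(uint32_t v) {
  constexpr int kSize = 32;
  static_assert(0 <= kBits && kBits < kSize, "Invalid shift count");
  if (kBits == 0) return v;
  const int left = kSize - kBits;  // in [1, kSize - 1] once kBits != 0
  return (v >> kBits) | (v << left);
}

int main() {
  std::printf("%08x\n", RotateRight<8>(0x12345678u));  // 78123456
}
```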
```diff
@@ -803,11 +1142,7 @@ HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, MaxN, max_n)
 HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVV, Mul, mul)
 
 // ------------------------------ MulHigh
-
-// Not part of API, used internally:
-HWY_SVE_FOREACH_UI08(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
-HWY_SVE_FOREACH_UI32(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
-HWY_SVE_FOREACH_U64(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
+HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
 
 // ------------------------------ MulFixedPoint15
 HWY_API svint16_t MulFixedPoint15(svint16_t a, svint16_t b) {
@@ -830,6 +1165,14 @@ HWY_API svint16_t MulFixedPoint15(svint16_t a, svint16_t b) {
 }
 
 // ------------------------------ Div
+#ifdef HWY_NATIVE_INT_DIV
+#undef HWY_NATIVE_INT_DIV
+#else
+#define HWY_NATIVE_INT_DIV
+#endif
+
+HWY_SVE_FOREACH_UI32(HWY_SVE_RETV_ARGPVV, Div, div)
+HWY_SVE_FOREACH_UI64(HWY_SVE_RETV_ARGPVV, Div, div)
 HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Div, div)
 
 // ------------------------------ ApproximateReciprocal
@@ -981,18 +1324,40 @@ HWY_API size_t FindKnownFirstTrue(D d, svbool_t m) {
 }
 
 HWY_SVE_FOREACH(HWY_SVE_IF_THEN_ELSE, IfThenElse, sel)
+HWY_SVE_FOREACH_BF16(HWY_SVE_IF_THEN_ELSE, IfThenElse, sel)
 #undef HWY_SVE_IF_THEN_ELSE
 
+template <class V, class D = DFromV<V>, HWY_SVE_IF_EMULATED_D(D)>
+HWY_API V IfThenElse(const svbool_t mask, V yes, V no) {
+  const RebindToUnsigned<D> du;
+  return BitCast(
+      D(), IfThenElse(RebindMask(du, mask), BitCast(du, yes), BitCast(du, no)));
+}
+
 // ------------------------------ IfThenElseZero
-
+
+template <class V, class D = DFromV<V>, HWY_SVE_IF_NOT_EMULATED_D(D)>
 HWY_API V IfThenElseZero(const svbool_t mask, const V yes) {
-  return IfThenElse(mask, yes, Zero(
+  return IfThenElse(mask, yes, Zero(D()));
+}
+
+template <class V, class D = DFromV<V>, HWY_SVE_IF_EMULATED_D(D)>
+HWY_API V IfThenElseZero(const svbool_t mask, V yes) {
+  const RebindToUnsigned<D> du;
+  return BitCast(D(), IfThenElseZero(RebindMask(du, mask), BitCast(du, yes)));
 }
 
 // ------------------------------ IfThenZeroElse
-
+
+template <class V, class D = DFromV<V>, HWY_SVE_IF_NOT_EMULATED_D(D)>
 HWY_API V IfThenZeroElse(const svbool_t mask, const V no) {
-  return IfThenElse(mask, Zero(
+  return IfThenElse(mask, Zero(D()), no);
+}
+
+template <class V, class D = DFromV<V>, HWY_SVE_IF_EMULATED_D(D)>
+HWY_API V IfThenZeroElse(const svbool_t mask, V no) {
+  const RebindToUnsigned<D> du;
+  return BitCast(D(), IfThenZeroElse(RebindMask(du, mask), BitCast(du, no)));
}
 
 // ------------------------------ Additional mask logical operations
```
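The emulated overloads above all follow one pattern: reinterpret the bf16 lanes as u16, run the op there, and cast back. That is lossless because lane-select ops only move bit patterns, never reinterpret them numerically. A scalar sketch of the idea, where bf16 is simply a stored `uint16_t` bit pattern (illustrative, not Highway's `bfloat16_t`):

```cpp
// Scalar model of the "BitCast to u16, operate, BitCast back" emulation.
#include <cstdint>
#include <cstdio>

using BF16Bits = uint16_t;  // storage of one bf16 lane

BF16Bits IfThenElseZero(bool mask, BF16Bits yes) {
  // The select only chooses between bit patterns, so operating on the u16
  // view produces exactly the bits the bf16 op would have produced.
  return mask ? yes : BF16Bits{0};
}

int main() {
  const BF16Bits one = 0x3F80;  // bit pattern of bf16 1.0
  std::printf("%04x %04x\n", IfThenElseZero(true, one),
              IfThenElseZero(false, one));  // 3f80 0000
}
```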
```diff
@@ -1016,6 +1381,162 @@ HWY_API svbool_t SetAtOrAfterFirst(svbool_t m) {
   return Not(SetBeforeFirst(m));
 }
 
+// ------------------------------ PromoteMaskTo
+
+#ifdef HWY_NATIVE_PROMOTE_MASK_TO
+#undef HWY_NATIVE_PROMOTE_MASK_TO
+#else
+#define HWY_NATIVE_PROMOTE_MASK_TO
+#endif
+
+template <class DTo, class DFrom,
+          HWY_IF_T_SIZE_D(DTo, sizeof(TFromD<DFrom>) * 2)>
+HWY_API svbool_t PromoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/, svbool_t m) {
+  return svunpklo_b(m);
+}
+
+template <class DTo, class DFrom,
+          HWY_IF_T_SIZE_GT_D(DTo, sizeof(TFromD<DFrom>) * 2)>
+HWY_API svbool_t PromoteMaskTo(DTo d_to, DFrom d_from, svbool_t m) {
+  using TFrom = TFromD<DFrom>;
+  using TWFrom = MakeWide<MakeUnsigned<TFrom>>;
+  static_assert(sizeof(TWFrom) > sizeof(TFrom),
+                "sizeof(TWFrom) > sizeof(TFrom) must be true");
+
+  const Rebind<TWFrom, decltype(d_from)> dw_from;
+  return PromoteMaskTo(d_to, dw_from, PromoteMaskTo(dw_from, d_from, m));
+}
+
+// ------------------------------ DemoteMaskTo
+
+#ifdef HWY_NATIVE_DEMOTE_MASK_TO
+#undef HWY_NATIVE_DEMOTE_MASK_TO
+#else
+#define HWY_NATIVE_DEMOTE_MASK_TO
+#endif
+
+template <class DTo, class DFrom, HWY_IF_T_SIZE_D(DTo, 1),
+          HWY_IF_T_SIZE_D(DFrom, 2)>
+HWY_API svbool_t DemoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/, svbool_t m) {
+  return svuzp1_b8(m, m);
+}
+
+template <class DTo, class DFrom, HWY_IF_T_SIZE_D(DTo, 2),
+          HWY_IF_T_SIZE_D(DFrom, 4)>
+HWY_API svbool_t DemoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/, svbool_t m) {
+  return svuzp1_b16(m, m);
+}
+
+template <class DTo, class DFrom, HWY_IF_T_SIZE_D(DTo, 4),
+          HWY_IF_T_SIZE_D(DFrom, 8)>
+HWY_API svbool_t DemoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/, svbool_t m) {
+  return svuzp1_b32(m, m);
+}
+
+template <class DTo, class DFrom,
+          HWY_IF_T_SIZE_LE_D(DTo, sizeof(TFromD<DFrom>) / 4)>
+HWY_API svbool_t DemoteMaskTo(DTo d_to, DFrom d_from, svbool_t m) {
+  using TFrom = TFromD<DFrom>;
+  using TNFrom = MakeNarrow<MakeUnsigned<TFrom>>;
+  static_assert(sizeof(TNFrom) < sizeof(TFrom),
+                "sizeof(TNFrom) < sizeof(TFrom) must be true");
+
+  const Rebind<TNFrom, decltype(d_from)> dn_from;
+  return DemoteMaskTo(d_to, dn_from, DemoteMaskTo(dn_from, d_from, m));
+}
+
+// ------------------------------ LowerHalfOfMask
+#ifdef HWY_NATIVE_LOWER_HALF_OF_MASK
+#undef HWY_NATIVE_LOWER_HALF_OF_MASK
+#else
+#define HWY_NATIVE_LOWER_HALF_OF_MASK
+#endif
+
+template <class D>
+HWY_API svbool_t LowerHalfOfMask(D /*d*/, svbool_t m) {
+  return m;
+}
+
+// ------------------------------ MaskedAddOr etc. (IfThenElse)
+
+#ifdef HWY_NATIVE_MASKED_ARITH
+#undef HWY_NATIVE_MASKED_ARITH
+#else
+#define HWY_NATIVE_MASKED_ARITH
+#endif
+
+namespace detail {
+HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV, MaskedMin, min)
+HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV, MaskedMax, max)
+HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV, MaskedAdd, add)
+HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV, MaskedSub, sub)
+HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV, MaskedMul, mul)
+HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGMVV, MaskedDiv, div)
+HWY_SVE_FOREACH_UI32(HWY_SVE_RETV_ARGMVV, MaskedDiv, div)
+HWY_SVE_FOREACH_UI64(HWY_SVE_RETV_ARGMVV, MaskedDiv, div)
+#if HWY_SVE_HAVE_2
+HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGMVV, MaskedSatAdd, qadd)
+HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGMVV, MaskedSatSub, qsub)
+#endif
+}  // namespace detail
+
+template <class V, class M>
+HWY_API V MaskedMinOr(V no, M m, V a, V b) {
+  return IfThenElse(m, detail::MaskedMin(m, a, b), no);
+}
+
+template <class V, class M>
+HWY_API V MaskedMaxOr(V no, M m, V a, V b) {
+  return IfThenElse(m, detail::MaskedMax(m, a, b), no);
+}
+
+template <class V, class M>
+HWY_API V MaskedAddOr(V no, M m, V a, V b) {
+  return IfThenElse(m, detail::MaskedAdd(m, a, b), no);
+}
+
+template <class V, class M>
+HWY_API V MaskedSubOr(V no, M m, V a, V b) {
+  return IfThenElse(m, detail::MaskedSub(m, a, b), no);
+}
+
+template <class V, class M>
+HWY_API V MaskedMulOr(V no, M m, V a, V b) {
+  return IfThenElse(m, detail::MaskedMul(m, a, b), no);
+}
+
+template <class V, class M,
+          HWY_IF_T_SIZE_ONE_OF_V(
+              V, (hwy::IsSame<TFromV<V>, hwy::float16_t>() ? (1 << 2) : 0) |
+                     (1 << 4) | (1 << 8))>
+HWY_API V MaskedDivOr(V no, M m, V a, V b) {
+  return IfThenElse(m, detail::MaskedDiv(m, a, b), no);
+}
+
+// I8/U8/I16/U16 MaskedDivOr is implemented after I8/U8/I16/U16 Div
+
+#if HWY_SVE_HAVE_2
+template <class V, class M>
+HWY_API V MaskedSatAddOr(V no, M m, V a, V b) {
+  return IfThenElse(m, detail::MaskedSatAdd(m, a, b), no);
+}
+
+template <class V, class M>
+HWY_API V MaskedSatSubOr(V no, M m, V a, V b) {
+  return IfThenElse(m, detail::MaskedSatSub(m, a, b), no);
+}
+#else
+template <class V, class M>
+HWY_API V MaskedSatAddOr(V no, M m, V a, V b) {
+  return IfThenElse(m, SaturatedAdd(a, b), no);
+}
+
+template <class V, class M>
+HWY_API V MaskedSatSubOr(V no, M m, V a, V b) {
+  return IfThenElse(m, SaturatedSub(a, b), no);
+}
+#endif
+
 // ================================================== COMPARE
 
 // mask = f(vector, vector)
```
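`PromoteMaskTo` widens a predicate by unpacking its low half: the resulting mask covers half as many lanes, each twice as wide, and the 4x/8x cases recurse through the 2x step. A scalar sketch of the 2x semantics (illustration only; `svunpklo_b` is the real primitive):

```cpp
// Scalar model of the 2x PromoteMaskTo: a mask over N narrow lanes becomes a
// mask over the first N/2 wide lanes, preserving per-lane flags.
#include <cstdio>
#include <vector>

std::vector<bool> PromoteMaskTo(const std::vector<bool>& m) {
  return std::vector<bool>(m.begin(), m.begin() + m.size() / 2);
}

int main() {
  const std::vector<bool> m{1, 0, 1, 1, 0, 0, 1, 0};  // 8 narrow lanes
  for (bool b : PromoteMaskTo(m)) std::printf("%d", b ? 1 : 0);  // 1011
  std::printf("\n");
}
```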
```diff
@@ -1078,7 +1599,8 @@ HWY_API svbool_t TestBit(const V a, const V bit) {
 // ------------------------------ MaskFromVec (Ne)
 template <class V>
 HWY_API svbool_t MaskFromVec(const V v) {
-
+  using T = TFromV<V>;
+  return detail::NeN(v, ConvertScalarTo<T>(0));
 }
 
 // ------------------------------ VecFromMask
@@ -1090,6 +1612,22 @@ HWY_API VFromD<D> VecFromMask(const D d, svbool_t mask) {
   return BitCast(d, IfThenElseZero(mask, Set(di, -1)));
 }
 
+// ------------------------------ IsNegative (Lt)
+#ifdef HWY_NATIVE_IS_NEGATIVE
+#undef HWY_NATIVE_IS_NEGATIVE
+#else
+#define HWY_NATIVE_IS_NEGATIVE
+#endif
+
+template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
+HWY_API svbool_t IsNegative(V v) {
+  const DFromV<decltype(v)> d;
+  const RebindToSigned<decltype(d)> di;
+  using TI = TFromD<decltype(di)>;
+
+  return detail::LtN(BitCast(di, v), static_cast<TI>(0));
+}
+
 // ------------------------------ IfVecThenElse (MaskFromVec, IfThenElse)
 
 #if HWY_SVE_HAVE_2
@@ -1159,14 +1697,27 @@ HWY_API svbool_t IsNaN(const V v) {
   return Ne(v, v);  // could also use cmpuo
 }
 
+// Per-target flag to prevent generic_ops-inl.h from defining IsInf / IsFinite.
+// We use a fused Set/comparison for IsFinite.
+#ifdef HWY_NATIVE_ISINF
+#undef HWY_NATIVE_ISINF
+#else
+#define HWY_NATIVE_ISINF
+#endif
+
 template <class V>
 HWY_API svbool_t IsInf(const V v) {
   using T = TFromV<V>;
   const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
   const RebindToSigned<decltype(d)> di;
-
-  // 'Shift left' to clear the sign bit
-
+
+  // 'Shift left' to clear the sign bit
+  const VFromD<decltype(du)> vu = BitCast(du, v);
+  const VFromD<decltype(du)> v2 = Add(vu, vu);
+  // Check for exponent=max and mantissa=0.
+  const VFromD<decltype(di)> max2 = Set(di, hwy::MaxExponentTimes2<T>());
+  return RebindMask(d, Eq(v2, BitCast(du, max2)));
 }
 
 // Returns whether normal/subnormal/zero.
```
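The `IsInf` implementation works because adding the integer bit pattern to itself shifts the sign bit out the top; both infinities then compare equal to the "max exponent, zero mantissa" pattern shifted left once. A scalar sketch for f32, assuming IEEE-754 floats:

```cpp
// Scalar model of the IsInf bit trick above for f32.
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <limits>

bool IsInfF32(float f) {
  uint32_t u;
  std::memcpy(&u, &f, sizeof(u));  // BitCast(du, v)
  const uint32_t v2 = u + u;       // Add(vu, vu): discards the sign bit
  return v2 == 0xFF000000u;        // MaxExponentTimes2 pattern for f32
}

int main() {
  const float inf = std::numeric_limits<float>::infinity();
  std::printf("%d %d %d\n", IsInfF32(inf), IsInfF32(-inf), IsInfF32(1e38f));
  // 1 1 0; NaN also fails the compare because its mantissa bits are nonzero
}
```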
@@ -1187,147 +1738,135 @@ HWY_API svbool_t IsFinite(const V v) {
|
|
|
1187
1738
|
|
|
1188
1739
|
// ================================================== MEMORY
|
|
1189
1740
|
|
|
1190
|
-
// ------------------------------
|
|
1741
|
+
// ------------------------------ LoadU/MaskedLoad/LoadDup128/StoreU/Stream
|
|
1191
1742
|
|
|
1192
|
-
#define
|
|
1193
|
-
template <size_t N, int kPow2>
|
|
1194
|
-
HWY_API HWY_SVE_V(BASE, BITS)
|
|
1195
|
-
|
|
1196
|
-
|
|
1197
|
-
|
|
1198
|
-
|
|
1199
|
-
|
|
1743
|
+
#define HWY_SVE_MEM(BASE, CHAR, BITS, HALF, NAME, OP) \
|
|
1744
|
+
template <size_t N, int kPow2> \
|
|
1745
|
+
HWY_API HWY_SVE_V(BASE, BITS) \
|
|
1746
|
+
LoadU(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
|
|
1747
|
+
const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
|
|
1748
|
+
return svld1_##CHAR##BITS(detail::MakeMask(d), \
|
|
1749
|
+
detail::NativeLanePointer(p)); \
|
|
1750
|
+
} \
|
|
1751
|
+
template <size_t N, int kPow2> \
|
|
1752
|
+
HWY_API HWY_SVE_V(BASE, BITS) \
|
|
1753
|
+
MaskedLoad(svbool_t m, HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
|
|
1754
|
+
const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
|
|
1755
|
+
return svld1_##CHAR##BITS(m, detail::NativeLanePointer(p)); \
|
|
1756
|
+
} \
|
|
1757
|
+
template <size_t N, int kPow2> \
|
|
1758
|
+
HWY_API void StoreU(HWY_SVE_V(BASE, BITS) v, \
|
|
1759
|
+
HWY_SVE_D(BASE, BITS, N, kPow2) d, \
|
|
1760
|
+
HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
|
|
1761
|
+
svst1_##CHAR##BITS(detail::MakeMask(d), detail::NativeLanePointer(p), v); \
|
|
1762
|
+
} \
|
|
1763
|
+
template <size_t N, int kPow2> \
|
|
1764
|
+
HWY_API void Stream(HWY_SVE_V(BASE, BITS) v, \
|
|
1765
|
+
HWY_SVE_D(BASE, BITS, N, kPow2) d, \
|
|
1766
|
+
HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
|
|
1767
|
+
svstnt1_##CHAR##BITS(detail::MakeMask(d), detail::NativeLanePointer(p), \
|
|
1768
|
+
v); \
|
|
1769
|
+
} \
|
|
1770
|
+
template <size_t N, int kPow2> \
|
|
1771
|
+
HWY_API void BlendedStore(HWY_SVE_V(BASE, BITS) v, svbool_t m, \
|
|
1772
|
+
HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
|
|
1773
|
+
HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
|
|
1774
|
+
svst1_##CHAR##BITS(m, detail::NativeLanePointer(p), v); \
|
|
1200
1775
|
}
|
|
1201
1776
|
|
|
1202
|
-
|
|
1203
|
-
|
|
1204
|
-
HWY_API HWY_SVE_V(BASE, BITS) \
|
|
1205
|
-
NAME(svbool_t m, HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
|
|
1206
|
-
const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
|
|
1207
|
-
using T = detail::NativeLaneType<HWY_SVE_T(BASE, BITS)>; \
|
|
1208
|
-
return sv##OP##_##CHAR##BITS(m, reinterpret_cast<const T*>(p)); \
|
|
1209
|
-
}
|
|
1777
|
+
HWY_SVE_FOREACH(HWY_SVE_MEM, _, _)
|
|
1778
|
+
HWY_SVE_FOREACH_BF16(HWY_SVE_MEM, _, _)
|
|
1210
1779
|
|
|
1211
|
-
|
|
1212
|
-
|
|
1213
|
-
|
|
1214
|
-
|
|
1215
|
-
|
|
1216
|
-
using T = detail::NativeLaneType<HWY_SVE_T(BASE, BITS)>; \
|
|
1217
|
-
/* All-true predicate to load all 128 bits. */ \
|
|
1218
|
-
return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(8), \
|
|
-              reinterpret_cast<const T*>(p)); \
-}
+template <class D, HWY_SVE_IF_EMULATED_D(D)>
+HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, LoadU(du, detail::U16LanePointer(p)));
+}
 
-
-
-
-
-
-  using T = detail::NativeLaneType<HWY_SVE_T(BASE, BITS)>; \
-  sv##OP##_##CHAR##BITS(detail::MakeMask(d), reinterpret_cast<T*>(p), v); \
-}
+template <class D, HWY_SVE_IF_EMULATED_D(D)>
+HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
+  const RebindToUnsigned<decltype(d)> du;
+  StoreU(BitCast(du, v), du, detail::U16LanePointer(p));
+}
 
-
-
-
-
-
-
-
-
+template <class D, HWY_SVE_IF_EMULATED_D(D)>
+HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
+                             const TFromD<D>* HWY_RESTRICT p) {
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d,
+                 MaskedLoad(RebindMask(du, m), du, detail::U16LanePointer(p)));
+}
+
+// MaskedLoadOr is generic and does not require emulation.
 
-
-
-
-
-
+template <class D, HWY_SVE_IF_EMULATED_D(D)>
+HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
+                          TFromD<D>* HWY_RESTRICT p) {
+  const RebindToUnsigned<decltype(d)> du;
+  BlendedStore(BitCast(du, v), RebindMask(du, m), du,
+               detail::U16LanePointer(p));
+}
 
-
-HWY_SVE_FOREACH_BF16(HWY_SVE_MASKED_LOAD, MaskedLoad, ld1)
-HWY_SVE_FOREACH_BF16(HWY_SVE_STORE, Store, st1)
-HWY_SVE_FOREACH_BF16(HWY_SVE_STORE, Stream, stnt1)
-HWY_SVE_FOREACH_BF16(HWY_SVE_BLENDED_STORE, BlendedStore, st1)
+#undef HWY_SVE_MEM
 
 #if HWY_TARGET != HWY_SVE2_128
 namespace detail {
-
-
-
-
-
-
-
-
-
+#define HWY_SVE_LOAD_DUP128(BASE, CHAR, BITS, HALF, NAME, OP)   \
+  template <size_t N, int kPow2>                                \
+  HWY_API HWY_SVE_V(BASE, BITS)                                 \
+      NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */,             \
+           const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) {      \
+    /* All-true predicate to load all 128 bits. */              \
+    return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(8),              \
+                                 detail::NativeLanePointer(p)); \
+  }
 
-
+HWY_SVE_FOREACH(HWY_SVE_LOAD_DUP128, LoadDupFull128, ld1rq)
+HWY_SVE_FOREACH_BF16(HWY_SVE_LOAD_DUP128, LoadDupFull128, ld1rq)
 
-template <
-HWY_API
-
-  return
-      reinterpret_cast<const uint16_t * HWY_RESTRICT>(p));
+template <class D, HWY_SVE_IF_EMULATED_D(D)>
+HWY_API VFromD<D> LoadDupFull128(D d, const TFromD<D>* HWY_RESTRICT p) {
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, LoadDupFull128(du, detail::U16LanePointer(p)));
 }
 
-
+}  // namespace detail
+#endif  // HWY_TARGET != HWY_SVE2_128
 
 #if HWY_TARGET == HWY_SVE2_128
-// On the HWY_SVE2_128 target, LoadDup128 is the same as
+// On the HWY_SVE2_128 target, LoadDup128 is the same as LoadU since vectors
 // cannot exceed 16 bytes on the HWY_SVE2_128 target.
 template <class D>
 HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
-  return
+  return LoadU(d, p);
 }
 #else  // HWY_TARGET != HWY_SVE2_128
-// If D().MaxBytes() <= 16 is true, simply do a
+// If D().MaxBytes() <= 16 is true, simply do a LoadU operation.
 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
 HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
-  return
+  return LoadU(d, p);
 }
 
 // If D().MaxBytes() > 16 is true, need to load the vector using ld1rq
-template <class D, HWY_IF_V_SIZE_GT_D(D, 16)
-          hwy::EnableIf<!IsSame<TFromD<D>, bfloat16_t>()>* = nullptr>
+template <class D, HWY_IF_V_SIZE_GT_D(D, 16)>
 HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
   return detail::LoadDupFull128(d, p);
 }
 
-#if !HWY_SVE_HAVE_BFLOAT16
-
-template <class D, HWY_IF_V_SIZE_GT_D(D, 16), HWY_IF_BF16_D(D)>
-HWY_API VBF16 LoadDup128(D d, const bfloat16_t* HWY_RESTRICT p) {
-  return detail::LoadDupFull128(
-      RebindToUnsigned<decltype(d)>(),
-      reinterpret_cast<const uint16_t * HWY_RESTRICT>(p));
-}
-#endif  // !HWY_SVE_HAVE_BFLOAT16
-
 #endif  // HWY_TARGET != HWY_SVE2_128
 
-
-
-template <size_t N, int kPow2>
-HWY_API void Store(VBF16 v, Simd<bfloat16_t, N, kPow2> d,
-                   bfloat16_t* HWY_RESTRICT p) {
-  Store(v, RebindToUnsigned<decltype(d)>(),
-        reinterpret_cast<uint16_t * HWY_RESTRICT>(p));
-}
-
-#endif
-
-// ------------------------------ Load/StoreU
+// ------------------------------ Load/Store
 
 // SVE only requires lane alignment, not natural alignment of the entire
-// vector.
+// vector, so Load/Store are the same as LoadU/StoreU.
 template <class D>
-HWY_API VFromD<D>
-  return
+HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
+  return LoadU(d, p);
 }
 
 template <class V, class D>
-HWY_API void
-
+HWY_API void Store(const V v, D d, TFromD<D>* HWY_RESTRICT p) {
+  StoreU(v, d, p);
 }
 
 // ------------------------------ MaskedLoadOr
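The block above replaces per-type bf16 load/store macros with generic emulated
overloads that BitCast through u16. A minimal usage sketch (not part of the
diff), assuming an SVE-enabled Highway build:

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Copies bf16 data; works even when the compiler lacks native svbfloat16_t,
// thanks to the emulated LoadU/StoreU above. n is a multiple of Lanes(d).
void CopyBF16(const hwy::bfloat16_t* HWY_RESTRICT in,
              hwy::bfloat16_t* HWY_RESTRICT out, size_t n) {
  const hn::ScalableTag<hwy::bfloat16_t> d;
  for (size_t i = 0; i < n; i += hn::Lanes(d)) {
    hn::StoreU(hn::LoadU(d, in + i), d, out + i);
  }
}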
@@ -1362,8 +1901,8 @@ HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
   HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, svbool_t m,        \
                     HWY_SVE_D(BASE, BITS, N, kPow2) /*d*/,      \
                     HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base,  \
-                    HWY_SVE_V(int, BITS)
-    sv##OP##_s##BITS##index_##CHAR##BITS(m, base,
+                    HWY_SVE_V(int, BITS) indices) {             \
+    sv##OP##_s##BITS##index_##CHAR##BITS(m, base, indices, v);  \
   }
 
 HWY_SVE_FOREACH_UIF3264(HWY_SVE_SCATTER_OFFSET, ScatterOffset, st1_scatter)
@@ -1398,10 +1937,13 @@ HWY_API void ScatterIndex(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p,
 #define HWY_SVE_MASKED_GATHER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP) \
   template <size_t N, int kPow2>                                      \
   HWY_API HWY_SVE_V(BASE, BITS)                                       \
-      NAME(svbool_t m, HWY_SVE_D(BASE, BITS, N, kPow2)
+      NAME(svbool_t m, HWY_SVE_D(BASE, BITS, N, kPow2) d,             \
            const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base,           \
-           HWY_SVE_V(int, BITS)
-
+           HWY_SVE_V(int, BITS) indices) {                            \
+    const RebindToSigned<decltype(d)> di;                             \
+    (void)di; /* for HWY_DASSERT */                                   \
+    HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));                 \
+    return sv##OP##_s##BITS##index_##CHAR##BITS(m, base, indices);    \
   }
 
 HWY_SVE_FOREACH_UIF3264(HWY_SVE_GATHER_OFFSET, GatherOffset, ld1_gather)
@@ -1410,6 +1952,13 @@ HWY_SVE_FOREACH_UIF3264(HWY_SVE_MASKED_GATHER_INDEX, MaskedGatherIndex,
 #undef HWY_SVE_GATHER_OFFSET
 #undef HWY_SVE_MASKED_GATHER_INDEX
 
+template <class D>
+HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, svbool_t m, D d,
+                                      const TFromD<D>* HWY_RESTRICT p,
+                                      VFromD<RebindToSigned<D>> indices) {
+  return IfThenElse(m, MaskedGatherIndex(m, d, p, indices), no);
+}
+
 template <class D>
 HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT p,
                               VFromD<RebindToSigned<D>> indices) {
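MaskedGatherIndexOr (added above) composes MaskedGatherIndex with IfThenElse.
A minimal usage sketch (not part of the diff); the names base/indices are
illustrative, and this assumes a Highway build recent enough to expose the op:

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Gathers lanes where the mask is set; other lanes receive -1.0f.
hn::Vec<hn::ScalableTag<float>> GatherLowerHalf(const float* HWY_RESTRICT base) {
  const hn::ScalableTag<float> d;
  const hn::RebindToSigned<decltype(d)> di;
  const auto m = hn::FirstN(d, hn::Lanes(d) / 2);
  return hn::MaskedGatherIndexOr(hn::Set(d, -1.0f), m, d, base,
                                 hn::Iota(di, 0));
}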
@@ -1430,8 +1979,8 @@ HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT p,
   HWY_API void NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d,                      \
                     const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned,   \
                     HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1) { \
-    const HWY_SVE_TUPLE(BASE, BITS, 2) tuple =
-
+    const HWY_SVE_TUPLE(BASE, BITS, 2) tuple = sv##OP##_##CHAR##BITS(       \
+        detail::MakeMask(d), detail::NativeLanePointer(unaligned));         \
     v0 = svget2(tuple, 0);                                                  \
     v1 = svget2(tuple, 1);                                                  \
   }
@@ -1447,8 +1996,8 @@ HWY_SVE_FOREACH(HWY_SVE_LOAD2, LoadInterleaved2, ld2)
                     const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned,   \
                     HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1, \
                     HWY_SVE_V(BASE, BITS) & v2) {                           \
-    const HWY_SVE_TUPLE(BASE, BITS, 3) tuple =
-
+    const HWY_SVE_TUPLE(BASE, BITS, 3) tuple = sv##OP##_##CHAR##BITS(       \
+        detail::MakeMask(d), detail::NativeLanePointer(unaligned));         \
     v0 = svget3(tuple, 0);                                                  \
     v1 = svget3(tuple, 1);                                                  \
     v2 = svget3(tuple, 2);                                                  \
@@ -1465,8 +2014,8 @@ HWY_SVE_FOREACH(HWY_SVE_LOAD3, LoadInterleaved3, ld3)
                     const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned,    \
                     HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1,  \
                     HWY_SVE_V(BASE, BITS) & v2, HWY_SVE_V(BASE, BITS) & v3) { \
-    const HWY_SVE_TUPLE(BASE, BITS, 4) tuple =
-
+    const HWY_SVE_TUPLE(BASE, BITS, 4) tuple = sv##OP##_##CHAR##BITS(        \
+        detail::MakeMask(d), detail::NativeLanePointer(unaligned));          \
     v0 = svget4(tuple, 0);                                                   \
     v1 = svget4(tuple, 1);                                                   \
     v2 = svget4(tuple, 2);                                                   \
@@ -1478,12 +2027,14 @@ HWY_SVE_FOREACH(HWY_SVE_LOAD4, LoadInterleaved4, ld4)
 
 // ------------------------------ StoreInterleaved2
 
-#define HWY_SVE_STORE2(BASE, CHAR, BITS, HALF, NAME, OP)
-  template <size_t N, int kPow2>
-  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1,
-                    HWY_SVE_D(BASE, BITS, N, kPow2) d,
-                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) {
-    sv##OP##_##CHAR##BITS(detail::MakeMask(d),
+#define HWY_SVE_STORE2(BASE, CHAR, BITS, HALF, NAME, OP)                \
+  template <size_t N, int kPow2>                                        \
+  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \
+                    HWY_SVE_D(BASE, BITS, N, kPow2) d,                  \
+                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) {   \
+    sv##OP##_##CHAR##BITS(detail::MakeMask(d),                          \
+                          detail::NativeLanePointer(unaligned),         \
+                          Create2(d, v0, v1));                          \
   }
 HWY_SVE_FOREACH(HWY_SVE_STORE2, StoreInterleaved2, st2)
 
@@ -1497,7 +2048,8 @@ HWY_SVE_FOREACH(HWY_SVE_STORE2, StoreInterleaved2, st2)
                     HWY_SVE_V(BASE, BITS) v2,                           \
                     HWY_SVE_D(BASE, BITS, N, kPow2) d,                  \
                     HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) {   \
-    sv##OP##_##CHAR##BITS(detail::MakeMask(d),
+    sv##OP##_##CHAR##BITS(detail::MakeMask(d),                          \
+                          detail::NativeLanePointer(unaligned),         \
                           Create3(d, v0, v1, v2));                      \
   }
 HWY_SVE_FOREACH(HWY_SVE_STORE3, StoreInterleaved3, st3)
@@ -1512,7 +2064,8 @@ HWY_SVE_FOREACH(HWY_SVE_STORE3, StoreInterleaved3, st3)
                     HWY_SVE_V(BASE, BITS) v2, HWY_SVE_V(BASE, BITS) v3, \
                     HWY_SVE_D(BASE, BITS, N, kPow2) d,                  \
                     HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) {   \
-    sv##OP##_##CHAR##BITS(detail::MakeMask(d),
+    sv##OP##_##CHAR##BITS(detail::MakeMask(d),                          \
+                          detail::NativeLanePointer(unaligned),         \
                           Create4(d, v0, v1, v2, v3));                  \
   }
 HWY_SVE_FOREACH(HWY_SVE_STORE4, StoreInterleaved4, st4)
@@ -1602,6 +2155,22 @@ HWY_API svfloat32_t PromoteTo(Simd<float32_t, N, kPow2> /* d */,
   return svcvt_f32_f16_x(detail::PTrue(Simd<float16_t, N, kPow2>()), vv);
 }
 
+#ifdef HWY_NATIVE_PROMOTE_F16_TO_F64
+#undef HWY_NATIVE_PROMOTE_F16_TO_F64
+#else
+#define HWY_NATIVE_PROMOTE_F16_TO_F64
+#endif
+
+template <size_t N, int kPow2>
+HWY_API svfloat64_t PromoteTo(Simd<float64_t, N, kPow2> /* d */,
+                              const svfloat16_t v) {
+  // svcvt* expects inputs in even lanes, whereas Highway wants lower lanes, so
+  // first replicate each lane once.
+  const svfloat16_t vv = detail::ZipLowerSame(v, v);
+  return svcvt_f64_f16_x(detail::PTrue(Simd<float16_t, N, kPow2>()),
+                         detail::ZipLowerSame(vv, vv));
+}
+
 template <size_t N, int kPow2>
 HWY_API svfloat64_t PromoteTo(Simd<float64_t, N, kPow2> /* d */,
                               const svfloat32_t v) {
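The lane-replication comment above is easiest to see with a diagram. A scalar
model (not part of the diff) of the double ZipLowerSame, assuming 4 f16
elements per f64 lane:

#include <cstdio>
int main() {
  int v[8] = {0, 1, 2, 3, 4, 5, 6, 7};  // original f16 lane indices
  int vv[8], vvvv[8];
  for (int i = 0; i < 8; ++i) vv[i] = v[i / 2];     // ZipLowerSame(v, v)
  for (int i = 0; i < 8; ++i) vvvv[i] = vv[i / 2];  // ZipLowerSame(vv, vv)
  // svcvt_f64_f16 widens the f16 element overlapping each f64 lane, i.e.
  // element 4*i, which after the two zips is original lane i:
  for (int i = 0; i < 2; ++i) {
    printf("f64 lane %d <- f16 lane %d\n", i, vvvv[4 * i]);
  }
  return 0;
}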
@@ -1637,19 +2206,43 @@ HWY_API svuint64_t PromoteTo(Simd<uint64_t, N, kPow2> /* d */,
   return svcvt_u64_f32_x(detail::PTrue(Simd<float, N, kPow2>()), vv);
 }
 
-//
+// ------------------------------ PromoteUpperTo
+
 namespace detail {
+HWY_SVE_FOREACH_UI16(HWY_SVE_PROMOTE_TO, PromoteUpperTo, unpkhi)
 HWY_SVE_FOREACH_UI32(HWY_SVE_PROMOTE_TO, PromoteUpperTo, unpkhi)
+HWY_SVE_FOREACH_UI64(HWY_SVE_PROMOTE_TO, PromoteUpperTo, unpkhi)
 #undef HWY_SVE_PROMOTE_TO
+}  // namespace detail
 
-
-
-
-
-
+#ifdef HWY_NATIVE_PROMOTE_UPPER_TO
+#undef HWY_NATIVE_PROMOTE_UPPER_TO
+#else
+#define HWY_NATIVE_PROMOTE_UPPER_TO
+#endif
+
+// Unsigned->Unsigned or Signed->Signed
+template <class D, class V, typename TD = TFromD<D>, typename TV = TFromV<V>,
+          hwy::EnableIf<IsInteger<TD>() && IsInteger<TV>() &&
+                        (IsSigned<TD>() == IsSigned<TV>())>* = nullptr>
+HWY_API VFromD<D> PromoteUpperTo(D d, V v) {
+  if (detail::IsFull(d)) {
+    return detail::PromoteUpperTo(d, v);
+  }
+  const Rebind<TFromV<V>, decltype(d)> dh;
+  return PromoteTo(d, UpperHalf(dh, v));
 }
 
-
+// Differing signs or either is float
+template <class D, class V, typename TD = TFromD<D>, typename TV = TFromV<V>,
+          hwy::EnableIf<!IsInteger<TD>() || !IsInteger<TV>() ||
+                        (IsSigned<TD>() != IsSigned<TV>())>* = nullptr>
+HWY_API VFromD<D> PromoteUpperTo(D d, V v) {
+  // Lanes(d) may differ from Lanes(DFromV<V>()). Use the lane type from V
+  // because it cannot be deduced from D (could be either bf16 or f16).
+  const Rebind<TFromV<V>, decltype(d)> dh;
+  return PromoteTo(d, UpperHalf(dh, v));
+}
 
 // ------------------------------ DemoteTo U
 
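A minimal usage sketch of the PromoteUpperTo op added above (not part of the
diff): on full vectors, the same-signedness integer path lowers to a single
UNPKHI instead of UpperHalf followed by PromoteTo.

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

hn::Vec<hn::ScalableTag<int32_t>> UpperAsI32(
    hn::Vec<hn::ScalableTag<int16_t>> v) {
  const hn::ScalableTag<int32_t> d32;  // half the lanes of the i16 vector
  return hn::PromoteUpperTo(d32, v);
}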
@@ -1959,6 +2552,29 @@ HWY_API svuint8_t DemoteTo(Simd<uint8_t, N, kPow2> dn, const svuint64_t v) {
   return TruncateTo(dn, vn);
 }
 
+// ------------------------------ Unsigned to signed demotions
+
+// Disable the default unsigned to signed DemoteTo/ReorderDemote2To
+// implementations in generic_ops-inl.h on SVE/SVE2 as the SVE/SVE2 targets have
+// target-specific implementations of the unsigned to signed DemoteTo and
+// ReorderDemote2To ops
+
+// NOTE: hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr is used instead of
+// hwy::EnableIf<false>* = nullptr to avoid compiler errors since
+// !hwy::IsSame<V, V>() is always false and as !hwy::IsSame<V, V>() will cause
+// SFINAE to occur instead of a hard error due to a dependency on the V template
+// argument
+#undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
+#define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) \
+  hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr
+
+template <class D, class V, HWY_IF_SIGNED_D(D), HWY_IF_UNSIGNED_V(V),
+          HWY_IF_T_SIZE_LE_D(D, sizeof(TFromV<V>) - 1)>
+HWY_API VFromD<D> DemoteTo(D dn, V v) {
+  const RebindToUnsigned<D> dn_u;
+  return BitCast(dn, TruncateTo(dn_u, detail::SaturateU<TFromD<D>>(v)));
+}
+
 // ------------------------------ ConcatEven/ConcatOdd
 
 // WARNING: the upper half of these needs fixing up (uzp1/uzp2 use the
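A minimal usage sketch of the unsigned-to-signed DemoteTo added above (not
part of the diff): values above the signed maximum saturate, e.g. a u16 lane
holding 40000 demotes to an i8 lane holding 127.

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

using D16 = hn::ScalableTag<uint16_t>;
hn::Vec<hn::Rebind<int8_t, D16>> SaturateToI8(hn::Vec<D16> v) {
  const hn::Rebind<int8_t, D16> d8;
  return hn::DemoteTo(d8, v);  // per-lane clamp to [0, 127], then narrow
}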
@@ -1972,10 +2588,22 @@ namespace detail {
 }
 HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenFull, uzp1)
 HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddFull, uzp2)
+#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
+HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenFull,
+                                   uzp1)
+HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddFull,
+                                   uzp2)
+#endif  // HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
 #if defined(__ARM_FEATURE_SVE_MATMUL_FP64)
 HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenBlocks, uzp1q)
 HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks, uzp2q)
-#
+#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
+HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND,
+                                   ConcatEvenBlocks, uzp1q)
+HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks,
+                                   uzp2q)
+#endif  // HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
+#endif  // defined(__ARM_FEATURE_SVE_MATMUL_FP64)
 #undef HWY_SVE_CONCAT_EVERY_SECOND
 
 // Used to slide up / shift whole register left; mask indicates which range
@@ -1986,6 +2614,16 @@ HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks, uzp2q)
     return sv##OP##_##CHAR##BITS(mask, lo, hi); \
   }
 HWY_SVE_FOREACH(HWY_SVE_SPLICE, Splice, splice)
+#if HWY_SVE_HAVE_BF16_FEATURE
+HWY_SVE_FOREACH_BF16(HWY_SVE_SPLICE, Splice, splice)
+#else
+template <class V, HWY_IF_BF16_D(DFromV<V>)>
+HWY_INLINE V Splice(V hi, V lo, svbool_t mask) {
+  const DFromV<V> d;
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, Splice(BitCast(du, hi), BitCast(du, lo), mask));
+}
+#endif  // HWY_SVE_HAVE_BF16_FEATURE
 #undef HWY_SVE_SPLICE
 
 }  // namespace detail
@@ -2010,6 +2648,18 @@ HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
   return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2));
 }
 
+// ------------------------------ PromoteEvenTo/PromoteOddTo
+
+// Signed to signed PromoteEvenTo: 1 instruction instead of 2 in generic-inl.h.
+// Might as well also enable unsigned to unsigned, though it is just an And.
+namespace detail {
+HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGPV, NativePromoteEvenTo, extb)
+HWY_SVE_FOREACH_UI32(HWY_SVE_RETV_ARGPV, NativePromoteEvenTo, exth)
+HWY_SVE_FOREACH_UI64(HWY_SVE_RETV_ARGPV, NativePromoteEvenTo, extw)
+}  // namespace detail
+
+#include "hwy/ops/inside-inl.h"
+
 // ------------------------------ DemoteTo F
 
 // We already toggled HWY_NATIVE_F16C above.
@@ -2021,10 +2671,60 @@ HWY_API svfloat16_t DemoteTo(Simd<float16_t, N, kPow2> d, const svfloat32_t v) {
                                 in_even);  // lower half
 }
 
+#ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
+#undef HWY_NATIVE_DEMOTE_F64_TO_F16
+#else
+#define HWY_NATIVE_DEMOTE_F64_TO_F16
+#endif
+
+template <size_t N, int kPow2>
+HWY_API svfloat16_t DemoteTo(Simd<float16_t, N, kPow2> d, const svfloat64_t v) {
+  const svfloat16_t in_lo16 = svcvt_f16_f64_x(detail::PTrue(d), v);
+  const svfloat16_t in_even = detail::ConcatEvenFull(in_lo16, in_lo16);
+  return detail::ConcatEvenFull(in_even,
+                                in_even);  // lower half
+}
+
+#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
+#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
+#else
+#define HWY_NATIVE_DEMOTE_F32_TO_BF16
+#endif
+
+#if !HWY_SVE_HAVE_F32_TO_BF16C
+namespace detail {
+
+// Round a F32 value to the nearest BF16 value, with the result returned as the
+// rounded F32 value bitcasted to an U32
+
+// RoundF32ForDemoteToBF16 also converts NaN values to QNaN values to prevent
+// NaN F32 values from being converted to an infinity
+HWY_INLINE svuint32_t RoundF32ForDemoteToBF16(svfloat32_t v) {
+  const DFromV<decltype(v)> df32;
+  const RebindToUnsigned<decltype(df32)> du32;
+
+  const auto is_non_nan = Eq(v, v);
+  const auto bits32 = BitCast(du32, v);
+
+  const auto round_incr =
+      detail::AddN(detail::AndN(ShiftRight<16>(bits32), 1u), 0x7FFFu);
+  return MaskedAddOr(detail::OrN(bits32, 0x00400000u), is_non_nan, bits32,
+                     round_incr);
+}
+
+}  // namespace detail
+#endif  // !HWY_SVE_HAVE_F32_TO_BF16C
+
 template <size_t N, int kPow2>
 HWY_API VBF16 DemoteTo(Simd<bfloat16_t, N, kPow2> dbf16, svfloat32_t v) {
-
-
+#if HWY_SVE_HAVE_F32_TO_BF16C
+  const VBF16 in_even = svcvt_bf16_f32_x(detail::PTrue(dbf16), v);
+  return detail::ConcatEvenFull(in_even, in_even);
+#else
+  const svuint16_t in_odd =
+      BitCast(ScalableTag<uint16_t>(), detail::RoundF32ForDemoteToBF16(v));
+  return BitCast(dbf16, detail::ConcatOddFull(in_odd, in_odd));  // lower half
+#endif
 }
 
 template <size_t N, int kPow2>
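The rounding in RoundF32ForDemoteToBF16 is the standard round-to-nearest-even
bias: add 0x7FFF plus the lowest kept bit, then truncate the low 16 bits. A
scalar model (not part of the diff), ignoring the NaN fixup:

#include <cstdint>
#include <cstdio>
uint16_t RoundBitsToBF16(uint32_t bits32) {
  const uint32_t round_incr = ((bits32 >> 16) & 1u) + 0x7FFFu;
  return static_cast<uint16_t>((bits32 + round_incr) >> 16);
}
int main() {
  // A tie (low half 0x8000) with an even kept LSB stays even; with an odd
  // kept LSB, the increment carries and rounds up to the even neighbor.
  printf("%04X\n", RoundBitsToBF16(0x3F808000u));  // prints 3F80
  printf("%04X\n", RoundBitsToBF16(0x3F818000u));  // prints 3F82
  return 0;
}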
@@ -2065,32 +2765,31 @@ HWY_API svfloat32_t DemoteTo(Simd<float, N, kPow2> d, const svuint64_t v) {
 // ------------------------------ ConvertTo F
 
 #define HWY_SVE_CONVERT(BASE, CHAR, BITS, HALF, NAME, OP)                     \
-  /* signed
+  /* Float from signed */                                                     \
   template <size_t N, int kPow2>                                              \
   HWY_API HWY_SVE_V(BASE, BITS)                                               \
       NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(int, BITS) v) { \
     return sv##OP##_##CHAR##BITS##_s##BITS##_x(HWY_SVE_PTRUE(BITS), v);       \
   }                                                                           \
-  /* unsigned
+  /* Float from unsigned */                                                   \
   template <size_t N, int kPow2>                                              \
   HWY_API HWY_SVE_V(BASE, BITS)                                               \
       NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(uint, BITS) v) { \
     return sv##OP##_##CHAR##BITS##_u##BITS##_x(HWY_SVE_PTRUE(BITS), v);       \
   }                                                                           \
-  /*
+  /* Signed from float, rounding toward zero */                               \
   template <size_t N, int kPow2>                                              \
   HWY_API HWY_SVE_V(int, BITS)                                                \
       NAME(HWY_SVE_D(int, BITS, N, kPow2) /* d */, HWY_SVE_V(BASE, BITS) v) { \
     return sv##OP##_s##BITS##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v);       \
   }                                                                           \
-  /*
+  /* Unsigned from float, rounding toward zero */                             \
   template <size_t N, int kPow2>                                              \
   HWY_API HWY_SVE_V(uint, BITS)                                               \
       NAME(HWY_SVE_D(uint, BITS, N, kPow2) /* d */, HWY_SVE_V(BASE, BITS) v) { \
     return sv##OP##_u##BITS##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v);       \
   }
 
-// API only requires f32 but we provide f64 for use by Iota.
 HWY_SVE_FOREACH_F(HWY_SVE_CONVERT, ConvertTo, cvt)
 #undef HWY_SVE_CONVERT
 
@@ -2103,20 +2802,22 @@ HWY_API VFromD<DI> NearestInt(VF v) {
 
 // ------------------------------ Iota (Add, ConvertTo)
 
-#define HWY_SVE_IOTA(BASE, CHAR, BITS, HALF, NAME, OP)
-  template <size_t N, int kPow2>
-  HWY_API HWY_SVE_V(BASE, BITS)
-
-    return sv##OP##_##CHAR##BITS(
+#define HWY_SVE_IOTA(BASE, CHAR, BITS, HALF, NAME, OP)          \
+  template <size_t N, int kPow2, typename T2>                   \
+  HWY_API HWY_SVE_V(BASE, BITS)                                 \
+      NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, T2 first) { \
+    return sv##OP##_##CHAR##BITS(                               \
+        ConvertScalarTo<HWY_SVE_T(BASE, BITS)>(first), 1);      \
   }
 
 HWY_SVE_FOREACH_UI(HWY_SVE_IOTA, Iota, index)
 #undef HWY_SVE_IOTA
 
-template <class D, HWY_IF_FLOAT_D(D)>
-HWY_API VFromD<D> Iota(const D d,
+template <class D, typename T2, HWY_IF_FLOAT_D(D)>
+HWY_API VFromD<D> Iota(const D d, T2 first) {
   const RebindToSigned<D> di;
-  return detail::AddN(ConvertTo(d, Iota(di, 0)),
+  return detail::AddN(ConvertTo(d, Iota(di, 0)),
+                      ConvertScalarTo<TFromD<D>>(first));
 }
 
 // ------------------------------ InterleaveLower
@@ -2147,12 +2848,10 @@ HWY_API V InterleaveLower(const V a, const V b) {
 
 // Only use zip2 if vector are a powers of two, otherwise getting the actual
 // "upper half" requires MaskUpperHalf.
-#if HWY_TARGET == HWY_SVE2_128
 namespace detail {
 // Unlike Highway's ZipUpper, this returns the same type.
 HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, ZipUpperSame, zip2)
 }  // namespace detail
-#endif
 
 // Full vector: guaranteed to have at least one block
 template <class D, class V = VFromD<D>,
@@ -2184,6 +2883,30 @@ HWY_API V InterleaveUpper(D d, const V a, const V b) {
   return InterleaveUpper(DFromV<V>(), a, b);
 }
 
+// ------------------------------ InterleaveWholeLower
+#ifdef HWY_NATIVE_INTERLEAVE_WHOLE
+#undef HWY_NATIVE_INTERLEAVE_WHOLE
+#else
+#define HWY_NATIVE_INTERLEAVE_WHOLE
+#endif
+
+template <class D>
+HWY_API VFromD<D> InterleaveWholeLower(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return detail::ZipLowerSame(a, b);
+}
+
+// ------------------------------ InterleaveWholeUpper
+
+template <class D>
+HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
+  if (HWY_SVE_IS_POW2 && detail::IsFull(d)) {
+    return detail::ZipUpperSame(a, b);
+  }
+
+  const Half<decltype(d)> d2;
+  return InterleaveWholeLower(d, UpperHalf(d2, a), UpperHalf(d2, b));
+}
+
 // ------------------------------ Per4LaneBlockShuffle
 
 namespace detail {
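A minimal usage sketch of InterleaveWholeLower (not part of the diff): unlike
InterleaveLower, which interleaves within each 128-bit block, this yields
{a[0], b[0], a[1], b[1], ...} across the entire vector.

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

hn::Vec<hn::ScalableTag<uint8_t>> ZipBytes(hn::Vec<hn::ScalableTag<uint8_t>> a,
                                           hn::Vec<hn::ScalableTag<uint8_t>> b) {
  const hn::ScalableTag<uint8_t> d;
  return hn::InterleaveWholeLower(d, a, b);
}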
@@ -2432,7 +3155,13 @@ HWY_API V UpperHalf(const DH dh, const V v) {
 
 // ================================================== REDUCE
 
-
+#ifdef HWY_NATIVE_REDUCE_SCALAR
+#undef HWY_NATIVE_REDUCE_SCALAR
+#else
+#define HWY_NATIVE_REDUCE_SCALAR
+#endif
+
+// These return T, suitable for ReduceSum.
 namespace detail {
 #define HWY_SVE_REDUCE_ADD(BASE, CHAR, BITS, HALF, NAME, OP)                  \
   HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) { \
@@ -2462,24 +3191,53 @@ HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MaxOfLanesM, maxnmv)
 #undef HWY_SVE_REDUCE_ADD
 }  // namespace detail
 
-
-
-
-
+// detail::SumOfLanesM, detail::MinOfLanesM, and detail::MaxOfLanesM is more
+// efficient for N=4 I8/U8 reductions on SVE than the default implementations
+// of the N=4 I8/U8 ReduceSum/ReduceMin/ReduceMax operations in
+// generic_ops-inl.h
+#undef HWY_IF_REDUCE_D
+#define HWY_IF_REDUCE_D(D) hwy::EnableIf<HWY_MAX_LANES_D(D) != 1>* = nullptr
 
-
-
+#ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
+#undef HWY_NATIVE_REDUCE_SUM_4_UI8
+#else
+#define HWY_NATIVE_REDUCE_SUM_4_UI8
+#endif
+
+#ifdef HWY_NATIVE_REDUCE_MINMAX_4_UI8
+#undef HWY_NATIVE_REDUCE_MINMAX_4_UI8
+#else
+#define HWY_NATIVE_REDUCE_MINMAX_4_UI8
+#endif
+
+template <class D, HWY_IF_REDUCE_D(D)>
+HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
   return detail::SumOfLanesM(detail::MakeMask(d), v);
 }
 
-template <class D,
-
-  return
+template <class D, HWY_IF_REDUCE_D(D)>
+HWY_API TFromD<D> ReduceMin(D d, VFromD<D> v) {
+  return detail::MinOfLanesM(detail::MakeMask(d), v);
 }
 
-template <class D,
-
-  return
+template <class D, HWY_IF_REDUCE_D(D)>
+HWY_API TFromD<D> ReduceMax(D d, VFromD<D> v) {
+  return detail::MaxOfLanesM(detail::MakeMask(d), v);
+}
+
+// ------------------------------ SumOfLanes
+
+template <class D, HWY_IF_LANES_GT_D(D, 1)>
+HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
+  return Set(d, ReduceSum(d, v));
+}
+template <class D, HWY_IF_LANES_GT_D(D, 1)>
+HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
+  return Set(d, ReduceMin(d, v));
+}
+template <class D, HWY_IF_LANES_GT_D(D, 1)>
+HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
+  return Set(d, ReduceMax(d, v));
 }
 
 // ================================================== SWIZZLE
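A minimal usage sketch of the scalar reductions above (not part of the diff):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Horizontal sum of an array; n is a multiple of Lanes(d).
float SumArray(const float* HWY_RESTRICT p, size_t n) {
  const hn::ScalableTag<float> d;
  auto acc = hn::Zero(d);
  for (size_t i = 0; i < n; i += hn::Lanes(d)) {
    acc = hn::Add(acc, hn::LoadU(d, p + i));
  }
  return hn::ReduceSum(d, acc);  // a single reduction instruction on SVE
}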
@@ -2510,11 +3268,15 @@ HWY_API TFromV<V> ExtractLane(V v, size_t i) {
 }
 
 // ------------------------------ InsertLane (IfThenElse)
-template <class V>
-HWY_API V InsertLane(const V v, size_t i,
+template <class V, typename T>
+HWY_API V InsertLane(const V v, size_t i, T t) {
+  static_assert(sizeof(TFromV<V>) == sizeof(T), "Lane size mismatch");
   const DFromV<V> d;
-  const
-
+  const RebindToSigned<decltype(d)> di;
+  using TI = TFromD<decltype(di)>;
+  const svbool_t is_i = detail::EqN(Iota(di, 0), static_cast<TI>(i));
+  return IfThenElse(RebindMask(d, is_i),
+                    Set(d, hwy::ConvertScalarTo<TFromV<V>>(t)), v);
 }
 
 // ------------------------------ DupEven
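SVE has no direct insert-lane instruction, so InsertLane above blends via an
Iota==i mask. A minimal usage sketch (not part of the diff):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

hn::Vec<hn::ScalableTag<float>> WithLaneZeroed(
    hn::Vec<hn::ScalableTag<float>> v, size_t i) {
  return hn::InsertLane(v, i, 0.0f);  // only lane i changes
}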
@@ -2569,6 +3331,18 @@ HWY_API V OddEven(const V odd, const V even) {
 
 #endif  // HWY_TARGET
 
+// ------------------------------ InterleaveEven
+template <class D>
+HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return detail::InterleaveEven(a, b);
+}
+
+// ------------------------------ InterleaveOdd
+template <class D>
+HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return detail::InterleaveOdd(a, b);
+}
+
 // ------------------------------ OddEvenBlocks
 template <class V>
 HWY_API V OddEvenBlocks(const V odd, const V even) {
@@ -2623,6 +3397,9 @@ HWY_API VFromD<RebindToUnsigned<D>> SetTableIndices(D d, const TI* idx) {
 }
 
 HWY_SVE_FOREACH(HWY_SVE_TABLE, TableLookupLanes, tbl)
+#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
+HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_TABLE, TableLookupLanes, tbl)
+#endif
 #undef HWY_SVE_TABLE
 
 #if HWY_SVE_HAVE_2
@@ -2634,6 +3411,10 @@ namespace detail {
 }
 
 HWY_SVE_FOREACH(HWY_SVE_TABLE2, NativeTwoTableLookupLanes, tbl2)
+#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
+HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_TABLE2, NativeTwoTableLookupLanes,
+                                   tbl2)
+#endif
 #undef HWY_SVE_TABLE
 }  // namespace detail
 #endif  // HWY_SVE_HAVE_2
@@ -2705,6 +3486,9 @@ namespace detail {
 }
 
 HWY_SVE_FOREACH(HWY_SVE_REVERSE, ReverseFull, rev)
+#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
+HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_REVERSE, ReverseFull, rev)
+#endif
 #undef HWY_SVE_REVERSE
 
 }  // namespace detail
@@ -2775,14 +3559,14 @@ HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {  // 3210
 template <class D, HWY_IF_T_SIZE_D(D, 1)>
 HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) {
   const RebindToUnsigned<decltype(d)> du;
-  const
+  const RepartitionToWideX2<decltype(du)> du32;
   return BitCast(d, svrevb_u32_x(detail::PTrue(d), BitCast(du32, v)));
 }
 
 template <class D, HWY_IF_T_SIZE_D(D, 2)>
 HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) {
   const RebindToUnsigned<decltype(d)> du;
-  const
+  const RepartitionToWideX2<decltype(du)> du64;
   return BitCast(d, svrevh_u64_x(detail::PTrue(d), BitCast(du64, v)));
 }
 
@@ -2943,20 +3727,23 @@ HWY_API V BroadcastBlock(V v) {
   static_assert(0 <= kBlockIdx && kBlockIdx < d.MaxBlocks(),
                 "Invalid block index");
 
+  const RebindToUnsigned<decltype(d)> du;  // for bfloat16_t
+  using VU = VFromD<decltype(du)>;
+  const VU vu = BitCast(du, v);
+
 #if HWY_TARGET == HWY_SVE_256
-  return (kBlockIdx == 0) ? ConcatLowerLower(
-
+  return BitCast(d, (kBlockIdx == 0) ? ConcatLowerLower(du, vu, vu)
+                                     : ConcatUpperUpper(du, vu, vu));
 #else
-  const RebindToUnsigned<decltype(d)> du;
   using TU = TFromD<decltype(du)>;
   constexpr size_t kLanesPerBlock = detail::LanesPerBlock(d);
   constexpr size_t kBlockOffset =
       static_cast<size_t>(kBlockIdx) * kLanesPerBlock;
 
-  const
+  const VU idx = detail::AddN(
       detail::AndN(Iota(du, TU{0}), static_cast<TU>(kLanesPerBlock - 1)),
       static_cast<TU>(kBlockOffset));
-  return TableLookupLanes(
+  return BitCast(d, TableLookupLanes(vu, idx));
 #endif
 }
 
@@ -3455,6 +4242,95 @@ HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
 
 // ================================================== Ops with dependencies
 
+// ------------------------------ AddSub (Reverse2)
+
+// NOTE: svcadd_f*_x(HWY_SVE_PTRUE(BITS), a, b, 90) computes a[i] - b[i + 1] in
+// the even lanes and a[i] + b[i - 1] in the odd lanes.
+
+#define HWY_SVE_ADDSUB_F(BASE, CHAR, BITS, HALF, NAME, OP)                    \
+  HWY_API HWY_SVE_V(BASE, BITS)                                               \
+      NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) {                \
+    const DFromV<decltype(b)> d;                                              \
+    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, Reverse2(d, b),  \
+                                     90);                                     \
+  }
+
+HWY_SVE_FOREACH_F(HWY_SVE_ADDSUB_F, AddSub, cadd)
+
+#undef HWY_SVE_ADDSUB_F
+
+// NOTE: svcadd_s*(a, b, 90) and svcadd_u*(a, b, 90) compute a[i] - b[i + 1] in
+// the even lanes and a[i] + b[i - 1] in the odd lanes.
+
+#if HWY_SVE_HAVE_2
+#define HWY_SVE_ADDSUB_UI(BASE, CHAR, BITS, HALF, NAME, OP)    \
+  HWY_API HWY_SVE_V(BASE, BITS)                                \
+      NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
+    const DFromV<decltype(b)> d;                               \
+    return sv##OP##_##CHAR##BITS(a, Reverse2(d, b), 90);       \
+  }
+
+HWY_SVE_FOREACH_UI(HWY_SVE_ADDSUB_UI, AddSub, cadd)
+
+#undef HWY_SVE_ADDSUB_UI
+
+// Disable the default implementation of AddSub in generic_ops-inl.h on SVE2
+#undef HWY_IF_ADDSUB_V
+#define HWY_IF_ADDSUB_V(V)         \
+  HWY_IF_LANES_GT_D(DFromV<V>, 1), \
+  hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr
+
+#else  // !HWY_SVE_HAVE_2
+
+// Disable the default implementation of AddSub in generic_ops-inl.h for
+// floating-point vectors on SVE, but enable the default implementation of
+// AddSub in generic_ops-inl.h for integer vectors on SVE that do not support
+// SVE2
+#undef HWY_IF_ADDSUB_V
+#define HWY_IF_ADDSUB_V(V) \
+  HWY_IF_LANES_GT_D(DFromV<V>, 1), HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)
+
+#endif  // HWY_SVE_HAVE_2
+
+// ------------------------------ MulAddSub (AddSub)
+
+template <class V, HWY_IF_LANES_GT_D(DFromV<V>, 1), HWY_IF_FLOAT_V(V)>
+HWY_API V MulAddSub(V mul, V x, V sub_or_add) {
+  using T = TFromV<V>;
+
+  const DFromV<V> d;
+  const T neg_zero = ConvertScalarTo<T>(-0.0f);
+
+  return MulAdd(mul, x, AddSub(Set(d, neg_zero), sub_or_add));
+}
+
+#if HWY_SVE_HAVE_2
+
+// Disable the default implementation of MulAddSub in generic_ops-inl.h on SVE2
+#undef HWY_IF_MULADDSUB_V
+#define HWY_IF_MULADDSUB_V(V)      \
+  HWY_IF_LANES_GT_D(DFromV<V>, 1), \
+  hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr
+
+template <class V, HWY_IF_LANES_GT_D(DFromV<V>, 1),
+          HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
+HWY_API V MulAddSub(V mul, V x, V sub_or_add) {
+  const DFromV<V> d;
+  return MulAdd(mul, x, AddSub(Zero(d), sub_or_add));
+}
+
+#else  // !HWY_SVE_HAVE_2
+
+// Disable the default implementation of MulAddSub in generic_ops-inl.h for
+// floating-point vectors on SVE, but enable the default implementation of
+// AddSub in generic_ops-inl.h for integer vectors on SVE targets that do not
+// support SVE2
+#undef HWY_IF_MULADDSUB_V
+#define HWY_IF_MULADDSUB_V(V) \
+  HWY_IF_LANES_GT_D(DFromV<V>, 1), HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)
+
+#endif  // HWY_SVE_HAVE_2
+
 // ------------------------------ PromoteTo bfloat16 (ZipLower)
 template <size_t N, int kPow2>
 HWY_API svfloat32_t PromoteTo(Simd<float32_t, N, kPow2> df32, VBF16 v) {
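The Reverse2 in AddSub is what turns svcadd's rotate-by-90 semantics into a
plain subtract-even/add-odd. A scalar model (not part of the diff):

#include <cstdio>
int main() {
  const float a[4] = {1, 2, 3, 4}, b[4] = {10, 20, 30, 40};
  float br[4], out[4];
  // br = Reverse2(b): swap each adjacent pair.
  for (int i = 0; i < 4; i += 2) { br[i] = b[i + 1]; br[i + 1] = b[i]; }
  // svcadd(..., 90): even lanes a[i] - br[i+1], odd lanes a[i] + br[i-1].
  for (int i = 0; i < 4; i += 2) {
    out[i] = a[i] - br[i + 1];      // = a[i] - b[i]
    out[i + 1] = a[i + 1] + br[i];  // = a[i+1] + b[i+1]
  }
  for (int i = 0; i < 4; ++i) printf("%g ", out[i]);  // -9 22 -27 44
  return 0;
}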
@@ -3462,15 +4338,142 @@ HWY_API svfloat32_t PromoteTo(Simd<float32_t, N, kPow2> df32, VBF16 v) {
   return BitCast(df32, detail::ZipLowerSame(svdup_n_u16(0), BitCast(du16, v)));
 }
 
+// ------------------------------ PromoteEvenTo/PromoteOddTo (ConcatOddFull)
+
+namespace detail {
+
+// Signed to signed PromoteEvenTo
+template <class D>
+HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
+                                   hwy::SizeTag<2> /*to_lane_size_tag*/,
+                                   hwy::SignedTag /*from_type_tag*/, D d_to,
+                                   svint8_t v) {
+  return svextb_s16_x(detail::PTrue(d_to), BitCast(d_to, v));
+}
+
+template <class D>
+HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
+                                   hwy::SizeTag<4> /*to_lane_size_tag*/,
+                                   hwy::SignedTag /*from_type_tag*/, D d_to,
+                                   svint16_t v) {
+  return svexth_s32_x(detail::PTrue(d_to), BitCast(d_to, v));
+}
+
+template <class D>
+HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
+                                   hwy::SizeTag<8> /*to_lane_size_tag*/,
+                                   hwy::SignedTag /*from_type_tag*/, D d_to,
+                                   svint32_t v) {
+  return svextw_s64_x(detail::PTrue(d_to), BitCast(d_to, v));
+}
+
+// F16->F32 PromoteEvenTo
+template <class D>
+HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
+                                   hwy::SizeTag<4> /*to_lane_size_tag*/,
+                                   hwy::FloatTag /*from_type_tag*/, D d_to,
+                                   svfloat16_t v) {
+  const Repartition<float, decltype(d_to)> d_from;
+  return svcvt_f32_f16_x(detail::PTrue(d_from), v);
+}
+
+// F32->F64 PromoteEvenTo
+template <class D>
+HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
+                                   hwy::SizeTag<8> /*to_lane_size_tag*/,
+                                   hwy::FloatTag /*from_type_tag*/, D d_to,
+                                   svfloat32_t v) {
+  const Repartition<float, decltype(d_to)> d_from;
+  return svcvt_f64_f32_x(detail::PTrue(d_from), v);
+}
+
+// I32->F64 PromoteEvenTo
+template <class D>
+HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
+                                   hwy::SizeTag<8> /*to_lane_size_tag*/,
+                                   hwy::SignedTag /*from_type_tag*/, D d_to,
+                                   svint32_t v) {
+  const Repartition<float, decltype(d_to)> d_from;
+  return svcvt_f64_s32_x(detail::PTrue(d_from), v);
+}
+
+// U32->F64 PromoteEvenTo
+template <class D>
+HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
+                                   hwy::SizeTag<8> /*to_lane_size_tag*/,
+                                   hwy::UnsignedTag /*from_type_tag*/, D d_to,
+                                   svuint32_t v) {
+  const Repartition<float, decltype(d_to)> d_from;
+  return svcvt_f64_u32_x(detail::PTrue(d_from), v);
+}
+
+// F32->I64 PromoteEvenTo
+template <class D>
+HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
+                                   hwy::SizeTag<8> /*to_lane_size_tag*/,
+                                   hwy::FloatTag /*from_type_tag*/, D d_to,
+                                   svfloat32_t v) {
+  const Repartition<float, decltype(d_to)> d_from;
+  return svcvt_s64_f32_x(detail::PTrue(d_from), v);
+}
+
+// F32->U64 PromoteEvenTo
+template <class D>
+HWY_INLINE VFromD<D> PromoteEvenTo(hwy::UnsignedTag /*to_type_tag*/,
+                                   hwy::SizeTag<8> /*to_lane_size_tag*/,
+                                   hwy::FloatTag /*from_type_tag*/, D d_to,
+                                   svfloat32_t v) {
+  const Repartition<float, decltype(d_to)> d_from;
+  return svcvt_u64_f32_x(detail::PTrue(d_from), v);
+}
+
+// F16->F32 PromoteOddTo
+template <class D>
+HWY_INLINE VFromD<D> PromoteOddTo(hwy::FloatTag to_type_tag,
+                                  hwy::SizeTag<4> to_lane_size_tag,
+                                  hwy::FloatTag from_type_tag, D d_to,
+                                  svfloat16_t v) {
+  return PromoteEvenTo(to_type_tag, to_lane_size_tag, from_type_tag, d_to,
+                       DupOdd(v));
+}
+
+// I32/U32/F32->F64 PromoteOddTo
+template <class FromTypeTag, class D, class V>
+HWY_INLINE VFromD<D> PromoteOddTo(hwy::FloatTag to_type_tag,
+                                  hwy::SizeTag<8> to_lane_size_tag,
+                                  FromTypeTag from_type_tag, D d_to, V v) {
+  return PromoteEvenTo(to_type_tag, to_lane_size_tag, from_type_tag, d_to,
+                       DupOdd(v));
+}
+
+// F32->I64/U64 PromoteOddTo
+template <class ToTypeTag, class D, HWY_IF_UI64_D(D)>
+HWY_INLINE VFromD<D> PromoteOddTo(ToTypeTag to_type_tag,
+                                  hwy::SizeTag<8> to_lane_size_tag,
+                                  hwy::FloatTag from_type_tag, D d_to,
+                                  svfloat32_t v) {
+  return PromoteEvenTo(to_type_tag, to_lane_size_tag, from_type_tag, d_to,
+                       DupOdd(v));
+}
+
+}  // namespace detail
+
 // ------------------------------ ReorderDemote2To (OddEven)
 
 template <size_t N, int kPow2>
 HWY_API VBF16 ReorderDemote2To(Simd<bfloat16_t, N, kPow2> dbf16, svfloat32_t a,
                                svfloat32_t b) {
-
-  const
-
-
+#if HWY_SVE_HAVE_F32_TO_BF16C
+  const VBF16 b_in_even = svcvt_bf16_f32_x(detail::PTrue(dbf16), b);
+  return svcvtnt_bf16_f32_x(b_in_even, detail::PTrue(dbf16), a);
+#else
+  (void)dbf16;
+  const auto a_in_odd =
+      BitCast(ScalableTag<uint16_t>(), detail::RoundF32ForDemoteToBF16(a));
+  const auto b_in_odd =
+      BitCast(ScalableTag<uint16_t>(), detail::RoundF32ForDemoteToBF16(b));
+  return BitCast(dbf16, detail::InterleaveOdd(b_in_odd, a_in_odd));
+#endif
 }
 
 template <size_t N, int kPow2>
@@ -3608,6 +4611,14 @@ HWY_API svuint32_t ReorderDemote2To(Simd<uint32_t, N, kPow2> d32, svuint64_t a,
 #endif
 }
 
+template <class D, class V, HWY_IF_SIGNED_D(D), HWY_IF_UNSIGNED_V(V),
+          HWY_IF_T_SIZE_D(D, sizeof(TFromV<V>) / 2)>
+HWY_API VFromD<D> ReorderDemote2To(D dn, V a, V b) {
+  const auto clamped_a = BitCast(dn, detail::SaturateU<TFromD<D>>(a));
+  const auto clamped_b = BitCast(dn, detail::SaturateU<TFromD<D>>(b));
+  return detail::InterleaveEven(clamped_a, clamped_b);
+}
+
 template <class D, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>),
           HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
           HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2)>
@@ -3618,21 +4629,55 @@ HWY_API VFromD<D> OrderedDemote2To(D dn, V a, V b) {
   return Combine(dn, demoted_b, demoted_a);
 }
 
-template <
-HWY_API VBF16 OrderedDemote2To(
-
-
-  const
-  const
-  return
-
+template <size_t N, int kPow2>
+HWY_API VBF16 OrderedDemote2To(Simd<bfloat16_t, N, kPow2> dbf16, svfloat32_t a,
+                               svfloat32_t b) {
+#if HWY_SVE_HAVE_F32_TO_BF16C
+  (void)dbf16;
+  const VBF16 a_in_even = svcvt_bf16_f32_x(detail::PTrue(dbf16), a);
+  const VBF16 b_in_even = svcvt_bf16_f32_x(detail::PTrue(dbf16), b);
+  return ConcatEven(dbf16, b_in_even, a_in_even);
+#else
+  const RebindToUnsigned<decltype(dbf16)> du16;
+  const svuint16_t a_in_odd = BitCast(du16, detail::RoundF32ForDemoteToBF16(a));
+  const svuint16_t b_in_odd = BitCast(du16, detail::RoundF32ForDemoteToBF16(b));
+  return BitCast(dbf16, ConcatOdd(du16, b_in_odd, a_in_odd));  // lower half
+#endif
+}
+
+// ------------------------------ I8/U8/I16/U16 Div
+
+template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
+          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
+HWY_API V Div(V a, V b) {
+  const DFromV<decltype(a)> d;
+  const Half<decltype(d)> dh;
+  const RepartitionToWide<decltype(d)> dw;
+
+  const auto q_lo =
+      Div(PromoteTo(dw, LowerHalf(dh, a)), PromoteTo(dw, LowerHalf(dh, b)));
+  const auto q_hi = Div(PromoteUpperTo(dw, a), PromoteUpperTo(dw, b));
+
+  return OrderedDemote2To(d, q_lo, q_hi);
+}
+
+// ------------------------------ I8/U8/I16/U16 MaskedDivOr
+template <class V, class M, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
+          HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
+HWY_API V MaskedDivOr(V no, M m, V a, V b) {
+  return IfThenElse(m, Div(a, b), no);
 }
 
-// ------------------------------
+// ------------------------------ Mod (Div, NegMulAdd)
 template <class V>
-HWY_API V
-  return
+HWY_API V Mod(V a, V b) {
+  return NegMulAdd(Div(a, b), b, a);
+}
+
+// ------------------------------ MaskedModOr (Mod)
+template <class V, class M>
+HWY_API V MaskedModOr(V no, M m, V a, V b) {
+  return IfThenElse(m, Mod(a, b), no);
 }
 
 // ------------------------------ BroadcastSignBit (ShiftRight)
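Mod above is the usual identity a % b == a - (a / b) * b, expressed with
NegMulAdd. A minimal usage sketch (not part of the diff), relying on the
widened 16-bit Div defined just before it:

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

hn::Vec<hn::ScalableTag<int16_t>> LanewiseMod(
    hn::Vec<hn::ScalableTag<int16_t>> a, hn::Vec<hn::ScalableTag<int16_t>> b) {
  return hn::Mod(a, b);  // e.g. lanes {7, -7} % {3, 3} -> {1, -1}
}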
@@ -3645,11 +4690,7 @@ HWY_API V BroadcastSignBit(const V v) {
 template <class V>
 HWY_API V IfNegativeThenElse(V v, V yes, V no) {
   static_assert(IsSigned<TFromV<V>>(), "Only works for signed/float");
-
-  const RebindToSigned<decltype(d)> di;
-
-  const svbool_t m = detail::LtN(BitCast(di, v), 0);
-  return IfThenElse(m, yes, no);
+  return IfThenElse(IsNegative(v), yes, no);
 }
 
 // ------------------------------ AverageRound (ShiftRight)
@@ -3735,6 +4776,84 @@ HWY_INLINE svbool_t LoadMaskBits(D /* tag */,
   return TestBit(vbits, bit);
 }
 
+// ------------------------------ Dup128MaskFromMaskBits
+
+template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_LE_D(D, 8)>
+HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
+  const RebindToUnsigned<decltype(d)> du;
+
+  constexpr size_t kN = MaxLanes(d);
+  if (kN < 8) mask_bits &= (1u << kN) - 1;
+
+  // Replicate the lower 8 bits of mask_bits to each u8 lane
+  const svuint8_t bytes = BitCast(du, Set(du, static_cast<uint8_t>(mask_bits)));
+
+  const svuint8_t bit =
+      svdupq_n_u8(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
+  return TestBit(bytes, bit);
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_GT_D(D, 8)>
+HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
+  const RebindToUnsigned<decltype(d)> du;
+  const Repartition<uint16_t, decltype(du)> du16;
+
+  // Replicate the lower 16 bits of mask_bits to each u16 lane of a u16 vector,
+  // and then bitcast the replicated mask_bits to a u8 vector
+  const svuint8_t bytes =
+      BitCast(du, Set(du16, static_cast<uint16_t>(mask_bits)));
+  // Replicate bytes 8x such that each byte contains the bit that governs it.
+  const svuint8_t rep8 = svtbl_u8(bytes, ShiftRight<3>(Iota(du, 0)));
+
+  const svuint8_t bit =
+      svdupq_n_u8(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
+  return TestBit(rep8, bit);
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 2)>
+HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
+  const RebindToUnsigned<decltype(d)> du;
+  const Repartition<uint8_t, decltype(d)> du8;
+
+  constexpr size_t kN = MaxLanes(d);
+  if (kN < 8) mask_bits &= (1u << kN) - 1;
+
+  // Set all of the u8 lanes of bytes to the lower 8 bits of mask_bits
+  const svuint8_t bytes = Set(du8, static_cast<uint8_t>(mask_bits));
+
+  const svuint16_t bit = svdupq_n_u16(1, 2, 4, 8, 16, 32, 64, 128);
+  return TestBit(BitCast(du, bytes), bit);
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 4)>
+HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
+  const RebindToUnsigned<decltype(d)> du;
+  const Repartition<uint8_t, decltype(d)> du8;
+
+  constexpr size_t kN = MaxLanes(d);
+  if (kN < 4) mask_bits &= (1u << kN) - 1;
+
+  // Set all of the u8 lanes of bytes to the lower 8 bits of mask_bits
+  const svuint8_t bytes = Set(du8, static_cast<uint8_t>(mask_bits));
+
+  const svuint32_t bit = svdupq_n_u32(1, 2, 4, 8);
+  return TestBit(BitCast(du, bytes), bit);
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 8)>
+HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
+  const RebindToUnsigned<decltype(d)> du;
+  const Repartition<uint8_t, decltype(d)> du8;
+
+  if (MaxLanes(d) < 2) mask_bits &= 1u;
+
+  // Set all of the u8 lanes of bytes to the lower 8 bits of mask_bits
+  const svuint8_t bytes = Set(du8, static_cast<uint8_t>(mask_bits));
+
+  const svuint64_t bit = svdupq_n_u64(1, 2);
+  return TestBit(BitCast(du, bytes), bit);
+}
+
 // ------------------------------ StoreMaskBits
 
 namespace detail {
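A minimal usage sketch of Dup128MaskFromMaskBits (not part of the diff): bit i
of mask_bits selects lane i within each 128-bit block, and the pattern is
replicated across all blocks of the (possibly longer) SVE vector.

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

hn::Mask<hn::ScalableTag<uint32_t>> EvenLanesOfEachBlock() {
  const hn::ScalableTag<uint32_t> d;
  return hn::Dup128MaskFromMaskBits(d, 0x5u);  // lanes 0 and 2 per block
}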
@@ -4100,12 +5219,13 @@ HWY_INLINE VFromD<DU> LaneIndicesFromByteIndices(D, svuint8_t idx) {
 template <class V>
 HWY_INLINE V ExpandLoop(V v, svbool_t mask) {
   const DFromV<V> d;
+  using T = TFromV<V>;
   uint8_t mask_bytes[256 / 8];
   StoreMaskBits(d, mask, mask_bytes);
 
   // ShiftLeftLanes is expensive, so we're probably better off storing to memory
   // and loading the final result.
-  alignas(16)
+  alignas(16) T out[2 * MaxLanes(d)];
 
   svbool_t next = svpfalse_b();
   size_t input_consumed = 0;
@@ -4117,7 +5237,7 @@ HWY_INLINE V ExpandLoop(V v, svbool_t mask) {
     // instruction for variable-shift-reg, but we can splice.
     const V vH = detail::Splice(v, v, next);
     input_consumed += PopCount(mask_bits);
-    next = detail::GeN(iota,
+    next = detail::GeN(iota, ConvertScalarTo<T>(input_consumed));
 
     const auto idx = detail::LaneIndicesFromByteIndices(
         d, detail::IndicesForExpandFromBits(mask_bits));
@@ -4594,12 +5714,24 @@ HWY_API VFromD<DW> MulOdd(const V a, const V b) {
 #endif
 }
 
+HWY_API svint64_t MulEven(const svint64_t a, const svint64_t b) {
+  const auto lo = Mul(a, b);
+  const auto hi = MulHigh(a, b);
+  return detail::InterleaveEven(lo, hi);
+}
+
 HWY_API svuint64_t MulEven(const svuint64_t a, const svuint64_t b) {
   const auto lo = Mul(a, b);
   const auto hi = MulHigh(a, b);
   return detail::InterleaveEven(lo, hi);
 }
 
+HWY_API svint64_t MulOdd(const svint64_t a, const svint64_t b) {
+  const auto lo = Mul(a, b);
+  const auto hi = MulHigh(a, b);
+  return detail::InterleaveOdd(lo, hi);
+}
+
 HWY_API svuint64_t MulOdd(const svuint64_t a, const svuint64_t b) {
   const auto lo = Mul(a, b);
   const auto hi = MulHigh(a, b);
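The 64-bit MulEven/MulOdd overloads above return full 128-bit products: the
low half of each product lands in the even lane and the high half in the
adjacent odd lane. A minimal usage sketch (not part of the diff):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Result lanes: {lo(a0*b0), hi(a0*b0), lo(a2*b2), hi(a2*b2), ...}
hn::Vec<hn::ScalableTag<uint64_t>> FullProductsOfEvenLanes(
    hn::Vec<hn::ScalableTag<uint64_t>> a, hn::Vec<hn::ScalableTag<uint64_t>> b) {
  return hn::MulEven(a, b);
}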
@@ -4609,24 +5741,15 @@ HWY_API svuint64_t MulOdd(const svuint64_t a, const svuint64_t b) {
 // ------------------------------ WidenMulPairwiseAdd
 
 template <size_t N, int kPow2>
-HWY_API svfloat32_t WidenMulPairwiseAdd(Simd<float, N, kPow2>
+HWY_API svfloat32_t WidenMulPairwiseAdd(Simd<float, N, kPow2> df, VBF16 a,
                                         VBF16 b) {
-#if
-  const svfloat32_t even = svbfmlalb_f32(Zero(
+#if HWY_SVE_HAVE_F32_TO_BF16C
+  const svfloat32_t even = svbfmlalb_f32(Zero(df), a, b);
   return svbfmlalt_f32(even, a, b);
 #else
-
-
-  //
-  using VU32 = VFromD<decltype(du32)>;
-  const VU32 odd = Set(du32, 0xFFFF0000u);
-  const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
-  const VU32 ao = And(BitCast(du32, a), odd);
-  const VU32 be = ShiftLeft<16>(BitCast(du32, b));
-  const VU32 bo = And(BitCast(du32, b), odd);
-  return MulAdd(BitCast(df32, ae), BitCast(df32, be),
-                Mul(BitCast(df32, ao), BitCast(df32, bo)));
-#endif  // HWY_SVE_HAVE_BFLOAT16
+  return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b),
+                Mul(PromoteOddTo(df, a), PromoteOddTo(df, b)));
+#endif  // HWY_SVE_HAVE_BF16_FEATURE
 }
 
 template <size_t N, int kPow2>
@@ -4636,14 +5759,8 @@ HWY_API svint32_t WidenMulPairwiseAdd(Simd<int32_t, N, kPow2> d32, svint16_t a,
   (void)d32;
   return svmlalt_s32(svmullb_s32(a, b), a, b);
 #else
-
-
-  // Fortunately SVE has sign-extension for the even lanes.
-  const svint32_t ae = svexth_s32_x(pg, BitCast(d32, a));
-  const svint32_t be = svexth_s32_x(pg, BitCast(d32, b));
-  const svint32_t ao = ShiftRight<16>(BitCast(d32, a));
-  const svint32_t bo = ShiftRight<16>(BitCast(d32, b));
-  return svmla_s32_x(pg, svmul_s32_x(pg, ao, bo), ae, be);
+  return MulAdd(PromoteEvenTo(d32, a), PromoteEvenTo(d32, b),
+                Mul(PromoteOddTo(d32, a), PromoteOddTo(d32, b)));
 #endif
 }
 
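Both WidenMulPairwiseAdd hunks replace hand-rolled bit manipulation with PromoteEvenTo/PromoteOddTo, but the contract is unchanged: each output lane is the sum of products of one adjacent pair of narrower input lanes. A scalar model for the i16 to i32 case (an illustrative sketch, not from the package; the bf16 variant is analogous with f32 accumulation):

    #include <cstddef>
    #include <cstdint>

    // Scalar model of WidenMulPairwiseAdd: the even-lane product (what
    // PromoteEvenTo feeds) plus the odd-lane product (PromoteOddTo).
    void WidenMulPairwiseAddScalar(const int16_t* a, const int16_t* b,
                                   int32_t* out, size_t out_lanes) {
      for (size_t i = 0; i < out_lanes; ++i) {
        out[i] = static_cast<int32_t>(a[2 * i]) * b[2 * i] +
                 static_cast<int32_t>(a[2 * i + 1]) * b[2 * i + 1];
      }
    }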
@@ -4654,43 +5771,59 @@ HWY_API svuint32_t WidenMulPairwiseAdd(Simd<uint32_t, N, kPow2> d32,
   (void)d32;
   return svmlalt_u32(svmullb_u32(a, b), a, b);
 #else
-
-
-
-
-
-
-
-
+  return MulAdd(PromoteEvenTo(d32, a), PromoteEvenTo(d32, b),
+                Mul(PromoteOddTo(d32, a), PromoteOddTo(d32, b)));
+#endif
+}
+
+// ------------------------------ SatWidenMulAccumFixedPoint
+
+#if HWY_SVE_HAVE_2
+
+#ifdef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
+#undef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
+#else
+#define HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
 #endif
+
+template <class DI32, HWY_IF_I32_D(DI32)>
+HWY_API VFromD<DI32> SatWidenMulAccumFixedPoint(DI32 /*di32*/,
+                                                VFromD<Rebind<int16_t, DI32>> a,
+                                                VFromD<Rebind<int16_t, DI32>> b,
+                                                VFromD<DI32> sum) {
+  return svqdmlalb_s32(sum, detail::ZipLowerSame(a, a),
+                       detail::ZipLowerSame(b, b));
 }
 
+#endif  // HWY_SVE_HAVE_2
+
 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
 
-
-
-
-
-
-#
-
-  sum1 = svbfmlalt_f32(sum1, a, b);
-  return svbfmlalb_f32(sum0, a, b);
+#if HWY_SVE_HAVE_BF16_FEATURE
+
+// NOTE: we currently do not use SVE BFDOT for bf16 ReorderWidenMulAccumulate
+// because, apparently unlike NEON, it uses round to odd unless the additional
+// FEAT_EBF16 feature is available and enabled.
+#ifdef HWY_NATIVE_MUL_EVEN_BF16
+#undef HWY_NATIVE_MUL_EVEN_BF16
 #else
-
-
-
-
-
-
-
-
-
-
-
-
+#define HWY_NATIVE_MUL_EVEN_BF16
+#endif
+
+template <size_t N, int kPow2>
+HWY_API svfloat32_t MulEvenAdd(Simd<float, N, kPow2> /* d */, VBF16 a, VBF16 b,
+                               const svfloat32_t c) {
+  return svbfmlalb_f32(c, a, b);
+}
+
+template <size_t N, int kPow2>
+HWY_API svfloat32_t MulOddAdd(Simd<float, N, kPow2> /* d */, VBF16 a, VBF16 b,
+                              const svfloat32_t c) {
+  return svbfmlalt_f32(c, a, b);
 }
 
+#endif  // HWY_SVE_HAVE_BF16_FEATURE
+
 template <size_t N, int kPow2>
 HWY_API svint32_t ReorderWidenMulAccumulate(Simd<int32_t, N, kPow2> d32,
                                             svint16_t a, svint16_t b,
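The newly added SatWidenMulAccumFixedPoint lowers to SVE2's SQDMLALB, a signed saturating doubling multiply-accumulate, i.e. the classic Q15 fixed-point step. A scalar sketch of that semantics (hypothetical helper names, illustration only):

    #include <cstddef>
    #include <cstdint>
    #include <limits>

    // Clamp a 64-bit intermediate into the int32_t range.
    static int32_t SaturateToI32(int64_t x) {
      const int64_t lo = std::numeric_limits<int32_t>::min();
      const int64_t hi = std::numeric_limits<int32_t>::max();
      return static_cast<int32_t>(x < lo ? lo : (x > hi ? hi : x));
    }

    // Scalar model of SQDMLALB as used above: the doubled 32-bit product
    // saturates (only possible for a == b == INT16_MIN), then the
    // accumulation into sum saturates as well.
    void SatWidenMulAccumFixedPointScalar(const int16_t* a, const int16_t* b,
                                          int32_t* sum, size_t lanes) {
      for (size_t i = 0; i < lanes; ++i) {
        const int32_t prod2 =
            SaturateToI32(2 * static_cast<int64_t>(a[i]) * b[i]);
        sum[i] = SaturateToI32(static_cast<int64_t>(sum[i]) + prod2);
      }
    }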
@@ -4701,15 +5834,10 @@ HWY_API svint32_t ReorderWidenMulAccumulate(Simd<int32_t, N, kPow2> d32,
   sum1 = svmlalt_s32(sum1, a, b);
   return svmlalb_s32(sum0, a, b);
 #else
-
-  //
-
-
-  const svint32_t be = svexth_s32_x(pg, BitCast(d32, b));
-  const svint32_t ao = ShiftRight<16>(BitCast(d32, a));
-  const svint32_t bo = ShiftRight<16>(BitCast(d32, b));
-  sum1 = svmla_s32_x(pg, sum1, ao, bo);
-  return svmla_s32_x(pg, sum0, ae, be);
+  // Lane order within sum0/1 is undefined, hence we can avoid the
+  // longer-latency lane-crossing PromoteTo by using PromoteEvenTo.
+  sum1 = MulAdd(PromoteOddTo(d32, a), PromoteOddTo(d32, b), sum1);
+  return MulAdd(PromoteEvenTo(d32, a), PromoteEvenTo(d32, b), sum0);
 #endif
 }
 
@@ -4723,15 +5851,10 @@ HWY_API svuint32_t ReorderWidenMulAccumulate(Simd<uint32_t, N, kPow2> d32,
   sum1 = svmlalt_u32(sum1, a, b);
   return svmlalb_u32(sum0, a, b);
 #else
-
-  //
-
-
-  const svuint32_t be = svexth_u32_x(pg, BitCast(d32, b));
-  const svuint32_t ao = ShiftRight<16>(BitCast(d32, a));
-  const svuint32_t bo = ShiftRight<16>(BitCast(d32, b));
-  sum1 = svmla_u32_x(pg, sum1, ao, bo);
-  return svmla_u32_x(pg, sum0, ae, be);
+  // Lane order within sum0/1 is undefined, hence we can avoid the
+  // longer-latency lane-crossing PromoteTo by using PromoteEvenTo.
+  sum1 = MulAdd(PromoteOddTo(d32, a), PromoteOddTo(d32, b), sum1);
+  return MulAdd(PromoteEvenTo(d32, a), PromoteEvenTo(d32, b), sum0);
 #endif
 }
 
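Both ReorderWidenMulAccumulate hunks rely on the freedom stated in the new comment: lane order within sum0/sum1 is unspecified, so even and odd products may be accumulated separately with the cheaper PromoteEvenTo/PromoteOddTo. A scalar model for the signed variant (illustrative sketch, not package code; only sum0 + sum1 is meaningful to callers):

    #include <cstddef>
    #include <cstdint>

    // Scalar model: even-indexed products accumulate into sum0,
    // odd-indexed into sum1. n16 is the 16-bit lane count (assumed even).
    void ReorderWidenMulAccScalar(const int16_t* a, const int16_t* b,
                                  int32_t* sum0, int32_t* sum1, size_t n16) {
      for (size_t i = 0; i < n16; i += 2) {
        sum0[i / 2] += static_cast<int32_t>(a[i]) * b[i];
        sum1[i / 2] += static_cast<int32_t>(a[i + 1]) * b[i + 1];
      }
    }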
@@ -4817,8 +5940,10 @@ HWY_API VFromD<DU64> SumOfMulQuadAccumulate(DU64 /*du64*/, svuint16_t a,
 
 // ------------------------------ AESRound / CLMul
 
+// Static dispatch with -march=armv8-a+sve2+aes, or dynamic dispatch WITHOUT a
+// baseline, in which case we check for AES support at runtime.
 #if defined(__ARM_FEATURE_SVE2_AES) || \
-    (HWY_SVE_HAVE_2 && HWY_HAVE_RUNTIME_DISPATCH)
+    (HWY_SVE_HAVE_2 && HWY_HAVE_RUNTIME_DISPATCH && HWY_BASELINE_SVE2 == 0)
 
 // Per-target flag to prevent generic_ops-inl.h from defining AESRound.
 #ifdef HWY_NATIVE_AES
@@ -5059,14 +6184,15 @@ HWY_API V HighestSetBitIndex(V v) {
 }
 
 // ================================================== END MACROS
-namespace detail {  // for code folding
 #undef HWY_SVE_ALL_PTRUE
 #undef HWY_SVE_D
 #undef HWY_SVE_FOREACH
 #undef HWY_SVE_FOREACH_BF16
+#undef HWY_SVE_FOREACH_BF16_UNCONDITIONAL
 #undef HWY_SVE_FOREACH_F
 #undef HWY_SVE_FOREACH_F16
 #undef HWY_SVE_FOREACH_F32
+#undef HWY_SVE_FOREACH_F3264
 #undef HWY_SVE_FOREACH_F64
 #undef HWY_SVE_FOREACH_I
 #undef HWY_SVE_FOREACH_I08
@@ -5086,7 +6212,10 @@ namespace detail {  // for code folding
 #undef HWY_SVE_FOREACH_UI64
 #undef HWY_SVE_FOREACH_UIF3264
 #undef HWY_SVE_HAVE_2
+#undef HWY_SVE_IF_EMULATED_D
+#undef HWY_SVE_IF_NOT_EMULATED_D
 #undef HWY_SVE_PTRUE
+#undef HWY_SVE_RETV_ARGMVV
 #undef HWY_SVE_RETV_ARGPV
 #undef HWY_SVE_RETV_ARGPVN
 #undef HWY_SVE_RETV_ARGPVV
@@ -5098,7 +6227,6 @@ namespace detail {  // for code folding
 #undef HWY_SVE_UNDEFINED
 #undef HWY_SVE_V
 
-}  // namespace detail
 // NOLINTNEXTLINE(google-readability-namespace-comments)
 }  // namespace HWY_NAMESPACE
 }  // namespace hwy