@img/sharp-libvips-dev 1.2.1 → 1.2.2-rc.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/include/aom/aom_decoder.h +1 -1
- package/include/aom/aom_encoder.h +2 -0
- package/include/aom/aomcx.h +106 -25
- package/include/ffi.h +3 -3
- package/include/freetype2/freetype/config/ftconfig.h +1 -1
- package/include/freetype2/freetype/config/ftheader.h +1 -1
- package/include/freetype2/freetype/config/ftoption.h +37 -12
- package/include/freetype2/freetype/config/ftstdlib.h +1 -1
- package/include/freetype2/freetype/config/integer-types.h +29 -2
- package/include/freetype2/freetype/config/mac-support.h +1 -1
- package/include/freetype2/freetype/config/public-macros.h +3 -3
- package/include/freetype2/freetype/freetype.h +51 -47
- package/include/freetype2/freetype/ftadvanc.h +1 -1
- package/include/freetype2/freetype/ftbbox.h +1 -1
- package/include/freetype2/freetype/ftbdf.h +1 -1
- package/include/freetype2/freetype/ftbitmap.h +1 -1
- package/include/freetype2/freetype/ftbzip2.h +1 -1
- package/include/freetype2/freetype/ftcache.h +1 -1
- package/include/freetype2/freetype/ftcid.h +1 -1
- package/include/freetype2/freetype/ftcolor.h +13 -4
- package/include/freetype2/freetype/ftdriver.h +3 -3
- package/include/freetype2/freetype/fterrdef.h +1 -1
- package/include/freetype2/freetype/fterrors.h +1 -1
- package/include/freetype2/freetype/ftfntfmt.h +1 -1
- package/include/freetype2/freetype/ftgasp.h +1 -1
- package/include/freetype2/freetype/ftglyph.h +1 -1
- package/include/freetype2/freetype/ftgxval.h +1 -1
- package/include/freetype2/freetype/ftgzip.h +1 -1
- package/include/freetype2/freetype/ftimage.h +6 -2
- package/include/freetype2/freetype/ftincrem.h +1 -1
- package/include/freetype2/freetype/ftlcdfil.h +1 -1
- package/include/freetype2/freetype/ftlist.h +1 -1
- package/include/freetype2/freetype/ftlogging.h +184 -0
- package/include/freetype2/freetype/ftlzw.h +1 -1
- package/include/freetype2/freetype/ftmac.h +1 -1
- package/include/freetype2/freetype/ftmm.h +159 -103
- package/include/freetype2/freetype/ftmodapi.h +1 -1
- package/include/freetype2/freetype/ftmoderr.h +1 -1
- package/include/freetype2/freetype/ftotval.h +1 -1
- package/include/freetype2/freetype/ftoutln.h +1 -1
- package/include/freetype2/freetype/ftparams.h +1 -1
- package/include/freetype2/freetype/ftpfr.h +1 -1
- package/include/freetype2/freetype/ftrender.h +1 -1
- package/include/freetype2/freetype/ftsizes.h +1 -1
- package/include/freetype2/freetype/ftsnames.h +1 -1
- package/include/freetype2/freetype/ftstroke.h +1 -1
- package/include/freetype2/freetype/ftsynth.h +1 -1
- package/include/freetype2/freetype/ftsystem.h +1 -1
- package/include/freetype2/freetype/fttrigon.h +1 -1
- package/include/freetype2/freetype/fttypes.h +1 -1
- package/include/freetype2/freetype/ftwinfnt.h +2 -3
- package/include/freetype2/freetype/otsvg.h +1 -1
- package/include/freetype2/freetype/t1tables.h +1 -1
- package/include/freetype2/freetype/ttnameid.h +129 -129
- package/include/freetype2/freetype/tttables.h +8 -5
- package/include/freetype2/freetype/tttags.h +1 -1
- package/include/freetype2/ft2build.h +1 -1
- package/include/glib-2.0/gio/gdbuserror.h +9 -8
- package/include/glib-2.0/gio/ginetaddress.h +12 -0
- package/include/glib-2.0/gio/gioenums.h +9 -2
- package/include/glib-2.0/glib/gstring.h +2 -2
- package/include/glib-2.0/glib/gunicode.h +1 -1
- package/include/glib-2.0/gobject/glib-types.h +1 -1
- package/include/glib-2.0/gobject/gparam.h +1 -1
- package/include/glib-2.0/gobject/gvalue.h +78 -35
- package/include/harfbuzz/hb-script-list.h +12 -0
- package/include/harfbuzz/hb-version.h +3 -3
- package/include/hwy/abort.h +2 -19
- package/include/hwy/aligned_allocator.h +11 -7
- package/include/hwy/auto_tune.h +504 -0
- package/include/hwy/base.h +425 -104
- package/include/hwy/cache_control.h +16 -0
- package/include/hwy/detect_compiler_arch.h +32 -1
- package/include/hwy/detect_targets.h +251 -67
- package/include/hwy/foreach_target.h +35 -0
- package/include/hwy/highway.h +185 -76
- package/include/hwy/nanobenchmark.h +1 -19
- package/include/hwy/ops/arm_neon-inl.h +969 -458
- package/include/hwy/ops/arm_sve-inl.h +1137 -359
- package/include/hwy/ops/emu128-inl.h +97 -11
- package/include/hwy/ops/generic_ops-inl.h +1222 -34
- package/include/hwy/ops/loongarch_lasx-inl.h +4664 -0
- package/include/hwy/ops/loongarch_lsx-inl.h +5933 -0
- package/include/hwy/ops/ppc_vsx-inl.h +306 -126
- package/include/hwy/ops/rvv-inl.h +546 -51
- package/include/hwy/ops/scalar-inl.h +77 -22
- package/include/hwy/ops/set_macros-inl.h +138 -17
- package/include/hwy/ops/shared-inl.h +50 -10
- package/include/hwy/ops/wasm_128-inl.h +137 -92
- package/include/hwy/ops/x86_128-inl.h +773 -214
- package/include/hwy/ops/x86_256-inl.h +712 -255
- package/include/hwy/ops/x86_512-inl.h +429 -753
- package/include/hwy/ops/x86_avx3-inl.h +501 -0
- package/include/hwy/per_target.h +2 -1
- package/include/hwy/profiler.h +622 -486
- package/include/hwy/targets.h +62 -20
- package/include/hwy/timer-inl.h +8 -160
- package/include/hwy/timer.h +170 -3
- package/include/hwy/x86_cpuid.h +81 -0
- package/include/libheif/heif_cxx.h +25 -5
- package/include/libheif/heif_regions.h +5 -5
- package/include/libheif/heif_version.h +2 -2
- package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
- package/include/libxml2/libxml/valid.h +0 -3
- package/include/libxml2/libxml/xmlerror.h +1 -1
- package/include/libxml2/libxml/xmlversion.h +4 -4
- package/include/pango-1.0/pango/pango-enum-types.h +3 -0
- package/include/pango-1.0/pango/pango-features.h +3 -3
- package/include/pango-1.0/pango/pango-font.h +30 -0
- package/include/pango-1.0/pango/pango-version-macros.h +26 -0
- package/include/vips/connection.h +4 -4
- package/include/vips/version.h +4 -4
- package/include/zlib.h +3 -3
- package/package.json +1 -1
- package/versions.json +13 -13
The hunks below appear to be from package/include/hwy/ops/arm_sve-inl.h (the Highway SVE backend, listed above with +1137 -359):

@@ -117,10 +117,13 @@ namespace detail { // for code folding
 // SFINAE to occur instead of a hard error due to a dependency on the D template
 // argument
 #define HWY_SVE_IF_EMULATED_D(D) hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr
+#define HWY_GENERIC_IF_EMULATED_D(D) \
+  hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr
 #define HWY_SVE_IF_NOT_EMULATED_D(D) hwy::EnableIf<true>* = nullptr
 #else
 #define HWY_SVE_FOREACH_BF16(X_MACRO, NAME, OP)
 #define HWY_SVE_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
+#define HWY_GENERIC_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
 #define HWY_SVE_IF_NOT_EMULATED_D(D) HWY_IF_NOT_BF16_D(D)
 #endif  // HWY_SVE_HAVE_BF16_FEATURE

@@ -216,6 +219,19 @@ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SPECIALIZE, _, _)
   HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
     return sv##OP##_##CHAR##BITS(v); \
   }
+#define HWY_SVE_RETV_ARGMV_M(BASE, CHAR, BITS, HALF, NAME, OP) \
+  HWY_API HWY_SVE_V(BASE, BITS) \
+      NAME(HWY_SVE_V(BASE, BITS) no, svbool_t m, HWY_SVE_V(BASE, BITS) a) { \
+    return sv##OP##_##CHAR##BITS##_m(no, m, a); \
+  }
+#define HWY_SVE_RETV_ARGMV(BASE, CHAR, BITS, HALF, NAME, OP) \
+  HWY_API HWY_SVE_V(BASE, BITS) NAME(svbool_t m, HWY_SVE_V(BASE, BITS) v) { \
+    return sv##OP##_##CHAR##BITS##_x(m, v); \
+  }
+#define HWY_SVE_RETV_ARGMV_Z(BASE, CHAR, BITS, HALF, NAME, OP) \
+  HWY_API HWY_SVE_V(BASE, BITS) NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a) { \
+    return sv##OP##_##CHAR##BITS##_z(m, a); \
+  }

 // vector = f(vector, scalar), e.g. detail::AddN
 #define HWY_SVE_RETV_ARGPVN(BASE, CHAR, BITS, HALF, NAME, OP) \
@@ -249,6 +265,12 @@ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SPECIALIZE, _, _)
       NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
     return sv##OP##_##CHAR##BITS##_x(m, a, b); \
   }
+// User-specified mask. Mask=false value is zero.
+#define HWY_SVE_RETV_ARGMVV_Z(BASE, CHAR, BITS, HALF, NAME, OP) \
+  HWY_API HWY_SVE_V(BASE, BITS) \
+      NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
+    return sv##OP##_##CHAR##BITS##_z(m, a, b); \
+  }

 #define HWY_SVE_RETV_ARGVVV(BASE, CHAR, BITS, HALF, NAME, OP) \
   HWY_API HWY_SVE_V(BASE, BITS) \
@@ -256,6 +278,18 @@ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SPECIALIZE, _, _)
            HWY_SVE_V(BASE, BITS) c) { \
     return sv##OP##_##CHAR##BITS(a, b, c); \
   }
+#define HWY_SVE_RETV_ARGMVVV(BASE, CHAR, BITS, HALF, NAME, OP) \
+  HWY_API HWY_SVE_V(BASE, BITS) \
+      NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b, \
+           HWY_SVE_V(BASE, BITS) c) { \
+    return sv##OP##_##CHAR##BITS##_x(m, a, b, c); \
+  }
+#define HWY_SVE_RETV_ARGMVVV_Z(BASE, CHAR, BITS, HALF, NAME, OP) \
+  HWY_API HWY_SVE_V(BASE, BITS) \
+      NAME(svbool_t m, HWY_SVE_V(BASE, BITS) mul, HWY_SVE_V(BASE, BITS) x, \
+           HWY_SVE_V(BASE, BITS) add) { \
+    return sv##OP##_##CHAR##BITS##_z(m, x, mul, add); \
+  }

 // ------------------------------ Lanes

@@ -409,6 +443,27 @@ using VFromD = decltype(Set(D(), TFromD<D>()));

 using VBF16 = VFromD<ScalableTag<bfloat16_t>>;

+// ------------------------------ MaskedSetOr/MaskedSet
+
+#define HWY_SVE_MASKED_SET_OR(BASE, CHAR, BITS, HALF, NAME, OP) \
+  HWY_API HWY_SVE_V(BASE, BITS) \
+      NAME(HWY_SVE_V(BASE, BITS) no, svbool_t m, HWY_SVE_T(BASE, BITS) op) { \
+    return sv##OP##_##CHAR##BITS##_m(no, m, op); \
+  }
+
+HWY_SVE_FOREACH(HWY_SVE_MASKED_SET_OR, MaskedSetOr, dup_n)
+#undef HWY_SVE_MASKED_SET_OR
+
+#define HWY_SVE_MASKED_SET(BASE, CHAR, BITS, HALF, NAME, OP) \
+  template <size_t N, int kPow2> \
+  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
+                                     svbool_t m, HWY_SVE_T(BASE, BITS) op) { \
+    return sv##OP##_##CHAR##BITS##_z(m, op); \
+  }
+
+HWY_SVE_FOREACH(HWY_SVE_MASKED_SET, MaskedSet, dup_n)
+#undef HWY_SVE_MASKED_SET
+
 // ------------------------------ Zero

 template <class D>
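The two new macros wrap `svdup_n_*_m` / `svdup_n_*_z`: a masked broadcast that merges with `no` or zeroes inactive lanes. A minimal static-dispatch sketch of calling the resulting ops (function name and values are illustrative; assumes compilation for a target that defines them, such as SVE here):

```cpp
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;  // best enabled target, static dispatch

// Broadcast `value` into the first two lanes; remaining lanes come from
// `no` (MaskedSetOr) or are zeroed (MaskedSet).
float DemoMaskedSet(float value) {
  const hn::ScalableTag<float> d;
  const auto no = hn::Set(d, -1.0f);
  const auto m = hn::FirstN(d, 2);
  const auto merged = hn::MaskedSetOr(no, m, value);  // {v, v, -1, -1, ...}
  const auto zeroed = hn::MaskedSet(d, m, value);     // {v, v,  0,  0, ...}
  return hn::GetLane(hn::Add(merged, zeroed));        // 2 * value
}
```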
@@ -687,6 +742,25 @@ HWY_API svfloat64_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
   return svdupq_n_f64(t0, t1);
 }

+// ------------------------------ GetLane
+
+namespace detail {
+#define HWY_SVE_GET_LANE(BASE, CHAR, BITS, HALF, NAME, OP) \
+  HWY_INLINE HWY_SVE_T(BASE, BITS) \
+      NAME(HWY_SVE_V(BASE, BITS) v, svbool_t mask) { \
+    return sv##OP##_##CHAR##BITS(mask, v); \
+  }
+
+HWY_SVE_FOREACH(HWY_SVE_GET_LANE, GetLaneM, lasta)
+HWY_SVE_FOREACH(HWY_SVE_GET_LANE, ExtractLastMatchingLaneM, lastb)
+#undef HWY_SVE_GET_LANE
+}  // namespace detail
+
+template <class V>
+HWY_API TFromV<V> GetLane(V v) {
+  return detail::GetLaneM(v, detail::PFalse());
+}
+
 // ================================================== LOGICAL

 // detail::*N() functions accept a scalar argument to avoid extra Set().
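A note on the `lasta` trick above (my reading of the SVE intrinsics, not stated in the diff): `svlasta(pg, v)` returns the element *after* the last active lane, so an all-false predicate (`detail::PFalse()`) selects lane 0, which is exactly `GetLane`; `ExtractLane`, added in a later hunk, reuses the same helper with `FirstN(d, i)` so the selected element is lane `i`. A tiny sketch:

```cpp
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

int32_t DemoGetLane() {
  const hn::ScalableTag<int32_t> d;
  const auto v = hn::Iota(d, 100);  // {100, 101, 102, ...}
  return hn::GetLane(v);            // lane 0 -> 100
}
```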
@@ -724,6 +798,9 @@ HWY_API V Or(const V a, const V b) {
   return BitCast(df, Or(BitCast(du, a), BitCast(du, b)));
 }

+// ------------------------------ MaskedOr
+HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGMVV_Z, MaskedOr, orr)
+
 // ------------------------------ Xor

 namespace detail {
@@ -845,20 +922,6 @@ HWY_API VBF16 Neg(VBF16 v) {
 HWY_SVE_FOREACH_I(HWY_SVE_RETV_ARGPV, SaturatedNeg, qneg)
 #endif  // HWY_SVE_HAVE_2

-// ------------------------------ Abs
-HWY_SVE_FOREACH_IF(HWY_SVE_RETV_ARGPV, Abs, abs)
-
-// ------------------------------ SaturatedAbs
-#if HWY_SVE_HAVE_2
-#ifdef HWY_NATIVE_SATURATED_ABS
-#undef HWY_NATIVE_SATURATED_ABS
-#else
-#define HWY_NATIVE_SATURATED_ABS
-#endif
-
-HWY_SVE_FOREACH_I(HWY_SVE_RETV_ARGPV, SaturatedAbs, qabs)
-#endif  // HWY_SVE_HAVE_2
-
 // ================================================== ARITHMETIC

 // Per-target flags to prevent generic_ops-inl.h defining Add etc.
@@ -1064,6 +1127,35 @@ HWY_SVE_FOREACH_I(HWY_SVE_SHIFT_N, ShiftRight, asr_n)

 #undef HWY_SVE_SHIFT_N

+// ------------------------------ MaskedShift[Left/Right]
+
+#define HWY_SVE_SHIFT_Z(BASE, CHAR, BITS, HALF, NAME, OP) \
+  template <int kBits> \
+  HWY_API HWY_SVE_V(BASE, BITS) NAME(svbool_t m, HWY_SVE_V(BASE, BITS) v) { \
+    auto shifts = static_cast<HWY_SVE_T(uint, BITS)>(kBits); \
+    return sv##OP##_##CHAR##BITS##_z(m, v, shifts); \
+  }
+HWY_SVE_FOREACH_UI(HWY_SVE_SHIFT_Z, MaskedShiftLeft, lsl_n)
+HWY_SVE_FOREACH_I(HWY_SVE_SHIFT_Z, MaskedShiftRight, asr_n)
+HWY_SVE_FOREACH_U(HWY_SVE_SHIFT_Z, MaskedShiftRight, lsr_n)
+
+#undef HWY_SVE_SHIFT_Z
+
+// ------------------------------ MaskedShiftRightOr
+
+#define HWY_SVE_SHIFT_OR(BASE, CHAR, BITS, HALF, NAME, OP) \
+  template <int kBits> \
+  HWY_API HWY_SVE_V(BASE, BITS) \
+      NAME(HWY_SVE_V(BASE, BITS) no, svbool_t m, HWY_SVE_V(BASE, BITS) v) { \
+    auto shifts = static_cast<HWY_SVE_T(uint, BITS)>(kBits); \
+    return svsel##_##CHAR##BITS(m, sv##OP##_##CHAR##BITS##_z(m, v, shifts), \
+                                no); \
+  }
+HWY_SVE_FOREACH_I(HWY_SVE_SHIFT_OR, MaskedShiftRightOr, asr_n)
+HWY_SVE_FOREACH_U(HWY_SVE_SHIFT_OR, MaskedShiftRightOr, lsr_n)
+
+#undef HWY_SVE_SHIFT_OR
+
 // ------------------------------ RotateRight

 #if HWY_SVE_HAVE_2
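Sketch of the masked shifts defined above (assumes an SVE target; `kBits` is a compile-time shift amount as in the macros; the `_z` forms zero inactive lanes, while `MaskedShiftRightOr` keeps `no` there instead):

```cpp
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

uint32_t DemoMaskedShift() {
  const hn::ScalableTag<uint32_t> d;
  const auto v = hn::Set(d, 8u);
  const auto m = hn::FirstN(d, 1);
  // Active lanes: 8 << 2 = 32; inactive lanes are zeroed.
  const auto shifted = hn::MaskedShiftLeft<2>(m, v);
  // Active lanes: 8 >> 1 = 4; inactive lanes keep `no` (here: v).
  const auto merged = hn::MaskedShiftRightOr<1>(v, m, v);
  return hn::GetLane(hn::Add(shifted, merged));  // lane 0: 32 + 4 = 36
}
```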
@@ -1096,7 +1188,7 @@ HWY_API V RotateRight(const V v) {
 }
 #endif

-// ------------------------------ Shl
+// ------------------------------ Shl, Shr

 #define HWY_SVE_SHIFT(BASE, CHAR, BITS, HALF, NAME, OP) \
   HWY_API HWY_SVE_V(BASE, BITS) \
@@ -1113,17 +1205,95 @@ HWY_SVE_FOREACH_I(HWY_SVE_SHIFT, Shr, asr)

 #undef HWY_SVE_SHIFT

-// ------------------------------
+// ------------------------------ RoundingShiftLeft[Same]/RoundingShr

-
-HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Max, max)
-HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Min, minnm)
-HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Max, maxnm)
+#if HWY_SVE_HAVE_2

-
-
-
-
+#ifdef HWY_NATIVE_ROUNDING_SHR
+#undef HWY_NATIVE_ROUNDING_SHR
+#else
+#define HWY_NATIVE_ROUNDING_SHR
+#endif
+
+#define HWY_SVE_ROUNDING_SHR_N(BASE, CHAR, BITS, HALF, NAME, OP) \
+  template <int kBits> \
+  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
+    HWY_IF_CONSTEXPR(kBits == 0) { return v; } \
+    \
+    return sv##OP##_##CHAR##BITS##_x( \
+        HWY_SVE_PTRUE(BITS), v, static_cast<uint64_t>(HWY_MAX(kBits, 1))); \
+  }
+
+HWY_SVE_FOREACH_UI(HWY_SVE_ROUNDING_SHR_N, RoundingShiftRight, rshr_n)
+
+#undef HWY_SVE_ROUNDING_SHR_N
+
+#define HWY_SVE_ROUNDING_SHR(BASE, CHAR, BITS, HALF, NAME, OP) \
+  HWY_API HWY_SVE_V(BASE, BITS) \
+      NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_V(BASE, BITS) bits) { \
+    const RebindToSigned<DFromV<decltype(v)>> di; \
+    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, \
+                                     Neg(BitCast(di, bits))); \
+  }
+
+HWY_SVE_FOREACH_UI(HWY_SVE_ROUNDING_SHR, RoundingShr, rshl)
+
+#undef HWY_SVE_ROUNDING_SHR
+
+template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
+HWY_API V RoundingShiftRightSame(V v, int bits) {
+  const DFromV<V> d;
+  using T = TFromD<decltype(d)>;
+  return RoundingShr(v, Set(d, static_cast<T>(bits)));
+}
+
+#endif  // HWY_SVE_HAVE_2
+
+// ------------------------------ BroadcastSignBit (ShiftRight)
+template <class V>
+HWY_API V BroadcastSignBit(const V v) {
+  return ShiftRight<sizeof(TFromV<V>) * 8 - 1>(v);
+}
+
+// ------------------------------ Abs (ShiftRight, Add, Xor, AndN)
+
+// Workaround for incorrect results with `svabs`.
+#if HWY_COMPILER_CLANG
+template <class V, HWY_IF_SIGNED_V(V)>
+HWY_API V Abs(V v) {
+  const V sign = BroadcastSignBit(v);
+  return Xor(Add(v, sign), sign);
+}
+
+template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
+HWY_NOINLINE V Abs(V v) {
+  const DFromV<V> d;
+  const RebindToUnsigned<decltype(d)> du;
+  using TU = MakeUnsigned<TFromD<decltype(d)>>;
+  return BitCast(
+      d, detail::AndN(BitCast(du, v), static_cast<TU>(~SignMask<TU>())));
+}
+
+#else
+HWY_SVE_FOREACH_IF(HWY_SVE_RETV_ARGPV, Abs, abs)
+#endif
+
+// ------------------------------ SaturatedAbs
+#if HWY_SVE_HAVE_2
+#ifdef HWY_NATIVE_SATURATED_ABS
+#undef HWY_NATIVE_SATURATED_ABS
+#else
+#define HWY_NATIVE_SATURATED_ABS
+#endif
+
+HWY_SVE_FOREACH_I(HWY_SVE_RETV_ARGPV, SaturatedAbs, qabs)
+#endif  // HWY_SVE_HAVE_2
+
+// ------------------------------ MaskedAbsOr
+HWY_SVE_FOREACH_IF(HWY_SVE_RETV_ARGMV_M, MaskedAbsOr, abs)
+
+// ------------------------------ MaskedAbs
+HWY_SVE_FOREACH_IF(HWY_SVE_RETV_ARGMV_Z, MaskedAbs, abs)

 // ------------------------------ Mul

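For reference, a scalar model of the rounding right-shift that the `rshr`/`rshl` intrinsics implement (my reading: add half of the shifted-out range before shifting; the `HWY_MAX(kBits, 1)` above only sidesteps an invalid zero-shift immediate, since the `kBits == 0` case already returned):

```cpp
#include <cstdint>

// Scalar reference for unsigned RoundingShiftRight<k>.
constexpr uint32_t RoundingShrScalar(uint32_t x, int k) {
  return k == 0 ? x : ((x + (1u << (k - 1))) >> k);
}
static_assert(RoundingShrScalar(7, 1) == 4, "7/2 = 3.5 rounds up to 4");
static_assert(RoundingShrScalar(6, 2) == 2, "6/4 = 1.5 rounds up to 2");
static_assert(RoundingShrScalar(4, 2) == 1, "4/4 = 1 exactly");
```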
@@ -1187,6 +1357,15 @@ HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGV, ApproximateReciprocal, recpe)
 // ------------------------------ Sqrt
 HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Sqrt, sqrt)

+// ------------------------------ MaskedSqrt
+#ifdef HWY_NATIVE_MASKED_SQRT
+#undef HWY_NATIVE_MASKED_SQRT
+#else
+#define HWY_NATIVE_MASKED_SQRT
+#endif
+
+HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGMV_Z, MaskedSqrt, sqrt)
+
 // ------------------------------ ApproximateReciprocalSqrt
 #ifdef HWY_NATIVE_F64_APPROX_RSQRT
 #undef HWY_NATIVE_F64_APPROX_RSQRT
@@ -1466,14 +1645,17 @@ HWY_API svbool_t LowerHalfOfMask(D /*d*/, svbool_t m) {
 #endif

 namespace detail {
-
-
+HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGMVV, MaskedMin, minnm)
+HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGMVV, MaskedMax, maxnm)
+HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGMVV, MaskedMin, min)
+HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGMVV, MaskedMax, max)
 HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV, MaskedAdd, add)
 HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV, MaskedSub, sub)
 HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV, MaskedMul, mul)
 HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGMVV, MaskedDiv, div)
 HWY_SVE_FOREACH_UI32(HWY_SVE_RETV_ARGMVV, MaskedDiv, div)
 HWY_SVE_FOREACH_UI64(HWY_SVE_RETV_ARGMVV, MaskedDiv, div)
+HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGMV, MaskedSqrt, sqrt)
 #if HWY_SVE_HAVE_2
 HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGMVV, MaskedSatAdd, qadd)
 HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGMVV, MaskedSatSub, qsub)
@@ -1537,6 +1719,187 @@ HWY_API V MaskedSatSubOr(V no, M m, V a, V b) {
 }
 #endif

+// ------------------------------ MaskedMulAddOr
+namespace detail {
+HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVVV, MaskedMulAdd, mad)
+}
+
+// Per-target flag to prevent generic_ops-inl.h from defining int
+// MaskedMulAddOr.
+#ifdef HWY_NATIVE_MASKED_INT_FMA
+#undef HWY_NATIVE_MASKED_INT_FMA
+#else
+#define HWY_NATIVE_MASKED_INT_FMA
+#endif
+
+template <class V, class M>
+HWY_API V MaskedMulAddOr(V no, M m, V mul, V x, V add) {
+  return IfThenElse(m, detail::MaskedMulAdd(m, mul, x, add), no);
+}
+
+template <class V, HWY_IF_FLOAT_V(V), class M>
+HWY_API V MaskedSqrtOr(V no, M m, V v) {
+  return IfThenElse(m, detail::MaskedSqrt(m, v), no);
+}
+
+// ================================================== REDUCE
+
+#ifdef HWY_NATIVE_REDUCE_SCALAR
+#undef HWY_NATIVE_REDUCE_SCALAR
+#else
+#define HWY_NATIVE_REDUCE_SCALAR
+#endif
+
+// These return T, suitable for ReduceSum.
+namespace detail {
+#define HWY_SVE_REDUCE_ADD(BASE, CHAR, BITS, HALF, NAME, OP) \
+  HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) { \
+    /* The intrinsic returns [u]int64_t; truncate to T so we can broadcast. */ \
+    using T = HWY_SVE_T(BASE, BITS); \
+    using TU = MakeUnsigned<T>; \
+    constexpr uint64_t kMask = LimitsMax<TU>(); \
+    return static_cast<T>(static_cast<TU>( \
+        static_cast<uint64_t>(sv##OP##_##CHAR##BITS(pg, v)) & kMask)); \
+  }
+
+#define HWY_SVE_REDUCE(BASE, CHAR, BITS, HALF, NAME, OP) \
+  HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) { \
+    return sv##OP##_##CHAR##BITS(pg, v); \
+  }
+
+// TODO: Remove SumOfLanesM in favor of using MaskedReduceSum
+HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE_ADD, SumOfLanesM, addv)
+HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, SumOfLanesM, addv)
+
+HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MinOfLanesM, minv)
+HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MaxOfLanesM, maxv)
+// NaN if all are
+HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MinOfLanesM, minnmv)
+HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MaxOfLanesM, maxnmv)
+
+#undef HWY_SVE_REDUCE
+#undef HWY_SVE_REDUCE_ADD
+}  // namespace detail
+
+// detail::SumOfLanesM, detail::MinOfLanesM, and detail::MaxOfLanesM is more
+// efficient for N=4 I8/U8 reductions on SVE than the default implementations
+// of the N=4 I8/U8 ReduceSum/ReduceMin/ReduceMax operations in
+// generic_ops-inl.h
+#undef HWY_IF_REDUCE_D
+#define HWY_IF_REDUCE_D(D) hwy::EnableIf<HWY_MAX_LANES_D(D) != 1>* = nullptr
+
+#ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
+#undef HWY_NATIVE_REDUCE_SUM_4_UI8
+#else
+#define HWY_NATIVE_REDUCE_SUM_4_UI8
+#endif
+
+#ifdef HWY_NATIVE_REDUCE_MINMAX_4_UI8
+#undef HWY_NATIVE_REDUCE_MINMAX_4_UI8
+#else
+#define HWY_NATIVE_REDUCE_MINMAX_4_UI8
+#endif
+
+template <class D, HWY_IF_REDUCE_D(D)>
+HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
+  return detail::SumOfLanesM(detail::MakeMask(d), v);
+}
+
+template <class D, HWY_IF_REDUCE_D(D)>
+HWY_API TFromD<D> ReduceMin(D d, VFromD<D> v) {
+  return detail::MinOfLanesM(detail::MakeMask(d), v);
+}
+
+template <class D, HWY_IF_REDUCE_D(D)>
+HWY_API TFromD<D> ReduceMax(D d, VFromD<D> v) {
+  return detail::MaxOfLanesM(detail::MakeMask(d), v);
+}
+
+#ifdef HWY_NATIVE_MASKED_REDUCE_SCALAR
+#undef HWY_NATIVE_MASKED_REDUCE_SCALAR
+#else
+#define HWY_NATIVE_MASKED_REDUCE_SCALAR
+#endif
+
+template <class D, class M>
+HWY_API TFromD<D> MaskedReduceSum(D /*d*/, M m, VFromD<D> v) {
+  return detail::SumOfLanesM(m, v);
+}
+template <class D, class M>
+HWY_API TFromD<D> MaskedReduceMin(D /*d*/, M m, VFromD<D> v) {
+  return detail::MinOfLanesM(m, v);
+}
+template <class D, class M>
+HWY_API TFromD<D> MaskedReduceMax(D /*d*/, M m, VFromD<D> v) {
+  return detail::MaxOfLanesM(m, v);
+}
+
+// ------------------------------ SumOfLanes
+
+template <class D, HWY_IF_LANES_GT_D(D, 1)>
+HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
+  return Set(d, ReduceSum(d, v));
+}
+template <class D, HWY_IF_LANES_GT_D(D, 1)>
+HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
+  return Set(d, ReduceMin(d, v));
+}
+template <class D, HWY_IF_LANES_GT_D(D, 1)>
+HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
+  return Set(d, ReduceMax(d, v));
+}
+
+// ------------------------------ MaskedAdd etc. (IfThenElse)
+
+#ifdef HWY_NATIVE_ZERO_MASKED_ARITH
+#undef HWY_NATIVE_ZERO_MASKED_ARITH
+#else
+#define HWY_NATIVE_ZERO_MASKED_ARITH
+#endif
+
+HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV_Z, MaskedMax, max)
+HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV_Z, MaskedAdd, add)
+HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV_Z, MaskedSub, sub)
+HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV_Z, MaskedMul, mul)
+HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGMVV_Z, MaskedDiv, div)
+HWY_SVE_FOREACH_UI32(HWY_SVE_RETV_ARGMVV_Z, MaskedDiv, div)
+HWY_SVE_FOREACH_UI64(HWY_SVE_RETV_ARGMVV_Z, MaskedDiv, div)
+HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVVV_Z, MaskedMulAdd, mad)
+HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVVV_Z, MaskedNegMulAdd, msb)
+
+// I8/U8/I16/U16 MaskedDiv is implemented after I8/U8/I16/U16 Div
+
+#if HWY_SVE_HAVE_2
+HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGMVV_Z, MaskedSaturatedAdd, qadd)
+HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGMVV_Z, MaskedSaturatedSub, qsub)
+#else
+template <class V, class M>
+HWY_API V MaskedSaturatedAdd(M m, V a, V b) {
+  return IfThenElseZero(m, SaturatedAdd(a, b));
+}
+
+template <class V, class M>
+HWY_API V MaskedSaturatedSub(M m, V a, V b) {
+  return IfThenElseZero(m, SaturatedSub(a, b));
+}
+#endif
+
+template <class V, class M, typename D = DFromV<V>, HWY_IF_I16_D(D)>
+HWY_API V MaskedMulFixedPoint15(M m, V a, V b) {
+  return IfThenElseZero(m, MulFixedPoint15(a, b));
+}
+
+template <class D, class M, HWY_IF_UI32_D(D),
+          class V16 = VFromD<RepartitionToNarrow<D>>>
+HWY_API VFromD<D> MaskedWidenMulPairwiseAdd(D d32, M m, V16 a, V16 b) {
+  return IfThenElseZero(m, WidenMulPairwiseAdd(d32, a, b));
+}
+
+template <class DF, class M, HWY_IF_F32_D(DF), class VBF>
+HWY_API VFromD<DF> MaskedWidenMulPairwiseAdd(DF df, M m, VBF a, VBF b) {
+  return IfThenElseZero(m, WidenMulPairwiseAdd(df, a, b));
+}
+
 // ================================================== COMPARE

 // mask = f(vector, vector)
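Usage sketch for the masked reductions introduced above (illustrative function; assumes at least three lanes, which holds for any full SVE vector of i32):

```cpp
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

int32_t DemoMaskedReduce() {
  const hn::ScalableTag<int32_t> d;
  const auto v = hn::Iota(d, 1);        // {1, 2, 3, ...}
  const auto m = hn::FirstN(d, 3);
  return hn::MaskedReduceSum(d, m, v);  // 1 + 2 + 3 = 6
}
```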
@@ -1596,8 +1959,122 @@ HWY_API svbool_t TestBit(const V a, const V bit) {
   return detail::NeN(And(a, bit), 0);
 }

-// ------------------------------
-
+// ------------------------------ Min/Max (Lt, IfThenElse)
+
+HWY_SVE_FOREACH_U(HWY_SVE_RETV_ARGPVV, Min, min)
+HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Max, max)
+HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Max, maxnm)
+
+// Workaround for incorrect results with `svmin`.
+#if HWY_COMPILER_CLANG
+template <class V, HWY_IF_SIGNED_V(V)>
+HWY_API V Min(V a, V b) {
+  return IfThenElse(Lt(a, b), a, b);
+}
+template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
+HWY_API V Min(V a, V b) {
+  return IfThenElse(Or(Lt(a, b), Ne(b, b)), a, b);
+}
+#else
+HWY_SVE_FOREACH_I(HWY_SVE_RETV_ARGPVV, Min, min)
+HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Min, minnm)
+#endif
+
+namespace detail {
+HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, MinN, min_n)
+HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, MaxN, max_n)
+}  // namespace detail
+
+// ================================================== SWIZZLE
+
+// ------------------------------ ConcatEven/ConcatOdd
+
+// WARNING: the upper half of these needs fixing up (uzp1/uzp2 use the
+// full vector length, not rounded down to a power of two as we require).
+namespace detail {
+
+#define HWY_SVE_CONCAT_EVERY_SECOND(BASE, CHAR, BITS, HALF, NAME, OP) \
+  HWY_INLINE HWY_SVE_V(BASE, BITS) \
+      NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) { \
+    return sv##OP##_##CHAR##BITS(lo, hi); \
+  }
+HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenFull, uzp1)
+HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddFull, uzp2)
+#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
+HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenFull,
+                                   uzp1)
+HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddFull,
+                                   uzp2)
+#endif  // HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
+#if defined(__ARM_FEATURE_SVE_MATMUL_FP64)
+HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenBlocks, uzp1q)
+HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks, uzp2q)
+#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
+HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND,
+                                   ConcatEvenBlocks, uzp1q)
+HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks,
+                                   uzp2q)
+#endif  // HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
+#endif  // defined(__ARM_FEATURE_SVE_MATMUL_FP64)
+#undef HWY_SVE_CONCAT_EVERY_SECOND
+
+// Used to slide up / shift whole register left; mask indicates which range
+// to take from lo, and the rest is filled from hi starting at its lowest.
+#define HWY_SVE_SPLICE(BASE, CHAR, BITS, HALF, NAME, OP) \
+  HWY_API HWY_SVE_V(BASE, BITS) NAME( \
+      HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo, svbool_t mask) { \
+    return sv##OP##_##CHAR##BITS(mask, lo, hi); \
+  }
+HWY_SVE_FOREACH(HWY_SVE_SPLICE, Splice, splice)
+#if HWY_SVE_HAVE_BF16_FEATURE
+HWY_SVE_FOREACH_BF16(HWY_SVE_SPLICE, Splice, splice)
+#else
+template <class V, HWY_IF_BF16_D(DFromV<V>)>
+HWY_INLINE V Splice(V hi, V lo, svbool_t mask) {
+  const DFromV<V> d;
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, Splice(BitCast(du, hi), BitCast(du, lo), mask));
+}
+#endif  // HWY_SVE_HAVE_BF16_FEATURE
+#undef HWY_SVE_SPLICE
+
+}  // namespace detail
+
+template <class D>
+HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
+#if HWY_SVE_IS_POW2
+  if (detail::IsFull(d)) return detail::ConcatOddFull(hi, lo);
+#endif
+  const VFromD<D> hi_odd = detail::ConcatOddFull(hi, hi);
+  const VFromD<D> lo_odd = detail::ConcatOddFull(lo, lo);
+  return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2));
+}
+
+template <class D>
+HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
+#if HWY_SVE_IS_POW2
+  if (detail::IsFull(d)) return detail::ConcatEvenFull(hi, lo);
+#endif
+  const VFromD<D> hi_odd = detail::ConcatEvenFull(hi, hi);
+  const VFromD<D> lo_odd = detail::ConcatEvenFull(lo, lo);
+  return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2));
+}
+
+HWY_API svuint8_t U8FromU32(const svuint32_t v) {
+  const DFromV<svuint32_t> du32;
+  const RepartitionToNarrow<decltype(du32)> du16;
+  const RepartitionToNarrow<decltype(du16)> du8;
+
+  const svuint16_t cast16 = BitCast(du16, v);
+  const svuint16_t x2 = svuzp1_u16(cast16, cast16);
+  const svuint8_t cast8 = BitCast(du8, x2);
+  return svuzp1_u8(cast8, cast8);
+}
+
+// ================================================== MASK
+
+// ------------------------------ MaskFromVec (Ne)
+template <class V>
 HWY_API svbool_t MaskFromVec(const V v) {
   using T = TFromV<V>;
   return detail::NeN(v, ConvertScalarTo<T>(0));
@@ -1612,6 +2089,87 @@ HWY_API VFromD<D> VecFromMask(const D d, svbool_t mask) {
   return BitCast(d, IfThenElseZero(mask, Set(di, -1)));
 }

+// ------------------------------ BitsFromMask (AndN, Shl, ReduceSum, GetLane
+// ConcatEvenFull, U8FromU32)
+
+namespace detail {
+
+// For each mask lane (governing lane type T), store 1 or 0 in BYTE lanes.
+template <class D, HWY_IF_T_SIZE_D(D, 1)>
+HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
+  return svdup_n_u8_z(m, 1);
+}
+template <class D, HWY_IF_T_SIZE_D(D, 2)>
+HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
+  const ScalableTag<uint8_t> d8;
+  const svuint8_t b16 = BitCast(d8, svdup_n_u16_z(m, 1));
+  return detail::ConcatEvenFull(b16, b16);  // lower half
+}
+template <class D, HWY_IF_T_SIZE_D(D, 4)>
+HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
+  return U8FromU32(svdup_n_u32_z(m, 1));
+}
+template <class D, HWY_IF_T_SIZE_D(D, 8)>
+HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
+  const ScalableTag<uint32_t> d32;
+  const svuint32_t b64 = BitCast(d32, svdup_n_u64_z(m, 1));
+  return U8FromU32(detail::ConcatEvenFull(b64, b64));  // lower half
+}
+
+// Compacts groups of 8 u8 into 8 contiguous bits in a 64-bit lane.
+HWY_INLINE svuint64_t BitsFromBool(svuint8_t x) {
+  const ScalableTag<uint8_t> d8;
+  const ScalableTag<uint16_t> d16;
+  const ScalableTag<uint32_t> d32;
+  const ScalableTag<uint64_t> d64;
+  // TODO(janwas): could use SVE2 BDEP, but it's optional.
+  x = Or(x, BitCast(d8, ShiftRight<7>(BitCast(d16, x))));
+  x = Or(x, BitCast(d8, ShiftRight<14>(BitCast(d32, x))));
+  x = Or(x, BitCast(d8, ShiftRight<28>(BitCast(d64, x))));
+  return BitCast(d64, x);
+}
+
+}  // namespace detail
+
+// BitsFromMask is required if `HWY_MAX_BYTES <= 64`, which is true for the
+// fixed-size SVE targets.
+#if HWY_TARGET == HWY_SVE2_128 || HWY_TARGET == HWY_SVE_256
+template <class D>
+HWY_API uint64_t BitsFromMask(D d, svbool_t mask) {
+  const Repartition<uint64_t, D> du64;
+  svuint64_t bits_in_u64 = detail::BitsFromBool(detail::BoolFromMask<D>(mask));
+
+  constexpr size_t N = MaxLanes(d);
+  static_assert(N < 64, "SVE2_128 and SVE_256 are only 128 or 256 bits");
+  const uint64_t valid = (1ull << N) - 1;
+  HWY_IF_CONSTEXPR(N <= 8) {
+    // Upper bits are undefined even if N == 8, hence mask.
+    return GetLane(bits_in_u64) & valid;
+  }
+
+  // Up to 8 of the least-significant bits of each u64 lane are valid.
+  bits_in_u64 = detail::AndN(bits_in_u64, 0xFF);
+
+  // 128-bit vector: only two u64, so avoid ReduceSum.
+  HWY_IF_CONSTEXPR(HWY_TARGET == HWY_SVE2_128) {
+    alignas(16) uint64_t lanes[2];
+    Store(bits_in_u64, du64, lanes);
+    // lanes[0] is always valid because we know N > 8, but lanes[1] might
+    // not be - we may mask it out below.
+    const uint64_t result = lanes[0] + (lanes[1] << 8);
+    // 8-bit lanes, no further masking
+    HWY_IF_CONSTEXPR(N == 16) return result;
+    return result & valid;
+  }
+
+  // Shift the 8-bit groups into place in each u64 lane.
+  alignas(32) uint64_t kShifts[4] = {0 * 8, 1 * 8, 2 * 8, 3 * 8};
+  bits_in_u64 = Shl(bits_in_u64, Load(du64, kShifts));
+  return ReduceSum(du64, bits_in_u64) & valid;
+}
+
+#endif  // HWY_TARGET == HWY_SVE2_128 || HWY_TARGET == HWY_SVE_256
+
 // ------------------------------ IsNegative (Lt)
 #ifdef HWY_NATIVE_IS_NEGATIVE
 #undef HWY_NATIVE_IS_NEGATIVE
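A sketch of `BitsFromMask`, which the `#if` above only compiles for the fixed-size targets (`HWY_SVE_256`, `HWY_SVE2_128`): lane i of the mask becomes bit i of the result.

```cpp
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

#if HWY_TARGET == HWY_SVE2_128 || HWY_TARGET == HWY_SVE_256
uint64_t DemoBitsFromMask() {
  const hn::ScalableTag<uint8_t> d;
  const auto m = hn::FirstN(d, 3);
  return hn::BitsFromMask(d, m);  // 0b111: lane i -> bit i
}
#endif
```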
@@ -1736,6 +2294,56 @@ HWY_API svbool_t IsFinite(const V v) {
   return RebindMask(d, detail::LtN(exp, hwy::MaxExponentField<T>()));
 }

+// ------------------------------ MulByPow2/MulByFloorPow2
+
+#define HWY_SVE_MUL_BY_POW2(BASE, CHAR, BITS, HALF, NAME, OP) \
+  HWY_API HWY_SVE_V(BASE, BITS) \
+      NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_V(int, BITS) exp) { \
+    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, exp); \
+  }
+
+HWY_SVE_FOREACH_F(HWY_SVE_MUL_BY_POW2, MulByPow2, scale)
+
+#undef HWY_SVE_MUL_BY_POW2
+
+// ------------------------------ MaskedEq etc.
+#ifdef HWY_NATIVE_MASKED_COMP
+#undef HWY_NATIVE_MASKED_COMP
+#else
+#define HWY_NATIVE_MASKED_COMP
+#endif
+
+// mask = f(mask, vector, vector)
+#define HWY_SVE_COMPARE_Z(BASE, CHAR, BITS, HALF, NAME, OP) \
+  HWY_API svbool_t NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a, \
+                        HWY_SVE_V(BASE, BITS) b) { \
+    return sv##OP##_##CHAR##BITS(m, a, b); \
+  }
+
+HWY_SVE_FOREACH(HWY_SVE_COMPARE_Z, MaskedEq, cmpeq)
+HWY_SVE_FOREACH(HWY_SVE_COMPARE_Z, MaskedNe, cmpne)
+HWY_SVE_FOREACH(HWY_SVE_COMPARE_Z, MaskedLt, cmplt)
+HWY_SVE_FOREACH(HWY_SVE_COMPARE_Z, MaskedLe, cmple)
+
+#undef HWY_SVE_COMPARE_Z
+
+template <class V, class M, class D = DFromV<V>>
+HWY_API MFromD<D> MaskedGt(M m, V a, V b) {
+  // Swap args to reverse comparison
+  return MaskedLt(m, b, a);
+}
+
+template <class V, class M, class D = DFromV<V>>
+HWY_API MFromD<D> MaskedGe(M m, V a, V b) {
+  // Swap args to reverse comparison
+  return MaskedLe(m, b, a);
+}
+
+template <class V, class M, class D = DFromV<V>>
+HWY_API MFromD<D> MaskedIsNaN(const M m, const V v) {
+  return MaskedNe(m, v, v);
+}
+
 // ================================================== MEMORY

 // ------------------------------ LoadU/MaskedLoad/LoadDup128/StoreU/Stream
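Sketch of the masked comparisons: inactive lanes compare as false, so `CountTrue` only counts active matches (illustrative function; `MaskedIsNaN` relies on `NaN != NaN`, as the `MaskedNe(m, v, v)` definition above shows):

```cpp
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

size_t DemoMaskedEq() {
  const hn::ScalableTag<float> d;
  const auto a = hn::Set(d, 1.0f);
  const auto m = hn::FirstN(d, 2);
  const auto eq = hn::MaskedEq(m, a, a);  // true only in the first two lanes
  return hn::CountTrue(d, eq);            // 2
}
```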
@@ -1855,6 +2463,38 @@ HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {

 #endif  // HWY_TARGET != HWY_SVE2_128

+// Truncate to smaller size and store
+#ifdef HWY_NATIVE_STORE_TRUNCATED
+#undef HWY_NATIVE_STORE_TRUNCATED
+#else
+#define HWY_NATIVE_STORE_TRUNCATED
+#endif
+
+#define HWY_SVE_STORE_TRUNCATED(BASE, CHAR, BITS, HALF, NAME, OP, TO_BITS) \
+  template <size_t N, int kPow2> \
+  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, \
+                    const HWY_SVE_D(BASE, BITS, N, kPow2) d, \
+                    HWY_SVE_T(BASE, TO_BITS) * HWY_RESTRICT p) { \
+    sv##OP##_##CHAR##BITS(detail::MakeMask(d), detail::NativeLanePointer(p), \
+                          v); \
+  }
+
+#define HWY_SVE_STORE_TRUNCATED_BYTE(BASE, CHAR, BITS, HALF, NAME, OP) \
+  HWY_SVE_STORE_TRUNCATED(BASE, CHAR, BITS, HALF, NAME, OP, 8)
+#define HWY_SVE_STORE_TRUNCATED_HALF(BASE, CHAR, BITS, HALF, NAME, OP) \
+  HWY_SVE_STORE_TRUNCATED(BASE, CHAR, BITS, HALF, NAME, OP, 16)
+#define HWY_SVE_STORE_TRUNCATED_WORD(BASE, CHAR, BITS, HALF, NAME, OP) \
+  HWY_SVE_STORE_TRUNCATED(BASE, CHAR, BITS, HALF, NAME, OP, 32)
+
+HWY_SVE_FOREACH_UI16(HWY_SVE_STORE_TRUNCATED_BYTE, TruncateStore, st1b)
+HWY_SVE_FOREACH_UI32(HWY_SVE_STORE_TRUNCATED_BYTE, TruncateStore, st1b)
+HWY_SVE_FOREACH_UI64(HWY_SVE_STORE_TRUNCATED_BYTE, TruncateStore, st1b)
+HWY_SVE_FOREACH_UI32(HWY_SVE_STORE_TRUNCATED_HALF, TruncateStore, st1h)
+HWY_SVE_FOREACH_UI64(HWY_SVE_STORE_TRUNCATED_HALF, TruncateStore, st1h)
+HWY_SVE_FOREACH_UI64(HWY_SVE_STORE_TRUNCATED_WORD, TruncateStore, st1w)
+
+#undef HWY_SVE_STORE_TRUNCATED
+
 // ------------------------------ Load/Store

 // SVE only requires lane alignment, not natural alignment of the entire
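The `st1b`/`st1h`/`st1w` wrappers above store only the low 8/16/32 bits of each lane, avoiding a separate `TruncateTo` + `StoreU`. A minimal sketch (the buffer allocation is illustrative):

```cpp
#include "hwy/aligned_allocator.h"
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

void DemoTruncateStore() {
  const hn::ScalableTag<uint32_t> d;
  const auto v = hn::Set(d, 0x12345678u);
  auto out = hwy::AllocateAligned<uint8_t>(hn::Lanes(d));
  hn::TruncateStore(v, d, out.get());  // each output byte is 0x78
}
```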
@@ -1985,6 +2625,7 @@ HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT p,
     v1 = svget2(tuple, 1); \
   }
 HWY_SVE_FOREACH(HWY_SVE_LOAD2, LoadInterleaved2, ld2)
+HWY_SVE_FOREACH_BF16(HWY_SVE_LOAD2, LoadInterleaved2, ld2)

 #undef HWY_SVE_LOAD2

@@ -2003,6 +2644,7 @@ HWY_SVE_FOREACH(HWY_SVE_LOAD2, LoadInterleaved2, ld2)
     v2 = svget3(tuple, 2); \
   }
 HWY_SVE_FOREACH(HWY_SVE_LOAD3, LoadInterleaved3, ld3)
+HWY_SVE_FOREACH_BF16(HWY_SVE_LOAD3, LoadInterleaved3, ld3)

 #undef HWY_SVE_LOAD3

@@ -2022,6 +2664,7 @@ HWY_SVE_FOREACH(HWY_SVE_LOAD3, LoadInterleaved3, ld3)
     v3 = svget4(tuple, 3); \
   }
 HWY_SVE_FOREACH(HWY_SVE_LOAD4, LoadInterleaved4, ld4)
+HWY_SVE_FOREACH_BF16(HWY_SVE_LOAD4, LoadInterleaved4, ld4)

 #undef HWY_SVE_LOAD4

@@ -2037,6 +2680,7 @@ HWY_SVE_FOREACH(HWY_SVE_LOAD4, LoadInterleaved4, ld4)
               Create2(d, v0, v1)); \
   }
 HWY_SVE_FOREACH(HWY_SVE_STORE2, StoreInterleaved2, st2)
+HWY_SVE_FOREACH_BF16(HWY_SVE_STORE2, StoreInterleaved2, st2)

 #undef HWY_SVE_STORE2

@@ -2053,6 +2697,7 @@ HWY_SVE_FOREACH(HWY_SVE_STORE2, StoreInterleaved2, st2)
               Create3(d, v0, v1, v2)); \
   }
 HWY_SVE_FOREACH(HWY_SVE_STORE3, StoreInterleaved3, st3)
+HWY_SVE_FOREACH_BF16(HWY_SVE_STORE3, StoreInterleaved3, st3)

 #undef HWY_SVE_STORE3

@@ -2069,9 +2714,13 @@ HWY_SVE_FOREACH(HWY_SVE_STORE3, StoreInterleaved3, st3)
               Create4(d, v0, v1, v2, v3)); \
   }
 HWY_SVE_FOREACH(HWY_SVE_STORE4, StoreInterleaved4, st4)
+HWY_SVE_FOREACH_BF16(HWY_SVE_STORE4, StoreInterleaved4, st4)

 #undef HWY_SVE_STORE4

+// Fall back on generic Load/StoreInterleaved[234] for any emulated types.
+// Requires HWY_GENERIC_IF_EMULATED_D mirrors HWY_SVE_IF_EMULATED_D.
+
 // ================================================== CONVERT

 // ------------------------------ PromoteTo
@@ -2312,17 +2961,6 @@ HWY_API svuint8_t DemoteTo(Simd<uint8_t, N, kPow2> dn, const svint32_t v) {
   return svuzp1_u8(x2, x2);
 }

-HWY_API svuint8_t U8FromU32(const svuint32_t v) {
-  const DFromV<svuint32_t> du32;
-  const RepartitionToNarrow<decltype(du32)> du16;
-  const RepartitionToNarrow<decltype(du16)> du8;
-
-  const svuint16_t cast16 = BitCast(du16, v);
-  const svuint16_t x2 = svuzp1_u16(cast16, cast16);
-  const svuint8_t cast8 = BitCast(du8, x2);
-  return svuzp1_u8(cast8, cast8);
-}
-
 template <size_t N, int kPow2>
 HWY_API svuint8_t DemoteTo(Simd<uint8_t, N, kPow2> dn, const svuint16_t v) {
 #if HWY_SVE_HAVE_2
@@ -2575,79 +3213,6 @@ HWY_API VFromD<D> DemoteTo(D dn, V v) {
   return BitCast(dn, TruncateTo(dn_u, detail::SaturateU<TFromD<D>>(v)));
 }

-// ------------------------------ ConcatEven/ConcatOdd
-
-// WARNING: the upper half of these needs fixing up (uzp1/uzp2 use the
-// full vector length, not rounded down to a power of two as we require).
-namespace detail {
-
-#define HWY_SVE_CONCAT_EVERY_SECOND(BASE, CHAR, BITS, HALF, NAME, OP) \
-  HWY_INLINE HWY_SVE_V(BASE, BITS) \
-      NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) { \
-    return sv##OP##_##CHAR##BITS(lo, hi); \
-  }
-HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenFull, uzp1)
-HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddFull, uzp2)
-#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
-HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenFull,
-                                   uzp1)
-HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddFull,
-                                   uzp2)
-#endif  // HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
-#if defined(__ARM_FEATURE_SVE_MATMUL_FP64)
-HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenBlocks, uzp1q)
-HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks, uzp2q)
-#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
-HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND,
-                                   ConcatEvenBlocks, uzp1q)
-HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks,
-                                   uzp2q)
-#endif  // HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
-#endif  // defined(__ARM_FEATURE_SVE_MATMUL_FP64)
-#undef HWY_SVE_CONCAT_EVERY_SECOND
-
-// Used to slide up / shift whole register left; mask indicates which range
-// to take from lo, and the rest is filled from hi starting at its lowest.
-#define HWY_SVE_SPLICE(BASE, CHAR, BITS, HALF, NAME, OP) \
-  HWY_API HWY_SVE_V(BASE, BITS) NAME( \
-      HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo, svbool_t mask) { \
-    return sv##OP##_##CHAR##BITS(mask, lo, hi); \
-  }
-HWY_SVE_FOREACH(HWY_SVE_SPLICE, Splice, splice)
-#if HWY_SVE_HAVE_BF16_FEATURE
-HWY_SVE_FOREACH_BF16(HWY_SVE_SPLICE, Splice, splice)
-#else
-template <class V, HWY_IF_BF16_D(DFromV<V>)>
-HWY_INLINE V Splice(V hi, V lo, svbool_t mask) {
-  const DFromV<V> d;
-  const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, Splice(BitCast(du, hi), BitCast(du, lo), mask));
-}
-#endif  // HWY_SVE_HAVE_BF16_FEATURE
-#undef HWY_SVE_SPLICE
-
-}  // namespace detail
-
-template <class D>
-HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
-#if HWY_SVE_IS_POW2
-  if (detail::IsFull(d)) return detail::ConcatOddFull(hi, lo);
-#endif
-  const VFromD<D> hi_odd = detail::ConcatOddFull(hi, hi);
-  const VFromD<D> lo_odd = detail::ConcatOddFull(lo, lo);
-  return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2));
-}
-
-template <class D>
-HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
-#if HWY_SVE_IS_POW2
-  if (detail::IsFull(d)) return detail::ConcatEvenFull(hi, lo);
-#endif
-  const VFromD<D> hi_odd = detail::ConcatEvenFull(hi, hi);
-  const VFromD<D> lo_odd = detail::ConcatEvenFull(lo, lo);
-  return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2));
-}
-
 // ------------------------------ PromoteEvenTo/PromoteOddTo

 // Signed to signed PromoteEvenTo: 1 instruction instead of 2 in generic-inl.h.
@@ -2793,6 +3358,41 @@ HWY_API svfloat32_t DemoteTo(Simd<float, N, kPow2> d, const svuint64_t v) {
 HWY_SVE_FOREACH_F(HWY_SVE_CONVERT, ConvertTo, cvt)
 #undef HWY_SVE_CONVERT

+// ------------------------------ MaskedConvertTo F
+
+#define HWY_SVE_MASKED_CONVERT_TO_OR_ZERO(BASE, CHAR, BITS, HALF, NAME, OP) \
+  /* Float from signed */ \
+  template <size_t N, int kPow2> \
+  HWY_API HWY_SVE_V(BASE, BITS) \
+      NAME(svbool_t m, HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
+           HWY_SVE_V(int, BITS) v) { \
+    return sv##OP##_##CHAR##BITS##_s##BITS##_z(m, v); \
+  } \
+  /* Float from unsigned */ \
+  template <size_t N, int kPow2> \
+  HWY_API HWY_SVE_V(BASE, BITS) \
+      NAME(svbool_t m, HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
+           HWY_SVE_V(uint, BITS) v) { \
+    return sv##OP##_##CHAR##BITS##_u##BITS##_z(m, v); \
+  } \
+  /* Signed from float, rounding toward zero */ \
+  template <size_t N, int kPow2> \
+  HWY_API HWY_SVE_V(int, BITS) \
+      NAME(svbool_t m, HWY_SVE_D(int, BITS, N, kPow2) /* d */, \
+           HWY_SVE_V(BASE, BITS) v) { \
+    return sv##OP##_s##BITS##_##CHAR##BITS##_z(m, v); \
+  } \
+  /* Unsigned from float, rounding toward zero */ \
+  template <size_t N, int kPow2> \
+  HWY_API HWY_SVE_V(uint, BITS) \
+      NAME(svbool_t m, HWY_SVE_D(uint, BITS, N, kPow2) /* d */, \
+           HWY_SVE_V(BASE, BITS) v) { \
+    return sv##OP##_u##BITS##_##CHAR##BITS##_z(m, v); \
+  }
+
+HWY_SVE_FOREACH_F(HWY_SVE_MASKED_CONVERT_TO_OR_ZERO, MaskedConvertTo, cvt)
+#undef HWY_SVE_MASKED_CONVERT_TO_OR_ZERO
+
 // ------------------------------ NearestInt (Round, ConvertTo)
 template <class VF, class DI = RebindToSigned<DFromV<VF>>>
 HWY_API VFromD<DI> NearestInt(VF v) {
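Sketch of `MaskedConvertTo` in its zeroing form (the `_z` suffix in the macro: inactive lanes of the result are zero; float-from-int direction shown, names illustrative):

```cpp
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

float DemoMaskedConvert() {
  const hn::ScalableTag<float> df;
  const hn::RebindToSigned<decltype(df)> di;
  const auto vi = hn::Iota(di, 1);                 // {1, 2, 3, ...}
  const auto m = hn::FirstN(df, 2);
  const auto vf = hn::MaskedConvertTo(m, df, vi);  // {1.0f, 2.0f, 0, 0, ...}
  return hn::GetLane(vf);                          // 1.0f
}
```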
@@ -2800,7 +3400,14 @@ HWY_API VFromD<DI> NearestInt(VF v) {
   return ConvertTo(DI(), Round(v));
 }

-
+template <class DI32, HWY_IF_I32_D(DI32)>
+HWY_API VFromD<DI32> DemoteToNearestInt(DI32 di32,
+                                        VFromD<Rebind<double, DI32>> v) {
+  // No single instruction, round then demote.
+  return DemoteTo(di32, Round(v));
+}
+
+// ------------------------------ Iota (AddN, ConvertTo)

 #define HWY_SVE_IOTA(BASE, CHAR, BITS, HALF, NAME, OP) \
   template <size_t N, int kPow2, typename T2> \
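`DemoteToNearestInt` rounds f64 to the nearest integer (ties to even, per `Round`) and then narrows to i32; a sketch:

```cpp
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

#if HWY_HAVE_FLOAT64
int32_t DemoDemoteToNearestInt() {
  const hn::ScalableTag<double> dd;
  const hn::Rebind<int32_t, decltype(dd)> di32;
  const auto v = hn::Set(dd, 2.5);
  return hn::GetLane(hn::DemoteToNearestInt(di32, v));  // ties-to-even: 2
}
#endif
```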
@@ -2813,13 +3420,64 @@ HWY_API VFromD<DI> NearestInt(VF v) {
 HWY_SVE_FOREACH_UI(HWY_SVE_IOTA, Iota, index)
 #undef HWY_SVE_IOTA

-template <class D, typename T2,
+template <class D, typename T = TFromD<D>, typename T2, HWY_IF_FLOAT(T)>
 HWY_API VFromD<D> Iota(const D d, T2 first) {
   const RebindToSigned<D> di;
-
-
+  const T first_f = ConvertScalarTo<T>(first);
+  const VFromD<D> iota_f = ConvertTo(d, Iota(di, 0));
+  return detail::AddN(iota_f, first_f);
+}
+
+// ================================================== LANE ACCESS
+
+// ------------------------------ ExtractLane (GetLaneM, FirstN)
+template <class V>
+HWY_API TFromV<V> ExtractLane(V v, size_t i) {
+  return detail::GetLaneM(v, FirstN(DFromV<V>(), i));
+}
+
+// ------------------------------ InsertLane (IfThenElse, EqN)
+template <class V, typename T>
+HWY_API V InsertLane(const V v, size_t i, T t) {
+  static_assert(sizeof(TFromV<V>) == sizeof(T), "Lane size mismatch");
+  const DFromV<V> d;
+  const RebindToSigned<decltype(d)> di;
+  using TI = TFromD<decltype(di)>;
+  const svbool_t is_i = detail::EqN(Iota(di, 0), static_cast<TI>(i));
+  // The actual type may be int16_t for special floats; copy, not cast.
+  TFromV<V> t_bits;
+  hwy::CopySameSize(&t, &t_bits);
+  return IfThenElse(RebindMask(d, is_i), Set(d, t_bits), v);
 }

+// ------------------------------ GetExponent
+
+#if HWY_SVE_HAVE_2 || HWY_IDE
+#ifdef HWY_NATIVE_GET_EXPONENT
+#undef HWY_NATIVE_GET_EXPONENT
+#else
+#define HWY_NATIVE_GET_EXPONENT
+#endif
+
+namespace detail {
+#define HWY_SVE_GET_EXP(BASE, CHAR, BITS, HALF, NAME, OP)     \
+  HWY_API HWY_SVE_V(int, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
+    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v);  \
+  }
+HWY_SVE_FOREACH_F(HWY_SVE_GET_EXP, GetExponent, logb)
+#undef HWY_SVE_GET_EXP
+}  // namespace detail
+
+template <class V, HWY_IF_FLOAT_V(V)>
+HWY_API V GetExponent(V v) {
+  const DFromV<V> d;
+  const RebindToSigned<decltype(d)> di;
+  const VFromD<decltype(di)> exponent_int = detail::GetExponent(v);
+  // convert integer to original type
+  return ConvertTo(d, exponent_int);
+}
+#endif  // HWY_SVE_HAVE_2
+
 // ------------------------------ InterleaveLower

 template <class D, class V>
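Note: a brief usage sketch of the lane-access and exponent ops added above, illustration only under the assumption that they behave per their definitions (GetExponent wraps the SVE `logb` intrinsic, so a lane holding 256.0f yields 8.0f); the function name Demo is hypothetical.

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    float Demo() {
      const hn::ScalableTag<float> df;
      auto v = hn::Iota(df, 1.0f);        // [1, 2, 3, 4, ...]
      v = hn::InsertLane(v, 2, 256.0f);   // [1, 2, 256, 4, ...]
      const auto e = hn::GetExponent(v);  // lane 2 holds logb(256) == 8.0f
      return hn::ExtractLane(e, 2);       // 8.0f
    }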
@@ -2945,10 +3603,10 @@ HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xDD> /*idx_3210_tag*/,

 namespace detail {

-#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
+#if (HWY_TARGET == HWY_SVE_256 && HWY_HAVE_CONSTEXPR_LANES) || HWY_IDE
 template <class D, HWY_IF_T_SIZE_D(D, 1)>
 svbool_t MaskLowerHalf(D d) {
-  switch (
+  switch (MaxLanes(d)) {
     case 32:
       return svptrue_pat_b8(SV_VL16);
     case 16:
@@ -2963,7 +3621,7 @@ svbool_t MaskLowerHalf(D d) {
 }
 template <class D, HWY_IF_T_SIZE_D(D, 2)>
 svbool_t MaskLowerHalf(D d) {
-  switch (
+  switch (MaxLanes(d)) {
     case 16:
       return svptrue_pat_b16(SV_VL8);
     case 8:
@@ -2976,7 +3634,7 @@ svbool_t MaskLowerHalf(D d) {
 }
 template <class D, HWY_IF_T_SIZE_D(D, 4)>
 svbool_t MaskLowerHalf(D d) {
-  switch (
+  switch (MaxLanes(d)) {
     case 8:
       return svptrue_pat_b32(SV_VL4);
     case 4:
@@ -2987,7 +3645,7 @@ svbool_t MaskLowerHalf(D d) {
 }
 template <class D, HWY_IF_T_SIZE_D(D, 8)>
 svbool_t MaskLowerHalf(D d) {
-  switch (
+  switch (MaxLanes(d)) {
     case 4:
       return svptrue_pat_b64(SV_VL2);
     default:
@@ -2995,10 +3653,10 @@ svbool_t MaskLowerHalf(D d) {
   }
 }
 #endif
-#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE
+#if (HWY_TARGET == HWY_SVE2_128 && HWY_HAVE_CONSTEXPR_LANES) || HWY_IDE
 template <class D, HWY_IF_T_SIZE_D(D, 1)>
 svbool_t MaskLowerHalf(D d) {
-  switch (
+  switch (MaxLanes(d)) {
     case 16:
       return svptrue_pat_b8(SV_VL8);
     case 8:
@@ -3013,7 +3671,7 @@ svbool_t MaskLowerHalf(D d) {
 }
 template <class D, HWY_IF_T_SIZE_D(D, 2)>
 svbool_t MaskLowerHalf(D d) {
-  switch (
+  switch (MaxLanes(d)) {
     case 8:
       return svptrue_pat_b16(SV_VL4);
     case 4:
@@ -3026,14 +3684,15 @@ svbool_t MaskLowerHalf(D d) {
 }
 template <class D, HWY_IF_T_SIZE_D(D, 4)>
 svbool_t MaskLowerHalf(D d) {
-  return svptrue_pat_b32(
+  return svptrue_pat_b32(MaxLanes(d) == 4 ? SV_VL2 : SV_VL1);
 }
 template <class D, HWY_IF_T_SIZE_D(D, 8)>
 svbool_t MaskLowerHalf(D /*d*/) {
   return svptrue_pat_b64(SV_VL1);
 }
 #endif  // HWY_TARGET == HWY_SVE2_128
-#if HWY_TARGET != HWY_SVE_256 && HWY_TARGET != HWY_SVE2_128
+#if (HWY_TARGET != HWY_SVE_256 && HWY_TARGET != HWY_SVE2_128) || \
+    !HWY_HAVE_CONSTEXPR_LANES
 template <class D>
 svbool_t MaskLowerHalf(D d) {
   return FirstN(d, Lanes(d) / 2);
@@ -3089,7 +3748,7 @@ HWY_API V ConcatLowerLower(const D d, const V hi, const V lo) {
 // ------------------------------ ConcatLowerUpper
 template <class D, class V>
 HWY_API V ConcatLowerUpper(const D d, const V hi, const V lo) {
-#if
+#if HWY_HAVE_CONSTEXPR_LANES
   if (detail::IsFull(d)) {
     return detail::Ext<Lanes(d) / 2>(hi, lo);
   }
@@ -3135,150 +3794,26 @@ HWY_API V LowerHalf(D2 /* tag */, const V v) {
 }

 template <class V>
-HWY_API V LowerHalf(const V v) {
-  return v;
-}
-
-template <class DH, class V>
-HWY_API V UpperHalf(const DH dh, const V v) {
-  const Twice<decltype(dh)> d;
-  // Cast so that we support bfloat16_t.
-  const RebindToUnsigned<decltype(d)> du;
-  const VFromD<decltype(du)> vu = BitCast(du, v);
-#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128  // constexpr Lanes
-  return BitCast(d, detail::Ext<Lanes(dh)>(vu, vu));
-#else
-  const MFromD<decltype(du)> mask = detail::MaskUpperHalf(du);
-  return BitCast(d, detail::Splice(vu, vu, mask));
-#endif
-}
-
-// ================================================== REDUCE
-
-#ifdef HWY_NATIVE_REDUCE_SCALAR
-#undef HWY_NATIVE_REDUCE_SCALAR
-#else
-#define HWY_NATIVE_REDUCE_SCALAR
-#endif
-
-// These return T, suitable for ReduceSum.
-namespace detail {
-#define HWY_SVE_REDUCE_ADD(BASE, CHAR, BITS, HALF, NAME, OP) \
-  HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) { \
-    /* The intrinsic returns [u]int64_t; truncate to T so we can broadcast. */ \
-    using T = HWY_SVE_T(BASE, BITS); \
-    using TU = MakeUnsigned<T>; \
-    constexpr uint64_t kMask = LimitsMax<TU>(); \
-    return static_cast<T>(static_cast<TU>( \
-        static_cast<uint64_t>(sv##OP##_##CHAR##BITS(pg, v)) & kMask)); \
-  }
-
-#define HWY_SVE_REDUCE(BASE, CHAR, BITS, HALF, NAME, OP) \
-  HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) { \
-    return sv##OP##_##CHAR##BITS(pg, v); \
-  }
-
-HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE_ADD, SumOfLanesM, addv)
-HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, SumOfLanesM, addv)
-
-HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MinOfLanesM, minv)
-HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MaxOfLanesM, maxv)
-// NaN if all are
-HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MinOfLanesM, minnmv)
-HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MaxOfLanesM, maxnmv)
-
-#undef HWY_SVE_REDUCE
-#undef HWY_SVE_REDUCE_ADD
-}  // namespace detail
-
-// detail::SumOfLanesM, detail::MinOfLanesM, and detail::MaxOfLanesM is more
-// efficient for N=4 I8/U8 reductions on SVE than the default implementations
-// of the N=4 I8/U8 ReduceSum/ReduceMin/ReduceMax operations in
-// generic_ops-inl.h
-#undef HWY_IF_REDUCE_D
-#define HWY_IF_REDUCE_D(D) hwy::EnableIf<HWY_MAX_LANES_D(D) != 1>* = nullptr
-
-#ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
-#undef HWY_NATIVE_REDUCE_SUM_4_UI8
-#else
-#define HWY_NATIVE_REDUCE_SUM_4_UI8
-#endif
-
-#ifdef HWY_NATIVE_REDUCE_MINMAX_4_UI8
-#undef HWY_NATIVE_REDUCE_MINMAX_4_UI8
-#else
-#define HWY_NATIVE_REDUCE_MINMAX_4_UI8
-#endif
-
-template <class D, HWY_IF_REDUCE_D(D)>
-HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
-  return detail::SumOfLanesM(detail::MakeMask(d), v);
-}
-
-template <class D, HWY_IF_REDUCE_D(D)>
-HWY_API TFromD<D> ReduceMin(D d, VFromD<D> v) {
-  return detail::MinOfLanesM(detail::MakeMask(d), v);
-}
-
-template <class D, HWY_IF_REDUCE_D(D)>
-HWY_API TFromD<D> ReduceMax(D d, VFromD<D> v) {
-  return detail::MaxOfLanesM(detail::MakeMask(d), v);
-}
-
-// ------------------------------ SumOfLanes
-
-template <class D, HWY_IF_LANES_GT_D(D, 1)>
-HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
-  return Set(d, ReduceSum(d, v));
-}
-template <class D, HWY_IF_LANES_GT_D(D, 1)>
-HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
-  return Set(d, ReduceMin(d, v));
-}
-template <class D, HWY_IF_LANES_GT_D(D, 1)>
-HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
-  return Set(d, ReduceMax(d, v));
-}
-
-// ================================================== SWIZZLE
-
-// ------------------------------ GetLane
-
-namespace detail {
-#define HWY_SVE_GET_LANE(BASE, CHAR, BITS, HALF, NAME, OP) \
-  HWY_INLINE HWY_SVE_T(BASE, BITS) \
-  NAME(HWY_SVE_V(BASE, BITS) v, svbool_t mask) { \
-    return sv##OP##_##CHAR##BITS(mask, v); \
-  }
-
-HWY_SVE_FOREACH(HWY_SVE_GET_LANE, GetLaneM, lasta)
-HWY_SVE_FOREACH(HWY_SVE_GET_LANE, ExtractLastMatchingLaneM, lastb)
-#undef HWY_SVE_GET_LANE
-}  // namespace detail
-
-template <class V>
-HWY_API TFromV<V> GetLane(V v) {
-  return detail::GetLaneM(v, detail::PFalse());
-}
-
-// ------------------------------ ExtractLane
-template <class V>
-HWY_API TFromV<V> ExtractLane(V v, size_t i) {
-  return detail::GetLaneM(v, FirstN(DFromV<V>(), i));
+HWY_API V LowerHalf(const V v) {
+  return v;
 }

-
-
-
-
-const
-const
-
-
-
-
+template <class DH, class V>
+HWY_API V UpperHalf(const DH dh, const V v) {
+  const Twice<decltype(dh)> d;
+  // Cast so that we support bfloat16_t.
+  const RebindToUnsigned<decltype(d)> du;
+  const VFromD<decltype(du)> vu = BitCast(du, v);
+#if HWY_HAVE_CONSTEXPR_LANES
+  return BitCast(d, detail::Ext<Lanes(dh)>(vu, vu));
+#else
+  const MFromD<decltype(du)> mask = detail::MaskUpperHalf(du);
+  return BitCast(d, detail::Splice(vu, vu, mask));
+#endif
 }

+// ================================================== SWIZZLE
+
 // ------------------------------ DupEven

 namespace detail {
@@ -3447,6 +3982,40 @@ HWY_API V TwoTablesLookupLanes(V a, V b,
   return TwoTablesLookupLanes(d, a, b, idx);
 }

+// ------------------------------ SlideUpLanes (FirstN)
+template <class D>
+HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
+  return detail::Splice(v, Zero(d), FirstN(d, amt));
+}
+
+// ------------------------------ Slide1Up
+
+#ifdef HWY_NATIVE_SLIDE1_UP_DOWN
+#undef HWY_NATIVE_SLIDE1_UP_DOWN
+#else
+#define HWY_NATIVE_SLIDE1_UP_DOWN
+#endif
+
+template <class D>
+HWY_API VFromD<D> Slide1Up(D d, VFromD<D> v) {
+  return SlideUpLanes(d, v, 1);
+}
+
+// ------------------------------ SlideDownLanes (TableLookupLanes)
+template <class D>
+HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
+  const RebindToUnsigned<decltype(d)> du;
+  using TU = TFromD<decltype(du)>;
+  const auto idx = Iota(du, static_cast<TU>(amt));
+  return IfThenElseZero(FirstN(d, Lanes(d) - amt), TableLookupLanes(v, idx));
+}
+
+// ------------------------------ Slide1Down
+template <class D>
+HWY_API VFromD<D> Slide1Down(D d, VFromD<D> v) {
+  return SlideDownLanes(d, v, 1);
+}
+
 // ------------------------------ SwapAdjacentBlocks (TableLookupLanes)

 namespace detail {
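Note: per the definitions above (Splice with a Zero vector, and a zero-masked table lookup), the slide ops shift lanes and fill the vacated lanes with zero. A sketch of the lane movement, illustration only, assuming a 4-lane vector with lane 0 first:

    // v = [a, b, c, d]
    // SlideUpLanes(d, v, 2)   -> [0, 0, a, b]
    // Slide1Up(d, v)          -> [0, a, b, c]
    // SlideDownLanes(d, v, 1) -> [b, c, d, 0]
    // Slide1Down(d, v)        -> [b, c, d, 0]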
@@ -3476,6 +4045,40 @@ HWY_API V SwapAdjacentBlocks(const V v) {
 #endif
 }

+// ------------------------------ InterleaveEvenBlocks
+// (ConcatLowerLower, SlideUpLanes, OddEvenBlocks)
+
+template <class D, class V = VFromD<D>>
+HWY_API V InterleaveEvenBlocks(D d, V a, V b) {
+#if HWY_TARGET == HWY_SVE_256
+  return ConcatLowerLower(d, b, a);
+#elif HWY_TARGET == HWY_SVE2_128
+  (void)d;
+  (void)b;
+  return a;
+#else
+  constexpr size_t kLanesPerBlock = detail::LanesPerBlock(d);
+  return OddEvenBlocks(SlideUpLanes(d, b, kLanesPerBlock), a);
+#endif
+}
+
+// ------------------------------ InterleaveOddBlocks
+// (ConcatUpperUpper, SlideDownLanes, OddEvenBlocks)
+
+template <class D, class V = VFromD<D>>
+HWY_API V InterleaveOddBlocks(D d, V a, V b) {
+#if HWY_TARGET == HWY_SVE_256
+  return ConcatUpperUpper(d, b, a);
+#elif HWY_TARGET == HWY_SVE2_128
+  (void)d;
+  (void)b;
+  return a;
+#else
+  constexpr size_t kLanesPerBlock = detail::LanesPerBlock(d);
+  return OddEvenBlocks(b, SlideDownLanes(d, a, kLanesPerBlock));
+#endif
+}
+
 // ------------------------------ Reverse

 namespace detail {
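Note: a "block" in Highway is 16 bytes. Reading the SVE_256 branches above (ConcatLowerLower/ConcatUpperUpper with swapped operands), the intended result on a two-block vector appears to be the even (resp. odd) block of a followed by that of b. A sketch, illustration only, assuming u32 lanes so each block holds four lanes:

    // a = [a0 a1 a2 a3 | a4 a5 a6 a7], b likewise ('|' marks 16-byte blocks).
    // InterleaveEvenBlocks(d, a, b) -> [a0 a1 a2 a3 | b0 b1 b2 b3]
    // InterleaveOddBlocks(d, a, b)  -> [a4 a5 a6 a7 | b4 b5 b6 b7]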
@@ -3630,43 +4233,6 @@ HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
 HWY_SVE_FOREACH_UI(HWY_SVE_REVERSE_BITS, ReverseBits, rbit)
 #undef HWY_SVE_REVERSE_BITS

-// ------------------------------ SlideUpLanes
-
-template <class D>
-HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
-  return detail::Splice(v, Zero(d), FirstN(d, amt));
-}
-
-// ------------------------------ Slide1Up
-
-#ifdef HWY_NATIVE_SLIDE1_UP_DOWN
-#undef HWY_NATIVE_SLIDE1_UP_DOWN
-#else
-#define HWY_NATIVE_SLIDE1_UP_DOWN
-#endif
-
-template <class D>
-HWY_API VFromD<D> Slide1Up(D d, VFromD<D> v) {
-  return SlideUpLanes(d, v, 1);
-}
-
-// ------------------------------ SlideDownLanes (TableLookupLanes)
-
-template <class D>
-HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
-  const RebindToUnsigned<decltype(d)> du;
-  using TU = TFromD<decltype(du)>;
-  const auto idx = Iota(du, static_cast<TU>(amt));
-  return IfThenElseZero(FirstN(d, Lanes(d) - amt), TableLookupLanes(v, idx));
-}
-
-// ------------------------------ Slide1Down
-
-template <class D>
-HWY_API VFromD<D> Slide1Down(D d, VFromD<D> v) {
-  return SlideDownLanes(d, v, 1);
-}
-
 // ------------------------------ Block insert/extract/broadcast ops
 #if HWY_TARGET != HWY_SVE2_128

@@ -4668,6 +5234,12 @@ HWY_API V MaskedDivOr(V no, M m, V a, V b) {
   return IfThenElse(m, Div(a, b), no);
 }

+template <class V, class M, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
+          HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
+HWY_API V MaskedDiv(M m, V a, V b) {
+  return IfThenElseZero(m, Div(a, b));
+}
+
 // ------------------------------ Mod (Div, NegMulAdd)
 template <class V>
 HWY_API V Mod(V a, V b) {
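Note: unlike MaskedDivOr, which substitutes a fallback vector, MaskedDiv zeroes the inactive lanes. A minimal sketch, illustration only; the function name Demo is hypothetical, and the size constraint above limits this overload to 8- and 16-bit integer lanes:

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    int16_t Demo() {
      const hn::ScalableTag<int16_t> d;
      const auto m = hn::FirstN(d, 2);  // only lanes 0 and 1 active
      const auto q = hn::MaskedDiv(m, hn::Set(d, 84), hn::Set(d, 2));
      return hn::ExtractLane(q, 0);  // 42; lanes >= 2 are zero
    }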
@@ -4680,28 +5252,50 @@ HWY_API V MaskedModOr(V no, M m, V a, V b) {
   return IfThenElse(m, Mod(a, b), no);
 }

-// ------------------------------ BroadcastSignBit (ShiftRight)
-template <class V>
-HWY_API V BroadcastSignBit(const V v) {
-  return ShiftRight<sizeof(TFromV<V>) * 8 - 1>(v);
-}
-
 // ------------------------------ IfNegativeThenElse (BroadcastSignBit)
 template <class V>
 HWY_API V IfNegativeThenElse(V v, V yes, V no) {
   static_assert(IsSigned<TFromV<V>>(), "Only works for signed/float");
   return IfThenElse(IsNegative(v), yes, no);
 }
+// ------------------------------ IfNegativeThenNegOrUndefIfZero
+
+#ifdef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
+#undef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
+#else
+#define HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
+#endif
+
+#define HWY_SVE_NEG_IF(BASE, CHAR, BITS, HALF, NAME, OP)        \
+  HWY_API HWY_SVE_V(BASE, BITS)                                 \
+  NAME(HWY_SVE_V(BASE, BITS) mask, HWY_SVE_V(BASE, BITS) v) {   \
+    return sv##OP##_##CHAR##BITS##_m(v, IsNegative(mask), v);   \
+  }
+
+HWY_SVE_FOREACH_IF(HWY_SVE_NEG_IF, IfNegativeThenNegOrUndefIfZero, neg)
+
+#undef HWY_SVE_NEG_IF

 // ------------------------------ AverageRound (ShiftRight)

+#ifdef HWY_NATIVE_AVERAGE_ROUND_UI32
+#undef HWY_NATIVE_AVERAGE_ROUND_UI32
+#else
+#define HWY_NATIVE_AVERAGE_ROUND_UI32
+#endif
+
+#ifdef HWY_NATIVE_AVERAGE_ROUND_UI64
+#undef HWY_NATIVE_AVERAGE_ROUND_UI64
+#else
+#define HWY_NATIVE_AVERAGE_ROUND_UI64
+#endif
+
 #if HWY_SVE_HAVE_2
-
-HWY_SVE_FOREACH_U16(HWY_SVE_RETV_ARGPVV, AverageRound, rhadd)
+HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, AverageRound, rhadd)
 #else
-template <class V>
-V AverageRound(const V a, const V b) {
-  return ShiftRight<1>(
+template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
+HWY_API V AverageRound(const V a, const V b) {
+  return Sub(Or(a, b), ShiftRight<1>(Xor(a, b)));
 }
 #endif  // HWY_SVE_HAVE_2

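Note: the new non-SVE2 fallback uses the overflow-free rounding-average identity (a + b + 1) >> 1 == (a | b) - ((a ^ b) >> 1), which follows from a + b = (a ^ b) + 2 * (a & b). A quick scalar check, illustration only; the AvgRound helper is hypothetical:

    #include <cstdint>

    constexpr uint32_t AvgRound(uint32_t a, uint32_t b) {
      return (a | b) - ((a ^ b) >> 1);  // == (a + b + 1) >> 1, cannot overflow
    }
    static_assert(AvgRound(1, 2) == 2, "rounds up");
    static_assert(AvgRound(0xFFFFFFFFu, 0xFFFFFFFFu) == 0xFFFFFFFFu,
                  "(a + b + 1) / 2 would overflow here; this form does not");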
@@ -4710,6 +5304,12 @@ V AverageRound(const V a, const V b) {
 // `p` points to at least 8 readable bytes, not all of which need be valid.
 template <class D, HWY_IF_T_SIZE_D(D, 1)>
 HWY_INLINE svbool_t LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
+#if HWY_COMPILER_CLANG >= 1901 || HWY_COMPILER_GCC_ACTUAL >= 1200
+  typedef svbool_t UnalignedSveMaskT
+      __attribute__((__aligned__(1), __may_alias__));
+  (void)d;
+  return *reinterpret_cast<const UnalignedSveMaskT*>(bits);
+#else
   // TODO(janwas): with SVE2.1, load to vector, then PMOV
   const RebindToUnsigned<D> du;
   const svuint8_t iota = Iota(du, 0);
@@ -4722,6 +5322,7 @@ HWY_INLINE svbool_t LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
   const svuint8_t bit =
       svdupq_n_u8(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
   return TestBit(rep8, bit);
+#endif
 }

 template <class D, HWY_IF_T_SIZE_D(D, 2)>
@@ -4854,57 +5455,31 @@ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
   return TestBit(BitCast(du, bytes), bit);
 }

-// ------------------------------ StoreMaskBits
-
-namespace detail {
-
-// For each mask lane (governing lane type T), store 1 or 0 in BYTE lanes.
-template <class T, HWY_IF_T_SIZE(T, 1)>
-HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
-  return svdup_n_u8_z(m, 1);
-}
-template <class T, HWY_IF_T_SIZE(T, 2)>
-HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
-  const ScalableTag<uint8_t> d8;
-  const svuint8_t b16 = BitCast(d8, svdup_n_u16_z(m, 1));
-  return detail::ConcatEvenFull(b16, b16);  // lower half
-}
-template <class T, HWY_IF_T_SIZE(T, 4)>
-HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
-  return U8FromU32(svdup_n_u32_z(m, 1));
-}
-template <class T, HWY_IF_T_SIZE(T, 8)>
-HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
-  const ScalableTag<uint32_t> d32;
-  const svuint32_t b64 = BitCast(d32, svdup_n_u64_z(m, 1));
-  return U8FromU32(detail::ConcatEvenFull(b64, b64));  // lower half
-}
-
-// Compacts groups of 8 u8 into 8 contiguous bits in a 64-bit lane.
-HWY_INLINE svuint64_t BitsFromBool(svuint8_t x) {
-  const ScalableTag<uint8_t> d8;
-  const ScalableTag<uint16_t> d16;
-  const ScalableTag<uint32_t> d32;
-  const ScalableTag<uint64_t> d64;
-  // TODO(janwas): could use SVE2 BDEP, but it's optional.
-  x = Or(x, BitCast(d8, ShiftRight<7>(BitCast(d16, x))));
-  x = Or(x, BitCast(d8, ShiftRight<14>(BitCast(d32, x))));
-  x = Or(x, BitCast(d8, ShiftRight<28>(BitCast(d64, x))));
-  return BitCast(d64, x);
-}
-
-}  // namespace detail
+// ------------------------------ StoreMaskBits (BitsFromMask)

 // `p` points to at least 8 writable bytes.
-// TODO(janwas): specialize for HWY_SVE_256
 // TODO(janwas): with SVE2.1, use PMOV to store to vector, then StoreU
 template <class D>
 HWY_API size_t StoreMaskBits(D d, svbool_t m, uint8_t* bits) {
-
-
+#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
+  constexpr size_t N = MaxLanes(d);
+  const uint64_t bits64 = BitsFromMask(d, m);
+  HWY_IF_CONSTEXPR(N < 8) {
+    // BitsFromMask guarantees upper bits are zero, hence no masking.
+    bits[0] = static_cast<uint8_t>(bits64);
+  }
+  else {
+    static_assert(N % 8 == 0, "N is pow2 >= 8, hence divisible");
+    static_assert(HWY_IS_LITTLE_ENDIAN, "");
+    hwy::CopyBytes<N / 8>(&bits64, bits);
+  }
+  constexpr size_t num_bytes = hwy::DivCeil(N, size_t{8});
+  return num_bytes;
+#else
+  svuint64_t bits_in_u64 = detail::BitsFromBool(detail::BoolFromMask<D>(m));

   const size_t num_bits = Lanes(d);
-  const size_t num_bytes = (num_bits
+  const size_t num_bytes = hwy::DivCeil(num_bits, size_t{8});

   // Truncate each u64 to 8 bits and store to u8.
   svst1b_u64(FirstN(ScalableTag<uint64_t>(), num_bytes), bits, bits_in_u64);
@@ -4918,6 +5493,7 @@ HWY_API size_t StoreMaskBits(D d, svbool_t m, uint8_t* bits) {
   // Else: we wrote full bytes because num_bits is a power of two >= 8.

   return num_bytes;
+#endif  // HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
 }

 // ------------------------------ CompressBits (LoadMaskBits)
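Note: StoreMaskBits packs a mask into bytes, one bit per lane with lane 0 in bit 0, and returns the number of bytes written. A round-trip sketch, illustration only; the function name Demo is hypothetical and the bit order is taken from the per-lane packing above:

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    size_t Demo(uint8_t* bits /* >= 8 writable bytes */) {
      const hn::ScalableTag<float> df;
      const size_t written = hn::StoreMaskBits(df, hn::FirstN(df, 3), bits);
      // bits[0] == 0b00000111; bit i corresponds to lane i.
      return written;  // ceil(Lanes(df) / 8)
    }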
@@ -5738,6 +6314,38 @@ HWY_API svuint64_t MulOdd(const svuint64_t a, const svuint64_t b) {
   return detail::InterleaveOdd(lo, hi);
 }

+// ------------------------------ PairwiseAdd/PairwiseSub
+#if HWY_TARGET != HWY_SCALAR
+#if HWY_SVE_HAVE_2 || HWY_IDE
+
+#ifdef HWY_NATIVE_PAIRWISE_ADD
+#undef HWY_NATIVE_PAIRWISE_ADD
+#else
+#define HWY_NATIVE_PAIRWISE_ADD
+#endif
+
+namespace detail {
+#define HWY_SVE_SV_PAIRWISE_ADD(BASE, CHAR, BITS, HALF, NAME, OP)       \
+  template <size_t N, int kPow2>                                        \
+  HWY_API HWY_SVE_V(BASE, BITS)                                         \
+  NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /*d*/, HWY_SVE_V(BASE, BITS) a,  \
+       HWY_SVE_V(BASE, BITS) b) {                                       \
+    return sv##OP##_##CHAR##BITS##_m(HWY_SVE_PTRUE(BITS), a, b);        \
+  }
+
+HWY_SVE_FOREACH(HWY_SVE_SV_PAIRWISE_ADD, PairwiseAdd, addp)
+#undef HWY_SVE_SV_PAIRWISE_ADD
+}  // namespace detail
+
+// Pairwise add returning interleaved output of a and b
+template <class D, class V, HWY_IF_LANES_GT_D(D, 1)>
+HWY_API V PairwiseAdd(D d, V a, V b) {
+  return detail::PairwiseAdd(d, a, b);
+}
+
+#endif  // HWY_SVE_HAVE_2
+#endif  // HWY_TARGET != HWY_SCALAR
+
 // ------------------------------ WidenMulPairwiseAdd

 template <size_t N, int kPow2>
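Note: per the comment above, PairwiseAdd interleaves pairwise sums of a (even output lanes) and b (odd output lanes), matching the SVE ADDP instruction. A value sketch, illustration only, assuming 4 lanes:

    // a = [1, 2, 3, 4], b = [10, 20, 30, 40]
    // PairwiseAdd(d, a, b) == [1+2, 10+20, 3+4, 30+40] == [3, 30, 7, 70]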
@@ -5776,6 +6384,29 @@ HWY_API svuint32_t WidenMulPairwiseAdd(Simd<uint32_t, N, kPow2> d32,
 #endif
 }

+// ------------------------------ SatWidenMulPairwiseAccumulate
+#if HWY_SVE_HAVE_2
+#define HWY_SVE_SAT_MUL_WIDEN_PW_ACC_SVE_2(BASE, CHAR, BITS, HALF, NAME, OP) \
+  template <size_t N, int kPow2>                                             \
+  HWY_API HWY_SVE_V(BASE, BITS)                                              \
+  NAME(HWY_SVE_D(BASE, BITS, N, kPow2) dw, HWY_SVE_V(BASE, HALF) a,          \
+       HWY_SVE_V(BASE, HALF) b, HWY_SVE_V(BASE, BITS) sum) {                 \
+    auto product = svmlalt_##CHAR##BITS(svmullb_##CHAR##BITS(a, b), a, b);   \
+    const auto mul_overflow = IfThenElseZero(                                \
+        Eq(product, Set(dw, LimitsMin<int##BITS##_t>())), Set(dw, -1));      \
+    return SaturatedAdd(Sub(sum, And(BroadcastSignBit(sum), mul_overflow)),  \
+                        Add(product, mul_overflow));                         \
+  }
+HWY_SVE_FOREACH_UI16(HWY_SVE_SAT_MUL_WIDEN_PW_ACC_SVE_2,
+                     SatWidenMulPairwiseAccumulate, _)
+HWY_SVE_FOREACH_UI32(HWY_SVE_SAT_MUL_WIDEN_PW_ACC_SVE_2,
+                     SatWidenMulPairwiseAccumulate, _)
+HWY_SVE_FOREACH_UI64(HWY_SVE_SAT_MUL_WIDEN_PW_ACC_SVE_2,
+                     SatWidenMulPairwiseAccumulate, _)
+
+#undef HWY_SVE_SAT_MUL_WIDEN_PW_ACC_SVE_2
+#endif
+
 // ------------------------------ SatWidenMulAccumFixedPoint

 #if HWY_SVE_HAVE_2
@@ -5938,6 +6569,130 @@ HWY_API VFromD<DU64> SumOfMulQuadAccumulate(DU64 /*du64*/, svuint16_t a,
   return svdot_u64(sum, a, b);
 }

+// ------------------------------ MulComplex* / MaskedMulComplex*
+
+// Per-target flag to prevent generic_ops-inl.h from defining MulComplex*.
+#ifdef HWY_NATIVE_CPLX
+#undef HWY_NATIVE_CPLX
+#else
+#define HWY_NATIVE_CPLX
+#endif
+
+template <class V, HWY_IF_NOT_UNSIGNED(TFromV<V>)>
+HWY_API V ComplexConj(V a) {
+  return OddEven(Neg(a), a);
+}
+
+namespace detail {
+#define HWY_SVE_CPLX_FMA_ROT(BASE, CHAR, BITS, HALF, NAME, OP, ROT)      \
+  HWY_API HWY_SVE_V(BASE, BITS)                                          \
+  NAME##ROT(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b,            \
+            HWY_SVE_V(BASE, BITS) c) {                                   \
+    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, b, c, ROT); \
+  }                                                                      \
+  HWY_API HWY_SVE_V(BASE, BITS)                                          \
+  NAME##Z##ROT(svbool_t m, HWY_SVE_V(BASE, BITS) a,                      \
+               HWY_SVE_V(BASE, BITS) b, HWY_SVE_V(BASE, BITS) c) {       \
+    return sv##OP##_##CHAR##BITS##_z(m, a, b, c, ROT);                   \
+  }
+
+#define HWY_SVE_CPLX_FMA(BASE, CHAR, BITS, HALF, NAME, OP)   \
+  HWY_SVE_CPLX_FMA_ROT(BASE, CHAR, BITS, HALF, NAME, OP, 0)  \
+  HWY_SVE_CPLX_FMA_ROT(BASE, CHAR, BITS, HALF, NAME, OP, 90) \
+  HWY_SVE_CPLX_FMA_ROT(BASE, CHAR, BITS, HALF, NAME, OP, 180) \
+  HWY_SVE_CPLX_FMA_ROT(BASE, CHAR, BITS, HALF, NAME, OP, 270)
+
+// Only SVE2 has complex multiply add for integer types
+// and these do not include masked variants
+HWY_SVE_FOREACH_F(HWY_SVE_CPLX_FMA, ComplexMulAdd, cmla)
+#undef HWY_SVE_CPLX_FMA
+#undef HWY_SVE_CPLX_FMA_ROT
+}  // namespace detail
+
+template <class V, class M, HWY_IF_FLOAT_V(V)>
+HWY_API V MaskedMulComplexConjAdd(M mask, V a, V b, V c) {
+  const V t = detail::ComplexMulAddZ0(mask, c, b, a);
+  return detail::ComplexMulAddZ270(mask, t, b, a);
+}
+
+template <class V, class M, HWY_IF_FLOAT_V(V)>
+HWY_API V MaskedMulComplexConj(M mask, V a, V b) {
+  return MaskedMulComplexConjAdd(mask, a, b, Zero(DFromV<V>()));
+}
+
+template <class V, HWY_IF_FLOAT_V(V)>
+HWY_API V MulComplexAdd(V a, V b, V c) {
+  return detail::ComplexMulAdd90(detail::ComplexMulAdd0(c, a, b), a, b);
+}
+
+template <class V, HWY_IF_FLOAT_V(V)>
+HWY_API V MulComplex(V a, V b) {
+  return MulComplexAdd(a, b, Zero(DFromV<V>()));
+}
+
+template <class V, class M, HWY_IF_FLOAT_V(V)>
+HWY_API V MaskedMulComplexOr(V no, M mask, V a, V b) {
+  return IfThenElse(mask, MulComplex(a, b), no);
+}
+
+template <class V, HWY_IF_FLOAT_V(V)>
+HWY_API V MulComplexConjAdd(V a, V b, V c) {
+  return detail::ComplexMulAdd270(detail::ComplexMulAdd0(c, b, a), b, a);
+}
+
+template <class V, HWY_IF_FLOAT_V(V)>
+HWY_API V MulComplexConj(V a, V b) {
+  return MulComplexConjAdd(a, b, Zero(DFromV<V>()));
+}
+
+// TODO SVE2 does have intrinsics for integers but not masked variants
+template <class V, HWY_IF_NOT_FLOAT_V(V)>
+HWY_API V MulComplex(V a, V b) {
+  // a = u + iv, b = x + iy
+  const auto u = DupEven(a);
+  const auto v = DupOdd(a);
+  const auto x = DupEven(b);
+  const auto y = DupOdd(b);
+
+  return OddEven(MulAdd(u, y, Mul(v, x)), Sub(Mul(u, x), Mul(v, y)));
+}
+
+template <class V, HWY_IF_NOT_FLOAT_V(V)>
+HWY_API V MulComplexConj(V a, V b) {
+  // a = u + iv, b = x + iy
+  const auto u = DupEven(a);
+  const auto v = DupOdd(a);
+  const auto x = DupEven(b);
+  const auto y = DupOdd(b);
+
+  return OddEven(Sub(Mul(v, x), Mul(u, y)), MulAdd(u, x, Mul(v, y)));
+}
+
+template <class V, HWY_IF_NOT_FLOAT_V(V)>
+HWY_API V MulComplexAdd(V a, V b, V c) {
+  return Add(MulComplex(a, b), c);
+}
+
+template <class V, HWY_IF_NOT_FLOAT_V(V)>
+HWY_API V MulComplexConjAdd(V a, V b, V c) {
+  return Add(MulComplexConj(a, b), c);
+}
+
+template <class V, class M, HWY_IF_NOT_FLOAT_V(V)>
+HWY_API V MaskedMulComplexConjAdd(M mask, V a, V b, V c) {
+  return IfThenElseZero(mask, MulComplexConjAdd(a, b, c));
+}
+
+template <class V, class M, HWY_IF_NOT_FLOAT_V(V)>
+HWY_API V MaskedMulComplexConj(M mask, V a, V b) {
+  return IfThenElseZero(mask, MulComplexConj(a, b));
+}
+
+template <class V, class M, HWY_IF_NOT_FLOAT_V(V)>
+HWY_API V MaskedMulComplexOr(V no, M mask, V a, V b) {
+  return IfThenElse(mask, MulComplex(a, b), no);
+}
+
 // ------------------------------ AESRound / CLMul

 // Static dispatch with -march=armv8-a+sve2+aes, or dynamic dispatch WITHOUT a
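Note: complex values occupy interleaved lane pairs (re, im). Working the integer formulas above through a concrete pair, illustration only:

    // a = 1+2i -> lanes [1, 2]; b = 3+4i -> lanes [3, 4].
    // MulComplex(a, b):     (1+2i)(3+4i)       == -5 + 10i -> lanes [-5, 10]
    //   (even lane = u*x - v*y, odd lane = u*y + v*x)
    // MulComplexConj(a, b): (1+2i)*conj(3+4i)  == 11 + 2i  -> lanes [11, 2]
    //   (even lane = u*x + v*y, odd lane = v*x - u*y, i.e. b is conjugated)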
@@ -6183,6 +6938,22 @@ HWY_API V HighestSetBitIndex(V v) {
   return BitCast(d, Sub(Set(d, T{sizeof(T) * 8 - 1}), LeadingZeroCount(v)));
 }

+#ifdef HWY_NATIVE_MASKED_LEADING_ZERO_COUNT
+#undef HWY_NATIVE_MASKED_LEADING_ZERO_COUNT
+#else
+#define HWY_NATIVE_MASKED_LEADING_ZERO_COUNT
+#endif
+
+#define HWY_SVE_MASKED_LEADING_ZERO_COUNT(BASE, CHAR, BITS, HALF, NAME, OP)  \
+  HWY_API HWY_SVE_V(BASE, BITS) NAME(svbool_t m, HWY_SVE_V(BASE, BITS) v) { \
+    const DFromV<decltype(v)> d;                                             \
+    return BitCast(d, sv##OP##_##CHAR##BITS##_z(m, v));                      \
+  }
+
+HWY_SVE_FOREACH_UI(HWY_SVE_MASKED_LEADING_ZERO_COUNT, MaskedLeadingZeroCount,
+                   clz)
+#undef HWY_SVE_LEADING_ZERO_COUNT
+
 // ================================================== END MACROS
 #undef HWY_SVE_ALL_PTRUE
 #undef HWY_SVE_D
@@ -6216,13 +6987,20 @@ HWY_API V HighestSetBitIndex(V v) {
 #undef HWY_SVE_IF_NOT_EMULATED_D
 #undef HWY_SVE_PTRUE
 #undef HWY_SVE_RETV_ARGMVV
+#undef HWY_SVE_RETV_ARGMVV_Z
+#undef HWY_SVE_RETV_ARGMV_Z
+#undef HWY_SVE_RETV_ARGMV
+#undef HWY_SVE_RETV_ARGMVV_Z
 #undef HWY_SVE_RETV_ARGPV
 #undef HWY_SVE_RETV_ARGPVN
 #undef HWY_SVE_RETV_ARGPVV
 #undef HWY_SVE_RETV_ARGV
 #undef HWY_SVE_RETV_ARGVN
+#undef HWY_SVE_RETV_ARGMV_M
 #undef HWY_SVE_RETV_ARGVV
 #undef HWY_SVE_RETV_ARGVVV
+#undef HWY_SVE_RETV_ARGMVVV_Z
+#undef HWY_SVE_RETV_ARGMVVV
 #undef HWY_SVE_T
 #undef HWY_SVE_UNDEFINED
 #undef HWY_SVE_V